diff options
author | Paolo Bonzini <pbonzini@redhat.com> | 2021-10-31 02:28:48 -0400 |
---|---|---|
committer | Paolo Bonzini <pbonzini@redhat.com> | 2021-10-31 02:28:48 -0400 |
commit | 4e3386843325299df13069a1c94e27237b12be51 (patch) | |
tree | b1cf4c009b01eee0d017e3c01acc7a7495adcc46 /arch | |
parent | e59f3e5d4521cb95233e03ece48772e9161cbfd3 (diff) | |
parent | 5a2acbbb0179a7ffbb5440b9fa46689f619705ac (diff) |
Merge tag 'kvmarm-5.16' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm into HEAD
KVM/arm64 updates for Linux 5.16
- More progress on the protected VM front, now with the full
fixed feature set as well as the limitation of some hypercalls
after initialisation.
- Cleanup of the RAZ/WI sysreg handling, which was pointlessly
complicated
- Fixes for the vgic placement in the IPA space, together with a
bunch of selftests
- More memcg accounting of the memory allocated on behalf of a guest
- Timer and vgic selftests
- Workarounds for the Apple M1 broken vgic implementation
- KConfig cleanups
- New kvmarm.mode=none option, for those who really dislike us
Diffstat (limited to 'arch')
92 files changed, 1953 insertions, 783 deletions
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 5c7ae4c3954b..0d921d21fd35 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -186,6 +186,7 @@ config ARM64 select HAVE_GCC_PLUGINS select HAVE_HW_BREAKPOINT if PERF_EVENTS select HAVE_IRQ_TIME_ACCOUNTING + select HAVE_KVM select HAVE_NMI select HAVE_PATA_PLATFORM select HAVE_PERF_EVENTS diff --git a/arch/arm64/boot/dts/qcom/ipq8074.dtsi b/arch/arm64/boot/dts/qcom/ipq8074.dtsi index a620ac0d0b19..db333001df4d 100644 --- a/arch/arm64/boot/dts/qcom/ipq8074.dtsi +++ b/arch/arm64/boot/dts/qcom/ipq8074.dtsi @@ -487,7 +487,6 @@ interrupts = <GIC_SPI 140 IRQ_TYPE_LEVEL_HIGH>; phys = <&qusb_phy_0>, <&usb0_ssphy>; phy-names = "usb2-phy", "usb3-phy"; - tx-fifo-resize; snps,is-utmi-l1-suspend; snps,hird-threshold = /bits/ 8 <0x0>; snps,dis_u2_susphy_quirk; @@ -528,7 +527,6 @@ interrupts = <GIC_SPI 99 IRQ_TYPE_LEVEL_HIGH>; phys = <&qusb_phy_1>, <&usb1_ssphy>; phy-names = "usb2-phy", "usb3-phy"; - tx-fifo-resize; snps,is-utmi-l1-suspend; snps,hird-threshold = /bits/ 8 <0x0>; snps,dis_u2_susphy_quirk; diff --git a/arch/arm64/include/asm/acpi.h b/arch/arm64/include/asm/acpi.h index 7535dc7cc5aa..bd68e1b7f29f 100644 --- a/arch/arm64/include/asm/acpi.h +++ b/arch/arm64/include/asm/acpi.h @@ -50,9 +50,6 @@ pgprot_t __acpi_get_mem_attribute(phys_addr_t addr); void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size); #define acpi_os_ioremap acpi_os_ioremap -void __iomem *acpi_os_memmap(acpi_physical_address phys, acpi_size size); -#define acpi_os_memmap acpi_os_memmap - typedef u64 phys_cpuid_t; #define PHYS_CPUID_INVALID INVALID_HWID diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 89faca0e740d..bfa58409a4d4 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -525,6 +525,11 @@ alternative_endif #define EXPORT_SYMBOL_NOKASAN(name) EXPORT_SYMBOL(name) #endif +#ifdef CONFIG_KASAN_HW_TAGS +#define EXPORT_SYMBOL_NOHWKASAN(name) +#else +#define EXPORT_SYMBOL_NOHWKASAN(name) EXPORT_SYMBOL_NOKASAN(name) +#endif /* * Emit a 64-bit absolute little endian symbol reference in a way that * ensures that it will be resolved at build time, even when building a diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 327120c0089f..a39fcf318c77 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -295,6 +295,7 @@ #define MDCR_EL2_HPMFZO (UL(1) << 29) #define MDCR_EL2_MTPME (UL(1) << 28) #define MDCR_EL2_TDCC (UL(1) << 27) +#define MDCR_EL2_HLP (UL(1) << 26) #define MDCR_EL2_HCCD (UL(1) << 23) #define MDCR_EL2_TTRF (UL(1) << 19) #define MDCR_EL2_HPMD (UL(1) << 17) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index e86045ac43ba..0167f3702c61 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -44,31 +44,39 @@ #define KVM_HOST_SMCCC_FUNC(name) KVM_HOST_SMCCC_ID(__KVM_HOST_SMCCC_FUNC_##name) #define __KVM_HOST_SMCCC_FUNC___kvm_hyp_init 0 -#define __KVM_HOST_SMCCC_FUNC___kvm_vcpu_run 1 -#define __KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context 2 -#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa 3 -#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid 4 -#define __KVM_HOST_SMCCC_FUNC___kvm_flush_cpu_context 5 -#define __KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff 6 -#define __KVM_HOST_SMCCC_FUNC___kvm_enable_ssbs 7 -#define __KVM_HOST_SMCCC_FUNC___vgic_v3_get_gic_config 8 -#define __KVM_HOST_SMCCC_FUNC___vgic_v3_read_vmcr 9 -#define __KVM_HOST_SMCCC_FUNC___vgic_v3_write_vmcr 10 -#define __KVM_HOST_SMCCC_FUNC___vgic_v3_init_lrs 11 -#define __KVM_HOST_SMCCC_FUNC___kvm_get_mdcr_el2 12 -#define __KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs 13 -#define __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_aprs 14 -#define __KVM_HOST_SMCCC_FUNC___pkvm_init 15 -#define __KVM_HOST_SMCCC_FUNC___pkvm_host_share_hyp 16 -#define __KVM_HOST_SMCCC_FUNC___pkvm_create_private_mapping 17 -#define __KVM_HOST_SMCCC_FUNC___pkvm_cpu_set_vector 18 -#define __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize 19 -#define __KVM_HOST_SMCCC_FUNC___kvm_adjust_pc 20 #ifndef __ASSEMBLY__ #include <linux/mm.h> +enum __kvm_host_smccc_func { + /* Hypercalls available only prior to pKVM finalisation */ + /* __KVM_HOST_SMCCC_FUNC___kvm_hyp_init */ + __KVM_HOST_SMCCC_FUNC___kvm_get_mdcr_el2 = __KVM_HOST_SMCCC_FUNC___kvm_hyp_init + 1, + __KVM_HOST_SMCCC_FUNC___pkvm_init, + __KVM_HOST_SMCCC_FUNC___pkvm_create_private_mapping, + __KVM_HOST_SMCCC_FUNC___pkvm_cpu_set_vector, + __KVM_HOST_SMCCC_FUNC___kvm_enable_ssbs, + __KVM_HOST_SMCCC_FUNC___vgic_v3_init_lrs, + __KVM_HOST_SMCCC_FUNC___vgic_v3_get_gic_config, + __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize, + + /* Hypercalls available after pKVM finalisation */ + __KVM_HOST_SMCCC_FUNC___pkvm_host_share_hyp, + __KVM_HOST_SMCCC_FUNC___kvm_adjust_pc, + __KVM_HOST_SMCCC_FUNC___kvm_vcpu_run, + __KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context, + __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa, + __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid, + __KVM_HOST_SMCCC_FUNC___kvm_flush_cpu_context, + __KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff, + __KVM_HOST_SMCCC_FUNC___vgic_v3_read_vmcr, + __KVM_HOST_SMCCC_FUNC___vgic_v3_write_vmcr, + __KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs, + __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_aprs, + __KVM_HOST_SMCCC_FUNC___pkvm_vcpu_init_traps, +}; + #define DECLARE_KVM_VHE_SYM(sym) extern char sym[] #define DECLARE_KVM_NVHE_SYM(sym) extern char kvm_nvhe_sym(sym)[] diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index fd418955e31e..f4871e47b2d0 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -396,7 +396,10 @@ static inline bool kvm_vcpu_is_be(struct kvm_vcpu *vcpu) if (vcpu_mode_is_32bit(vcpu)) return !!(*vcpu_cpsr(vcpu) & PSR_AA32_E_BIT); - return !!(vcpu_read_sys_reg(vcpu, SCTLR_EL1) & (1 << 25)); + if (vcpu_mode_priv(vcpu)) + return !!(vcpu_read_sys_reg(vcpu, SCTLR_EL1) & SCTLR_ELx_EE); + else + return !!(vcpu_read_sys_reg(vcpu, SCTLR_EL1) & SCTLR_EL1_E0E); } static inline unsigned long vcpu_data_guest_to_host(struct kvm_vcpu *vcpu, diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 369c30e28301..4be8486042a7 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -58,6 +58,7 @@ enum kvm_mode { KVM_MODE_DEFAULT, KVM_MODE_PROTECTED, + KVM_MODE_NONE, }; enum kvm_mode kvm_get_mode(void); @@ -779,6 +780,8 @@ static inline bool kvm_vm_is_protected(struct kvm *kvm) return false; } +void kvm_init_protected_traps(struct kvm_vcpu *vcpu); + int kvm_arm_vcpu_finalize(struct kvm_vcpu *vcpu, int feature); bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 657d0c94cf82..5afd14ab15b9 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -115,7 +115,12 @@ int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus, void __noreturn __host_enter(struct kvm_cpu_context *host_ctxt); #endif +extern u64 kvm_nvhe_sym(id_aa64pfr0_el1_sys_val); +extern u64 kvm_nvhe_sym(id_aa64pfr1_el1_sys_val); +extern u64 kvm_nvhe_sym(id_aa64isar0_el1_sys_val); +extern u64 kvm_nvhe_sym(id_aa64isar1_el1_sys_val); extern u64 kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val); extern u64 kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val); +extern u64 kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val); #endif /* __ARM64_KVM_HYP_H__ */ diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h index 3f93b9e0b339..02511650cffe 100644 --- a/arch/arm64/include/asm/mte.h +++ b/arch/arm64/include/asm/mte.h @@ -99,11 +99,17 @@ void mte_check_tfsr_el1(void); static inline void mte_check_tfsr_entry(void) { + if (!system_supports_mte()) + return; + mte_check_tfsr_el1(); } static inline void mte_check_tfsr_exit(void) { + if (!system_supports_mte()) + return; + /* * The asynchronous faults are sync'ed automatically with * TFSR_EL1 on kernel entry but for exit an explicit dsb() diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h index 3a3264ff47b9..95f7686b728d 100644 --- a/arch/arm64/include/asm/string.h +++ b/arch/arm64/include/asm/string.h @@ -12,11 +12,13 @@ extern char *strrchr(const char *, int c); #define __HAVE_ARCH_STRCHR extern char *strchr(const char *, int c); +#ifndef CONFIG_KASAN_HW_TAGS #define __HAVE_ARCH_STRCMP extern int strcmp(const char *, const char *); #define __HAVE_ARCH_STRNCMP extern int strncmp(const char *, const char *, __kernel_size_t); +#endif #define __HAVE_ARCH_STRLEN extern __kernel_size_t strlen(const char *); diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index b268082d67ed..9412a645a1c0 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -1152,6 +1152,7 @@ #define ICH_HCR_TC (1 << 10) #define ICH_HCR_TALL0 (1 << 11) #define ICH_HCR_TALL1 (1 << 12) +#define ICH_HCR_TDIR (1 << 14) #define ICH_HCR_EOIcount_SHIFT 27 #define ICH_HCR_EOIcount_MASK (0x1f << ICH_HCR_EOIcount_SHIFT) @@ -1184,6 +1185,8 @@ #define ICH_VTR_SEIS_MASK (1 << ICH_VTR_SEIS_SHIFT) #define ICH_VTR_A3V_SHIFT 21 #define ICH_VTR_A3V_MASK (1 << ICH_VTR_A3V_SHIFT) +#define ICH_VTR_TDS_SHIFT 19 +#define ICH_VTR_TDS_MASK (1 << ICH_VTR_TDS_SHIFT) #define ARM64_FEATURE_FIELD_BITS 4 diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index 1c9c2f7a1c04..f3851724fe35 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -273,8 +273,7 @@ pgprot_t __acpi_get_mem_attribute(phys_addr_t addr) return __pgprot(PROT_DEVICE_nGnRnE); } -static void __iomem *__acpi_os_ioremap(acpi_physical_address phys, - acpi_size size, bool memory) +void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size) { efi_memory_desc_t *md, *region = NULL; pgprot_t prot; @@ -300,11 +299,9 @@ static void __iomem *__acpi_os_ioremap(acpi_physical_address phys, * It is fine for AML to remap regions that are not represented in the * EFI memory map at all, as it only describes normal memory, and MMIO * regions that require a virtual mapping to make them accessible to - * the EFI runtime services. Determine the region default - * attributes by checking the requested memory semantics. + * the EFI runtime services. */ - prot = memory ? __pgprot(PROT_NORMAL_NC) : - __pgprot(PROT_DEVICE_nGnRnE); + prot = __pgprot(PROT_DEVICE_nGnRnE); if (region) { switch (region->type) { case EFI_LOADER_CODE: @@ -364,16 +361,6 @@ static void __iomem *__acpi_os_ioremap(acpi_physical_address phys, return __ioremap(phys, size, prot); } -void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size) -{ - return __acpi_os_ioremap(phys, size, false); -} - -void __iomem *acpi_os_memmap(acpi_physical_address phys, acpi_size size) -{ - return __acpi_os_ioremap(phys, size, true); -} - /* * Claim Synchronous External Aborts as a firmware first notification. * diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index f8a3067d10c6..6ec7036ef7e1 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1526,9 +1526,13 @@ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry, /* * For reasons that aren't entirely clear, enabling KPTI on Cavium * ThunderX leads to apparent I-cache corruption of kernel text, which - * ends as well as you might imagine. Don't even try. + * ends as well as you might imagine. Don't even try. We cannot rely + * on the cpus_have_*cap() helpers here to detect the CPU erratum + * because cpucap detection order may change. However, since we know + * affected CPUs are always in a homogeneous configuration, it is + * safe to rely on this_cpu_has_cap() here. */ - if (cpus_have_const_cap(ARM64_WORKAROUND_CAVIUM_27456)) { + if (this_cpu_has_cap(ARM64_WORKAROUND_CAVIUM_27456)) { str = "ARM64_WORKAROUND_CAVIUM_27456"; __kpti_forced = -1; } diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index 9d314a3bad3b..e5e801bc5312 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -142,12 +142,7 @@ void mte_enable_kernel_async(void) #ifdef CONFIG_KASAN_HW_TAGS void mte_check_tfsr_el1(void) { - u64 tfsr_el1; - - if (!system_supports_mte()) - return; - - tfsr_el1 = read_sysreg_s(SYS_TFSR_EL1); + u64 tfsr_el1 = read_sysreg_s(SYS_TFSR_EL1); if (unlikely(tfsr_el1 & SYS_TFSR_EL1_TF1)) { /* @@ -199,6 +194,9 @@ void mte_thread_init_user(void) void mte_thread_switch(struct task_struct *next) { + if (!system_supports_mte()) + return; + mte_update_sctlr_user(next); /* diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 6f6ff072acbd..44369b99a57e 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -1128,5 +1128,6 @@ bool cpus_are_stuck_in_kernel(void) { bool smp_spin_tables = (num_possible_cpus() > 1 && !have_cpu_die()); - return !!cpus_stuck_in_kernel || smp_spin_tables; + return !!cpus_stuck_in_kernel || smp_spin_tables || + is_protected_kvm_enabled(); } diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index d7eec0b43744..8ffcbe29395e 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -4,6 +4,7 @@ # source "virt/lib/Kconfig" +source "virt/kvm/Kconfig" menuconfig VIRTUALIZATION bool "Virtualization" @@ -19,7 +20,7 @@ if VIRTUALIZATION menuconfig KVM bool "Kernel-based Virtual Machine (KVM) support" - depends on OF + depends on HAVE_KVM select MMU_NOTIFIER select PREEMPT_NOTIFIERS select HAVE_KVM_CPU_RELAX_INTERCEPT @@ -43,12 +44,9 @@ menuconfig KVM If unsure, say N. -if KVM - -source "virt/kvm/Kconfig" - config NVHE_EL2_DEBUG bool "Debug mode for non-VHE EL2 object" + depends on KVM help Say Y here to enable the debug mode for the non-VHE KVM EL2 object. Failure reports will BUG() in the hypervisor. This is intended for @@ -56,6 +54,4 @@ config NVHE_EL2_DEBUG If unsure, say N. -endif # KVM - endif # VIRTUALIZATION diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 7838e9fb693e..f5490afe1ebf 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -291,10 +291,12 @@ long kvm_arch_dev_ioctl(struct file *filp, struct kvm *kvm_arch_alloc_vm(void) { + size_t sz = sizeof(struct kvm); + if (!has_vhe()) - return kzalloc(sizeof(struct kvm), GFP_KERNEL); + return kzalloc(sz, GFP_KERNEL_ACCOUNT); - return vzalloc(sizeof(struct kvm)); + return __vmalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM | __GFP_ZERO); } int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) @@ -612,6 +614,14 @@ static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu) ret = kvm_arm_pmu_v3_enable(vcpu); + /* + * Initialize traps for protected VMs. + * NOTE: Move to run in EL2 directly, rather than via a hypercall, once + * the code is in place for first run initialization at EL2. + */ + if (kvm_vm_is_protected(kvm)) + kvm_call_hyp_nvhe(__pkvm_vcpu_init_traps, vcpu); + return ret; } @@ -1571,25 +1581,33 @@ static void cpu_set_hyp_vector(void) kvm_call_hyp_nvhe(__pkvm_cpu_set_vector, data->slot); } -static void cpu_hyp_reinit(void) +static void cpu_hyp_init_context(void) { kvm_init_host_cpu_context(&this_cpu_ptr_hyp_sym(kvm_host_data)->host_ctxt); - cpu_hyp_reset(); - - if (is_kernel_in_hyp_mode()) - kvm_timer_init_vhe(); - else + if (!is_kernel_in_hyp_mode()) cpu_init_hyp_mode(); +} +static void cpu_hyp_init_features(void) +{ cpu_set_hyp_vector(); - kvm_arm_init_debug(); + if (is_kernel_in_hyp_mode()) + kvm_timer_init_vhe(); + if (vgic_present) kvm_vgic_init_cpu_hardware(); } +static void cpu_hyp_reinit(void) +{ + cpu_hyp_reset(); + cpu_hyp_init_context(); + cpu_hyp_init_features(); +} + static void _kvm_arch_hardware_enable(void *discard) { if (!__this_cpu_read(kvm_arm_hardware_enabled)) { @@ -1780,10 +1798,17 @@ static int do_pkvm_init(u32 hyp_va_bits) int ret; preempt_disable(); - hyp_install_host_vector(); + cpu_hyp_init_context(); ret = kvm_call_hyp_nvhe(__pkvm_init, hyp_mem_base, hyp_mem_size, num_possible_cpus(), kern_hyp_va(per_cpu_base), hyp_va_bits); + cpu_hyp_init_features(); + + /* + * The stub hypercalls are now disabled, so set our local flag to + * prevent a later re-init attempt in kvm_arch_hardware_enable(). + */ + __this_cpu_write(kvm_arm_hardware_enabled, 1); preempt_enable(); return ret; @@ -1794,8 +1819,13 @@ static int kvm_hyp_init_protection(u32 hyp_va_bits) void *addr = phys_to_virt(hyp_mem_base); int ret; + kvm_nvhe_sym(id_aa64pfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); + kvm_nvhe_sym(id_aa64pfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1); + kvm_nvhe_sym(id_aa64isar0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR0_EL1); + kvm_nvhe_sym(id_aa64isar1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR1_EL1); kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); + kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR2_EL1); ret = create_hyp_mappings(addr, addr + hyp_mem_size, PAGE_HYP); if (ret) @@ -1963,9 +1993,25 @@ out_err: return err; } -static void _kvm_host_prot_finalize(void *discard) +static void _kvm_host_prot_finalize(void *arg) { - WARN_ON(kvm_call_hyp_nvhe(__pkvm_prot_finalize)); + int *err = arg; + + if (WARN_ON(kvm_call_hyp_nvhe(__pkvm_prot_finalize))) + WRITE_ONCE(*err, -EINVAL); +} + +static int pkvm_drop_host_privileges(void) +{ + int ret = 0; + + /* + * Flip the static key upfront as that may no longer be possible + * once the host stage 2 is installed. + */ + static_branch_enable(&kvm_protected_mode_initialized); + on_each_cpu(_kvm_host_prot_finalize, &ret, 1); + return ret; } static int finalize_hyp_mode(void) @@ -1979,15 +2025,7 @@ static int finalize_hyp_mode(void) * None of other sections should ever be introspected. */ kmemleak_free_part(__hyp_bss_start, __hyp_bss_end - __hyp_bss_start); - - /* - * Flip the static key upfront as that may no longer be possible - * once the host stage 2 is installed. - */ - static_branch_enable(&kvm_protected_mode_initialized); - on_each_cpu(_kvm_host_prot_finalize, NULL, 1); - - return 0; + return pkvm_drop_host_privileges(); } struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr) @@ -2056,6 +2094,11 @@ int kvm_arch_init(void *opaque) return -ENODEV; } + if (kvm_get_mode() == KVM_MODE_NONE) { + kvm_info("KVM disabled from command line\n"); + return -ENODEV; + } + in_hyp_mode = is_kernel_in_hyp_mode(); if (cpus_have_final_cap(ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE) || @@ -2129,8 +2172,15 @@ static int __init early_kvm_mode_cfg(char *arg) return 0; } - if (strcmp(arg, "nvhe") == 0 && !WARN_ON(is_kernel_in_hyp_mode())) + if (strcmp(arg, "nvhe") == 0 && !WARN_ON(is_kernel_in_hyp_mode())) { + kvm_mode = KVM_MODE_DEFAULT; + return 0; + } + + if (strcmp(arg, "none") == 0) { + kvm_mode = KVM_MODE_NONE; return 0; + } return -EINVAL; } diff --git a/arch/arm64/kvm/hyp/include/hyp/fault.h b/arch/arm64/kvm/hyp/include/hyp/fault.h new file mode 100644 index 000000000000..1b8a2dcd712f --- /dev/null +++ b/arch/arm64/kvm/hyp/include/hyp/fault.h @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2015 - ARM Ltd + * Author: Marc Zyngier <marc.zyngier@arm.com> + */ + +#ifndef __ARM64_KVM_HYP_FAULT_H__ +#define __ARM64_KVM_HYP_FAULT_H__ + +#include <asm/kvm_asm.h> +#include <asm/kvm_emulate.h> +#include <asm/kvm_hyp.h> +#include <asm/kvm_mmu.h> + +static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar) +{ + u64 par, tmp; + + /* + * Resolve the IPA the hard way using the guest VA. + * + * Stage-1 translation already validated the memory access + * rights. As such, we can use the EL1 translation regime, and + * don't have to distinguish between EL0 and EL1 access. + * + * We do need to save/restore PAR_EL1 though, as we haven't + * saved the guest context yet, and we may return early... + */ + par = read_sysreg_par(); + if (!__kvm_at("s1e1r", far)) + tmp = read_sysreg_par(); + else + tmp = SYS_PAR_EL1_F; /* back to the guest */ + write_sysreg(par, par_el1); + + if (unlikely(tmp & SYS_PAR_EL1_F)) + return false; /* Translation failed, back to guest */ + + /* Convert PAR to HPFAR format */ + *hpfar = PAR_TO_HPFAR(tmp); + return true; +} + +static inline bool __get_fault_info(u64 esr, struct kvm_vcpu_fault_info *fault) +{ + u64 hpfar, far; + + far = read_sysreg_el2(SYS_FAR); + + /* + * The HPFAR can be invalid if the stage 2 fault did not + * happen during a stage 1 page table walk (the ESR_EL2.S1PTW + * bit is clear) and one of the two following cases are true: + * 1. The fault was due to a permission fault + * 2. The processor carries errata 834220 + * + * Therefore, for all non S1PTW faults where we either have a + * permission fault or the errata workaround is enabled, we + * resolve the IPA using the AT instruction. + */ + if (!(esr & ESR_ELx_S1PTW) && + (cpus_have_final_cap(ARM64_WORKAROUND_834220) || + (esr & ESR_ELx_FSC_TYPE) == FSC_PERM)) { + if (!__translate_far_to_hpfar(far, &hpfar)) + return false; + } else { + hpfar = read_sysreg(hpfar_el2); + } + + fault->far_el2 = far; + fault->hpfar_el2 = hpfar; + return true; +} + +#endif diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index a0e78a6027be..c6e98c7e918b 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -8,6 +8,7 @@ #define __ARM64_KVM_HYP_SWITCH_H__ #include <hyp/adjust_pc.h> +#include <hyp/fault.h> #include <linux/arm-smccc.h> #include <linux/kvm_host.h> @@ -133,78 +134,9 @@ static inline void ___deactivate_traps(struct kvm_vcpu *vcpu) } } -static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar) -{ - u64 par, tmp; - - /* - * Resolve the IPA the hard way using the guest VA. - * - * Stage-1 translation already validated the memory access - * rights. As such, we can use the EL1 translation regime, and - * don't have to distinguish between EL0 and EL1 access. - * - * We do need to save/restore PAR_EL1 though, as we haven't - * saved the guest context yet, and we may return early... - */ - par = read_sysreg_par(); - if (!__kvm_at("s1e1r", far)) - tmp = read_sysreg_par(); - else - tmp = SYS_PAR_EL1_F; /* back to the guest */ - write_sysreg(par, par_el1); - - if (unlikely(tmp & SYS_PAR_EL1_F)) - return false; /* Translation failed, back to guest */ - - /* Convert PAR to HPFAR format */ - *hpfar = PAR_TO_HPFAR(tmp); - return true; -} - -static inline bool __get_fault_info(u64 esr, struct kvm_vcpu_fault_info *fault) -{ - u64 hpfar, far; - - far = read_sysreg_el2(SYS_FAR); - - /* - * The HPFAR can be invalid if the stage 2 fault did not - * happen during a stage 1 page table walk (the ESR_EL2.S1PTW - * bit is clear) and one of the two following cases are true: - * 1. The fault was due to a permission fault - * 2. The processor carries errata 834220 - * - * Therefore, for all non S1PTW faults where we either have a - * permission fault or the errata workaround is enabled, we - * resolve the IPA using the AT instruction. - */ - if (!(esr & ESR_ELx_S1PTW) && - (cpus_have_final_cap(ARM64_WORKAROUND_834220) || - (esr & ESR_ELx_FSC_TYPE) == FSC_PERM)) { - if (!__translate_far_to_hpfar(far, &hpfar)) - return false; - } else { - hpfar = read_sysreg(hpfar_el2); - } - - fault->far_el2 = far; - fault->hpfar_el2 = hpfar; - return true; -} - static inline bool __populate_fault_info(struct kvm_vcpu *vcpu) { - u8 ec; - u64 esr; - - esr = vcpu->arch.fault.esr_el2; - ec = ESR_ELx_EC(esr); - - if (ec != ESR_ELx_EC_DABT_LOW && ec != ESR_ELx_EC_IABT_LOW) - return true; - - return __get_fault_info(esr, &vcpu->arch.fault); + return __get_fault_info(vcpu->arch.fault.esr_el2, &vcpu->arch.fault); } static inline void __hyp_sve_save_host(struct kvm_vcpu *vcpu) @@ -225,8 +157,13 @@ static inline void __hyp_sve_restore_guest(struct kvm_vcpu *vcpu) write_sysreg_el1(__vcpu_sys_reg(vcpu, ZCR_EL1), SYS_ZCR); } -/* Check for an FPSIMD/SVE trap and handle as appropriate */ -static inline bool __hyp_handle_fpsimd(struct kvm_vcpu *vcpu) +/* + * We trap the first access to the FP/SIMD to save the host context and + * restore the guest context lazily. + * If FP/SIMD is not implemented, handle the trap and inject an undefined + * instruction exception to the guest. Similarly for trapped SVE accesses. + */ +static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) { bool sve_guest, sve_host; u8 esr_ec; @@ -244,9 +181,6 @@ static inline bool __hyp_handle_fpsimd(struct kvm_vcpu *vcpu) } esr_ec = kvm_vcpu_trap_get_class(vcpu); - if (esr_ec != ESR_ELx_EC_FP_ASIMD && - esr_ec != ESR_ELx_EC_SVE) - return false; /* Don't handle SVE traps for non-SVE vcpus here: */ if (!sve_guest && esr_ec != ESR_ELx_EC_FP_ASIMD) @@ -348,14 +282,6 @@ static inline bool handle_tx2_tvm(struct kvm_vcpu *vcpu) static inline bool esr_is_ptrauth_trap(u32 esr) { - u32 ec = ESR_ELx_EC(esr); - - if (ec == ESR_ELx_EC_PAC) - return true; - - if (ec != ESR_ELx_EC_SYS64) - return false; - switch (esr_sys64_to_sysreg(esr)) { case SYS_APIAKEYLO_EL1: case SYS_APIAKEYHI_EL1: @@ -384,13 +310,12 @@ static inline bool esr_is_ptrauth_trap(u32 esr) DECLARE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); -static inline bool __hyp_handle_ptrauth(struct kvm_vcpu *vcpu) +static bool kvm_hyp_handle_ptrauth(struct kvm_vcpu *vcpu, u64 *exit_code) { struct kvm_cpu_context *ctxt; u64 val; - if (!vcpu_has_ptrauth(vcpu) || - !esr_is_ptrauth_trap(kvm_vcpu_get_esr(vcpu))) + if (!vcpu_has_ptrauth(vcpu)) return false; ctxt = this_cpu_ptr(&kvm_hyp_ctxt); @@ -409,6 +334,90 @@ static inline bool __hyp_handle_ptrauth(struct kvm_vcpu *vcpu) return true; } +static bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_TX2_219_TVM) && + handle_tx2_tvm(vcpu)) + return true; + + if (static_branch_unlikely(&vgic_v3_cpuif_trap) && + __vgic_v3_perform_cpuif_access(vcpu) == 1) + return true; + + if (esr_is_ptrauth_trap(kvm_vcpu_get_esr(vcpu))) + return kvm_hyp_handle_ptrauth(vcpu, exit_code); + + return false; +} + +static bool kvm_hyp_handle_cp15_32(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + if (static_branch_unlikely(&vgic_v3_cpuif_trap) && + __vgic_v3_perform_cpuif_access(vcpu) == 1) + return true; + + return false; +} + +static bool kvm_hyp_handle_iabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + if (!__populate_fault_info(vcpu)) + return true; + + return false; +} + +static bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + if (!__populate_fault_info(vcpu)) + return true; + + if (static_branch_unlikely(&vgic_v2_cpuif_trap)) { + bool valid; + + valid = kvm_vcpu_trap_get_fault_type(vcpu) == FSC_FAULT && + kvm_vcpu_dabt_isvalid(vcpu) && + !kvm_vcpu_abt_issea(vcpu) && + !kvm_vcpu_abt_iss1tw(vcpu); + + if (valid) { + int ret = __vgic_v2_perform_cpuif_access(vcpu); + + if (ret == 1) + return true; + + /* Promote an illegal access to an SError.*/ + if (ret == -1) + *exit_code = ARM_EXCEPTION_EL1_SERROR; + } + } + + return false; +} + +typedef bool (*exit_handler_fn)(struct kvm_vcpu *, u64 *); + +static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu); + +/* + * Allow the hypervisor to handle the exit with an exit handler if it has one. + * + * Returns true if the hypervisor handled the exit, and control should go back + * to the guest, or false if it hasn't. + */ +static inline bool kvm_hyp_handle_exit(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + const exit_handler_fn *handlers = kvm_get_exit_handler_array(vcpu); + exit_handler_fn fn; + + fn = handlers[kvm_vcpu_trap_get_class(vcpu)]; + + if (fn) + return fn(vcpu, exit_code); + + return false; +} + /* * Return true when we were able to fixup the guest exit and should return to * the guest, false when we should restore the host state and return to the @@ -443,59 +452,9 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) if (*exit_code != ARM_EXCEPTION_TRAP) goto exit; - if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_TX2_219_TVM) && - kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_SYS64 && - handle_tx2_tvm(vcpu)) + /* Check if there's an exit handler and allow it to handle the exit. */ + if (kvm_hyp_handle_exit(vcpu, exit_code)) goto guest; - - /* - * We trap the first access to the FP/SIMD to save the host context - * and restore the guest context lazily. - * If FP/SIMD is not implemented, handle the trap and inject an - * undefined instruction exception to the guest. - * Similarly for trapped SVE accesses. - */ - if (__hyp_handle_fpsimd(vcpu)) - goto guest; - - if (__hyp_handle_ptrauth(vcpu)) - goto guest; - - if (!__populate_fault_info(vcpu)) - goto guest; - - if (static_branch_unlikely(&vgic_v2_cpuif_trap)) { - bool valid; - - valid = kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_DABT_LOW && - kvm_vcpu_trap_get_fault_type(vcpu) == FSC_FAULT && - kvm_vcpu_dabt_isvalid(vcpu) && - !kvm_vcpu_abt_issea(vcpu) && - !kvm_vcpu_abt_iss1tw(vcpu); - - if (valid) { - int ret = __vgic_v2_perform_cpuif_access(vcpu); - - if (ret == 1) - goto guest; - - /* Promote an illegal access to an SError.*/ - if (ret == -1) - *exit_code = ARM_EXCEPTION_EL1_SERROR; - - goto exit; - } - } - - if (static_branch_unlikely(&vgic_v3_cpuif_trap) && - (kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_SYS64 || - kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_CP15_32)) { - int ret = __vgic_v3_perform_cpuif_access(vcpu); - - if (ret == 1) - goto guest; - } - exit: /* Return to the host kernel and handle the exit */ return false; diff --git a/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h b/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h new file mode 100644 index 000000000000..eea1f6a53723 --- /dev/null +++ b/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h @@ -0,0 +1,200 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2021 Google LLC + * Author: Fuad Tabba <tabba@google.com> + */ + +#ifndef __ARM64_KVM_FIXED_CONFIG_H__ +#define __ARM64_KVM_FIXED_CONFIG_H__ + +#include <asm/sysreg.h> + +/* + * This file contains definitions for features to be allowed or restricted for + * guest virtual machines, depending on the mode KVM is running in and on the + * type of guest that is running. + * + * The ALLOW masks represent a bitmask of feature fields that are allowed + * without any restrictions as long as they are supported by the system. + * + * The RESTRICT_UNSIGNED masks, if present, represent unsigned fields for + * features that are restricted to support at most the specified feature. + * + * If a feature field is not present in either, than it is not supported. + * + * The approach taken for protected VMs is to allow features that are: + * - Needed by common Linux distributions (e.g., floating point) + * - Trivial to support, e.g., supporting the feature does not introduce or + * require tracking of additional state in KVM + * - Cannot be trapped or prevent the guest from using anyway + */ + +/* + * Allow for protected VMs: + * - Floating-point and Advanced SIMD + * - Data Independent Timing + */ +#define PVM_ID_AA64PFR0_ALLOW (\ + ARM64_FEATURE_MASK(ID_AA64PFR0_FP) | \ + ARM64_FEATURE_MASK(ID_AA64PFR0_ASIMD) | \ + ARM64_FEATURE_MASK(ID_AA64PFR0_DIT) \ + ) + +/* + * Restrict to the following *unsigned* features for protected VMs: + * - AArch64 guests only (no support for AArch32 guests): + * AArch32 adds complexity in trap handling, emulation, condition codes, + * etc... + * - RAS (v1) + * Supported by KVM + */ +#define PVM_ID_AA64PFR0_RESTRICT_UNSIGNED (\ + FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL0), ID_AA64PFR0_ELx_64BIT_ONLY) | \ + FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1), ID_AA64PFR0_ELx_64BIT_ONLY) | \ + FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL2), ID_AA64PFR0_ELx_64BIT_ONLY) | \ + FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL3), ID_AA64PFR0_ELx_64BIT_ONLY) | \ + FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_RAS), ID_AA64PFR0_RAS_V1) \ + ) + +/* + * Allow for protected VMs: + * - Branch Target Identification + * - Speculative Store Bypassing + */ +#define PVM_ID_AA64PFR1_ALLOW (\ + ARM64_FEATURE_MASK(ID_AA64PFR1_BT) | \ + ARM64_FEATURE_MASK(ID_AA64PFR1_SSBS) \ + ) + +/* + * Allow for protected VMs: + * - Mixed-endian + * - Distinction between Secure and Non-secure Memory + * - Mixed-endian at EL0 only + * - Non-context synchronizing exception entry and exit + */ +#define PVM_ID_AA64MMFR0_ALLOW (\ + ARM64_FEATURE_MASK(ID_AA64MMFR0_BIGENDEL) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR0_SNSMEM) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR0_BIGENDEL0) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR0_EXS) \ + ) + +/* + * Restrict to the following *unsigned* features for protected VMs: + * - 40-bit IPA + * - 16-bit ASID + */ +#define PVM_ID_AA64MMFR0_RESTRICT_UNSIGNED (\ + FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64MMFR0_PARANGE), ID_AA64MMFR0_PARANGE_40) | \ + FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64MMFR0_ASID), ID_AA64MMFR0_ASID_16) \ + ) + +/* + * Allow for protected VMs: + * - Hardware translation table updates to Access flag and Dirty state + * - Number of VMID bits from CPU + * - Hierarchical Permission Disables + * - Privileged Access Never + * - SError interrupt exceptions from speculative reads + * - Enhanced Translation Synchronization + */ +#define PVM_ID_AA64MMFR1_ALLOW (\ + ARM64_FEATURE_MASK(ID_AA64MMFR1_HADBS) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR1_VMIDBITS) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR1_HPD) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR1_PAN) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR1_SPECSEI) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR1_ETS) \ + ) + +/* + * Allow for protected VMs: + * - Common not Private translations + * - User Access Override + * - IESB bit in the SCTLR_ELx registers + * - Unaligned single-copy atomicity and atomic functions + * - ESR_ELx.EC value on an exception by read access to feature ID space + * - TTL field in address operations. + * - Break-before-make sequences when changing translation block size + * - E0PDx mechanism + */ +#define PVM_ID_AA64MMFR2_ALLOW (\ + ARM64_FEATURE_MASK(ID_AA64MMFR2_CNP) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR2_UAO) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR2_IESB) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR2_AT) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR2_IDS) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR2_TTL) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR2_BBM) | \ + ARM64_FEATURE_MASK(ID_AA64MMFR2_E0PD) \ + ) + +/* + * No support for Scalable Vectors for protected VMs: + * Requires additional support from KVM, e.g., context-switching and + * trapping at EL2 + */ +#define PVM_ID_AA64ZFR0_ALLOW (0ULL) + +/* + * No support for debug, including breakpoints, and watchpoints for protected + * VMs: + * The Arm architecture mandates support for at least the Armv8 debug + * architecture, which would include at least 2 hardware breakpoints and + * watchpoints. Providing that support to protected guests adds + * considerable state and complexity. Therefore, the reserved value of 0 is + * used for debug-related fields. + */ +#define PVM_ID_AA64DFR0_ALLOW (0ULL) +#define PVM_ID_AA64DFR1_ALLOW (0ULL) + +/* + * No support for implementation defined features. + */ +#define PVM_ID_AA64AFR0_ALLOW (0ULL) +#define PVM_ID_AA64AFR1_ALLOW (0ULL) + +/* + * No restrictions on instructions implemented in AArch64. + */ +#define PVM_ID_AA64ISAR0_ALLOW (\ + ARM64_FEATURE_MASK(ID_AA64ISAR0_AES) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_SHA1) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_SHA2) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_CRC32) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_ATOMICS) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_RDM) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_SHA3) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_SM3) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_SM4) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_DP) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_FHM) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_TS) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_TLB) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR0_RNDR) \ + ) + +#define PVM_ID_AA64ISAR1_ALLOW (\ + ARM64_FEATURE_MASK(ID_AA64ISAR1_DPB) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_APA) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_API) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_JSCVT) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_FCMA) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_LRCPC) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_GPA) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_GPI) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_FRINTTS) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_SB) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_SPECRES) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_BF16) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_DGH) | \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_I8MM) \ + ) + +u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id); +bool kvm_handle_pvm_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code); +bool kvm_handle_pvm_restricted(struct kvm_vcpu *vcpu, u64 *exit_code); +int kvm_check_pvm_sysreg_table(void); + +#endif /* __ARM64_KVM_FIXED_CONFIG_H__ */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h b/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h index 1e6d995968a1..45a84f0ade04 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h +++ b/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h @@ -15,4 +15,6 @@ #define DECLARE_REG(type, name, ctxt, reg) \ type name = (type)cpu_reg(ctxt, (reg)) +void __pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu); + #endif /* __ARM64_KVM_NVHE_TRAP_HANDLER_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index 8d741f71377f..c3c11974fa3b 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -14,7 +14,7 @@ lib-objs := $(addprefix ../../../lib/, $(lib-objs)) obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \ hyp-main.o hyp-smp.o psci-relay.o early_alloc.o stub.o page_alloc.o \ - cache.o setup.o mm.o mem_protect.o + cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ ../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o obj-y += $(lib-objs) diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S index 4b652ffb591d..0c6116d34e18 100644 --- a/arch/arm64/kvm/hyp/nvhe/host.S +++ b/arch/arm64/kvm/hyp/nvhe/host.S @@ -110,17 +110,14 @@ SYM_FUNC_START(__hyp_do_panic) b __host_enter_for_panic SYM_FUNC_END(__hyp_do_panic) -.macro host_el1_sync_vect - .align 7 -.L__vect_start\@: - stp x0, x1, [sp, #-16]! - mrs x0, esr_el2 - lsr x0, x0, #ESR_ELx_EC_SHIFT - cmp x0, #ESR_ELx_EC_HVC64 - b.ne __host_exit - +SYM_FUNC_START(__host_hvc) ldp x0, x1, [sp] // Don't fixup the stack yet + /* No stub for you, sonny Jim */ +alternative_if ARM64_KVM_PROTECTED_MODE + b __host_exit +alternative_else_nop_endif + /* Check for a stub HVC call */ cmp x0, #HVC_STUB_HCALL_NR b.hs __host_exit @@ -137,6 +134,17 @@ SYM_FUNC_END(__hyp_do_panic) ldr x5, =__kvm_handle_stub_hvc hyp_pa x5, x6 br x5 +SYM_FUNC_END(__host_hvc) + +.macro host_el1_sync_vect + .align 7 +.L__vect_start\@: + stp x0, x1, [sp, #-16]! + mrs x0, esr_el2 + lsr x0, x0, #ESR_ELx_EC_SHIFT + cmp x0, #ESR_ELx_EC_HVC64 + b.eq __host_hvc + b __host_exit .L__vect_end\@: .if ((.L__vect_end\@ - .L__vect_start\@) > 0x80) .error "host_el1_sync_vect larger than vector entry" diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 2da6aa8da868..b096bf009144 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -4,7 +4,7 @@ * Author: Andrew Scull <ascull@google.com> */ -#include <hyp/switch.h> +#include <hyp/adjust_pc.h> #include <asm/pgtable-types.h> #include <asm/kvm_asm.h> @@ -160,41 +160,65 @@ static void handle___pkvm_prot_finalize(struct kvm_cpu_context *host_ctxt) { cpu_reg(host_ctxt, 1) = __pkvm_prot_finalize(); } + +static void handle___pkvm_vcpu_init_traps(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(struct kvm_vcpu *, vcpu, host_ctxt, 1); + + __pkvm_vcpu_init_traps(kern_hyp_va(vcpu)); +} + typedef void (*hcall_t)(struct kvm_cpu_context *); #define HANDLE_FUNC(x) [__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x static const hcall_t host_hcall[] = { - HANDLE_FUNC(__kvm_vcpu_run), + /* ___kvm_hyp_init */ + HANDLE_FUNC(__kvm_get_mdcr_el2), + HANDLE_FUNC(__pkvm_init), + HANDLE_FUNC(__pkvm_create_private_mapping), + HANDLE_FUNC(__pkvm_cpu_set_vector), + HANDLE_FUNC(__kvm_enable_ssbs), + HANDLE_FUNC(__vgic_v3_init_lrs), + HANDLE_FUNC(__vgic_v3_get_gic_config), + HANDLE_FUNC(__pkvm_prot_finalize), + + HANDLE_FUNC(__pkvm_host_share_hyp), HANDLE_FUNC(__kvm_adjust_pc), + HANDLE_FUNC(__kvm_vcpu_run), HANDLE_FUNC(__kvm_flush_vm_context), HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa), HANDLE_FUNC(__kvm_tlb_flush_vmid), HANDLE_FUNC(__kvm_flush_cpu_context), HANDLE_FUNC(__kvm_timer_set_cntvoff), - HANDLE_FUNC(__kvm_enable_ssbs), - HANDLE_FUNC(__vgic_v3_get_gic_config), HANDLE_FUNC(__vgic_v3_read_vmcr), HANDLE_FUNC(__vgic_v3_write_vmcr), - HANDLE_FUNC(__vgic_v3_init_lrs), - HANDLE_FUNC(__kvm_get_mdcr_el2), HANDLE_FUNC(__vgic_v3_save_aprs), HANDLE_FUNC(__vgic_v3_restore_aprs), - HANDLE_FUNC(__pkvm_init), - HANDLE_FUNC(__pkvm_cpu_set_vector), - HANDLE_FUNC(__pkvm_host_share_hyp), - HANDLE_FUNC(__pkvm_create_private_mapping), - HANDLE_FUNC(__pkvm_prot_finalize), + HANDLE_FUNC(__pkvm_vcpu_init_traps), }; static void handle_host_hcall(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(unsigned long, id, host_ctxt, 0); + unsigned long hcall_min = 0; hcall_t hfn; + /* + * If pKVM has been initialised then reject any calls to the + * early "privileged" hypercalls. Note that we cannot reject + * calls to __pkvm_prot_finalize for two reasons: (1) The static + * key used to determine initialisation must be toggled prior to + * finalisation and (2) finalisation is performed on a per-CPU + * basis. This is all fine, however, since __pkvm_prot_finalize + * returns -EPERM after the first call for a given CPU. + */ + if (static_branch_unlikely(&kvm_protected_mode_initialized)) + hcall_min = __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize; + id -= KVM_HOST_SMCCC_ID(0); - if (unlikely(id >= ARRAY_SIZE(host_hcall))) + if (unlikely(id < hcall_min || id >= ARRAY_SIZE(host_hcall))) goto inval; hfn = host_hcall[id]; diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 34eeb524b686..c1a90dd022b8 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -11,7 +11,7 @@ #include <asm/kvm_pgtable.h> #include <asm/stage2_pgtable.h> -#include <hyp/switch.h> +#include <hyp/fault.h> #include <nvhe/gfp.h> #include <nvhe/memory.h> @@ -25,12 +25,6 @@ struct host_kvm host_kvm; static struct hyp_pool host_s2_pool; -/* - * Copies of the host's CPU features registers holding sanitized values. - */ -u64 id_aa64mmfr0_el1_sys_val; -u64 id_aa64mmfr1_el1_sys_val; - const u8 pkvm_hyp_id = 1; static void *host_s2_zalloc_pages_exact(size_t size) @@ -134,6 +128,9 @@ int __pkvm_prot_finalize(void) struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu; struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params); + if (params->hcr_el2 & HCR_VM) + return -EPERM; + params->vttbr = kvm_get_vttbr(mmu); params->vtcr = host_kvm.arch.vtcr; params->hcr_el2 |= HCR_VM; diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c new file mode 100644 index 000000000000..99c8d8b73e70 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -0,0 +1,185 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2021 Google LLC + * Author: Fuad Tabba <tabba@google.com> + */ + +#include <linux/kvm_host.h> +#include <linux/mm.h> +#include <nvhe/fixed_config.h> +#include <nvhe/trap_handler.h> + +/* + * Set trap register values based on features in ID_AA64PFR0. + */ +static void pvm_init_traps_aa64pfr0(struct kvm_vcpu *vcpu) +{ + const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR0_EL1); + u64 hcr_set = HCR_RW; + u64 hcr_clear = 0; + u64 cptr_set = 0; + + /* Protected KVM does not support AArch32 guests. */ + BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL0), + PVM_ID_AA64PFR0_RESTRICT_UNSIGNED) != ID_AA64PFR0_ELx_64BIT_ONLY); + BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1), + PVM_ID_AA64PFR0_RESTRICT_UNSIGNED) != ID_AA64PFR0_ELx_64BIT_ONLY); + + /* + * Linux guests assume support for floating-point and Advanced SIMD. Do + * not change the trapping behavior for these from the KVM default. + */ + BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_FP), + PVM_ID_AA64PFR0_ALLOW)); + BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_ASIMD), + PVM_ID_AA64PFR0_ALLOW)); + + /* Trap RAS unless all current versions are supported */ + if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_RAS), feature_ids) < + ID_AA64PFR0_RAS_V1P1) { + hcr_set |= HCR_TERR | HCR_TEA; + hcr_clear |= HCR_FIEN; + } + + /* Trap AMU */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_AMU), feature_ids)) { + hcr_clear |= HCR_AMVOFFEN; + cptr_set |= CPTR_EL2_TAM; + } + + /* Trap SVE */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_SVE), feature_ids)) + cptr_set |= CPTR_EL2_TZ; + + vcpu->arch.hcr_el2 |= hcr_set; + vcpu->arch.hcr_el2 &= ~hcr_clear; + vcpu->arch.cptr_el2 |= cptr_set; +} + +/* + * Set trap register values based on features in ID_AA64PFR1. + */ +static void pvm_init_traps_aa64pfr1(struct kvm_vcpu *vcpu) +{ + const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR1_EL1); + u64 hcr_set = 0; + u64 hcr_clear = 0; + + /* Memory Tagging: Trap and Treat as Untagged if not supported. */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR1_MTE), feature_ids)) { + hcr_set |= HCR_TID5; + hcr_clear |= HCR_DCT | HCR_ATA; + } + + vcpu->arch.hcr_el2 |= hcr_set; + vcpu->arch.hcr_el2 &= ~hcr_clear; +} + +/* + * Set trap register values based on features in ID_AA64DFR0. + */ +static void pvm_init_traps_aa64dfr0(struct kvm_vcpu *vcpu) +{ + const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64DFR0_EL1); + u64 mdcr_set = 0; + u64 mdcr_clear = 0; + u64 cptr_set = 0; + + /* Trap/constrain PMU */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_PMUVER), feature_ids)) { + mdcr_set |= MDCR_EL2_TPM | MDCR_EL2_TPMCR; + mdcr_clear |= MDCR_EL2_HPME | MDCR_EL2_MTPME | + MDCR_EL2_HPMN_MASK; + } + + /* Trap Debug */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_DEBUGVER), feature_ids)) + mdcr_set |= MDCR_EL2_TDRA | MDCR_EL2_TDA | MDCR_EL2_TDE; + + /* Trap OS Double Lock */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_DOUBLELOCK), feature_ids)) + mdcr_set |= MDCR_EL2_TDOSA; + + /* Trap SPE */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_PMSVER), feature_ids)) { + mdcr_set |= MDCR_EL2_TPMS; + mdcr_clear |= MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT; + } + + /* Trap Trace Filter */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_TRACE_FILT), feature_ids)) + mdcr_set |= MDCR_EL2_TTRF; + + /* Trap Trace */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_TRACEVER), feature_ids)) + cptr_set |= CPTR_EL2_TTA; + + vcpu->arch.mdcr_el2 |= mdcr_set; + vcpu->arch.mdcr_el2 &= ~mdcr_clear; + vcpu->arch.cptr_el2 |= cptr_set; +} + +/* + * Set trap register values based on features in ID_AA64MMFR0. + */ +static void pvm_init_traps_aa64mmfr0(struct kvm_vcpu *vcpu) +{ + const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64MMFR0_EL1); + u64 mdcr_set = 0; + + /* Trap Debug Communications Channel registers */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_FGT), feature_ids)) + mdcr_set |= MDCR_EL2_TDCC; + + vcpu->arch.mdcr_el2 |= mdcr_set; +} + +/* + * Set trap register values based on features in ID_AA64MMFR1. + */ +static void pvm_init_traps_aa64mmfr1(struct kvm_vcpu *vcpu) +{ + const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64MMFR1_EL1); + u64 hcr_set = 0; + + /* Trap LOR */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR1_LOR), feature_ids)) + hcr_set |= HCR_TLOR; + + vcpu->arch.hcr_el2 |= hcr_set; +} + +/* + * Set baseline trap register values. + */ +static void pvm_init_trap_regs(struct kvm_vcpu *vcpu) +{ + const u64 hcr_trap_feat_regs = HCR_TID3; + const u64 hcr_trap_impdef = HCR_TACR | HCR_TIDCP | HCR_TID1; + + /* + * Always trap: + * - Feature id registers: to control features exposed to guests + * - Implementation-defined features + */ + vcpu->arch.hcr_el2 |= hcr_trap_feat_regs | hcr_trap_impdef; + + /* Clear res0 and set res1 bits to trap potential new features. */ + vcpu->arch.hcr_el2 &= ~(HCR_RES0); + vcpu->arch.mdcr_el2 &= ~(MDCR_EL2_RES0); + vcpu->arch.cptr_el2 |= CPTR_NVHE_EL2_RES1; + vcpu->arch.cptr_el2 &= ~(CPTR_NVHE_EL2_RES0); +} + +/* + * Initialize trap register values for protected VMs. + */ +void __pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu) +{ + pvm_init_trap_regs(vcpu); + pvm_init_traps_aa64pfr0(vcpu); + pvm_init_traps_aa64pfr1(vcpu); + pvm_init_traps_aa64dfr0(vcpu); + pvm_init_traps_aa64mmfr0(vcpu); + pvm_init_traps_aa64mmfr1(vcpu); +} diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 57c27846320f..862c7b514e20 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -10,6 +10,7 @@ #include <asm/kvm_pgtable.h> #include <nvhe/early_alloc.h> +#include <nvhe/fixed_config.h> #include <nvhe/gfp.h> #include <nvhe/memory.h> #include <nvhe/mem_protect.h> @@ -260,6 +261,8 @@ int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus, void (*fn)(phys_addr_t params_pa, void *finalize_fn_va); int ret; + BUG_ON(kvm_check_pvm_sysreg_table()); + if (!PAGE_ALIGNED(phys) || !PAGE_ALIGNED(size)) return -EINVAL; diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index a34b01cc8ab9..c0e3fed26d93 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -27,6 +27,7 @@ #include <asm/processor.h> #include <asm/thread_info.h> +#include <nvhe/fixed_config.h> #include <nvhe/mem_protect.h> /* Non-VHE specific context */ @@ -158,6 +159,101 @@ static void __pmu_switch_to_host(struct kvm_cpu_context *host_ctxt) write_sysreg(pmu->events_host, pmcntenset_el0); } +/** + * Handler for protected VM MSR, MRS or System instruction execution in AArch64. + * + * Returns true if the hypervisor has handled the exit, and control should go + * back to the guest, or false if it hasn't. + */ +static bool kvm_handle_pvm_sys64(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + /* + * Make sure we handle the exit for workarounds and ptrauth + * before the pKVM handling, as the latter could decide to + * UNDEF. + */ + return (kvm_hyp_handle_sysreg(vcpu, exit_code) || + kvm_handle_pvm_sysreg(vcpu, exit_code)); +} + +/** + * Handler for protected floating-point and Advanced SIMD accesses. + * + * Returns true if the hypervisor has handled the exit, and control should go + * back to the guest, or false if it hasn't. + */ +static bool kvm_handle_pvm_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + /* Linux guests assume support for floating-point and Advanced SIMD. */ + BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_FP), + PVM_ID_AA64PFR0_ALLOW)); + BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_ASIMD), + PVM_ID_AA64PFR0_ALLOW)); + + return kvm_hyp_handle_fpsimd(vcpu, exit_code); +} + +static const exit_handler_fn hyp_exit_handlers[] = { + [0 ... ESR_ELx_EC_MAX] = NULL, + [ESR_ELx_EC_CP15_32] = kvm_hyp_handle_cp15_32, + [ESR_ELx_EC_SYS64] = kvm_hyp_handle_sysreg, + [ESR_ELx_EC_SVE] = kvm_hyp_handle_fpsimd, + [ESR_ELx_EC_FP_ASIMD] = kvm_hyp_handle_fpsimd, + [ESR_ELx_EC_IABT_LOW] = kvm_hyp_handle_iabt_low, + [ESR_ELx_EC_DABT_LOW] = kvm_hyp_handle_dabt_low, + [ESR_ELx_EC_PAC] = kvm_hyp_handle_ptrauth, +}; + +static const exit_handler_fn pvm_exit_handlers[] = { + [0 ... ESR_ELx_EC_MAX] = NULL, + [ESR_ELx_EC_SYS64] = kvm_handle_pvm_sys64, + [ESR_ELx_EC_SVE] = kvm_handle_pvm_restricted, + [ESR_ELx_EC_FP_ASIMD] = kvm_handle_pvm_fpsimd, + [ESR_ELx_EC_IABT_LOW] = kvm_hyp_handle_iabt_low, + [ESR_ELx_EC_DABT_LOW] = kvm_hyp_handle_dabt_low, + [ESR_ELx_EC_PAC] = kvm_hyp_handle_ptrauth, +}; + +static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu) +{ + if (unlikely(kvm_vm_is_protected(kern_hyp_va(vcpu->kvm)))) + return pvm_exit_handlers; + + return hyp_exit_handlers; +} + +/* + * Some guests (e.g., protected VMs) are not be allowed to run in AArch32. + * The ARMv8 architecture does not give the hypervisor a mechanism to prevent a + * guest from dropping to AArch32 EL0 if implemented by the CPU. If the + * hypervisor spots a guest in such a state ensure it is handled, and don't + * trust the host to spot or fix it. The check below is based on the one in + * kvm_arch_vcpu_ioctl_run(). + * + * Returns false if the guest ran in AArch32 when it shouldn't have, and + * thus should exit to the host, or true if a the guest run loop can continue. + */ +static bool handle_aarch32_guest(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + struct kvm *kvm = kern_hyp_va(vcpu->kvm); + + if (kvm_vm_is_protected(kvm) && vcpu_mode_is_32bit(vcpu)) { + /* + * As we have caught the guest red-handed, decide that it isn't + * fit for purpose anymore by making the vcpu invalid. The VMM + * can try and fix it by re-initializing the vcpu with + * KVM_ARM_VCPU_INIT, however, this is likely not possible for + * protected VMs. + */ + vcpu->arch.target = -1; + *exit_code &= BIT(ARM_EXIT_WITH_SERROR_BIT); + *exit_code |= ARM_EXCEPTION_IL; + return false; + } + + return true; +} + /* Switch to the guest for legacy non-VHE systems */ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) { @@ -220,6 +316,9 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) /* Jump in the fire! */ exit_code = __guest_enter(vcpu); + if (unlikely(!handle_aarch32_guest(vcpu, &exit_code))) + break; + /* And we're baaack! */ } while (fixup_guest_exit(vcpu, &exit_code)); diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c new file mode 100644 index 000000000000..3787ee6fb1a2 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c @@ -0,0 +1,487 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2021 Google LLC + * Author: Fuad Tabba <tabba@google.com> + */ + +#include <linux/irqchip/arm-gic-v3.h> + +#include <asm/kvm_asm.h> +#include <asm/kvm_mmu.h> + +#include <hyp/adjust_pc.h> + +#include <nvhe/fixed_config.h> + +#include "../../sys_regs.h" + +/* + * Copies of the host's CPU features registers holding sanitized values at hyp. + */ +u64 id_aa64pfr0_el1_sys_val; +u64 id_aa64pfr1_el1_sys_val; +u64 id_aa64isar0_el1_sys_val; +u64 id_aa64isar1_el1_sys_val; +u64 id_aa64mmfr0_el1_sys_val; +u64 id_aa64mmfr1_el1_sys_val; +u64 id_aa64mmfr2_el1_sys_val; + +/* + * Inject an unknown/undefined exception to an AArch64 guest while most of its + * sysregs are live. + */ +static void inject_undef64(struct kvm_vcpu *vcpu) +{ + u32 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT); + + *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR); + *vcpu_cpsr(vcpu) = read_sysreg_el2(SYS_SPSR); + + vcpu->arch.flags |= (KVM_ARM64_EXCEPT_AA64_EL1 | + KVM_ARM64_EXCEPT_AA64_ELx_SYNC | + KVM_ARM64_PENDING_EXCEPTION); + + __kvm_adjust_pc(vcpu); + + write_sysreg_el1(esr, SYS_ESR); + write_sysreg_el1(read_sysreg_el2(SYS_ELR), SYS_ELR); + write_sysreg_el2(*vcpu_pc(vcpu), SYS_ELR); + write_sysreg_el2(*vcpu_cpsr(vcpu), SYS_SPSR); +} + +/* + * Returns the restricted features values of the feature register based on the + * limitations in restrict_fields. + * A feature id field value of 0b0000 does not impose any restrictions. + * Note: Use only for unsigned feature field values. + */ +static u64 get_restricted_features_unsigned(u64 sys_reg_val, + u64 restrict_fields) +{ + u64 value = 0UL; + u64 mask = GENMASK_ULL(ARM64_FEATURE_FIELD_BITS - 1, 0); + + /* + * According to the Arm Architecture Reference Manual, feature fields + * use increasing values to indicate increases in functionality. + * Iterate over the restricted feature fields and calculate the minimum + * unsigned value between the one supported by the system, and what the + * value is being restricted to. + */ + while (sys_reg_val && restrict_fields) { + value |= min(sys_reg_val & mask, restrict_fields & mask); + sys_reg_val &= ~mask; + restrict_fields &= ~mask; + mask <<= ARM64_FEATURE_FIELD_BITS; + } + + return value; +} + +/* + * Functions that return the value of feature id registers for protected VMs + * based on allowed features, system features, and KVM support. + */ + +static u64 get_pvm_id_aa64pfr0(const struct kvm_vcpu *vcpu) +{ + const struct kvm *kvm = (const struct kvm *)kern_hyp_va(vcpu->kvm); + u64 set_mask = 0; + u64 allow_mask = PVM_ID_AA64PFR0_ALLOW; + + if (!vcpu_has_sve(vcpu)) + allow_mask &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_SVE); + + set_mask |= get_restricted_features_unsigned(id_aa64pfr0_el1_sys_val, + PVM_ID_AA64PFR0_RESTRICT_UNSIGNED); + + /* Spectre and Meltdown mitigation in KVM */ + set_mask |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_CSV2), + (u64)kvm->arch.pfr0_csv2); + set_mask |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_CSV3), + (u64)kvm->arch.pfr0_csv3); + + return (id_aa64pfr0_el1_sys_val & allow_mask) | set_mask; +} + +static u64 get_pvm_id_aa64pfr1(const struct kvm_vcpu *vcpu) +{ + const struct kvm *kvm = (const struct kvm *)kern_hyp_va(vcpu->kvm); + u64 allow_mask = PVM_ID_AA64PFR1_ALLOW; + + if (!kvm_has_mte(kvm)) + allow_mask &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_MTE); + + return id_aa64pfr1_el1_sys_val & allow_mask; +} + +static u64 get_pvm_id_aa64zfr0(const struct kvm_vcpu *vcpu) +{ + /* + * No support for Scalable Vectors, therefore, hyp has no sanitized + * copy of the feature id register. + */ + BUILD_BUG_ON(PVM_ID_AA64ZFR0_ALLOW != 0ULL); + return 0; +} + +static u64 get_pvm_id_aa64dfr0(const struct kvm_vcpu *vcpu) +{ + /* + * No support for debug, including breakpoints, and watchpoints, + * therefore, pKVM has no sanitized copy of the feature id register. + */ + BUILD_BUG_ON(PVM_ID_AA64DFR0_ALLOW != 0ULL); + return 0; +} + +static u64 get_pvm_id_aa64dfr1(const struct kvm_vcpu *vcpu) +{ + /* + * No support for debug, therefore, hyp has no sanitized copy of the + * feature id register. + */ + BUILD_BUG_ON(PVM_ID_AA64DFR1_ALLOW != 0ULL); + return 0; +} + +static u64 get_pvm_id_aa64afr0(const struct kvm_vcpu *vcpu) +{ + /* + * No support for implementation defined features, therefore, hyp has no + * sanitized copy of the feature id register. + */ + BUILD_BUG_ON(PVM_ID_AA64AFR0_ALLOW != 0ULL); + return 0; +} + +static u64 get_pvm_id_aa64afr1(const struct kvm_vcpu *vcpu) +{ + /* + * No support for implementation defined features, therefore, hyp has no + * sanitized copy of the feature id register. + */ + BUILD_BUG_ON(PVM_ID_AA64AFR1_ALLOW != 0ULL); + return 0; +} + +static u64 get_pvm_id_aa64isar0(const struct kvm_vcpu *vcpu) +{ + return id_aa64isar0_el1_sys_val & PVM_ID_AA64ISAR0_ALLOW; +} + +static u64 get_pvm_id_aa64isar1(const struct kvm_vcpu *vcpu) +{ + u64 allow_mask = PVM_ID_AA64ISAR1_ALLOW; + + if (!vcpu_has_ptrauth(vcpu)) + allow_mask &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR1_APA) | + ARM64_FEATURE_MASK(ID_AA64ISAR1_API) | + ARM64_FEATURE_MASK(ID_AA64ISAR1_GPA) | + ARM64_FEATURE_MASK(ID_AA64ISAR1_GPI)); + + return id_aa64isar1_el1_sys_val & allow_mask; +} + +static u64 get_pvm_id_aa64mmfr0(const struct kvm_vcpu *vcpu) +{ + u64 set_mask; + + set_mask = get_restricted_features_unsigned(id_aa64mmfr0_el1_sys_val, + PVM_ID_AA64MMFR0_RESTRICT_UNSIGNED); + + return (id_aa64mmfr0_el1_sys_val & PVM_ID_AA64MMFR0_ALLOW) | set_mask; +} + +static u64 get_pvm_id_aa64mmfr1(const struct kvm_vcpu *vcpu) +{ + return id_aa64mmfr1_el1_sys_val & PVM_ID_AA64MMFR1_ALLOW; +} + +static u64 get_pvm_id_aa64mmfr2(const struct kvm_vcpu *vcpu) +{ + return id_aa64mmfr2_el1_sys_val & PVM_ID_AA64MMFR2_ALLOW; +} + +/* Read a sanitized cpufeature ID register by its encoding */ +u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id) +{ + switch (id) { + case SYS_ID_AA64PFR0_EL1: + return get_pvm_id_aa64pfr0(vcpu); + case SYS_ID_AA64PFR1_EL1: + return get_pvm_id_aa64pfr1(vcpu); + case SYS_ID_AA64ZFR0_EL1: + return get_pvm_id_aa64zfr0(vcpu); + case SYS_ID_AA64DFR0_EL1: + return get_pvm_id_aa64dfr0(vcpu); + case SYS_ID_AA64DFR1_EL1: + return get_pvm_id_aa64dfr1(vcpu); + case SYS_ID_AA64AFR0_EL1: + return get_pvm_id_aa64afr0(vcpu); + case SYS_ID_AA64AFR1_EL1: + return get_pvm_id_aa64afr1(vcpu); + case SYS_ID_AA64ISAR0_EL1: + return get_pvm_id_aa64isar0(vcpu); + case SYS_ID_AA64ISAR1_EL1: + return get_pvm_id_aa64isar1(vcpu); + case SYS_ID_AA64MMFR0_EL1: + return get_pvm_id_aa64mmfr0(vcpu); + case SYS_ID_AA64MMFR1_EL1: + return get_pvm_id_aa64mmfr1(vcpu); + case SYS_ID_AA64MMFR2_EL1: + return get_pvm_id_aa64mmfr2(vcpu); + default: + /* + * Should never happen because all cases are covered in + * pvm_sys_reg_descs[]. + */ + WARN_ON(1); + break; + } + + return 0; +} + +static u64 read_id_reg(const struct kvm_vcpu *vcpu, + struct sys_reg_desc const *r) +{ + return pvm_read_id_reg(vcpu, reg_to_encoding(r)); +} + +/* Handler to RAZ/WI sysregs */ +static bool pvm_access_raz_wi(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (!p->is_write) + p->regval = 0; + + return true; +} + +/* + * Accessor for AArch32 feature id registers. + * + * The value of these registers is "unknown" according to the spec if AArch32 + * isn't supported. + */ +static bool pvm_access_id_aarch32(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) { + inject_undef64(vcpu); + return false; + } + + /* + * No support for AArch32 guests, therefore, pKVM has no sanitized copy + * of AArch32 feature id registers. + */ + BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1), + PVM_ID_AA64PFR0_RESTRICT_UNSIGNED) > ID_AA64PFR0_ELx_64BIT_ONLY); + + return pvm_access_raz_wi(vcpu, p, r); +} + +/* + * Accessor for AArch64 feature id registers. + * + * If access is allowed, set the regval to the protected VM's view of the + * register and return true. + * Otherwise, inject an undefined exception and return false. + */ +static bool pvm_access_id_aarch64(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + if (p->is_write) { + inject_undef64(vcpu); + return false; + } + + p->regval = read_id_reg(vcpu, r); + return true; +} + +static bool pvm_gic_read_sre(struct kvm_vcpu *vcpu, + struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + /* pVMs only support GICv3. 'nuf said. */ + if (!p->is_write) + p->regval = ICC_SRE_EL1_DIB | ICC_SRE_EL1_DFB | ICC_SRE_EL1_SRE; + + return true; +} + +/* Mark the specified system register as an AArch32 feature id register. */ +#define AARCH32(REG) { SYS_DESC(REG), .access = pvm_access_id_aarch32 } + +/* Mark the specified system register as an AArch64 feature id register. */ +#define AARCH64(REG) { SYS_DESC(REG), .access = pvm_access_id_aarch64 } + +/* Mark the specified system register as Read-As-Zero/Write-Ignored */ +#define RAZ_WI(REG) { SYS_DESC(REG), .access = pvm_access_raz_wi } + +/* Mark the specified system register as not being handled in hyp. */ +#define HOST_HANDLED(REG) { SYS_DESC(REG), .access = NULL } + +/* + * Architected system registers. + * Important: Must be sorted ascending by Op0, Op1, CRn, CRm, Op2 + * + * NOTE: Anything not explicitly listed here is *restricted by default*, i.e., + * it will lead to injecting an exception into the guest. + */ +static const struct sys_reg_desc pvm_sys_reg_descs[] = { + /* Cache maintenance by set/way operations are restricted. */ + + /* Debug and Trace Registers are restricted. */ + + /* AArch64 mappings of the AArch32 ID registers */ + /* CRm=1 */ + AARCH32(SYS_ID_PFR0_EL1), + AARCH32(SYS_ID_PFR1_EL1), + AARCH32(SYS_ID_DFR0_EL1), + AARCH32(SYS_ID_AFR0_EL1), + AARCH32(SYS_ID_MMFR0_EL1), + AARCH32(SYS_ID_MMFR1_EL1), + AARCH32(SYS_ID_MMFR2_EL1), + AARCH32(SYS_ID_MMFR3_EL1), + + /* CRm=2 */ + AARCH32(SYS_ID_ISAR0_EL1), + AARCH32(SYS_ID_ISAR1_EL1), + AARCH32(SYS_ID_ISAR2_EL1), + AARCH32(SYS_ID_ISAR3_EL1), + AARCH32(SYS_ID_ISAR4_EL1), + AARCH32(SYS_ID_ISAR5_EL1), + AARCH32(SYS_ID_MMFR4_EL1), + AARCH32(SYS_ID_ISAR6_EL1), + + /* CRm=3 */ + AARCH32(SYS_MVFR0_EL1), + AARCH32(SYS_MVFR1_EL1), + AARCH32(SYS_MVFR2_EL1), + AARCH32(SYS_ID_PFR2_EL1), + AARCH32(SYS_ID_DFR1_EL1), + AARCH32(SYS_ID_MMFR5_EL1), + + /* AArch64 ID registers */ + /* CRm=4 */ + AARCH64(SYS_ID_AA64PFR0_EL1), + AARCH64(SYS_ID_AA64PFR1_EL1), + AARCH64(SYS_ID_AA64ZFR0_EL1), + AARCH64(SYS_ID_AA64DFR0_EL1), + AARCH64(SYS_ID_AA64DFR1_EL1), + AARCH64(SYS_ID_AA64AFR0_EL1), + AARCH64(SYS_ID_AA64AFR1_EL1), + AARCH64(SYS_ID_AA64ISAR0_EL1), + AARCH64(SYS_ID_AA64ISAR1_EL1), + AARCH64(SYS_ID_AA64MMFR0_EL1), + AARCH64(SYS_ID_AA64MMFR1_EL1), + AARCH64(SYS_ID_AA64MMFR2_EL1), + + /* Scalable Vector Registers are restricted. */ + + RAZ_WI(SYS_ERRIDR_EL1), + RAZ_WI(SYS_ERRSELR_EL1), + RAZ_WI(SYS_ERXFR_EL1), + RAZ_WI(SYS_ERXCTLR_EL1), + RAZ_WI(SYS_ERXSTATUS_EL1), + RAZ_WI(SYS_ERXADDR_EL1), + RAZ_WI(SYS_ERXMISC0_EL1), + RAZ_WI(SYS_ERXMISC1_EL1), + + /* Performance Monitoring Registers are restricted. */ + + /* Limited Ordering Regions Registers are restricted. */ + + HOST_HANDLED(SYS_ICC_SGI1R_EL1), + HOST_HANDLED(SYS_ICC_ASGI1R_EL1), + HOST_HANDLED(SYS_ICC_SGI0R_EL1), + { SYS_DESC(SYS_ICC_SRE_EL1), .access = pvm_gic_read_sre, }, + + HOST_HANDLED(SYS_CCSIDR_EL1), + HOST_HANDLED(SYS_CLIDR_EL1), + HOST_HANDLED(SYS_CSSELR_EL1), + HOST_HANDLED(SYS_CTR_EL0), + + /* Performance Monitoring Registers are restricted. */ + + /* Activity Monitoring Registers are restricted. */ + + HOST_HANDLED(SYS_CNTP_TVAL_EL0), + HOST_HANDLED(SYS_CNTP_CTL_EL0), + HOST_HANDLED(SYS_CNTP_CVAL_EL0), + + /* Performance Monitoring Registers are restricted. */ +}; + +/* + * Checks that the sysreg table is unique and in-order. + * + * Returns 0 if the table is consistent, or 1 otherwise. + */ +int kvm_check_pvm_sysreg_table(void) +{ + unsigned int i; + + for (i = 1; i < ARRAY_SIZE(pvm_sys_reg_descs); i++) { + if (cmp_sys_reg(&pvm_sys_reg_descs[i-1], &pvm_sys_reg_descs[i]) >= 0) + return 1; + } + + return 0; +} + +/* + * Handler for protected VM MSR, MRS or System instruction execution. + * + * Returns true if the hypervisor has handled the exit, and control should go + * back to the guest, or false if it hasn't, to be handled by the host. + */ +bool kvm_handle_pvm_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + const struct sys_reg_desc *r; + struct sys_reg_params params; + unsigned long esr = kvm_vcpu_get_esr(vcpu); + int Rt = kvm_vcpu_sys_get_rt(vcpu); + + params = esr_sys64_to_params(esr); + params.regval = vcpu_get_reg(vcpu, Rt); + + r = find_reg(¶ms, pvm_sys_reg_descs, ARRAY_SIZE(pvm_sys_reg_descs)); + + /* Undefined (RESTRICTED). */ + if (r == NULL) { + inject_undef64(vcpu); + return true; + } + + /* Handled by the host (HOST_HANDLED) */ + if (r->access == NULL) + return false; + + /* Handled by hyp: skip instruction if instructed to do so. */ + if (r->access(vcpu, ¶ms, r)) + __kvm_skip_instr(vcpu); + + if (!params.is_write) + vcpu_set_reg(vcpu, Rt, params.regval); + + return true; +} + +/** + * Handler for protected VM restricted exceptions. + * + * Inject an undefined exception into the guest and return true to indicate that + * the hypervisor has handled the exit, and control should go back to the guest. + */ +bool kvm_handle_pvm_restricted(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + inject_undef64(vcpu); + return true; +} diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index 39f8f7f9227c..20db2f281cf2 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -695,9 +695,7 @@ static void __vgic_v3_read_iar(struct kvm_vcpu *vcpu, u32 vmcr, int rt) goto spurious; lr_val &= ~ICH_LR_STATE; - /* No active state for LPIs */ - if ((lr_val & ICH_LR_VIRTUAL_ID_MASK) <= VGIC_MAX_SPI) - lr_val |= ICH_LR_ACTIVE_BIT; + lr_val |= ICH_LR_ACTIVE_BIT; __gic_v3_set_lr(lr_val, lr); __vgic_v3_set_active_priority(lr_prio, vmcr, grp); vcpu_set_reg(vcpu, rt, lr_val & ICH_LR_VIRTUAL_ID_MASK); @@ -764,20 +762,18 @@ static void __vgic_v3_write_eoir(struct kvm_vcpu *vcpu, u32 vmcr, int rt) /* Drop priority in any case */ act_prio = __vgic_v3_clear_highest_active_priority(); - /* If EOIing an LPI, no deactivate to be performed */ - if (vid >= VGIC_MIN_LPI) - return; - - /* EOImode == 1, nothing to be done here */ - if (vmcr & ICH_VMCR_EOIM_MASK) - return; - lr = __vgic_v3_find_active_lr(vcpu, vid, &lr_val); if (lr == -1) { - __vgic_v3_bump_eoicount(); + /* Do not bump EOIcount for LPIs that aren't in the LRs */ + if (!(vid >= VGIC_MIN_LPI)) + __vgic_v3_bump_eoicount(); return; } + /* EOImode == 1 and not an LPI, nothing to be done here */ + if ((vmcr & ICH_VMCR_EOIM_MASK) && !(vid >= VGIC_MIN_LPI)) + return; + lr_prio = (lr_val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT; /* If priorities or group do not match, the guest has fscked-up. */ @@ -987,8 +983,6 @@ static void __vgic_v3_read_ctlr(struct kvm_vcpu *vcpu, u32 vmcr, int rt) val = ((vtr >> 29) & 7) << ICC_CTLR_EL1_PRI_BITS_SHIFT; /* IDbits */ val |= ((vtr >> 23) & 7) << ICC_CTLR_EL1_ID_BITS_SHIFT; - /* SEIS */ - val |= ((vtr >> 22) & 1) << ICC_CTLR_EL1_SEIS_SHIFT; /* A3V */ val |= ((vtr >> 21) & 1) << ICC_CTLR_EL1_A3V_SHIFT; /* EOImode */ diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index ded2c66675f0..5a2cb5d9bc4b 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -96,6 +96,22 @@ void deactivate_traps_vhe_put(struct kvm_vcpu *vcpu) __deactivate_traps_common(vcpu); } +static const exit_handler_fn hyp_exit_handlers[] = { + [0 ... ESR_ELx_EC_MAX] = NULL, + [ESR_ELx_EC_CP15_32] = kvm_hyp_handle_cp15_32, + [ESR_ELx_EC_SYS64] = kvm_hyp_handle_sysreg, + [ESR_ELx_EC_SVE] = kvm_hyp_handle_fpsimd, + [ESR_ELx_EC_FP_ASIMD] = kvm_hyp_handle_fpsimd, + [ESR_ELx_EC_IABT_LOW] = kvm_hyp_handle_iabt_low, + [ESR_ELx_EC_DABT_LOW] = kvm_hyp_handle_dabt_low, + [ESR_ELx_EC_PAC] = kvm_hyp_handle_ptrauth, +}; + +static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu) +{ + return hyp_exit_handlers; +} + /* Switch to the guest for VHE systems running in EL2 */ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) { diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 69bd1732a299..326cdfec74a1 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -512,7 +512,7 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu) return -EINVAL; } - pgt = kzalloc(sizeof(*pgt), GFP_KERNEL); + pgt = kzalloc(sizeof(*pgt), GFP_KERNEL_ACCOUNT); if (!pgt) return -ENOMEM; diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 2af3c37445e0..a5e4bbf5e68f 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -978,7 +978,7 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) mutex_lock(&vcpu->kvm->lock); if (!vcpu->kvm->arch.pmu_filter) { - vcpu->kvm->arch.pmu_filter = bitmap_alloc(nr_events, GFP_KERNEL); + vcpu->kvm->arch.pmu_filter = bitmap_alloc(nr_events, GFP_KERNEL_ACCOUNT); if (!vcpu->kvm->arch.pmu_filter) { mutex_unlock(&vcpu->kvm->lock); return -ENOMEM; diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 5ce36b0a3343..acad22ebac12 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -106,7 +106,7 @@ static int kvm_vcpu_finalize_sve(struct kvm_vcpu *vcpu) vl > SVE_VL_ARCH_MAX)) return -EIO; - buf = kzalloc(SVE_SIG_REGS_SIZE(sve_vq_from_vl(vl)), GFP_KERNEL); + buf = kzalloc(SVE_SIG_REGS_SIZE(sve_vq_from_vl(vl)), GFP_KERNEL_ACCOUNT); if (!buf) return -ENOMEM; diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 1d46e185f31e..e3ec1a44f94d 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1064,7 +1064,12 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r, bool raz) { u32 id = reg_to_encoding(r); - u64 val = raz ? 0 : read_sanitised_ftr_reg(id); + u64 val; + + if (raz) + return 0; + + val = read_sanitised_ftr_reg(id); switch (id) { case SYS_ID_AA64PFR0_EL1: @@ -1075,16 +1080,15 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu, val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_CSV2), (u64)vcpu->kvm->arch.pfr0_csv2); val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_CSV3); val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_CSV3), (u64)vcpu->kvm->arch.pfr0_csv3); + if (irqchip_in_kernel(vcpu->kvm) && + vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_GIC); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_GIC), 1); + } break; case SYS_ID_AA64PFR1_EL1: - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_MTE); - if (kvm_has_mte(vcpu->kvm)) { - u64 pfr, mte; - - pfr = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1); - mte = cpuid_feature_extract_unsigned_field(pfr, ID_AA64PFR1_MTE_SHIFT); - val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR1_MTE), mte); - } + if (!kvm_has_mte(vcpu->kvm)) + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_MTE); break; case SYS_ID_AA64ISAR1_EL1: if (!vcpu_has_ptrauth(vcpu)) @@ -1268,16 +1272,19 @@ static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, return __set_id_reg(vcpu, rd, uaddr, raz); } -static int get_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, +static int set_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - return __get_id_reg(vcpu, rd, uaddr, true); + return __set_id_reg(vcpu, rd, uaddr, true); } -static int set_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) +static int get_raz_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, + const struct kvm_one_reg *reg, void __user *uaddr) { - return __set_id_reg(vcpu, rd, uaddr, true); + const u64 id = sys_reg_to_index(rd); + const u64 val = 0; + + return reg_to_user(uaddr, &val, id); } static int set_wi_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, @@ -1388,7 +1395,7 @@ static unsigned int mte_visibility(const struct kvm_vcpu *vcpu, #define ID_UNALLOCATED(crm, op2) { \ Op0(3), Op1(0), CRn(0), CRm(crm), Op2(op2), \ .access = access_raz_id_reg, \ - .get_user = get_raz_id_reg, \ + .get_user = get_raz_reg, \ .set_user = set_raz_id_reg, \ } @@ -1400,7 +1407,7 @@ static unsigned int mte_visibility(const struct kvm_vcpu *vcpu, #define ID_HIDDEN(name) { \ SYS_DESC(SYS_##name), \ .access = access_raz_id_reg, \ - .get_user = get_raz_id_reg, \ + .get_user = get_raz_reg, \ .set_user = set_raz_id_reg, \ } @@ -1642,7 +1649,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { * previously (and pointlessly) advertised in the past... */ { PMU_SYS_REG(SYS_PMSWINC_EL0), - .get_user = get_raz_id_reg, .set_user = set_wi_reg, + .get_user = get_raz_reg, .set_user = set_wi_reg, .access = access_pmswinc, .reset = NULL }, { PMU_SYS_REG(SYS_PMSELR_EL0), .access = access_pmselr, .reset = reset_pmselr, .reg = PMSELR_EL0 }, diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index 340c51d87677..0a06d0648970 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -134,7 +134,7 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis) struct kvm_vcpu *vcpu0 = kvm_get_vcpu(kvm, 0); int i; - dist->spis = kcalloc(nr_spis, sizeof(struct vgic_irq), GFP_KERNEL); + dist->spis = kcalloc(nr_spis, sizeof(struct vgic_irq), GFP_KERNEL_ACCOUNT); if (!dist->spis) return -ENOMEM; diff --git a/arch/arm64/kvm/vgic/vgic-irqfd.c b/arch/arm64/kvm/vgic/vgic-irqfd.c index 79f8899b234c..475059bacedf 100644 --- a/arch/arm64/kvm/vgic/vgic-irqfd.c +++ b/arch/arm64/kvm/vgic/vgic-irqfd.c @@ -139,7 +139,7 @@ int kvm_vgic_setup_default_irq_routing(struct kvm *kvm) u32 nr = dist->nr_spis; int i, ret; - entries = kcalloc(nr, sizeof(*entries), GFP_KERNEL); + entries = kcalloc(nr, sizeof(*entries), GFP_KERNEL_ACCOUNT); if (!entries) return -ENOMEM; diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index 61728c543eb9..089fc2ffcb43 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -48,7 +48,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid, if (irq) return irq; - irq = kzalloc(sizeof(struct vgic_irq), GFP_KERNEL); + irq = kzalloc(sizeof(struct vgic_irq), GFP_KERNEL_ACCOUNT); if (!irq) return ERR_PTR(-ENOMEM); @@ -332,7 +332,7 @@ int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr) * we must be careful not to overrun the array. */ irq_count = READ_ONCE(dist->lpi_list_count); - intids = kmalloc_array(irq_count, sizeof(intids[0]), GFP_KERNEL); + intids = kmalloc_array(irq_count, sizeof(intids[0]), GFP_KERNEL_ACCOUNT); if (!intids) return -ENOMEM; @@ -985,7 +985,7 @@ static int vgic_its_alloc_collection(struct vgic_its *its, if (!vgic_its_check_id(its, its->baser_coll_table, coll_id, NULL)) return E_ITS_MAPC_COLLECTION_OOR; - collection = kzalloc(sizeof(*collection), GFP_KERNEL); + collection = kzalloc(sizeof(*collection), GFP_KERNEL_ACCOUNT); if (!collection) return -ENOMEM; @@ -1029,7 +1029,7 @@ static struct its_ite *vgic_its_alloc_ite(struct its_device *device, { struct its_ite *ite; - ite = kzalloc(sizeof(*ite), GFP_KERNEL); + ite = kzalloc(sizeof(*ite), GFP_KERNEL_ACCOUNT); if (!ite) return ERR_PTR(-ENOMEM); @@ -1150,7 +1150,7 @@ static struct its_device *vgic_its_alloc_device(struct vgic_its *its, { struct its_device *device; - device = kzalloc(sizeof(*device), GFP_KERNEL); + device = kzalloc(sizeof(*device), GFP_KERNEL_ACCOUNT); if (!device) return ERR_PTR(-ENOMEM); @@ -1847,7 +1847,7 @@ void vgic_lpi_translation_cache_init(struct kvm *kvm) struct vgic_translation_cache_entry *cte; /* An allocation failure is not fatal */ - cte = kzalloc(sizeof(*cte), GFP_KERNEL); + cte = kzalloc(sizeof(*cte), GFP_KERNEL_ACCOUNT); if (WARN_ON(!cte)) break; @@ -1888,7 +1888,7 @@ static int vgic_its_create(struct kvm_device *dev, u32 type) if (type != KVM_DEV_TYPE_ARM_VGIC_ITS) return -ENODEV; - its = kzalloc(sizeof(struct vgic_its), GFP_KERNEL); + its = kzalloc(sizeof(struct vgic_its), GFP_KERNEL_ACCOUNT); if (!its) return -ENOMEM; @@ -2710,8 +2710,8 @@ static int vgic_its_set_attr(struct kvm_device *dev, if (copy_from_user(&addr, uaddr, sizeof(addr))) return -EFAULT; - ret = vgic_check_ioaddr(dev->kvm, &its->vgic_its_base, - addr, SZ_64K); + ret = vgic_check_iorange(dev->kvm, its->vgic_its_base, + addr, SZ_64K, KVM_VGIC_V3_ITS_SIZE); if (ret) return ret; diff --git a/arch/arm64/kvm/vgic/vgic-kvm-device.c b/arch/arm64/kvm/vgic/vgic-kvm-device.c index 7740995de982..0d000d2fe8d2 100644 --- a/arch/arm64/kvm/vgic/vgic-kvm-device.c +++ b/arch/arm64/kvm/vgic/vgic-kvm-device.c @@ -14,17 +14,21 @@ /* common helpers */ -int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr, - phys_addr_t addr, phys_addr_t alignment) +int vgic_check_iorange(struct kvm *kvm, phys_addr_t ioaddr, + phys_addr_t addr, phys_addr_t alignment, + phys_addr_t size) { - if (addr & ~kvm_phys_mask(kvm)) - return -E2BIG; + if (!IS_VGIC_ADDR_UNDEF(ioaddr)) + return -EEXIST; - if (!IS_ALIGNED(addr, alignment)) + if (!IS_ALIGNED(addr, alignment) || !IS_ALIGNED(size, alignment)) return -EINVAL; - if (!IS_VGIC_ADDR_UNDEF(*ioaddr)) - return -EEXIST; + if (addr + size < addr) + return -EINVAL; + + if (addr & ~kvm_phys_mask(kvm) || addr + size > kvm_phys_size(kvm)) + return -E2BIG; return 0; } @@ -57,7 +61,7 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write) { int r = 0; struct vgic_dist *vgic = &kvm->arch.vgic; - phys_addr_t *addr_ptr, alignment; + phys_addr_t *addr_ptr, alignment, size; u64 undef_value = VGIC_ADDR_UNDEF; mutex_lock(&kvm->lock); @@ -66,16 +70,19 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write) r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); addr_ptr = &vgic->vgic_dist_base; alignment = SZ_4K; + size = KVM_VGIC_V2_DIST_SIZE; break; case KVM_VGIC_V2_ADDR_TYPE_CPU: r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); addr_ptr = &vgic->vgic_cpu_base; alignment = SZ_4K; + size = KVM_VGIC_V2_CPU_SIZE; break; case KVM_VGIC_V3_ADDR_TYPE_DIST: r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V3); addr_ptr = &vgic->vgic_dist_base; alignment = SZ_64K; + size = KVM_VGIC_V3_DIST_SIZE; break; case KVM_VGIC_V3_ADDR_TYPE_REDIST: { struct vgic_redist_region *rdreg; @@ -140,7 +147,7 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write) goto out; if (write) { - r = vgic_check_ioaddr(kvm, addr_ptr, *addr, alignment); + r = vgic_check_iorange(kvm, *addr_ptr, *addr, alignment, size); if (!r) *addr_ptr = *addr; } else { diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c b/arch/arm64/kvm/vgic/vgic-mmio-v3.c index a09cdc0b953c..bf7ec4a78497 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c +++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c @@ -796,7 +796,9 @@ static int vgic_v3_alloc_redist_region(struct kvm *kvm, uint32_t index, struct vgic_dist *d = &kvm->arch.vgic; struct vgic_redist_region *rdreg; struct list_head *rd_regions = &d->rd_regions; - size_t size = count * KVM_VGIC_V3_REDIST_SIZE; + int nr_vcpus = atomic_read(&kvm->online_vcpus); + size_t size = count ? count * KVM_VGIC_V3_REDIST_SIZE + : nr_vcpus * KVM_VGIC_V3_REDIST_SIZE; int ret; /* cross the end of memory ? */ @@ -834,13 +836,13 @@ static int vgic_v3_alloc_redist_region(struct kvm *kvm, uint32_t index, if (vgic_v3_rdist_overlap(kvm, base, size)) return -EINVAL; - rdreg = kzalloc(sizeof(*rdreg), GFP_KERNEL); + rdreg = kzalloc(sizeof(*rdreg), GFP_KERNEL_ACCOUNT); if (!rdreg) return -ENOMEM; rdreg->base = VGIC_ADDR_UNDEF; - ret = vgic_check_ioaddr(kvm, &rdreg->base, base, SZ_64K); + ret = vgic_check_iorange(kvm, rdreg->base, base, SZ_64K, size); if (ret) goto free; diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 21a6207fb2ee..04f62c4b07fb 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -15,6 +15,7 @@ static bool group0_trap; static bool group1_trap; static bool common_trap; +static bool dir_trap; static bool gicv4_enable; void vgic_v3_set_underflow(struct kvm_vcpu *vcpu) @@ -296,6 +297,8 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu) vgic_v3->vgic_hcr |= ICH_HCR_TALL1; if (common_trap) vgic_v3->vgic_hcr |= ICH_HCR_TC; + if (dir_trap) + vgic_v3->vgic_hcr |= ICH_HCR_TDIR; } int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq) @@ -483,8 +486,10 @@ bool vgic_v3_check_base(struct kvm *kvm) return false; list_for_each_entry(rdreg, &d->rd_regions, list) { - if (rdreg->base + vgic_v3_rd_region_size(kvm, rdreg) < - rdreg->base) + size_t sz = vgic_v3_rd_region_size(kvm, rdreg); + + if (vgic_check_iorange(kvm, VGIC_ADDR_UNDEF, + rdreg->base, SZ_64K, sz)) return false; } @@ -671,11 +676,23 @@ int vgic_v3_probe(const struct gic_kvm_info *info) group1_trap = true; } - if (group0_trap || group1_trap || common_trap) { - kvm_info("GICv3 sysreg trapping enabled ([%s%s%s], reduced performance)\n", + if (kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_SEIS_MASK) { + kvm_info("GICv3 with locally generated SEI\n"); + + group0_trap = true; + group1_trap = true; + if (ich_vtr_el2 & ICH_VTR_TDS_MASK) + dir_trap = true; + else + common_trap = true; + } + + if (group0_trap || group1_trap || common_trap | dir_trap) { + kvm_info("GICv3 sysreg trapping enabled ([%s%s%s%s], reduced performance)\n", group0_trap ? "G0" : "", group1_trap ? "G1" : "", - common_trap ? "C" : ""); + common_trap ? "C" : "", + dir_trap ? "D" : ""); static_branch_enable(&vgic_v3_cpuif_trap); } diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c index c1845d8f5f7e..772dd15a22c7 100644 --- a/arch/arm64/kvm/vgic/vgic-v4.c +++ b/arch/arm64/kvm/vgic/vgic-v4.c @@ -246,7 +246,7 @@ int vgic_v4_init(struct kvm *kvm) nr_vcpus = atomic_read(&kvm->online_vcpus); dist->its_vm.vpes = kcalloc(nr_vcpus, sizeof(*dist->its_vm.vpes), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!dist->its_vm.vpes) return -ENOMEM; diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index 14a9218641f5..3fd6c86a7ef3 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -172,8 +172,9 @@ void vgic_kick_vcpus(struct kvm *kvm); void vgic_irq_handle_resampling(struct vgic_irq *irq, bool lr_deactivated, bool lr_pending); -int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr, - phys_addr_t addr, phys_addr_t alignment); +int vgic_check_iorange(struct kvm *kvm, phys_addr_t ioaddr, + phys_addr_t addr, phys_addr_t alignment, + phys_addr_t size); void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu); void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S index d7bee210a798..83bcad72ec97 100644 --- a/arch/arm64/lib/strcmp.S +++ b/arch/arm64/lib/strcmp.S @@ -173,4 +173,4 @@ L(done): ret SYM_FUNC_END_PI(strcmp) -EXPORT_SYMBOL_NOKASAN(strcmp) +EXPORT_SYMBOL_NOHWKASAN(strcmp) diff --git a/arch/arm64/lib/strncmp.S b/arch/arm64/lib/strncmp.S index 48d44f7fddb1..e42bcfcd37e6 100644 --- a/arch/arm64/lib/strncmp.S +++ b/arch/arm64/lib/strncmp.S @@ -258,4 +258,4 @@ L(ret0): ret SYM_FUNC_END_PI(strncmp) -EXPORT_SYMBOL_NOKASAN(strncmp) +EXPORT_SYMBOL_NOHWKASAN(strncmp) diff --git a/arch/m68k/68000/entry.S b/arch/m68k/68000/entry.S index 259b3661b614..997b54933015 100644 --- a/arch/m68k/68000/entry.S +++ b/arch/m68k/68000/entry.S @@ -15,7 +15,6 @@ #include <asm/unistd.h> #include <asm/errno.h> #include <asm/setup.h> -#include <asm/segment.h> #include <asm/traps.h> #include <asm/asm-offsets.h> #include <asm/entry.h> @@ -25,7 +24,6 @@ .globl system_call .globl resume .globl ret_from_exception -.globl ret_from_signal .globl sys_call_table .globl bad_interrupt .globl inthandler1 @@ -59,8 +57,6 @@ do_trace: subql #4,%sp /* dummy return address */ SAVE_SWITCH_STACK jbsr syscall_trace_leave - -ret_from_signal: RESTORE_SWITCH_STACK addql #4,%sp jra ret_from_exception diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 774c35f47eea..0b50da08a9c5 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -29,7 +29,6 @@ config M68K select NO_DMA if !MMU && !COLDFIRE select OLD_SIGACTION select OLD_SIGSUSPEND3 - select SET_FS select UACCESS_MEMCPY if !MMU select VIRT_TO_BUS select ZONE_DMA diff --git a/arch/m68k/coldfire/entry.S b/arch/m68k/coldfire/entry.S index d43a02795a4a..9f337c70243a 100644 --- a/arch/m68k/coldfire/entry.S +++ b/arch/m68k/coldfire/entry.S @@ -31,7 +31,6 @@ #include <asm/thread_info.h> #include <asm/errno.h> #include <asm/setup.h> -#include <asm/segment.h> #include <asm/asm-offsets.h> #include <asm/entry.h> @@ -51,7 +50,6 @@ sw_usp: .globl system_call .globl resume .globl ret_from_exception -.globl ret_from_signal .globl sys_call_table .globl inthandler @@ -98,8 +96,6 @@ ENTRY(system_call) subql #4,%sp /* dummy return address */ SAVE_SWITCH_STACK jbsr syscall_trace_leave - -ret_from_signal: RESTORE_SWITCH_STACK addql #4,%sp diff --git a/arch/m68k/include/asm/processor.h b/arch/m68k/include/asm/processor.h index 3750819ac5a1..f4d82c619a5c 100644 --- a/arch/m68k/include/asm/processor.h +++ b/arch/m68k/include/asm/processor.h @@ -9,7 +9,6 @@ #define __ASM_M68K_PROCESSOR_H #include <linux/thread_info.h> -#include <asm/segment.h> #include <asm/fpu.h> #include <asm/ptrace.h> @@ -75,11 +74,37 @@ static inline void wrusp(unsigned long usp) #define TASK_UNMAPPED_BASE 0 #endif +/* Address spaces (or Function Codes in Motorola lingo) */ +#define USER_DATA 1 +#define USER_PROGRAM 2 +#define SUPER_DATA 5 +#define SUPER_PROGRAM 6 +#define CPU_SPACE 7 + +#ifdef CONFIG_CPU_HAS_ADDRESS_SPACES +/* + * Set the SFC/DFC registers for special MM operations. For most normal + * operation these remain set to USER_DATA for the uaccess routines. + */ +static inline void set_fc(unsigned long val) +{ + WARN_ON_ONCE(in_interrupt()); + + __asm__ __volatile__ ("movec %0,%/sfc\n\t" + "movec %0,%/dfc\n\t" + : /* no outputs */ : "r" (val) : "memory"); +} +#else +static inline void set_fc(unsigned long val) +{ +} +#endif /* CONFIG_CPU_HAS_ADDRESS_SPACES */ + struct thread_struct { unsigned long ksp; /* kernel stack pointer */ unsigned long usp; /* user stack pointer */ unsigned short sr; /* saved status register */ - unsigned short fs; /* saved fs (sfc, dfc) */ + unsigned short fc; /* saved fc (sfc, dfc) */ unsigned long crp[2]; /* cpu root pointer */ unsigned long esp0; /* points to SR of stack frame */ unsigned long faddr; /* info about last fault */ @@ -92,7 +117,7 @@ struct thread_struct { #define INIT_THREAD { \ .ksp = sizeof(init_stack) + (unsigned long) init_stack, \ .sr = PS_S, \ - .fs = __KERNEL_DS, \ + .fc = USER_DATA, \ } /* diff --git a/arch/m68k/include/asm/segment.h b/arch/m68k/include/asm/segment.h deleted file mode 100644 index 2b5e68a71ef7..000000000000 --- a/arch/m68k/include/asm/segment.h +++ /dev/null @@ -1,59 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _M68K_SEGMENT_H -#define _M68K_SEGMENT_H - -/* define constants */ -/* Address spaces (FC0-FC2) */ -#define USER_DATA (1) -#ifndef __USER_DS -#define __USER_DS (USER_DATA) -#endif -#define USER_PROGRAM (2) -#define SUPER_DATA (5) -#ifndef __KERNEL_DS -#define __KERNEL_DS (SUPER_DATA) -#endif -#define SUPER_PROGRAM (6) -#define CPU_SPACE (7) - -#ifndef __ASSEMBLY__ - -typedef struct { - unsigned long seg; -} mm_segment_t; - -#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) }) - -#ifdef CONFIG_CPU_HAS_ADDRESS_SPACES -/* - * Get/set the SFC/DFC registers for MOVES instructions - */ -#define USER_DS MAKE_MM_SEG(__USER_DS) -#define KERNEL_DS MAKE_MM_SEG(__KERNEL_DS) - -static inline mm_segment_t get_fs(void) -{ - mm_segment_t _v; - __asm__ ("movec %/dfc,%0":"=r" (_v.seg):); - return _v; -} - -static inline void set_fs(mm_segment_t val) -{ - __asm__ __volatile__ ("movec %0,%/sfc\n\t" - "movec %0,%/dfc\n\t" - : /* no outputs */ : "r" (val.seg) : "memory"); -} - -#else -#define USER_DS MAKE_MM_SEG(TASK_SIZE) -#define KERNEL_DS MAKE_MM_SEG(0xFFFFFFFF) -#define get_fs() (current_thread_info()->addr_limit) -#define set_fs(x) (current_thread_info()->addr_limit = (x)) -#endif - -#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) - -#endif /* __ASSEMBLY__ */ - -#endif /* _M68K_SEGMENT_H */ diff --git a/arch/m68k/include/asm/thread_info.h b/arch/m68k/include/asm/thread_info.h index 15a757073fa5..c952658ba792 100644 --- a/arch/m68k/include/asm/thread_info.h +++ b/arch/m68k/include/asm/thread_info.h @@ -4,7 +4,6 @@ #include <asm/types.h> #include <asm/page.h> -#include <asm/segment.h> /* * On machines with 4k pages we default to an 8k thread size, though we @@ -27,7 +26,6 @@ struct thread_info { struct task_struct *task; /* main task structure */ unsigned long flags; - mm_segment_t addr_limit; /* thread address space */ int preempt_count; /* 0 => preemptable, <0 => BUG */ __u32 cpu; /* should always be 0 on m68k */ unsigned long tp_value; /* thread pointer */ @@ -37,7 +35,6 @@ struct thread_info { #define INIT_THREAD_INFO(tsk) \ { \ .task = &tsk, \ - .addr_limit = KERNEL_DS, \ .preempt_count = INIT_PREEMPT_COUNT, \ } diff --git a/arch/m68k/include/asm/tlbflush.h b/arch/m68k/include/asm/tlbflush.h index a6318ccd308f..b882e2f4f551 100644 --- a/arch/m68k/include/asm/tlbflush.h +++ b/arch/m68k/include/asm/tlbflush.h @@ -13,13 +13,12 @@ static inline void flush_tlb_kernel_page(void *addr) if (CPU_IS_COLDFIRE) { mmu_write(MMUOR, MMUOR_CNL); } else if (CPU_IS_040_OR_060) { - mm_segment_t old_fs = get_fs(); - set_fs(KERNEL_DS); + set_fc(SUPER_DATA); __asm__ __volatile__(".chip 68040\n\t" "pflush (%0)\n\t" ".chip 68k" : : "a" (addr)); - set_fs(old_fs); + set_fc(USER_DATA); } else if (CPU_IS_020_OR_030) __asm__ __volatile__("pflush #4,#4,(%0)" : : "a" (addr)); } @@ -84,12 +83,8 @@ static inline void flush_tlb_mm(struct mm_struct *mm) static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr) { - if (vma->vm_mm == current->active_mm) { - mm_segment_t old_fs = force_uaccess_begin(); - + if (vma->vm_mm == current->active_mm) __flush_tlb_one(addr); - force_uaccess_end(old_fs); - } } static inline void flush_tlb_range(struct vm_area_struct *vma, diff --git a/arch/m68k/include/asm/traps.h b/arch/m68k/include/asm/traps.h index 4aff3358fbaf..a9d5c1c870d3 100644 --- a/arch/m68k/include/asm/traps.h +++ b/arch/m68k/include/asm/traps.h @@ -267,6 +267,10 @@ struct frame { } un; }; +#ifdef CONFIG_M68040 +asmlinkage void berr_040cleanup(struct frame *fp); +#endif + #endif /* __ASSEMBLY__ */ #endif /* _M68K_TRAPS_H */ diff --git a/arch/m68k/include/asm/uaccess.h b/arch/m68k/include/asm/uaccess.h index f98208ccbbcd..ba670523885c 100644 --- a/arch/m68k/include/asm/uaccess.h +++ b/arch/m68k/include/asm/uaccess.h @@ -9,13 +9,16 @@ */ #include <linux/compiler.h> #include <linux/types.h> -#include <asm/segment.h> #include <asm/extable.h> /* We let the MMU do all checking */ static inline int access_ok(const void __user *addr, unsigned long size) { + /* + * XXX: for !CONFIG_CPU_HAS_ADDRESS_SPACES this really needs to check + * for TASK_SIZE! + */ return 1; } @@ -35,12 +38,9 @@ static inline int access_ok(const void __user *addr, #define MOVES "move" #endif -extern int __put_user_bad(void); -extern int __get_user_bad(void); - -#define __put_user_asm(res, x, ptr, bwl, reg, err) \ +#define __put_user_asm(inst, res, x, ptr, bwl, reg, err) \ asm volatile ("\n" \ - "1: "MOVES"."#bwl" %2,%1\n" \ + "1: "inst"."#bwl" %2,%1\n" \ "2:\n" \ " .section .fixup,\"ax\"\n" \ " .even\n" \ @@ -56,6 +56,31 @@ asm volatile ("\n" \ : "+d" (res), "=m" (*(ptr)) \ : #reg (x), "i" (err)) +#define __put_user_asm8(inst, res, x, ptr) \ +do { \ + const void *__pu_ptr = (const void __force *)(ptr); \ + \ + asm volatile ("\n" \ + "1: "inst".l %2,(%1)+\n" \ + "2: "inst".l %R2,(%1)\n" \ + "3:\n" \ + " .section .fixup,\"ax\"\n" \ + " .even\n" \ + "10: movel %3,%0\n" \ + " jra 3b\n" \ + " .previous\n" \ + "\n" \ + " .section __ex_table,\"a\"\n" \ + " .align 4\n" \ + " .long 1b,10b\n" \ + " .long 2b,10b\n" \ + " .long 3b,10b\n" \ + " .previous" \ + : "+d" (res), "+a" (__pu_ptr) \ + : "r" (x), "i" (-EFAULT) \ + : "memory"); \ +} while (0) + /* * These are the main single-value transfer routines. They automatically * use the right size if we just have the right pointer type. @@ -68,51 +93,29 @@ asm volatile ("\n" \ __chk_user_ptr(ptr); \ switch (sizeof (*(ptr))) { \ case 1: \ - __put_user_asm(__pu_err, __pu_val, ptr, b, d, -EFAULT); \ + __put_user_asm(MOVES, __pu_err, __pu_val, ptr, b, d, -EFAULT); \ break; \ case 2: \ - __put_user_asm(__pu_err, __pu_val, ptr, w, r, -EFAULT); \ + __put_user_asm(MOVES, __pu_err, __pu_val, ptr, w, r, -EFAULT); \ break; \ case 4: \ - __put_user_asm(__pu_err, __pu_val, ptr, l, r, -EFAULT); \ + __put_user_asm(MOVES, __pu_err, __pu_val, ptr, l, r, -EFAULT); \ break; \ case 8: \ - { \ - const void __user *__pu_ptr = (ptr); \ - asm volatile ("\n" \ - "1: "MOVES".l %2,(%1)+\n" \ - "2: "MOVES".l %R2,(%1)\n" \ - "3:\n" \ - " .section .fixup,\"ax\"\n" \ - " .even\n" \ - "10: movel %3,%0\n" \ - " jra 3b\n" \ - " .previous\n" \ - "\n" \ - " .section __ex_table,\"a\"\n" \ - " .align 4\n" \ - " .long 1b,10b\n" \ - " .long 2b,10b\n" \ - " .long 3b,10b\n" \ - " .previous" \ - : "+d" (__pu_err), "+a" (__pu_ptr) \ - : "r" (__pu_val), "i" (-EFAULT) \ - : "memory"); \ + __put_user_asm8(MOVES, __pu_err, __pu_val, ptr); \ break; \ - } \ default: \ - __pu_err = __put_user_bad(); \ - break; \ + BUILD_BUG(); \ } \ __pu_err; \ }) #define put_user(x, ptr) __put_user(x, ptr) -#define __get_user_asm(res, x, ptr, type, bwl, reg, err) ({ \ +#define __get_user_asm(inst, res, x, ptr, type, bwl, reg, err) ({ \ type __gu_val; \ asm volatile ("\n" \ - "1: "MOVES"."#bwl" %2,%1\n" \ + "1: "inst"."#bwl" %2,%1\n" \ "2:\n" \ " .section .fixup,\"ax\"\n" \ " .even\n" \ @@ -130,53 +133,57 @@ asm volatile ("\n" \ (x) = (__force typeof(*(ptr)))(__force unsigned long)__gu_val; \ }) +#define __get_user_asm8(inst, res, x, ptr) \ +do { \ + const void *__gu_ptr = (const void __force *)(ptr); \ + union { \ + u64 l; \ + __typeof__(*(ptr)) t; \ + } __gu_val; \ + \ + asm volatile ("\n" \ + "1: "inst".l (%2)+,%1\n" \ + "2: "inst".l (%2),%R1\n" \ + "3:\n" \ + " .section .fixup,\"ax\"\n" \ + " .even\n" \ + "10: move.l %3,%0\n" \ + " sub.l %1,%1\n" \ + " sub.l %R1,%R1\n" \ + " jra 3b\n" \ + " .previous\n" \ + "\n" \ + " .section __ex_table,\"a\"\n" \ + " .align 4\n" \ + " .long 1b,10b\n" \ + " .long 2b,10b\n" \ + " .previous" \ + : "+d" (res), "=&r" (__gu_val.l), \ + "+a" (__gu_ptr) \ + : "i" (-EFAULT) \ + : "memory"); \ + (x) = __gu_val.t; \ +} while (0) + #define __get_user(x, ptr) \ ({ \ int __gu_err = 0; \ __chk_user_ptr(ptr); \ switch (sizeof(*(ptr))) { \ case 1: \ - __get_user_asm(__gu_err, x, ptr, u8, b, d, -EFAULT); \ + __get_user_asm(MOVES, __gu_err, x, ptr, u8, b, d, -EFAULT); \ break; \ case 2: \ - __get_user_asm(__gu_err, x, ptr, u16, w, r, -EFAULT); \ + __get_user_asm(MOVES, __gu_err, x, ptr, u16, w, r, -EFAULT); \ break; \ case 4: \ - __get_user_asm(__gu_err, x, ptr, u32, l, r, -EFAULT); \ + __get_user_asm(MOVES, __gu_err, x, ptr, u32, l, r, -EFAULT); \ break; \ - case 8: { \ - const void __user *__gu_ptr = (ptr); \ - union { \ - u64 l; \ - __typeof__(*(ptr)) t; \ - } __gu_val; \ - asm volatile ("\n" \ - "1: "MOVES".l (%2)+,%1\n" \ - "2: "MOVES".l (%2),%R1\n" \ - "3:\n" \ - " .section .fixup,\"ax\"\n" \ - " .even\n" \ - "10: move.l %3,%0\n" \ - " sub.l %1,%1\n" \ - " sub.l %R1,%R1\n" \ - " jra 3b\n" \ - " .previous\n" \ - "\n" \ - " .section __ex_table,\"a\"\n" \ - " .align 4\n" \ - " .long 1b,10b\n" \ - " .long 2b,10b\n" \ - " .previous" \ - : "+d" (__gu_err), "=&r" (__gu_val.l), \ - "+a" (__gu_ptr) \ - : "i" (-EFAULT) \ - : "memory"); \ - (x) = __gu_val.t; \ + case 8: \ + __get_user_asm8(MOVES, __gu_err, x, ptr); \ break; \ - } \ default: \ - __gu_err = __get_user_bad(); \ - break; \ + BUILD_BUG(); \ } \ __gu_err; \ }) @@ -322,16 +329,19 @@ __constant_copy_to_user(void __user *to, const void *from, unsigned long n) switch (n) { case 1: - __put_user_asm(res, *(u8 *)from, (u8 __user *)to, b, d, 1); + __put_user_asm(MOVES, res, *(u8 *)from, (u8 __user *)to, + b, d, 1); break; case 2: - __put_user_asm(res, *(u16 *)from, (u16 __user *)to, w, r, 2); + __put_user_asm(MOVES, res, *(u16 *)from, (u16 __user *)to, + w, r, 2); break; case 3: __constant_copy_to_user_asm(res, to, from, tmp, 3, w, b,); break; case 4: - __put_user_asm(res, *(u32 *)from, (u32 __user *)to, l, r, 4); + __put_user_asm(MOVES, res, *(u32 *)from, (u32 __user *)to, + l, r, 4); break; case 5: __constant_copy_to_user_asm(res, to, from, tmp, 5, l, b,); @@ -380,8 +390,65 @@ raw_copy_to_user(void __user *to, const void *from, unsigned long n) #define INLINE_COPY_FROM_USER #define INLINE_COPY_TO_USER -#define user_addr_max() \ - (uaccess_kernel() ? ~0UL : TASK_SIZE) +#define HAVE_GET_KERNEL_NOFAULT + +#define __get_kernel_nofault(dst, src, type, err_label) \ +do { \ + type *__gk_dst = (type *)(dst); \ + type *__gk_src = (type *)(src); \ + int __gk_err = 0; \ + \ + switch (sizeof(type)) { \ + case 1: \ + __get_user_asm("move", __gk_err, *__gk_dst, __gk_src, \ + u8, b, d, -EFAULT); \ + break; \ + case 2: \ + __get_user_asm("move", __gk_err, *__gk_dst, __gk_src, \ + u16, w, r, -EFAULT); \ + break; \ + case 4: \ + __get_user_asm("move", __gk_err, *__gk_dst, __gk_src, \ + u32, l, r, -EFAULT); \ + break; \ + case 8: \ + __get_user_asm8("move", __gk_err, *__gk_dst, __gk_src); \ + break; \ + default: \ + BUILD_BUG(); \ + } \ + if (unlikely(__gk_err)) \ + goto err_label; \ +} while (0) + +#define __put_kernel_nofault(dst, src, type, err_label) \ +do { \ + type __pk_src = *(type *)(src); \ + type *__pk_dst = (type *)(dst); \ + int __pk_err = 0; \ + \ + switch (sizeof(type)) { \ + case 1: \ + __put_user_asm("move", __pk_err, __pk_src, __pk_dst, \ + b, d, -EFAULT); \ + break; \ + case 2: \ + __put_user_asm("move", __pk_err, __pk_src, __pk_dst, \ + w, r, -EFAULT); \ + break; \ + case 4: \ + __put_user_asm("move", __pk_err, __pk_src, __pk_dst, \ + l, r, -EFAULT); \ + break; \ + case 8: \ + __put_user_asm8("move", __pk_err, __pk_src, __pk_dst); \ + break; \ + default: \ + BUILD_BUG(); \ + } \ + if (unlikely(__pk_err)) \ + goto err_label; \ +} while (0) extern long strncpy_from_user(char *dst, const char __user *src, long count); extern __must_check long strnlen_user(const char __user *str, long n); diff --git a/arch/m68k/kernel/asm-offsets.c b/arch/m68k/kernel/asm-offsets.c index ccea355052ef..906d73230537 100644 --- a/arch/m68k/kernel/asm-offsets.c +++ b/arch/m68k/kernel/asm-offsets.c @@ -31,7 +31,7 @@ int main(void) DEFINE(THREAD_KSP, offsetof(struct thread_struct, ksp)); DEFINE(THREAD_USP, offsetof(struct thread_struct, usp)); DEFINE(THREAD_SR, offsetof(struct thread_struct, sr)); - DEFINE(THREAD_FS, offsetof(struct thread_struct, fs)); + DEFINE(THREAD_FC, offsetof(struct thread_struct, fc)); DEFINE(THREAD_CRP, offsetof(struct thread_struct, crp)); DEFINE(THREAD_ESP0, offsetof(struct thread_struct, esp0)); DEFINE(THREAD_FPREG, offsetof(struct thread_struct, fp)); diff --git a/arch/m68k/kernel/entry.S b/arch/m68k/kernel/entry.S index 9dd76fbb7c6b..9434fca68de5 100644 --- a/arch/m68k/kernel/entry.S +++ b/arch/m68k/kernel/entry.S @@ -36,7 +36,6 @@ #include <linux/linkage.h> #include <asm/errno.h> #include <asm/setup.h> -#include <asm/segment.h> #include <asm/traps.h> #include <asm/unistd.h> #include <asm/asm-offsets.h> @@ -78,20 +77,38 @@ ENTRY(__sys_clone3) ENTRY(sys_sigreturn) SAVE_SWITCH_STACK - movel %sp,%sp@- | switch_stack pointer - pea %sp@(SWITCH_STACK_SIZE+4) | pt_regs pointer + movel %sp,%a1 | switch_stack pointer + lea %sp@(SWITCH_STACK_SIZE),%a0 | pt_regs pointer + lea %sp@(-84),%sp | leave a gap + movel %a1,%sp@- + movel %a0,%sp@- jbsr do_sigreturn - addql #8,%sp - RESTORE_SWITCH_STACK - rts + jra 1f | shared with rt_sigreturn() ENTRY(sys_rt_sigreturn) SAVE_SWITCH_STACK - movel %sp,%sp@- | switch_stack pointer - pea %sp@(SWITCH_STACK_SIZE+4) | pt_regs pointer + movel %sp,%a1 | switch_stack pointer + lea %sp@(SWITCH_STACK_SIZE),%a0 | pt_regs pointer + lea %sp@(-84),%sp | leave a gap + movel %a1,%sp@- + movel %a0,%sp@- + | stack contents: + | [original pt_regs address] [original switch_stack address] + | [gap] [switch_stack] [pt_regs] [exception frame] jbsr do_rt_sigreturn - addql #8,%sp + +1: + | stack contents now: + | [original pt_regs address] [original switch_stack address] + | [unused part of the gap] [moved switch_stack] [moved pt_regs] + | [replacement exception frame] + | return value of do_{rt_,}sigreturn() points to moved switch_stack. + + movel %d0,%sp | discard the leftover junk RESTORE_SWITCH_STACK + | stack contents now is just [syscall return address] [pt_regs] [frame] + | return pt_regs.d0 + movel %sp@(PT_OFF_D0+4),%d0 rts ENTRY(buserr) @@ -182,25 +199,6 @@ do_trace_exit: addql #4,%sp jra .Lret_from_exception -ENTRY(ret_from_signal) - movel %curptr@(TASK_STACK),%a1 - tstb %a1@(TINFO_FLAGS+2) - jge 1f - jbsr syscall_trace -1: RESTORE_SWITCH_STACK - addql #4,%sp -/* on 68040 complete pending writebacks if any */ -#ifdef CONFIG_M68040 - bfextu %sp@(PT_OFF_FORMATVEC){#0,#4},%d0 - subql #7,%d0 | bus error frame ? - jbne 1f - movel %sp,%sp@- - jbsr berr_040cleanup - addql #4,%sp -1: -#endif - jra .Lret_from_exception - ENTRY(system_call) SAVE_ALL_SYS @@ -338,7 +336,7 @@ resume: /* save fs (sfc,%dfc) (may be pointing to kernel memory) */ movec %sfc,%d0 - movew %d0,%a0@(TASK_THREAD+THREAD_FS) + movew %d0,%a0@(TASK_THREAD+THREAD_FC) /* save usp */ /* it is better to use a movel here instead of a movew 8*) */ @@ -424,7 +422,7 @@ resume: movel %a0,%usp /* restore fs (sfc,%dfc) */ - movew %a1@(TASK_THREAD+THREAD_FS),%a0 + movew %a1@(TASK_THREAD+THREAD_FC),%a0 movec %a0,%sfc movec %a0,%dfc diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c index db49f9091711..1ab692b952cd 100644 --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@ -92,7 +92,7 @@ void show_regs(struct pt_regs * regs) void flush_thread(void) { - current->thread.fs = __USER_DS; + current->thread.fc = USER_DATA; #ifdef CONFIG_FPU if (!FPU_IS_EMU) { unsigned long zero = 0; @@ -155,7 +155,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg, * Must save the current SFC/DFC value, NOT the value when * the parent was last descheduled - RGH 10-08-96 */ - p->thread.fs = get_fs().seg; + p->thread.fc = USER_DATA; if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { /* kernel thread */ diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c index 8f215e79e70e..338817d0cb3f 100644 --- a/arch/m68k/kernel/signal.c +++ b/arch/m68k/kernel/signal.c @@ -447,7 +447,7 @@ static inline void save_fpu_state(struct sigcontext *sc, struct pt_regs *regs) if (CPU_IS_060 ? sc->sc_fpstate[2] : sc->sc_fpstate[0]) { fpu_version = sc->sc_fpstate[0]; - if (CPU_IS_020_OR_030 && + if (CPU_IS_020_OR_030 && !regs->stkadj && regs->vector >= (VEC_FPBRUC * 4) && regs->vector <= (VEC_FPNAN * 4)) { /* Clear pending exception in 68882 idle frame */ @@ -510,7 +510,7 @@ static inline int rt_save_fpu_state(struct ucontext __user *uc, struct pt_regs * if (!(CPU_IS_060 || CPU_IS_COLDFIRE)) context_size = fpstate[1]; fpu_version = fpstate[0]; - if (CPU_IS_020_OR_030 && + if (CPU_IS_020_OR_030 && !regs->stkadj && regs->vector >= (VEC_FPBRUC * 4) && regs->vector <= (VEC_FPNAN * 4)) { /* Clear pending exception in 68882 idle frame */ @@ -641,56 +641,35 @@ static inline void siginfo_build_tests(void) static int mangle_kernel_stack(struct pt_regs *regs, int formatvec, void __user *fp) { - int fsize = frame_extra_sizes(formatvec >> 12); - if (fsize < 0) { + int extra = frame_extra_sizes(formatvec >> 12); + char buf[sizeof_field(struct frame, un)]; + + if (extra < 0) { /* * user process trying to return with weird frame format */ pr_debug("user process returning with weird frame format\n"); - return 1; + return -1; } - if (!fsize) { - regs->format = formatvec >> 12; - regs->vector = formatvec & 0xfff; - } else { - struct switch_stack *sw = (struct switch_stack *)regs - 1; - /* yes, twice as much as max(sizeof(frame.un.fmt<x>)) */ - unsigned long buf[sizeof_field(struct frame, un) / 2]; - - /* that'll make sure that expansion won't crap over data */ - if (copy_from_user(buf + fsize / 4, fp, fsize)) - return 1; - - /* point of no return */ - regs->format = formatvec >> 12; - regs->vector = formatvec & 0xfff; -#define frame_offset (sizeof(struct pt_regs)+sizeof(struct switch_stack)) - __asm__ __volatile__ ( -#ifdef CONFIG_COLDFIRE - " movel %0,%/sp\n\t" - " bra ret_from_signal\n" -#else - " movel %0,%/a0\n\t" - " subl %1,%/a0\n\t" /* make room on stack */ - " movel %/a0,%/sp\n\t" /* set stack pointer */ - /* move switch_stack and pt_regs */ - "1: movel %0@+,%/a0@+\n\t" - " dbra %2,1b\n\t" - " lea %/sp@(%c3),%/a0\n\t" /* add offset of fmt */ - " lsrl #2,%1\n\t" - " subql #1,%1\n\t" - /* copy to the gap we'd made */ - "2: movel %4@+,%/a0@+\n\t" - " dbra %1,2b\n\t" - " bral ret_from_signal\n" + if (extra && copy_from_user(buf, fp, extra)) + return -1; + regs->format = formatvec >> 12; + regs->vector = formatvec & 0xfff; + if (extra) { + void *p = (struct switch_stack *)regs - 1; + struct frame *new = (void *)regs - extra; + int size = sizeof(struct pt_regs)+sizeof(struct switch_stack); + + memmove(p - extra, p, size); + memcpy(p - extra + size, buf, extra); + current->thread.esp0 = (unsigned long)&new->ptregs; +#ifdef CONFIG_M68040 + /* on 68040 complete pending writebacks if any */ + if (new->ptregs.format == 7) // bus error frame + berr_040cleanup(new); #endif - : /* no outputs, it doesn't ever return */ - : "a" (sw), "d" (fsize), "d" (frame_offset/4-1), - "n" (frame_offset), "a" (buf + fsize/4) - : "a0"); -#undef frame_offset } - return 0; + return extra; } static inline int @@ -698,7 +677,6 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *usc, void __u { int formatvec; struct sigcontext context; - int err = 0; siginfo_build_tests(); @@ -707,7 +685,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *usc, void __u /* get previous context */ if (copy_from_user(&context, usc, sizeof(context))) - goto badframe; + return -1; /* restore passed registers */ regs->d0 = context.sc_d0; @@ -720,15 +698,10 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *usc, void __u wrusp(context.sc_usp); formatvec = context.sc_formatvec; - err = restore_fpu_state(&context); + if (restore_fpu_state(&context)) + return -1; - if (err || mangle_kernel_stack(regs, formatvec, fp)) - goto badframe; - - return 0; - -badframe: - return 1; + return mangle_kernel_stack(regs, formatvec, fp); } static inline int @@ -745,7 +718,7 @@ rt_restore_ucontext(struct pt_regs *regs, struct switch_stack *sw, err = __get_user(temp, &uc->uc_mcontext.version); if (temp != MCONTEXT_VERSION) - goto badframe; + return -1; /* restore passed registers */ err |= __get_user(regs->d0, &gregs[0]); err |= __get_user(regs->d1, &gregs[1]); @@ -774,22 +747,17 @@ rt_restore_ucontext(struct pt_regs *regs, struct switch_stack *sw, err |= restore_altstack(&uc->uc_stack); if (err) - goto badframe; + return -1; - if (mangle_kernel_stack(regs, temp, &uc->uc_extra)) - goto badframe; - - return 0; - -badframe: - return 1; + return mangle_kernel_stack(regs, temp, &uc->uc_extra); } -asmlinkage int do_sigreturn(struct pt_regs *regs, struct switch_stack *sw) +asmlinkage void *do_sigreturn(struct pt_regs *regs, struct switch_stack *sw) { unsigned long usp = rdusp(); struct sigframe __user *frame = (struct sigframe __user *)(usp - 4); sigset_t set; + int size; if (!access_ok(frame, sizeof(*frame))) goto badframe; @@ -801,20 +769,22 @@ asmlinkage int do_sigreturn(struct pt_regs *regs, struct switch_stack *sw) set_current_blocked(&set); - if (restore_sigcontext(regs, &frame->sc, frame + 1)) + size = restore_sigcontext(regs, &frame->sc, frame + 1); + if (size < 0) goto badframe; - return regs->d0; + return (void *)sw - size; badframe: force_sig(SIGSEGV); - return 0; + return sw; } -asmlinkage int do_rt_sigreturn(struct pt_regs *regs, struct switch_stack *sw) +asmlinkage void *do_rt_sigreturn(struct pt_regs *regs, struct switch_stack *sw) { unsigned long usp = rdusp(); struct rt_sigframe __user *frame = (struct rt_sigframe __user *)(usp - 4); sigset_t set; + int size; if (!access_ok(frame, sizeof(*frame))) goto badframe; @@ -823,27 +793,34 @@ asmlinkage int do_rt_sigreturn(struct pt_regs *regs, struct switch_stack *sw) set_current_blocked(&set); - if (rt_restore_ucontext(regs, sw, &frame->uc)) + size = rt_restore_ucontext(regs, sw, &frame->uc); + if (size < 0) goto badframe; - return regs->d0; + return (void *)sw - size; badframe: force_sig(SIGSEGV); - return 0; + return sw; +} + +static inline struct pt_regs *rte_regs(struct pt_regs *regs) +{ + return (void *)regs + regs->stkadj; } static void setup_sigcontext(struct sigcontext *sc, struct pt_regs *regs, unsigned long mask) { + struct pt_regs *tregs = rte_regs(regs); sc->sc_mask = mask; sc->sc_usp = rdusp(); sc->sc_d0 = regs->d0; sc->sc_d1 = regs->d1; sc->sc_a0 = regs->a0; sc->sc_a1 = regs->a1; - sc->sc_sr = regs->sr; - sc->sc_pc = regs->pc; - sc->sc_formatvec = regs->format << 12 | regs->vector; + sc->sc_sr = tregs->sr; + sc->sc_pc = tregs->pc; + sc->sc_formatvec = tregs->format << 12 | tregs->vector; save_a5_state(sc, regs); save_fpu_state(sc, regs); } @@ -851,6 +828,7 @@ static void setup_sigcontext(struct sigcontext *sc, struct pt_regs *regs, static inline int rt_setup_ucontext(struct ucontext __user *uc, struct pt_regs *regs) { struct switch_stack *sw = (struct switch_stack *)regs - 1; + struct pt_regs *tregs = rte_regs(regs); greg_t __user *gregs = uc->uc_mcontext.gregs; int err = 0; @@ -871,9 +849,9 @@ static inline int rt_setup_ucontext(struct ucontext __user *uc, struct pt_regs * err |= __put_user(sw->a5, &gregs[13]); err |= __put_user(sw->a6, &gregs[14]); err |= __put_user(rdusp(), &gregs[15]); - err |= __put_user(regs->pc, &gregs[16]); - err |= __put_user(regs->sr, &gregs[17]); - err |= __put_user((regs->format << 12) | regs->vector, &uc->uc_formatvec); + err |= __put_user(tregs->pc, &gregs[16]); + err |= __put_user(tregs->sr, &gregs[17]); + err |= __put_user((tregs->format << 12) | tregs->vector, &uc->uc_formatvec); err |= rt_save_fpu_state(uc, regs); return err; } @@ -890,13 +868,14 @@ static int setup_frame(struct ksignal *ksig, sigset_t *set, struct pt_regs *regs) { struct sigframe __user *frame; - int fsize = frame_extra_sizes(regs->format); + struct pt_regs *tregs = rte_regs(regs); + int fsize = frame_extra_sizes(tregs->format); struct sigcontext context; int err = 0, sig = ksig->sig; if (fsize < 0) { pr_debug("setup_frame: Unknown frame format %#x\n", - regs->format); + tregs->format); return -EFAULT; } @@ -907,7 +886,7 @@ static int setup_frame(struct ksignal *ksig, sigset_t *set, err |= __put_user(sig, &frame->sig); - err |= __put_user(regs->vector, &frame->code); + err |= __put_user(tregs->vector, &frame->code); err |= __put_user(&frame->sc, &frame->psc); if (_NSIG_WORDS > 1) @@ -934,33 +913,27 @@ static int setup_frame(struct ksignal *ksig, sigset_t *set, push_cache ((unsigned long) &frame->retcode); /* - * Set up registers for signal handler. All the state we are about - * to destroy is successfully copied to sigframe. - */ - wrusp ((unsigned long) frame); - regs->pc = (unsigned long) ksig->ka.sa.sa_handler; - adjustformat(regs); - - /* * This is subtle; if we build more than one sigframe, all but the * first one will see frame format 0 and have fsize == 0, so we won't * screw stkadj. */ - if (fsize) + if (fsize) { regs->stkadj = fsize; - - /* Prepare to skip over the extra stuff in the exception frame. */ - if (regs->stkadj) { - struct pt_regs *tregs = - (struct pt_regs *)((ulong)regs + regs->stkadj); + tregs = rte_regs(regs); pr_debug("Performing stackadjust=%04lx\n", regs->stkadj); - /* This must be copied with decreasing addresses to - handle overlaps. */ tregs->vector = 0; tregs->format = 0; - tregs->pc = regs->pc; tregs->sr = regs->sr; } + + /* + * Set up registers for signal handler. All the state we are about + * to destroy is successfully copied to sigframe. + */ + wrusp ((unsigned long) frame); + tregs->pc = (unsigned long) ksig->ka.sa.sa_handler; + adjustformat(regs); + return 0; } @@ -968,7 +941,8 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, struct pt_regs *regs) { struct rt_sigframe __user *frame; - int fsize = frame_extra_sizes(regs->format); + struct pt_regs *tregs = rte_regs(regs); + int fsize = frame_extra_sizes(tregs->format); int err = 0, sig = ksig->sig; if (fsize < 0) { @@ -1019,33 +993,26 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, push_cache ((unsigned long) &frame->retcode); /* - * Set up registers for signal handler. All the state we are about - * to destroy is successfully copied to sigframe. - */ - wrusp ((unsigned long) frame); - regs->pc = (unsigned long) ksig->ka.sa.sa_handler; - adjustformat(regs); - - /* * This is subtle; if we build more than one sigframe, all but the * first one will see frame format 0 and have fsize == 0, so we won't * screw stkadj. */ - if (fsize) + if (fsize) { regs->stkadj = fsize; - - /* Prepare to skip over the extra stuff in the exception frame. */ - if (regs->stkadj) { - struct pt_regs *tregs = - (struct pt_regs *)((ulong)regs + regs->stkadj); + tregs = rte_regs(regs); pr_debug("Performing stackadjust=%04lx\n", regs->stkadj); - /* This must be copied with decreasing addresses to - handle overlaps. */ tregs->vector = 0; tregs->format = 0; - tregs->pc = regs->pc; tregs->sr = regs->sr; } + + /* + * Set up registers for signal handler. All the state we are about + * to destroy is successfully copied to sigframe. + */ + wrusp ((unsigned long) frame); + tregs->pc = (unsigned long) ksig->ka.sa.sa_handler; + adjustformat(regs); return 0; } diff --git a/arch/m68k/kernel/traps.c b/arch/m68k/kernel/traps.c index 5b19fcdcd69e..9718ce94cc84 100644 --- a/arch/m68k/kernel/traps.c +++ b/arch/m68k/kernel/traps.c @@ -181,9 +181,8 @@ static inline void access_error060 (struct frame *fp) static inline unsigned long probe040(int iswrite, unsigned long addr, int wbs) { unsigned long mmusr; - mm_segment_t old_fs = get_fs(); - set_fs(MAKE_MM_SEG(wbs)); + set_fc(wbs); if (iswrite) asm volatile (".chip 68040; ptestw (%0); .chip 68k" : : "a" (addr)); @@ -192,7 +191,7 @@ static inline unsigned long probe040(int iswrite, unsigned long addr, int wbs) asm volatile (".chip 68040; movec %%mmusr,%0; .chip 68k" : "=r" (mmusr)); - set_fs(old_fs); + set_fc(USER_DATA); return mmusr; } @@ -201,10 +200,8 @@ static inline int do_040writeback1(unsigned short wbs, unsigned long wba, unsigned long wbd) { int res = 0; - mm_segment_t old_fs = get_fs(); - /* set_fs can not be moved, otherwise put_user() may oops */ - set_fs(MAKE_MM_SEG(wbs)); + set_fc(wbs); switch (wbs & WBSIZ_040) { case BA_SIZE_BYTE: @@ -218,9 +215,7 @@ static inline int do_040writeback1(unsigned short wbs, unsigned long wba, break; } - /* set_fs can not be moved, otherwise put_user() may oops */ - set_fs(old_fs); - + set_fc(USER_DATA); pr_debug("do_040writeback1, res=%d\n", res); diff --git a/arch/m68k/mac/misc.c b/arch/m68k/mac/misc.c index 90f4e9ca1276..4fab34791758 100644 --- a/arch/m68k/mac/misc.c +++ b/arch/m68k/mac/misc.c @@ -18,7 +18,6 @@ #include <linux/uaccess.h> #include <asm/io.h> -#include <asm/segment.h> #include <asm/setup.h> #include <asm/macintosh.h> #include <asm/mac_via.h> diff --git a/arch/m68k/mm/cache.c b/arch/m68k/mm/cache.c index b486c0889eec..dde978e66f14 100644 --- a/arch/m68k/mm/cache.c +++ b/arch/m68k/mm/cache.c @@ -49,24 +49,7 @@ static unsigned long virt_to_phys_slow(unsigned long vaddr) if (mmusr & MMU_R_040) return (mmusr & PAGE_MASK) | (vaddr & ~PAGE_MASK); } else { - unsigned short mmusr; - unsigned long *descaddr; - - asm volatile ("ptestr %3,%2@,#7,%0\n\t" - "pmove %%psr,%1" - : "=a&" (descaddr), "=m" (mmusr) - : "a" (vaddr), "d" (get_fs().seg)); - if (mmusr & (MMU_I|MMU_B|MMU_L)) - return 0; - descaddr = phys_to_virt((unsigned long)descaddr); - switch (mmusr & MMU_NUM) { - case 1: - return (*descaddr & 0xfe000000) | (vaddr & 0x01ffffff); - case 2: - return (*descaddr & 0xfffc0000) | (vaddr & 0x0003ffff); - case 3: - return (*descaddr & PAGE_MASK) | (vaddr & ~PAGE_MASK); - } + WARN_ON_ONCE(!CPU_IS_040_OR_060); } return 0; } @@ -107,11 +90,9 @@ void flush_icache_user_range(unsigned long address, unsigned long endaddr) void flush_icache_range(unsigned long address, unsigned long endaddr) { - mm_segment_t old_fs = get_fs(); - - set_fs(KERNEL_DS); + set_fc(SUPER_DATA); flush_icache_user_range(address, endaddr); - set_fs(old_fs); + set_fc(USER_DATA); } EXPORT_SYMBOL(flush_icache_range); diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c index 5d749e188246..1b47bec15832 100644 --- a/arch/m68k/mm/init.c +++ b/arch/m68k/mm/init.c @@ -72,12 +72,6 @@ void __init paging_init(void) if (!empty_zero_page) panic("%s: Failed to allocate %lu bytes align=0x%lx\n", __func__, PAGE_SIZE, PAGE_SIZE); - - /* - * Set up SFC/DFC registers (user data space). - */ - set_fs (USER_DS); - max_zone_pfn[ZONE_DMA] = end_mem >> PAGE_SHIFT; free_area_init(max_zone_pfn); } diff --git a/arch/m68k/mm/kmap.c b/arch/m68k/mm/kmap.c index 1269d513b221..20ddf71b43d0 100644 --- a/arch/m68k/mm/kmap.c +++ b/arch/m68k/mm/kmap.c @@ -17,7 +17,6 @@ #include <linux/vmalloc.h> #include <asm/setup.h> -#include <asm/segment.h> #include <asm/page.h> #include <asm/io.h> #include <asm/tlbflush.h> diff --git a/arch/m68k/mm/memory.c b/arch/m68k/mm/memory.c index fe75aecfb238..c2c03b0a1567 100644 --- a/arch/m68k/mm/memory.c +++ b/arch/m68k/mm/memory.c @@ -15,7 +15,6 @@ #include <linux/gfp.h> #include <asm/setup.h> -#include <asm/segment.h> #include <asm/page.h> #include <asm/traps.h> #include <asm/machdep.h> diff --git a/arch/m68k/mm/motorola.c b/arch/m68k/mm/motorola.c index 3a653f0a4188..9f3f77785aa7 100644 --- a/arch/m68k/mm/motorola.c +++ b/arch/m68k/mm/motorola.c @@ -467,7 +467,7 @@ void __init paging_init(void) /* * Set up SFC/DFC registers */ - set_fs(KERNEL_DS); + set_fc(USER_DATA); #ifdef DEBUG printk ("before free_area_init\n"); diff --git a/arch/m68k/sun3/config.c b/arch/m68k/sun3/config.c index f7dd47232b6c..203f428a0344 100644 --- a/arch/m68k/sun3/config.c +++ b/arch/m68k/sun3/config.c @@ -31,7 +31,6 @@ #include <asm/intersil.h> #include <asm/irq.h> #include <asm/sections.h> -#include <asm/segment.h> #include <asm/sun3ints.h> char sun3_reserved_pmeg[SUN3_PMEGS_NUM]; @@ -89,7 +88,7 @@ void __init sun3_init(void) sun3_reserved_pmeg[249] = 1; sun3_reserved_pmeg[252] = 1; sun3_reserved_pmeg[253] = 1; - set_fs(KERNEL_DS); + set_fc(USER_DATA); } /* Without this, Bad Things happen when something calls arch_reset. */ diff --git a/arch/m68k/sun3/mmu_emu.c b/arch/m68k/sun3/mmu_emu.c index 7aa879b7c7ff..7ec20817c0c9 100644 --- a/arch/m68k/sun3/mmu_emu.c +++ b/arch/m68k/sun3/mmu_emu.c @@ -23,7 +23,6 @@ #include <linux/uaccess.h> #include <asm/page.h> #include <asm/sun3mmu.h> -#include <asm/segment.h> #include <asm/oplib.h> #include <asm/mmu_context.h> #include <asm/dvma.h> @@ -191,14 +190,13 @@ void __init mmu_emu_init(unsigned long bootmem_end) for(seg = 0; seg < PAGE_OFFSET; seg += SUN3_PMEG_SIZE) sun3_put_segmap(seg, SUN3_INVALID_PMEG); - set_fs(MAKE_MM_SEG(3)); + set_fc(3); for(seg = 0; seg < 0x10000000; seg += SUN3_PMEG_SIZE) { i = sun3_get_segmap(seg); for(j = 1; j < CONTEXTS_NUM; j++) (*(romvec->pv_setctxt))(j, (void *)seg, i); } - set_fs(KERNEL_DS); - + set_fc(USER_DATA); } /* erase the mappings for a dead context. Uses the pg_dir for hints diff --git a/arch/m68k/sun3/sun3ints.c b/arch/m68k/sun3/sun3ints.c index 41ae422119d3..36cc280a4505 100644 --- a/arch/m68k/sun3/sun3ints.c +++ b/arch/m68k/sun3/sun3ints.c @@ -11,7 +11,6 @@ #include <linux/sched.h> #include <linux/kernel_stat.h> #include <linux/interrupt.h> -#include <asm/segment.h> #include <asm/intersil.h> #include <asm/oplib.h> #include <asm/sun3ints.h> diff --git a/arch/m68k/sun3x/prom.c b/arch/m68k/sun3x/prom.c index 74d2fe57524b..64c23bfaa90c 100644 --- a/arch/m68k/sun3x/prom.c +++ b/arch/m68k/sun3x/prom.c @@ -14,7 +14,6 @@ #include <asm/traps.h> #include <asm/sun3xprom.h> #include <asm/idprom.h> -#include <asm/segment.h> #include <asm/sun3ints.h> #include <asm/openprom.h> #include <asm/machines.h> diff --git a/arch/mips/net/bpf_jit.c b/arch/mips/net/bpf_jit.c index 0af88622c619..cb6d22439f71 100644 --- a/arch/mips/net/bpf_jit.c +++ b/arch/mips/net/bpf_jit.c @@ -662,6 +662,11 @@ static void build_epilogue(struct jit_ctx *ctx) ((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative : func) : \ func##_positive) +static bool is_bad_offset(int b_off) +{ + return b_off > 0x1ffff || b_off < -0x20000; +} + static int build_body(struct jit_ctx *ctx) { const struct bpf_prog *prog = ctx->skf; @@ -728,7 +733,10 @@ load_common: /* Load return register on DS for failures */ emit_reg_move(r_ret, r_zero, ctx); /* Return with error */ - emit_b(b_imm(prog->len, ctx), ctx); + b_off = b_imm(prog->len, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_b(b_off, ctx); emit_nop(ctx); break; case BPF_LD | BPF_W | BPF_IND: @@ -775,8 +783,10 @@ load_ind: emit_jalr(MIPS_R_RA, r_s0, ctx); emit_reg_move(MIPS_R_A0, r_skb, ctx); /* delay slot */ /* Check the error value */ - emit_bcond(MIPS_COND_NE, r_ret, 0, - b_imm(prog->len, ctx), ctx); + b_off = b_imm(prog->len, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_bcond(MIPS_COND_NE, r_ret, 0, b_off, ctx); emit_reg_move(r_ret, r_zero, ctx); /* We are good */ /* X <- P[1:K] & 0xf */ @@ -855,8 +865,10 @@ load_ind: /* A /= X */ ctx->flags |= SEEN_X | SEEN_A; /* Check if r_X is zero */ - emit_bcond(MIPS_COND_EQ, r_X, r_zero, - b_imm(prog->len, ctx), ctx); + b_off = b_imm(prog->len, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_bcond(MIPS_COND_EQ, r_X, r_zero, b_off, ctx); emit_load_imm(r_ret, 0, ctx); /* delay slot */ emit_div(r_A, r_X, ctx); break; @@ -864,8 +876,10 @@ load_ind: /* A %= X */ ctx->flags |= SEEN_X | SEEN_A; /* Check if r_X is zero */ - emit_bcond(MIPS_COND_EQ, r_X, r_zero, - b_imm(prog->len, ctx), ctx); + b_off = b_imm(prog->len, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_bcond(MIPS_COND_EQ, r_X, r_zero, b_off, ctx); emit_load_imm(r_ret, 0, ctx); /* delay slot */ emit_mod(r_A, r_X, ctx); break; @@ -926,7 +940,10 @@ load_ind: break; case BPF_JMP | BPF_JA: /* pc += K */ - emit_b(b_imm(i + k + 1, ctx), ctx); + b_off = b_imm(i + k + 1, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_b(b_off, ctx); emit_nop(ctx); break; case BPF_JMP | BPF_JEQ | BPF_K: @@ -1056,12 +1073,16 @@ jmp_cmp: break; case BPF_RET | BPF_A: ctx->flags |= SEEN_A; - if (i != prog->len - 1) + if (i != prog->len - 1) { /* * If this is not the last instruction * then jump to the epilogue */ - emit_b(b_imm(prog->len, ctx), ctx); + b_off = b_imm(prog->len, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_b(b_off, ctx); + } emit_reg_move(r_ret, r_A, ctx); /* delay slot */ break; case BPF_RET | BPF_K: @@ -1075,7 +1096,10 @@ jmp_cmp: * If this is not the last instruction * then jump to the epilogue */ - emit_b(b_imm(prog->len, ctx), ctx); + b_off = b_imm(prog->len, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_b(b_off, ctx); emit_nop(ctx); } break; @@ -1133,8 +1157,10 @@ jmp_cmp: /* Load *dev pointer */ emit_load_ptr(r_s0, r_skb, off, ctx); /* error (0) in the delay slot */ - emit_bcond(MIPS_COND_EQ, r_s0, r_zero, - b_imm(prog->len, ctx), ctx); + b_off = b_imm(prog->len, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_bcond(MIPS_COND_EQ, r_s0, r_zero, b_off, ctx); emit_reg_move(r_ret, r_zero, ctx); if (code == (BPF_ANC | SKF_AD_IFINDEX)) { BUILD_BUG_ON(sizeof_field(struct net_device, ifindex) != 4); @@ -1244,7 +1270,10 @@ void bpf_jit_compile(struct bpf_prog *fp) /* Generate the actual JIT code */ build_prologue(&ctx); - build_body(&ctx); + if (build_body(&ctx)) { + module_memfree(ctx.target); + goto out; + } build_epilogue(&ctx); /* Update the icache */ diff --git a/arch/nios2/Kconfig.debug b/arch/nios2/Kconfig.debug index a8bc06e96ef5..ca1beb87f987 100644 --- a/arch/nios2/Kconfig.debug +++ b/arch/nios2/Kconfig.debug @@ -3,9 +3,10 @@ config EARLY_PRINTK bool "Activate early kernel debugging" default y + depends on TTY select SERIAL_CORE_CONSOLE help - Enable early printk on console + Enable early printk on console. This is useful for kernel debugging when your machine crashes very early before the console code is initialized. You should normally say N here, unless you want to debug such a crash. diff --git a/arch/nios2/kernel/setup.c b/arch/nios2/kernel/setup.c index cf8d687a2644..40bc8fb75e0b 100644 --- a/arch/nios2/kernel/setup.c +++ b/arch/nios2/kernel/setup.c @@ -149,8 +149,6 @@ static void __init find_limits(unsigned long *min, unsigned long *max_low, void __init setup_arch(char **cmdline_p) { - int dram_start; - console_verbose(); memory_start = memblock_start_of_DRAM(); diff --git a/arch/s390/include/asm/ccwgroup.h b/arch/s390/include/asm/ccwgroup.h index 36dbf5043fc0..aa995d91cd1d 100644 --- a/arch/s390/include/asm/ccwgroup.h +++ b/arch/s390/include/asm/ccwgroup.h @@ -55,7 +55,7 @@ int ccwgroup_create_dev(struct device *root, struct ccwgroup_driver *gdrv, int num_devices, const char *buf); extern int ccwgroup_set_online(struct ccwgroup_device *gdev); -extern int ccwgroup_set_offline(struct ccwgroup_device *gdev); +int ccwgroup_set_offline(struct ccwgroup_device *gdev, bool call_gdrv); extern int ccwgroup_probe_ccwdev(struct ccw_device *cdev); extern void ccwgroup_remove_ccwdev(struct ccw_device *cdev); diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 88419263a89a..840d8594437d 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -248,8 +248,7 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1) #define EMIT6_PCREL(op1, op2, b1, b2, i, off, mask) \ ({ \ - /* Branch instruction needs 6 bytes */ \ - int rel = (addrs[(i) + (off) + 1] - (addrs[(i) + 1] - 6)) / 2;\ + int rel = (addrs[(i) + (off) + 1] - jit->prg) / 2; \ _EMIT6((op1) | reg(b1, b2) << 16 | (rel & 0xffff), (op2) | (mask));\ REG_SET_SEEN(b1); \ REG_SET_SEEN(b2); \ @@ -761,10 +760,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT4(0xb9080000, dst_reg, src_reg); break; case BPF_ALU | BPF_ADD | BPF_K: /* dst = (u32) dst + (u32) imm */ - if (!imm) - break; - /* alfi %dst,imm */ - EMIT6_IMM(0xc20b0000, dst_reg, imm); + if (imm != 0) { + /* alfi %dst,imm */ + EMIT6_IMM(0xc20b0000, dst_reg, imm); + } EMIT_ZERO(dst_reg); break; case BPF_ALU64 | BPF_ADD | BPF_K: /* dst = dst + imm */ @@ -786,17 +785,22 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT4(0xb9090000, dst_reg, src_reg); break; case BPF_ALU | BPF_SUB | BPF_K: /* dst = (u32) dst - (u32) imm */ - if (!imm) - break; - /* alfi %dst,-imm */ - EMIT6_IMM(0xc20b0000, dst_reg, -imm); + if (imm != 0) { + /* alfi %dst,-imm */ + EMIT6_IMM(0xc20b0000, dst_reg, -imm); + } EMIT_ZERO(dst_reg); break; case BPF_ALU64 | BPF_SUB | BPF_K: /* dst = dst - imm */ if (!imm) break; - /* agfi %dst,-imm */ - EMIT6_IMM(0xc2080000, dst_reg, -imm); + if (imm == -0x80000000) { + /* algfi %dst,0x80000000 */ + EMIT6_IMM(0xc20a0000, dst_reg, 0x80000000); + } else { + /* agfi %dst,-imm */ + EMIT6_IMM(0xc2080000, dst_reg, -imm); + } break; /* * BPF_MUL @@ -811,10 +815,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT4(0xb90c0000, dst_reg, src_reg); break; case BPF_ALU | BPF_MUL | BPF_K: /* dst = (u32) dst * (u32) imm */ - if (imm == 1) - break; - /* msfi %r5,imm */ - EMIT6_IMM(0xc2010000, dst_reg, imm); + if (imm != 1) { + /* msfi %r5,imm */ + EMIT6_IMM(0xc2010000, dst_reg, imm); + } EMIT_ZERO(dst_reg); break; case BPF_ALU64 | BPF_MUL | BPF_K: /* dst = dst * imm */ @@ -867,6 +871,8 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, if (BPF_OP(insn->code) == BPF_MOD) /* lhgi %dst,0 */ EMIT4_IMM(0xa7090000, dst_reg, 0); + else + EMIT_ZERO(dst_reg); break; } /* lhi %w0,0 */ @@ -999,10 +1005,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT4(0xb9820000, dst_reg, src_reg); break; case BPF_ALU | BPF_XOR | BPF_K: /* dst = (u32) dst ^ (u32) imm */ - if (!imm) - break; - /* xilf %dst,imm */ - EMIT6_IMM(0xc0070000, dst_reg, imm); + if (imm != 0) { + /* xilf %dst,imm */ + EMIT6_IMM(0xc0070000, dst_reg, imm); + } EMIT_ZERO(dst_reg); break; case BPF_ALU64 | BPF_XOR | BPF_K: /* dst = dst ^ imm */ @@ -1033,10 +1039,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT6_DISP_LH(0xeb000000, 0x000d, dst_reg, dst_reg, src_reg, 0); break; case BPF_ALU | BPF_LSH | BPF_K: /* dst = (u32) dst << (u32) imm */ - if (imm == 0) - break; - /* sll %dst,imm(%r0) */ - EMIT4_DISP(0x89000000, dst_reg, REG_0, imm); + if (imm != 0) { + /* sll %dst,imm(%r0) */ + EMIT4_DISP(0x89000000, dst_reg, REG_0, imm); + } EMIT_ZERO(dst_reg); break; case BPF_ALU64 | BPF_LSH | BPF_K: /* dst = dst << imm */ @@ -1058,10 +1064,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT6_DISP_LH(0xeb000000, 0x000c, dst_reg, dst_reg, src_reg, 0); break; case BPF_ALU | BPF_RSH | BPF_K: /* dst = (u32) dst >> (u32) imm */ - if (imm == 0) - break; - /* srl %dst,imm(%r0) */ - EMIT4_DISP(0x88000000, dst_reg, REG_0, imm); + if (imm != 0) { + /* srl %dst,imm(%r0) */ + EMIT4_DISP(0x88000000, dst_reg, REG_0, imm); + } EMIT_ZERO(dst_reg); break; case BPF_ALU64 | BPF_RSH | BPF_K: /* dst = dst >> imm */ @@ -1083,10 +1089,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT6_DISP_LH(0xeb000000, 0x000a, dst_reg, dst_reg, src_reg, 0); break; case BPF_ALU | BPF_ARSH | BPF_K: /* ((s32) dst >> imm */ - if (imm == 0) - break; - /* sra %dst,imm(%r0) */ - EMIT4_DISP(0x8a000000, dst_reg, REG_0, imm); + if (imm != 0) { + /* sra %dst,imm(%r0) */ + EMIT4_DISP(0x8a000000, dst_reg, REG_0, imm); + } EMIT_ZERO(dst_reg); break; case BPF_ALU64 | BPF_ARSH | BPF_K: /* ((s64) dst) >>= imm */ diff --git a/arch/sh/include/asm/pgtable-3level.h b/arch/sh/include/asm/pgtable-3level.h index 56bf35c2f29c..cdced80a7ffa 100644 --- a/arch/sh/include/asm/pgtable-3level.h +++ b/arch/sh/include/asm/pgtable-3level.h @@ -34,7 +34,7 @@ typedef struct { unsigned long long pmd; } pmd_t; static inline pmd_t *pud_pgtable(pud_t pud) { - return (pmd_t *)pud_val(pud); + return (pmd_t *)(unsigned long)pud_val(pud); } /* only used by the stubbed out hugetlb gup code, should never be called */ diff --git a/arch/sparc/lib/iomap.c b/arch/sparc/lib/iomap.c index c9da9f139694..f3a8cd491ce0 100644 --- a/arch/sparc/lib/iomap.c +++ b/arch/sparc/lib/iomap.c @@ -19,8 +19,10 @@ void ioport_unmap(void __iomem *addr) EXPORT_SYMBOL(ioport_map); EXPORT_SYMBOL(ioport_unmap); +#ifdef CONFIG_PCI void pci_iounmap(struct pci_dev *dev, void __iomem * addr) { /* nothing to do */ } EXPORT_SYMBOL(pci_iounmap); +#endif diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index dad7f85dcdea..ab83c22d274e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2610,7 +2610,6 @@ config PCI_OLPC config PCI_XEN def_bool y depends on PCI && XEN - select SWIOTLB_XEN config MMCONF_FAM10H def_bool y diff --git a/arch/x86/crypto/sm4-aesni-avx-asm_64.S b/arch/x86/crypto/sm4-aesni-avx-asm_64.S index fa2c3f50aecb..18d2f5199194 100644 --- a/arch/x86/crypto/sm4-aesni-avx-asm_64.S +++ b/arch/x86/crypto/sm4-aesni-avx-asm_64.S @@ -367,10 +367,11 @@ SYM_FUNC_START(sm4_aesni_avx_crypt8) * %rdx: src (1..8 blocks) * %rcx: num blocks (1..8) */ - FRAME_BEGIN - cmpq $5, %rcx; jb sm4_aesni_avx_crypt4; + + FRAME_BEGIN + vmovdqu (0 * 16)(%rdx), RA0; vmovdqu (1 * 16)(%rdx), RA1; vmovdqu (2 * 16)(%rdx), RA2; diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 2a57dbed4894..6dfa8ddaa60f 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2465,6 +2465,7 @@ static int x86_pmu_event_init(struct perf_event *event) if (err) { if (event->destroy) event->destroy(event); + event->destroy = NULL; } if (READ_ONCE(x86_pmu.attr_rdpmc) && diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 7011e87be6d0..9a044438072b 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -263,6 +263,7 @@ static struct event_constraint intel_icl_event_constraints[] = { INTEL_EVENT_CONSTRAINT_RANGE(0xa8, 0xb0, 0xf), INTEL_EVENT_CONSTRAINT_RANGE(0xb7, 0xbd, 0xf), INTEL_EVENT_CONSTRAINT_RANGE(0xd0, 0xe6, 0xf), + INTEL_EVENT_CONSTRAINT(0xef, 0xf), INTEL_EVENT_CONSTRAINT_RANGE(0xf0, 0xf4, 0xf), EVENT_CONSTRAINT_END }; diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h index 5c7bcaa79623..1d5f14aff5f6 100644 --- a/arch/x86/include/asm/pkeys.h +++ b/arch/x86/include/asm/pkeys.h @@ -2,8 +2,6 @@ #ifndef _ASM_X86_PKEYS_H #define _ASM_X86_PKEYS_H -#define ARCH_DEFAULT_PKEY 0 - /* * If more than 16 keys are ever supported, a thorough audit * will be necessary to ensure that the types that store key diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index f3fbb84ff8a7..68c257a3de0d 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -275,7 +275,7 @@ static inline int enqcmds(void __iomem *dst, const void *src) { const struct { char _[64]; } *__src = src; struct { char _[64]; } __iomem *__dst = dst; - int zf; + bool zf; /* * ENQCMDS %(rdx), rax diff --git a/arch/x86/include/asm/xen/swiotlb-xen.h b/arch/x86/include/asm/xen/swiotlb-xen.h index 6b56d0d45d15..66b4ddde7743 100644 --- a/arch/x86/include/asm/xen/swiotlb-xen.h +++ b/arch/x86/include/asm/xen/swiotlb-xen.h @@ -3,14 +3,10 @@ #define _ASM_X86_SWIOTLB_XEN_H #ifdef CONFIG_SWIOTLB_XEN -extern int xen_swiotlb; extern int __init pci_xen_swiotlb_detect(void); -extern void __init pci_xen_swiotlb_init(void); extern int pci_xen_swiotlb_init_late(void); #else -#define xen_swiotlb (0) -static inline int __init pci_xen_swiotlb_detect(void) { return 0; } -static inline void __init pci_xen_swiotlb_init(void) { } +#define pci_xen_swiotlb_detect NULL static inline int pci_xen_swiotlb_init_late(void) { return -ENXIO; } #endif diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 79f164141116..40ed44ead063 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -830,6 +830,20 @@ void __init setup_arch(char **cmdline_p) x86_init.oem.arch_setup(); + /* + * Do some memory reservations *before* memory is added to memblock, so + * memblock allocations won't overwrite it. + * + * After this point, everything still needed from the boot loader or + * firmware or kernel text should be early reserved or marked not RAM in + * e820. All other memory is free game. + * + * This call needs to happen before e820__memory_setup() which calls the + * xen_memory_setup() on Xen dom0 which relies on the fact that those + * early reservations have happened already. + */ + early_reserve_memory(); + iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; e820__memory_setup(); parse_setup_data(); @@ -876,18 +890,6 @@ void __init setup_arch(char **cmdline_p) parse_early_param(); - /* - * Do some memory reservations *before* memory is added to - * memblock, so memblock allocations won't overwrite it. - * Do it after early param, so we could get (unlikely) panic from - * serial. - * - * After this point everything still needed from the boot loader or - * firmware or kernel text should be early reserved or marked not - * RAM in e820. All other memory is free game. - */ - early_reserve_memory(); - #ifdef CONFIG_MEMORY_HOTPLUG /* * Memory used by the kernel cannot be hot-removed because Linux diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 532791ffcbb9..28b1a4e57827 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -435,7 +435,6 @@ static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop); __FOP_RET(#op) asm(".pushsection .fixup, \"ax\"\n" - ".global kvm_fastop_exception \n" "kvm_fastop_exception: xor %esi, %esi; ret\n" ".popsection"); diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c index 058f19b20465..c565def611e2 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c @@ -37,10 +37,10 @@ ((insn)->next_byte + sizeof(t) + n <= (insn)->end_kaddr) #define __get_next(t, insn) \ - ({ t r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); leXX_to_cpu(t, r); }) + ({ t r; memcpy(&r, insn->next_byte, sizeof(t)); insn->next_byte += sizeof(t); leXX_to_cpu(t, r); }) #define __peek_nbyte_next(t, insn, n) \ - ({ t r = *(t*)((insn)->next_byte + n); leXX_to_cpu(t, r); }) + ({ t r; memcpy(&r, (insn)->next_byte + n, sizeof(t)); leXX_to_cpu(t, r); }) #define get_next(t, insn) \ ({ if (unlikely(!validate_next(t, insn, 0))) goto err_out; __get_next(t, insn); }) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index b2eefdefc108..84a2c8c4af73 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -710,7 +710,8 @@ oops: static noinline void kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code, - unsigned long address, int signal, int si_code) + unsigned long address, int signal, int si_code, + u32 pkey) { WARN_ON_ONCE(user_mode(regs)); @@ -735,8 +736,12 @@ kernelmode_fixup_or_oops(struct pt_regs *regs, unsigned long error_code, set_signal_archinfo(address, error_code); - /* XXX: hwpoison faults will set the wrong code. */ - force_sig_fault(signal, si_code, (void __user *)address); + if (si_code == SEGV_PKUERR) { + force_sig_pkuerr((void __user *)address, pkey); + } else { + /* XXX: hwpoison faults will set the wrong code. */ + force_sig_fault(signal, si_code, (void __user *)address); + } } /* @@ -798,7 +803,8 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, struct task_struct *tsk = current; if (!user_mode(regs)) { - kernelmode_fixup_or_oops(regs, error_code, address, pkey, si_code); + kernelmode_fixup_or_oops(regs, error_code, address, + SIGSEGV, si_code, pkey); return; } @@ -930,7 +936,8 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, { /* Kernel mode? Handle exceptions or die: */ if (!user_mode(regs)) { - kernelmode_fixup_or_oops(regs, error_code, address, SIGBUS, BUS_ADRERR); + kernelmode_fixup_or_oops(regs, error_code, address, + SIGBUS, BUS_ADRERR, ARCH_DEFAULT_PKEY); return; } @@ -1396,7 +1403,8 @@ good_area: */ if (!user_mode(regs)) kernelmode_fixup_or_oops(regs, error_code, address, - SIGBUS, BUS_ADRERR); + SIGBUS, BUS_ADRERR, + ARCH_DEFAULT_PKEY); return; } @@ -1416,7 +1424,8 @@ good_area: return; if (fatal_signal_pending(current) && !user_mode(regs)) { - kernelmode_fixup_or_oops(regs, error_code, address, 0, 0); + kernelmode_fixup_or_oops(regs, error_code, address, + 0, 0, ARCH_DEFAULT_PKEY); return; } @@ -1424,7 +1433,8 @@ good_area: /* Kernel mode? Handle exceptions or die: */ if (!user_mode(regs)) { kernelmode_fixup_or_oops(regs, error_code, address, - SIGSEGV, SEGV_MAPERR); + SIGSEGV, SEGV_MAPERR, + ARCH_DEFAULT_PKEY); return; } diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 0fe6aacef3db..9ea57389c554 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1341,9 +1341,10 @@ st: if (is_imm8(insn->off)) if (insn->imm == (BPF_AND | BPF_FETCH) || insn->imm == (BPF_OR | BPF_FETCH) || insn->imm == (BPF_XOR | BPF_FETCH)) { - u8 *branch_target; bool is64 = BPF_SIZE(insn->code) == BPF_DW; u32 real_src_reg = src_reg; + u32 real_dst_reg = dst_reg; + u8 *branch_target; /* * Can't be implemented with a single x86 insn. @@ -1354,11 +1355,13 @@ st: if (is_imm8(insn->off)) emit_mov_reg(&prog, true, BPF_REG_AX, BPF_REG_0); if (src_reg == BPF_REG_0) real_src_reg = BPF_REG_AX; + if (dst_reg == BPF_REG_0) + real_dst_reg = BPF_REG_AX; branch_target = prog; /* Load old value */ emit_ldx(&prog, BPF_SIZE(insn->code), - BPF_REG_0, dst_reg, insn->off); + BPF_REG_0, real_dst_reg, insn->off); /* * Perform the (commutative) operation locally, * put the result in the AUX_REG. @@ -1369,7 +1372,8 @@ st: if (is_imm8(insn->off)) add_2reg(0xC0, AUX_REG, real_src_reg)); /* Attempt to swap in new value */ err = emit_atomic(&prog, BPF_CMPXCHG, - dst_reg, AUX_REG, insn->off, + real_dst_reg, AUX_REG, + insn->off, BPF_SIZE(insn->code)); if (WARN_ON(err)) return err; @@ -1383,11 +1387,10 @@ st: if (is_imm8(insn->off)) /* Restore R0 after clobbering RAX */ emit_mov_reg(&prog, true, BPF_REG_0, BPF_REG_AX); break; - } err = emit_atomic(&prog, insn->imm, dst_reg, src_reg, - insn->off, BPF_SIZE(insn->code)); + insn->off, BPF_SIZE(insn->code)); if (err) return err; break; @@ -1744,7 +1747,7 @@ static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_args, } static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, - struct bpf_prog *p, int stack_size, bool mod_ret) + struct bpf_prog *p, int stack_size, bool save_ret) { u8 *prog = *pprog; u8 *jmp_insn; @@ -1777,11 +1780,15 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, if (emit_call(&prog, p->bpf_func, prog)) return -EINVAL; - /* BPF_TRAMP_MODIFY_RETURN trampolines can modify the return + /* + * BPF_TRAMP_MODIFY_RETURN trampolines can modify the return * of the previous call which is then passed on the stack to * the next BPF program. + * + * BPF_TRAMP_FENTRY trampoline may need to return the return + * value of BPF_PROG_TYPE_STRUCT_OPS prog. */ - if (mod_ret) + if (save_ret) emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); /* replace 2 nops with JE insn, since jmp target is known */ @@ -1828,13 +1835,15 @@ static int emit_cond_near_jump(u8 **pprog, void *func, void *ip, u8 jmp_cond) } static int invoke_bpf(const struct btf_func_model *m, u8 **pprog, - struct bpf_tramp_progs *tp, int stack_size) + struct bpf_tramp_progs *tp, int stack_size, + bool save_ret) { int i; u8 *prog = *pprog; for (i = 0; i < tp->nr_progs; i++) { - if (invoke_bpf_prog(m, &prog, tp->progs[i], stack_size, false)) + if (invoke_bpf_prog(m, &prog, tp->progs[i], stack_size, + save_ret)) return -EINVAL; } *pprog = prog; @@ -1877,6 +1886,23 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, return 0; } +static bool is_valid_bpf_tramp_flags(unsigned int flags) +{ + if ((flags & BPF_TRAMP_F_RESTORE_REGS) && + (flags & BPF_TRAMP_F_SKIP_FRAME)) + return false; + + /* + * BPF_TRAMP_F_RET_FENTRY_RET is only used by bpf_struct_ops, + * and it must be used alone. + */ + if ((flags & BPF_TRAMP_F_RET_FENTRY_RET) && + (flags & ~BPF_TRAMP_F_RET_FENTRY_RET)) + return false; + + return true; +} + /* Example: * __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev); * its 'struct btf_func_model' will be nr_args=2 @@ -1949,17 +1975,19 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i struct bpf_tramp_progs *fmod_ret = &tprogs[BPF_TRAMP_MODIFY_RETURN]; u8 **branches = NULL; u8 *prog; + bool save_ret; /* x86-64 supports up to 6 arguments. 7+ can be added in the future */ if (nr_args > 6) return -ENOTSUPP; - if ((flags & BPF_TRAMP_F_RESTORE_REGS) && - (flags & BPF_TRAMP_F_SKIP_FRAME)) + if (!is_valid_bpf_tramp_flags(flags)) return -EINVAL; - if (flags & BPF_TRAMP_F_CALL_ORIG) - stack_size += 8; /* room for return value of orig_call */ + /* room for return value of orig_call or fentry prog */ + save_ret = flags & (BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_RET_FENTRY_RET); + if (save_ret) + stack_size += 8; if (flags & BPF_TRAMP_F_IP_ARG) stack_size += 8; /* room for IP address argument */ @@ -2005,7 +2033,8 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i } if (fentry->nr_progs) - if (invoke_bpf(m, &prog, fentry, stack_size)) + if (invoke_bpf(m, &prog, fentry, stack_size, + flags & BPF_TRAMP_F_RET_FENTRY_RET)) return -EINVAL; if (fmod_ret->nr_progs) { @@ -2052,7 +2081,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i } if (fexit->nr_progs) - if (invoke_bpf(m, &prog, fexit, stack_size)) { + if (invoke_bpf(m, &prog, fexit, stack_size, false)) { ret = -EINVAL; goto cleanup; } @@ -2072,9 +2101,10 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i ret = -EINVAL; goto cleanup; } - /* restore original return value back into RAX */ - emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8); } + /* restore return value of orig_call or fentry prog back into RAX */ + if (save_ret) + emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8); EMIT1(0x5B); /* pop rbx */ EMIT1(0xC9); /* leave */ diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 349f780a1567..6e0d0754f94f 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -755,8 +755,8 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g) preempt_enable(); } -static void xen_convert_trap_info(const struct desc_ptr *desc, - struct trap_info *traps) +static unsigned xen_convert_trap_info(const struct desc_ptr *desc, + struct trap_info *traps, bool full) { unsigned in, out, count; @@ -766,17 +766,18 @@ static void xen_convert_trap_info(const struct desc_ptr *desc, for (in = out = 0; in < count; in++) { gate_desc *entry = (gate_desc *)(desc->address) + in; - if (cvt_gate_to_trap(in, entry, &traps[out])) + if (cvt_gate_to_trap(in, entry, &traps[out]) || full) out++; } - traps[out].address = 0; + + return out; } void xen_copy_trap_info(struct trap_info *traps) { const struct desc_ptr *desc = this_cpu_ptr(&idt_desc); - xen_convert_trap_info(desc, traps); + xen_convert_trap_info(desc, traps, true); } /* Load a new IDT into Xen. In principle this can be per-CPU, so we @@ -786,6 +787,7 @@ static void xen_load_idt(const struct desc_ptr *desc) { static DEFINE_SPINLOCK(lock); static struct trap_info traps[257]; + unsigned out; trace_xen_cpu_load_idt(desc); @@ -793,7 +795,8 @@ static void xen_load_idt(const struct desc_ptr *desc) memcpy(this_cpu_ptr(&idt_desc), desc, sizeof(idt_desc)); - xen_convert_trap_info(desc, traps); + out = xen_convert_trap_info(desc, traps, false); + memset(&traps[out], 0, sizeof(traps[0])); xen_mc_flush(); if (HYPERVISOR_set_trap_table(traps)) diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index 54f9aa7e8457..46df59aeaa06 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -18,7 +18,7 @@ #endif #include <linux/export.h> -int xen_swiotlb __read_mostly; +static int xen_swiotlb __read_mostly; /* * pci_xen_swiotlb_detect - set xen_swiotlb to 1 if necessary @@ -56,7 +56,7 @@ int __init pci_xen_swiotlb_detect(void) return xen_swiotlb; } -void __init pci_xen_swiotlb_init(void) +static void __init pci_xen_swiotlb_init(void) { if (xen_swiotlb) { xen_swiotlb_init_early(); diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index 96afadf9878e..7ed56c6075b0 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -290,8 +290,6 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) gdt = get_cpu_gdt_rw(cpu); - memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt)); - /* * Bring up the CPU in cpu_bringup_and_idle() with the stack * pointing just below where pt_regs would be if it were a normal @@ -308,8 +306,6 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) xen_copy_trap_info(ctxt->trap_ctxt); - ctxt->ldt_ents = 0; - BUG_ON((unsigned long)gdt & ~PAGE_MASK); gdt_mfn = arbitrary_virt_to_mfn(gdt); |