diff options
65 files changed, 1118 insertions, 565 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 07e87a7c665d..cd209f7730af 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -123,6 +123,37 @@ memory layout to fit in user mode), check KVM_CAP_MIPS_VZ and use the flag KVM_VM_MIPS_VZ. +On arm64, the physical address size for a VM (IPA Size limit) is limited +to 40bits by default. The limit can be configured if the host supports the +extension KVM_CAP_ARM_VM_IPA_SIZE. When supported, use +KVM_VM_TYPE_ARM_IPA_SIZE(IPA_Bits) to set the size in the machine type +identifier, where IPA_Bits is the maximum width of any physical +address used by the VM. The IPA_Bits is encoded in bits[7-0] of the +machine type identifier. + +e.g, to configure a guest to use 48bit physical address size : + + vm_fd = ioctl(dev_fd, KVM_CREATE_VM, KVM_VM_TYPE_ARM_IPA_SIZE(48)); + +The requested size (IPA_Bits) must be : + 0 - Implies default size, 40bits (for backward compatibility) + + or + + N - Implies N bits, where N is a positive integer such that, + 32 <= N <= Host_IPA_Limit + +Host_IPA_Limit is the maximum possible value for IPA_Bits on the host and +is dependent on the CPU capability and the kernel configuration. The limit can +be retrieved using KVM_CAP_ARM_VM_IPA_SIZE of the KVM_CHECK_EXTENSION +ioctl() at run-time. + +Please note that configuring the IPA size does not affect the capability +exposed by the guest CPUs in ID_AA64MMFR0_EL1[PARange]. It only affects +size of the address translated by the stage2 level (guest physical to +host physical address translations). + + 4.3 KVM_GET_MSR_INDEX_LIST, KVM_GET_MSR_FEATURE_INDEX_LIST Capability: basic, KVM_CAP_GET_MSR_FEATURES for KVM_GET_MSR_FEATURE_INDEX_LIST diff --git a/MAINTAINERS b/MAINTAINERS index 1610fb26bdac..86e019c7b0fa 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12260,6 +12260,7 @@ F: Documentation/networking/rds.txt RDT - RESOURCE ALLOCATION M: Fenghua Yu <fenghua.yu@intel.com> +M: Reinette Chatre <reinette.chatre@intel.com> L: linux-kernel@vger.kernel.org S: Supported F: arch/x86/kernel/cpu/intel_rdt* @@ -15924,6 +15925,7 @@ F: net/x25/ X86 ARCHITECTURE (32-BIT AND 64-BIT) M: Thomas Gleixner <tglx@linutronix.de> M: Ingo Molnar <mingo@redhat.com> +M: Borislav Petkov <bp@alien8.de> R: "H. Peter Anvin" <hpa@zytor.com> M: x86@kernel.org L: linux-kernel@vger.kernel.org @@ -15952,6 +15954,15 @@ M: Borislav Petkov <bp@alien8.de> S: Maintained F: arch/x86/kernel/cpu/microcode/* +X86 MM +M: Dave Hansen <dave.hansen@linux.intel.com> +M: Andy Lutomirski <luto@kernel.org> +M: Peter Zijlstra <peterz@infradead.org> +L: linux-kernel@vger.kernel.org +T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/mm +S: Maintained +F: arch/x86/mm/ + X86 PLATFORM DRIVERS M: Darren Hart <dvhart@infradead.org> M: Andy Shevchenko <andy@infradead.org> @@ -2,7 +2,7 @@ VERSION = 4 PATCHLEVEL = 19 SUBLEVEL = 0 -EXTRAVERSION = -rc4 +EXTRAVERSION = -rc5 NAME = Merciless Moray # *DOCUMENTATION* diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h index 3ab8b3781bfe..c3f1f9b304b7 100644 --- a/arch/arm/include/asm/kvm_arm.h +++ b/arch/arm/include/asm/kvm_arm.h @@ -133,8 +133,7 @@ * space. */ #define KVM_PHYS_SHIFT (40) -#define KVM_PHYS_SIZE (_AC(1, ULL) << KVM_PHYS_SHIFT) -#define KVM_PHYS_MASK (KVM_PHYS_SIZE - _AC(1, ULL)) + #define PTRS_PER_S2_PGD (_AC(1, ULL) << (KVM_PHYS_SHIFT - 30)) /* Virtualization Translation Control Register (VTCR) bits */ diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 3ad482d2f1eb..5ca5d9af0c26 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -273,7 +273,7 @@ static inline void __cpu_init_stage2(void) kvm_call_hyp(__init_stage2_translation); } -static inline int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext) +static inline int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext) { return 0; } @@ -354,4 +354,15 @@ static inline void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu) {} struct kvm *kvm_arch_alloc_vm(void); void kvm_arch_free_vm(struct kvm *kvm); +static inline int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type) +{ + /* + * On 32bit ARM, VMs get a static 40bit IPA stage2 setup, + * so any non-zero value used as type is illegal. + */ + if (type) + return -EINVAL; + return 0; +} + #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 265ea9cf7df7..5ad1a54f98dc 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -35,16 +35,12 @@ addr; \ }) -/* - * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation levels. - */ -#define KVM_MMU_CACHE_MIN_PAGES 2 - #ifndef __ASSEMBLY__ #include <linux/highmem.h> #include <asm/cacheflush.h> #include <asm/cputype.h> +#include <asm/kvm_arm.h> #include <asm/kvm_hyp.h> #include <asm/pgalloc.h> #include <asm/stage2_pgtable.h> @@ -52,6 +48,13 @@ /* Ensure compatibility with arm64 */ #define VA_BITS 32 +#define kvm_phys_shift(kvm) KVM_PHYS_SHIFT +#define kvm_phys_size(kvm) (1ULL << kvm_phys_shift(kvm)) +#define kvm_phys_mask(kvm) (kvm_phys_size(kvm) - 1ULL) +#define kvm_vttbr_baddr_mask(kvm) VTTBR_BADDR_MASK + +#define stage2_pgd_size(kvm) (PTRS_PER_S2_PGD * sizeof(pgd_t)) + int create_hyp_mappings(void *from, void *to, pgprot_t prot); int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size, void __iomem **kaddr, @@ -355,6 +358,8 @@ static inline int hyp_map_aux_data(void) #define kvm_phys_to_vttbr(addr) (addr) +static inline void kvm_set_ipa_limit(void) {} + #endif /* !__ASSEMBLY__ */ #endif /* __ARM_KVM_MMU_H__ */ diff --git a/arch/arm/include/asm/stage2_pgtable.h b/arch/arm/include/asm/stage2_pgtable.h index 460d616bb2d6..f6a7ea805232 100644 --- a/arch/arm/include/asm/stage2_pgtable.h +++ b/arch/arm/include/asm/stage2_pgtable.h @@ -19,43 +19,53 @@ #ifndef __ARM_S2_PGTABLE_H_ #define __ARM_S2_PGTABLE_H_ -#define stage2_pgd_none(pgd) pgd_none(pgd) -#define stage2_pgd_clear(pgd) pgd_clear(pgd) -#define stage2_pgd_present(pgd) pgd_present(pgd) -#define stage2_pgd_populate(pgd, pud) pgd_populate(NULL, pgd, pud) -#define stage2_pud_offset(pgd, address) pud_offset(pgd, address) -#define stage2_pud_free(pud) pud_free(NULL, pud) - -#define stage2_pud_none(pud) pud_none(pud) -#define stage2_pud_clear(pud) pud_clear(pud) -#define stage2_pud_present(pud) pud_present(pud) -#define stage2_pud_populate(pud, pmd) pud_populate(NULL, pud, pmd) -#define stage2_pmd_offset(pud, address) pmd_offset(pud, address) -#define stage2_pmd_free(pmd) pmd_free(NULL, pmd) - -#define stage2_pud_huge(pud) pud_huge(pud) +/* + * kvm_mmu_cache_min_pages() is the number of pages required + * to install a stage-2 translation. We pre-allocate the entry + * level table at VM creation. Since we have a 3 level page-table, + * we need only two pages to add a new mapping. + */ +#define kvm_mmu_cache_min_pages(kvm) 2 + +#define stage2_pgd_none(kvm, pgd) pgd_none(pgd) +#define stage2_pgd_clear(kvm, pgd) pgd_clear(pgd) +#define stage2_pgd_present(kvm, pgd) pgd_present(pgd) +#define stage2_pgd_populate(kvm, pgd, pud) pgd_populate(NULL, pgd, pud) +#define stage2_pud_offset(kvm, pgd, address) pud_offset(pgd, address) +#define stage2_pud_free(kvm, pud) pud_free(NULL, pud) + +#define stage2_pud_none(kvm, pud) pud_none(pud) +#define stage2_pud_clear(kvm, pud) pud_clear(pud) +#define stage2_pud_present(kvm, pud) pud_present(pud) +#define stage2_pud_populate(kvm, pud, pmd) pud_populate(NULL, pud, pmd) +#define stage2_pmd_offset(kvm, pud, address) pmd_offset(pud, address) +#define stage2_pmd_free(kvm, pmd) pmd_free(NULL, pmd) + +#define stage2_pud_huge(kvm, pud) pud_huge(pud) /* Open coded p*d_addr_end that can deal with 64bit addresses */ -static inline phys_addr_t stage2_pgd_addr_end(phys_addr_t addr, phys_addr_t end) +static inline phys_addr_t +stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) { phys_addr_t boundary = (addr + PGDIR_SIZE) & PGDIR_MASK; return (boundary - 1 < end - 1) ? boundary : end; } -#define stage2_pud_addr_end(addr, end) (end) +#define stage2_pud_addr_end(kvm, addr, end) (end) -static inline phys_addr_t stage2_pmd_addr_end(phys_addr_t addr, phys_addr_t end) +static inline phys_addr_t +stage2_pmd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) { phys_addr_t boundary = (addr + PMD_SIZE) & PMD_MASK; return (boundary - 1 < end - 1) ? boundary : end; } -#define stage2_pgd_index(addr) pgd_index(addr) +#define stage2_pgd_index(kvm, addr) pgd_index(addr) -#define stage2_pte_table_empty(ptep) kvm_page_empty(ptep) -#define stage2_pmd_table_empty(pmdp) kvm_page_empty(pmdp) -#define stage2_pud_table_empty(pudp) false +#define stage2_pte_table_empty(kvm, ptep) kvm_page_empty(ptep) +#define stage2_pmd_table_empty(kvm, pmdp) kvm_page_empty(pmdp) +#define stage2_pud_table_empty(kvm, pudp) false #endif /* __ARM_S2_PGTABLE_H_ */ diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 1717ba1db35d..072cc1c970c2 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -530,6 +530,26 @@ void arm64_set_ssbd_mitigation(bool state); static inline void arm64_set_ssbd_mitigation(bool state) {} #endif +static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange) +{ + switch (parange) { + case 0: return 32; + case 1: return 36; + case 2: return 40; + case 3: return 42; + case 4: return 44; + case 5: return 48; + case 6: return 52; + /* + * A future PE could use a value unknown to the kernel. + * However, by the "D10.1.4 Principles of the ID scheme + * for fields in ID registers", ARM DDI 0487C.a, any new + * value is guaranteed to be higher than what we know already. + * As a safe limit, we return the limit supported by the kernel. + */ + default: return CONFIG_ARM64_PA_BITS; + } +} #endif /* __ASSEMBLY__ */ #endif diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index aa45df752a16..6e324d1f1231 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -107,6 +107,7 @@ #define VTCR_EL2_RES1 (1 << 31) #define VTCR_EL2_HD (1 << 22) #define VTCR_EL2_HA (1 << 21) +#define VTCR_EL2_PS_SHIFT TCR_EL2_PS_SHIFT #define VTCR_EL2_PS_MASK TCR_EL2_PS_MASK #define VTCR_EL2_TG0_MASK TCR_TG0_MASK #define VTCR_EL2_TG0_4K TCR_TG0_4K @@ -120,62 +121,149 @@ #define VTCR_EL2_IRGN0_WBWA TCR_IRGN0_WBWA #define VTCR_EL2_SL0_SHIFT 6 #define VTCR_EL2_SL0_MASK (3 << VTCR_EL2_SL0_SHIFT) -#define VTCR_EL2_SL0_LVL1 (1 << VTCR_EL2_SL0_SHIFT) #define VTCR_EL2_T0SZ_MASK 0x3f -#define VTCR_EL2_T0SZ_40B 24 #define VTCR_EL2_VS_SHIFT 19 #define VTCR_EL2_VS_8BIT (0 << VTCR_EL2_VS_SHIFT) #define VTCR_EL2_VS_16BIT (1 << VTCR_EL2_VS_SHIFT) +#define VTCR_EL2_T0SZ(x) TCR_T0SZ(x) + /* * We configure the Stage-2 page tables to always restrict the IPA space to be * 40 bits wide (T0SZ = 24). Systems with a PARange smaller than 40 bits are * not known to exist and will break with this configuration. * - * VTCR_EL2.PS is extracted from ID_AA64MMFR0_EL1.PARange at boot time - * (see hyp-init.S). + * The VTCR_EL2 is configured per VM and is initialised in kvm_arm_setup_stage2(). * * Note that when using 4K pages, we concatenate two first level page tables * together. With 16K pages, we concatenate 16 first level page tables. * - * The magic numbers used for VTTBR_X in this patch can be found in Tables - * D4-23 and D4-25 in ARM DDI 0487A.b. */ -#define VTCR_EL2_T0SZ_IPA VTCR_EL2_T0SZ_40B #define VTCR_EL2_COMMON_BITS (VTCR_EL2_SH0_INNER | VTCR_EL2_ORGN0_WBWA | \ VTCR_EL2_IRGN0_WBWA | VTCR_EL2_RES1) -#ifdef CONFIG_ARM64_64K_PAGES /* - * Stage2 translation configuration: - * 64kB pages (TG0 = 1) - * 2 level page tables (SL = 1) + * VTCR_EL2:SL0 indicates the entry level for Stage2 translation. + * Interestingly, it depends on the page size. + * See D.10.2.121, VTCR_EL2, in ARM DDI 0487C.a + * + * ----------------------------------------- + * | Entry level | 4K | 16K/64K | + * ------------------------------------------ + * | Level: 0 | 2 | - | + * ------------------------------------------ + * | Level: 1 | 1 | 2 | + * ------------------------------------------ + * | Level: 2 | 0 | 1 | + * ------------------------------------------ + * | Level: 3 | - | 0 | + * ------------------------------------------ + * + * The table roughly translates to : + * + * SL0(PAGE_SIZE, Entry_level) = TGRAN_SL0_BASE - Entry_Level + * + * Where TGRAN_SL0_BASE is a magic number depending on the page size: + * TGRAN_SL0_BASE(4K) = 2 + * TGRAN_SL0_BASE(16K) = 3 + * TGRAN_SL0_BASE(64K) = 3 + * provided we take care of ruling out the unsupported cases and + * Entry_Level = 4 - Number_of_levels. + * */ -#define VTCR_EL2_TGRAN_FLAGS (VTCR_EL2_TG0_64K | VTCR_EL2_SL0_LVL1) -#define VTTBR_X_TGRAN_MAGIC 38 +#ifdef CONFIG_ARM64_64K_PAGES + +#define VTCR_EL2_TGRAN VTCR_EL2_TG0_64K +#define VTCR_EL2_TGRAN_SL0_BASE 3UL + #elif defined(CONFIG_ARM64_16K_PAGES) -/* - * Stage2 translation configuration: - * 16kB pages (TG0 = 2) - * 2 level page tables (SL = 1) - */ -#define VTCR_EL2_TGRAN_FLAGS (VTCR_EL2_TG0_16K | VTCR_EL2_SL0_LVL1) -#define VTTBR_X_TGRAN_MAGIC 42 + +#define VTCR_EL2_TGRAN VTCR_EL2_TG0_16K +#define VTCR_EL2_TGRAN_SL0_BASE 3UL + #else /* 4K */ -/* - * Stage2 translation configuration: - * 4kB pages (TG0 = 0) - * 3 level page tables (SL = 1) - */ -#define VTCR_EL2_TGRAN_FLAGS (VTCR_EL2_TG0_4K | VTCR_EL2_SL0_LVL1) -#define VTTBR_X_TGRAN_MAGIC 37 + +#define VTCR_EL2_TGRAN VTCR_EL2_TG0_4K +#define VTCR_EL2_TGRAN_SL0_BASE 2UL + #endif -#define VTCR_EL2_FLAGS (VTCR_EL2_COMMON_BITS | VTCR_EL2_TGRAN_FLAGS) -#define VTTBR_X (VTTBR_X_TGRAN_MAGIC - VTCR_EL2_T0SZ_IPA) +#define VTCR_EL2_LVLS_TO_SL0(levels) \ + ((VTCR_EL2_TGRAN_SL0_BASE - (4 - (levels))) << VTCR_EL2_SL0_SHIFT) +#define VTCR_EL2_SL0_TO_LVLS(sl0) \ + ((sl0) + 4 - VTCR_EL2_TGRAN_SL0_BASE) +#define VTCR_EL2_LVLS(vtcr) \ + VTCR_EL2_SL0_TO_LVLS(((vtcr) & VTCR_EL2_SL0_MASK) >> VTCR_EL2_SL0_SHIFT) + +#define VTCR_EL2_FLAGS (VTCR_EL2_COMMON_BITS | VTCR_EL2_TGRAN) +#define VTCR_EL2_IPA(vtcr) (64 - ((vtcr) & VTCR_EL2_T0SZ_MASK)) + +/* + * ARM VMSAv8-64 defines an algorithm for finding the translation table + * descriptors in section D4.2.8 in ARM DDI 0487C.a. + * + * The algorithm defines the expectations on the translation table + * addresses for each level, based on PAGE_SIZE, entry level + * and the translation table size (T0SZ). The variable "x" in the + * algorithm determines the alignment of a table base address at a given + * level and thus determines the alignment of VTTBR:BADDR for stage2 + * page table entry level. + * Since the number of bits resolved at the entry level could vary + * depending on the T0SZ, the value of "x" is defined based on a + * Magic constant for a given PAGE_SIZE and Entry Level. The + * intermediate levels must be always aligned to the PAGE_SIZE (i.e, + * x = PAGE_SHIFT). + * + * The value of "x" for entry level is calculated as : + * x = Magic_N - T0SZ + * + * where Magic_N is an integer depending on the page size and the entry + * level of the page table as below: + * + * -------------------------------------------- + * | Entry level | 4K 16K 64K | + * -------------------------------------------- + * | Level: 0 (4 levels) | 28 | - | - | + * -------------------------------------------- + * | Level: 1 (3 levels) | 37 | 31 | 25 | + * -------------------------------------------- + * | Level: 2 (2 levels) | 46 | 42 | 38 | + * -------------------------------------------- + * | Level: 3 (1 level) | - | 53 | 51 | + * -------------------------------------------- + * + * We have a magic formula for the Magic_N below: + * + * Magic_N(PAGE_SIZE, Level) = 64 - ((PAGE_SHIFT - 3) * Number_of_levels) + * + * where Number_of_levels = (4 - Level). We are only interested in the + * value for Entry_Level for the stage2 page table. + * + * So, given that T0SZ = (64 - IPA_SHIFT), we can compute 'x' as follows: + * + * x = (64 - ((PAGE_SHIFT - 3) * Number_of_levels)) - (64 - IPA_SHIFT) + * = IPA_SHIFT - ((PAGE_SHIFT - 3) * Number of levels) + * + * Here is one way to explain the Magic Formula: + * + * x = log2(Size_of_Entry_Level_Table) + * + * Since, we can resolve (PAGE_SHIFT - 3) bits at each level, and another + * PAGE_SHIFT bits in the PTE, we have : + * + * Bits_Entry_level = IPA_SHIFT - ((PAGE_SHIFT - 3) * (n - 1) + PAGE_SHIFT) + * = IPA_SHIFT - (PAGE_SHIFT - 3) * n - 3 + * where n = number of levels, and since each pointer is 8bytes, we have: + * + * x = Bits_Entry_Level + 3 + * = IPA_SHIFT - (PAGE_SHIFT - 3) * n + * + * The only constraint here is that, we have to find the number of page table + * levels for a given IPA size (which we do, see stage2_pt_levels()) + */ +#define ARM64_VTTBR_X(ipa, levels) ((ipa) - ((levels) * (PAGE_SHIFT - 3))) -#define VTTBR_BADDR_MASK (((UL(1) << (PHYS_MASK_SHIFT - VTTBR_X)) - 1) << VTTBR_X) #define VTTBR_VMID_SHIFT (UL(48)) #define VTTBR_VMID_MASK(size) (_AT(u64, (1 << size) - 1) << VTTBR_VMID_SHIFT) @@ -223,6 +311,13 @@ /* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */ #define HPFAR_MASK (~UL(0xf)) +/* + * We have + * PAR [PA_Shift - 1 : 12] = PA [PA_Shift - 1 : 12] + * HPFAR [PA_Shift - 9 : 4] = FIPA [PA_Shift - 1 : 12] + */ +#define PAR_TO_HPFAR(par) \ + (((par) & GENMASK_ULL(PHYS_MASK_SHIFT - 1, 12)) >> 8) #define kvm_arm_exception_type \ {0, "IRQ" }, \ diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 102b5a5c47b6..aea01a09eb94 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -30,6 +30,7 @@ #define ARM_EXCEPTION_IRQ 0 #define ARM_EXCEPTION_EL1_SERROR 1 #define ARM_EXCEPTION_TRAP 2 +#define ARM_EXCEPTION_IL 3 /* The hyp-stub will return this for any kvm_call_hyp() call */ #define ARM_EXCEPTION_HYP_GONE HVC_STUB_ERR @@ -72,8 +73,6 @@ extern void __vgic_v3_init_lrs(void); extern u32 __kvm_get_mdcr_el2(void); -extern u32 __init_stage2_translation(void); - /* Home-grown __this_cpu_{ptr,read} variants that always work at HYP */ #define __hyp_this_cpu_ptr(sym) \ ({ \ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 3d6d7336f871..f84052f306af 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -53,7 +53,7 @@ DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use); int __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); -int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext); +int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext); void __extended_idmap_trampoline(phys_addr_t boot_pgd, phys_addr_t idmap_start); struct kvm_arch { @@ -61,11 +61,13 @@ struct kvm_arch { u64 vmid_gen; u32 vmid; - /* 1-level 2nd stage table, protected by kvm->mmu_lock */ + /* stage2 entry level table */ pgd_t *pgd; /* VTTBR value associated with above pgd and vmid */ u64 vttbr; + /* VTCR_EL2 value for this VM */ + u64 vtcr; /* The last vcpu id that ran on each physical CPU */ int __percpu *last_vcpu_ran; @@ -440,13 +442,7 @@ int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu, int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); -static inline void __cpu_init_stage2(void) -{ - u32 parange = kvm_call_hyp(__init_stage2_translation); - - WARN_ONCE(parange < 40, - "PARange is %d bits, unsupported configuration!", parange); -} +static inline void __cpu_init_stage2(void) {} /* Guest/host FPSIMD coordination helpers */ int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu); @@ -509,8 +505,12 @@ static inline int kvm_arm_have_ssbd(void) void kvm_vcpu_load_sysregs(struct kvm_vcpu *vcpu); void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu); +void kvm_set_ipa_limit(void); + #define __KVM_HAVE_ARCH_VM_ALLOC struct kvm *kvm_arch_alloc_vm(void); void kvm_arch_free_vm(struct kvm *kvm); +int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type); + #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 384c34397619..23aca66767f9 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -155,5 +155,15 @@ void deactivate_traps_vhe_put(void); u64 __guest_enter(struct kvm_vcpu *vcpu, struct kvm_cpu_context *host_ctxt); void __noreturn __hyp_do_panic(unsigned long, ...); +/* + * Must be called from hyp code running at EL2 with an updated VTTBR + * and interrupts disabled. + */ +static __always_inline void __hyp_text __load_guest_stage2(struct kvm *kvm) +{ + write_sysreg(kvm->arch.vtcr, vtcr_el2); + write_sysreg(kvm->arch.vttbr, vttbr_el2); +} + #endif /* __ARM64_KVM_HYP_H__ */ diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index d6fff7de5539..77b1af9e64db 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -141,8 +141,16 @@ static inline unsigned long __kern_hyp_va(unsigned long v) * We currently only support a 40bit IPA. */ #define KVM_PHYS_SHIFT (40) -#define KVM_PHYS_SIZE (1UL << KVM_PHYS_SHIFT) -#define KVM_PHYS_MASK (KVM_PHYS_SIZE - 1UL) + +#define kvm_phys_shift(kvm) VTCR_EL2_IPA(kvm->arch.vtcr) +#define kvm_phys_size(kvm) (_AC(1, ULL) << kvm_phys_shift(kvm)) +#define kvm_phys_mask(kvm) (kvm_phys_size(kvm) - _AC(1, ULL)) + +static inline bool kvm_page_empty(void *ptr) +{ + struct page *ptr_page = virt_to_page(ptr); + return page_count(ptr_page) == 1; +} #include <asm/stage2_pgtable.h> @@ -238,12 +246,6 @@ static inline bool kvm_s2pmd_exec(pmd_t *pmdp) return !(READ_ONCE(pmd_val(*pmdp)) & PMD_S2_XN); } -static inline bool kvm_page_empty(void *ptr) -{ - struct page *ptr_page = virt_to_page(ptr); - return page_count(ptr_page) == 1; -} - #define hyp_pte_table_empty(ptep) kvm_page_empty(ptep) #ifdef __PAGETABLE_PMD_FOLDED @@ -517,5 +519,29 @@ static inline int hyp_map_aux_data(void) #define kvm_phys_to_vttbr(addr) phys_to_ttbr(addr) +/* + * Get the magic number 'x' for VTTBR:BADDR of this KVM instance. + * With v8.2 LVA extensions, 'x' should be a minimum of 6 with + * 52bit IPS. + */ +static inline int arm64_vttbr_x(u32 ipa_shift, u32 levels) +{ + int x = ARM64_VTTBR_X(ipa_shift, levels); + + return (IS_ENABLED(CONFIG_ARM64_PA_BITS_52) && x < 6) ? 6 : x; +} + +static inline u64 vttbr_baddr_mask(u32 ipa_shift, u32 levels) +{ + unsigned int x = arm64_vttbr_x(ipa_shift, levels); + + return GENMASK_ULL(PHYS_MASK_SHIFT - 1, x); +} + +static inline u64 kvm_vttbr_baddr_mask(struct kvm *kvm) +{ + return vttbr_baddr_mask(kvm_phys_shift(kvm), kvm_stage2_levels(kvm)); +} + #endif /* __ASSEMBLY__ */ #endif /* __ARM64_KVM_MMU_H__ */ diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index 177b851ca6d9..ff35ac1258eb 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -25,6 +25,9 @@ #define CurrentEL_EL1 (1 << 2) #define CurrentEL_EL2 (2 << 2) +/* Additional SPSR bits not exposed in the UABI */ +#define PSR_IL_BIT (1 << 20) + /* AArch32-specific ptrace requests */ #define COMPAT_PTRACE_GETREGS 12 #define COMPAT_PTRACE_SETREGS 13 diff --git a/arch/arm64/include/asm/stage2_pgtable-nopmd.h b/arch/arm64/include/asm/stage2_pgtable-nopmd.h deleted file mode 100644 index 2656a0fd05a6..000000000000 --- a/arch/arm64/include/asm/stage2_pgtable-nopmd.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (C) 2016 - ARM Ltd - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -#ifndef __ARM64_S2_PGTABLE_NOPMD_H_ -#define __ARM64_S2_PGTABLE_NOPMD_H_ - -#include <asm/stage2_pgtable-nopud.h> - -#define __S2_PGTABLE_PMD_FOLDED - -#define S2_PMD_SHIFT S2_PUD_SHIFT -#define S2_PTRS_PER_PMD 1 -#define S2_PMD_SIZE (1UL << S2_PMD_SHIFT) -#define S2_PMD_MASK (~(S2_PMD_SIZE-1)) - -#define stage2_pud_none(pud) (0) -#define stage2_pud_present(pud) (1) -#define stage2_pud_clear(pud) do { } while (0) -#define stage2_pud_populate(pud, pmd) do { } while (0) -#define stage2_pmd_offset(pud, address) ((pmd_t *)(pud)) - -#define stage2_pmd_free(pmd) do { } while (0) - -#define stage2_pmd_addr_end(addr, end) (end) - -#define stage2_pud_huge(pud) (0) -#define stage2_pmd_table_empty(pmdp) (0) - -#endif diff --git a/arch/arm64/include/asm/stage2_pgtable-nopud.h b/arch/arm64/include/asm/stage2_pgtable-nopud.h deleted file mode 100644 index 5ee87b54ebf3..000000000000 --- a/arch/arm64/include/asm/stage2_pgtable-nopud.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (C) 2016 - ARM Ltd - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -#ifndef __ARM64_S2_PGTABLE_NOPUD_H_ -#define __ARM64_S2_PGTABLE_NOPUD_H_ - -#define __S2_PGTABLE_PUD_FOLDED - -#define S2_PUD_SHIFT S2_PGDIR_SHIFT -#define S2_PTRS_PER_PUD 1 -#define S2_PUD_SIZE (_AC(1, UL) << S2_PUD_SHIFT) -#define S2_PUD_MASK (~(S2_PUD_SIZE-1)) - -#define stage2_pgd_none(pgd) (0) -#define stage2_pgd_present(pgd) (1) -#define stage2_pgd_clear(pgd) do { } while (0) -#define stage2_pgd_populate(pgd, pud) do { } while (0) - -#define stage2_pud_offset(pgd, address) ((pud_t *)(pgd)) - -#define stage2_pud_free(x) do { } while (0) - -#define stage2_pud_addr_end(addr, end) (end) -#define stage2_pud_table_empty(pmdp) (0) - -#endif diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h index 8b68099348e5..d352f6df8d2c 100644 --- a/arch/arm64/include/asm/stage2_pgtable.h +++ b/arch/arm64/include/asm/stage2_pgtable.h @@ -19,9 +19,17 @@ #ifndef __ARM64_S2_PGTABLE_H_ #define __ARM64_S2_PGTABLE_H_ +#include <linux/hugetlb.h> #include <asm/pgtable.h> /* + * PGDIR_SHIFT determines the size a top-level page table entry can map + * and depends on the number of levels in the page table. Compute the + * PGDIR_SHIFT for a given number of levels. + */ +#define pt_levels_pgdir_shift(lvls) ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - (lvls)) + +/* * The hardware supports concatenation of up to 16 tables at stage2 entry level * and we use the feature whenever possible. * @@ -29,112 +37,208 @@ * On arm64, the smallest PAGE_SIZE supported is 4k, which means * (PAGE_SHIFT - 3) > 4 holds for all page sizes. * This implies, the total number of page table levels at stage2 expected - * by the hardware is actually the number of levels required for (KVM_PHYS_SHIFT - 4) + * by the hardware is actually the number of levels required for (IPA_SHIFT - 4) * in normal translations(e.g, stage1), since we cannot have another level in - * the range (KVM_PHYS_SHIFT, KVM_PHYS_SHIFT - 4). + * the range (IPA_SHIFT, IPA_SHIFT - 4). */ -#define STAGE2_PGTABLE_LEVELS ARM64_HW_PGTABLE_LEVELS(KVM_PHYS_SHIFT - 4) +#define stage2_pgtable_levels(ipa) ARM64_HW_PGTABLE_LEVELS((ipa) - 4) +#define kvm_stage2_levels(kvm) VTCR_EL2_LVLS(kvm->arch.vtcr) -/* - * With all the supported VA_BITs and 40bit guest IPA, the following condition - * is always true: - * - * STAGE2_PGTABLE_LEVELS <= CONFIG_PGTABLE_LEVELS - * - * We base our stage-2 page table walker helpers on this assumption and - * fall back to using the host version of the helper wherever possible. - * i.e, if a particular level is not folded (e.g, PUD) at stage2, we fall back - * to using the host version, since it is guaranteed it is not folded at host. - * - * If the condition breaks in the future, we can rearrange the host level - * definitions and reuse them for stage2. Till then... - */ -#if STAGE2_PGTABLE_LEVELS > CONFIG_PGTABLE_LEVELS -#error "Unsupported combination of guest IPA and host VA_BITS." -#endif - -/* S2_PGDIR_SHIFT is the size mapped by top-level stage2 entry */ -#define S2_PGDIR_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - STAGE2_PGTABLE_LEVELS) -#define S2_PGDIR_SIZE (_AC(1, UL) << S2_PGDIR_SHIFT) -#define S2_PGDIR_MASK (~(S2_PGDIR_SIZE - 1)) +/* stage2_pgdir_shift() is the size mapped by top-level stage2 entry for the VM */ +#define stage2_pgdir_shift(kvm) pt_levels_pgdir_shift(kvm_stage2_levels(kvm)) +#define stage2_pgdir_size(kvm) (1ULL << stage2_pgdir_shift(kvm)) +#define stage2_pgdir_mask(kvm) ~(stage2_pgdir_size(kvm) - 1) /* * The number of PTRS across all concatenated stage2 tables given by the * number of bits resolved at the initial level. + * If we force more levels than necessary, we may have (stage2_pgdir_shift > IPA), + * in which case, stage2_pgd_ptrs will have one entry. */ -#define PTRS_PER_S2_PGD (1 << (KVM_PHYS_SHIFT - S2_PGDIR_SHIFT)) +#define pgd_ptrs_shift(ipa, pgdir_shift) \ + ((ipa) > (pgdir_shift) ? ((ipa) - (pgdir_shift)) : 0) +#define __s2_pgd_ptrs(ipa, lvls) \ + (1 << (pgd_ptrs_shift((ipa), pt_levels_pgdir_shift(lvls)))) +#define __s2_pgd_size(ipa, lvls) (__s2_pgd_ptrs((ipa), (lvls)) * sizeof(pgd_t)) + +#define stage2_pgd_ptrs(kvm) __s2_pgd_ptrs(kvm_phys_shift(kvm), kvm_stage2_levels(kvm)) +#define stage2_pgd_size(kvm) __s2_pgd_size(kvm_phys_shift(kvm), kvm_stage2_levels(kvm)) /* - * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation - * levels in addition to the PGD. + * kvm_mmmu_cache_min_pages() is the number of pages required to install + * a stage-2 translation. We pre-allocate the entry level page table at + * the VM creation. */ -#define KVM_MMU_CACHE_MIN_PAGES (STAGE2_PGTABLE_LEVELS - 1) +#define kvm_mmu_cache_min_pages(kvm) (kvm_stage2_levels(kvm) - 1) - -#if STAGE2_PGTABLE_LEVELS > 3 +/* Stage2 PUD definitions when the level is present */ +static inline bool kvm_stage2_has_pud(struct kvm *kvm) +{ + return (CONFIG_PGTABLE_LEVELS > 3) && (kvm_stage2_levels(kvm) > 3); +} #define S2_PUD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(1) -#define S2_PUD_SIZE (_AC(1, UL) << S2_PUD_SHIFT) +#define S2_PUD_SIZE (1UL << S2_PUD_SHIFT) #define S2_PUD_MASK (~(S2_PUD_SIZE - 1)) -#define stage2_pgd_none(pgd) pgd_none(pgd) -#define stage2_pgd_clear(pgd) pgd_clear(pgd) -#define stage2_pgd_present(pgd) pgd_present(pgd) -#define stage2_pgd_populate(pgd, pud) pgd_populate(NULL, pgd, pud) -#define stage2_pud_offset(pgd, address) pud_offset(pgd, address) -#define stage2_pud_free(pud) pud_free(NULL, pud) +static inline bool stage2_pgd_none(struct kvm *kvm, pgd_t pgd) +{ + if (kvm_stage2_has_pud(kvm)) + return pgd_none(pgd); + else + return 0; +} -#define stage2_pud_table_empty(pudp) kvm_page_empty(pudp) +static inline void stage2_pgd_clear(struct kvm *kvm, pgd_t *pgdp) +{ + if (kvm_stage2_has_pud(kvm)) + pgd_clear(pgdp); +} -static inline phys_addr_t stage2_pud_addr_end(phys_addr_t addr, phys_addr_t end) +static inline bool stage2_pgd_present(struct kvm *kvm, pgd_t pgd) { - phys_addr_t boundary = (addr + S2_PUD_SIZE) & S2_PUD_MASK; + if (kvm_stage2_has_pud(kvm)) + return pgd_present(pgd); + else + return 1; +} - return (boundary - 1 < end - 1) ? boundary : end; +static inline void stage2_pgd_populate(struct kvm *kvm, pgd_t *pgd, pud_t *pud) +{ + if (kvm_stage2_has_pud(kvm)) + pgd_populate(NULL, pgd, pud); +} + +static inline pud_t *stage2_pud_offset(struct kvm *kvm, + pgd_t *pgd, unsigned long address) +{ + if (kvm_stage2_has_pud(kvm)) + return pud_offset(pgd, address); + else + return (pud_t *)pgd; } -#endif /* STAGE2_PGTABLE_LEVELS > 3 */ +static inline void stage2_pud_free(struct kvm *kvm, pud_t *pud) +{ + if (kvm_stage2_has_pud(kvm)) + pud_free(NULL, pud); +} +static inline bool stage2_pud_table_empty(struct kvm *kvm, pud_t *pudp) +{ + if (kvm_stage2_has_pud(kvm)) + return kvm_page_empty(pudp); + else + return false; +} -#if STAGE2_PGTABLE_LEVELS > 2 +static inline phys_addr_t +stage2_pud_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) +{ + if (kvm_stage2_has_pud(kvm)) { + phys_addr_t boundary = (addr + S2_PUD_SIZE) & S2_PUD_MASK; + + return (boundary - 1 < end - 1) ? boundary : end; + } else { + return end; + } +} + +/* Stage2 PMD definitions when the level is present */ +static inline bool kvm_stage2_has_pmd(struct kvm *kvm) +{ + return (CONFIG_PGTABLE_LEVELS > 2) && (kvm_stage2_levels(kvm) > 2); +} #define S2_PMD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(2) -#define S2_PMD_SIZE (_AC(1, UL) << S2_PMD_SHIFT) +#define S2_PMD_SIZE (1UL << S2_PMD_SHIFT) #define S2_PMD_MASK (~(S2_PMD_SIZE - 1)) -#define stage2_pud_none(pud) pud_none(pud) -#define stage2_pud_clear(pud) pud_clear(pud) -#define stage2_pud_present(pud) pud_present(pud) -#define stage2_pud_populate(pud, pmd) pud_populate(NULL, pud, pmd) -#define stage2_pmd_offset(pud, address) pmd_offset(pud, address) -#define stage2_pmd_free(pmd) pmd_free(NULL, pmd) +static inline bool stage2_pud_none(struct kvm *kvm, pud_t pud) +{ + if (kvm_stage2_has_pmd(kvm)) + return pud_none(pud); + else + return 0; +} + +static inline void stage2_pud_clear(struct kvm *kvm, pud_t *pud) +{ + if (kvm_stage2_has_pmd(kvm)) + pud_clear(pud); +} -#define stage2_pud_huge(pud) pud_huge(pud) -#define stage2_pmd_table_empty(pmdp) kvm_page_empty(pmdp) +static inline bool stage2_pud_present(struct kvm *kvm, pud_t pud) +{ + if (kvm_stage2_has_pmd(kvm)) + return pud_present(pud); + else + return 1; +} -static inline phys_addr_t stage2_pmd_addr_end(phys_addr_t addr, phys_addr_t end) +static inline void stage2_pud_populate(struct kvm *kvm, pud_t *pud, pmd_t *pmd) { - phys_addr_t boundary = (addr + S2_PMD_SIZE) & S2_PMD_MASK; + if (kvm_stage2_has_pmd(kvm)) + pud_populate(NULL, pud, pmd); +} - return (boundary - 1 < end - 1) ? boundary : end; +static inline pmd_t *stage2_pmd_offset(struct kvm *kvm, + pud_t *pud, unsigned long address) +{ + if (kvm_stage2_has_pmd(kvm)) + return pmd_offset(pud, address); + else + return (pmd_t *)pud; } -#endif /* STAGE2_PGTABLE_LEVELS > 2 */ +static inline void stage2_pmd_free(struct kvm *kvm, pmd_t *pmd) +{ + if (kvm_stage2_has_pmd(kvm)) + pmd_free(NULL, pmd); +} + +static inline bool stage2_pud_huge(struct kvm *kvm, pud_t pud) +{ + if (kvm_stage2_has_pmd(kvm)) + return pud_huge(pud); + else + return 0; +} + +static inline bool stage2_pmd_table_empty(struct kvm *kvm, pmd_t *pmdp) +{ + if (kvm_stage2_has_pmd(kvm)) + return kvm_page_empty(pmdp); + else + return 0; +} -#define stage2_pte_table_empty(ptep) kvm_page_empty(ptep) +static inline phys_addr_t +stage2_pmd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) +{ + if (kvm_stage2_has_pmd(kvm)) { + phys_addr_t boundary = (addr + S2_PMD_SIZE) & S2_PMD_MASK; -#if STAGE2_PGTABLE_LEVELS == 2 -#include <asm/stage2_pgtable-nopmd.h> -#elif STAGE2_PGTABLE_LEVELS == 3 -#include <asm/stage2_pgtable-nopud.h> -#endif + return (boundary - 1 < end - 1) ? boundary : end; + } else { + return end; + } +} +static inline bool stage2_pte_table_empty(struct kvm *kvm, pte_t *ptep) +{ + return kvm_page_empty(ptep); +} -#define stage2_pgd_index(addr) (((addr) >> S2_PGDIR_SHIFT) & (PTRS_PER_S2_PGD - 1)) +static inline unsigned long stage2_pgd_index(struct kvm *kvm, phys_addr_t addr) +{ + return (((addr) >> stage2_pgdir_shift(kvm)) & (stage2_pgd_ptrs(kvm) - 1)); +} -static inline phys_addr_t stage2_pgd_addr_end(phys_addr_t addr, phys_addr_t end) +static inline phys_addr_t +stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) { - phys_addr_t boundary = (addr + S2_PGDIR_SIZE) & S2_PGDIR_MASK; + phys_addr_t boundary = (addr + stage2_pgdir_size(kvm)) & stage2_pgdir_mask(kvm); return (boundary - 1 < end - 1) ? boundary : end; } diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 07256b08226c..a74f84d09412 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -338,15 +338,15 @@ int __attribute_const__ kvm_target_cpu(void) return KVM_ARM_TARGET_CORTEX_A53; case ARM_CPU_PART_CORTEX_A57: return KVM_ARM_TARGET_CORTEX_A57; - }; + } break; case ARM_CPU_IMP_APM: switch (part_number) { case APM_CPU_PART_POTENZA: return KVM_ARM_TARGET_XGENE_POTENZA; - }; + } break; - }; + } /* Return a default generic target */ return KVM_ARM_TARGET_GENERIC_V8; diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index e5e741bfffe1..35a81bebd02b 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -284,6 +284,13 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, */ run->exit_reason = KVM_EXIT_FAIL_ENTRY; return 0; + case ARM_EXCEPTION_IL: + /* + * We attempted an illegal exception return. Guest state must + * have been corrupted somehow. Give up. + */ + run->exit_reason = KVM_EXIT_FAIL_ENTRY; + return -EINVAL; default: kvm_pr_unimpl("Unsupported exception type: %d", exception_index); diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile index 2fabc2dc1966..82d1904328ad 100644 --- a/arch/arm64/kvm/hyp/Makefile +++ b/arch/arm64/kvm/hyp/Makefile @@ -19,7 +19,6 @@ obj-$(CONFIG_KVM_ARM_HOST) += switch.o obj-$(CONFIG_KVM_ARM_HOST) += fpsimd.o obj-$(CONFIG_KVM_ARM_HOST) += tlb.o obj-$(CONFIG_KVM_ARM_HOST) += hyp-entry.o -obj-$(CONFIG_KVM_ARM_HOST) += s2-setup.o # KVM code is run at a different exception code with a different map, so # compiler instrumentation that inserts callbacks or checks into the code may diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index 24b4fbafe3e4..b1f14f736962 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -162,6 +162,20 @@ el1_error: mov x0, #ARM_EXCEPTION_EL1_SERROR b __guest_exit +el2_sync: + /* Check for illegal exception return, otherwise panic */ + mrs x0, spsr_el2 + + /* if this was something else, then panic! */ + tst x0, #PSR_IL_BIT + b.eq __hyp_panic + + /* Let's attempt a recovery from the illegal exception return */ + get_vcpu_ptr x1, x0 + mov x0, #ARM_EXCEPTION_IL + b __guest_exit + + el2_error: ldp x0, x1, [sp], #16 @@ -240,7 +254,7 @@ ENTRY(__kvm_hyp_vector) invalid_vect el2t_fiq_invalid // FIQ EL2t invalid_vect el2t_error_invalid // Error EL2t - invalid_vect el2h_sync_invalid // Synchronous EL2h + valid_vect el2_sync // Synchronous EL2h invalid_vect el2h_irq_invalid // IRQ EL2h invalid_vect el2h_fiq_invalid // FIQ EL2h valid_vect el2_error // Error EL2h diff --git a/arch/arm64/kvm/hyp/s2-setup.c b/arch/arm64/kvm/hyp/s2-setup.c deleted file mode 100644 index 603e1ee83e89..000000000000 --- a/arch/arm64/kvm/hyp/s2-setup.c +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Copyright (C) 2016 - ARM Ltd - * Author: Marc Zyngier <marc.zyngier@arm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -#include <linux/types.h> -#include <asm/kvm_arm.h> -#include <asm/kvm_asm.h> -#include <asm/kvm_hyp.h> - -u32 __hyp_text __init_stage2_translation(void) -{ - u64 val = VTCR_EL2_FLAGS; - u64 parange; - u64 tmp; - - /* - * Read the PARange bits from ID_AA64MMFR0_EL1 and set the PS - * bits in VTCR_EL2. Amusingly, the PARange is 4 bits, while - * PS is only 3. Fortunately, bit 19 is RES0 in VTCR_EL2... - */ - parange = read_sysreg(id_aa64mmfr0_el1) & 7; - if (parange > ID_AA64MMFR0_PARANGE_MAX) - parange = ID_AA64MMFR0_PARANGE_MAX; - val |= parange << 16; - - /* Compute the actual PARange... */ - switch (parange) { - case 0: - parange = 32; - break; - case 1: - parange = 36; - break; - case 2: - parange = 40; - break; - case 3: - parange = 42; - break; - case 4: - parange = 44; - break; - case 5: - default: - parange = 48; - break; - } - - /* - * ... and clamp it to 40 bits, unless we have some braindead - * HW that implements less than that. In all cases, we'll - * return that value for the rest of the kernel to decide what - * to do. - */ - val |= 64 - (parange > 40 ? 40 : parange); - - /* - * Check the availability of Hardware Access Flag / Dirty Bit - * Management in ID_AA64MMFR1_EL1 and enable the feature in VTCR_EL2. - */ - tmp = (read_sysreg(id_aa64mmfr1_el1) >> ID_AA64MMFR1_HADBS_SHIFT) & 0xf; - if (tmp) - val |= VTCR_EL2_HA; - - /* - * Read the VMIDBits bits from ID_AA64MMFR1_EL1 and set the VS - * bit in VTCR_EL2. - */ - tmp = (read_sysreg(id_aa64mmfr1_el1) >> ID_AA64MMFR1_VMIDBITS_SHIFT) & 0xf; - val |= (tmp == ID_AA64MMFR1_VMIDBITS_16) ? - VTCR_EL2_VS_16BIT : - VTCR_EL2_VS_8BIT; - - write_sysreg(val, vtcr_el2); - - return parange; -} diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index ca46153d7915..7cc175c88a37 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -198,7 +198,7 @@ void deactivate_traps_vhe_put(void) static void __hyp_text __activate_vm(struct kvm *kvm) { - write_sysreg(kvm->arch.vttbr, vttbr_el2); + __load_guest_stage2(kvm); } static void __hyp_text __deactivate_vm(struct kvm_vcpu *vcpu) @@ -263,7 +263,7 @@ static bool __hyp_text __translate_far_to_hpfar(u64 far, u64 *hpfar) return false; /* Translation failed, back to guest */ /* Convert PAR to HPFAR format */ - *hpfar = ((tmp >> 12) & ((1UL << 36) - 1)) << 4; + *hpfar = PAR_TO_HPFAR(tmp); return true; } diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c index 9ce223944983..8dc285318204 100644 --- a/arch/arm64/kvm/hyp/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/sysreg-sr.c @@ -152,8 +152,25 @@ static void __hyp_text __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt) static void __hyp_text __sysreg_restore_el2_return_state(struct kvm_cpu_context *ctxt) { + u64 pstate = ctxt->gp_regs.regs.pstate; + u64 mode = pstate & PSR_AA32_MODE_MASK; + + /* + * Safety check to ensure we're setting the CPU up to enter the guest + * in a less privileged mode. + * + * If we are attempting a return to EL2 or higher in AArch64 state, + * program SPSR_EL2 with M=EL2h and the IL bit set which ensures that + * we'll take an illegal exception state exception immediately after + * the ERET to the guest. Attempts to return to AArch32 Hyp will + * result in an illegal exception return because EL2's execution state + * is determined by SCR_EL3.RW. + */ + if (!(mode & PSR_MODE32_BIT) && mode >= PSR_MODE_EL2t) + pstate = PSR_MODE_EL2h | PSR_IL_BIT; + write_sysreg_el2(ctxt->gp_regs.regs.pc, elr); - write_sysreg_el2(ctxt->gp_regs.regs.pstate, spsr); + write_sysreg_el2(pstate, spsr); if (cpus_have_const_cap(ARM64_HAS_RAS_EXTN)) write_sysreg_s(ctxt->sys_regs[DISR_EL1], SYS_VDISR_EL2); diff --git a/arch/arm64/kvm/hyp/tlb.c b/arch/arm64/kvm/hyp/tlb.c index 131c7772703c..4dbd9c69a96d 100644 --- a/arch/arm64/kvm/hyp/tlb.c +++ b/arch/arm64/kvm/hyp/tlb.c @@ -30,7 +30,7 @@ static void __hyp_text __tlb_switch_to_guest_vhe(struct kvm *kvm) * bits. Changing E2H is impossible (goodbye TTBR1_EL2), so * let's flip TGE before executing the TLB operation. */ - write_sysreg(kvm->arch.vttbr, vttbr_el2); + __load_guest_stage2(kvm); val = read_sysreg(hcr_el2); val &= ~HCR_TGE; write_sysreg(val, hcr_el2); @@ -39,7 +39,7 @@ static void __hyp_text __tlb_switch_to_guest_vhe(struct kvm *kvm) static void __hyp_text __tlb_switch_to_guest_nvhe(struct kvm *kvm) { - write_sysreg(kvm->arch.vttbr, vttbr_el2); + __load_guest_stage2(kvm); isb(); } diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index e37c78bbe1ca..b72a3dd56204 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -26,6 +26,7 @@ #include <kvm/arm_arch_timer.h> +#include <asm/cpufeature.h> #include <asm/cputype.h> #include <asm/ptrace.h> #include <asm/kvm_arm.h> @@ -33,6 +34,9 @@ #include <asm/kvm_coproc.h> #include <asm/kvm_mmu.h> +/* Maximum phys_shift supported for any VM on this host */ +static u32 kvm_ipa_limit; + /* * ARMv8 Reset Values */ @@ -55,12 +59,12 @@ static bool cpu_has_32bit_el1(void) } /** - * kvm_arch_dev_ioctl_check_extension + * kvm_arch_vm_ioctl_check_extension * * We currently assume that the number of HW registers is uniform * across all CPUs (see cpuinfo_sanity_check). */ -int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext) +int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext) { int r; @@ -82,9 +86,11 @@ int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext) break; case KVM_CAP_SET_GUEST_DEBUG: case KVM_CAP_VCPU_ATTRIBUTES: - case KVM_CAP_VCPU_EVENTS: r = 1; break; + case KVM_CAP_ARM_VM_IPA_SIZE: + r = kvm_ipa_limit; + break; default: r = 0; } @@ -133,3 +139,99 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) /* Reset timer */ return kvm_timer_vcpu_reset(vcpu); } + +void kvm_set_ipa_limit(void) +{ + unsigned int ipa_max, pa_max, va_max, parange; + + parange = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1) & 0x7; + pa_max = id_aa64mmfr0_parange_to_phys_shift(parange); + + /* Clamp the IPA limit to the PA size supported by the kernel */ + ipa_max = (pa_max > PHYS_MASK_SHIFT) ? PHYS_MASK_SHIFT : pa_max; + /* + * Since our stage2 table is dependent on the stage1 page table code, + * we must always honor the following condition: + * + * Number of levels in Stage1 >= Number of levels in Stage2. + * + * So clamp the ipa limit further down to limit the number of levels. + * Since we can concatenate upto 16 tables at entry level, we could + * go upto 4bits above the maximum VA addressible with the current + * number of levels. + */ + va_max = PGDIR_SHIFT + PAGE_SHIFT - 3; + va_max += 4; + + if (va_max < ipa_max) + ipa_max = va_max; + + /* + * If the final limit is lower than the real physical address + * limit of the CPUs, report the reason. + */ + if (ipa_max < pa_max) + pr_info("kvm: Limiting the IPA size due to kernel %s Address limit\n", + (va_max < pa_max) ? "Virtual" : "Physical"); + + WARN(ipa_max < KVM_PHYS_SHIFT, + "KVM IPA limit (%d bit) is smaller than default size\n", ipa_max); + kvm_ipa_limit = ipa_max; + kvm_info("IPA Size Limit: %dbits\n", kvm_ipa_limit); +} + +/* + * Configure the VTCR_EL2 for this VM. The VTCR value is common + * across all the physical CPUs on the system. We use system wide + * sanitised values to fill in different fields, except for Hardware + * Management of Access Flags. HA Flag is set unconditionally on + * all CPUs, as it is safe to run with or without the feature and + * the bit is RES0 on CPUs that don't support it. + */ +int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type) +{ + u64 vtcr = VTCR_EL2_FLAGS; + u32 parange, phys_shift; + u8 lvls; + + if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK) + return -EINVAL; + + phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type); + if (phys_shift) { + if (phys_shift > kvm_ipa_limit || + phys_shift < 32) + return -EINVAL; + } else { + phys_shift = KVM_PHYS_SHIFT; + } + + parange = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1) & 7; + if (parange > ID_AA64MMFR0_PARANGE_MAX) + parange = ID_AA64MMFR0_PARANGE_MAX; + vtcr |= parange << VTCR_EL2_PS_SHIFT; + + vtcr |= VTCR_EL2_T0SZ(phys_shift); + /* + * Use a minimum 2 level page table to prevent splitting + * host PMD huge pages at stage2. + */ + lvls = stage2_pgtable_levels(phys_shift); + if (lvls < 2) + lvls = 2; + vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls); + + /* + * Enable the Hardware Access Flag management, unconditionally + * on all CPUs. The features is RES0 on CPUs without the support + * and must be ignored by the CPUs. + */ + vtcr |= VTCR_EL2_HA; + + /* Set the vmid bits */ + vtcr |= (kvm_get_vmid_bits() == 16) ? + VTCR_EL2_VS_16BIT : + VTCR_EL2_VS_8BIT; + kvm->arch.vtcr = vtcr; + return 0; +} diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index e203169931c7..6390bd8c141b 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -14,6 +14,16 @@ #ifndef _ASM_X86_FIXMAP_H #define _ASM_X86_FIXMAP_H +/* + * Exposed to assembly code for setting up initial page tables. Cannot be + * calculated in assembly code (fixmap entries are an enum), but is sanity + * checked in the actual fixmap C code to make sure that the fixmap is + * covered fully. + */ +#define FIXMAP_PMD_NUM 2 +/* fixmap starts downwards from the 507th entry in level2_fixmap_pgt */ +#define FIXMAP_PMD_TOP 507 + #ifndef __ASSEMBLY__ #include <linux/kernel.h> #include <asm/acpi.h> diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index c0643831706e..616f8e637bc3 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -48,10 +48,13 @@ int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size); /* Architecture __weak replacement functions */ void __init mem_encrypt_init(void); +void __init mem_encrypt_free_decrypted_mem(void); bool sme_active(void); bool sev_active(void); +#define __bss_decrypted __attribute__((__section__(".bss..decrypted"))) + #else /* !CONFIG_AMD_MEM_ENCRYPT */ #define sme_me_mask 0ULL @@ -77,6 +80,8 @@ early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 0; static inline int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; } +#define __bss_decrypted + #endif /* CONFIG_AMD_MEM_ENCRYPT */ /* @@ -88,6 +93,8 @@ early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; #define __sme_pa(x) (__pa(x) | sme_me_mask) #define __sme_pa_nodebug(x) (__pa_nodebug(x) | sme_me_mask) +extern char __start_bss_decrypted[], __end_bss_decrypted[], __start_bss_decrypted_unused[]; + #endif /* __ASSEMBLY__ */ #endif /* __X86_MEM_ENCRYPT_H__ */ diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index ce2b59047cb8..9c85b54bf03c 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -14,6 +14,7 @@ #include <asm/processor.h> #include <linux/bitops.h> #include <linux/threads.h> +#include <asm/fixmap.h> extern p4d_t level4_kernel_pgt[512]; extern p4d_t level4_ident_pgt[512]; @@ -22,7 +23,7 @@ extern pud_t level3_ident_pgt[512]; extern pmd_t level2_kernel_pgt[512]; extern pmd_t level2_fixmap_pgt[512]; extern pmd_t level2_ident_pgt[512]; -extern pte_t level1_fixmap_pgt[512]; +extern pte_t level1_fixmap_pgt[512 * FIXMAP_PMD_NUM]; extern pgd_t init_top_pgt[]; #define swapper_pg_dir init_top_pgt diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index 4e588f36228f..285eb3ec4200 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -382,6 +382,11 @@ static inline bool is_mbm_event(int e) e <= QOS_L3_MBM_LOCAL_EVENT_ID); } +struct rdt_parse_data { + struct rdtgroup *rdtgrp; + char *buf; +}; + /** * struct rdt_resource - attributes of an RDT resource * @rid: The index of the resource @@ -423,16 +428,19 @@ struct rdt_resource { struct rdt_cache cache; struct rdt_membw membw; const char *format_str; - int (*parse_ctrlval) (void *data, struct rdt_resource *r, - struct rdt_domain *d); + int (*parse_ctrlval)(struct rdt_parse_data *data, + struct rdt_resource *r, + struct rdt_domain *d); struct list_head evt_list; int num_rmid; unsigned int mon_scale; unsigned long fflags; }; -int parse_cbm(void *_data, struct rdt_resource *r, struct rdt_domain *d); -int parse_bw(void *_buf, struct rdt_resource *r, struct rdt_domain *d); +int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r, + struct rdt_domain *d); +int parse_bw(struct rdt_parse_data *data, struct rdt_resource *r, + struct rdt_domain *d); extern struct mutex rdtgroup_mutex; @@ -536,6 +544,7 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp); void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp); struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r); int update_domains(struct rdt_resource *r, int closid); +int closids_supported(void); void closid_free(int closid); int alloc_rmid(void); void free_rmid(u32 rmid); diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c index af358ca05160..0f53049719cd 100644 --- a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c +++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c @@ -64,19 +64,19 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r) return true; } -int parse_bw(void *_buf, struct rdt_resource *r, struct rdt_domain *d) +int parse_bw(struct rdt_parse_data *data, struct rdt_resource *r, + struct rdt_domain *d) { - unsigned long data; - char *buf = _buf; + unsigned long bw_val; if (d->have_new_ctrl) { rdt_last_cmd_printf("duplicate domain %d\n", d->id); return -EINVAL; } - if (!bw_validate(buf, &data, r)) + if (!bw_validate(data->buf, &bw_val, r)) return -EINVAL; - d->new_ctrl = data; + d->new_ctrl = bw_val; d->have_new_ctrl = true; return 0; @@ -123,18 +123,13 @@ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) return true; } -struct rdt_cbm_parse_data { - struct rdtgroup *rdtgrp; - char *buf; -}; - /* * Read one cache bit mask (hex). Check that it is valid for the current * resource type. */ -int parse_cbm(void *_data, struct rdt_resource *r, struct rdt_domain *d) +int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r, + struct rdt_domain *d) { - struct rdt_cbm_parse_data *data = _data; struct rdtgroup *rdtgrp = data->rdtgrp; u32 cbm_val; @@ -195,11 +190,17 @@ int parse_cbm(void *_data, struct rdt_resource *r, struct rdt_domain *d) static int parse_line(char *line, struct rdt_resource *r, struct rdtgroup *rdtgrp) { - struct rdt_cbm_parse_data data; + struct rdt_parse_data data; char *dom = NULL, *id; struct rdt_domain *d; unsigned long dom_id; + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && + r->rid == RDT_RESOURCE_MBA) { + rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n"); + return -EINVAL; + } + next: if (!line || line[0] == '\0') return 0; diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index b799c00bef09..1b8e86a5d5e1 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -97,6 +97,12 @@ void rdt_last_cmd_printf(const char *fmt, ...) * limited as the number of resources grows. */ static int closid_free_map; +static int closid_free_map_len; + +int closids_supported(void) +{ + return closid_free_map_len; +} static void closid_init(void) { @@ -111,6 +117,7 @@ static void closid_init(void) /* CLOSID 0 is always reserved for the default group */ closid_free_map &= ~1; + closid_free_map_len = rdt_min_closid; } static int closid_alloc(void) @@ -802,7 +809,7 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, sw_shareable = 0; exclusive = 0; seq_printf(seq, "%d=", dom->id); - for (i = 0; i < r->num_closid; i++, ctrl++) { + for (i = 0; i < closids_supported(); i++, ctrl++) { if (!closid_allocated(i)) continue; mode = rdtgroup_mode_by_closid(i); @@ -989,7 +996,7 @@ bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, /* Check for overlap with other resource groups */ ctrl = d->ctrl_val; - for (i = 0; i < r->num_closid; i++, ctrl++) { + for (i = 0; i < closids_supported(); i++, ctrl++) { ctrl_b = (unsigned long *)ctrl; mode = rdtgroup_mode_by_closid(i); if (closid_allocated(i) && i != closid && @@ -1024,16 +1031,27 @@ static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) { int closid = rdtgrp->closid; struct rdt_resource *r; + bool has_cache = false; struct rdt_domain *d; for_each_alloc_enabled_rdt_resource(r) { + if (r->rid == RDT_RESOURCE_MBA) + continue; + has_cache = true; list_for_each_entry(d, &r->domains, list) { if (rdtgroup_cbm_overlaps(r, d, d->ctrl_val[closid], - rdtgrp->closid, false)) + rdtgrp->closid, false)) { + rdt_last_cmd_puts("schemata overlaps\n"); return false; + } } } + if (!has_cache) { + rdt_last_cmd_puts("cannot be exclusive without CAT/CDP\n"); + return false; + } + return true; } @@ -1085,7 +1103,6 @@ static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of, rdtgrp->mode = RDT_MODE_SHAREABLE; } else if (!strcmp(buf, "exclusive")) { if (!rdtgroup_mode_test_exclusive(rdtgrp)) { - rdt_last_cmd_printf("schemata overlaps\n"); ret = -EINVAL; goto out; } @@ -1155,8 +1172,8 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, struct rdt_resource *r; struct rdt_domain *d; unsigned int size; - bool sep = false; - u32 cbm; + bool sep; + u32 ctrl; rdtgrp = rdtgroup_kn_lock_live(of->kn); if (!rdtgrp) { @@ -1174,6 +1191,7 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, } for_each_alloc_enabled_rdt_resource(r) { + sep = false; seq_printf(s, "%*s:", max_name_width, r->name); list_for_each_entry(d, &r->domains, list) { if (sep) @@ -1181,8 +1199,13 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { size = 0; } else { - cbm = d->ctrl_val[rdtgrp->closid]; - size = rdtgroup_cbm_to_size(r, d, cbm); + ctrl = (!is_mba_sc(r) ? + d->ctrl_val[rdtgrp->closid] : + d->mbps_val[rdtgrp->closid]); + if (r->rid == RDT_RESOURCE_MBA) + size = ctrl; + else + size = rdtgroup_cbm_to_size(r, d, ctrl); } seq_printf(s, "%d=%u", d->id, size); sep = true; @@ -2336,12 +2359,18 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) u32 *ctrl; for_each_alloc_enabled_rdt_resource(r) { + /* + * Only initialize default allocations for CBM cache + * resources + */ + if (r->rid == RDT_RESOURCE_MBA) + continue; list_for_each_entry(d, &r->domains, list) { d->have_new_ctrl = false; d->new_ctrl = r->cache.shareable_bits; used_b = r->cache.shareable_bits; ctrl = d->ctrl_val; - for (i = 0; i < r->num_closid; i++, ctrl++) { + for (i = 0; i < closids_supported(); i++, ctrl++) { if (closid_allocated(i) && i != closid) { mode = rdtgroup_mode_by_closid(i); if (mode == RDT_MODE_PSEUDO_LOCKSETUP) @@ -2373,6 +2402,12 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) } for_each_alloc_enabled_rdt_resource(r) { + /* + * Only initialize default allocations for CBM cache + * resources + */ + if (r->rid == RDT_RESOURCE_MBA) + continue; ret = update_domains(r, rdtgrp->closid); if (ret < 0) { rdt_last_cmd_puts("failed to initialize allocations\n"); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 8047379e575a..ddee1f0870c4 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -35,6 +35,7 @@ #include <asm/bootparam_utils.h> #include <asm/microcode.h> #include <asm/kasan.h> +#include <asm/fixmap.h> /* * Manage page tables very early on. @@ -112,6 +113,7 @@ static bool __head check_la57_support(unsigned long physaddr) unsigned long __head __startup_64(unsigned long physaddr, struct boot_params *bp) { + unsigned long vaddr, vaddr_end; unsigned long load_delta, *p; unsigned long pgtable_flags; pgdval_t *pgd; @@ -165,7 +167,8 @@ unsigned long __head __startup_64(unsigned long physaddr, pud[511] += load_delta; pmd = fixup_pointer(level2_fixmap_pgt, physaddr); - pmd[506] += load_delta; + for (i = FIXMAP_PMD_TOP; i > FIXMAP_PMD_TOP - FIXMAP_PMD_NUM; i--) + pmd[i] += load_delta; /* * Set up the identity mapping for the switchover. These @@ -235,6 +238,21 @@ unsigned long __head __startup_64(unsigned long physaddr, sme_encrypt_kernel(bp); /* + * Clear the memory encryption mask from the .bss..decrypted section. + * The bss section will be memset to zero later in the initialization so + * there is no need to zero it after changing the memory encryption + * attribute. + */ + if (mem_encrypt_active()) { + vaddr = (unsigned long)__start_bss_decrypted; + vaddr_end = (unsigned long)__end_bss_decrypted; + for (; vaddr < vaddr_end; vaddr += PMD_SIZE) { + i = pmd_index(vaddr); + pmd[i] -= sme_get_me_mask(); + } + } + + /* * Return the SME encryption mask (if SME is active) to be used as a * modifier for the initial pgdir entry programmed into CR3. */ diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 15ebc2fc166e..a3618cf04cf6 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -24,6 +24,7 @@ #include "../entry/calling.h" #include <asm/export.h> #include <asm/nospec-branch.h> +#include <asm/fixmap.h> #ifdef CONFIG_PARAVIRT #include <asm/asm-offsets.h> @@ -445,13 +446,20 @@ NEXT_PAGE(level2_kernel_pgt) KERNEL_IMAGE_SIZE/PMD_SIZE) NEXT_PAGE(level2_fixmap_pgt) - .fill 506,8,0 - .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC - /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */ - .fill 5,8,0 + .fill (512 - 4 - FIXMAP_PMD_NUM),8,0 + pgtno = 0 + .rept (FIXMAP_PMD_NUM) + .quad level1_fixmap_pgt + (pgtno << PAGE_SHIFT) - __START_KERNEL_map \ + + _PAGE_TABLE_NOENC; + pgtno = pgtno + 1 + .endr + /* 6 MB reserved space + a 2MB hole */ + .fill 4,8,0 NEXT_PAGE(level1_fixmap_pgt) + .rept (FIXMAP_PMD_NUM) .fill 512,8,0 + .endr #undef PMDS diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 1e6764648af3..013fe3d21dbb 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -28,6 +28,7 @@ #include <linux/sched/clock.h> #include <linux/mm.h> #include <linux/slab.h> +#include <linux/set_memory.h> #include <asm/hypervisor.h> #include <asm/mem_encrypt.h> @@ -61,9 +62,10 @@ early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); (PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info)) static struct pvclock_vsyscall_time_info - hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __aligned(PAGE_SIZE); -static struct pvclock_wall_clock wall_clock; + hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __bss_decrypted __aligned(PAGE_SIZE); +static struct pvclock_wall_clock wall_clock __bss_decrypted; static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu); +static struct pvclock_vsyscall_time_info *hvclock_mem; static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void) { @@ -236,6 +238,45 @@ static void kvm_shutdown(void) native_machine_shutdown(); } +static void __init kvmclock_init_mem(void) +{ + unsigned long ncpus; + unsigned int order; + struct page *p; + int r; + + if (HVC_BOOT_ARRAY_SIZE >= num_possible_cpus()) + return; + + ncpus = num_possible_cpus() - HVC_BOOT_ARRAY_SIZE; + order = get_order(ncpus * sizeof(*hvclock_mem)); + + p = alloc_pages(GFP_KERNEL, order); + if (!p) { + pr_warn("%s: failed to alloc %d pages", __func__, (1U << order)); + return; + } + + hvclock_mem = page_address(p); + + /* + * hvclock is shared between the guest and the hypervisor, must + * be mapped decrypted. + */ + if (sev_active()) { + r = set_memory_decrypted((unsigned long) hvclock_mem, + 1UL << order); + if (r) { + __free_pages(p, order); + hvclock_mem = NULL; + pr_warn("kvmclock: set_memory_decrypted() failed. Disabling\n"); + return; + } + } + + memset(hvclock_mem, 0, PAGE_SIZE << order); +} + static int __init kvm_setup_vsyscall_timeinfo(void) { #ifdef CONFIG_X86_64 @@ -250,6 +291,9 @@ static int __init kvm_setup_vsyscall_timeinfo(void) kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK; #endif + + kvmclock_init_mem(); + return 0; } early_initcall(kvm_setup_vsyscall_timeinfo); @@ -269,8 +313,10 @@ static int kvmclock_setup_percpu(unsigned int cpu) /* Use the static page for the first CPUs, allocate otherwise */ if (cpu < HVC_BOOT_ARRAY_SIZE) p = &hv_clock_boot[cpu]; + else if (hvclock_mem) + p = hvclock_mem + cpu - HVC_BOOT_ARRAY_SIZE; else - p = kzalloc(sizeof(*p), GFP_KERNEL); + return -ENOMEM; per_cpu(hv_clock_per_cpu, cpu) = p; return p ? 0 : -ENOMEM; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index afdb303285f8..8dc69d82567e 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -91,7 +91,7 @@ unsigned paravirt_patch_call(void *insnbuf, if (len < 5) { #ifdef CONFIG_RETPOLINE - WARN_ONCE("Failing to patch indirect CALL in %ps\n", (void *)addr); + WARN_ONCE(1, "Failing to patch indirect CALL in %ps\n", (void *)addr); #endif return len; /* call too long for patch site */ } @@ -111,7 +111,7 @@ unsigned paravirt_patch_jmp(void *insnbuf, const void *target, if (len < 5) { #ifdef CONFIG_RETPOLINE - WARN_ONCE("Failing to patch indirect JMP in %ps\n", (void *)addr); + WARN_ONCE(1, "Failing to patch indirect JMP in %ps\n", (void *)addr); #endif return len; /* call too long for patch site */ } diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 8bde0a419f86..5dd3317d761f 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -65,6 +65,23 @@ jiffies_64 = jiffies; #define ALIGN_ENTRY_TEXT_BEGIN . = ALIGN(PMD_SIZE); #define ALIGN_ENTRY_TEXT_END . = ALIGN(PMD_SIZE); +/* + * This section contains data which will be mapped as decrypted. Memory + * encryption operates on a page basis. Make this section PMD-aligned + * to avoid splitting the pages while mapping the section early. + * + * Note: We use a separate section so that only this section gets + * decrypted to avoid exposing more than we wish. + */ +#define BSS_DECRYPTED \ + . = ALIGN(PMD_SIZE); \ + __start_bss_decrypted = .; \ + *(.bss..decrypted); \ + . = ALIGN(PAGE_SIZE); \ + __start_bss_decrypted_unused = .; \ + . = ALIGN(PMD_SIZE); \ + __end_bss_decrypted = .; \ + #else #define X86_ALIGN_RODATA_BEGIN @@ -74,6 +91,7 @@ jiffies_64 = jiffies; #define ALIGN_ENTRY_TEXT_BEGIN #define ALIGN_ENTRY_TEXT_END +#define BSS_DECRYPTED #endif @@ -355,6 +373,7 @@ SECTIONS __bss_start = .; *(.bss..page_aligned) *(.bss) + BSS_DECRYPTED . = ALIGN(PAGE_SIZE); __bss_stop = .; } diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 7a8fc26c1115..faca978ebf9d 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -815,10 +815,14 @@ void free_kernel_image_pages(void *begin, void *end) set_memory_np_noalias(begin_ul, len_pages); } +void __weak mem_encrypt_free_decrypted_mem(void) { } + void __ref free_initmem(void) { e820__reallocate_tables(); + mem_encrypt_free_decrypted_mem(); + free_kernel_image_pages(&__init_begin, &__init_end); } diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index b2de398d1fd3..006f373f54ab 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -348,6 +348,30 @@ bool sev_active(void) EXPORT_SYMBOL(sev_active); /* Architecture __weak replacement functions */ +void __init mem_encrypt_free_decrypted_mem(void) +{ + unsigned long vaddr, vaddr_end, npages; + int r; + + vaddr = (unsigned long)__start_bss_decrypted_unused; + vaddr_end = (unsigned long)__end_bss_decrypted; + npages = (vaddr_end - vaddr) >> PAGE_SHIFT; + + /* + * The unused memory range was mapped decrypted, change the encryption + * attribute from decrypted to encrypted before freeing it. + */ + if (mem_encrypt_active()) { + r = set_memory_encrypted(vaddr, npages); + if (r) { + pr_warn("failed to free unused decrypted pages\n"); + return; + } + } + + free_init_pages("unused decrypted", vaddr, vaddr_end); +} + void __init mem_encrypt_init(void) { if (!sme_me_mask) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index ae394552fb94..089e78c4effd 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -637,6 +637,15 @@ void __native_set_fixmap(enum fixed_addresses idx, pte_t pte) { unsigned long address = __fix_to_virt(idx); +#ifdef CONFIG_X86_64 + /* + * Ensure that the static initial page tables are covering the + * fixmap completely. + */ + BUILD_BUG_ON(__end_of_permanent_fixed_addresses > + (FIXMAP_PMD_NUM * PTRS_PER_PTE)); +#endif + if (idx >= __end_of_fixed_addresses) { BUG(); return; diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 2fe5c9b1816b..dd461c0167ef 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -1907,7 +1907,7 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) /* L3_k[511] -> level2_fixmap_pgt */ convert_pfn_mfn(level3_kernel_pgt); - /* L3_k[511][506] -> level1_fixmap_pgt */ + /* L3_k[511][508-FIXMAP_PMD_NUM ... 507] -> level1_fixmap_pgt */ convert_pfn_mfn(level2_fixmap_pgt); /* We get [511][511] and have Xen's version of level2_kernel_pgt */ @@ -1952,7 +1952,11 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO); set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); - set_page_prot(level1_fixmap_pgt, PAGE_KERNEL_RO); + + for (i = 0; i < FIXMAP_PMD_NUM; i++) { + set_page_prot(level1_fixmap_pgt + i * PTRS_PER_PTE, + PAGE_KERNEL_RO); + } /* Pin down new L4 */ pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c index 7d00d4ad44d4..95997e6c0696 100644 --- a/arch/x86/xen/pmu.c +++ b/arch/x86/xen/pmu.c @@ -478,7 +478,7 @@ static void xen_convert_regs(const struct xen_pmu_regs *xen_regs, irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id) { int err, ret = IRQ_NONE; - struct pt_regs regs; + struct pt_regs regs = {0}; const struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); uint8_t xenpmu_flags = get_xenpmu_flags(); diff --git a/block/bio.c b/block/bio.c index 8c680a776171..0093bed81c0e 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1684,7 +1684,7 @@ void generic_end_io_acct(struct request_queue *q, int req_op, const int sgrp = op_stat_group(req_op); int cpu = part_stat_lock(); - part_stat_add(cpu, part, ticks[sgrp], duration); + part_stat_add(cpu, part, nsecs[sgrp], jiffies_to_nsecs(duration)); part_round_stats(q, cpu, part); part_dec_in_flight(q, part, op_is_write(req_op)); diff --git a/block/blk-core.c b/block/blk-core.c index 4dbc93f43b38..cff0a60ee200 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2733,17 +2733,15 @@ void blk_account_io_done(struct request *req, u64 now) * containing request is enough. */ if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) { - unsigned long duration; const int sgrp = op_stat_group(req_op(req)); struct hd_struct *part; int cpu; - duration = nsecs_to_jiffies(now - req->start_time_ns); cpu = part_stat_lock(); part = req->part; part_stat_inc(cpu, part, ios[sgrp]); - part_stat_add(cpu, part, ticks[sgrp], duration); + part_stat_add(cpu, part, nsecs[sgrp], now - req->start_time_ns); part_round_stats(req->q, cpu, part); part_dec_in_flight(req->q, part, rq_data_dir(req)); diff --git a/block/genhd.c b/block/genhd.c index 8cc719a37b32..be5bab20b2ab 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1343,18 +1343,18 @@ static int diskstats_show(struct seq_file *seqf, void *v) part_stat_read(hd, ios[STAT_READ]), part_stat_read(hd, merges[STAT_READ]), part_stat_read(hd, sectors[STAT_READ]), - jiffies_to_msecs(part_stat_read(hd, ticks[STAT_READ])), + (unsigned int)part_stat_read_msecs(hd, STAT_READ), part_stat_read(hd, ios[STAT_WRITE]), part_stat_read(hd, merges[STAT_WRITE]), part_stat_read(hd, sectors[STAT_WRITE]), - jiffies_to_msecs(part_stat_read(hd, ticks[STAT_WRITE])), + (unsigned int)part_stat_read_msecs(hd, STAT_WRITE), inflight[0], jiffies_to_msecs(part_stat_read(hd, io_ticks)), jiffies_to_msecs(part_stat_read(hd, time_in_queue)), part_stat_read(hd, ios[STAT_DISCARD]), part_stat_read(hd, merges[STAT_DISCARD]), part_stat_read(hd, sectors[STAT_DISCARD]), - jiffies_to_msecs(part_stat_read(hd, ticks[STAT_DISCARD])) + (unsigned int)part_stat_read_msecs(hd, STAT_DISCARD) ); } disk_part_iter_exit(&piter); diff --git a/block/partition-generic.c b/block/partition-generic.c index 5a8975a1201c..d3d14e81fb12 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -136,18 +136,18 @@ ssize_t part_stat_show(struct device *dev, part_stat_read(p, ios[STAT_READ]), part_stat_read(p, merges[STAT_READ]), (unsigned long long)part_stat_read(p, sectors[STAT_READ]), - jiffies_to_msecs(part_stat_read(p, ticks[STAT_READ])), + (unsigned int)part_stat_read_msecs(p, STAT_READ), part_stat_read(p, ios[STAT_WRITE]), part_stat_read(p, merges[STAT_WRITE]), (unsigned long long)part_stat_read(p, sectors[STAT_WRITE]), - jiffies_to_msecs(part_stat_read(p, ticks[STAT_WRITE])), + (unsigned int)part_stat_read_msecs(p, STAT_WRITE), inflight[0], jiffies_to_msecs(part_stat_read(p, io_ticks)), jiffies_to_msecs(part_stat_read(p, time_in_queue)), part_stat_read(p, ios[STAT_DISCARD]), part_stat_read(p, merges[STAT_DISCARD]), (unsigned long long)part_stat_read(p, sectors[STAT_DISCARD]), - jiffies_to_msecs(part_stat_read(p, ticks[STAT_DISCARD]))); + (unsigned int)part_stat_read_msecs(p, STAT_DISCARD)); } ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig index d8e159feb573..89110dfc7127 100644 --- a/drivers/firmware/efi/Kconfig +++ b/drivers/firmware/efi/Kconfig @@ -90,14 +90,17 @@ config EFI_ARMSTUB config EFI_ARMSTUB_DTB_LOADER bool "Enable the DTB loader" depends on EFI_ARMSTUB + default y help Select this config option to add support for the dtb= command line parameter, allowing a device tree blob to be loaded into memory from the EFI System Partition by the stub. - The device tree is typically provided by the platform or by - the bootloader, so this option is mostly for development - purposes only. + If the device tree is provided by the platform or by + the bootloader this option may not be needed. + But, for various development reasons and to maintain existing + functionality for bootloaders that do not have such support + this option is necessary. config EFI_BOOTLOADER_CONTROL tristate "EFI Bootloader Control" diff --git a/drivers/mfd/omap-usb-host.c b/drivers/mfd/omap-usb-host.c index e11ab12fbdf2..800986a79704 100644 --- a/drivers/mfd/omap-usb-host.c +++ b/drivers/mfd/omap-usb-host.c @@ -528,8 +528,8 @@ static int usbhs_omap_get_dt_pdata(struct device *dev, } static const struct of_device_id usbhs_child_match_table[] = { - { .compatible = "ti,omap-ehci", }, - { .compatible = "ti,omap-ohci", }, + { .compatible = "ti,ehci-omap", }, + { .compatible = "ti,ohci-omap3", }, { } }; @@ -855,6 +855,7 @@ static struct platform_driver usbhs_omap_driver = { .pm = &usbhsomap_dev_pm_ops, .of_match_table = usbhs_omap_dt_ids, }, + .probe = usbhs_omap_probe, .remove = usbhs_omap_remove, }; @@ -864,9 +865,9 @@ MODULE_ALIAS("platform:" USBHS_DRIVER_NAME); MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("usb host common core driver for omap EHCI and OHCI"); -static int __init omap_usbhs_drvinit(void) +static int omap_usbhs_drvinit(void) { - return platform_driver_probe(&usbhs_omap_driver, usbhs_omap_probe); + return platform_driver_register(&usbhs_omap_driver); } /* @@ -878,7 +879,7 @@ static int __init omap_usbhs_drvinit(void) */ fs_initcall_sync(omap_usbhs_drvinit); -static void __exit omap_usbhs_drvexit(void) +static void omap_usbhs_drvexit(void) { platform_driver_unregister(&usbhs_omap_driver); } diff --git a/drivers/pinctrl/intel/pinctrl-cannonlake.c b/drivers/pinctrl/intel/pinctrl-cannonlake.c index fb1afe55bf53..8d48371caaa2 100644 --- a/drivers/pinctrl/intel/pinctrl-cannonlake.c +++ b/drivers/pinctrl/intel/pinctrl-cannonlake.c @@ -379,7 +379,7 @@ static const struct intel_padgroup cnlh_community1_gpps[] = { static const struct intel_padgroup cnlh_community3_gpps[] = { CNL_GPP(0, 155, 178, 192), /* GPP_K */ CNL_GPP(1, 179, 202, 224), /* GPP_H */ - CNL_GPP(2, 203, 215, 258), /* GPP_E */ + CNL_GPP(2, 203, 215, 256), /* GPP_E */ CNL_GPP(3, 216, 239, 288), /* GPP_F */ CNL_GPP(4, 240, 248, CNL_NO_GPIO), /* SPI */ }; diff --git a/drivers/pinctrl/intel/pinctrl-intel.c b/drivers/pinctrl/intel/pinctrl-intel.c index 62b009b27eda..ec8dafc94694 100644 --- a/drivers/pinctrl/intel/pinctrl-intel.c +++ b/drivers/pinctrl/intel/pinctrl-intel.c @@ -747,13 +747,63 @@ static const struct pinctrl_desc intel_pinctrl_desc = { .owner = THIS_MODULE, }; +/** + * intel_gpio_to_pin() - Translate from GPIO offset to pin number + * @pctrl: Pinctrl structure + * @offset: GPIO offset from gpiolib + * @commmunity: Community is filled here if not %NULL + * @padgrp: Pad group is filled here if not %NULL + * + * When coming through gpiolib irqchip, the GPIO offset is not + * automatically translated to pinctrl pin number. This function can be + * used to find out the corresponding pinctrl pin. + */ +static int intel_gpio_to_pin(struct intel_pinctrl *pctrl, unsigned offset, + const struct intel_community **community, + const struct intel_padgroup **padgrp) +{ + int i; + + for (i = 0; i < pctrl->ncommunities; i++) { + const struct intel_community *comm = &pctrl->communities[i]; + int j; + + for (j = 0; j < comm->ngpps; j++) { + const struct intel_padgroup *pgrp = &comm->gpps[j]; + + if (pgrp->gpio_base < 0) + continue; + + if (offset >= pgrp->gpio_base && + offset < pgrp->gpio_base + pgrp->size) { + int pin; + + pin = pgrp->base + offset - pgrp->gpio_base; + if (community) + *community = comm; + if (padgrp) + *padgrp = pgrp; + + return pin; + } + } + } + + return -EINVAL; +} + static int intel_gpio_get(struct gpio_chip *chip, unsigned offset) { struct intel_pinctrl *pctrl = gpiochip_get_data(chip); void __iomem *reg; u32 padcfg0; + int pin; + + pin = intel_gpio_to_pin(pctrl, offset, NULL, NULL); + if (pin < 0) + return -EINVAL; - reg = intel_get_padcfg(pctrl, offset, PADCFG0); + reg = intel_get_padcfg(pctrl, pin, PADCFG0); if (!reg) return -EINVAL; @@ -770,8 +820,13 @@ static void intel_gpio_set(struct gpio_chip *chip, unsigned offset, int value) unsigned long flags; void __iomem *reg; u32 padcfg0; + int pin; + + pin = intel_gpio_to_pin(pctrl, offset, NULL, NULL); + if (pin < 0) + return; - reg = intel_get_padcfg(pctrl, offset, PADCFG0); + reg = intel_get_padcfg(pctrl, pin, PADCFG0); if (!reg) return; @@ -790,8 +845,13 @@ static int intel_gpio_get_direction(struct gpio_chip *chip, unsigned int offset) struct intel_pinctrl *pctrl = gpiochip_get_data(chip); void __iomem *reg; u32 padcfg0; + int pin; - reg = intel_get_padcfg(pctrl, offset, PADCFG0); + pin = intel_gpio_to_pin(pctrl, offset, NULL, NULL); + if (pin < 0) + return -EINVAL; + + reg = intel_get_padcfg(pctrl, pin, PADCFG0); if (!reg) return -EINVAL; @@ -827,51 +887,6 @@ static const struct gpio_chip intel_gpio_chip = { .set_config = gpiochip_generic_config, }; -/** - * intel_gpio_to_pin() - Translate from GPIO offset to pin number - * @pctrl: Pinctrl structure - * @offset: GPIO offset from gpiolib - * @commmunity: Community is filled here if not %NULL - * @padgrp: Pad group is filled here if not %NULL - * - * When coming through gpiolib irqchip, the GPIO offset is not - * automatically translated to pinctrl pin number. This function can be - * used to find out the corresponding pinctrl pin. - */ -static int intel_gpio_to_pin(struct intel_pinctrl *pctrl, unsigned offset, - const struct intel_community **community, - const struct intel_padgroup **padgrp) -{ - int i; - - for (i = 0; i < pctrl->ncommunities; i++) { - const struct intel_community *comm = &pctrl->communities[i]; - int j; - - for (j = 0; j < comm->ngpps; j++) { - const struct intel_padgroup *pgrp = &comm->gpps[j]; - - if (pgrp->gpio_base < 0) - continue; - - if (offset >= pgrp->gpio_base && - offset < pgrp->gpio_base + pgrp->size) { - int pin; - - pin = pgrp->base + offset - pgrp->gpio_base; - if (community) - *community = comm; - if (padgrp) - *padgrp = pgrp; - - return pin; - } - } - } - - return -EINVAL; -} - static int intel_gpio_irq_reqres(struct irq_data *d) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 7bafa703a992..84575baceebc 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -1040,18 +1040,33 @@ int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, return ret; for (i = 0; i < count; i++) { - /* Retry eagain maps */ - if (map_ops[i].status == GNTST_eagain) - gnttab_retry_eagain_gop(GNTTABOP_map_grant_ref, map_ops + i, - &map_ops[i].status, __func__); - - if (map_ops[i].status == GNTST_okay) { + switch (map_ops[i].status) { + case GNTST_okay: + { struct xen_page_foreign *foreign; SetPageForeign(pages[i]); foreign = xen_page_foreign(pages[i]); foreign->domid = map_ops[i].dom; foreign->gref = map_ops[i].ref; + break; + } + + case GNTST_no_device_space: + pr_warn_ratelimited("maptrack limit reached, can't map all guest pages\n"); + break; + + case GNTST_eagain: + /* Retry eagain maps */ + gnttab_retry_eagain_gop(GNTTABOP_map_grant_ref, + map_ops + i, + &map_ops[i].status, __func__); + /* Test status in next loop iteration. */ + i--; + break; + + default: + break; } } diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 57864422a2c8..25c08c6c7f99 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -83,10 +83,10 @@ struct partition { } __attribute__((packed)); struct disk_stats { + u64 nsecs[NR_STAT_GROUPS]; unsigned long sectors[NR_STAT_GROUPS]; unsigned long ios[NR_STAT_GROUPS]; unsigned long merges[NR_STAT_GROUPS]; - unsigned long ticks[NR_STAT_GROUPS]; unsigned long io_ticks; unsigned long time_in_queue; }; @@ -354,6 +354,9 @@ static inline void free_part_stats(struct hd_struct *part) #endif /* CONFIG_SMP */ +#define part_stat_read_msecs(part, which) \ + div_u64(part_stat_read(part, nsecs[which]), NSEC_PER_MSEC) + #define part_stat_read_accum(part, field) \ (part_stat_read(part, field[STAT_READ]) + \ part_stat_read(part, field[STAT_WRITE]) + \ diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 8bdbb5f29494..74b0aa9c7499 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -357,6 +357,8 @@ #define GITS_CBASER_RaWaWt GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWaWt) #define GITS_CBASER_RaWaWb GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWaWb) +#define GITS_CBASER_ADDRESS(cbaser) ((cbaser) & GENMASK_ULL(51, 12)) + #define GITS_BASER_NR_REGS 8 #define GITS_BASER_VALID (1ULL << 63) @@ -388,6 +390,9 @@ #define GITS_BASER_ENTRY_SIZE_MASK GENMASK_ULL(52, 48) #define GITS_BASER_PHYS_52_to_48(phys) \ (((phys) & GENMASK_ULL(47, 16)) | (((phys) >> 48) & 0xf) << 12) +#define GITS_BASER_ADDR_48_to_52(baser) \ + (((baser) & GENMASK_ULL(47, 16)) | (((baser) >> 12) & 0xf) << 48) + #define GITS_BASER_SHAREABILITY_SHIFT (10) #define GITS_BASER_InnerShareable \ GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable) diff --git a/include/linux/mfd/da9063/pdata.h b/include/linux/mfd/da9063/pdata.h index 8a125701ef7b..50bed4f89c1a 100644 --- a/include/linux/mfd/da9063/pdata.h +++ b/include/linux/mfd/da9063/pdata.h @@ -21,7 +21,7 @@ /* * Regulator configuration */ -/* DA9063 regulator IDs */ +/* DA9063 and DA9063L regulator IDs */ enum { /* BUCKs */ DA9063_ID_BCORE1, @@ -37,18 +37,20 @@ enum { DA9063_ID_BMEM_BIO_MERGED, /* When two BUCKs are merged, they cannot be reused separately */ - /* LDOs */ + /* LDOs on both DA9063 and DA9063L */ + DA9063_ID_LDO3, + DA9063_ID_LDO7, + DA9063_ID_LDO8, + DA9063_ID_LDO9, + DA9063_ID_LDO11, + + /* DA9063-only LDOs */ DA9063_ID_LDO1, DA9063_ID_LDO2, - DA9063_ID_LDO3, DA9063_ID_LDO4, DA9063_ID_LDO5, DA9063_ID_LDO6, - DA9063_ID_LDO7, - DA9063_ID_LDO8, - DA9063_ID_LDO9, DA9063_ID_LDO10, - DA9063_ID_LDO11, }; /* Regulators platform data */ diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index cb6d44e1fe02..2b7a652c9fa4 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -758,6 +758,15 @@ struct kvm_ppc_resize_hpt { #define KVM_S390_SIE_PAGE_OFFSET 1 /* + * On arm64, machine type can be used to request the physical + * address size for the VM. Bits[7-0] are reserved for the guest + * PA size shift (i.e, log2(PA_Size)). For backward compatibility, + * value 0 implies the default IPA size, 40bits. + */ +#define KVM_VM_TYPE_ARM_IPA_SIZE_MASK 0xffULL +#define KVM_VM_TYPE_ARM_IPA_SIZE(x) \ + ((x) & KVM_VM_TYPE_ARM_IPA_SIZE_MASK) +/* * ioctls for /dev/kvm fds: */ #define KVM_GET_API_VERSION _IO(KVMIO, 0x00) @@ -965,6 +974,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_COALESCED_PIO 162 #define KVM_CAP_HYPERV_ENLIGHTENED_VMCS 163 #define KVM_CAP_EXCEPTION_PAYLOAD 164 +#define KVM_CAP_ARM_VM_IPA_SIZE 165 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build index 13a861135127..6eb9bacd1948 100644 --- a/tools/lib/bpf/Build +++ b/tools/lib/bpf/Build @@ -1 +1 @@ -libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o +libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 2abd0f112627..bdb94939fd60 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -50,6 +50,7 @@ #include "libbpf.h" #include "bpf.h" #include "btf.h" +#include "str_error.h" #ifndef EM_BPF #define EM_BPF 247 @@ -469,7 +470,7 @@ static int bpf_object__elf_init(struct bpf_object *obj) obj->efile.fd = open(obj->path, O_RDONLY); if (obj->efile.fd < 0) { char errmsg[STRERR_BUFSIZE]; - char *cp = strerror_r(errno, errmsg, sizeof(errmsg)); + char *cp = str_error(errno, errmsg, sizeof(errmsg)); pr_warning("failed to open %s: %s\n", obj->path, cp); return -errno; @@ -810,8 +811,7 @@ static int bpf_object__elf_collect(struct bpf_object *obj) data->d_size, name, idx); if (err) { char errmsg[STRERR_BUFSIZE]; - char *cp = strerror_r(-err, errmsg, - sizeof(errmsg)); + char *cp = str_error(-err, errmsg, sizeof(errmsg)); pr_warning("failed to alloc program %s (%s): %s", name, obj->path, cp); @@ -1140,7 +1140,7 @@ bpf_object__create_maps(struct bpf_object *obj) *pfd = bpf_create_map_xattr(&create_attr); if (*pfd < 0 && create_attr.btf_key_type_id) { - cp = strerror_r(errno, errmsg, sizeof(errmsg)); + cp = str_error(errno, errmsg, sizeof(errmsg)); pr_warning("Error in bpf_create_map_xattr(%s):%s(%d). Retrying without BTF.\n", map->name, cp, errno); create_attr.btf_fd = 0; @@ -1155,7 +1155,7 @@ bpf_object__create_maps(struct bpf_object *obj) size_t j; err = *pfd; - cp = strerror_r(errno, errmsg, sizeof(errmsg)); + cp = str_error(errno, errmsg, sizeof(errmsg)); pr_warning("failed to create map (name: '%s'): %s\n", map->name, cp); for (j = 0; j < i; j++) @@ -1339,7 +1339,7 @@ load_program(enum bpf_prog_type type, enum bpf_attach_type expected_attach_type, } ret = -LIBBPF_ERRNO__LOAD; - cp = strerror_r(errno, errmsg, sizeof(errmsg)); + cp = str_error(errno, errmsg, sizeof(errmsg)); pr_warning("load bpf program failed: %s\n", cp); if (log_buf && log_buf[0] != '\0') { @@ -1654,7 +1654,7 @@ static int check_path(const char *path) dir = dirname(dname); if (statfs(dir, &st_fs)) { - cp = strerror_r(errno, errmsg, sizeof(errmsg)); + cp = str_error(errno, errmsg, sizeof(errmsg)); pr_warning("failed to statfs %s: %s\n", dir, cp); err = -errno; } @@ -1690,7 +1690,7 @@ int bpf_program__pin_instance(struct bpf_program *prog, const char *path, } if (bpf_obj_pin(prog->instances.fds[instance], path)) { - cp = strerror_r(errno, errmsg, sizeof(errmsg)); + cp = str_error(errno, errmsg, sizeof(errmsg)); pr_warning("failed to pin program: %s\n", cp); return -errno; } @@ -1708,7 +1708,7 @@ static int make_dir(const char *path) err = -errno; if (err) { - cp = strerror_r(-err, errmsg, sizeof(errmsg)); + cp = str_error(-err, errmsg, sizeof(errmsg)); pr_warning("failed to mkdir %s: %s\n", path, cp); } return err; @@ -1770,7 +1770,7 @@ int bpf_map__pin(struct bpf_map *map, const char *path) } if (bpf_obj_pin(map->fd, path)) { - cp = strerror_r(errno, errmsg, sizeof(errmsg)); + cp = str_error(errno, errmsg, sizeof(errmsg)); pr_warning("failed to pin map: %s\n", cp); return -errno; } diff --git a/tools/lib/bpf/str_error.c b/tools/lib/bpf/str_error.c new file mode 100644 index 000000000000..b8798114a357 --- /dev/null +++ b/tools/lib/bpf/str_error.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: LGPL-2.1 +#undef _GNU_SOURCE +#include <string.h> +#include <stdio.h> +#include "str_error.h" + +/* + * Wrapper to allow for building in non-GNU systems such as Alpine Linux's musl + * libc, while checking strerror_r() return to avoid having to check this in + * all places calling it. + */ +char *str_error(int err, char *dst, int len) +{ + int ret = strerror_r(err, dst, len); + if (ret) + snprintf(dst, len, "ERROR: strerror_r(%d)=%d", err, ret); + return dst; +} diff --git a/tools/lib/bpf/str_error.h b/tools/lib/bpf/str_error.h new file mode 100644 index 000000000000..355b1db571d1 --- /dev/null +++ b/tools/lib/bpf/str_error.h @@ -0,0 +1,6 @@ +// SPDX-License-Identifier: LGPL-2.1 +#ifndef BPF_STR_ERROR +#define BPF_STR_ERROR + +char *str_error(int err, char *dst, int len); +#endif // BPF_STR_ERROR diff --git a/tools/perf/Documentation/Makefile b/tools/perf/Documentation/Makefile index 42261a9b280e..ac841bc5c35b 100644 --- a/tools/perf/Documentation/Makefile +++ b/tools/perf/Documentation/Makefile @@ -280,7 +280,7 @@ $(MAN_HTML): $(OUTPUT)%.html : %.txt mv $@+ $@ ifdef USE_ASCIIDOCTOR -$(OUTPUT)%.1 $(OUTPUT)%.5 $(OUTPUT)%.7 : $(OUTPUT)%.txt +$(OUTPUT)%.1 $(OUTPUT)%.5 $(OUTPUT)%.7 : %.txt $(QUIET_ASCIIDOC)$(RM) $@+ $@ && \ $(ASCIIDOC) -b manpage -d manpage \ $(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) -o $@+ $< && \ diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index c92053bc3f96..11b98b2b0486 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -120,8 +120,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { int ret, cpu; - if (type) - return -EINVAL; + ret = kvm_arm_setup_stage2(kvm, type); + if (ret) + return ret; kvm->arch.last_vcpu_ran = alloc_percpu(typeof(*kvm->arch.last_vcpu_ran)); if (!kvm->arch.last_vcpu_ran) @@ -212,6 +213,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_READONLY_MEM: case KVM_CAP_MP_STATE: case KVM_CAP_IMMEDIATE_EXIT: + case KVM_CAP_VCPU_EVENTS: r = 1; break; case KVM_CAP_ARM_SET_DEVICE_ADDR: @@ -240,7 +242,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = 1; break; default: - r = kvm_arch_dev_ioctl_check_extension(kvm, ext); + r = kvm_arch_vm_ioctl_check_extension(kvm, ext); break; } return r; @@ -544,7 +546,7 @@ static void update_vttbr(struct kvm *kvm) /* update vttbr to be used with the new vmid */ pgd_phys = virt_to_phys(kvm->arch.pgd); - BUG_ON(pgd_phys & ~VTTBR_BADDR_MASK); + BUG_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm)); vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK(kvm_vmid_bits); kvm->arch.vttbr = kvm_phys_to_vttbr(pgd_phys) | vmid; @@ -1295,8 +1297,6 @@ static void cpu_init_hyp_mode(void *dummy) __cpu_init_hyp_mode(pgd_ptr, hyp_stack_ptr, vector_ptr); __cpu_init_stage2(); - - kvm_arm_init_debug(); } static void cpu_hyp_reset(void) @@ -1309,16 +1309,12 @@ static void cpu_hyp_reinit(void) { cpu_hyp_reset(); - if (is_kernel_in_hyp_mode()) { - /* - * __cpu_init_stage2() is safe to call even if the PM - * event was cancelled before the CPU was reset. - */ - __cpu_init_stage2(); + if (is_kernel_in_hyp_mode()) kvm_timer_init_vhe(); - } else { + else cpu_init_hyp_mode(NULL); - } + + kvm_arm_init_debug(); if (vgic_present) kvm_vgic_init_cpu_hardware(); @@ -1412,6 +1408,8 @@ static int init_common_resources(void) kvm_vmid_bits = kvm_get_vmid_bits(); kvm_info("%d-bit VMID\n", kvm_vmid_bits); + kvm_set_ipa_limit(); + return 0; } diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index ed162a6c57c5..c23a1b323aad 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -45,7 +45,6 @@ static phys_addr_t hyp_idmap_vector; static unsigned long io_map_base; -#define S2_PGD_SIZE (PTRS_PER_S2_PGD * sizeof(pgd_t)) #define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t)) #define KVM_S2PTE_FLAG_IS_IOMAP (1UL << 0) @@ -150,20 +149,20 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr) { - pud_t *pud_table __maybe_unused = stage2_pud_offset(pgd, 0UL); - stage2_pgd_clear(pgd); + pud_t *pud_table __maybe_unused = stage2_pud_offset(kvm, pgd, 0UL); + stage2_pgd_clear(kvm, pgd); kvm_tlb_flush_vmid_ipa(kvm, addr); - stage2_pud_free(pud_table); + stage2_pud_free(kvm, pud_table); put_page(virt_to_page(pgd)); } static void clear_stage2_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr) { - pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(pud, 0); - VM_BUG_ON(stage2_pud_huge(*pud)); - stage2_pud_clear(pud); + pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(kvm, pud, 0); + VM_BUG_ON(stage2_pud_huge(kvm, *pud)); + stage2_pud_clear(kvm, pud); kvm_tlb_flush_vmid_ipa(kvm, addr); - stage2_pmd_free(pmd_table); + stage2_pmd_free(kvm, pmd_table); put_page(virt_to_page(pud)); } @@ -252,7 +251,7 @@ static void unmap_stage2_ptes(struct kvm *kvm, pmd_t *pmd, } } while (pte++, addr += PAGE_SIZE, addr != end); - if (stage2_pte_table_empty(start_pte)) + if (stage2_pte_table_empty(kvm, start_pte)) clear_stage2_pmd_entry(kvm, pmd, start_addr); } @@ -262,9 +261,9 @@ static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud, phys_addr_t next, start_addr = addr; pmd_t *pmd, *start_pmd; - start_pmd = pmd = stage2_pmd_offset(pud, addr); + start_pmd = pmd = stage2_pmd_offset(kvm, pud, addr); do { - next = stage2_pmd_addr_end(addr, end); + next = stage2_pmd_addr_end(kvm, addr, end); if (!pmd_none(*pmd)) { if (pmd_thp_or_huge(*pmd)) { pmd_t old_pmd = *pmd; @@ -281,7 +280,7 @@ static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud, } } while (pmd++, addr = next, addr != end); - if (stage2_pmd_table_empty(start_pmd)) + if (stage2_pmd_table_empty(kvm, start_pmd)) clear_stage2_pud_entry(kvm, pud, start_addr); } @@ -291,14 +290,14 @@ static void unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd, phys_addr_t next, start_addr = addr; pud_t *pud, *start_pud; - start_pud = pud = stage2_pud_offset(pgd, addr); + start_pud = pud = stage2_pud_offset(kvm, pgd, addr); do { - next = stage2_pud_addr_end(addr, end); - if (!stage2_pud_none(*pud)) { - if (stage2_pud_huge(*pud)) { + next = stage2_pud_addr_end(kvm, addr, end); + if (!stage2_pud_none(kvm, *pud)) { + if (stage2_pud_huge(kvm, *pud)) { pud_t old_pud = *pud; - stage2_pud_clear(pud); + stage2_pud_clear(kvm, pud); kvm_tlb_flush_vmid_ipa(kvm, addr); kvm_flush_dcache_pud(old_pud); put_page(virt_to_page(pud)); @@ -308,7 +307,7 @@ static void unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd, } } while (pud++, addr = next, addr != end); - if (stage2_pud_table_empty(start_pud)) + if (stage2_pud_table_empty(kvm, start_pud)) clear_stage2_pgd_entry(kvm, pgd, start_addr); } @@ -332,7 +331,7 @@ static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size) assert_spin_locked(&kvm->mmu_lock); WARN_ON(size & ~PAGE_MASK); - pgd = kvm->arch.pgd + stage2_pgd_index(addr); + pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr); do { /* * Make sure the page table is still active, as another thread @@ -341,8 +340,8 @@ static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size) */ if (!READ_ONCE(kvm->arch.pgd)) break; - next = stage2_pgd_addr_end(addr, end); - if (!stage2_pgd_none(*pgd)) + next = stage2_pgd_addr_end(kvm, addr, end); + if (!stage2_pgd_none(kvm, *pgd)) unmap_stage2_puds(kvm, pgd, addr, next); /* * If the range is too large, release the kvm->mmu_lock @@ -371,9 +370,9 @@ static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud, pmd_t *pmd; phys_addr_t next; - pmd = stage2_pmd_offset(pud, addr); + pmd = stage2_pmd_offset(kvm, pud, addr); do { - next = stage2_pmd_addr_end(addr, end); + next = stage2_pmd_addr_end(kvm, addr, end); if (!pmd_none(*pmd)) { if (pmd_thp_or_huge(*pmd)) kvm_flush_dcache_pmd(*pmd); @@ -389,11 +388,11 @@ static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd, pud_t *pud; phys_addr_t next; - pud = stage2_pud_offset(pgd, addr); + pud = stage2_pud_offset(kvm, pgd, addr); do { - next = stage2_pud_addr_end(addr, end); - if (!stage2_pud_none(*pud)) { - if (stage2_pud_huge(*pud)) + next = stage2_pud_addr_end(kvm, addr, end); + if (!stage2_pud_none(kvm, *pud)) { + if (stage2_pud_huge(kvm, *pud)) kvm_flush_dcache_pud(*pud); else stage2_flush_pmds(kvm, pud, addr, next); @@ -409,10 +408,11 @@ static void stage2_flush_memslot(struct kvm *kvm, phys_addr_t next; pgd_t *pgd; - pgd = kvm->arch.pgd + stage2_pgd_index(addr); + pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr); do { - next = stage2_pgd_addr_end(addr, end); - stage2_flush_puds(kvm, pgd, addr, next); + next = stage2_pgd_addr_end(kvm, addr, end); + if (!stage2_pgd_none(kvm, *pgd)) + stage2_flush_puds(kvm, pgd, addr, next); } while (pgd++, addr = next, addr != end); } @@ -897,7 +897,7 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm) } /* Allocate the HW PGD, making sure that each page gets its own refcount */ - pgd = alloc_pages_exact(S2_PGD_SIZE, GFP_KERNEL | __GFP_ZERO); + pgd = alloc_pages_exact(stage2_pgd_size(kvm), GFP_KERNEL | __GFP_ZERO); if (!pgd) return -ENOMEM; @@ -986,7 +986,7 @@ void kvm_free_stage2_pgd(struct kvm *kvm) spin_lock(&kvm->mmu_lock); if (kvm->arch.pgd) { - unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE); + unmap_stage2_range(kvm, 0, kvm_phys_size(kvm)); pgd = READ_ONCE(kvm->arch.pgd); kvm->arch.pgd = NULL; } @@ -994,7 +994,7 @@ void kvm_free_stage2_pgd(struct kvm *kvm) /* Free the HW pgd, one page at a time */ if (pgd) - free_pages_exact(pgd, S2_PGD_SIZE); + free_pages_exact(pgd, stage2_pgd_size(kvm)); } static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, @@ -1003,16 +1003,16 @@ static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache pgd_t *pgd; pud_t *pud; - pgd = kvm->arch.pgd + stage2_pgd_index(addr); - if (WARN_ON(stage2_pgd_none(*pgd))) { + pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr); + if (stage2_pgd_none(kvm, *pgd)) { if (!cache) return NULL; pud = mmu_memory_cache_alloc(cache); - stage2_pgd_populate(pgd, pud); + stage2_pgd_populate(kvm, pgd, pud); get_page(virt_to_page(pgd)); } - return stage2_pud_offset(pgd, addr); + return stage2_pud_offset(kvm, pgd, addr); } static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, @@ -1025,15 +1025,15 @@ static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache if (!pud) return NULL; - if (stage2_pud_none(*pud)) { + if (stage2_pud_none(kvm, *pud)) { if (!cache) return NULL; pmd = mmu_memory_cache_alloc(cache); - stage2_pud_populate(pud, pmd); + stage2_pud_populate(kvm, pud, pmd); get_page(virt_to_page(pud)); } - return stage2_pmd_offset(pud, addr); + return stage2_pmd_offset(kvm, pud, addr); } static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache @@ -1207,8 +1207,9 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, if (writable) pte = kvm_s2pte_mkwrite(pte); - ret = mmu_topup_memory_cache(&cache, KVM_MMU_CACHE_MIN_PAGES, - KVM_NR_MEM_OBJS); + ret = mmu_topup_memory_cache(&cache, + kvm_mmu_cache_min_pages(kvm), + KVM_NR_MEM_OBJS); if (ret) goto out; spin_lock(&kvm->mmu_lock); @@ -1230,8 +1231,14 @@ static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap) { kvm_pfn_t pfn = *pfnp; gfn_t gfn = *ipap >> PAGE_SHIFT; + struct page *page = pfn_to_page(pfn); - if (PageTransCompoundMap(pfn_to_page(pfn))) { + /* + * PageTransCompoungMap() returns true for THP and + * hugetlbfs. Make sure the adjustment is done only for THP + * pages. + */ + if (!PageHuge(page) && PageTransCompoundMap(page)) { unsigned long mask; /* * The address we faulted on is backed by a transparent huge @@ -1296,19 +1303,21 @@ static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end) /** * stage2_wp_pmds - write protect PUD range + * kvm: kvm instance for the VM * @pud: pointer to pud entry * @addr: range start address * @end: range end address */ -static void stage2_wp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end) +static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud, + phys_addr_t addr, phys_addr_t end) { pmd_t *pmd; phys_addr_t next; - pmd = stage2_pmd_offset(pud, addr); + pmd = stage2_pmd_offset(kvm, pud, addr); do { - next = stage2_pmd_addr_end(addr, end); + next = stage2_pmd_addr_end(kvm, addr, end); if (!pmd_none(*pmd)) { if (pmd_thp_or_huge(*pmd)) { if (!kvm_s2pmd_readonly(pmd)) @@ -1328,18 +1337,19 @@ static void stage2_wp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end) * * Process PUD entries, for a huge PUD we cause a panic. */ -static void stage2_wp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end) +static void stage2_wp_puds(struct kvm *kvm, pgd_t *pgd, + phys_addr_t addr, phys_addr_t end) { pud_t *pud; phys_addr_t next; - pud = stage2_pud_offset(pgd, addr); + pud = stage2_pud_offset(kvm, pgd, addr); do { - next = stage2_pud_addr_end(addr, end); - if (!stage2_pud_none(*pud)) { + next = stage2_pud_addr_end(kvm, addr, end); + if (!stage2_pud_none(kvm, *pud)) { /* TODO:PUD not supported, revisit later if supported */ - BUG_ON(stage2_pud_huge(*pud)); - stage2_wp_pmds(pud, addr, next); + BUG_ON(stage2_pud_huge(kvm, *pud)); + stage2_wp_pmds(kvm, pud, addr, next); } } while (pud++, addr = next, addr != end); } @@ -1355,7 +1365,7 @@ static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) pgd_t *pgd; phys_addr_t next; - pgd = kvm->arch.pgd + stage2_pgd_index(addr); + pgd = kvm->arch.pgd + stage2_pgd_index(kvm, addr); do { /* * Release kvm_mmu_lock periodically if the memory region is @@ -1369,9 +1379,9 @@ static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) cond_resched_lock(&kvm->mmu_lock); if (!READ_ONCE(kvm->arch.pgd)) break; - next = stage2_pgd_addr_end(addr, end); - if (stage2_pgd_present(*pgd)) - stage2_wp_puds(pgd, addr, next); + next = stage2_pgd_addr_end(kvm, addr, end); + if (stage2_pgd_present(kvm, *pgd)) + stage2_wp_puds(kvm, pgd, addr, next); } while (pgd++, addr = next, addr != end); } @@ -1520,7 +1530,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, up_read(¤t->mm->mmap_sem); /* We need minimum second+third level pages */ - ret = mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES, + ret = mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm), KVM_NR_MEM_OBJS); if (ret) return ret; @@ -1763,7 +1773,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) } /* Userspace should not be able to register out-of-bounds IPAs */ - VM_BUG_ON(fault_ipa >= KVM_PHYS_SIZE); + VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm)); if (fault_status == FSC_ACCESS) { handle_access_fault(vcpu, fault_ipa); @@ -2062,7 +2072,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, * space addressable by the KVM guest IPA space. */ if (memslot->base_gfn + memslot->npages >= - (KVM_PHYS_SIZE >> PAGE_SHIFT)) + (kvm_phys_size(kvm) >> PAGE_SHIFT)) return -EFAULT; down_read(¤t->mm->mmap_sem); diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c index 12502251727e..eb2a390a6c86 100644 --- a/virt/kvm/arm/vgic/vgic-its.c +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -241,13 +241,6 @@ static struct its_ite *find_ite(struct vgic_its *its, u32 device_id, list_for_each_entry(dev, &(its)->device_list, dev_list) \ list_for_each_entry(ite, &(dev)->itt_head, ite_list) -/* - * We only implement 48 bits of PA at the moment, although the ITS - * supports more. Let's be restrictive here. - */ -#define BASER_ADDRESS(x) ((x) & GENMASK_ULL(47, 16)) -#define CBASER_ADDRESS(x) ((x) & GENMASK_ULL(47, 12)) - #define GIC_LPI_OFFSET 8192 #define VITS_TYPER_IDBITS 16 @@ -759,6 +752,7 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id, { int l1_tbl_size = GITS_BASER_NR_PAGES(baser) * SZ_64K; u64 indirect_ptr, type = GITS_BASER_TYPE(baser); + phys_addr_t base = GITS_BASER_ADDR_48_to_52(baser); int esz = GITS_BASER_ENTRY_SIZE(baser); int index; gfn_t gfn; @@ -783,7 +777,7 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id, if (id >= (l1_tbl_size / esz)) return false; - addr = BASER_ADDRESS(baser) + id * esz; + addr = base + id * esz; gfn = addr >> PAGE_SHIFT; if (eaddr) @@ -798,7 +792,7 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id, /* Each 1st level entry is represented by a 64-bit value. */ if (kvm_read_guest_lock(its->dev->kvm, - BASER_ADDRESS(baser) + index * sizeof(indirect_ptr), + base + index * sizeof(indirect_ptr), &indirect_ptr, sizeof(indirect_ptr))) return false; @@ -808,11 +802,7 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id, if (!(indirect_ptr & BIT_ULL(63))) return false; - /* - * Mask the guest physical address and calculate the frame number. - * Any address beyond our supported 48 bits of PA will be caught - * by the actual check in the final step. - */ + /* Mask the guest physical address and calculate the frame number. */ indirect_ptr &= GENMASK_ULL(51, 16); /* Find the address of the actual entry */ @@ -1304,9 +1294,6 @@ static u64 vgic_sanitise_its_baser(u64 reg) GITS_BASER_OUTER_CACHEABILITY_SHIFT, vgic_sanitise_outer_cacheability); - /* Bits 15:12 contain bits 51:48 of the PA, which we don't support. */ - reg &= ~GENMASK_ULL(15, 12); - /* We support only one (ITS) page size: 64K */ reg = (reg & ~GITS_BASER_PAGE_SIZE_MASK) | GITS_BASER_PAGE_SIZE_64K; @@ -1325,11 +1312,8 @@ static u64 vgic_sanitise_its_cbaser(u64 reg) GITS_CBASER_OUTER_CACHEABILITY_SHIFT, vgic_sanitise_outer_cacheability); - /* - * Sanitise the physical address to be 64k aligned. - * Also limit the physical addresses to 48 bits. - */ - reg &= ~(GENMASK_ULL(51, 48) | GENMASK_ULL(15, 12)); + /* Sanitise the physical address to be 64k aligned. */ + reg &= ~GENMASK_ULL(15, 12); return reg; } @@ -1375,7 +1359,7 @@ static void vgic_its_process_commands(struct kvm *kvm, struct vgic_its *its) if (!its->enabled) return; - cbaser = CBASER_ADDRESS(its->cbaser); + cbaser = GITS_CBASER_ADDRESS(its->cbaser); while (its->cwriter != its->creadr) { int ret = kvm_read_guest_lock(kvm, cbaser + its->creadr, @@ -2233,7 +2217,7 @@ static int vgic_its_restore_device_tables(struct vgic_its *its) if (!(baser & GITS_BASER_VALID)) return 0; - l1_gpa = BASER_ADDRESS(baser); + l1_gpa = GITS_BASER_ADDR_48_to_52(baser); if (baser & GITS_BASER_INDIRECT) { l1_esz = GITS_LVL1_ENTRY_SIZE; @@ -2305,7 +2289,7 @@ static int vgic_its_save_collection_table(struct vgic_its *its) { const struct vgic_its_abi *abi = vgic_its_get_abi(its); u64 baser = its->baser_coll_table; - gpa_t gpa = BASER_ADDRESS(baser); + gpa_t gpa = GITS_BASER_ADDR_48_to_52(baser); struct its_collection *collection; u64 val; size_t max_size, filled = 0; @@ -2354,7 +2338,7 @@ static int vgic_its_restore_collection_table(struct vgic_its *its) if (!(baser & GITS_BASER_VALID)) return 0; - gpa = BASER_ADDRESS(baser); + gpa = GITS_BASER_ADDR_48_to_52(baser); max_size = GITS_BASER_NR_PAGES(baser) * SZ_64K; diff --git a/virt/kvm/arm/vgic/vgic-kvm-device.c b/virt/kvm/arm/vgic/vgic-kvm-device.c index 6ada2432e37c..114dce9f4bf5 100644 --- a/virt/kvm/arm/vgic/vgic-kvm-device.c +++ b/virt/kvm/arm/vgic/vgic-kvm-device.c @@ -25,7 +25,7 @@ int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr, phys_addr_t addr, phys_addr_t alignment) { - if (addr & ~KVM_PHYS_MASK) + if (addr & ~kvm_phys_mask(kvm)) return -E2BIG; if (!IS_ALIGNED(addr, alignment)) diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c index a2a175b08b17..b3d1f0985117 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v3.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c @@ -364,7 +364,6 @@ static u64 vgic_sanitise_pendbaser(u64 reg) vgic_sanitise_outer_cacheability); reg &= ~PENDBASER_RES0_MASK; - reg &= ~GENMASK_ULL(51, 48); return reg; } @@ -382,7 +381,6 @@ static u64 vgic_sanitise_propbaser(u64 reg) vgic_sanitise_outer_cacheability); reg &= ~PROPBASER_RES0_MASK; - reg &= ~GENMASK_ULL(51, 48); return reg; } |