diff options
Diffstat (limited to 'arch')
267 files changed, 4388 insertions, 3554 deletions
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 7deb3ea2dd3f..b5dce13a6132 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -119,9 +119,6 @@ config GENERIC_HWEIGHT bool default y -config ARCH_HAS_DMA_SET_COHERENT_MASK - bool - config PPC bool default y @@ -131,10 +128,10 @@ config PPC select ARCH_32BIT_OFF_T if PPC32 select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEVMEM_IS_ALLOWED - select ARCH_HAS_DMA_SET_COHERENT_MASK select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL + select ARCH_HAS_KCOV select ARCH_HAS_PHYS_TO_DMA select ARCH_HAS_PMEM_API if PPC64 select ARCH_HAS_PTE_SPECIAL @@ -203,7 +200,7 @@ config PPC select HAVE_IOREMAP_PROT select HAVE_IRQ_EXIT_ON_IRQ_STACK select HAVE_KERNEL_GZIP - select HAVE_KERNEL_XZ if PPC_BOOK3S + select HAVE_KERNEL_XZ if PPC_BOOK3S || 44x select HAVE_KPROBES select HAVE_KPROBES_ON_FTRACE select HAVE_KRETPROBES @@ -222,7 +219,7 @@ config PPC select HAVE_PERF_USER_STACK_DUMP select HAVE_RCU_TABLE_FREE if SMP select HAVE_REGS_AND_STACK_ACCESS_API - select HAVE_RELIABLE_STACKTRACE if PPC64 && CPU_LITTLE_ENDIAN + select HAVE_RELIABLE_STACKTRACE if PPC_BOOK3S_64 && CPU_LITTLE_ENDIAN select HAVE_SYSCALL_TRACEPOINTS select HAVE_VIRT_CPU_ACCOUNTING select HAVE_IRQ_TIME_ACCOUNTING @@ -243,6 +240,7 @@ config PPC select RTC_LIB select SPARSE_IRQ select SYSCTL_EXCEPTION_TRACE + select THREAD_INFO_IN_TASK select VIRT_TO_BUS if !PPC64 # # Please keep this list sorted alphabetically. @@ -253,9 +251,6 @@ config PPC_BARRIER_NOSPEC default y depends on PPC_BOOK3S_64 || PPC_FSL_BOOK3E -config GENERIC_CSUM - def_bool n - config EARLY_PRINTK bool default y @@ -475,9 +470,6 @@ config ARCH_CPU_PROBE_RELEASE config ARCH_ENABLE_MEMORY_HOTPLUG def_bool y -config ARCH_HAS_WALK_MEMORY - def_bool y - config ARCH_ENABLE_MEMORY_HOTREMOVE def_bool y @@ -693,7 +685,7 @@ config PPC_16K_PAGES config PPC_64K_PAGES bool "64k page size" - depends on !PPC_FSL_BOOK3E && (44x || PPC_BOOK3S_64 || PPC_BOOK3E_64) + depends on 44x || PPC_BOOK3S_64 select HAVE_ARCH_SOFT_DIRTY if PPC_BOOK3S_64 config PPC_256K_PAGES @@ -711,6 +703,13 @@ config PPC_256K_PAGES endchoice +config PPC_PAGE_SHIFT + int + default 18 if PPC_256K_PAGES + default 16 if PPC_64K_PAGES + default 14 if PPC_16K_PAGES + default 12 + config THREAD_SHIFT int "Thread shift" if EXPERT range 13 15 @@ -721,6 +720,59 @@ config THREAD_SHIFT Used to define the stack size. The default is almost always what you want. Only change this if you know what you are doing. +config ETEXT_SHIFT_BOOL + bool "Set custom etext alignment" if STRICT_KERNEL_RWX && \ + (PPC_BOOK3S_32 || PPC_8xx) + depends on ADVANCED_OPTIONS + help + This option allows you to set the kernel end of text alignment. When + RAM is mapped by blocks, the alignment needs to fit the size and + number of possible blocks. The default should be OK for most configs. + + Say N here unless you know what you are doing. + +config ETEXT_SHIFT + int "_etext shift" if ETEXT_SHIFT_BOOL + range 17 28 if STRICT_KERNEL_RWX && PPC_BOOK3S_32 + range 19 23 if STRICT_KERNEL_RWX && PPC_8xx + default 17 if STRICT_KERNEL_RWX && PPC_BOOK3S_32 + default 19 if STRICT_KERNEL_RWX && PPC_8xx + default PPC_PAGE_SHIFT + help + On Book3S 32 (603+), IBATs are used to map kernel text. + Smaller is the alignment, greater is the number of necessary IBATs. + + On 8xx, large pages (512kb or 8M) are used to map kernel linear + memory. Aligning to 8M reduces TLB misses as only 8M pages are used + in that case. + +config DATA_SHIFT_BOOL + bool "Set custom data alignment" if STRICT_KERNEL_RWX && \ + (PPC_BOOK3S_32 || PPC_8xx) + depends on ADVANCED_OPTIONS + help + This option allows you to set the kernel data alignment. When + RAM is mapped by blocks, the alignment needs to fit the size and + number of possible blocks. The default should be OK for most configs. + + Say N here unless you know what you are doing. + +config DATA_SHIFT + int "Data shift" if DATA_SHIFT_BOOL + default 24 if STRICT_KERNEL_RWX && PPC64 + range 17 28 if STRICT_KERNEL_RWX && PPC_BOOK3S_32 + range 19 23 if STRICT_KERNEL_RWX && PPC_8xx + default 22 if STRICT_KERNEL_RWX && PPC_BOOK3S_32 + default 23 if STRICT_KERNEL_RWX && PPC_8xx + default PPC_PAGE_SHIFT + help + On Book3S 32 (603+), DBATs are used to map kernel text and rodata RO. + Smaller is the alignment, greater is the number of necessary DBATs. + + On 8xx, large pages (512kb or 8M) are used to map kernel linear + memory. Aligning to 8M reduces TLB misses as only 8M pages are used + in that case. + config FORCE_MAX_ZONEORDER int "Maximum zone order" range 8 9 if PPC64 && PPC_64K_PAGES @@ -887,6 +939,7 @@ config FSL_SOC config FSL_PCI bool + select ARCH_HAS_DMA_SET_MASK select PPC_INDIRECT_PCI select PCI_QUIRKS diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index f4961fbcb48d..4e00cb0a5464 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -361,10 +361,6 @@ config PPC_PTDUMP If you are unsure, say N. -config PPC_HTDUMP - def_bool y - depends on PPC_PTDUMP && PPC_BOOK3S_64 - config PPC_FAST_ENDIAN_SWITCH bool "Deprecated fast endian-switch syscall" depends on DEBUG_KERNEL && PPC_BOOK3S_64 diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 488c9edffa58..7de49889bd5d 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -213,9 +213,9 @@ endif asinstr := $(call as-instr,lis 9$(comma)foo@high,-DHAVE_AS_ATHIGH=1) KBUILD_CPPFLAGS += -Iarch/$(ARCH) $(asinstr) -KBUILD_AFLAGS += -Iarch/$(ARCH) $(AFLAGS-y) +KBUILD_AFLAGS += $(AFLAGS-y) KBUILD_CFLAGS += $(call cc-option,-msoft-float) -KBUILD_CFLAGS += -pipe -Iarch/$(ARCH) $(CFLAGS-y) +KBUILD_CFLAGS += -pipe $(CFLAGS-y) CPP = $(CC) -E $(KBUILD_CFLAGS) CHECKFLAGS += -m$(BITS) -D__powerpc__ -D__powerpc$(BITS)__ @@ -427,6 +427,13 @@ else endif endif +ifdef CONFIG_SMP +prepare: task_cpu_prepare + +task_cpu_prepare: prepare0 + $(eval KBUILD_CFLAGS += -D_TASK_CPU=$(shell awk '{if ($$2 == "TASK_CPU") print $$3;}' include/generated/asm-offsets.h)) +endif + # Check toolchain versions: # - gcc-4.6 is the minimum kernel-wide version so nothing required. checkbin: diff --git a/arch/powerpc/boot/dts/Makefile b/arch/powerpc/boot/dts/Makefile index fb335d05aae8..1cbc0e4ce857 100644 --- a/arch/powerpc/boot/dts/Makefile +++ b/arch/powerpc/boot/dts/Makefile @@ -4,3 +4,4 @@ subdir-y += fsl dtstree := $(srctree)/$(src) dtb-$(CONFIG_OF_ALL_DTBS) := $(patsubst $(dtstree)/%.dts,%.dtb, $(wildcard $(dtstree)/*.dts)) +dtb-$(CONFIG_XILINX_VIRTEX440_GENERIC_BOARD) += virtex440-ml507.dtb virtex440-ml510.dtb diff --git a/arch/powerpc/boot/dts/akebono.dts b/arch/powerpc/boot/dts/akebono.dts index 8a7a10139bc9..cd9d66041a3f 100644 --- a/arch/powerpc/boot/dts/akebono.dts +++ b/arch/powerpc/boot/dts/akebono.dts @@ -40,7 +40,7 @@ d-cache-size = <32768>; dcr-controller; dcr-access-method = "native"; - status = "ok"; + status = "okay"; }; cpu@1 { device_type = "cpu"; diff --git a/arch/powerpc/boot/dts/bluestone.dts b/arch/powerpc/boot/dts/bluestone.dts index b0b26d8d68a2..64eaf7e09d22 100644 --- a/arch/powerpc/boot/dts/bluestone.dts +++ b/arch/powerpc/boot/dts/bluestone.dts @@ -109,7 +109,7 @@ OCM: ocm@400040000 { compatible = "ibm,ocm"; - status = "ok"; + status = "okay"; cell-index = <1>; /* configured in U-Boot */ reg = <4 0x00040000 0x8000>; /* 32K */ diff --git a/arch/powerpc/boot/dts/currituck.dts b/arch/powerpc/boot/dts/currituck.dts index a04a4fcfde63..b6d87b9c2cef 100644 --- a/arch/powerpc/boot/dts/currituck.dts +++ b/arch/powerpc/boot/dts/currituck.dts @@ -39,7 +39,7 @@ d-cache-size = <32768>; dcr-controller; dcr-access-method = "native"; - status = "ok"; + status = "okay"; }; cpu@1 { device_type = "cpu"; diff --git a/arch/powerpc/boot/dts/iss4xx-mpic.dts b/arch/powerpc/boot/dts/iss4xx-mpic.dts index f7063198b2dc..c9f90f1a9c8e 100644 --- a/arch/powerpc/boot/dts/iss4xx-mpic.dts +++ b/arch/powerpc/boot/dts/iss4xx-mpic.dts @@ -43,7 +43,7 @@ d-cache-size = <32768>; dcr-controller; dcr-access-method = "native"; - status = "ok"; + status = "okay"; }; cpu@1 { device_type = "cpu"; diff --git a/arch/powerpc/boot/dts/wii.dts b/arch/powerpc/boot/dts/wii.dts index 104b1d6d5695..c406bdb4f36f 100644 --- a/arch/powerpc/boot/dts/wii.dts +++ b/arch/powerpc/boot/dts/wii.dts @@ -14,6 +14,7 @@ /dts-v1/; #include <dt-bindings/gpio/gpio.h> +#include <dt-bindings/input/input.h> /* * This is commented-out for now. @@ -187,6 +188,11 @@ "DEBUG0", "DEBUG1", "DEBUG2", "DEBUG3", "DEBUG4", "DEBUG5", "DEBUG6", "DEBUG7"; + interrupt-controller; + #interrupt-cells = <2>; + interrupts = <10>; + interrupt-parent = <&PIC1>; + /* * This is commented out while a standard binding * for i2c over gpio is defined. @@ -235,5 +241,21 @@ panic-indicator; }; }; + + gpio-keys { + compatible = "gpio-keys"; + + power { + label = "Power Button"; + gpios = <&GPIO 0 GPIO_ACTIVE_HIGH>; + linux,code = <KEY_POWER>; + }; + + eject { + label = "Eject Button"; + gpios = <&GPIO 6 GPIO_ACTIVE_HIGH>; + linux,code = <KEY_EJECTCD>; + }; + }; }; diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h index 1d911f68a23b..296584e6dd55 100644 --- a/arch/powerpc/include/asm/asm-prototypes.h +++ b/arch/powerpc/include/asm/asm-prototypes.h @@ -23,8 +23,8 @@ #include <uapi/asm/ucontext.h> /* SMP */ -extern struct thread_info *current_set[NR_CPUS]; -extern struct thread_info *secondary_ti; +extern struct task_struct *current_set[NR_CPUS]; +extern struct task_struct *secondary_current; void start_secondary(void *unused); /* kexec */ @@ -37,13 +37,11 @@ void kexec_copy_flush(struct kimage *image); extern struct static_key hcall_tracepoint_key; void __trace_hcall_entry(unsigned long opcode, unsigned long *args); void __trace_hcall_exit(long opcode, long retval, unsigned long *retbuf); -/* OPAL tracing */ -#ifdef CONFIG_JUMP_LABEL -extern struct static_key opal_tracepoint_key; -#endif -void __trace_opal_entry(unsigned long opcode, unsigned long *args); -void __trace_opal_exit(long opcode, unsigned long retval); +/* OPAL */ +int64_t __opal_call(int64_t a0, int64_t a1, int64_t a2, int64_t a3, + int64_t a4, int64_t a5, int64_t a6, int64_t a7, + int64_t opcode, uint64_t msr); /* VMX copying */ int enter_vmx_usercopy(void); diff --git a/arch/powerpc/include/asm/book3s/32/mmu-hash.h b/arch/powerpc/include/asm/book3s/32/mmu-hash.h index 0c261ba2c826..5cb588395fdc 100644 --- a/arch/powerpc/include/asm/book3s/32/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/32/mmu-hash.h @@ -92,6 +92,8 @@ typedef struct { unsigned long vdso_base; } mm_context_t; +void update_bats(void); + /* patch sites */ extern s32 patch__hash_page_A0, patch__hash_page_A1, patch__hash_page_A2; extern s32 patch__hash_page_B, patch__hash_page_C; diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index 49d76adb9bc5..aa8406b8f7ba 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -174,7 +174,18 @@ static inline bool pte_user(pte_t pte) * of RAM. -- Cort */ #define VMALLOC_OFFSET (0x1000000) /* 16M */ + +/* + * With CONFIG_STRICT_KERNEL_RWX, kernel segments are set NX. But when modules + * are used, NX cannot be set on VMALLOC space. So vmalloc VM space and linear + * memory shall not share segments. + */ +#if defined(CONFIG_STRICT_KERNEL_RWX) && defined(CONFIG_MODULES) +#define VMALLOC_START ((_ALIGN((long)high_memory, 256L << 20) + VMALLOC_OFFSET) & \ + ~(VMALLOC_OFFSET - 1)) +#else #define VMALLOC_START ((((long)high_memory + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1))) +#endif #define VMALLOC_END ioremap_bot #ifndef __ASSEMBLY__ diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index 247aff9cc6ba..54b7af6cd27f 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -40,22 +40,36 @@ #else #define H_PUD_CACHE_INDEX (H_PUD_INDEX_SIZE) #endif + /* - * Define the address range of the kernel non-linear virtual area + * Define the address range of the kernel non-linear virtual area. In contrast + * to the linear mapping, this is managed using the kernel page tables and then + * inserted into the hash page table to actually take effect, similarly to user + * mappings. */ #define H_KERN_VIRT_START ASM_CONST(0xD000000000000000) -#define H_KERN_VIRT_SIZE ASM_CONST(0x0000400000000000) /* 64T */ /* - * The vmalloc space starts at the beginning of that region, and - * occupies half of it on hash CPUs and a quarter of it on Book3E - * (we keep a quarter for the virtual memmap) + * Allow virtual mapping of one context size. + * 512TB for 64K page size + * 64TB for 4K page size + */ +#define H_KERN_VIRT_SIZE (1UL << MAX_EA_BITS_PER_CONTEXT) + +/* + * 8TB IO mapping size + */ +#define H_KERN_IO_SIZE ASM_CONST(0x80000000000) /* 8T */ + +/* + * The vmalloc space starts at the beginning of the kernel non-linear virtual + * region, and occupies 504T (64K) or 56T (4K) */ -#define H_VMALLOC_START H_KERN_VIRT_START -#define H_VMALLOC_SIZE ASM_CONST(0x380000000000) /* 56T */ -#define H_VMALLOC_END (H_VMALLOC_START + H_VMALLOC_SIZE) +#define H_VMALLOC_START H_KERN_VIRT_START +#define H_VMALLOC_SIZE (H_KERN_VIRT_SIZE - H_KERN_IO_SIZE) +#define H_VMALLOC_END (H_VMALLOC_START + H_VMALLOC_SIZE) -#define H_KERN_IO_START H_VMALLOC_END +#define H_KERN_IO_START H_VMALLOC_END /* * Region IDs diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index 12e522807f9f..a28a28079edb 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -23,7 +23,7 @@ */ #include <asm/book3s/64/pgtable.h> #include <asm/bug.h> -#include <asm/processor.h> +#include <asm/task_size_64.h> #include <asm/cpu_has_feature.h> /* diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h index 9c1173283b96..138bc2ecc0c4 100644 --- a/arch/powerpc/include/asm/book3s/64/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h @@ -111,7 +111,7 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) { - pgd_set(pgd, __pgtable_ptr_val(pud) | PGD_VAL_BITS); + *pgd = __pgd(__pgtable_ptr_val(pud) | PGD_VAL_BITS); } static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) @@ -138,7 +138,7 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud) static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) { - pud_set(pud, __pgtable_ptr_val(pmd) | PUD_VAL_BITS); + *pud = __pud(__pgtable_ptr_val(pmd) | PUD_VAL_BITS); } static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, @@ -176,13 +176,13 @@ static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) { - pmd_set(pmd, __pgtable_ptr_val(pte) | PMD_VAL_BITS); + *pmd = __pmd(__pgtable_ptr_val(pte) | PMD_VAL_BITS); } static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, pgtable_t pte_page) { - pmd_set(pmd, __pgtable_ptr_val(pte_page) | PMD_VAL_BITS); + *pmd = __pmd(__pgtable_ptr_val(pte_page) | PMD_VAL_BITS); } static inline pgtable_t pmd_pgtable(pmd_t pmd) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 868fcaf56f6b..581f91be9dd4 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -811,7 +811,7 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, return hash__set_pte_at(mm, addr, ptep, pte, percpu); } -#define _PAGE_CACHE_CTL (_PAGE_NON_IDEMPOTENT | _PAGE_TOLERANT) +#define _PAGE_CACHE_CTL (_PAGE_SAO | _PAGE_NON_IDEMPOTENT | _PAGE_TOLERANT) #define pgprot_noncached pgprot_noncached static inline pgprot_t pgprot_noncached(pgprot_t prot) @@ -851,11 +851,6 @@ static inline bool pte_ci(pte_t pte) return false; } -static inline void pmd_set(pmd_t *pmdp, unsigned long val) -{ - *pmdp = __pmd(val); -} - static inline void pmd_clear(pmd_t *pmdp) { *pmdp = __pmd(0); @@ -887,11 +882,6 @@ static inline int pmd_bad(pmd_t pmd) return hash__pmd_bad(pmd); } -static inline void pud_set(pud_t *pudp, unsigned long val) -{ - *pudp = __pud(val); -} - static inline void pud_clear(pud_t *pudp) { *pudp = __pud(0); @@ -934,10 +924,6 @@ static inline bool pud_access_permitted(pud_t pud, bool write) } #define pgd_write(pgd) pte_write(pgd_pte(pgd)) -static inline void pgd_set(pgd_t *pgdp, unsigned long val) -{ - *pgdp = __pgd(val); -} static inline void pgd_clear(pgd_t *pgdp) { diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h index 671316f9e95d..05147cecb8df 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h @@ -13,8 +13,32 @@ static inline int mmu_get_ap(int psize) #ifdef CONFIG_PPC_RADIX_MMU extern void radix__tlbiel_all(unsigned int action); +extern void radix__flush_tlb_lpid_page(unsigned int lpid, + unsigned long addr, + unsigned long page_size); +extern void radix__flush_pwc_lpid(unsigned int lpid); +extern void radix__flush_tlb_lpid(unsigned int lpid); +extern void radix__local_flush_tlb_lpid_guest(unsigned int lpid); #else static inline void radix__tlbiel_all(unsigned int action) { WARN_ON(1); }; +static inline void radix__flush_tlb_lpid_page(unsigned int lpid, + unsigned long addr, + unsigned long page_size) +{ + WARN_ON(1); +} +static inline void radix__flush_pwc_lpid(unsigned int lpid) +{ + WARN_ON(1); +} +static inline void radix__flush_tlb_lpid(unsigned int lpid) +{ + WARN_ON(1); +} +static inline void radix__local_flush_tlb_lpid_guest(unsigned int lpid) +{ + WARN_ON(1); +} #endif extern void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma, @@ -49,12 +73,6 @@ extern void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr); extern void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr); extern void radix__flush_tlb_all(void); -extern void radix__flush_tlb_lpid_page(unsigned int lpid, - unsigned long addr, - unsigned long page_size); -extern void radix__flush_pwc_lpid(unsigned int lpid); -extern void radix__flush_tlb_lpid(unsigned int lpid); extern void radix__local_flush_tlb_lpid(unsigned int lpid); -extern void radix__local_flush_tlb_lpid_guest(unsigned int lpid); #endif diff --git a/arch/powerpc/include/asm/checksum.h b/arch/powerpc/include/asm/checksum.h index a78a57e5058d..72a65d744a28 100644 --- a/arch/powerpc/include/asm/checksum.h +++ b/arch/powerpc/include/asm/checksum.h @@ -9,9 +9,6 @@ * 2 of the License, or (at your option) any later version. */ -#ifdef CONFIG_GENERIC_CSUM -#include <asm-generic/checksum.h> -#else #include <linux/bitops.h> #include <linux/in6.h> /* @@ -217,6 +214,5 @@ __sum16 csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr, __u32 len, __u8 proto, __wsum sum); -#endif #endif /* __KERNEL__ */ #endif diff --git a/arch/powerpc/include/asm/device.h b/arch/powerpc/include/asm/device.h index 0245bfcaac32..a130be13ee83 100644 --- a/arch/powerpc/include/asm/device.h +++ b/arch/powerpc/include/asm/device.h @@ -20,6 +20,11 @@ struct iommu_table; */ struct dev_archdata { /* + * Set to %true if the dma_iommu_ops are requested to use a direct + * window instead of dynamically mapping memory. + */ + bool iommu_bypass : 1; + /* * These two used to be a union. However, with the hybrid ops we need * both so here we store both a DMA offset for direct mappings and * an iommu_table for remapped DMA. @@ -33,9 +38,6 @@ struct dev_archdata { #ifdef CONFIG_IOMMU_API void *iommu_domain; #endif -#ifdef CONFIG_SWIOTLB - dma_addr_t max_direct_dma_addr; -#endif #ifdef CONFIG_PPC64 struct pci_dn *pci_data; #endif @@ -54,6 +56,4 @@ struct pdev_archdata { u64 dma_mask; }; -#define ARCH_HAS_DMA_GET_REQUIRED_MASK - #endif /* _ASM_POWERPC_DEVICE_H */ diff --git a/arch/powerpc/include/asm/dma-direct.h b/arch/powerpc/include/asm/dma-direct.h index 7702875aabb7..a2912b47102c 100644 --- a/arch/powerpc/include/asm/dma-direct.h +++ b/arch/powerpc/include/asm/dma-direct.h @@ -4,26 +4,24 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) { -#ifdef CONFIG_SWIOTLB - struct dev_archdata *sd = &dev->archdata; - - if (sd->max_direct_dma_addr && addr + size > sd->max_direct_dma_addr) - return false; -#endif - if (!dev->dma_mask) return false; - return addr + size - 1 <= *dev->dma_mask; + return addr + size - 1 <= + min_not_zero(*dev->dma_mask, dev->bus_dma_mask); } static inline dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr) { - return paddr + get_dma_offset(dev); + if (!dev) + return paddr + PCI_DRAM_OFFSET; + return paddr + dev->archdata.dma_offset; } static inline phys_addr_t __dma_to_phys(struct device *dev, dma_addr_t daddr) { - return daddr - get_dma_offset(dev); + if (!dev) + return daddr - PCI_DRAM_OFFSET; + return daddr - dev->archdata.dma_offset; } #endif /* ASM_POWERPC_DMA_DIRECT_H */ diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h index ebf66809f2d3..565d6f74b189 100644 --- a/arch/powerpc/include/asm/dma-mapping.h +++ b/arch/powerpc/include/asm/dma-mapping.h @@ -1,74 +1,9 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2004 IBM - * - * Implements the generic device dma API for powerpc. - * the pci and vio busses */ #ifndef _ASM_DMA_MAPPING_H #define _ASM_DMA_MAPPING_H -#ifdef __KERNEL__ - -#include <linux/types.h> -#include <linux/cache.h> -/* need struct page definitions */ -#include <linux/mm.h> -#include <linux/scatterlist.h> -#include <linux/dma-debug.h> -#include <asm/io.h> -#include <asm/swiotlb.h> - -/* Some dma direct funcs must be visible for use in other dma_ops */ -extern void *__dma_nommu_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag, - unsigned long attrs); -extern void __dma_nommu_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle, - unsigned long attrs); -extern int dma_nommu_mmap_coherent(struct device *dev, - struct vm_area_struct *vma, - void *cpu_addr, dma_addr_t handle, - size_t size, unsigned long attrs); - -#ifdef CONFIG_NOT_COHERENT_CACHE -/* - * DMA-consistent mapping functions for PowerPCs that don't support - * cache snooping. These allocate/free a region of uncached mapped - * memory space for use with DMA devices. Alternatively, you could - * allocate the space "normally" and use the cache management functions - * to ensure it is consistent. - */ -struct device; -extern void __dma_sync(void *vaddr, size_t size, int direction); -extern void __dma_sync_page(struct page *page, unsigned long offset, - size_t size, int direction); -extern unsigned long __dma_get_coherent_pfn(unsigned long cpu_addr); - -#else /* ! CONFIG_NOT_COHERENT_CACHE */ -/* - * Cache coherent cores. - */ - -#define __dma_sync(addr, size, rw) ((void)0) -#define __dma_sync_page(pg, off, sz, rw) ((void)0) - -#endif /* ! CONFIG_NOT_COHERENT_CACHE */ - -static inline unsigned long device_to_mask(struct device *dev) -{ - if (dev->dma_mask && *dev->dma_mask) - return *dev->dma_mask; - /* Assume devices without mask can take 32 bit addresses */ - return 0xfffffffful; -} - -/* - * Available generic sets of operations - */ -#ifdef CONFIG_PPC64 -extern struct dma_map_ops dma_iommu_ops; -#endif -extern const struct dma_map_ops dma_nommu_ops; static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) { @@ -80,31 +15,4 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) return NULL; } -/* - * get_dma_offset() - * - * Get the dma offset on configurations where the dma address can be determined - * from the physical address by looking at a simple offset. Direct dma and - * swiotlb use this function, but it is typically not used by implementations - * with an iommu. - */ -static inline dma_addr_t get_dma_offset(struct device *dev) -{ - if (dev) - return dev->archdata.dma_offset; - - return PCI_DRAM_OFFSET; -} - -static inline void set_dma_offset(struct device *dev, dma_addr_t off) -{ - if (dev) - dev->archdata.dma_offset = off; -} - -#define HAVE_ARCH_DMA_SET_MASK 1 - -extern u64 __dma_get_required_mask(struct device *dev); - -#endif /* __KERNEL__ */ #endif /* _ASM_DMA_MAPPING_H */ diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 8b596d096ebe..94cfcf33030a 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -219,7 +219,8 @@ struct eeh_ops { }; extern int eeh_subsystem_flags; -extern int eeh_max_freezes; +extern u32 eeh_max_freezes; +extern bool eeh_debugfs_no_recover; extern struct eeh_ops *eeh_ops; extern raw_spinlock_t confirm_error_lock; @@ -293,14 +294,14 @@ void eeh_add_device_late(struct pci_dev *); void eeh_add_device_tree_late(struct pci_bus *); void eeh_add_sysfs_files(struct pci_bus *); void eeh_remove_device(struct pci_dev *); -int eeh_unfreeze_pe(struct eeh_pe *pe, bool sw_state); +int eeh_unfreeze_pe(struct eeh_pe *pe); int eeh_pe_reset_and_recover(struct eeh_pe *pe); int eeh_dev_open(struct pci_dev *pdev); void eeh_dev_release(struct pci_dev *pdev); struct eeh_pe *eeh_iommu_group_to_pe(struct iommu_group *group); int eeh_pe_set_option(struct eeh_pe *pe, int option); int eeh_pe_get_state(struct eeh_pe *pe); -int eeh_pe_reset(struct eeh_pe *pe, int option); +int eeh_pe_reset(struct eeh_pe *pe, int option, bool include_passed); int eeh_pe_configure(struct eeh_pe *pe); int eeh_pe_inject_err(struct eeh_pe *pe, int type, int func, unsigned long addr, unsigned long mask); @@ -460,6 +461,9 @@ static inline void eeh_readsl(const volatile void __iomem *addr, void * buf, eeh_check_failure(addr); } + +void eeh_cache_debugfs_init(void); + #endif /* CONFIG_PPC64 */ #endif /* __KERNEL__ */ #endif /* _POWERPC_EEH_H */ diff --git a/arch/powerpc/include/asm/eeh_event.h b/arch/powerpc/include/asm/eeh_event.h index 9884e872686f..6d0412b846ac 100644 --- a/arch/powerpc/include/asm/eeh_event.h +++ b/arch/powerpc/include/asm/eeh_event.h @@ -33,6 +33,7 @@ struct eeh_event { int eeh_event_init(void); int eeh_send_failure_event(struct eeh_pe *pe); +int __eeh_send_failure_event(struct eeh_pe *pe); void eeh_remove_event(struct eeh_pe *pe, bool force); void eeh_handle_normal_event(struct eeh_pe *pe); void eeh_handle_special_event(void); diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 3b4767ed3ec5..937bb630093f 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -671,7 +671,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) #define RUNLATCH_ON \ BEGIN_FTR_SECTION \ - CURRENT_THREAD_INFO(r3, r1); \ + ld r3, PACA_THREAD_INFO(r13); \ ld r4,TI_LOCAL_FLAGS(r3); \ andi. r0,r4,_TLF_RUNLATCH; \ beql ppc64_runlatch_on_trampoline; \ @@ -721,7 +721,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CTRL) #ifdef CONFIG_PPC_970_NAP #define FINISH_NAP \ BEGIN_FTR_SECTION \ - CURRENT_THREAD_INFO(r11, r1); \ + ld r11, PACA_THREAD_INFO(r13); \ ld r9,TI_LOCAL_FLAGS(r11); \ andi. r10,r9,_TLF_NAPPING; \ bnel power4_fixup_nap; \ diff --git a/arch/powerpc/include/asm/hvsi.h b/arch/powerpc/include/asm/hvsi.h index 3fdc54df63c9..464a7519ed64 100644 --- a/arch/powerpc/include/asm/hvsi.h +++ b/arch/powerpc/include/asm/hvsi.h @@ -64,7 +64,7 @@ struct hvsi_priv { unsigned int inbuf_len; /* data in input buffer */ unsigned char inbuf[HVSI_INBUF_SIZE]; unsigned int inbuf_cur; /* Cursor in input buffer */ - unsigned int inbuf_pktlen; /* packet lenght from cursor */ + unsigned int inbuf_pktlen; /* packet length from cursor */ atomic_t seqno; /* packet sequence number */ unsigned int opened:1; /* driver opened */ unsigned int established:1; /* protocol established */ diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 17524d222a7b..0ac52392ed99 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -237,6 +237,7 @@ static inline void iommu_del_device(struct device *dev) } #endif /* !CONFIG_IOMMU_API */ +u64 dma_iommu_get_required_mask(struct device *dev); #else static inline void *get_iommu_table_base(struct device *dev) @@ -318,5 +319,21 @@ extern void iommu_release_ownership(struct iommu_table *tbl); extern enum dma_data_direction iommu_tce_direction(unsigned long tce); extern unsigned long iommu_direction_to_tce_perm(enum dma_data_direction dir); +#ifdef CONFIG_PPC_CELL_NATIVE +extern bool iommu_fixed_is_weak; +#else +#define iommu_fixed_is_weak false +#endif + +extern const struct dma_map_ops dma_iommu_ops; + +static inline unsigned long device_to_mask(struct device *dev) +{ + if (dev->dma_mask && *dev->dma_mask) + return *dev->dma_mask; + /* Assume devices without mask can take 32 bit addresses */ + return 0xfffffffful; +} + #endif /* __KERNEL__ */ #endif /* _ASM_IOMMU_H */ diff --git a/arch/powerpc/include/asm/ipic.h b/arch/powerpc/include/asm/ipic.h index 3dbd47f2bffe..abad50a745db 100644 --- a/arch/powerpc/include/asm/ipic.h +++ b/arch/powerpc/include/asm/ipic.h @@ -69,10 +69,7 @@ enum ipic_mcp_irq { IPIC_MCP_MU = 7, }; -extern void ipic_set_highest_priority(unsigned int irq); extern void ipic_set_default_priority(void); -extern void ipic_enable_mcp(enum ipic_mcp_irq mcp_irq); -extern void ipic_disable_mcp(enum ipic_mcp_irq mcp_irq); extern u32 ipic_get_mcp_status(void); extern void ipic_clear_mcp_status(u32 mask); diff --git a/arch/powerpc/include/asm/irq.h b/arch/powerpc/include/asm/irq.h index ee39ce56b2a2..c91a60cda4fa 100644 --- a/arch/powerpc/include/asm/irq.h +++ b/arch/powerpc/include/asm/irq.h @@ -48,23 +48,19 @@ struct pt_regs; * Per-cpu stacks for handling critical, debug and machine check * level interrupts. */ -extern struct thread_info *critirq_ctx[NR_CPUS]; -extern struct thread_info *dbgirq_ctx[NR_CPUS]; -extern struct thread_info *mcheckirq_ctx[NR_CPUS]; -extern void exc_lvl_ctx_init(void); -#else -#define exc_lvl_ctx_init() +extern void *critirq_ctx[NR_CPUS]; +extern void *dbgirq_ctx[NR_CPUS]; +extern void *mcheckirq_ctx[NR_CPUS]; #endif /* * Per-cpu stacks for handling hard and soft interrupts. */ -extern struct thread_info *hardirq_ctx[NR_CPUS]; -extern struct thread_info *softirq_ctx[NR_CPUS]; +extern void *hardirq_ctx[NR_CPUS]; +extern void *softirq_ctx[NR_CPUS]; -extern void irq_ctx_init(void); -extern void call_do_softirq(struct thread_info *tp); -extern void call_do_irq(struct pt_regs *regs, struct thread_info *tp); +void call_do_softirq(void *sp); +void call_do_irq(struct pt_regs *regs, void *sp); extern void do_IRQ(struct pt_regs *regs); extern void __init init_IRQ(void); extern void __do_irq(struct pt_regs *regs); diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index eb0d79f0ca45..a6c8548ed9fa 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -141,6 +141,7 @@ extern void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu); extern int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu); extern int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu); +extern void kvmppc_core_queue_machine_check(struct kvm_vcpu *vcpu, ulong flags); extern void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags); extern void kvmppc_core_queue_fpunavail(struct kvm_vcpu *vcpu); extern void kvmppc_core_queue_vec_unavail(struct kvm_vcpu *vcpu); @@ -632,7 +633,7 @@ long int kvmppc_rm_h_confer(struct kvm_vcpu *vcpu, int target, unsigned int yield_count); long kvmppc_h_random(struct kvm_vcpu *vcpu); void kvmhv_commence_exit(int trap); -long kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu); +void kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu); void kvmppc_subcore_enter_guest(void); void kvmppc_subcore_exit_guest(void); long kvmppc_realmode_hmi_handler(void); diff --git a/arch/powerpc/include/asm/livepatch.h b/arch/powerpc/include/asm/livepatch.h index 47a03b9b528b..5070df19d463 100644 --- a/arch/powerpc/include/asm/livepatch.h +++ b/arch/powerpc/include/asm/livepatch.h @@ -21,6 +21,7 @@ #include <linux/module.h> #include <linux/ftrace.h> +#include <linux/sched/task_stack.h> #ifdef CONFIG_LIVEPATCH static inline int klp_check_compiler_support(void) @@ -43,13 +44,13 @@ static inline unsigned long klp_get_ftrace_location(unsigned long faddr) return ftrace_location_range(faddr, faddr + 16); } -static inline void klp_init_thread_info(struct thread_info *ti) +static inline void klp_init_thread_info(struct task_struct *p) { /* + 1 to account for STACK_END_MAGIC */ - ti->livepatch_sp = (unsigned long *)(ti + 1) + 1; + task_thread_info(p)->livepatch_sp = end_of_stack(p) + 1; } #else -static void klp_init_thread_info(struct thread_info *ti) { } +static inline void klp_init_thread_info(struct task_struct *p) { } #endif /* CONFIG_LIVEPATCH */ #endif /* _ASM_POWERPC_LIVEPATCH_H */ diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index 8311869005fa..2f0ca6560e47 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -47,9 +47,7 @@ struct machdep_calls { #endif #endif /* CONFIG_PPC64 */ - /* Platform set_dma_mask and dma_get_required_mask overrides */ - int (*dma_set_mask)(struct device *dev, u64 dma_mask); - u64 (*dma_get_required_mask)(struct device *dev); + void (*dma_set_mask)(struct device *dev, u64 dma_mask); int (*probe)(void); void (*setup_arch)(void); /* Optional, may be NULL */ diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index a8b8903e1844..17996bc9382b 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -209,7 +209,7 @@ extern int get_mce_event(struct machine_check_event *mce, bool release); extern void release_mce_event(void); extern void machine_check_queue_event(void); extern void machine_check_print_event_info(struct machine_check_event *evt, - bool user_mode); + bool user_mode, bool in_guest); #ifdef CONFIG_PPC_BOOK3S_64 void flush_and_reload_slb(void); #endif /* CONFIG_PPC_BOOK3S_64 */ diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index 25607604a7a5..d34ad1657d7b 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -289,6 +289,17 @@ static inline u16 get_mm_addr_key(struct mm_struct *mm, unsigned long address) } #endif /* CONFIG_PPC_MEM_KEYS */ +#ifdef CONFIG_STRICT_KERNEL_RWX +static inline bool strict_kernel_rwx_enabled(void) +{ + return rodata_enabled; +} +#else +static inline bool strict_kernel_rwx_enabled(void) +{ + return false; +} +#endif #endif /* !__ASSEMBLY__ */ /* The kernel use the constants below to index in the page sizes array. @@ -356,6 +367,8 @@ extern void early_init_mmu_secondary(void); extern void setup_initial_memory_limit(phys_addr_t first_memblock_base, phys_addr_t first_memblock_size); static inline void mmu_early_init_devtree(void) { } + +extern void *abatron_pteptrs[2]; #endif /* __ASSEMBLY__ */ #endif diff --git a/arch/powerpc/include/asm/nmi.h b/arch/powerpc/include/asm/nmi.h index bd9ba8defd72..84b4cfe73edd 100644 --- a/arch/powerpc/include/asm/nmi.h +++ b/arch/powerpc/include/asm/nmi.h @@ -14,4 +14,6 @@ extern void arch_trigger_cpumask_backtrace(const cpumask_t *mask, #define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace #endif +extern void hv_nmi_check_nonrecoverable(struct pt_regs *regs); + #endif /* _ASM_NMI_H */ diff --git a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h index b0f764c827c0..0a1a3fc54e54 100644 --- a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h @@ -231,9 +231,10 @@ static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize) } /* patch sites */ -extern s32 patch__itlbmiss_linmem_top; +extern s32 patch__itlbmiss_linmem_top, patch__itlbmiss_linmem_top8; extern s32 patch__dtlbmiss_linmem_top, patch__dtlbmiss_immr_jmp; extern s32 patch__fixupdar_linmem_top; +extern s32 patch__dtlbmiss_romem_top, patch__dtlbmiss_romem_top8; extern s32 patch__itlbmiss_exit_1, patch__itlbmiss_exit_2; extern s32 patch__dtlbmiss_exit_1, patch__dtlbmiss_exit_2, patch__dtlbmiss_exit_3; diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index 5c5ea2413413..ed870468ef6f 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -20,20 +20,11 @@ /* * On regular PPC32 page size is 4K (but we support 4K/16K/64K/256K pages - * on PPC44x). For PPC64 we support either 4K or 64K software + * on PPC44x and 4K/16K on 8xx). For PPC64 we support either 4K or 64K software * page size. When using 64K pages however, whether we are really supporting * 64K pages in HW or not is irrelevant to those definitions. */ -#if defined(CONFIG_PPC_256K_PAGES) -#define PAGE_SHIFT 18 -#elif defined(CONFIG_PPC_64K_PAGES) -#define PAGE_SHIFT 16 -#elif defined(CONFIG_PPC_16K_PAGES) -#define PAGE_SHIFT 14 -#else -#define PAGE_SHIFT 12 -#endif - +#define PAGE_SHIFT CONFIG_PPC_PAGE_SHIFT #define PAGE_SIZE (ASM_CONST(1) << PAGE_SHIFT) #ifndef __ASSEMBLY__ @@ -326,7 +317,6 @@ struct page; extern void clear_user_page(void *page, unsigned long vaddr, struct page *pg); extern void copy_user_page(void *to, void *from, unsigned long vaddr, struct page *p); -extern int page_is_ram(unsigned long pfn); extern int devmem_is_allowed(unsigned long pfn); #ifdef CONFIG_PPC_SMLPAR diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 77fc21278fa2..fc188e0e9179 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -20,6 +20,8 @@ struct device_node; struct pci_controller_ops { void (*dma_dev_setup)(struct pci_dev *pdev); void (*dma_bus_setup)(struct pci_bus *bus); + bool (*iommu_bypass_supported)(struct pci_dev *pdev, + u64 mask); int (*probe_mode)(struct pci_bus *bus); @@ -44,9 +46,6 @@ struct pci_controller_ops { void (*teardown_msi_irqs)(struct pci_dev *pdev); #endif - int (*dma_set_mask)(struct pci_dev *pdev, u64 dma_mask); - u64 (*dma_get_required_mask)(struct pci_dev *pdev); - void (*shutdown)(struct pci_controller *hose); }; @@ -275,6 +274,8 @@ extern int pcibios_map_io_space(struct pci_bus *bus); extern struct pci_controller *pci_find_hose_for_OF_device( struct device_node* node); +extern struct pci_controller *pci_find_controller_for_domain(int domain_nr); + /* Fill up host controller resources from the OF node */ extern void pci_process_bridge_OF_ranges(struct pci_controller *hose, struct device_node *dev, int primary); diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h index 0c72f1897063..6a1861a6301e 100644 --- a/arch/powerpc/include/asm/pci.h +++ b/arch/powerpc/include/asm/pci.h @@ -52,10 +52,8 @@ static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel) #ifdef CONFIG_PCI extern void set_pci_dma_ops(const struct dma_map_ops *dma_ops); -extern const struct dma_map_ops *get_pci_dma_ops(void); #else /* CONFIG_PCI */ #define set_pci_dma_ops(d) -#define get_pci_dma_ops() NULL #endif #ifdef CONFIG_PPC64 diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index dad1d27e196d..505550fb2935 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -66,7 +66,6 @@ extern unsigned long empty_zero_page[]; extern pgd_t swapper_pg_dir[]; -int dma_pfn_limit_to_zone(u64 pfn_limit); extern void paging_init(void); /* diff --git a/arch/powerpc/include/asm/powernv.h b/arch/powerpc/include/asm/powernv.h index 2f3ff7a27881..05b552418519 100644 --- a/arch/powerpc/include/asm/powernv.h +++ b/arch/powerpc/include/asm/powernv.h @@ -23,6 +23,8 @@ extern int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea, unsigned long *flags, unsigned long *status, int count); +void pnv_program_cpu_hotplug_lpcr(unsigned int cpu, u64 lpcr_val); + void pnv_tm_init(void); #else static inline void powernv_set_nmmu_ptcr(unsigned long ptcr) { } @@ -40,7 +42,6 @@ static inline int pnv_npu2_handle_fault(struct npu_context *context, } static inline void pnv_tm_init(void) { } -static inline void pnv_power9_force_smt4(void) { } #endif #endif /* _ASM_POWERNV_H */ diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index f9513ad38fa6..c5698a523bb1 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -326,6 +326,7 @@ #define PPC_INST_ADDI 0x38000000 #define PPC_INST_ADDIS 0x3c000000 #define PPC_INST_ADD 0x7c000214 +#define PPC_INST_ADDC 0x7c000014 #define PPC_INST_SUB 0x7c000050 #define PPC_INST_BLR 0x4e800020 #define PPC_INST_BLRL 0x4e800021 @@ -334,6 +335,9 @@ #define PPC_INST_MULLW 0x7c0001d6 #define PPC_INST_MULHWU 0x7c000016 #define PPC_INST_MULLI 0x1c000000 +#define PPC_INST_MADDHD 0x10000030 +#define PPC_INST_MADDHDU 0x10000031 +#define PPC_INST_MADDLD 0x10000033 #define PPC_INST_DIVWU 0x7c000396 #define PPC_INST_DIVD 0x7c0003d2 #define PPC_INST_RLWINM 0x54000000 @@ -377,6 +381,7 @@ /* macros to insert fields into opcodes */ #define ___PPC_RA(a) (((a) & 0x1f) << 16) #define ___PPC_RB(b) (((b) & 0x1f) << 11) +#define ___PPC_RC(c) (((c) & 0x1f) << 6) #define ___PPC_RS(s) (((s) & 0x1f) << 21) #define ___PPC_RT(t) ___PPC_RS(t) #define ___PPC_R(r) (((r) & 0x1) << 16) @@ -396,7 +401,7 @@ #define __PPC_WS(w) (((w) & 0x1f) << 11) #define __PPC_SH(s) __PPC_WS(s) #define __PPC_SH64(s) (__PPC_SH(s) | (((s) & 0x20) >> 4)) -#define __PPC_MB(s) (((s) & 0x1f) << 6) +#define __PPC_MB(s) ___PPC_RC(s) #define __PPC_ME(s) (((s) & 0x1f) << 1) #define __PPC_MB64(s) (__PPC_MB(s) | ((s) & 0x20)) #define __PPC_ME64(s) __PPC_MB64(s) @@ -438,6 +443,15 @@ #define PPC_STQCX(t, a, b) stringify_in_c(.long PPC_INST_STQCX | \ ___PPC_RT(t) | ___PPC_RA(a) | \ ___PPC_RB(b)) +#define PPC_MADDHD(t, a, b, c) stringify_in_c(.long PPC_INST_MADDHD | \ + ___PPC_RT(t) | ___PPC_RA(a) | \ + ___PPC_RB(b) | ___PPC_RC(c)) +#define PPC_MADDHDU(t, a, b, c) stringify_in_c(.long PPC_INST_MADDHDU | \ + ___PPC_RT(t) | ___PPC_RA(a) | \ + ___PPC_RB(b) | ___PPC_RC(c)) +#define PPC_MADDLD(t, a, b, c) stringify_in_c(.long PPC_INST_MADDLD | \ + ___PPC_RT(t) | ___PPC_RA(a) | \ + ___PPC_RB(b) | ___PPC_RC(c)) #define PPC_MSGSND(b) stringify_in_c(.long PPC_INST_MSGSND | \ ___PPC_RB(b)) #define PPC_MSGSYNC stringify_in_c(.long PPC_INST_MSGSYNC) diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h index f67da277d652..f191ef0d2a0a 100644 --- a/arch/powerpc/include/asm/ppc-pci.h +++ b/arch/powerpc/include/asm/ppc-pci.h @@ -53,13 +53,13 @@ void eeh_addr_cache_rmv_dev(struct pci_dev *dev); struct eeh_dev *eeh_addr_cache_get_dev(unsigned long addr); void eeh_slot_error_detail(struct eeh_pe *pe, int severity); int eeh_pci_enable(struct eeh_pe *pe, int function); -int eeh_pe_reset_full(struct eeh_pe *pe); +int eeh_pe_reset_full(struct eeh_pe *pe, bool include_passed); void eeh_save_bars(struct eeh_dev *edev); int rtas_write_config(struct pci_dn *, int where, int size, u32 val); int rtas_read_config(struct pci_dn *, int where, int size, u32 *val); void eeh_pe_state_mark(struct eeh_pe *pe, int state); void eeh_pe_mark_isolated(struct eeh_pe *pe); -void eeh_pe_state_clear(struct eeh_pe *pe, int state); +void eeh_pe_state_clear(struct eeh_pe *pe, int state, bool include_passed); void eeh_pe_state_mark_with_cfg(struct eeh_pe *pe, int state); void eeh_pe_dev_mode_mark(struct eeh_pe *pe, int mode); diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index ee58526cb6c2..3351bcf42f2d 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -40,7 +40,7 @@ #ifndef __ASSEMBLY__ #include <linux/types.h> -#include <asm/thread_info.h> +#include <linux/thread_info.h> #include <asm/ptrace.h> #include <asm/hw_breakpoint.h> @@ -77,105 +77,15 @@ extern int _chrp_type; #ifdef __KERNEL__ -struct task_struct; -void start_thread(struct pt_regs *regs, unsigned long fdptr, unsigned long sp); -void release_thread(struct task_struct *); - -#ifdef CONFIG_PPC32 - -#if CONFIG_TASK_SIZE > CONFIG_KERNEL_START -#error User TASK_SIZE overlaps with KERNEL_START address -#endif -#define TASK_SIZE (CONFIG_TASK_SIZE) - -/* This decides where the kernel will search for a free chunk of vm - * space during mmap's. - */ -#define TASK_UNMAPPED_BASE (TASK_SIZE / 8 * 3) -#endif - #ifdef CONFIG_PPC64 -/* - * 64-bit user address space can have multiple limits - * For now supported values are: - */ -#define TASK_SIZE_64TB (0x0000400000000000UL) -#define TASK_SIZE_128TB (0x0000800000000000UL) -#define TASK_SIZE_512TB (0x0002000000000000UL) -#define TASK_SIZE_1PB (0x0004000000000000UL) -#define TASK_SIZE_2PB (0x0008000000000000UL) -/* - * With 52 bits in the address we can support - * upto 4PB of range. - */ -#define TASK_SIZE_4PB (0x0010000000000000UL) - -/* - * For now 512TB is only supported with book3s and 64K linux page size. - */ -#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_PPC_64K_PAGES) -/* - * Max value currently used: - */ -#define TASK_SIZE_USER64 TASK_SIZE_4PB -#define DEFAULT_MAP_WINDOW_USER64 TASK_SIZE_128TB -#define TASK_CONTEXT_SIZE TASK_SIZE_512TB +#include <asm/task_size_64.h> #else -#define TASK_SIZE_USER64 TASK_SIZE_64TB -#define DEFAULT_MAP_WINDOW_USER64 TASK_SIZE_64TB -/* - * We don't need to allocate extended context ids for 4K page size, because - * we limit the max effective address on this config to 64TB. - */ -#define TASK_CONTEXT_SIZE TASK_SIZE_64TB +#include <asm/task_size_32.h> #endif -/* - * 32-bit user address space is 4GB - 1 page - * (this 1 page is needed so referencing of 0xFFFFFFFF generates EFAULT - */ -#define TASK_SIZE_USER32 (0x0000000100000000UL - (1*PAGE_SIZE)) - -#define TASK_SIZE_OF(tsk) (test_tsk_thread_flag(tsk, TIF_32BIT) ? \ - TASK_SIZE_USER32 : TASK_SIZE_USER64) -#define TASK_SIZE TASK_SIZE_OF(current) -/* This decides where the kernel will search for a free chunk of vm - * space during mmap's. - */ -#define TASK_UNMAPPED_BASE_USER32 (PAGE_ALIGN(TASK_SIZE_USER32 / 4)) -#define TASK_UNMAPPED_BASE_USER64 (PAGE_ALIGN(DEFAULT_MAP_WINDOW_USER64 / 4)) - -#define TASK_UNMAPPED_BASE ((is_32bit_task()) ? \ - TASK_UNMAPPED_BASE_USER32 : TASK_UNMAPPED_BASE_USER64 ) -#endif - -/* - * Initial task size value for user applications. For book3s 64 we start - * with 128TB and conditionally enable upto 512TB - */ -#ifdef CONFIG_PPC_BOOK3S_64 -#define DEFAULT_MAP_WINDOW ((is_32bit_task()) ? \ - TASK_SIZE_USER32 : DEFAULT_MAP_WINDOW_USER64) -#else -#define DEFAULT_MAP_WINDOW TASK_SIZE -#endif - -#ifdef __powerpc64__ - -#define STACK_TOP_USER64 DEFAULT_MAP_WINDOW_USER64 -#define STACK_TOP_USER32 TASK_SIZE_USER32 - -#define STACK_TOP (is_32bit_task() ? \ - STACK_TOP_USER32 : STACK_TOP_USER64) - -#define STACK_TOP_MAX TASK_SIZE_USER64 - -#else /* __powerpc64__ */ - -#define STACK_TOP TASK_SIZE -#define STACK_TOP_MAX STACK_TOP - -#endif /* __powerpc64__ */ +struct task_struct; +void start_thread(struct pt_regs *regs, unsigned long fdptr, unsigned long sp); +void release_thread(struct task_struct *); typedef struct { unsigned long seg; @@ -250,6 +160,9 @@ struct thread_struct { #ifdef CONFIG_PPC32 void *pgdir; /* root of page-table tree */ unsigned long ksp_limit; /* if ksp <= ksp_limit stack overflow */ +#ifdef CONFIG_PPC_RTAS + unsigned long rtas_sp; /* stack pointer for when in RTAS */ +#endif #endif /* Debug Registers */ struct debug_reg debug; @@ -357,8 +270,7 @@ struct thread_struct { #define ARCH_MIN_TASKALIGN 16 #define INIT_SP (sizeof(init_stack) + (unsigned long) &init_stack) -#define INIT_SP_LIMIT \ - (_ALIGN_UP(sizeof(init_thread_info), 16) + (unsigned long) &init_stack) +#define INIT_SP_LIMIT ((unsigned long)&init_stack) #ifdef CONFIG_SPE #define SPEFSCR_INIT \ diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 0b8a735b6d85..64271e562fed 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -157,7 +157,7 @@ extern int ptrace_put_reg(struct task_struct *task, int regno, unsigned long data); #define current_pt_regs() \ - ((struct pt_regs *)((unsigned long)current_thread_info() + THREAD_SIZE) - 1) + ((struct pt_regs *)((unsigned long)task_stack_page(current) + THREAD_SIZE) - 1) /* * We use the least-significant bit of the trap field to indicate * whether we have saved the full set of registers, or only a diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 1c98ef1f2d5b..c5b2aff0ce8e 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -1062,7 +1062,7 @@ * - SPRG9 debug exception scratch * * All 32-bit: - * - SPRG3 current thread_info pointer + * - SPRG3 current thread_struct physical addr pointer * (virtual on BookE, physical on others) * * 32-bit classic: @@ -1167,7 +1167,7 @@ #ifdef CONFIG_PPC_BOOK3S_32 #define SPRN_SPRG_SCRATCH0 SPRN_SPRG0 #define SPRN_SPRG_SCRATCH1 SPRN_SPRG1 -#define SPRN_SPRG_RTAS SPRN_SPRG2 +#define SPRN_SPRG_PGDIR SPRN_SPRG2 #define SPRN_SPRG_603_LRU SPRN_SPRG4 #endif @@ -1425,6 +1425,11 @@ static inline void msr_check_and_clear(unsigned long bits) #define mfsrin(v) ({unsigned int rval; \ asm volatile("mfsrin %0,%1" : "=r" (rval) : "r" (v)); \ rval;}) + +static inline void mtsrin(u32 val, u32 idx) +{ + asm volatile("mtsrin %0, %1" : : "r" (val), "r" (idx)); +} #endif #define proc_trap() asm volatile("trap") diff --git a/arch/powerpc/include/asm/sections.h b/arch/powerpc/include/asm/sections.h index e335a8f846af..4a1664a8658d 100644 --- a/arch/powerpc/include/asm/sections.h +++ b/arch/powerpc/include/asm/sections.h @@ -17,6 +17,13 @@ extern char __end_interrupts[]; extern char __prom_init_toc_start[]; extern char __prom_init_toc_end[]; +#ifdef CONFIG_PPC_POWERNV +extern char start_real_trampolines[]; +extern char end_real_trampolines[]; +extern char start_virt_trampolines[]; +extern char end_virt_trampolines[]; +#endif + static inline int in_kernel_text(unsigned long addr) { if (addr >= (unsigned long)_stext && addr < (unsigned long)__init_end) diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h index 41695745032c..0de717e16dd6 100644 --- a/arch/powerpc/include/asm/smp.h +++ b/arch/powerpc/include/asm/smp.h @@ -83,7 +83,22 @@ int is_cpu_dead(unsigned int cpu); /* 32-bit */ extern int smp_hw_index[]; -#define raw_smp_processor_id() (current_thread_info()->cpu) +/* + * This is particularly ugly: it appears we can't actually get the definition + * of task_struct here, but we need access to the CPU this task is running on. + * Instead of using task_struct we're using _TASK_CPU which is extracted from + * asm-offsets.h by kbuild to get the current processor ID. + * + * This also needs to be safeguarded when building asm-offsets.s because at + * that time _TASK_CPU is not defined yet. It could have been guarded by + * _TASK_CPU itself, but we want the build to fail if _TASK_CPU is missing + * when building something else than asm-offsets.s + */ +#ifdef GENERATING_ASM_OFFSETS +#define raw_smp_processor_id() (0) +#else +#define raw_smp_processor_id() (*(unsigned int *)((void *)current + _TASK_CPU)) +#endif #define hard_smp_processor_id() (smp_hw_index[smp_processor_id()]) static inline int get_hard_smp_processor_id(int cpu) diff --git a/arch/powerpc/include/asm/swiotlb.h b/arch/powerpc/include/asm/swiotlb.h index f65ecf57b66c..b7d082c0ec25 100644 --- a/arch/powerpc/include/asm/swiotlb.h +++ b/arch/powerpc/include/asm/swiotlb.h @@ -13,12 +13,7 @@ #include <linux/swiotlb.h> -extern const struct dma_map_ops powerpc_swiotlb_dma_ops; - extern unsigned int ppc_swiotlb_enable; -int __init swiotlb_setup_bus_notifier(void); - -extern void pci_dma_dev_setup_swiotlb(struct pci_dev *pdev); #ifdef CONFIG_SWIOTLB void swiotlb_detect_4g(void); diff --git a/arch/powerpc/include/asm/task_size_32.h b/arch/powerpc/include/asm/task_size_32.h new file mode 100644 index 000000000000..de7290ee770f --- /dev/null +++ b/arch/powerpc/include/asm/task_size_32.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_POWERPC_TASK_SIZE_32_H +#define _ASM_POWERPC_TASK_SIZE_32_H + +#if CONFIG_TASK_SIZE > CONFIG_KERNEL_START +#error User TASK_SIZE overlaps with KERNEL_START address +#endif + +#define TASK_SIZE (CONFIG_TASK_SIZE) + +/* + * This decides where the kernel will search for a free chunk of vm space during + * mmap's. + */ +#define TASK_UNMAPPED_BASE (TASK_SIZE / 8 * 3) + +#define DEFAULT_MAP_WINDOW TASK_SIZE +#define STACK_TOP TASK_SIZE +#define STACK_TOP_MAX STACK_TOP + +#endif /* _ASM_POWERPC_TASK_SIZE_32_H */ diff --git a/arch/powerpc/include/asm/task_size_64.h b/arch/powerpc/include/asm/task_size_64.h new file mode 100644 index 000000000000..eab4779f6b84 --- /dev/null +++ b/arch/powerpc/include/asm/task_size_64.h @@ -0,0 +1,79 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_POWERPC_TASK_SIZE_64_H +#define _ASM_POWERPC_TASK_SIZE_64_H + +/* + * 64-bit user address space can have multiple limits + * For now supported values are: + */ +#define TASK_SIZE_64TB (0x0000400000000000UL) +#define TASK_SIZE_128TB (0x0000800000000000UL) +#define TASK_SIZE_512TB (0x0002000000000000UL) +#define TASK_SIZE_1PB (0x0004000000000000UL) +#define TASK_SIZE_2PB (0x0008000000000000UL) + +/* + * With 52 bits in the address we can support up to 4PB of range. + */ +#define TASK_SIZE_4PB (0x0010000000000000UL) + +/* + * For now 512TB is only supported with book3s and 64K linux page size. + */ +#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_PPC_64K_PAGES) +/* + * Max value currently used: + */ +#define TASK_SIZE_USER64 TASK_SIZE_4PB +#define DEFAULT_MAP_WINDOW_USER64 TASK_SIZE_128TB +#define TASK_CONTEXT_SIZE TASK_SIZE_512TB +#else +#define TASK_SIZE_USER64 TASK_SIZE_64TB +#define DEFAULT_MAP_WINDOW_USER64 TASK_SIZE_64TB + +/* + * We don't need to allocate extended context ids for 4K page size, because we + * limit the max effective address on this config to 64TB. + */ +#define TASK_CONTEXT_SIZE TASK_SIZE_64TB +#endif + +/* + * 32-bit user address space is 4GB - 1 page + * (this 1 page is needed so referencing of 0xFFFFFFFF generates EFAULT + */ +#define TASK_SIZE_USER32 (0x0000000100000000UL - (1 * PAGE_SIZE)) + +#define TASK_SIZE_OF(tsk) \ + (test_tsk_thread_flag(tsk, TIF_32BIT) ? TASK_SIZE_USER32 : \ + TASK_SIZE_USER64) + +#define TASK_SIZE TASK_SIZE_OF(current) + +#define TASK_UNMAPPED_BASE_USER32 (PAGE_ALIGN(TASK_SIZE_USER32 / 4)) +#define TASK_UNMAPPED_BASE_USER64 (PAGE_ALIGN(DEFAULT_MAP_WINDOW_USER64 / 4)) + +/* + * This decides where the kernel will search for a free chunk of vm space during + * mmap's. + */ +#define TASK_UNMAPPED_BASE \ + ((is_32bit_task()) ? TASK_UNMAPPED_BASE_USER32 : TASK_UNMAPPED_BASE_USER64) + +/* + * Initial task size value for user applications. For book3s 64 we start + * with 128TB and conditionally enable upto 512TB + */ +#ifdef CONFIG_PPC_BOOK3S_64 +#define DEFAULT_MAP_WINDOW \ + ((is_32bit_task()) ? TASK_SIZE_USER32 : DEFAULT_MAP_WINDOW_USER64) +#else +#define DEFAULT_MAP_WINDOW TASK_SIZE +#endif + +#define STACK_TOP_USER64 DEFAULT_MAP_WINDOW_USER64 +#define STACK_TOP_USER32 TASK_SIZE_USER32 +#define STACK_TOP_MAX TASK_SIZE_USER64 +#define STACK_TOP (is_32bit_task() ? STACK_TOP_USER32 : STACK_TOP_USER64) + +#endif /* _ASM_POWERPC_TASK_SIZE_64_H */ diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index 544cac0474cb..8e1d0195ac36 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -17,12 +17,6 @@ #define THREAD_SIZE (1 << THREAD_SHIFT) -#ifdef CONFIG_PPC64 -#define CURRENT_THREAD_INFO(dest, sp) stringify_in_c(clrrdi dest, sp, THREAD_SHIFT) -#else -#define CURRENT_THREAD_INFO(dest, sp) stringify_in_c(rlwinm dest, sp, 0, 0, 31-THREAD_SHIFT) -#endif - #ifndef __ASSEMBLY__ #include <linux/cache.h> #include <asm/processor.h> @@ -34,8 +28,6 @@ * low level task data. */ struct thread_info { - struct task_struct *task; /* main task structure */ - int cpu; /* cpu we're on */ int preempt_count; /* 0 => preemptable, <0 => BUG */ unsigned long local_flags; /* private flags for thread */ @@ -58,8 +50,6 @@ struct thread_info { */ #define INIT_THREAD_INFO(tsk) \ { \ - .task = &tsk, \ - .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .flags = 0, \ } @@ -67,15 +57,6 @@ struct thread_info { #define THREAD_SIZE_ORDER (THREAD_SHIFT - PAGE_SHIFT) /* how to get the thread information struct from C */ -static inline struct thread_info *current_thread_info(void) -{ - unsigned long val; - - asm (CURRENT_THREAD_INFO(%0,1) : "=r" (val)); - - return (struct thread_info *)val; -} - extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); #ifdef CONFIG_PPC_BOOK3S_64 diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index a4a718dbfec6..f85e2b01c3df 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -132,6 +132,8 @@ static inline void shared_proc_topology_init(void) {} #define topology_sibling_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu)) #define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu)) #define topology_core_id(cpu) (cpu_to_core_id(cpu)) + +int dlpar_cpu_readd(int cpu); #endif #endif diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index cb7f0bb9ee71..cddadccf551d 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -36,7 +36,7 @@ obj-y := cputable.o ptrace.o syscalls.o \ process.o systbl.o idle.o \ signal.o sysfs.o cacheinfo.o time.o \ prom.o traps.o setup-common.o \ - udbg.o misc.o io.o dma.o misc_$(BITS).o \ + udbg.o misc.o io.o misc_$(BITS).o \ of_platform.o prom_parse.o obj-$(CONFIG_PPC64) += setup_64.o sys_ppc32.o \ signal_64.o ptrace32.o \ @@ -105,6 +105,7 @@ obj-$(CONFIG_UPROBES) += uprobes.o obj-$(CONFIG_PPC_UDBG_16550) += legacy_serial.o udbg_16550.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_SWIOTLB) += dma-swiotlb.o +obj-$(CONFIG_ARCH_HAS_DMA_SET_MASK) += dma-mask.o pci64-$(CONFIG_PPC64) += pci_dn.o pci-hotplug.o isa-bridge.o obj-$(CONFIG_PCI) += pci_$(BITS).o $(pci64-y) \ @@ -142,19 +143,29 @@ endif obj-$(CONFIG_EPAPR_PARAVIRT) += epapr_paravirt.o epapr_hcalls.o obj-$(CONFIG_KVM_GUEST) += kvm.o kvm_emul.o -# Disable GCOV & sanitizers in odd or sensitive code +# Disable GCOV, KCOV & sanitizers in odd or sensitive code GCOV_PROFILE_prom_init.o := n +KCOV_INSTRUMENT_prom_init.o := n UBSAN_SANITIZE_prom_init.o := n GCOV_PROFILE_machine_kexec_64.o := n +KCOV_INSTRUMENT_machine_kexec_64.o := n UBSAN_SANITIZE_machine_kexec_64.o := n GCOV_PROFILE_machine_kexec_32.o := n +KCOV_INSTRUMENT_machine_kexec_32.o := n UBSAN_SANITIZE_machine_kexec_32.o := n GCOV_PROFILE_kprobes.o := n +KCOV_INSTRUMENT_kprobes.o := n UBSAN_SANITIZE_kprobes.o := n GCOV_PROFILE_kprobes-ftrace.o := n +KCOV_INSTRUMENT_kprobes-ftrace.o := n UBSAN_SANITIZE_kprobes-ftrace.o := n UBSAN_SANITIZE_vdso.o := n +# Necessary for booting with kcov enabled on book3e machines +KCOV_INSTRUMENT_cputable.o := n +KCOV_INSTRUMENT_setup_64.o := n +KCOV_INSTRUMENT_paca.o := n + extra-$(CONFIG_PPC_FPU) += fpu.o extra-$(CONFIG_ALTIVEC) += vector.o extra-$(CONFIG_PPC64) += entry_64.o diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 9ffc72ded73a..86a61e5f8285 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -13,6 +13,8 @@ * 2 of the License, or (at your option) any later version. */ +#define GENERATING_ASM_OFFSETS /* asm/smp.h */ + #include <linux/compat.h> #include <linux/signal.h> #include <linux/sched.h> @@ -90,10 +92,15 @@ int main(void) DEFINE(SIGSEGV, SIGSEGV); DEFINE(NMI_MASK, NMI_MASK); #else - OFFSET(THREAD_INFO, task_struct, stack); - DEFINE(THREAD_INFO_GAP, _ALIGN_UP(sizeof(struct thread_info), 16)); OFFSET(KSP_LIMIT, thread_struct, ksp_limit); +#ifdef CONFIG_PPC_RTAS + OFFSET(RTAS_SP, thread_struct, rtas_sp); +#endif #endif /* CONFIG_PPC64 */ + OFFSET(TASK_STACK, task_struct, stack); +#ifdef CONFIG_SMP + OFFSET(TASK_CPU, task_struct, cpu); +#endif #ifdef CONFIG_LIVEPATCH OFFSET(TI_livepatch_sp, thread_info, livepatch_sp); @@ -161,8 +168,6 @@ int main(void) OFFSET(TI_FLAGS, thread_info, flags); OFFSET(TI_LOCAL_FLAGS, thread_info, local_flags); OFFSET(TI_PREEMPT, thread_info, preempt_count); - OFFSET(TI_TASK, thread_info, task); - OFFSET(TI_CPU, thread_info, cpu); #ifdef CONFIG_PPC64 OFFSET(DCACHEL1BLOCKSIZE, ppc64_caches, l1d.block_size); @@ -177,6 +182,8 @@ int main(void) OFFSET(PACAPROCSTART, paca_struct, cpu_start); OFFSET(PACAKSAVE, paca_struct, kstack); OFFSET(PACACURRENT, paca_struct, __current); + DEFINE(PACA_THREAD_INFO, offsetof(struct paca_struct, __current) + + offsetof(struct task_struct, thread_info)); OFFSET(PACASAVEDMSR, paca_struct, saved_msr); OFFSET(PACAR1, paca_struct, saved_r1); OFFSET(PACATOC, paca_struct, kernel_toc); diff --git a/arch/powerpc/kernel/cpu_setup_6xx.S b/arch/powerpc/kernel/cpu_setup_6xx.S index 8c069e96c478..6f1c11e0691f 100644 --- a/arch/powerpc/kernel/cpu_setup_6xx.S +++ b/arch/powerpc/kernel/cpu_setup_6xx.S @@ -24,6 +24,10 @@ BEGIN_MMU_FTR_SECTION li r10,0 mtspr SPRN_SPRG_603_LRU,r10 /* init SW LRU tracking */ END_MMU_FTR_SECTION_IFSET(MMU_FTR_NEED_DTLB_SW_LRU) + lis r10, (swapper_pg_dir - PAGE_OFFSET)@h + ori r10, r10, (swapper_pg_dir - PAGE_OFFSET)@l + mtspr SPRN_SPRG_PGDIR, r10 + BEGIN_FTR_SECTION bl __init_fpu_registers END_FTR_SECTION_IFCLR(CPU_FTR_FPU_UNAVAILABLE) diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c index 9c9bcaae2f75..09231ef06d01 100644 --- a/arch/powerpc/kernel/dma-iommu.c +++ b/arch/powerpc/kernel/dma-iommu.c @@ -6,12 +6,31 @@ * busses using the iommu infrastructure */ +#include <linux/dma-direct.h> +#include <linux/pci.h> #include <asm/iommu.h> /* * Generic iommu implementation */ +/* + * The coherent mask may be smaller than the real mask, check if we can + * really use a direct window. + */ +static inline bool dma_iommu_alloc_bypass(struct device *dev) +{ + return dev->archdata.iommu_bypass && !iommu_fixed_is_weak && + dma_direct_supported(dev, dev->coherent_dma_mask); +} + +static inline bool dma_iommu_map_bypass(struct device *dev, + unsigned long attrs) +{ + return dev->archdata.iommu_bypass && + (!iommu_fixed_is_weak || (attrs & DMA_ATTR_WEAK_ORDERING)); +} + /* Allocates a contiguous real buffer and creates mappings over it. * Returns the virtual address of the buffer and sets dma_handle * to the dma address (mapping) of the first page. @@ -20,6 +39,8 @@ static void *dma_iommu_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag, unsigned long attrs) { + if (dma_iommu_alloc_bypass(dev)) + return dma_direct_alloc(dev, size, dma_handle, flag, attrs); return iommu_alloc_coherent(dev, get_iommu_table_base(dev), size, dma_handle, dev->coherent_dma_mask, flag, dev_to_node(dev)); @@ -29,7 +50,11 @@ static void dma_iommu_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle, unsigned long attrs) { - iommu_free_coherent(get_iommu_table_base(dev), size, vaddr, dma_handle); + if (dma_iommu_alloc_bypass(dev)) + dma_direct_free(dev, size, vaddr, dma_handle, attrs); + else + iommu_free_coherent(get_iommu_table_base(dev), size, vaddr, + dma_handle); } /* Creates TCEs for a user provided buffer. The user buffer must be @@ -42,6 +67,9 @@ static dma_addr_t dma_iommu_map_page(struct device *dev, struct page *page, enum dma_data_direction direction, unsigned long attrs) { + if (dma_iommu_map_bypass(dev, attrs)) + return dma_direct_map_page(dev, page, offset, size, direction, + attrs); return iommu_map_page(dev, get_iommu_table_base(dev), page, offset, size, device_to_mask(dev), direction, attrs); } @@ -51,8 +79,9 @@ static void dma_iommu_unmap_page(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction direction, unsigned long attrs) { - iommu_unmap_page(get_iommu_table_base(dev), dma_handle, size, direction, - attrs); + if (!dma_iommu_map_bypass(dev, attrs)) + iommu_unmap_page(get_iommu_table_base(dev), dma_handle, size, + direction, attrs); } @@ -60,6 +89,8 @@ static int dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist, int nelems, enum dma_data_direction direction, unsigned long attrs) { + if (dma_iommu_map_bypass(dev, attrs)) + return dma_direct_map_sg(dev, sglist, nelems, direction, attrs); return ppc_iommu_map_sg(dev, get_iommu_table_base(dev), sglist, nelems, device_to_mask(dev), direction, attrs); } @@ -68,10 +99,20 @@ static void dma_iommu_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems, enum dma_data_direction direction, unsigned long attrs) { - ppc_iommu_unmap_sg(get_iommu_table_base(dev), sglist, nelems, + if (!dma_iommu_map_bypass(dev, attrs)) + ppc_iommu_unmap_sg(get_iommu_table_base(dev), sglist, nelems, direction, attrs); } +static bool dma_iommu_bypass_supported(struct device *dev, u64 mask) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct pci_controller *phb = pci_bus_to_host(pdev->bus); + + return phb->controller_ops.iommu_bypass_supported && + phb->controller_ops.iommu_bypass_supported(pdev, mask); +} + /* We support DMA to/from any memory page via the iommu */ int dma_iommu_dma_supported(struct device *dev, u64 mask) { @@ -83,32 +124,48 @@ int dma_iommu_dma_supported(struct device *dev, u64 mask) return 0; } + if (dev_is_pci(dev) && dma_iommu_bypass_supported(dev, mask)) { + dev->archdata.iommu_bypass = true; + dev_dbg(dev, "iommu: 64-bit OK, using fixed ops\n"); + return 1; + } + if (tbl->it_offset > (mask >> tbl->it_page_shift)) { dev_info(dev, "Warning: IOMMU offset too big for device mask\n"); dev_info(dev, "mask: 0x%08llx, table offset: 0x%08lx\n", mask, tbl->it_offset << tbl->it_page_shift); return 0; - } else - return 1; + } + + dev_dbg(dev, "iommu: not 64-bit, using default ops\n"); + dev->archdata.iommu_bypass = false; + return 1; } -static u64 dma_iommu_get_required_mask(struct device *dev) +u64 dma_iommu_get_required_mask(struct device *dev) { struct iommu_table *tbl = get_iommu_table_base(dev); u64 mask; + if (!tbl) return 0; + if (dev_is_pci(dev)) { + u64 bypass_mask = dma_direct_get_required_mask(dev); + + if (dma_iommu_bypass_supported(dev, bypass_mask)) + return bypass_mask; + } + mask = 1ULL < (fls_long(tbl->it_offset + tbl->it_size) - 1); mask += mask - 1; return mask; } -struct dma_map_ops dma_iommu_ops = { +const struct dma_map_ops dma_iommu_ops = { .alloc = dma_iommu_alloc_coherent, .free = dma_iommu_free_coherent, - .mmap = dma_nommu_mmap_coherent, .map_sg = dma_iommu_map_sg, .unmap_sg = dma_iommu_unmap_sg, .dma_supported = dma_iommu_dma_supported, diff --git a/arch/powerpc/kernel/dma-mask.c b/arch/powerpc/kernel/dma-mask.c new file mode 100644 index 000000000000..ffbbbc432612 --- /dev/null +++ b/arch/powerpc/kernel/dma-mask.c @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/dma-mapping.h> +#include <linux/export.h> +#include <asm/machdep.h> + +void arch_dma_set_mask(struct device *dev, u64 dma_mask) +{ + if (ppc_md.dma_set_mask) + ppc_md.dma_set_mask(dev, dma_mask); +} +EXPORT_SYMBOL(arch_dma_set_mask); diff --git a/arch/powerpc/kernel/dma-swiotlb.c b/arch/powerpc/kernel/dma-swiotlb.c index 7d5fc9751622..132d61c91629 100644 --- a/arch/powerpc/kernel/dma-swiotlb.c +++ b/arch/powerpc/kernel/dma-swiotlb.c @@ -10,101 +10,12 @@ * option) any later version. * */ - -#include <linux/dma-direct.h> #include <linux/memblock.h> -#include <linux/pfn.h> -#include <linux/of_platform.h> -#include <linux/platform_device.h> -#include <linux/pci.h> - #include <asm/machdep.h> #include <asm/swiotlb.h> -#include <asm/dma.h> unsigned int ppc_swiotlb_enable; -static u64 swiotlb_powerpc_get_required(struct device *dev) -{ - u64 end, mask, max_direct_dma_addr = dev->archdata.max_direct_dma_addr; - - end = memblock_end_of_DRAM(); - if (max_direct_dma_addr && end > max_direct_dma_addr) - end = max_direct_dma_addr; - end += get_dma_offset(dev); - - mask = 1ULL << (fls64(end) - 1); - mask += mask - 1; - - return mask; -} - -/* - * At the moment, all platforms that use this code only require - * swiotlb to be used if we're operating on HIGHMEM. Since - * we don't ever call anything other than map_sg, unmap_sg, - * map_page, and unmap_page on highmem, use normal dma_ops - * for everything else. - */ -const struct dma_map_ops powerpc_swiotlb_dma_ops = { - .alloc = __dma_nommu_alloc_coherent, - .free = __dma_nommu_free_coherent, - .mmap = dma_nommu_mmap_coherent, - .map_sg = dma_direct_map_sg, - .unmap_sg = dma_direct_unmap_sg, - .dma_supported = swiotlb_dma_supported, - .map_page = dma_direct_map_page, - .unmap_page = dma_direct_unmap_page, - .sync_single_for_cpu = dma_direct_sync_single_for_cpu, - .sync_single_for_device = dma_direct_sync_single_for_device, - .sync_sg_for_cpu = dma_direct_sync_sg_for_cpu, - .sync_sg_for_device = dma_direct_sync_sg_for_device, - .get_required_mask = swiotlb_powerpc_get_required, -}; - -void pci_dma_dev_setup_swiotlb(struct pci_dev *pdev) -{ - struct pci_controller *hose; - struct dev_archdata *sd; - - hose = pci_bus_to_host(pdev->bus); - sd = &pdev->dev.archdata; - sd->max_direct_dma_addr = - hose->dma_window_base_cur + hose->dma_window_size; -} - -static int ppc_swiotlb_bus_notify(struct notifier_block *nb, - unsigned long action, void *data) -{ - struct device *dev = data; - struct dev_archdata *sd; - - /* We are only intereted in device addition */ - if (action != BUS_NOTIFY_ADD_DEVICE) - return 0; - - sd = &dev->archdata; - sd->max_direct_dma_addr = 0; - - /* May need to bounce if the device can't address all of DRAM */ - if ((dma_get_mask(dev) + 1) < memblock_end_of_DRAM()) - set_dma_ops(dev, &powerpc_swiotlb_dma_ops); - - return NOTIFY_DONE; -} - -static struct notifier_block ppc_swiotlb_plat_bus_notifier = { - .notifier_call = ppc_swiotlb_bus_notify, - .priority = 0, -}; - -int __init swiotlb_setup_bus_notifier(void) -{ - bus_register_notifier(&platform_bus_type, - &ppc_swiotlb_plat_bus_notifier); - return 0; -} - void __init swiotlb_detect_4g(void) { if ((memblock_end_of_DRAM() - 1) > 0xffffffff) diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c deleted file mode 100644 index b1903ebb2e9c..000000000000 --- a/arch/powerpc/kernel/dma.c +++ /dev/null @@ -1,362 +0,0 @@ -/* - * Copyright (C) 2006 Benjamin Herrenschmidt, IBM Corporation - * - * Provide default implementations of the DMA mapping callbacks for - * directly mapped busses. - */ - -#include <linux/device.h> -#include <linux/dma-mapping.h> -#include <linux/dma-debug.h> -#include <linux/gfp.h> -#include <linux/memblock.h> -#include <linux/export.h> -#include <linux/pci.h> -#include <asm/vio.h> -#include <asm/bug.h> -#include <asm/machdep.h> -#include <asm/swiotlb.h> -#include <asm/iommu.h> - -/* - * Generic direct DMA implementation - * - * This implementation supports a per-device offset that can be applied if - * the address at which memory is visible to devices is not 0. Platform code - * can set archdata.dma_data to an unsigned long holding the offset. By - * default the offset is PCI_DRAM_OFFSET. - */ - -static u64 __maybe_unused get_pfn_limit(struct device *dev) -{ - u64 pfn = (dev->coherent_dma_mask >> PAGE_SHIFT) + 1; - struct dev_archdata __maybe_unused *sd = &dev->archdata; - -#ifdef CONFIG_SWIOTLB - if (sd->max_direct_dma_addr && dev->dma_ops == &powerpc_swiotlb_dma_ops) - pfn = min_t(u64, pfn, sd->max_direct_dma_addr >> PAGE_SHIFT); -#endif - - return pfn; -} - -static int dma_nommu_dma_supported(struct device *dev, u64 mask) -{ -#ifdef CONFIG_PPC64 - u64 limit = get_dma_offset(dev) + (memblock_end_of_DRAM() - 1); - - /* Limit fits in the mask, we are good */ - if (mask >= limit) - return 1; - -#ifdef CONFIG_FSL_SOC - /* - * Freescale gets another chance via ZONE_DMA, however - * that will have to be refined if/when they support iommus - */ - return 1; -#endif - /* Sorry ... */ - return 0; -#else - return 1; -#endif -} - -#ifndef CONFIG_NOT_COHERENT_CACHE -void *__dma_nommu_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag, - unsigned long attrs) -{ - void *ret; - struct page *page; - int node = dev_to_node(dev); -#ifdef CONFIG_FSL_SOC - u64 pfn = get_pfn_limit(dev); - int zone; - - /* - * This code should be OK on other platforms, but we have drivers that - * don't set coherent_dma_mask. As a workaround we just ifdef it. This - * whole routine needs some serious cleanup. - */ - - zone = dma_pfn_limit_to_zone(pfn); - if (zone < 0) { - dev_err(dev, "%s: No suitable zone for pfn %#llx\n", - __func__, pfn); - return NULL; - } - - switch (zone) { -#ifdef CONFIG_ZONE_DMA - case ZONE_DMA: - flag |= GFP_DMA; - break; -#endif - }; -#endif /* CONFIG_FSL_SOC */ - - page = alloc_pages_node(node, flag, get_order(size)); - if (page == NULL) - return NULL; - ret = page_address(page); - memset(ret, 0, size); - *dma_handle = __pa(ret) + get_dma_offset(dev); - - return ret; -} - -void __dma_nommu_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle, - unsigned long attrs) -{ - free_pages((unsigned long)vaddr, get_order(size)); -} -#endif /* !CONFIG_NOT_COHERENT_CACHE */ - -static void *dma_nommu_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag, - unsigned long attrs) -{ - struct iommu_table *iommu; - - /* The coherent mask may be smaller than the real mask, check if - * we can really use the direct ops - */ - if (dma_nommu_dma_supported(dev, dev->coherent_dma_mask)) - return __dma_nommu_alloc_coherent(dev, size, dma_handle, - flag, attrs); - - /* Ok we can't ... do we have an iommu ? If not, fail */ - iommu = get_iommu_table_base(dev); - if (!iommu) - return NULL; - - /* Try to use the iommu */ - return iommu_alloc_coherent(dev, iommu, size, dma_handle, - dev->coherent_dma_mask, flag, - dev_to_node(dev)); -} - -static void dma_nommu_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle, - unsigned long attrs) -{ - struct iommu_table *iommu; - - /* See comments in dma_nommu_alloc_coherent() */ - if (dma_nommu_dma_supported(dev, dev->coherent_dma_mask)) - return __dma_nommu_free_coherent(dev, size, vaddr, dma_handle, - attrs); - /* Maybe we used an iommu ... */ - iommu = get_iommu_table_base(dev); - - /* If we hit that we should have never allocated in the first - * place so how come we are freeing ? - */ - if (WARN_ON(!iommu)) - return; - iommu_free_coherent(iommu, size, vaddr, dma_handle); -} - -int dma_nommu_mmap_coherent(struct device *dev, struct vm_area_struct *vma, - void *cpu_addr, dma_addr_t handle, size_t size, - unsigned long attrs) -{ - unsigned long pfn; - -#ifdef CONFIG_NOT_COHERENT_CACHE - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - pfn = __dma_get_coherent_pfn((unsigned long)cpu_addr); -#else - pfn = page_to_pfn(virt_to_page(cpu_addr)); -#endif - return remap_pfn_range(vma, vma->vm_start, - pfn + vma->vm_pgoff, - vma->vm_end - vma->vm_start, - vma->vm_page_prot); -} - -static int dma_nommu_map_sg(struct device *dev, struct scatterlist *sgl, - int nents, enum dma_data_direction direction, - unsigned long attrs) -{ - struct scatterlist *sg; - int i; - - for_each_sg(sgl, sg, nents, i) { - sg->dma_address = sg_phys(sg) + get_dma_offset(dev); - sg->dma_length = sg->length; - - if (attrs & DMA_ATTR_SKIP_CPU_SYNC) - continue; - - __dma_sync_page(sg_page(sg), sg->offset, sg->length, direction); - } - - return nents; -} - -static void dma_nommu_unmap_sg(struct device *dev, struct scatterlist *sgl, - int nents, enum dma_data_direction direction, - unsigned long attrs) -{ - struct scatterlist *sg; - int i; - - for_each_sg(sgl, sg, nents, i) - __dma_sync_page(sg_page(sg), sg->offset, sg->length, direction); -} - -static u64 dma_nommu_get_required_mask(struct device *dev) -{ - u64 end, mask; - - end = memblock_end_of_DRAM() + get_dma_offset(dev); - - mask = 1ULL << (fls64(end) - 1); - mask += mask - 1; - - return mask; -} - -static inline dma_addr_t dma_nommu_map_page(struct device *dev, - struct page *page, - unsigned long offset, - size_t size, - enum dma_data_direction dir, - unsigned long attrs) -{ - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - __dma_sync_page(page, offset, size, dir); - - return page_to_phys(page) + offset + get_dma_offset(dev); -} - -static inline void dma_nommu_unmap_page(struct device *dev, - dma_addr_t dma_address, - size_t size, - enum dma_data_direction direction, - unsigned long attrs) -{ - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - __dma_sync(bus_to_virt(dma_address), size, direction); -} - -#ifdef CONFIG_NOT_COHERENT_CACHE -static inline void dma_nommu_sync_sg(struct device *dev, - struct scatterlist *sgl, int nents, - enum dma_data_direction direction) -{ - struct scatterlist *sg; - int i; - - for_each_sg(sgl, sg, nents, i) - __dma_sync_page(sg_page(sg), sg->offset, sg->length, direction); -} - -static inline void dma_nommu_sync_single(struct device *dev, - dma_addr_t dma_handle, size_t size, - enum dma_data_direction direction) -{ - __dma_sync(bus_to_virt(dma_handle), size, direction); -} -#endif - -const struct dma_map_ops dma_nommu_ops = { - .alloc = dma_nommu_alloc_coherent, - .free = dma_nommu_free_coherent, - .mmap = dma_nommu_mmap_coherent, - .map_sg = dma_nommu_map_sg, - .unmap_sg = dma_nommu_unmap_sg, - .dma_supported = dma_nommu_dma_supported, - .map_page = dma_nommu_map_page, - .unmap_page = dma_nommu_unmap_page, - .get_required_mask = dma_nommu_get_required_mask, -#ifdef CONFIG_NOT_COHERENT_CACHE - .sync_single_for_cpu = dma_nommu_sync_single, - .sync_single_for_device = dma_nommu_sync_single, - .sync_sg_for_cpu = dma_nommu_sync_sg, - .sync_sg_for_device = dma_nommu_sync_sg, -#endif -}; -EXPORT_SYMBOL(dma_nommu_ops); - -int dma_set_coherent_mask(struct device *dev, u64 mask) -{ - if (!dma_supported(dev, mask)) { - /* - * We need to special case the direct DMA ops which can - * support a fallback for coherent allocations. There - * is no dma_op->set_coherent_mask() so we have to do - * things the hard way: - */ - if (get_dma_ops(dev) != &dma_nommu_ops || - get_iommu_table_base(dev) == NULL || - !dma_iommu_dma_supported(dev, mask)) - return -EIO; - } - dev->coherent_dma_mask = mask; - return 0; -} -EXPORT_SYMBOL(dma_set_coherent_mask); - -int dma_set_mask(struct device *dev, u64 dma_mask) -{ - if (ppc_md.dma_set_mask) - return ppc_md.dma_set_mask(dev, dma_mask); - - if (dev_is_pci(dev)) { - struct pci_dev *pdev = to_pci_dev(dev); - struct pci_controller *phb = pci_bus_to_host(pdev->bus); - if (phb->controller_ops.dma_set_mask) - return phb->controller_ops.dma_set_mask(pdev, dma_mask); - } - - if (!dev->dma_mask || !dma_supported(dev, dma_mask)) - return -EIO; - *dev->dma_mask = dma_mask; - return 0; -} -EXPORT_SYMBOL(dma_set_mask); - -u64 __dma_get_required_mask(struct device *dev) -{ - const struct dma_map_ops *dma_ops = get_dma_ops(dev); - - if (unlikely(dma_ops == NULL)) - return 0; - - if (dma_ops->get_required_mask) - return dma_ops->get_required_mask(dev); - - return DMA_BIT_MASK(8 * sizeof(dma_addr_t)); -} - -u64 dma_get_required_mask(struct device *dev) -{ - if (ppc_md.dma_get_required_mask) - return ppc_md.dma_get_required_mask(dev); - - if (dev_is_pci(dev)) { - struct pci_dev *pdev = to_pci_dev(dev); - struct pci_controller *phb = pci_bus_to_host(pdev->bus); - if (phb->controller_ops.dma_get_required_mask) - return phb->controller_ops.dma_get_required_mask(pdev); - } - - return __dma_get_required_mask(dev); -} -EXPORT_SYMBOL_GPL(dma_get_required_mask); - -static int __init dma_init(void) -{ -#ifdef CONFIG_IBMVIO - dma_debug_add_bus(&vio_bus_type); -#endif - - return 0; -} -fs_initcall(dma_init); - diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index 8be3721d9302..e49bd5efcfe6 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -666,8 +666,10 @@ static bool __init cpufeatures_process_feature(struct dt_cpu_feature *f) m = &dt_cpu_feature_match_table[i]; if (!strcmp(f->name, m->name)) { known = true; - if (m->enable(f)) + if (m->enable(f)) { + cur_cpu_spec->cpu_features |= m->cpu_ftr_bit_mask; break; + } pr_info("not enabling: %s (disabled or unsupported by kernel)\n", f->name); @@ -675,17 +677,12 @@ static bool __init cpufeatures_process_feature(struct dt_cpu_feature *f) } } - if (!known && enable_unknown) { - if (!feat_try_enable_unknown(f)) { - pr_info("not enabling: %s (unknown and unsupported by kernel)\n", - f->name); - return false; - } + if (!known && (!enable_unknown || !feat_try_enable_unknown(f))) { + pr_info("not enabling: %s (unknown and unsupported by kernel)\n", + f->name); + return false; } - if (m->cpu_ftr_bit_mask) - cur_cpu_spec->cpu_features |= m->cpu_ftr_bit_mask; - if (known) pr_debug("enabling: %s\n", f->name); else diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index ae05203eb4de..289c0b37d845 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -109,7 +109,14 @@ EXPORT_SYMBOL(eeh_subsystem_flags); * frozen count in last hour exceeds this limit, the PE will * be forced to be offline permanently. */ -int eeh_max_freezes = 5; +u32 eeh_max_freezes = 5; + +/* + * Controls whether a recovery event should be scheduled when an + * isolated device is discovered. This is only really useful for + * debugging problems with the EEH core. + */ +bool eeh_debugfs_no_recover; /* Platform dependent EEH operations */ struct eeh_ops *eeh_ops = NULL; @@ -823,15 +830,15 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat switch (state) { case pcie_deassert_reset: eeh_ops->reset(pe, EEH_RESET_DEACTIVATE); - eeh_unfreeze_pe(pe, false); + eeh_unfreeze_pe(pe); if (!(pe->type & EEH_PE_VF)) - eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED); + eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED, true); eeh_pe_dev_traverse(pe, eeh_restore_dev_state, dev); - eeh_pe_state_clear(pe, EEH_PE_ISOLATED); + eeh_pe_state_clear(pe, EEH_PE_ISOLATED, true); break; case pcie_hot_reset: eeh_pe_mark_isolated(pe); - eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED); + eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED, true); eeh_ops->set_option(pe, EEH_OPT_FREEZE_PE); eeh_pe_dev_traverse(pe, eeh_disable_and_save_dev_state, dev); if (!(pe->type & EEH_PE_VF)) @@ -840,7 +847,7 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat break; case pcie_warm_reset: eeh_pe_mark_isolated(pe); - eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED); + eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED, true); eeh_ops->set_option(pe, EEH_OPT_FREEZE_PE); eeh_pe_dev_traverse(pe, eeh_disable_and_save_dev_state, dev); if (!(pe->type & EEH_PE_VF)) @@ -848,7 +855,7 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat eeh_ops->reset(pe, EEH_RESET_FUNDAMENTAL); break; default: - eeh_pe_state_clear(pe, EEH_PE_ISOLATED | EEH_PE_CFG_BLOCKED); + eeh_pe_state_clear(pe, EEH_PE_ISOLATED | EEH_PE_CFG_BLOCKED, true); return -EINVAL; }; @@ -877,6 +884,24 @@ static void *eeh_set_dev_freset(struct eeh_dev *edev, void *flag) return NULL; } +static void eeh_pe_refreeze_passed(struct eeh_pe *root) +{ + struct eeh_pe *pe; + int state; + + eeh_for_each_pe(root, pe) { + if (eeh_pe_passed(pe)) { + state = eeh_ops->get_state(pe, NULL); + if (state & + (EEH_STATE_MMIO_ACTIVE | EEH_STATE_MMIO_ENABLED)) { + pr_info("EEH: Passed-through PE PHB#%x-PE#%x was thawed by reset, re-freezing for safety.\n", + pe->phb->global_number, pe->addr); + eeh_pe_set_option(pe, EEH_OPT_FREEZE_PE); + } + } + } +} + /** * eeh_pe_reset_full - Complete a full reset process on the indicated PE * @pe: EEH PE @@ -889,12 +914,12 @@ static void *eeh_set_dev_freset(struct eeh_dev *edev, void *flag) * * This function will attempt to reset a PE three times before failing. */ -int eeh_pe_reset_full(struct eeh_pe *pe) +int eeh_pe_reset_full(struct eeh_pe *pe, bool include_passed) { int reset_state = (EEH_PE_RESET | EEH_PE_CFG_BLOCKED); int type = EEH_RESET_HOT; unsigned int freset = 0; - int i, state, ret; + int i, state = 0, ret; /* * Determine the type of reset to perform - hot or fundamental. @@ -911,32 +936,42 @@ int eeh_pe_reset_full(struct eeh_pe *pe) /* Make three attempts at resetting the bus */ for (i = 0; i < 3; i++) { - ret = eeh_pe_reset(pe, type); - if (ret) - break; - - ret = eeh_pe_reset(pe, EEH_RESET_DEACTIVATE); - if (ret) - break; + ret = eeh_pe_reset(pe, type, include_passed); + if (!ret) + ret = eeh_pe_reset(pe, EEH_RESET_DEACTIVATE, + include_passed); + if (ret) { + ret = -EIO; + pr_warn("EEH: Failure %d resetting PHB#%x-PE#%x (attempt %d)\n\n", + state, pe->phb->global_number, pe->addr, i + 1); + continue; + } + if (i) + pr_warn("EEH: PHB#%x-PE#%x: Successful reset (attempt %d)\n", + pe->phb->global_number, pe->addr, i + 1); /* Wait until the PE is in a functioning state */ state = eeh_wait_state(pe, PCI_BUS_RESET_WAIT_MSEC); if (state < 0) { - pr_warn("%s: Unrecoverable slot failure on PHB#%x-PE#%x", - __func__, pe->phb->global_number, pe->addr); + pr_warn("EEH: Unrecoverable slot failure on PHB#%x-PE#%x", + pe->phb->global_number, pe->addr); ret = -ENOTRECOVERABLE; break; } if (eeh_state_active(state)) break; - - /* Set error in case this is our last attempt */ - ret = -EIO; - pr_warn("%s: Failure %d resetting PHB#%x-PE#%x\n (%d)\n", - __func__, state, pe->phb->global_number, pe->addr, (i + 1)); + else + pr_warn("EEH: PHB#%x-PE#%x: Slot inactive after reset: 0x%x (attempt %d)\n", + pe->phb->global_number, pe->addr, state, i + 1); } - eeh_pe_state_clear(pe, reset_state); + /* Resetting the PE may have unfrozen child PEs. If those PEs have been + * (potentially) passed through to a guest, re-freeze them: + */ + if (!include_passed) + eeh_pe_refreeze_passed(pe); + + eeh_pe_state_clear(pe, reset_state, true); return ret; } @@ -1309,7 +1344,7 @@ void eeh_remove_device(struct pci_dev *dev) edev->mode &= ~EEH_DEV_SYSFS; } -int eeh_unfreeze_pe(struct eeh_pe *pe, bool sw_state) +int eeh_unfreeze_pe(struct eeh_pe *pe) { int ret; @@ -1327,10 +1362,6 @@ int eeh_unfreeze_pe(struct eeh_pe *pe, bool sw_state) return ret; } - /* Clear software isolated state */ - if (sw_state && (pe->state & EEH_PE_ISOLATED)) - eeh_pe_state_clear(pe, EEH_PE_ISOLATED); - return ret; } @@ -1382,7 +1413,10 @@ static int eeh_pe_change_owner(struct eeh_pe *pe) } } - return eeh_unfreeze_pe(pe, true); + ret = eeh_unfreeze_pe(pe); + if (!ret) + eeh_pe_state_clear(pe, EEH_PE_ISOLATED, true); + return ret; } /** @@ -1612,13 +1646,12 @@ int eeh_pe_get_state(struct eeh_pe *pe) } EXPORT_SYMBOL_GPL(eeh_pe_get_state); -static int eeh_pe_reenable_devices(struct eeh_pe *pe) +static int eeh_pe_reenable_devices(struct eeh_pe *pe, bool include_passed) { struct eeh_dev *edev, *tmp; struct pci_dev *pdev; int ret = 0; - /* Restore config space */ eeh_pe_restore_bars(pe); /* @@ -1639,7 +1672,14 @@ static int eeh_pe_reenable_devices(struct eeh_pe *pe) } /* The PE is still in frozen state */ - return eeh_unfreeze_pe(pe, true); + if (include_passed || !eeh_pe_passed(pe)) { + ret = eeh_unfreeze_pe(pe); + } else + pr_info("EEH: Note: Leaving passthrough PHB#%x-PE#%x frozen.\n", + pe->phb->global_number, pe->addr); + if (!ret) + eeh_pe_state_clear(pe, EEH_PE_ISOLATED, include_passed); + return ret; } @@ -1652,7 +1692,7 @@ static int eeh_pe_reenable_devices(struct eeh_pe *pe) * indicated type, either fundamental reset or hot reset. * PE reset is the most important part for error recovery. */ -int eeh_pe_reset(struct eeh_pe *pe, int option) +int eeh_pe_reset(struct eeh_pe *pe, int option, bool include_passed) { int ret = 0; @@ -1666,11 +1706,11 @@ int eeh_pe_reset(struct eeh_pe *pe, int option) switch (option) { case EEH_RESET_DEACTIVATE: ret = eeh_ops->reset(pe, option); - eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED); + eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED, include_passed); if (ret) break; - ret = eeh_pe_reenable_devices(pe); + ret = eeh_pe_reenable_devices(pe, include_passed); break; case EEH_RESET_HOT: case EEH_RESET_FUNDAMENTAL: @@ -1796,22 +1836,64 @@ static int eeh_enable_dbgfs_get(void *data, u64 *val) return 0; } -static int eeh_freeze_dbgfs_set(void *data, u64 val) -{ - eeh_max_freezes = val; - return 0; -} +DEFINE_DEBUGFS_ATTRIBUTE(eeh_enable_dbgfs_ops, eeh_enable_dbgfs_get, + eeh_enable_dbgfs_set, "0x%llx\n"); -static int eeh_freeze_dbgfs_get(void *data, u64 *val) +static ssize_t eeh_force_recover_write(struct file *filp, + const char __user *user_buf, + size_t count, loff_t *ppos) { - *val = eeh_max_freezes; - return 0; + struct pci_controller *hose; + uint32_t phbid, pe_no; + struct eeh_pe *pe; + char buf[20]; + int ret; + + ret = simple_write_to_buffer(buf, sizeof(buf), ppos, user_buf, count); + if (!ret) + return -EFAULT; + + /* + * When PE is NULL the event is a "special" event. Rather than + * recovering a specific PE it forces the EEH core to scan for failed + * PHBs and recovers each. This needs to be done before any device + * recoveries can occur. + */ + if (!strncmp(buf, "hwcheck", 7)) { + __eeh_send_failure_event(NULL); + return count; + } + + ret = sscanf(buf, "%x:%x", &phbid, &pe_no); + if (ret != 2) + return -EINVAL; + + hose = pci_find_controller_for_domain(phbid); + if (!hose) + return -ENODEV; + + /* Retrieve PE */ + pe = eeh_pe_get(hose, pe_no, 0); + if (!pe) + return -ENODEV; + + /* + * We don't do any state checking here since the detection + * process is async to the recovery process. The recovery + * thread *should* not break even if we schedule a recovery + * from an odd state (e.g. PE removed, or recovery of a + * non-isolated PE) + */ + __eeh_send_failure_event(pe); + + return ret < 0 ? ret : count; } -DEFINE_DEBUGFS_ATTRIBUTE(eeh_enable_dbgfs_ops, eeh_enable_dbgfs_get, - eeh_enable_dbgfs_set, "0x%llx\n"); -DEFINE_DEBUGFS_ATTRIBUTE(eeh_freeze_dbgfs_ops, eeh_freeze_dbgfs_get, - eeh_freeze_dbgfs_set, "0x%llx\n"); +static const struct file_operations eeh_force_recover_fops = { + .open = simple_open, + .llseek = no_llseek, + .write = eeh_force_recover_write, +}; #endif static int __init eeh_init_proc(void) @@ -1822,9 +1904,15 @@ static int __init eeh_init_proc(void) debugfs_create_file_unsafe("eeh_enable", 0600, powerpc_debugfs_root, NULL, &eeh_enable_dbgfs_ops); - debugfs_create_file_unsafe("eeh_max_freezes", 0600, - powerpc_debugfs_root, NULL, - &eeh_freeze_dbgfs_ops); + debugfs_create_u32("eeh_max_freezes", 0600, + powerpc_debugfs_root, &eeh_max_freezes); + debugfs_create_bool("eeh_disable_recovery", 0600, + powerpc_debugfs_root, + &eeh_debugfs_no_recover); + debugfs_create_file_unsafe("eeh_force_recover", 0600, + powerpc_debugfs_root, NULL, + &eeh_force_recover_fops); + eeh_cache_debugfs_init(); #endif } diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c index 201943d54a6e..9c68f0837385 100644 --- a/arch/powerpc/kernel/eeh_cache.c +++ b/arch/powerpc/kernel/eeh_cache.c @@ -26,6 +26,7 @@ #include <linux/spinlock.h> #include <linux/atomic.h> #include <asm/pci-bridge.h> +#include <asm/debugfs.h> #include <asm/ppc-pci.h> @@ -113,7 +114,7 @@ static void eeh_addr_cache_print(struct pci_io_addr_cache *cache) while (n) { struct pci_io_addr_range *piar; piar = rb_entry(n, struct pci_io_addr_range, rb_node); - pr_debug("PCI: %s addr range %d [%pap-%pap]: %s\n", + pr_info("PCI: %s addr range %d [%pap-%pap]: %s\n", (piar->flags & IORESOURCE_IO) ? "i/o" : "mem", cnt, &piar->addr_lo, &piar->addr_hi, pci_name(piar->pcidev)); cnt++; @@ -157,10 +158,8 @@ eeh_addr_cache_insert(struct pci_dev *dev, resource_size_t alo, piar->pcidev = dev; piar->flags = flags; -#ifdef DEBUG pr_debug("PIAR: insert range=[%pap:%pap] dev=%s\n", &alo, &ahi, pci_name(dev)); -#endif rb_link_node(&piar->rb_node, parent, p); rb_insert_color(&piar->rb_node, &pci_io_addr_cache_root.rb_root); @@ -240,6 +239,8 @@ restart: piar = rb_entry(n, struct pci_io_addr_range, rb_node); if (piar->pcidev == dev) { + pr_debug("PIAR: remove range=[%pap:%pap] dev=%s\n", + &piar->addr_lo, &piar->addr_hi, pci_name(dev)); rb_erase(n, &pci_io_addr_cache_root.rb_root); kfree(piar); goto restart; @@ -298,9 +299,30 @@ void eeh_addr_cache_build(void) eeh_addr_cache_insert_dev(dev); eeh_sysfs_add_device(dev); } +} -#ifdef DEBUG - /* Verify tree built up above, echo back the list of addrs. */ - eeh_addr_cache_print(&pci_io_addr_cache_root); -#endif +static int eeh_addr_cache_show(struct seq_file *s, void *v) +{ + struct pci_io_addr_range *piar; + struct rb_node *n; + + spin_lock(&pci_io_addr_cache_root.piar_lock); + for (n = rb_first(&pci_io_addr_cache_root.rb_root); n; n = rb_next(n)) { + piar = rb_entry(n, struct pci_io_addr_range, rb_node); + + seq_printf(s, "%s addr range [%pap-%pap]: %s\n", + (piar->flags & IORESOURCE_IO) ? "i/o" : "mem", + &piar->addr_lo, &piar->addr_hi, pci_name(piar->pcidev)); + } + spin_unlock(&pci_io_addr_cache_root.piar_lock); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(eeh_addr_cache); + +void eeh_cache_debugfs_init(void) +{ + debugfs_create_file_unsafe("eeh_address_cache", 0400, + powerpc_debugfs_root, NULL, + &eeh_addr_cache_fops); } diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 99eab7bc7edc..89623962c727 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -510,22 +510,11 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata) * support EEH. So we just care about PCI devices for * simplicity here. */ - if (!dev || (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)) - return NULL; - - /* - * We rely on count-based pcibios_release_device() to - * detach permanently offlined PEs. Unfortunately, that's - * not reliable enough. We might have the permanently - * offlined PEs attached, but we needn't take care of - * them and their child devices. - */ - if (eeh_dev_removed(edev)) + if (!eeh_edev_actionable(edev) || + (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)) return NULL; if (rmv_data) { - if (eeh_pe_passed(edev->pe)) - return NULL; driver = eeh_pcid_get(dev); if (driver) { if (driver->err_handler && @@ -539,8 +528,8 @@ static void *eeh_rmv_device(struct eeh_dev *edev, void *userdata) } /* Remove it from PCI subsystem */ - pr_debug("EEH: Removing %s without EEH sensitive driver\n", - pci_name(dev)); + pr_info("EEH: Removing %s without EEH sensitive driver\n", + pci_name(dev)); edev->mode |= EEH_DEV_DISCONNECTED; if (rmv_data) rmv_data->removed_dev_count++; @@ -591,34 +580,22 @@ static void *eeh_pe_detach_dev(struct eeh_pe *pe, void *userdata) * PE reset (for 3 times), we try to clear the frozen state * for 3 times as well. */ -static void *__eeh_clear_pe_frozen_state(struct eeh_pe *pe, void *flag) +static int eeh_clear_pe_frozen_state(struct eeh_pe *root, bool include_passed) { - bool clear_sw_state = *(bool *)flag; - int i, rc = 1; - - for (i = 0; rc && i < 3; i++) - rc = eeh_unfreeze_pe(pe, clear_sw_state); + struct eeh_pe *pe; + int i; - /* Stop immediately on any errors */ - if (rc) { - pr_warn("%s: Failure %d unfreezing PHB#%x-PE#%x\n", - __func__, rc, pe->phb->global_number, pe->addr); - return (void *)pe; + eeh_for_each_pe(root, pe) { + if (include_passed || !eeh_pe_passed(pe)) { + for (i = 0; i < 3; i++) + if (!eeh_unfreeze_pe(pe)) + break; + if (i >= 3) + return -EIO; + } } - - return NULL; -} - -static int eeh_clear_pe_frozen_state(struct eeh_pe *pe, - bool clear_sw_state) -{ - void *rc; - - rc = eeh_pe_traverse(pe, __eeh_clear_pe_frozen_state, &clear_sw_state); - if (!rc) - eeh_pe_state_clear(pe, EEH_PE_ISOLATED); - - return rc ? -EIO : 0; + eeh_pe_state_clear(root, EEH_PE_ISOLATED, include_passed); + return 0; } int eeh_pe_reset_and_recover(struct eeh_pe *pe) @@ -636,16 +613,16 @@ int eeh_pe_reset_and_recover(struct eeh_pe *pe) eeh_pe_dev_traverse(pe, eeh_dev_save_state, NULL); /* Issue reset */ - ret = eeh_pe_reset_full(pe); + ret = eeh_pe_reset_full(pe, true); if (ret) { - eeh_pe_state_clear(pe, EEH_PE_RECOVERING); + eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); return ret; } /* Unfreeze the PE */ ret = eeh_clear_pe_frozen_state(pe, true); if (ret) { - eeh_pe_state_clear(pe, EEH_PE_RECOVERING); + eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); return ret; } @@ -653,7 +630,7 @@ int eeh_pe_reset_and_recover(struct eeh_pe *pe) eeh_pe_dev_traverse(pe, eeh_dev_restore_state, NULL); /* Clear recovery mode */ - eeh_pe_state_clear(pe, EEH_PE_RECOVERING); + eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); return 0; } @@ -676,6 +653,11 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, time64_t tstamp; int cnt, rc; struct eeh_dev *edev; + struct eeh_pe *tmp_pe; + bool any_passed = false; + + eeh_for_each_pe(pe, tmp_pe) + any_passed |= eeh_pe_passed(tmp_pe); /* pcibios will clear the counter; save the value */ cnt = pe->freeze_count; @@ -688,7 +670,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, * into pci_hp_add_devices(). */ eeh_pe_state_mark(pe, EEH_PE_KEEP); - if (driver_eeh_aware || (pe->type & EEH_PE_VF)) { + if (any_passed || driver_eeh_aware || (pe->type & EEH_PE_VF)) { eeh_pe_dev_traverse(pe, eeh_rmv_device, rmv_data); } else { pci_lock_rescan_remove(); @@ -705,7 +687,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, * config accesses. So we prefer to block them. However, controlled * PCI config accesses initiated from EEH itself are allowed. */ - rc = eeh_pe_reset_full(pe); + rc = eeh_pe_reset_full(pe, false); if (rc) return rc; @@ -744,11 +726,11 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, eeh_add_virt_device(edev); } else { if (!driver_eeh_aware) - eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); + eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true); pci_hp_add_devices(bus); } } - eeh_pe_state_clear(pe, EEH_PE_KEEP); + eeh_pe_state_clear(pe, EEH_PE_KEEP, true); pe->tstamp = tstamp; pe->freeze_count = cnt; @@ -900,7 +882,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe) * is still in frozen state. Clear it before * resuming the PE. */ - eeh_pe_state_clear(pe, EEH_PE_ISOLATED); + eeh_pe_state_clear(pe, EEH_PE_ISOLATED, true); result = PCI_ERS_RESULT_RECOVERED; } } @@ -977,7 +959,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe) eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL); eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); } else { - eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); + eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true); eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); pci_lock_rescan_remove(); @@ -987,7 +969,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe) return; } } - eeh_pe_state_clear(pe, EEH_PE_RECOVERING); + eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); } /** @@ -1069,7 +1051,7 @@ void eeh_handle_special_event(void) continue; /* Notify all devices to be down */ - eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); + eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true); eeh_set_channel_state(pe, pci_channel_io_perm_failure); eeh_pe_report( "error_detected(permanent failure)", pe, diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c index 227e57f980df..539aca055d70 100644 --- a/arch/powerpc/kernel/eeh_event.c +++ b/arch/powerpc/kernel/eeh_event.c @@ -121,7 +121,7 @@ int eeh_event_init(void) * the actual event will be delivered in a normal context * (from a workqueue). */ -int eeh_send_failure_event(struct eeh_pe *pe) +int __eeh_send_failure_event(struct eeh_pe *pe) { unsigned long flags; struct eeh_event *event; @@ -144,6 +144,20 @@ int eeh_send_failure_event(struct eeh_pe *pe) return 0; } +int eeh_send_failure_event(struct eeh_pe *pe) +{ + /* + * If we've manually supressed recovery events via debugfs + * then just drop it on the floor. + */ + if (eeh_debugfs_no_recover) { + pr_err("EEH: Event dropped due to no_recover setting\n"); + return 0; + } + + return __eeh_send_failure_event(pe); +} + /** * eeh_remove_event - Remove EEH event from the queue * @pe: Event binding to the PE diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index 6fa2032e0594..8b578891f27c 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -657,62 +657,52 @@ void eeh_pe_dev_mode_mark(struct eeh_pe *pe, int mode) } /** - * __eeh_pe_state_clear - Clear state for the PE + * eeh_pe_state_clear - Clear state for the PE * @data: EEH PE - * @flag: state + * @state: state + * @include_passed: include passed-through devices? * * The function is used to clear the indicated state from the * given PE. Besides, we also clear the check count of the PE * as well. */ -static void *__eeh_pe_state_clear(struct eeh_pe *pe, void *flag) +void eeh_pe_state_clear(struct eeh_pe *root, int state, bool include_passed) { - int state = *((int *)flag); + struct eeh_pe *pe; struct eeh_dev *edev, *tmp; struct pci_dev *pdev; - /* Keep the state of permanently removed PE intact */ - if (pe->state & EEH_PE_REMOVED) - return NULL; + eeh_for_each_pe(root, pe) { + /* Keep the state of permanently removed PE intact */ + if (pe->state & EEH_PE_REMOVED) + continue; - pe->state &= ~state; + if (!include_passed && eeh_pe_passed(pe)) + continue; - /* - * Special treatment on clearing isolated state. Clear - * check count since last isolation and put all affected - * devices to normal state. - */ - if (!(state & EEH_PE_ISOLATED)) - return NULL; + pe->state &= ~state; - pe->check_count = 0; - eeh_pe_for_each_dev(pe, edev, tmp) { - pdev = eeh_dev_to_pci_dev(edev); - if (!pdev) + /* + * Special treatment on clearing isolated state. Clear + * check count since last isolation and put all affected + * devices to normal state. + */ + if (!(state & EEH_PE_ISOLATED)) continue; - pdev->error_state = pci_channel_io_normal; - } - - /* Unblock PCI config access if required */ - if (pe->state & EEH_PE_CFG_RESTRICTED) - pe->state &= ~EEH_PE_CFG_BLOCKED; + pe->check_count = 0; + eeh_pe_for_each_dev(pe, edev, tmp) { + pdev = eeh_dev_to_pci_dev(edev); + if (!pdev) + continue; - return NULL; -} + pdev->error_state = pci_channel_io_normal; + } -/** - * eeh_pe_state_clear - Clear state for the PE and its children - * @pe: PE - * @state: state to be cleared - * - * When the PE and its children has been recovered from error, - * we need clear the error state for that. The function is used - * for the purpose. - */ -void eeh_pe_state_clear(struct eeh_pe *pe, int state) -{ - eeh_pe_traverse(pe, __eeh_pe_state_clear, &state); + /* Unblock PCI config access if required */ + if (pe->state & EEH_PE_CFG_RESTRICTED) + pe->state &= ~EEH_PE_CFG_BLOCKED; + } } /* diff --git a/arch/powerpc/kernel/eeh_sysfs.c b/arch/powerpc/kernel/eeh_sysfs.c index deed906dd8f1..3fa04dda1737 100644 --- a/arch/powerpc/kernel/eeh_sysfs.c +++ b/arch/powerpc/kernel/eeh_sysfs.c @@ -82,8 +82,9 @@ static ssize_t eeh_pe_state_store(struct device *dev, if (!(edev->pe->state & EEH_PE_ISOLATED)) return count; - if (eeh_unfreeze_pe(edev->pe, true)) + if (eeh_unfreeze_pe(edev->pe)) return -EIO; + eeh_pe_state_clear(edev->pe, EEH_PE_ISOLATED, true); return count; } diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 0768dfd8a64e..b61cfd29c76f 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -97,14 +97,11 @@ crit_transfer_to_handler: mfspr r0,SPRN_SRR1 stw r0,_SRR1(r11) - /* set the stack limit to the current stack - * and set the limit to protect the thread_info - * struct - */ + /* set the stack limit to the current stack */ mfspr r8,SPRN_SPRG_THREAD lwz r0,KSP_LIMIT(r8) stw r0,SAVED_KSP_LIMIT(r11) - rlwimi r0,r1,0,0,(31-THREAD_SHIFT) + rlwinm r0,r1,0,0,(31 - THREAD_SHIFT) stw r0,KSP_LIMIT(r8) /* fall through */ #endif @@ -121,14 +118,11 @@ crit_transfer_to_handler: mfspr r0,SPRN_SRR1 stw r0,crit_srr1@l(0) - /* set the stack limit to the current stack - * and set the limit to protect the thread_info - * struct - */ + /* set the stack limit to the current stack */ mfspr r8,SPRN_SPRG_THREAD lwz r0,KSP_LIMIT(r8) stw r0,saved_ksp_limit@l(0) - rlwimi r0,r1,0,0,(31-THREAD_SHIFT) + rlwinm r0,r1,0,0,(31 - THREAD_SHIFT) stw r0,KSP_LIMIT(r8) /* fall through */ #endif @@ -157,7 +151,6 @@ transfer_to_handler: stw r2,_XER(r11) mfspr r12,SPRN_SPRG_THREAD addi r2,r12,-THREAD - tovirt(r2,r2) /* set r2 to current */ beq 2f /* if from user, fix up THREAD.regs */ addi r11,r1,STACK_FRAME_OVERHEAD stw r11,PT_REGS(r12) @@ -166,6 +159,9 @@ transfer_to_handler: internal debug mode bit to do this. */ lwz r12,THREAD_DBCR0(r12) andis. r12,r12,DBCR0_IDM@h +#endif + ACCOUNT_CPU_USER_ENTRY(r2, r11, r12) +#if defined(CONFIG_40x) || defined(CONFIG_BOOKE) beq+ 3f /* From user and task is ptraced - load up global dbcr0 */ li r12,-1 /* clear all pending debug events */ @@ -174,8 +170,7 @@ transfer_to_handler: tophys(r11,r11) addi r11,r11,global_dbcr0@l #ifdef CONFIG_SMP - CURRENT_THREAD_INFO(r9, r1) - lwz r9,TI_CPU(r9) + lwz r9,TASK_CPU(r2) slwi r9,r9,3 add r11,r11,r9 #endif @@ -185,11 +180,6 @@ transfer_to_handler: addi r12,r12,-1 stw r12,4(r11) #endif -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE - CURRENT_THREAD_INFO(r9, r1) - tophys(r9, r9) - ACCOUNT_CPU_USER_ENTRY(r9, r11, r12) -#endif b 3f @@ -201,9 +191,7 @@ transfer_to_handler: ble- stack_ovf /* then the kernel stack overflowed */ 5: #if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_E500) - CURRENT_THREAD_INFO(r9, r1) - tophys(r9,r9) /* check local flags */ - lwz r12,TI_LOCAL_FLAGS(r9) + lwz r12,TI_LOCAL_FLAGS(r2) mtcrf 0x01,r12 bt- 31-TLF_NAPPING,4f bt- 31-TLF_SLEEPING,7f @@ -212,6 +200,7 @@ transfer_to_handler: transfer_to_handler_cont: 3: mflr r9 + tovirt(r2, r2) /* set r2 to current */ lwz r11,0(r9) /* virtual address of handler */ lwz r9,4(r9) /* where to go when done */ #if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS) @@ -275,11 +264,11 @@ reenable_mmu: /* re-enable mmu so we can */ #if defined (CONFIG_PPC_BOOK3S_32) || defined(CONFIG_E500) 4: rlwinm r12,r12,0,~_TLF_NAPPING - stw r12,TI_LOCAL_FLAGS(r9) + stw r12,TI_LOCAL_FLAGS(r2) b power_save_ppc32_restore 7: rlwinm r12,r12,0,~_TLF_SLEEPING - stw r12,TI_LOCAL_FLAGS(r9) + stw r12,TI_LOCAL_FLAGS(r2) lwz r9,_MSR(r11) /* if sleeping, clear MSR.EE */ rlwinm r9,r9,0,~MSR_EE lwz r12,_LINK(r11) /* and return to address in LR */ @@ -351,8 +340,7 @@ _GLOBAL(DoSyscall) mtmsr r11 1: #endif /* CONFIG_TRACE_IRQFLAGS */ - CURRENT_THREAD_INFO(r10, r1) - lwz r11,TI_FLAGS(r10) + lwz r11,TI_FLAGS(r2) andi. r11,r11,_TIF_SYSCALL_DOTRACE bne- syscall_dotrace syscall_dotrace_cont: @@ -385,13 +373,12 @@ ret_from_syscall: lwz r3,GPR3(r1) #endif mr r6,r3 - CURRENT_THREAD_INFO(r12, r1) /* disable interrupts so current_thread_info()->flags can't change */ LOAD_MSR_KERNEL(r10,MSR_KERNEL) /* doesn't include MSR_EE */ /* Note: We don't bother telling lockdep about it */ SYNC MTMSRD(r10) - lwz r9,TI_FLAGS(r12) + lwz r9,TI_FLAGS(r2) li r8,-MAX_ERRNO andi. r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK) bne- syscall_exit_work @@ -438,8 +425,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX) #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE andi. r4,r8,MSR_PR beq 3f - CURRENT_THREAD_INFO(r4, r1) - ACCOUNT_CPU_USER_EXIT(r4, r5, r7) + ACCOUNT_CPU_USER_EXIT(r2, r5, r7) 3: #endif lwz r4,_LINK(r1) @@ -532,7 +518,7 @@ syscall_exit_work: /* Clear per-syscall TIF flags if any are set. */ li r11,_TIF_PERSYSCALL_MASK - addi r12,r12,TI_FLAGS + addi r12,r2,TI_FLAGS 3: lwarx r8,0,r12 andc r8,r8,r11 #ifdef CONFIG_IBM405_ERR77 @@ -540,7 +526,6 @@ syscall_exit_work: #endif stwcx. r8,0,r12 bne- 3b - subi r12,r12,TI_FLAGS 4: /* Anything which requires enabling interrupts? */ andi. r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP) @@ -745,6 +730,9 @@ fast_exception_return: mtcr r10 lwz r10,_LINK(r11) mtlr r10 + /* Clear the exception_marker on the stack to avoid confusing stacktrace */ + li r10, 0 + stw r10, 8(r11) REST_GPR(10, r11) #if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS) mtspr SPRN_NRI, r0 @@ -819,8 +807,7 @@ ret_from_except: user_exc_return: /* r10 contains MSR_KERNEL here */ /* Check current_thread_info()->flags */ - CURRENT_THREAD_INFO(r9, r1) - lwz r9,TI_FLAGS(r9) + lwz r9,TI_FLAGS(r2) andi. r0,r9,_TIF_USER_WORK_MASK bne do_work @@ -832,18 +819,14 @@ restore_user: andis. r10,r0,DBCR0_IDM@h bnel- load_dbcr0 #endif -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE - CURRENT_THREAD_INFO(r9, r1) - ACCOUNT_CPU_USER_EXIT(r9, r10, r11) -#endif + ACCOUNT_CPU_USER_EXIT(r2, r10, r11) b restore /* N.B. the only way to get here is from the beq following ret_from_except. */ resume_kernel: /* check current_thread_info, _TIF_EMULATE_STACK_STORE */ - CURRENT_THREAD_INFO(r9, r1) - lwz r8,TI_FLAGS(r9) + lwz r8,TI_FLAGS(r2) andis. r0,r8,_TIF_EMULATE_STACK_STORE@h beq+ 1f @@ -869,7 +852,7 @@ resume_kernel: /* Clear _TIF_EMULATE_STACK_STORE flag */ lis r11,_TIF_EMULATE_STACK_STORE@h - addi r5,r9,TI_FLAGS + addi r5,r2,TI_FLAGS 0: lwarx r8,0,r5 andc r8,r8,r11 #ifdef CONFIG_IBM405_ERR77 @@ -881,7 +864,7 @@ resume_kernel: #ifdef CONFIG_PREEMPT /* check current_thread_info->preempt_count */ - lwz r0,TI_PREEMPT(r9) + lwz r0,TI_PREEMPT(r2) cmpwi 0,r0,0 /* if non-zero, just restore regs and return */ bne restore andi. r8,r8,_TIF_NEED_RESCHED @@ -897,8 +880,7 @@ resume_kernel: bl trace_hardirqs_off #endif 1: bl preempt_schedule_irq - CURRENT_THREAD_INFO(r9, r1) - lwz r3,TI_FLAGS(r9) + lwz r3,TI_FLAGS(r2) andi. r0,r3,_TIF_NEED_RESCHED bne- 1b #ifdef CONFIG_TRACE_IRQFLAGS @@ -982,6 +964,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX) mtcrf 0xFF,r10 mtlr r11 + /* Clear the exception_marker on the stack to avoid confusing stacktrace */ + li r10, 0 + stw r10, 8(r1) /* * Once we put values in SRR0 and SRR1, we are in a state * where exceptions are not recoverable, since taking an @@ -997,9 +982,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX) .globl exc_exit_restart exc_exit_restart: lwz r12,_NIP(r1) -#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS) - mtspr SPRN_NRI, r0 -#endif mtspr SPRN_SRR0,r12 mtspr SPRN_SRR1,r9 REST_4GPRS(9, r1) @@ -1021,6 +1003,9 @@ exc_exit_restart_end: mtlr r11 lwz r10,_CCR(r1) mtcrf 0xff,r10 + /* Clear the exception_marker on the stack to avoid confusing stacktrace */ + li r10, 0 + stw r10, 8(r1) REST_2GPRS(9, r1) .globl exc_exit_restart exc_exit_restart: @@ -1166,10 +1151,6 @@ ret_from_debug_exc: mfspr r9,SPRN_SPRG_THREAD lwz r10,SAVED_KSP_LIMIT(r1) stw r10,KSP_LIMIT(r9) - lwz r9,THREAD_INFO-THREAD(r9) - CURRENT_THREAD_INFO(r10, r1) - lwz r10,TI_PREEMPT(r10) - stw r10,TI_PREEMPT(r9) RESTORE_xSRR(SRR0,SRR1); RESTORE_xSRR(CSRR0,CSRR1); RESTORE_MMU_REGS; @@ -1201,8 +1182,7 @@ load_dbcr0: lis r11,global_dbcr0@ha addi r11,r11,global_dbcr0@l #ifdef CONFIG_SMP - CURRENT_THREAD_INFO(r9, r1) - lwz r9,TI_CPU(r9) + lwz r9,TASK_CPU(r2) slwi r9,r9,3 add r11,r11,r9 #endif @@ -1242,8 +1222,7 @@ recheck: LOAD_MSR_KERNEL(r10,MSR_KERNEL) SYNC MTMSRD(r10) /* disable interrupts */ - CURRENT_THREAD_INFO(r9, r1) - lwz r9,TI_FLAGS(r9) + lwz r9,TI_FLAGS(r2) andi. r0,r9,_TIF_NEED_RESCHED bne- do_resched andi. r0,r9,_TIF_USER_WORK_MASK @@ -1292,10 +1271,13 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_601) lwz r3,_TRAP(r1) andi. r0,r3,1 - beq 4f + beq 5f SAVE_NVGPRS(r1) rlwinm r3,r3,0,0,30 stw r3,_TRAP(r1) +5: mfspr r2,SPRN_SPRG_THREAD + addi r2,r2,-THREAD + tovirt(r2,r2) /* set back r2 to current */ 4: addi r3,r1,STACK_FRAME_OVERHEAD bl unrecoverable_exception /* shouldn't return */ @@ -1335,7 +1317,7 @@ _GLOBAL(enter_rtas) MTMSRD(r0) /* don't get trashed */ li r9,MSR_KERNEL & ~(MSR_IR|MSR_DR) mtlr r6 - mtspr SPRN_SPRG_RTAS,r7 + stw r7, THREAD + RTAS_SP(r2) mtspr SPRN_SRR0,r8 mtspr SPRN_SRR1,r9 RFI @@ -1344,7 +1326,8 @@ _GLOBAL(enter_rtas) lwz r9,8(r9) /* original msr value */ addi r1,r1,INT_FRAME_SIZE li r0,0 - mtspr SPRN_SPRG_RTAS,r0 + tophys(r7, r2) + stw r0, THREAD + RTAS_SP(r7) mtspr SPRN_SRR0,r8 mtspr SPRN_SRR1,r9 RFI /* return to caller */ diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 435927f549c4..15c67d2c0534 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -166,7 +166,7 @@ system_call: /* label this so stack traces look sane */ li r10,IRQS_ENABLED std r10,SOFTE(r1) - CURRENT_THREAD_INFO(r11, r1) + ld r11, PACA_THREAD_INFO(r13) ld r10,TI_FLAGS(r11) andi. r11,r10,_TIF_SYSCALL_DOTRACE bne .Lsyscall_dotrace /* does not return */ @@ -213,7 +213,7 @@ system_call: /* label this so stack traces look sane */ ld r3,RESULT(r1) #endif - CURRENT_THREAD_INFO(r12, r1) + ld r12, PACA_THREAD_INFO(r13) ld r8,_MSR(r1) #ifdef CONFIG_PPC_BOOK3S @@ -236,18 +236,14 @@ system_call_exit: /* * Disable interrupts so current_thread_info()->flags can't change, * and so that we don't get interrupted after loading SRR0/1. + * + * Leave MSR_RI enabled for now, because with THREAD_INFO_IN_TASK we + * could fault on the load of the TI_FLAGS below. */ #ifdef CONFIG_PPC_BOOK3E wrteei 0 #else - /* - * For performance reasons we clear RI the same time that we - * clear EE. We only need to clear RI just before we restore r13 - * below, but batching it with EE saves us one expensive mtmsrd call. - * We have to be careful to restore RI if we branch anywhere from - * here (eg syscall_exit_work). - */ - li r11,0 + li r11,MSR_RI mtmsrd r11,1 #endif /* CONFIG_PPC_BOOK3E */ @@ -263,15 +259,7 @@ system_call_exit: bne 3f #endif 2: addi r3,r1,STACK_FRAME_OVERHEAD -#ifdef CONFIG_PPC_BOOK3S - li r10,MSR_RI - mtmsrd r10,1 /* Restore RI */ -#endif bl restore_math -#ifdef CONFIG_PPC_BOOK3S - li r11,0 - mtmsrd r11,1 -#endif ld r8,_MSR(r1) ld r3,RESULT(r1) li r11,-MAX_ERRNO @@ -287,6 +275,16 @@ END_FTR_SECTION_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) andi. r6,r8,MSR_PR ld r4,_LINK(r1) +#ifdef CONFIG_PPC_BOOK3S + /* + * Clear MSR_RI, MSR_EE is already and remains disabled. We could do + * this later, but testing shows that doing it here causes less slow + * down than doing it closer to the rfid. + */ + li r11,0 + mtmsrd r11,1 +#endif + beq- 1f ACCOUNT_CPU_USER_EXIT(r13, r11, r12) @@ -348,7 +346,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) /* Repopulate r9 and r10 for the syscall path */ addi r9,r1,STACK_FRAME_OVERHEAD - CURRENT_THREAD_INFO(r10, r1) + ld r10, PACA_THREAD_INFO(r13) ld r10,TI_FLAGS(r10) cmpldi r0,NR_syscalls @@ -363,10 +361,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) b .Lsyscall_exit .Lsyscall_exit_work: -#ifdef CONFIG_PPC_BOOK3S - li r10,MSR_RI - mtmsrd r10,1 /* Restore RI */ -#endif /* If TIF_RESTOREALL is set, don't scribble on either r3 or ccr. If TIF_NOERROR is set, just save r3 as it is. */ @@ -695,7 +689,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) 2: #endif /* CONFIG_PPC_BOOK3S_64 */ - CURRENT_THREAD_INFO(r7, r8) /* base of new stack */ + clrrdi r7, r8, THREAD_SHIFT /* base of new stack */ /* Note: this uses SWITCH_FRAME_SIZE rather than INT_FRAME_SIZE because we don't need to leave the 288-byte ABI gap at the top of the kernel stack. */ @@ -746,7 +740,7 @@ _GLOBAL(ret_from_except_lite) mtmsrd r10,1 /* Update machine state */ #endif /* CONFIG_PPC_BOOK3E */ - CURRENT_THREAD_INFO(r9, r1) + ld r9, PACA_THREAD_INFO(r13) ld r3,_MSR(r1) #ifdef CONFIG_PPC_BOOK3E ld r10,PACACURRENT(r13) @@ -860,7 +854,7 @@ resume_kernel: 1: bl preempt_schedule_irq /* Re-test flags and eventually loop */ - CURRENT_THREAD_INFO(r9, r1) + ld r9, PACA_THREAD_INFO(r13) ld r4,TI_FLAGS(r9) andi. r0,r4,_TIF_NEED_RESCHED bne 1b @@ -1002,6 +996,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ld r2,_NIP(r1) mtspr SPRN_SRR0,r2 + /* + * Leaving a stale exception_marker on the stack can confuse + * the reliable stack unwinder later on. Clear it. + */ + li r2,0 + std r2,STACK_FRAME_OVERHEAD-16(r1) + ld r0,GPR0(r1) ld r2,GPR2(r1) ld r3,GPR3(r1) diff --git a/arch/powerpc/kernel/epapr_hcalls.S b/arch/powerpc/kernel/epapr_hcalls.S index 52ca2471ee1a..d252f4663a23 100644 --- a/arch/powerpc/kernel/epapr_hcalls.S +++ b/arch/powerpc/kernel/epapr_hcalls.S @@ -21,10 +21,9 @@ #ifndef CONFIG_PPC64 /* epapr_ev_idle() was derived from e500_idle() */ _GLOBAL(epapr_ev_idle) - CURRENT_THREAD_INFO(r3, r1) - PPC_LL r4, TI_LOCAL_FLAGS(r3) /* set napping bit */ + PPC_LL r4, TI_LOCAL_FLAGS(r2) /* set napping bit */ ori r4, r4,_TLF_NAPPING /* so when we take an exception */ - PPC_STL r4, TI_LOCAL_FLAGS(r3) /* it will return to our caller */ + PPC_STL r4, TI_LOCAL_FLAGS(r2) /* it will return to our caller */ wrteei 1 diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index afb638778f44..49381f32b374 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -77,17 +77,6 @@ special_reg_save: andi. r3,r3,MSR_PR bnelr - /* Copy info into temporary exception thread info */ - ld r11,PACAKSAVE(r13) - CURRENT_THREAD_INFO(r11, r11) - CURRENT_THREAD_INFO(r12, r1) - ld r10,TI_FLAGS(r11) - std r10,TI_FLAGS(r12) - ld r10,TI_PREEMPT(r11) - std r10,TI_PREEMPT(r12) - ld r10,TI_TASK(r11) - std r10,TI_TASK(r12) - /* * Advance to the next TLB exception frame for handler * types that don't do it automatically. @@ -349,6 +338,7 @@ ret_from_mc_except: #define GEN_BTB_FLUSH #define CRIT_BTB_FLUSH #define DBG_BTB_FLUSH +#define MC_BTB_FLUSH #define GDBELL_BTB_FLUSH #endif @@ -504,7 +494,7 @@ exc_##n##_bad_stack: \ * interrupts happen before the wait instruction. */ #define CHECK_NAPPING() \ - CURRENT_THREAD_INFO(r11, r1); \ + ld r11, PACA_THREAD_INFO(r13); \ ld r10,TI_LOCAL_FLAGS(r11); \ andi. r9,r10,_TLF_NAPPING; \ beq+ 1f; \ diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 9e253ce27e08..a5b8fbae56a0 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -68,6 +68,14 @@ OPEN_FIXED_SECTION(real_vectors, 0x0100, 0x1900) OPEN_FIXED_SECTION(real_trampolines, 0x1900, 0x4000) OPEN_FIXED_SECTION(virt_vectors, 0x4000, 0x5900) OPEN_FIXED_SECTION(virt_trampolines, 0x5900, 0x7000) + +#ifdef CONFIG_PPC_POWERNV + .globl start_real_trampolines + .globl end_real_trampolines + .globl start_virt_trampolines + .globl end_virt_trampolines +#endif + #if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) /* * Data area reserved for FWNMI option. @@ -566,8 +574,36 @@ EXC_COMMON_BEGIN(mce_return) RFI_TO_KERNEL b . -EXC_REAL(data_access, 0x300, 0x80) -EXC_VIRT(data_access, 0x4300, 0x80, 0x300) +EXC_REAL_BEGIN(data_access, 0x300, 0x80) +SET_SCRATCH0(r13) /* save r13 */ +EXCEPTION_PROLOG_0(PACA_EXGEN) + b tramp_real_data_access +EXC_REAL_END(data_access, 0x300, 0x80) + +TRAMP_REAL_BEGIN(tramp_real_data_access) +EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST_PR, 0x300) + /* + * DAR/DSISR must be read before setting MSR[RI], because + * a d-side MCE will clobber those registers so is not + * recoverable if they are live. + */ + mfspr r10,SPRN_DAR + mfspr r11,SPRN_DSISR + std r10,PACA_EXGEN+EX_DAR(r13) + stw r11,PACA_EXGEN+EX_DSISR(r13) +EXCEPTION_PROLOG_2(data_access_common, EXC_STD) + +EXC_VIRT_BEGIN(data_access, 0x4300, 0x80) +SET_SCRATCH0(r13) /* save r13 */ +EXCEPTION_PROLOG_0(PACA_EXGEN) +EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0x300) + mfspr r10,SPRN_DAR + mfspr r11,SPRN_DSISR + std r10,PACA_EXGEN+EX_DAR(r13) + stw r11,PACA_EXGEN+EX_DSISR(r13) +EXCEPTION_PROLOG_2_RELON(data_access_common, EXC_STD) +EXC_VIRT_END(data_access, 0x4300, 0x80) + TRAMP_KVM_SKIP(PACA_EXGEN, 0x300) EXC_COMMON_BEGIN(data_access_common) @@ -575,11 +611,8 @@ EXC_COMMON_BEGIN(data_access_common) * Here r13 points to the paca, r9 contains the saved CR, * SRR0 and SRR1 are saved in r11 and r12, * r9 - r13 are saved in paca->exgen. + * EX_DAR and EX_DSISR have saved DAR/DSISR */ - mfspr r10,SPRN_DAR - std r10,PACA_EXGEN+EX_DAR(r13) - mfspr r10,SPRN_DSISR - stw r10,PACA_EXGEN+EX_DSISR(r13) EXCEPTION_PROLOG_COMMON(0x300, PACA_EXGEN) RECONCILE_IRQ_STATE(r10, r11) ld r12,_MSR(r1) @@ -596,18 +629,29 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) EXC_REAL_BEGIN(data_access_slb, 0x380, 0x80) -EXCEPTION_PROLOG(PACA_EXSLB, data_access_slb_common, EXC_STD, KVMTEST_PR, 0x380); +SET_SCRATCH0(r13) /* save r13 */ +EXCEPTION_PROLOG_0(PACA_EXSLB) + b tramp_real_data_access_slb EXC_REAL_END(data_access_slb, 0x380, 0x80) +TRAMP_REAL_BEGIN(tramp_real_data_access_slb) +EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x380) + mfspr r10,SPRN_DAR + std r10,PACA_EXSLB+EX_DAR(r13) +EXCEPTION_PROLOG_2(data_access_slb_common, EXC_STD) + EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80) -EXCEPTION_RELON_PROLOG(PACA_EXSLB, data_access_slb_common, EXC_STD, NOTEST, 0x380); +SET_SCRATCH0(r13) /* save r13 */ +EXCEPTION_PROLOG_0(PACA_EXSLB) +EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x380) + mfspr r10,SPRN_DAR + std r10,PACA_EXSLB+EX_DAR(r13) +EXCEPTION_PROLOG_2_RELON(data_access_slb_common, EXC_STD) EXC_VIRT_END(data_access_slb, 0x4380, 0x80) TRAMP_KVM_SKIP(PACA_EXSLB, 0x380) EXC_COMMON_BEGIN(data_access_slb_common) - mfspr r10,SPRN_DAR - std r10,PACA_EXSLB+EX_DAR(r13) EXCEPTION_PROLOG_COMMON(0x380, PACA_EXSLB) ld r4,PACA_EXSLB+EX_DAR(r13) std r4,_DAR(r1) @@ -703,14 +747,30 @@ TRAMP_KVM_HV(PACA_EXGEN, 0x500) EXC_COMMON_ASYNC(hardware_interrupt_common, 0x500, do_IRQ) -EXC_REAL(alignment, 0x600, 0x100) -EXC_VIRT(alignment, 0x4600, 0x100, 0x600) -TRAMP_KVM(PACA_EXGEN, 0x600) -EXC_COMMON_BEGIN(alignment_common) +EXC_REAL_BEGIN(alignment, 0x600, 0x100) +SET_SCRATCH0(r13) /* save r13 */ +EXCEPTION_PROLOG_0(PACA_EXGEN) +EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST_PR, 0x600) mfspr r10,SPRN_DAR + mfspr r11,SPRN_DSISR std r10,PACA_EXGEN+EX_DAR(r13) - mfspr r10,SPRN_DSISR - stw r10,PACA_EXGEN+EX_DSISR(r13) + stw r11,PACA_EXGEN+EX_DSISR(r13) +EXCEPTION_PROLOG_2(alignment_common, EXC_STD) +EXC_REAL_END(alignment, 0x600, 0x100) + +EXC_VIRT_BEGIN(alignment, 0x4600, 0x100) +SET_SCRATCH0(r13) /* save r13 */ +EXCEPTION_PROLOG_0(PACA_EXGEN) +EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0x600) + mfspr r10,SPRN_DAR + mfspr r11,SPRN_DSISR + std r10,PACA_EXGEN+EX_DAR(r13) + stw r11,PACA_EXGEN+EX_DSISR(r13) +EXCEPTION_PROLOG_2_RELON(alignment_common, EXC_STD) +EXC_VIRT_END(alignment, 0x4600, 0x100) + +TRAMP_KVM(PACA_EXGEN, 0x600) +EXC_COMMON_BEGIN(alignment_common) EXCEPTION_PROLOG_COMMON(0x600, PACA_EXGEN) ld r3,PACA_EXGEN+EX_DAR(r13) lwz r4,PACA_EXGEN+EX_DSISR(r13) @@ -1629,7 +1689,7 @@ do_hash_page: ori r0,r0,DSISR_BAD_FAULT_64S@l and. r0,r4,r0 /* weird error? */ bne- handle_page_fault /* if not, try to insert a HPTE */ - CURRENT_THREAD_INFO(r11, r1) + ld r11, PACA_THREAD_INFO(r13) lwz r0,TI_PREEMPT(r11) /* If we're in an "NMI" */ andis. r0,r0,NMI_MASK@h /* (i.e. an irq when soft-disabled) */ bne 77f /* then don't call hash_page now */ diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index 05b08db3901d..ce6a972f2584 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -261,7 +261,7 @@ __secondary_hold_acknowledge: tophys(r11,r1); /* use tophys(r1) if kernel */ \ beq 1f; \ mfspr r11,SPRN_SPRG_THREAD; \ - lwz r11,THREAD_INFO-THREAD(r11); \ + lwz r11,TASK_STACK-THREAD(r11); \ addi r11,r11,THREAD_SIZE; \ tophys(r11,r11); \ 1: subi r11,r11,INT_FRAME_SIZE /* alloc exc. frame */ @@ -352,9 +352,8 @@ i##n: \ * registers that might have bad values includes all the GPRs * and all the BATs. We indicate that we are in RTAS by putting * a non-zero value, the address of the exception frame to use, - * in SPRG2. The machine check handler checks SPRG2 and uses its - * value if it is non-zero. If we ever needed to free up SPRG2, - * we could use a field in the thread_info or thread_struct instead. + * in thread.rtas_sp. The machine check handler checks thread.rtas_sp + * and uses its value if it is non-zero. * (Other exception handlers assume that r1 is a valid kernel stack * pointer when we take an exception from supervisor mode.) * -- paulus. @@ -365,16 +364,15 @@ i##n: \ mtspr SPRN_SPRG_SCRATCH1,r11 mfcr r10 #ifdef CONFIG_PPC_CHRP - mfspr r11,SPRN_SPRG_RTAS - cmpwi 0,r11,0 - bne 7f + mfspr r11, SPRN_SPRG_THREAD + lwz r11, RTAS_SP(r11) + cmpwi cr1, r11, 0 + bne cr1, 7f #endif /* CONFIG_PPC_CHRP */ EXCEPTION_PROLOG_1 7: EXCEPTION_PROLOG_2 addi r3,r1,STACK_FRAME_OVERHEAD #ifdef CONFIG_PPC_CHRP - mfspr r4,SPRN_SPRG_RTAS - cmpwi cr1,r4,0 bne cr1,1f #endif EXC_XFER_STD(0x200, machine_check_exception) @@ -500,18 +498,22 @@ InstructionTLBMiss: */ /* Get PTE (linux-style) and check access */ mfspr r3,SPRN_IMISS +#if defined(CONFIG_MODULES) || defined(CONFIG_DEBUG_PAGEALLOC) lis r1,PAGE_OFFSET@h /* check if kernel address */ cmplw 0,r1,r3 - mfspr r2,SPRN_SPRG_THREAD - li r1,_PAGE_USER|_PAGE_PRESENT|_PAGE_EXEC /* low addresses tested as user */ - lwz r2,PGDIR(r2) +#endif + mfspr r2, SPRN_SPRG_PGDIR +#ifdef CONFIG_SWAP + li r1,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_EXEC +#else + li r1,_PAGE_PRESENT | _PAGE_EXEC +#endif +#if defined(CONFIG_MODULES) || defined(CONFIG_DEBUG_PAGEALLOC) bge- 112f - mfspr r2,SPRN_SRR1 /* and MSR_PR bit from SRR1 */ - rlwimi r1,r2,32-12,29,29 /* shift MSR_PR to _PAGE_USER posn */ - lis r2,swapper_pg_dir@ha /* if kernel address, use */ - addi r2,r2,swapper_pg_dir@l /* kernel page table */ -112: tophys(r2,r2) - rlwimi r2,r3,12,20,29 /* insert top 10 bits of address */ + lis r2, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ + addi r2, r2, (swapper_pg_dir - PAGE_OFFSET)@l /* kernel page table */ +#endif +112: rlwimi r2,r3,12,20,29 /* insert top 10 bits of address */ lwz r2,0(r2) /* get pmd entry */ rlwinm. r2,r2,0,0,19 /* extract address of pte page */ beq- InstructionAddressInvalid /* return if no mapping */ @@ -519,20 +521,10 @@ InstructionTLBMiss: lwz r0,0(r2) /* get linux-style pte */ andc. r1,r1,r0 /* check access & ~permission */ bne- InstructionAddressInvalid /* return if access not permitted */ - ori r0,r0,_PAGE_ACCESSED /* set _PAGE_ACCESSED in pte */ - /* - * NOTE! We are assuming this is not an SMP system, otherwise - * we would need to update the pte atomically with lwarx/stwcx. - */ - stw r0,0(r2) /* update PTE (accessed bit) */ /* Convert linux-style PTE to low word of PPC-style PTE */ - rlwinm r1,r0,32-10,31,31 /* _PAGE_RW -> PP lsb */ - rlwinm r2,r0,32-7,31,31 /* _PAGE_DIRTY -> PP lsb */ - and r1,r1,r2 /* writable if _RW and _DIRTY */ rlwimi r0,r0,32-1,30,30 /* _PAGE_USER -> PP msb */ - rlwimi r0,r0,32-1,31,31 /* _PAGE_USER -> PP lsb */ - ori r1,r1,0xe04 /* clear out reserved bits */ - andc r1,r0,r1 /* PP = user? (rw&dirty? 2: 3): 0 */ + ori r1, r1, 0xe05 /* clear out reserved bits */ + andc r1, r0, r1 /* PP = user? 2 : 0 */ BEGIN_FTR_SECTION rlwinm r1,r1,0,~_PAGE_COHERENT /* clear M (coherence not required) */ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) @@ -576,16 +568,16 @@ DataLoadTLBMiss: mfspr r3,SPRN_DMISS lis r1,PAGE_OFFSET@h /* check if kernel address */ cmplw 0,r1,r3 - mfspr r2,SPRN_SPRG_THREAD - li r1,_PAGE_USER|_PAGE_PRESENT /* low addresses tested as user */ - lwz r2,PGDIR(r2) + mfspr r2, SPRN_SPRG_PGDIR +#ifdef CONFIG_SWAP + li r1, _PAGE_PRESENT | _PAGE_ACCESSED +#else + li r1, _PAGE_PRESENT +#endif bge- 112f - mfspr r2,SPRN_SRR1 /* and MSR_PR bit from SRR1 */ - rlwimi r1,r2,32-12,29,29 /* shift MSR_PR to _PAGE_USER posn */ - lis r2,swapper_pg_dir@ha /* if kernel address, use */ - addi r2,r2,swapper_pg_dir@l /* kernel page table */ -112: tophys(r2,r2) - rlwimi r2,r3,12,20,29 /* insert top 10 bits of address */ + lis r2, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ + addi r2, r2, (swapper_pg_dir - PAGE_OFFSET)@l /* kernel page table */ +112: rlwimi r2,r3,12,20,29 /* insert top 10 bits of address */ lwz r2,0(r2) /* get pmd entry */ rlwinm. r2,r2,0,0,19 /* extract address of pte page */ beq- DataAddressInvalid /* return if no mapping */ @@ -593,20 +585,16 @@ DataLoadTLBMiss: lwz r0,0(r2) /* get linux-style pte */ andc. r1,r1,r0 /* check access & ~permission */ bne- DataAddressInvalid /* return if access not permitted */ - ori r0,r0,_PAGE_ACCESSED /* set _PAGE_ACCESSED in pte */ /* * NOTE! We are assuming this is not an SMP system, otherwise * we would need to update the pte atomically with lwarx/stwcx. */ - stw r0,0(r2) /* update PTE (accessed bit) */ /* Convert linux-style PTE to low word of PPC-style PTE */ rlwinm r1,r0,32-10,31,31 /* _PAGE_RW -> PP lsb */ - rlwinm r2,r0,32-7,31,31 /* _PAGE_DIRTY -> PP lsb */ - and r1,r1,r2 /* writable if _RW and _DIRTY */ rlwimi r0,r0,32-1,30,30 /* _PAGE_USER -> PP msb */ rlwimi r0,r0,32-1,31,31 /* _PAGE_USER -> PP lsb */ ori r1,r1,0xe04 /* clear out reserved bits */ - andc r1,r0,r1 /* PP = user? (rw&dirty? 2: 3): 0 */ + andc r1,r0,r1 /* PP = user? rw? 2: 3: 0 */ BEGIN_FTR_SECTION rlwinm r1,r1,0,~_PAGE_COHERENT /* clear M (coherence not required) */ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) @@ -660,16 +648,16 @@ DataStoreTLBMiss: mfspr r3,SPRN_DMISS lis r1,PAGE_OFFSET@h /* check if kernel address */ cmplw 0,r1,r3 - mfspr r2,SPRN_SPRG_THREAD - li r1,_PAGE_RW|_PAGE_USER|_PAGE_PRESENT /* access flags */ - lwz r2,PGDIR(r2) + mfspr r2, SPRN_SPRG_PGDIR +#ifdef CONFIG_SWAP + li r1, _PAGE_RW | _PAGE_PRESENT | _PAGE_ACCESSED +#else + li r1, _PAGE_RW | _PAGE_PRESENT +#endif bge- 112f - mfspr r2,SPRN_SRR1 /* and MSR_PR bit from SRR1 */ - rlwimi r1,r2,32-12,29,29 /* shift MSR_PR to _PAGE_USER posn */ - lis r2,swapper_pg_dir@ha /* if kernel address, use */ - addi r2,r2,swapper_pg_dir@l /* kernel page table */ -112: tophys(r2,r2) - rlwimi r2,r3,12,20,29 /* insert top 10 bits of address */ + lis r2, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ + addi r2, r2, (swapper_pg_dir - PAGE_OFFSET)@l /* kernel page table */ +112: rlwimi r2,r3,12,20,29 /* insert top 10 bits of address */ lwz r2,0(r2) /* get pmd entry */ rlwinm. r2,r2,0,0,19 /* extract address of pte page */ beq- DataAddressInvalid /* return if no mapping */ @@ -677,12 +665,10 @@ DataStoreTLBMiss: lwz r0,0(r2) /* get linux-style pte */ andc. r1,r1,r0 /* check access & ~permission */ bne- DataAddressInvalid /* return if access not permitted */ - ori r0,r0,_PAGE_ACCESSED|_PAGE_DIRTY /* * NOTE! We are assuming this is not an SMP system, otherwise * we would need to update the pte atomically with lwarx/stwcx. */ - stw r0,0(r2) /* update PTE (accessed/dirty bits) */ /* Convert linux-style PTE to low word of PPC-style PTE */ rlwimi r0,r0,32-1,30,30 /* _PAGE_USER -> PP msb */ li r1,0xe05 /* clear out reserved bits & PP lsb */ @@ -845,12 +831,12 @@ __secondary_start: bl init_idle_6xx #endif /* CONFIG_PPC_BOOK3S_32 */ - /* get current_thread_info and current */ - lis r1,secondary_ti@ha - tophys(r1,r1) - lwz r1,secondary_ti@l(r1) - tophys(r2,r1) - lwz r2,TI_TASK(r2) + /* get current's stack and current */ + lis r2,secondary_current@ha + tophys(r2,r2) + lwz r2,secondary_current@l(r2) + tophys(r1,r2) + lwz r1,TASK_STACK(r1) /* stack */ addi r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD @@ -865,8 +851,10 @@ __secondary_start: tophys(r4,r2) addi r4,r4,THREAD /* phys address of our thread_struct */ mtspr SPRN_SPRG_THREAD,r4 +#ifdef CONFIG_PPC_RTAS li r3,0 - mtspr SPRN_SPRG_RTAS,r3 /* 0 => not in RTAS */ + stw r3, RTAS_SP(r4) /* 0 => not in RTAS */ +#endif /* enable MMU and jump to start_secondary */ li r4,MSR_KERNEL @@ -950,8 +938,10 @@ start_here: tophys(r4,r2) addi r4,r4,THREAD /* init task's THREAD */ mtspr SPRN_SPRG_THREAD,r4 +#ifdef CONFIG_PPC_RTAS li r3,0 - mtspr SPRN_SPRG_RTAS,r3 /* 0 => not in RTAS */ + stw r3, RTAS_SP(r4) /* 0 => not in RTAS */ +#endif /* stack */ lis r1,init_thread_union@ha @@ -1022,15 +1012,16 @@ _ENTRY(switch_mmu_context) li r0,NUM_USER_SEGMENTS mtctr r0 + lwz r4, MM_PGD(r4) #ifdef CONFIG_BDI_SWITCH /* Context switch the PTE pointer for the Abatron BDI2000. * The PGDIR is passed as second argument. */ - lwz r4,MM_PGD(r4) - lis r5, KERNELBASE@h - lwz r5, 0xf0(r5) - stw r4, 0x4(r5) + lis r5, abatron_pteptrs@ha + stw r4, abatron_pteptrs@l + 0x4(r5) #endif + tophys(r4, r4) + mtspr SPRN_SPRG_PGDIR, r4 li r4,0 isync 3: @@ -1105,6 +1096,41 @@ BEGIN_MMU_FTR_SECTION END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_HIGH_BATS) blr +_ENTRY(update_bats) + lis r4, 1f@h + ori r4, r4, 1f@l + tophys(r4, r4) + mfmsr r6 + mflr r7 + li r3, MSR_KERNEL & ~(MSR_IR | MSR_DR) + rlwinm r0, r6, 0, ~MSR_RI + rlwinm r0, r0, 0, ~MSR_EE + mtmsr r0 + mtspr SPRN_SRR0, r4 + mtspr SPRN_SRR1, r3 + SYNC + RFI +1: bl clear_bats + lis r3, BATS@ha + addi r3, r3, BATS@l + tophys(r3, r3) + LOAD_BAT(0, r3, r4, r5) + LOAD_BAT(1, r3, r4, r5) + LOAD_BAT(2, r3, r4, r5) + LOAD_BAT(3, r3, r4, r5) +BEGIN_MMU_FTR_SECTION + LOAD_BAT(4, r3, r4, r5) + LOAD_BAT(5, r3, r4, r5) + LOAD_BAT(6, r3, r4, r5) + LOAD_BAT(7, r3, r4, r5) +END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_HIGH_BATS) + li r3, MSR_KERNEL & ~(MSR_IR | MSR_DR | MSR_RI) + mtmsr r3 + mtspr SPRN_SRR0, r7 + mtspr SPRN_SRR1, r6 + SYNC + RFI + flush_tlbs: lis r10, 0x40 1: addic. r10, r10, -0x1000 diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S index b19d78410511..a9c934f2319b 100644 --- a/arch/powerpc/kernel/head_40x.S +++ b/arch/powerpc/kernel/head_40x.S @@ -115,7 +115,7 @@ _ENTRY(saved_ksp_limit) andi. r11,r11,MSR_PR; \ beq 1f; \ mfspr r1,SPRN_SPRG_THREAD; /* if from user, start at top of */\ - lwz r1,THREAD_INFO-THREAD(r1); /* this thread's kernel stack */\ + lwz r1,TASK_STACK-THREAD(r1); /* this thread's kernel stack */\ addi r1,r1,THREAD_SIZE; \ 1: subi r1,r1,INT_FRAME_SIZE; /* Allocate an exception frame */\ tophys(r11,r1); \ @@ -158,7 +158,7 @@ _ENTRY(saved_ksp_limit) beq 1f; \ /* COMING FROM USER MODE */ \ mfspr r11,SPRN_SPRG_THREAD; /* if from user, start at top of */\ - lwz r11,THREAD_INFO-THREAD(r11); /* this thread's kernel stack */\ + lwz r11,TASK_STACK-THREAD(r11); /* this thread's kernel stack */\ 1: addi r11,r11,THREAD_SIZE-INT_FRAME_SIZE; /* Alloc an excpt frm */\ tophys(r11,r11); \ stw r10,_CCR(r11); /* save various registers */\ @@ -953,9 +953,8 @@ _GLOBAL(set_context) /* Context switch the PTE pointer for the Abatron BDI2000. * The PGDIR is the second parameter. */ - lis r5, KERNELBASE@h - lwz r5, 0xf0(r5) - stw r4, 0x4(r5) + lis r5, abatron_pteptrs@ha + stw r4, abatron_pteptrs@l + 0x4(r5) #endif sync mtspr SPRN_PID,r3 diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S index bf23c19c92d6..37117ab11584 100644 --- a/arch/powerpc/kernel/head_44x.S +++ b/arch/powerpc/kernel/head_44x.S @@ -1019,10 +1019,10 @@ _GLOBAL(start_secondary_47x) /* Now we can get our task struct and real stack pointer */ - /* Get current_thread_info and current */ - lis r1,secondary_ti@ha - lwz r1,secondary_ti@l(r1) - lwz r2,TI_TASK(r1) + /* Get current's stack and current */ + lis r2,secondary_current@ha + lwz r2,secondary_current@l(r2) + lwz r1,TASK_STACK(r2) /* Current stack pointer */ addi r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index 4898e9491a1c..3fad8d499767 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -801,21 +801,19 @@ __secondary_start: /* Set thread priority to MEDIUM */ HMT_MEDIUM - /* Initialize the kernel stack */ - LOAD_REG_ADDR(r3, current_set) - sldi r28,r24,3 /* get current_set[cpu#] */ - ldx r14,r3,r28 - addi r14,r14,THREAD_SIZE-STACK_FRAME_OVERHEAD - std r14,PACAKSAVE(r13) - - /* Do early setup for that CPU (SLB and hash table pointer) */ + /* + * Do early setup for this CPU, in particular initialising the MMU so we + * can turn it on below. This is a call to C, which is OK, we're still + * running on the emergency stack. + */ bl early_setup_secondary /* - * setup the new stack pointer, but *don't* use this until - * translation is on. + * The primary has initialized our kernel stack for us in the paca, grab + * it and put it in r1. We must *not* use it until we turn on the MMU + * below, because it may not be inside the RMO. */ - mr r1, r14 + ld r1, PACAKSAVE(r13) /* Clear backchain so we get nice backtraces */ li r7,0 diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 20cc816b3508..03c73b4c6435 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -142,7 +142,7 @@ instruction_counter: tophys(r11,r1); /* use tophys(r1) if kernel */ \ beq 1f; \ mfspr r11,SPRN_SPRG_THREAD; \ - lwz r11,THREAD_INFO-THREAD(r11); \ + lwz r11,TASK_STACK-THREAD(r11); \ addi r11,r11,THREAD_SIZE; \ tophys(r11,r11); \ 1: subi r11,r11,INT_FRAME_SIZE /* alloc exc. frame */ @@ -292,6 +292,17 @@ SystemCall: */ EXCEPTION(0x1000, SoftEmu, program_check_exception, EXC_XFER_STD) +/* Called from DataStoreTLBMiss when perf TLB misses events are activated */ +#ifdef CONFIG_PERF_EVENTS + patch_site 0f, patch__dtlbmiss_perf +0: lwz r10, (dtlb_miss_counter - PAGE_OFFSET)@l(0) + addi r10, r10, 1 + stw r10, (dtlb_miss_counter - PAGE_OFFSET)@l(0) + mfspr r10, SPRN_SPRG_SCRATCH0 + mfspr r11, SPRN_SPRG_SCRATCH1 + rfi +#endif + . = 0x1100 /* * For the MPC8xx, this is a software tablewalk to load the instruction @@ -337,8 +348,8 @@ InstructionTLBMiss: rlwinm r10, r10, 16, 0xfff8 cmpli cr0, r10, PAGE_OFFSET@h #ifndef CONFIG_PIN_TLB_TEXT - /* It is assumed that kernel code fits into the first 8M page */ -0: cmpli cr7, r10, (PAGE_OFFSET + 0x0800000)@h + /* It is assumed that kernel code fits into the first 32M */ +0: cmpli cr7, r10, (PAGE_OFFSET + 0x2000000)@h patch_site 0b, patch__itlbmiss_linmem_top #endif #endif @@ -405,10 +416,20 @@ InstructionTLBMiss: #ifndef CONFIG_PIN_TLB_TEXT ITLBMissLinear: mtcr r11 +#if defined(CONFIG_STRICT_KERNEL_RWX) && CONFIG_ETEXT_SHIFT < 23 + patch_site 0f, patch__itlbmiss_linmem_top8 + + mfspr r10, SPRN_SRR0 +0: subis r11, r10, (PAGE_OFFSET - 0x80000000)@ha + rlwinm r11, r11, 4, MI_PS8MEG ^ MI_PS512K + ori r11, r11, MI_PS512K | MI_SVALID + rlwinm r10, r10, 0, 0x0ff80000 /* 8xx supports max 256Mb RAM */ +#else /* Set 8M byte page and mark it valid */ li r11, MI_PS8MEG | MI_SVALID - mtspr SPRN_MI_TWC, r11 rlwinm r10, r10, 20, 0x0f800000 /* 8xx supports max 256Mb RAM */ +#endif + mtspr SPRN_MI_TWC, r11 ori r10, r10, 0xf0 | MI_SPS16K | _PAGE_SH | _PAGE_DIRTY | \ _PAGE_PRESENT mtspr SPRN_MI_RPN, r10 /* Update TLB entry */ @@ -434,7 +455,7 @@ DataStoreTLBMiss: #ifndef CONFIG_PIN_TLB_IMMR cmpli cr6, r10, VIRT_IMMR_BASE@h #endif -0: cmpli cr7, r10, (PAGE_OFFSET + 0x1800000)@h +0: cmpli cr7, r10, (PAGE_OFFSET + 0x2000000)@h patch_site 0b, patch__dtlbmiss_linmem_top mfspr r10, SPRN_M_TWB /* Get level 1 table */ @@ -494,16 +515,6 @@ DataStoreTLBMiss: rfi patch_site 0b, patch__dtlbmiss_exit_1 -#ifdef CONFIG_PERF_EVENTS - patch_site 0f, patch__dtlbmiss_perf -0: lwz r10, (dtlb_miss_counter - PAGE_OFFSET)@l(0) - addi r10, r10, 1 - stw r10, (dtlb_miss_counter - PAGE_OFFSET)@l(0) - mfspr r10, SPRN_SPRG_SCRATCH0 - mfspr r11, SPRN_SPRG_SCRATCH1 - rfi -#endif - DTLBMissIMMR: mtcr r11 /* Set 512k byte guarded page and mark it valid */ @@ -525,10 +536,29 @@ DTLBMissIMMR: DTLBMissLinear: mtcr r11 + rlwinm r10, r10, 20, 0x0f800000 /* 8xx supports max 256Mb RAM */ +#if defined(CONFIG_STRICT_KERNEL_RWX) && CONFIG_DATA_SHIFT < 23 + patch_site 0f, patch__dtlbmiss_romem_top8 + +0: subis r11, r10, (PAGE_OFFSET - 0x80000000)@ha + rlwinm r11, r11, 0, 0xff800000 + neg r10, r11 + or r11, r11, r10 + rlwinm r11, r11, 4, MI_PS8MEG ^ MI_PS512K + ori r11, r11, MI_PS512K | MI_SVALID + mfspr r10, SPRN_MD_EPN + rlwinm r10, r10, 0, 0x0ff80000 /* 8xx supports max 256Mb RAM */ +#else /* Set 8M byte page and mark it valid */ li r11, MD_PS8MEG | MD_SVALID +#endif mtspr SPRN_MD_TWC, r11 - rlwinm r10, r10, 20, 0x0f800000 /* 8xx supports max 256Mb RAM */ +#ifdef CONFIG_STRICT_KERNEL_RWX + patch_site 0f, patch__dtlbmiss_romem_top + +0: subis r11, r10, 0 + rlwimi r10, r11, 11, _PAGE_RO +#endif ori r10, r10, 0xf0 | MD_SPS16K | _PAGE_SH | _PAGE_DIRTY | \ _PAGE_PRESENT mtspr SPRN_MD_RPN, r10 /* Update TLB entry */ @@ -551,11 +581,11 @@ InstructionTLBError: mr r4,r12 andis. r5,r9,DSISR_SRR1_MATCH_32S@h /* Filter relevant SRR1 bits */ andis. r10,r9,SRR1_ISI_NOPT@h - beq+ 1f + beq+ .Litlbie tlbie r4 -itlbie: /* 0x400 is InstructionAccess exception, needed by bad_page_fault() */ -1: EXC_XFER_LITE(0x400, handle_page_fault) +.Litlbie: + EXC_XFER_LITE(0x400, handle_page_fault) /* This is the data TLB error on the MPC8xx. This could be due to * many reasons, including a dirty update to a pte. We bail out to @@ -577,10 +607,10 @@ DARFixed:/* Return from dcbx instruction bug workaround */ stw r5,_DSISR(r11) mfspr r4,SPRN_DAR andis. r10,r5,DSISR_NOHPTE@h - beq+ 1f + beq+ .Ldtlbie tlbie r4 -dtlbie: -1: li r10,RPN_PATTERN +.Ldtlbie: + li r10,RPN_PATTERN mtspr SPRN_DAR,r10 /* Tag DAR, to be used in DTLB Error */ /* 0x300 is DataAccess exception, needed by bad_page_fault() */ EXC_XFER_LITE(0x300, handle_page_fault) @@ -603,8 +633,8 @@ DataBreakpoint: mtspr SPRN_SPRG_SCRATCH1, r11 mfcr r10 mfspr r11, SPRN_SRR0 - cmplwi cr0, r11, (dtlbie - PAGE_OFFSET)@l - cmplwi cr7, r11, (itlbie - PAGE_OFFSET)@l + cmplwi cr0, r11, (.Ldtlbie - PAGE_OFFSET)@l + cmplwi cr7, r11, (.Litlbie - PAGE_OFFSET)@l beq- cr0, 11f beq- cr7, 11f EXCEPTION_PROLOG_1 @@ -886,28 +916,11 @@ initial_mmu: mtspr SPRN_MD_CTR, r10 /* remove PINNED DTLB entries */ tlbia /* Invalidate all TLB entries */ -#ifdef CONFIG_PIN_TLB_TEXT - lis r8, MI_RSV4I@h - ori r8, r8, 0x1c00 - - mtspr SPRN_MI_CTR, r8 /* Set instruction MMU control */ -#endif - #ifdef CONFIG_PIN_TLB_DATA oris r10, r10, MD_RSV4I@h mtspr SPRN_MD_CTR, r10 /* Set data TLB control */ #endif - /* Now map the lower 8 Meg into the ITLB. */ - lis r8, KERNELBASE@h /* Create vaddr for TLB */ - ori r8, r8, MI_EVALID /* Mark it valid */ - mtspr SPRN_MI_EPN, r8 - li r8, MI_PS8MEG /* Set 8M byte page */ - ori r8, r8, MI_SVALID /* Make it valid */ - mtspr SPRN_MI_TWC, r8 - li r8, MI_BOOTINIT /* Create RPN for address 0 */ - mtspr SPRN_MI_RPN, r8 /* Store TLB entry */ - lis r8, MI_APG_INIT@h /* Set protection modes */ ori r8, r8, MI_APG_INIT@l mtspr SPRN_MI_AP, r8 @@ -937,6 +950,34 @@ initial_mmu: mtspr SPRN_MD_RPN, r8 #endif + /* Now map the lower RAM (up to 32 Mbytes) into the ITLB. */ +#ifdef CONFIG_PIN_TLB_TEXT + lis r8, MI_RSV4I@h + ori r8, r8, 0x1c00 +#endif + li r9, 4 /* up to 4 pages of 8M */ + mtctr r9 + lis r9, KERNELBASE@h /* Create vaddr for TLB */ + li r10, MI_PS8MEG | MI_SVALID /* Set 8M byte page */ + li r11, MI_BOOTINIT /* Create RPN for address 0 */ + lis r12, _einittext@h + ori r12, r12, _einittext@l +1: +#ifdef CONFIG_PIN_TLB_TEXT + mtspr SPRN_MI_CTR, r8 /* Set instruction MMU control */ + addi r8, r8, 0x100 +#endif + + ori r0, r9, MI_EVALID /* Mark it valid */ + mtspr SPRN_MI_EPN, r0 + mtspr SPRN_MI_TWC, r10 + mtspr SPRN_MI_RPN, r11 /* Store TLB entry */ + addis r9, r9, 0x80 + addis r11, r11, 0x80 + + cmpl cr0, r9, r12 + bdnzf gt, 1b + /* Since the cache is enabled according to the information we * just loaded into the TLB, invalidate and enable the caches here. * We should probably check/set other modes....later. @@ -989,5 +1030,6 @@ swapper_pg_dir: /* Room for two PTE table poiners, usually the kernel and current user * pointer to their respective root page table (pgdir). */ + .globl abatron_pteptrs abatron_pteptrs: .space 8 diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h index 306e26c073a0..1b22a8dea399 100644 --- a/arch/powerpc/kernel/head_booke.h +++ b/arch/powerpc/kernel/head_booke.h @@ -55,7 +55,7 @@ END_BTB_FLUSH_SECTION beq 1f; \ BOOKE_CLEAR_BTB(r11) \ /* if from user, start at top of this thread's kernel stack */ \ - lwz r11, THREAD_INFO-THREAD(r10); \ + lwz r11, TASK_STACK - THREAD(r10); \ ALLOC_STACK_FRAME(r11, THREAD_SIZE); \ 1 : subi r11, r11, INT_FRAME_SIZE; /* Allocate exception frame */ \ stw r13, _CCR(r11); /* save various registers */ \ @@ -142,7 +142,7 @@ END_BTB_FLUSH_SECTION BOOKE_CLEAR_BTB(r10) \ andi. r11,r11,MSR_PR; \ mfspr r11,SPRN_SPRG_THREAD; /* if from user, start at top of */\ - lwz r11,THREAD_INFO-THREAD(r11); /* this thread's kernel stack */\ + lwz r11, TASK_STACK - THREAD(r11); /* this thread's kernel stack */\ addi r11,r11,EXC_LVL_FRAME_OVERHEAD; /* allocate stack frame */\ beq 1f; \ /* COMING FROM USER MODE */ \ @@ -155,13 +155,7 @@ END_BTB_FLUSH_SECTION stw r10,GPR11(r11); \ b 2f; \ /* COMING FROM PRIV MODE */ \ -1: lwz r9,TI_FLAGS-EXC_LVL_FRAME_OVERHEAD(r11); \ - lwz r10,TI_PREEMPT-EXC_LVL_FRAME_OVERHEAD(r11); \ - stw r9,TI_FLAGS-EXC_LVL_FRAME_OVERHEAD(r8); \ - stw r10,TI_PREEMPT-EXC_LVL_FRAME_OVERHEAD(r8); \ - lwz r9,TI_TASK-EXC_LVL_FRAME_OVERHEAD(r11); \ - stw r9,TI_TASK-EXC_LVL_FRAME_OVERHEAD(r8); \ - mr r11,r8; \ +1: mr r11, r8; \ 2: mfspr r8,SPRN_SPRG_RSCRATCH_##exc_level; \ stw r12,GPR12(r11); /* save various registers */\ mflr r10; \ diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index 2386ce2a9c6e..1881127682e9 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -243,8 +243,9 @@ set_ivor: li r0,0 stwu r0,THREAD_SIZE-STACK_FRAME_OVERHEAD(r1) - CURRENT_THREAD_INFO(r22, r1) - stw r24, TI_CPU(r22) +#ifdef CONFIG_SMP + stw r24, TASK_CPU(r2) +#endif bl early_init @@ -717,8 +718,7 @@ finish_tlb_load: /* Get the next_tlbcam_idx percpu var */ #ifdef CONFIG_SMP - lwz r12, THREAD_INFO-THREAD(r12) - lwz r15, TI_CPU(r12) + lwz r15, TASK_CPU-THREAD(r12) lis r14, __per_cpu_offset@h ori r14, r14, __per_cpu_offset@l rlwinm r15, r15, 2, 0, 29 @@ -1089,10 +1089,10 @@ __secondary_start: mr r4,r24 /* Why? */ bl call_setup_cpu - /* get current_thread_info and current */ - lis r1,secondary_ti@ha - lwz r1,secondary_ti@l(r1) - lwz r2,TI_TASK(r1) + /* get current's stack and current */ + lis r2,secondary_current@ha + lwz r2,secondary_current@l(r2) + lwz r1,TASK_STACK(r2) /* stack */ addi r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD diff --git a/arch/powerpc/kernel/idle_6xx.S b/arch/powerpc/kernel/idle_6xx.S index ff026c9d3cab..c5e7f5bb2e66 100644 --- a/arch/powerpc/kernel/idle_6xx.S +++ b/arch/powerpc/kernel/idle_6xx.S @@ -136,10 +136,9 @@ BEGIN_FTR_SECTION DSSALL sync END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) - CURRENT_THREAD_INFO(r9, r1) - lwz r8,TI_LOCAL_FLAGS(r9) /* set napping bit */ + lwz r8,TI_LOCAL_FLAGS(r2) /* set napping bit */ ori r8,r8,_TLF_NAPPING /* so when we take an exception */ - stw r8,TI_LOCAL_FLAGS(r9) /* it will return to our caller */ + stw r8,TI_LOCAL_FLAGS(r2) /* it will return to our caller */ mfmsr r7 ori r7,r7,MSR_EE oris r7,r7,MSR_POW@h @@ -159,8 +158,7 @@ _GLOBAL(power_save_ppc32_restore) stw r9,_NIP(r11) /* make it do a blr */ #ifdef CONFIG_SMP - CURRENT_THREAD_INFO(r12, r11) - lwz r11,TI_CPU(r12) /* get cpu number * 4 */ + lwz r11,TASK_CPU(r2) /* get cpu number * 4 */ slwi r11,r11,2 #else li r11,0 diff --git a/arch/powerpc/kernel/idle_book3e.S b/arch/powerpc/kernel/idle_book3e.S index 4e0d94d02030..31e732c378ad 100644 --- a/arch/powerpc/kernel/idle_book3e.S +++ b/arch/powerpc/kernel/idle_book3e.S @@ -63,7 +63,7 @@ _GLOBAL(\name) 1: /* Let's set the _TLF_NAPPING flag so interrupts make us return * to the right spot */ - CURRENT_THREAD_INFO(r11, r1) + ld r11, PACACURRENT(r13) ld r10,TI_LOCAL_FLAGS(r11) ori r10,r10,_TLF_NAPPING std r10,TI_LOCAL_FLAGS(r11) diff --git a/arch/powerpc/kernel/idle_e500.S b/arch/powerpc/kernel/idle_e500.S index 583e55ac7d26..69dfcd2ca011 100644 --- a/arch/powerpc/kernel/idle_e500.S +++ b/arch/powerpc/kernel/idle_e500.S @@ -22,10 +22,9 @@ .text _GLOBAL(e500_idle) - CURRENT_THREAD_INFO(r3, r1) - lwz r4,TI_LOCAL_FLAGS(r3) /* set napping bit */ + lwz r4,TI_LOCAL_FLAGS(r2) /* set napping bit */ ori r4,r4,_TLF_NAPPING /* so when we take an exception */ - stw r4,TI_LOCAL_FLAGS(r3) /* it will return to our caller */ + stw r4,TI_LOCAL_FLAGS(r2) /* it will return to our caller */ #ifdef CONFIG_PPC_E500MC wrteei 1 @@ -88,8 +87,7 @@ _GLOBAL(power_save_ppc32_restore) stw r9,_NIP(r11) /* make it do a blr */ #ifdef CONFIG_SMP - CURRENT_THREAD_INFO(r12, r1) - lwz r11,TI_CPU(r12) /* get cpu number * 4 */ + lwz r11,TASK_CPU(r2) /* get cpu number * 4 */ slwi r11,r11,2 #else li r11,0 diff --git a/arch/powerpc/kernel/idle_power4.S b/arch/powerpc/kernel/idle_power4.S index a09b3c7ca176..a2fdb0a34b75 100644 --- a/arch/powerpc/kernel/idle_power4.S +++ b/arch/powerpc/kernel/idle_power4.S @@ -68,7 +68,7 @@ BEGIN_FTR_SECTION DSSALL sync END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) - CURRENT_THREAD_INFO(r9, r1) + ld r9, PACA_THREAD_INFO(r13) ld r8,TI_LOCAL_FLAGS(r9) /* set napping bit */ ori r8,r8,_TLF_NAPPING /* so when we take an exception */ std r8,TI_LOCAL_FLAGS(r9) /* it will return to our caller */ diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 916ddc4aac44..8a936723c791 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -618,9 +618,8 @@ static inline void check_stack_overflow(void) sp = current_stack_pointer() & (THREAD_SIZE-1); /* check for stack overflow: is there less than 2KB free? */ - if (unlikely(sp < (sizeof(struct thread_info) + 2048))) { - pr_err("do_IRQ: stack overflow: %ld\n", - sp - sizeof(struct thread_info)); + if (unlikely(sp < 2048)) { + pr_err("do_IRQ: stack overflow: %ld\n", sp); dump_stack(); } #endif @@ -660,36 +659,21 @@ void __do_irq(struct pt_regs *regs) void do_IRQ(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); - struct thread_info *curtp, *irqtp, *sirqtp; + void *cursp, *irqsp, *sirqsp; /* Switch to the irq stack to handle this */ - curtp = current_thread_info(); - irqtp = hardirq_ctx[raw_smp_processor_id()]; - sirqtp = softirq_ctx[raw_smp_processor_id()]; + cursp = (void *)(current_stack_pointer() & ~(THREAD_SIZE - 1)); + irqsp = hardirq_ctx[raw_smp_processor_id()]; + sirqsp = softirq_ctx[raw_smp_processor_id()]; /* Already there ? */ - if (unlikely(curtp == irqtp || curtp == sirqtp)) { + if (unlikely(cursp == irqsp || cursp == sirqsp)) { __do_irq(regs); set_irq_regs(old_regs); return; } - - /* Prepare the thread_info in the irq stack */ - irqtp->task = curtp->task; - irqtp->flags = 0; - - /* Copy the preempt_count so that the [soft]irq checks work. */ - irqtp->preempt_count = curtp->preempt_count; - /* Switch stack and call */ - call_do_irq(regs, irqtp); - - /* Restore stack limit */ - irqtp->task = NULL; - - /* Copy back updates to the thread_info */ - if (irqtp->flags) - set_bits(irqtp->flags, &curtp->flags); + call_do_irq(regs, irqsp); set_irq_regs(old_regs); } @@ -698,90 +682,20 @@ void __init init_IRQ(void) { if (ppc_md.init_IRQ) ppc_md.init_IRQ(); - - exc_lvl_ctx_init(); - - irq_ctx_init(); } #if defined(CONFIG_BOOKE) || defined(CONFIG_40x) -struct thread_info *critirq_ctx[NR_CPUS] __read_mostly; -struct thread_info *dbgirq_ctx[NR_CPUS] __read_mostly; -struct thread_info *mcheckirq_ctx[NR_CPUS] __read_mostly; - -void exc_lvl_ctx_init(void) -{ - struct thread_info *tp; - int i, cpu_nr; - - for_each_possible_cpu(i) { -#ifdef CONFIG_PPC64 - cpu_nr = i; -#else -#ifdef CONFIG_SMP - cpu_nr = get_hard_smp_processor_id(i); -#else - cpu_nr = 0; -#endif +void *critirq_ctx[NR_CPUS] __read_mostly; +void *dbgirq_ctx[NR_CPUS] __read_mostly; +void *mcheckirq_ctx[NR_CPUS] __read_mostly; #endif - memset((void *)critirq_ctx[cpu_nr], 0, THREAD_SIZE); - tp = critirq_ctx[cpu_nr]; - tp->cpu = cpu_nr; - tp->preempt_count = 0; - -#ifdef CONFIG_BOOKE - memset((void *)dbgirq_ctx[cpu_nr], 0, THREAD_SIZE); - tp = dbgirq_ctx[cpu_nr]; - tp->cpu = cpu_nr; - tp->preempt_count = 0; - - memset((void *)mcheckirq_ctx[cpu_nr], 0, THREAD_SIZE); - tp = mcheckirq_ctx[cpu_nr]; - tp->cpu = cpu_nr; - tp->preempt_count = HARDIRQ_OFFSET; -#endif - } -} -#endif - -struct thread_info *softirq_ctx[NR_CPUS] __read_mostly; -struct thread_info *hardirq_ctx[NR_CPUS] __read_mostly; - -void irq_ctx_init(void) -{ - struct thread_info *tp; - int i; - - for_each_possible_cpu(i) { - memset((void *)softirq_ctx[i], 0, THREAD_SIZE); - tp = softirq_ctx[i]; - tp->cpu = i; - klp_init_thread_info(tp); - - memset((void *)hardirq_ctx[i], 0, THREAD_SIZE); - tp = hardirq_ctx[i]; - tp->cpu = i; - klp_init_thread_info(tp); - } -} +void *softirq_ctx[NR_CPUS] __read_mostly; +void *hardirq_ctx[NR_CPUS] __read_mostly; void do_softirq_own_stack(void) { - struct thread_info *curtp, *irqtp; - - curtp = current_thread_info(); - irqtp = softirq_ctx[smp_processor_id()]; - irqtp->task = curtp->task; - irqtp->flags = 0; - call_do_softirq(irqtp); - irqtp->task = NULL; - - /* Set any flag that may have been set on the - * alternate stack - */ - if (irqtp->flags) - set_bits(irqtp->flags, &curtp->flags); + call_do_softirq(softirq_ctx[smp_processor_id()]); } irq_hw_number_t virq_to_hw(unsigned int virq) @@ -827,11 +741,6 @@ int irq_choose_cpu(const struct cpumask *mask) } #endif -int arch_early_irq_init(void) -{ - return 0; -} - #ifdef CONFIG_PPC64 static int __init setup_noirqdistrib(char *str) { diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c index e1865565f0ae..7dd55eb1259d 100644 --- a/arch/powerpc/kernel/kgdb.c +++ b/arch/powerpc/kernel/kgdb.c @@ -151,41 +151,13 @@ static int kgdb_handle_breakpoint(struct pt_regs *regs) return 1; } -static DEFINE_PER_CPU(struct thread_info, kgdb_thread_info); static int kgdb_singlestep(struct pt_regs *regs) { - struct thread_info *thread_info, *exception_thread_info; - struct thread_info *backup_current_thread_info = - this_cpu_ptr(&kgdb_thread_info); - if (user_mode(regs)) return 0; - /* - * On Book E and perhaps other processors, singlestep is handled on - * the critical exception stack. This causes current_thread_info() - * to fail, since it it locates the thread_info by masking off - * the low bits of the current stack pointer. We work around - * this issue by copying the thread_info from the kernel stack - * before calling kgdb_handle_exception, and copying it back - * afterwards. On most processors the copy is avoided since - * exception_thread_info == thread_info. - */ - thread_info = (struct thread_info *)(regs->gpr[1] & ~(THREAD_SIZE-1)); - exception_thread_info = current_thread_info(); - - if (thread_info != exception_thread_info) { - /* Save the original current_thread_info. */ - memcpy(backup_current_thread_info, exception_thread_info, sizeof *thread_info); - memcpy(exception_thread_info, thread_info, sizeof *thread_info); - } - kgdb_handle_exception(0, SIGTRAP, 0, regs); - if (thread_info != exception_thread_info) - /* Restore current_thread_info lastly. */ - memcpy(exception_thread_info, backup_current_thread_info, sizeof *thread_info); - return 1; } diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c index a0f6f45005bd..75692c327ba0 100644 --- a/arch/powerpc/kernel/machine_kexec_64.c +++ b/arch/powerpc/kernel/machine_kexec_64.c @@ -317,10 +317,8 @@ void default_machine_kexec(struct kimage *image) * We setup preempt_count to avoid using VMX in memcpy. * XXX: the task struct will likely be invalid once we do the copy! */ - kexec_stack.thread_info.task = current_thread_info()->task; - kexec_stack.thread_info.flags = 0; - kexec_stack.thread_info.preempt_count = HARDIRQ_OFFSET; - kexec_stack.thread_info.cpu = current_thread_info()->cpu; + current_thread_info()->flags = 0; + current_thread_info()->preempt_count = HARDIRQ_OFFSET; /* We need a static PACA, too; copy this CPU's PACA over and switch to * it. Also poison per_cpu_offset and NULL lppaca to catch anyone using diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index bd933a75f0bc..b5fec1f9751a 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -31,6 +31,7 @@ #include <asm/machdep.h> #include <asm/mce.h> +#include <asm/nmi.h> static DEFINE_PER_CPU(int, mce_nest_count); static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event); @@ -301,13 +302,13 @@ static void machine_check_process_queued_event(struct irq_work *work) while (__this_cpu_read(mce_queue_count) > 0) { index = __this_cpu_read(mce_queue_count) - 1; evt = this_cpu_ptr(&mce_event_queue[index]); - machine_check_print_event_info(evt, false); + machine_check_print_event_info(evt, false, false); __this_cpu_dec(mce_queue_count); } } void machine_check_print_event_info(struct machine_check_event *evt, - bool user_mode) + bool user_mode, bool in_guest) { const char *level, *sevstr, *subtype; static const char *mc_ue_types[] = { @@ -387,7 +388,9 @@ void machine_check_print_event_info(struct machine_check_event *evt, evt->disposition == MCE_DISPOSITION_RECOVERED ? "Recovered" : "Not recovered"); - if (user_mode) { + if (in_guest) { + printk("%s Guest NIP: %016llx\n", level, evt->srr0); + } else if (user_mode) { printk("%s NIP: [%016llx] PID: %d Comm: %s\n", level, evt->srr0, current->pid, current->comm); } else { @@ -488,6 +491,8 @@ long machine_check_early(struct pt_regs *regs) { long handled = 0; + hv_nmi_check_nonrecoverable(regs); + /* * See if platform is capable of handling machine check. */ diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index 57d2ffb2d45c..0dda4f8e3d7a 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -46,11 +46,10 @@ _GLOBAL(call_do_softirq) mflr r0 stw r0,4(r1) lwz r10,THREAD+KSP_LIMIT(r2) - addi r11,r3,THREAD_INFO_GAP + stw r3, THREAD+KSP_LIMIT(r2) stwu r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r3) mr r1,r3 stw r10,8(r1) - stw r11,THREAD+KSP_LIMIT(r2) bl __do_softirq lwz r10,8(r1) lwz r1,0(r1) @@ -60,17 +59,16 @@ _GLOBAL(call_do_softirq) blr /* - * void call_do_irq(struct pt_regs *regs, struct thread_info *irqtp); + * void call_do_irq(struct pt_regs *regs, void *sp); */ _GLOBAL(call_do_irq) mflr r0 stw r0,4(r1) lwz r10,THREAD+KSP_LIMIT(r2) - addi r11,r4,THREAD_INFO_GAP + stw r4, THREAD+KSP_LIMIT(r2) stwu r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r4) mr r1,r4 stw r10,8(r1) - stw r11,THREAD+KSP_LIMIT(r2) bl __do_irq lwz r10,8(r1) lwz r1,0(r1) @@ -183,10 +181,13 @@ _GLOBAL(low_choose_750fx_pll) or r4,r4,r5 mtspr SPRN_HID1,r4 +#ifdef CONFIG_SMP /* Store new HID1 image */ - CURRENT_THREAD_INFO(r6, r1) - lwz r6,TI_CPU(r6) + lwz r6,TASK_CPU(r2) slwi r6,r6,2 +#else + li r6, 0 +#endif addis r6,r6,nap_save_hid1@ha stw r4,nap_save_hid1@l(r6) @@ -599,7 +600,7 @@ EXPORT_SYMBOL(__bswapdi2) #ifdef CONFIG_SMP _GLOBAL(start_secondary_resume) /* Reset stack */ - CURRENT_THREAD_INFO(r1, r1) + rlwinm r1, r1, 0, 0, 31 - THREAD_SHIFT addi r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD li r3,0 stw r3,0(r1) /* Zero the stack frame pointer */ diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 4538e8ddde80..ff4b7539cbdf 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -63,19 +63,13 @@ resource_size_t isa_mem_base; EXPORT_SYMBOL(isa_mem_base); -static const struct dma_map_ops *pci_dma_ops = &dma_nommu_ops; +static const struct dma_map_ops *pci_dma_ops; void set_pci_dma_ops(const struct dma_map_ops *dma_ops) { pci_dma_ops = dma_ops; } -const struct dma_map_ops *get_pci_dma_ops(void) -{ - return pci_dma_ops; -} -EXPORT_SYMBOL(get_pci_dma_ops); - /* * This function should run under locking protection, specifically * hose_spinlock. @@ -358,6 +352,17 @@ struct pci_controller* pci_find_hose_for_OF_device(struct device_node* node) return NULL; } +struct pci_controller *pci_find_controller_for_domain(int domain_nr) +{ + struct pci_controller *hose; + + list_for_each_entry(hose, &hose_list, list_node) + if (hose->global_number == domain_nr) + return hose; + + return NULL; +} + /* * Reads the interrupt pin to determine if interrupt is use by card. * If the interrupt is used, then gets the interrupt line from the @@ -973,7 +978,7 @@ static void pcibios_setup_device(struct pci_dev *dev) /* Hook up default DMA ops */ set_dma_ops(&dev->dev, pci_dma_ops); - set_dma_offset(&dev->dev, PCI_DRAM_OFFSET); + dev->dev.archdata.dma_offset = PCI_DRAM_OFFSET; /* Additional platform DMA/iommu setup */ phb = pci_bus_to_host(dev->bus); diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index ce393df243aa..dd9e0d5386ee 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -176,7 +176,7 @@ static void __giveup_fpu(struct task_struct *tsk) save_fpu(tsk); msr = tsk->thread.regs->msr; - msr &= ~MSR_FP; + msr &= ~(MSR_FP|MSR_FE0|MSR_FE1); #ifdef CONFIG_VSX if (cpu_has_feature(CPU_FTR_VSX)) msr &= ~MSR_VSX; @@ -1231,8 +1231,8 @@ struct task_struct *__switch_to(struct task_struct *prev, batch->active = 1; } - if (current_thread_info()->task->thread.regs) { - restore_math(current_thread_info()->task->thread.regs); + if (current->thread.regs) { + restore_math(current->thread.regs); /* * The copy-paste buffer can only store into foreign real @@ -1242,7 +1242,7 @@ struct task_struct *__switch_to(struct task_struct *prev, * mappings, we must issue a cp_abort to clear any state and * prevent snooping, corruption or a covert channel. */ - if (current_thread_info()->task->thread.used_vas) + if (current->thread.used_vas) asm volatile(PPC_CP_ABORT); } #endif /* CONFIG_PPC_BOOK3S_64 */ @@ -1634,7 +1634,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long sp = (unsigned long)task_stack_page(p) + THREAD_SIZE; struct thread_info *ti = task_thread_info(p); - klp_init_thread_info(ti); + klp_init_thread_info(p); /* Copy registers */ sp -= sizeof(struct pt_regs); @@ -1691,8 +1691,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, sp -= STACK_FRAME_OVERHEAD; p->thread.ksp = sp; #ifdef CONFIG_PPC32 - p->thread.ksp_limit = (unsigned long)task_stack_page(p) + - _ALIGN_UP(sizeof(struct thread_info), 16); + p->thread.ksp_limit = (unsigned long)end_of_stack(p); #endif #ifdef CONFIG_HAVE_HW_BREAKPOINT p->thread.ptrace_bps[0] = NULL; @@ -1995,21 +1994,14 @@ static inline int valid_irq_stack(unsigned long sp, struct task_struct *p, unsigned long stack_page; unsigned long cpu = task_cpu(p); - /* - * Avoid crashing if the stack has overflowed and corrupted - * task_cpu(p), which is in the thread_info struct. - */ - if (cpu < NR_CPUS && cpu_possible(cpu)) { - stack_page = (unsigned long) hardirq_ctx[cpu]; - if (sp >= stack_page + sizeof(struct thread_struct) - && sp <= stack_page + THREAD_SIZE - nbytes) - return 1; - - stack_page = (unsigned long) softirq_ctx[cpu]; - if (sp >= stack_page + sizeof(struct thread_struct) - && sp <= stack_page + THREAD_SIZE - nbytes) - return 1; - } + stack_page = (unsigned long)hardirq_ctx[cpu]; + if (sp >= stack_page && sp <= stack_page + THREAD_SIZE - nbytes) + return 1; + + stack_page = (unsigned long)softirq_ctx[cpu]; + if (sp >= stack_page && sp <= stack_page + THREAD_SIZE - nbytes) + return 1; + return 0; } @@ -2018,8 +2010,10 @@ int validate_sp(unsigned long sp, struct task_struct *p, { unsigned long stack_page = (unsigned long)task_stack_page(p); - if (sp >= stack_page + sizeof(struct thread_struct) - && sp <= stack_page + THREAD_SIZE - nbytes) + if (sp < THREAD_SIZE) + return 0; + + if (sp >= stack_page && sp <= stack_page + THREAD_SIZE - nbytes) return 1; return valid_irq_stack(sp, p, nbytes); @@ -2027,7 +2021,7 @@ int validate_sp(unsigned long sp, struct task_struct *p, EXPORT_SYMBOL(validate_sp); -unsigned long get_wchan(struct task_struct *p) +static unsigned long __get_wchan(struct task_struct *p) { unsigned long ip, sp; int count = 0; @@ -2053,6 +2047,20 @@ unsigned long get_wchan(struct task_struct *p) return 0; } +unsigned long get_wchan(struct task_struct *p) +{ + unsigned long ret; + + if (!try_get_task_stack(p)) + return 0; + + ret = __get_wchan(p); + + put_task_stack(p); + + return ret; +} + static int kstack_depth_to_print = CONFIG_PRINT_STACK_DEPTH; void show_stack(struct task_struct *tsk, unsigned long *stack) @@ -2067,9 +2075,13 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) int curr_frame = 0; #endif - sp = (unsigned long) stack; if (tsk == NULL) tsk = current; + + if (!try_get_task_stack(tsk)) + return; + + sp = (unsigned long) stack; if (sp == 0) { if (tsk == current) sp = current_stack_pointer(); @@ -2081,7 +2093,7 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) printk("Call Trace:\n"); do { if (!validate_sp(sp, tsk, STACK_FRAME_OVERHEAD)) - return; + break; stack = (unsigned long *) sp; newsp = stack[0]; @@ -2121,6 +2133,8 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) sp = newsp; } while (count++ < kstack_depth_to_print); + + put_task_stack(tsk); } #ifdef CONFIG_PPC64 diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index cdd5d1d3ae41..d9ac7d94656e 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -33,6 +33,7 @@ #include <linux/hw_breakpoint.h> #include <linux/perf_event.h> #include <linux/context_tracking.h> +#include <linux/nospec.h> #include <linux/uaccess.h> #include <linux/pkeys.h> @@ -274,6 +275,8 @@ static int set_user_trap(struct task_struct *task, unsigned long trap) */ int ptrace_get_reg(struct task_struct *task, int regno, unsigned long *data) { + unsigned int regs_max; + if ((task->thread.regs == NULL) || !data) return -EIO; @@ -297,7 +300,9 @@ int ptrace_get_reg(struct task_struct *task, int regno, unsigned long *data) } #endif - if (regno < (sizeof(struct user_pt_regs) / sizeof(unsigned long))) { + regs_max = sizeof(struct user_pt_regs) / sizeof(unsigned long); + if (regno < regs_max) { + regno = array_index_nospec(regno, regs_max); *data = ((unsigned long *)task->thread.regs)[regno]; return 0; } @@ -321,6 +326,7 @@ int ptrace_put_reg(struct task_struct *task, int regno, unsigned long data) return set_user_dscr(task, data); if (regno <= PT_MAX_PUT_REG) { + regno = array_index_nospec(regno, PT_MAX_PUT_REG + 1); ((unsigned long *)task->thread.regs)[regno] = data; return 0; } @@ -561,6 +567,7 @@ static int vr_get(struct task_struct *target, const struct user_regset *regset, /* * Copy out only the low-order word of vrsave. */ + int start, end; union { elf_vrreg_t reg; u32 word; @@ -569,8 +576,10 @@ static int vr_get(struct task_struct *target, const struct user_regset *regset, vrsave.word = target->thread.vrsave; + start = 33 * sizeof(vector128); + end = start + sizeof(vrsave); ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &vrsave, - 33 * sizeof(vector128), -1); + start, end); } return ret; @@ -608,6 +617,7 @@ static int vr_set(struct task_struct *target, const struct user_regset *regset, /* * We use only the first word of vrsave. */ + int start, end; union { elf_vrreg_t reg; u32 word; @@ -616,8 +626,10 @@ static int vr_set(struct task_struct *target, const struct user_regset *regset, vrsave.word = target->thread.vrsave; + start = 33 * sizeof(vector128); + end = start + sizeof(vrsave); ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &vrsave, - 33 * sizeof(vector128), -1); + start, end); if (!ret) target->thread.vrsave = vrsave.word; } diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 82be48c123cf..f17868e19e2c 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -634,7 +634,7 @@ void probe_machine(void) } /* What can we do if we didn't find ? */ if (machine_id >= &__machine_desc_end) { - DBG("No suitable machine found !\n"); + pr_err("No suitable machine description found !\n"); for (;;); } @@ -791,7 +791,6 @@ void arch_setup_pdev_archdata(struct platform_device *pdev) { pdev->archdata.dma_mask = DMA_BIT_MASK(32); pdev->dev.dma_mask = &pdev->archdata.dma_mask; - set_dma_ops(&pdev->dev, &dma_nommu_ops); } static __init void print_system_info(void) @@ -938,7 +937,7 @@ void __init setup_arch(char **cmdline_p) /* Reserve large chunks of memory for use by CMA for KVM. */ kvm_cma_reserve(); - klp_init_thread_info(&init_thread_info); + klp_init_thread_info(&init_task); init_mm.start_code = (unsigned long)_stext; init_mm.end_code = (unsigned long) _etext; diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index c31082233a25..4a65e08a6042 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -162,6 +162,17 @@ static int __init ppc_init(void) } arch_initcall(ppc_init); +static void *__init alloc_stack(void) +{ + void *ptr = memblock_alloc(THREAD_SIZE, THREAD_SIZE); + + if (!ptr) + panic("cannot allocate %d bytes for stack at %pS\n", + THREAD_SIZE, (void *)_RET_IP_); + + return ptr; +} + void __init irqstack_early_init(void) { unsigned int i; @@ -169,10 +180,8 @@ void __init irqstack_early_init(void) /* interrupt stacks must be in lowmem, we get that for free on ppc32 * as the memblock is limited to lowmem by default */ for_each_possible_cpu(i) { - softirq_ctx[i] = (struct thread_info *) - __va(memblock_phys_alloc(THREAD_SIZE, THREAD_SIZE)); - hardirq_ctx[i] = (struct thread_info *) - __va(memblock_phys_alloc(THREAD_SIZE, THREAD_SIZE)); + softirq_ctx[i] = alloc_stack(); + hardirq_ctx[i] = alloc_stack(); } } @@ -190,13 +199,10 @@ void __init exc_lvl_early_init(void) hw_cpu = 0; #endif - critirq_ctx[hw_cpu] = (struct thread_info *) - __va(memblock_phys_alloc(THREAD_SIZE, THREAD_SIZE)); + critirq_ctx[hw_cpu] = alloc_stack(); #ifdef CONFIG_BOOKE - dbgirq_ctx[hw_cpu] = (struct thread_info *) - __va(memblock_phys_alloc(THREAD_SIZE, THREAD_SIZE)); - mcheckirq_ctx[hw_cpu] = (struct thread_info *) - __va(memblock_phys_alloc(THREAD_SIZE, THREAD_SIZE)); + dbgirq_ctx[hw_cpu] = alloc_stack(); + mcheckirq_ctx[hw_cpu] = alloc_stack(); #endif } } diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 5de413ae3cd6..ff0aac42bb33 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -634,19 +634,17 @@ __init u64 ppc64_bolted_size(void) static void *__init alloc_stack(unsigned long limit, int cpu) { - unsigned long pa; + void *ptr; BUILD_BUG_ON(STACK_INT_FRAME_SIZE % 16); - pa = memblock_alloc_base_nid(THREAD_SIZE, THREAD_SIZE, limit, - early_cpu_to_node(cpu), MEMBLOCK_NONE); - if (!pa) { - pa = memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit); - if (!pa) - panic("cannot allocate stacks"); - } + ptr = memblock_alloc_try_nid(THREAD_SIZE, THREAD_SIZE, + MEMBLOCK_LOW_LIMIT, limit, + early_cpu_to_node(cpu)); + if (!ptr) + panic("cannot allocate stacks"); - return __va(pa); + return ptr; } void __init irqstack_early_init(void) @@ -692,24 +690,6 @@ void __init exc_lvl_early_init(void) #endif /* - * Emergency stacks are used for a range of things, from asynchronous - * NMIs (system reset, machine check) to synchronous, process context. - * We set preempt_count to zero, even though that isn't necessarily correct. To - * get the right value we'd need to copy it from the previous thread_info, but - * doing that might fault causing more problems. - * TODO: what to do with accounting? - */ -static void emerg_stack_init_thread_info(struct thread_info *ti, int cpu) -{ - ti->task = NULL; - ti->cpu = cpu; - ti->preempt_count = 0; - ti->local_flags = 0; - ti->flags = 0; - klp_init_thread_info(ti); -} - -/* * Stack space used when we detect a bad kernel stack pointer, and * early in SMP boots before relocation is enabled. Exclusive emergency * stack for machine checks. @@ -736,25 +716,14 @@ void __init emergency_stack_init(void) limit = min(ppc64_bolted_size(), ppc64_rma_size); for_each_possible_cpu(i) { - struct thread_info *ti; - - ti = alloc_stack(limit, i); - memset(ti, 0, THREAD_SIZE); - emerg_stack_init_thread_info(ti, i); - paca_ptrs[i]->emergency_sp = (void *)ti + THREAD_SIZE; + paca_ptrs[i]->emergency_sp = alloc_stack(limit, i) + THREAD_SIZE; #ifdef CONFIG_PPC_BOOK3S_64 /* emergency stack for NMI exception handling. */ - ti = alloc_stack(limit, i); - memset(ti, 0, THREAD_SIZE); - emerg_stack_init_thread_info(ti, i); - paca_ptrs[i]->nmi_emergency_sp = (void *)ti + THREAD_SIZE; + paca_ptrs[i]->nmi_emergency_sp = alloc_stack(limit, i) + THREAD_SIZE; /* emergency stack for machine check exception handling. */ - ti = alloc_stack(limit, i); - memset(ti, 0, THREAD_SIZE); - emerg_stack_init_thread_info(ti, i); - paca_ptrs[i]->mc_emergency_sp = (void *)ti + THREAD_SIZE; + paca_ptrs[i]->mc_emergency_sp = alloc_stack(limit, i) + THREAD_SIZE; #endif } } diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 3f15edf25a0d..e784342bdaa1 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -20,6 +20,7 @@ #include <linux/kernel.h> #include <linux/export.h> #include <linux/sched/mm.h> +#include <linux/sched/task_stack.h> #include <linux/sched/topology.h> #include <linux/smp.h> #include <linux/interrupt.h> @@ -75,7 +76,7 @@ static DEFINE_PER_CPU(int, cpu_state) = { 0 }; #endif -struct thread_info *secondary_ti; +struct task_struct *secondary_current; bool has_big_cores; DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map); @@ -358,13 +359,12 @@ void arch_send_call_function_ipi_mask(const struct cpumask *mask) * NMI IPIs may not be recoverable, so should not be used as ongoing part of * a running system. They can be used for crash, debug, halt/reboot, etc. * - * NMI IPIs are globally single threaded. No more than one in progress at - * any time. - * * The IPI call waits with interrupts disabled until all targets enter the - * NMI handler, then the call returns. + * NMI handler, then returns. Subsequent IPIs can be issued before targets + * have returned from their handlers, so there is no guarantee about + * concurrency or re-entrancy. * - * No new NMI can be initiated until targets exit the handler. + * A new NMI can be issued before all targets exit the handler. * * The IPI call may time out without all targets entering the NMI handler. * In that case, there is some logic to recover (and ignore subsequent @@ -375,7 +375,7 @@ void arch_send_call_function_ipi_mask(const struct cpumask *mask) static atomic_t __nmi_ipi_lock = ATOMIC_INIT(0); static struct cpumask nmi_ipi_pending_mask; -static int nmi_ipi_busy_count = 0; +static bool nmi_ipi_busy = false; static void (*nmi_ipi_function)(struct pt_regs *) = NULL; static void nmi_ipi_lock_start(unsigned long *flags) @@ -414,7 +414,7 @@ static void nmi_ipi_unlock_end(unsigned long *flags) */ int smp_handle_nmi_ipi(struct pt_regs *regs) { - void (*fn)(struct pt_regs *); + void (*fn)(struct pt_regs *) = NULL; unsigned long flags; int me = raw_smp_processor_id(); int ret = 0; @@ -425,29 +425,17 @@ int smp_handle_nmi_ipi(struct pt_regs *regs) * because the caller may have timed out. */ nmi_ipi_lock_start(&flags); - if (!nmi_ipi_busy_count) - goto out; - if (!cpumask_test_cpu(me, &nmi_ipi_pending_mask)) - goto out; - - fn = nmi_ipi_function; - if (!fn) - goto out; - - cpumask_clear_cpu(me, &nmi_ipi_pending_mask); - nmi_ipi_busy_count++; - nmi_ipi_unlock(); - - ret = 1; - - fn(regs); - - nmi_ipi_lock(); - if (nmi_ipi_busy_count > 1) /* Can race with caller time-out */ - nmi_ipi_busy_count--; -out: + if (cpumask_test_cpu(me, &nmi_ipi_pending_mask)) { + cpumask_clear_cpu(me, &nmi_ipi_pending_mask); + fn = READ_ONCE(nmi_ipi_function); + WARN_ON_ONCE(!fn); + ret = 1; + } nmi_ipi_unlock_end(&flags); + if (fn) + fn(regs); + return ret; } @@ -473,9 +461,10 @@ static void do_smp_send_nmi_ipi(int cpu, bool safe) * - cpu is the target CPU (must not be this CPU), or NMI_IPI_ALL_OTHERS. * - fn is the target callback function. * - delay_us > 0 is the delay before giving up waiting for targets to - * complete executing the handler, == 0 specifies indefinite delay. + * begin executing the handler, == 0 specifies indefinite delay. */ -int __smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us, bool safe) +static int __smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), + u64 delay_us, bool safe) { unsigned long flags; int me = raw_smp_processor_id(); @@ -487,31 +476,33 @@ int __smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us, bool if (unlikely(!smp_ops)) return 0; - /* Take the nmi_ipi_busy count/lock with interrupts hard disabled */ nmi_ipi_lock_start(&flags); - while (nmi_ipi_busy_count) { + while (nmi_ipi_busy) { nmi_ipi_unlock_end(&flags); - spin_until_cond(nmi_ipi_busy_count == 0); + spin_until_cond(!nmi_ipi_busy); nmi_ipi_lock_start(&flags); } - + nmi_ipi_busy = true; nmi_ipi_function = fn; + WARN_ON_ONCE(!cpumask_empty(&nmi_ipi_pending_mask)); + if (cpu < 0) { /* ALL_OTHERS */ cpumask_copy(&nmi_ipi_pending_mask, cpu_online_mask); cpumask_clear_cpu(me, &nmi_ipi_pending_mask); } else { - /* cpumask starts clear */ cpumask_set_cpu(cpu, &nmi_ipi_pending_mask); } - nmi_ipi_busy_count++; + nmi_ipi_unlock(); + /* Interrupts remain hard disabled */ + do_smp_send_nmi_ipi(cpu, safe); nmi_ipi_lock(); - /* nmi_ipi_busy_count is held here, so unlock/lock is okay */ + /* nmi_ipi_busy is set here, so unlock/lock is okay */ while (!cpumask_empty(&nmi_ipi_pending_mask)) { nmi_ipi_unlock(); udelay(1); @@ -523,29 +514,15 @@ int __smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us, bool } } - while (nmi_ipi_busy_count > 1) { - nmi_ipi_unlock(); - udelay(1); - nmi_ipi_lock(); - if (delay_us) { - delay_us--; - if (!delay_us) - break; - } - } - if (!cpumask_empty(&nmi_ipi_pending_mask)) { /* Timeout waiting for CPUs to call smp_handle_nmi_ipi */ ret = 0; cpumask_clear(&nmi_ipi_pending_mask); } - if (nmi_ipi_busy_count > 1) { - /* Timeout waiting for CPUs to execute fn */ - ret = 0; - nmi_ipi_busy_count = 1; - } - nmi_ipi_busy_count--; + nmi_ipi_function = NULL; + nmi_ipi_busy = false; + nmi_ipi_unlock_end(&flags); return ret; @@ -613,17 +590,8 @@ void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *)) static void nmi_stop_this_cpu(struct pt_regs *regs) { /* - * This is a special case because it never returns, so the NMI IPI - * handling would never mark it as done, which makes any later - * smp_send_nmi_ipi() call spin forever. Mark it done now. - * * IRQs are already hard disabled by the smp_handle_nmi_ipi. */ - nmi_ipi_lock(); - if (nmi_ipi_busy_count > 1) - nmi_ipi_busy_count--; - nmi_ipi_unlock(); - spin_begin(); while (1) spin_cpu_relax(); @@ -663,7 +631,7 @@ void smp_send_stop(void) } #endif /* CONFIG_NMI_IPI */ -struct thread_info *current_set[NR_CPUS]; +struct task_struct *current_set[NR_CPUS]; static void smp_store_cpu_info(int id) { @@ -928,7 +896,7 @@ void smp_prepare_boot_cpu(void) paca_ptrs[boot_cpuid]->__current = current; #endif set_numa_node(numa_cpu_lookup_table[boot_cpuid]); - current_set[boot_cpuid] = task_thread_info(current); + current_set[boot_cpuid] = current; } #ifdef CONFIG_HOTPLUG_CPU @@ -1013,14 +981,13 @@ static bool secondaries_inhibited(void) static void cpu_idle_thread_init(unsigned int cpu, struct task_struct *idle) { - struct thread_info *ti = task_thread_info(idle); - #ifdef CONFIG_PPC64 paca_ptrs[cpu]->__current = idle; - paca_ptrs[cpu]->kstack = (unsigned long)ti + THREAD_SIZE - STACK_FRAME_OVERHEAD; + paca_ptrs[cpu]->kstack = (unsigned long)task_stack_page(idle) + + THREAD_SIZE - STACK_FRAME_OVERHEAD; #endif - ti->cpu = cpu; - secondary_ti = current_set[cpu] = ti; + idle->cpu = cpu; + secondary_current = current_set[cpu] = idle; } int __cpu_up(unsigned int cpu, struct task_struct *tidle) diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c index e2c50b55138f..1e2276963f6d 100644 --- a/arch/powerpc/kernel/stacktrace.c +++ b/arch/powerpc/kernel/stacktrace.c @@ -67,12 +67,17 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) { unsigned long sp; + if (!try_get_task_stack(tsk)) + return; + if (tsk == current) sp = current_stack_pointer(); else sp = tsk->thread.ksp; save_context_stack(trace, sp, tsk, 0); + + put_task_stack(tsk); } EXPORT_SYMBOL_GPL(save_stack_trace_tsk); @@ -84,25 +89,21 @@ save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) EXPORT_SYMBOL_GPL(save_stack_trace_regs); #ifdef CONFIG_HAVE_RELIABLE_STACKTRACE -int -save_stack_trace_tsk_reliable(struct task_struct *tsk, - struct stack_trace *trace) +/* + * This function returns an error if it detects any unreliable features of the + * stack. Otherwise it guarantees that the stack trace is reliable. + * + * If the task is not 'current', the caller *must* ensure the task is inactive. + */ +static int __save_stack_trace_tsk_reliable(struct task_struct *tsk, + struct stack_trace *trace) { unsigned long sp; + unsigned long newsp; unsigned long stack_page = (unsigned long)task_stack_page(tsk); unsigned long stack_end; int graph_idx = 0; - - /* - * The last frame (unwinding first) may not yet have saved - * its LR onto the stack. - */ - int firstframe = 1; - - if (tsk == current) - sp = current_stack_pointer(); - else - sp = tsk->thread.ksp; + bool firstframe; stack_end = stack_page + THREAD_SIZE; if (!is_idle_task(tsk)) { @@ -129,40 +130,53 @@ save_stack_trace_tsk_reliable(struct task_struct *tsk, stack_end -= STACK_FRAME_OVERHEAD; } + if (tsk == current) + sp = current_stack_pointer(); + else + sp = tsk->thread.ksp; + if (sp < stack_page + sizeof(struct thread_struct) || sp > stack_end - STACK_FRAME_MIN_SIZE) { - return 1; + return -EINVAL; } - for (;;) { + for (firstframe = true; sp != stack_end; + firstframe = false, sp = newsp) { unsigned long *stack = (unsigned long *) sp; - unsigned long newsp, ip; + unsigned long ip; /* sanity check: ABI requires SP to be aligned 16 bytes. */ if (sp & 0xF) - return 1; - - /* Mark stacktraces with exception frames as unreliable. */ - if (sp <= stack_end - STACK_INT_FRAME_SIZE && - stack[STACK_FRAME_MARKER] == STACK_FRAME_REGS_MARKER) { - return 1; - } + return -EINVAL; newsp = stack[0]; /* Stack grows downwards; unwinder may only go up. */ if (newsp <= sp) - return 1; + return -EINVAL; if (newsp != stack_end && newsp > stack_end - STACK_FRAME_MIN_SIZE) { - return 1; /* invalid backlink, too far up. */ + return -EINVAL; /* invalid backlink, too far up. */ + } + + /* + * We can only trust the bottom frame's backlink, the + * rest of the frame may be uninitialized, continue to + * the next. + */ + if (firstframe) + continue; + + /* Mark stacktraces with exception frames as unreliable. */ + if (sp <= stack_end - STACK_INT_FRAME_SIZE && + stack[STACK_FRAME_MARKER] == STACK_FRAME_REGS_MARKER) { + return -EINVAL; } /* Examine the saved LR: it must point into kernel code. */ ip = stack[STACK_FRAME_LR_SAVE]; - if (!firstframe && !__kernel_text_address(ip)) - return 1; - firstframe = 0; + if (!__kernel_text_address(ip)) + return -EINVAL; /* * FIXME: IMHO these tests do not belong in @@ -175,25 +189,37 @@ save_stack_trace_tsk_reliable(struct task_struct *tsk, * as unreliable. */ if (ip == (unsigned long)kretprobe_trampoline) - return 1; + return -EINVAL; #endif + if (trace->nr_entries >= trace->max_entries) + return -E2BIG; if (!trace->skip) trace->entries[trace->nr_entries++] = ip; else trace->skip--; + } + return 0; +} - if (newsp == stack_end) - break; +int save_stack_trace_tsk_reliable(struct task_struct *tsk, + struct stack_trace *trace) +{ + int ret; - if (trace->nr_entries >= trace->max_entries) - return -E2BIG; + /* + * If the task doesn't have a stack (e.g., a zombie), the stack is + * "reliably" empty. + */ + if (!try_get_task_stack(tsk)) + return 0; - sp = newsp; - } - return 0; + ret = __save_stack_trace_tsk_reliable(tsk, trace); + + put_task_stack(tsk); + + return ret; } -EXPORT_SYMBOL_GPL(save_stack_trace_tsk_reliable); #endif /* CONFIG_HAVE_RELIABLE_STACKTRACE */ #if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_NMI_IPI) diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c index e6982ab21816..e52a8878c2fb 100644 --- a/arch/powerpc/kernel/syscalls.c +++ b/arch/powerpc/kernel/syscalls.c @@ -123,7 +123,7 @@ long ppc_fadvise64_64(int fd, int advice, u32 offset_high, u32 offset_low, (u64)len_high << 32 | len_low, advice); } -long sys_switch_endian(void) +SYSCALL_DEFINE0(switch_endian) { struct thread_info *ti; diff --git a/arch/powerpc/kernel/syscalls/syscalltbl.sh b/arch/powerpc/kernel/syscalls/syscalltbl.sh index fd620490a542..f7393a7b18aa 100644 --- a/arch/powerpc/kernel/syscalls/syscalltbl.sh +++ b/arch/powerpc/kernel/syscalls/syscalltbl.sh @@ -13,10 +13,10 @@ emit() { t_entry="$3" while [ $t_nxt -lt $t_nr ]; do - printf "__SYSCALL(%s,sys_ni_syscall, )\n" "${t_nxt}" + printf "__SYSCALL(%s,sys_ni_syscall)\n" "${t_nxt}" t_nxt=$((t_nxt+1)) done - printf "__SYSCALL(%s,%s, )\n" "${t_nxt}" "${t_entry}" + printf "__SYSCALL(%s,%s)\n" "${t_nxt}" "${t_entry}" } grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( diff --git a/arch/powerpc/kernel/systbl.S b/arch/powerpc/kernel/systbl.S index 23265a28740b..02f28faba125 100644 --- a/arch/powerpc/kernel/systbl.S +++ b/arch/powerpc/kernel/systbl.S @@ -25,11 +25,11 @@ .globl sys_call_table sys_call_table: #ifdef CONFIG_PPC64 -#define __SYSCALL(nr, entry, nargs) .8byte DOTSYM(entry) +#define __SYSCALL(nr, entry) .8byte DOTSYM(entry) #include <asm/syscall_table_64.h> #undef __SYSCALL #else -#define __SYSCALL(nr, entry, nargs) .long entry +#define __SYSCALL(nr, entry) .long entry #include <asm/syscall_table_32.h> #undef __SYSCALL #endif @@ -38,7 +38,7 @@ sys_call_table: .globl compat_sys_call_table compat_sys_call_table: #define compat_sys_sigsuspend sys_sigsuspend -#define __SYSCALL(nr, entry, nargs) .8byte DOTSYM(entry) +#define __SYSCALL(nr, entry) .8byte DOTSYM(entry) #include <asm/syscall_table_c32.h> #undef __SYSCALL #endif diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 3646affae963..bc0503ef9c9c 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -57,7 +57,6 @@ #include <linux/irq_work.h> #include <linux/clk-provider.h> #include <linux/suspend.h> -#include <linux/rtc.h> #include <linux/sched/cputime.h> #include <linux/processor.h> #include <asm/trace.h> diff --git a/arch/powerpc/kernel/trace/Makefile b/arch/powerpc/kernel/trace/Makefile index b1725ad3e13d..858503775c58 100644 --- a/arch/powerpc/kernel/trace/Makefile +++ b/arch/powerpc/kernel/trace/Makefile @@ -23,6 +23,7 @@ obj-$(CONFIG_TRACING) += trace_clock.o obj-$(CONFIG_PPC64) += $(obj64-y) obj-$(CONFIG_PPC32) += $(obj32-y) -# Disable GCOV & sanitizers in odd or sensitive code +# Disable GCOV, KCOV & sanitizers in odd or sensitive code GCOV_PROFILE_ftrace.o := n +KCOV_INSTRUMENT_ftrace.o := n UBSAN_SANITIZE_ftrace.o := n diff --git a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S index 32476a6e4e9c..01b1224add49 100644 --- a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S +++ b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S @@ -229,7 +229,7 @@ ftrace_call: * - r0, r11 & r12 are free */ livepatch_handler: - CURRENT_THREAD_INFO(r12, r1) + ld r12, PACA_THREAD_INFO(r13) /* Allocate 3 x 8 bytes */ ld r11, TI_livepatch_sp(r12) @@ -256,7 +256,7 @@ livepatch_handler: * restore it. */ - CURRENT_THREAD_INFO(r12, r1) + ld r12, PACA_THREAD_INFO(r13) ld r11, TI_livepatch_sp(r12) @@ -273,7 +273,7 @@ livepatch_handler: ld r2, -24(r11) /* Pop livepatch stack frame */ - CURRENT_THREAD_INFO(r12, r1) + ld r12, PACA_THREAD_INFO(r13) subi r11, r11, 24 std r11, TI_livepatch_sp(r12) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 64936b60d521..a21200c6aaea 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -257,24 +257,17 @@ static int __die(const char *str, struct pt_regs *regs, long err) { printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter); - if (IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN)) - printk("LE "); - else - printk("BE "); - - if (IS_ENABLED(CONFIG_PREEMPT)) - pr_cont("PREEMPT "); - - if (IS_ENABLED(CONFIG_SMP)) - pr_cont("SMP NR_CPUS=%d ", NR_CPUS); - - if (debug_pagealloc_enabled()) - pr_cont("DEBUG_PAGEALLOC "); - - if (IS_ENABLED(CONFIG_NUMA)) - pr_cont("NUMA "); - - pr_cont("%s\n", ppc_md.name ? ppc_md.name : ""); + printk("%s PAGE_SIZE=%luK%s%s%s%s%s%s%s %s\n", + IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN) ? "LE" : "BE", + PAGE_SIZE / 1024, + early_radix_enabled() ? " MMU=Radix" : "", + early_mmu_has_feature(MMU_FTR_HPTE_TABLE) ? " MMU=Hash" : "", + IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "", + IS_ENABLED(CONFIG_SMP) ? " SMP" : "", + IS_ENABLED(CONFIG_SMP) ? (" NR_CPUS=" __stringify(NR_CPUS)) : "", + debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", + IS_ENABLED(CONFIG_NUMA) ? " NUMA" : "", + ppc_md.name ? ppc_md.name : ""); if (notify_die(DIE_OOPS, str, regs, err, 255, SIGSEGV) == NOTIFY_STOP) return 1; @@ -376,16 +369,101 @@ void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr) force_sig_fault(signr, code, (void __user *)addr, current); } +/* + * The interrupt architecture has a quirk in that the HV interrupts excluding + * the NMIs (0x100 and 0x200) do not clear MSR[RI] at entry. The first thing + * that an interrupt handler must do is save off a GPR into a scratch register, + * and all interrupts on POWERNV (HV=1) use the HSPRG1 register as scratch. + * Therefore an NMI can clobber an HV interrupt's live HSPRG1 without noticing + * that it is non-reentrant, which leads to random data corruption. + * + * The solution is for NMI interrupts in HV mode to check if they originated + * from these critical HV interrupt regions. If so, then mark them not + * recoverable. + * + * An alternative would be for HV NMIs to use SPRG for scratch to avoid the + * HSPRG1 clobber, however this would cause guest SPRG to be clobbered. Linux + * guests should always have MSR[RI]=0 when its scratch SPRG is in use, so + * that would work. However any other guest OS that may have the SPRG live + * and MSR[RI]=1 could encounter silent corruption. + * + * Builds that do not support KVM could take this second option to increase + * the recoverability of NMIs. + */ +void hv_nmi_check_nonrecoverable(struct pt_regs *regs) +{ +#ifdef CONFIG_PPC_POWERNV + unsigned long kbase = (unsigned long)_stext; + unsigned long nip = regs->nip; + + if (!(regs->msr & MSR_RI)) + return; + if (!(regs->msr & MSR_HV)) + return; + if (regs->msr & MSR_PR) + return; + + /* + * Now test if the interrupt has hit a range that may be using + * HSPRG1 without having RI=0 (i.e., an HSRR interrupt). The + * problem ranges all run un-relocated. Test real and virt modes + * at the same time by droping the high bit of the nip (virt mode + * entry points still have the +0x4000 offset). + */ + nip &= ~0xc000000000000000ULL; + if ((nip >= 0x500 && nip < 0x600) || (nip >= 0x4500 && nip < 0x4600)) + goto nonrecoverable; + if ((nip >= 0x980 && nip < 0xa00) || (nip >= 0x4980 && nip < 0x4a00)) + goto nonrecoverable; + if ((nip >= 0xe00 && nip < 0xec0) || (nip >= 0x4e00 && nip < 0x4ec0)) + goto nonrecoverable; + if ((nip >= 0xf80 && nip < 0xfa0) || (nip >= 0x4f80 && nip < 0x4fa0)) + goto nonrecoverable; + + /* Trampoline code runs un-relocated so subtract kbase. */ + if (nip >= (unsigned long)(start_real_trampolines - kbase) && + nip < (unsigned long)(end_real_trampolines - kbase)) + goto nonrecoverable; + if (nip >= (unsigned long)(start_virt_trampolines - kbase) && + nip < (unsigned long)(end_virt_trampolines - kbase)) + goto nonrecoverable; + return; + +nonrecoverable: + regs->msr &= ~MSR_RI; +#endif +} + void system_reset_exception(struct pt_regs *regs) { + unsigned long hsrr0, hsrr1; + bool nested = in_nmi(); + bool saved_hsrrs = false; + /* * Avoid crashes in case of nested NMI exceptions. Recoverability * is determined by RI and in_nmi */ - bool nested = in_nmi(); if (!nested) nmi_enter(); + /* + * System reset can interrupt code where HSRRs are live and MSR[RI]=1. + * The system reset interrupt itself may clobber HSRRs (e.g., to call + * OPAL), so save them here and restore them before returning. + * + * Machine checks don't need to save HSRRs, as the real mode handler + * is careful to avoid them, and the regular handler is not delivered + * as an NMI. + */ + if (cpu_has_feature(CPU_FTR_HVMODE)) { + hsrr0 = mfspr(SPRN_HSRR0); + hsrr1 = mfspr(SPRN_HSRR1); + saved_hsrrs = true; + } + + hv_nmi_check_nonrecoverable(regs); + __this_cpu_inc(irq_stat.sreset_irqs); /* See if any machine dependent calls */ @@ -433,6 +511,11 @@ out: if (!(regs->msr & MSR_RI)) nmi_panic(regs, "Unrecoverable System Reset"); + if (saved_hsrrs) { + mtspr(SPRN_HSRR0, hsrr0); + mtspr(SPRN_HSRR1, hsrr1); + } + if (!nested) nmi_exit(); @@ -763,15 +846,15 @@ void machine_check_exception(struct pt_regs *regs) if (check_io_access(regs)) goto bail; - /* Must die if the interrupt is not recoverable */ - if (!(regs->msr & MSR_RI)) - nmi_panic(regs, "Unrecoverable Machine check"); - if (!nested) nmi_exit(); die("Machine check", regs, SIGBUS); + /* Must die if the interrupt is not recoverable */ + if (!(regs->msr & MSR_RI)) + nmi_panic(regs, "Unrecoverable Machine check"); + return; bail: @@ -1542,8 +1625,8 @@ bail: void StackOverflow(struct pt_regs *regs) { - printk(KERN_CRIT "Kernel stack overflow in process %p, r1=%lx\n", - current, regs->gpr[1]); + pr_crit("Kernel stack overflow in process %s[%d], r1=%lx\n", + current->comm, task_pid_nr(current), regs->gpr[1]); debugger(regs); show_regs(regs); panic("kernel stack overflow"); diff --git a/arch/powerpc/kernel/udbg.c b/arch/powerpc/kernel/udbg.c index 7cc38b5b58bc..8db4891acdaf 100644 --- a/arch/powerpc/kernel/udbg.c +++ b/arch/powerpc/kernel/udbg.c @@ -74,7 +74,7 @@ void __init udbg_early_init(void) #endif #ifdef CONFIG_PPC_EARLY_DEBUG - console_loglevel = 10; + console_loglevel = CONSOLE_LOGLEVEL_DEBUG; register_early_udbg_console(); #endif diff --git a/arch/powerpc/kernel/vdso32/Makefile b/arch/powerpc/kernel/vdso32/Makefile index 50112d4473bb..ce199f6e4256 100644 --- a/arch/powerpc/kernel/vdso32/Makefile +++ b/arch/powerpc/kernel/vdso32/Makefile @@ -23,6 +23,7 @@ targets := $(obj-vdso32) vdso32.so vdso32.so.dbg obj-vdso32 := $(addprefix $(obj)/, $(obj-vdso32)) GCOV_PROFILE := n +KCOV_INSTRUMENT := n UBSAN_SANITIZE := n ccflags-y := -shared -fno-common -fno-builtin diff --git a/arch/powerpc/kernel/vdso64/Makefile b/arch/powerpc/kernel/vdso64/Makefile index 69cecb346269..28e7d112aa2f 100644 --- a/arch/powerpc/kernel/vdso64/Makefile +++ b/arch/powerpc/kernel/vdso64/Makefile @@ -9,6 +9,7 @@ targets := $(obj-vdso64) vdso64.so vdso64.so.dbg obj-vdso64 := $(addprefix $(obj)/, $(obj-vdso64)) GCOV_PROFILE := n +KCOV_INSTRUMENT := n UBSAN_SANITIZE := n ccflags-y := -shared -fno-common -fno-builtin diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index ad1c77f71f54..060a1acd7c6d 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -12,11 +12,8 @@ #include <asm/cache.h> #include <asm/thread_info.h> -#if defined(CONFIG_STRICT_KERNEL_RWX) && !defined(CONFIG_PPC32) -#define STRICT_ALIGN_SIZE (1 << 24) -#else -#define STRICT_ALIGN_SIZE PAGE_SIZE -#endif +#define STRICT_ALIGN_SIZE (1 << CONFIG_DATA_SHIFT) +#define ETEXT_ALIGN_SIZE (1 << CONFIG_ETEXT_SHIFT) ENTRY(_stext) @@ -86,11 +83,11 @@ SECTIONS #ifdef CONFIG_PPC64 /* - * BLOCK(0) overrides the default output section alignment because + * ALIGN(0) overrides the default output section alignment because * this needs to start right after .head.text in order for fixed * section placement to work. */ - .text BLOCK(0) : AT(ADDR(.text) - LOAD_OFFSET) { + .text ALIGN(0) : AT(ADDR(.text) - LOAD_OFFSET) { #ifdef CONFIG_LD_HEAD_STUB_CATCH KEEP(*(.linker_stub_catch)); . = . ; @@ -131,7 +128,7 @@ SECTIONS } :kernel - . = ALIGN(PAGE_SIZE); + . = ALIGN(ETEXT_ALIGN_SIZE); _etext = .; PROVIDE32 (etext = .); @@ -319,6 +316,7 @@ SECTIONS *(.sdata2) *(.got.plt) *(.got) *(.plt) + *(.branch_lt) } #else .data : AT(ADDR(.data) - LOAD_OFFSET) { diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index 64f1135e7732..3223aec88b2c 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -10,11 +10,6 @@ common-objs-y = $(KVM)/kvm_main.o $(KVM)/eventfd.o common-objs-$(CONFIG_KVM_VFIO) += $(KVM)/vfio.o common-objs-$(CONFIG_KVM_MMIO) += $(KVM)/coalesced_mmio.o -CFLAGS_e500_mmu.o := -I. -CFLAGS_e500_mmu_host.o := -I. -CFLAGS_emulate.o := -I. -CFLAGS_emulate_loadstore.o := -I. - common-objs-y += powerpc.o emulate_loadstore.o obj-$(CONFIG_KVM_EXIT_TIMING) += timing.o obj-$(CONFIG_KVM_BOOK3S_HANDLER) += book3s_exports.o diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index bd1a677dd9e4..9a7dadbe1f17 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -192,6 +192,13 @@ void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec) } EXPORT_SYMBOL_GPL(kvmppc_book3s_queue_irqprio); +void kvmppc_core_queue_machine_check(struct kvm_vcpu *vcpu, ulong flags) +{ + /* might as well deliver this straight away */ + kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_MACHINE_CHECK, flags); +} +EXPORT_SYMBOL_GPL(kvmppc_core_queue_machine_check); + void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags) { /* might as well deliver this straight away */ diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 5a066fc299e1..a3d5318f5d1e 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1215,6 +1215,22 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, r = RESUME_GUEST; break; case BOOK3S_INTERRUPT_MACHINE_CHECK: + /* Print the MCE event to host console. */ + machine_check_print_event_info(&vcpu->arch.mce_evt, false, true); + + /* + * If the guest can do FWNMI, exit to userspace so it can + * deliver a FWNMI to the guest. + * Otherwise we synthesize a machine check for the guest + * so that it knows that the machine check occurred. + */ + if (!vcpu->kvm->arch.fwnmi_enabled) { + ulong flags = vcpu->arch.shregs.msr & 0x083c0000; + kvmppc_core_queue_machine_check(vcpu, flags); + r = RESUME_GUEST; + break; + } + /* Exit to guest with KVM_EXIT_NMI as exit reason */ run->exit_reason = KVM_EXIT_NMI; run->hw.hardware_exit_reason = vcpu->arch.trap; @@ -1227,8 +1243,6 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, run->flags |= KVM_RUN_PPC_NMI_DISP_NOT_RECOV; r = RESUME_HOST; - /* Print the MCE event to host console. */ - machine_check_print_event_info(&vcpu->arch.mce_evt, false); break; case BOOK3S_INTERRUPT_PROGRAM: { @@ -1392,7 +1406,7 @@ static int kvmppc_handle_nested_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) /* Pass the machine check to the L1 guest */ r = RESUME_HOST; /* Print the MCE event to host console. */ - machine_check_print_event_info(&vcpu->arch.mce_evt, false); + machine_check_print_event_info(&vcpu->arch.mce_evt, false, true); break; /* * We get these next two if the guest accesses a page which it thinks @@ -3455,6 +3469,7 @@ int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long host_dscr = mfspr(SPRN_DSCR); unsigned long host_tidr = mfspr(SPRN_TIDR); unsigned long host_iamr = mfspr(SPRN_IAMR); + unsigned long host_amr = mfspr(SPRN_AMR); s64 dec; u64 tb; int trap, save_pmu; @@ -3571,13 +3586,15 @@ int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, mtspr(SPRN_PSPB, 0); mtspr(SPRN_WORT, 0); - mtspr(SPRN_AMR, 0); mtspr(SPRN_UAMOR, 0); mtspr(SPRN_DSCR, host_dscr); mtspr(SPRN_TIDR, host_tidr); mtspr(SPRN_IAMR, host_iamr); mtspr(SPRN_PSPB, 0); + if (host_amr != vcpu->arch.amr) + mtspr(SPRN_AMR, host_amr); + msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX); store_fp_state(&vcpu->arch.fp); #ifdef CONFIG_ALTIVEC diff --git a/arch/powerpc/kvm/book3s_hv_hmi.c b/arch/powerpc/kvm/book3s_hv_hmi.c index e3f738eb1cac..64b5011475c7 100644 --- a/arch/powerpc/kvm/book3s_hv_hmi.c +++ b/arch/powerpc/kvm/book3s_hv_hmi.c @@ -24,6 +24,7 @@ #include <linux/compiler.h> #include <asm/paca.h> #include <asm/hmi.h> +#include <asm/processor.h> void wait_for_subcore_guest_exit(void) { diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c index 0787f12c1a1b..8c24c3bea0bf 100644 --- a/arch/powerpc/kvm/book3s_hv_ras.c +++ b/arch/powerpc/kvm/book3s_hv_ras.c @@ -66,10 +66,8 @@ static void reload_slb(struct kvm_vcpu *vcpu) /* * On POWER7, see if we can handle a machine check that occurred inside * the guest in real mode, without switching to the host partition. - * - * Returns: 0 => exit guest, 1 => deliver machine check to guest */ -static long kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu) +static void kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu) { unsigned long srr1 = vcpu->arch.shregs.msr; struct machine_check_event mce_evt; @@ -111,52 +109,24 @@ static long kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu) } /* - * See if we have already handled the condition in the linux host. - * We assume that if the condition is recovered then linux host - * will have generated an error log event that we will pick - * up and log later. - * Don't release mce event now. We will queue up the event so that - * we can log the MCE event info on host console. + * Now get the event and stash it in the vcpu struct so it can + * be handled by the primary thread in virtual mode. We can't + * call machine_check_queue_event() here if we are running on + * an offline secondary thread. */ - if (!get_mce_event(&mce_evt, MCE_EVENT_DONTRELEASE)) - goto out; - - if (mce_evt.version == MCE_V1 && - (mce_evt.severity == MCE_SEV_NO_ERROR || - mce_evt.disposition == MCE_DISPOSITION_RECOVERED)) - handled = 1; - -out: - /* - * For guest that supports FWNMI capability, hook the MCE event into - * vcpu structure. We are going to exit the guest with KVM_EXIT_NMI - * exit reason. On our way to exit we will pull this event from vcpu - * structure and print it from thread 0 of the core/subcore. - * - * For guest that does not support FWNMI capability (old QEMU): - * We are now going enter guest either through machine check - * interrupt (for unhandled errors) or will continue from - * current HSRR0 (for handled errors) in guest. Hence - * queue up the event so that we can log it from host console later. - */ - if (vcpu->kvm->arch.fwnmi_enabled) { - /* - * Hook up the mce event on to vcpu structure. - * First clear the old event. - */ - memset(&vcpu->arch.mce_evt, 0, sizeof(vcpu->arch.mce_evt)); - if (get_mce_event(&mce_evt, MCE_EVENT_RELEASE)) { - vcpu->arch.mce_evt = mce_evt; - } - } else - machine_check_queue_event(); + if (get_mce_event(&mce_evt, MCE_EVENT_RELEASE)) { + if (handled && mce_evt.version == MCE_V1) + mce_evt.disposition = MCE_DISPOSITION_RECOVERED; + } else { + memset(&mce_evt, 0, sizeof(mce_evt)); + } - return handled; + vcpu->arch.mce_evt = mce_evt; } -long kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu) +void kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu) { - return kvmppc_realmode_mc_power7(vcpu); + kvmppc_realmode_mc_power7(vcpu); } /* Check if dynamic split is in force and return subcore size accordingly. */ diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 9b8d50a7cbaf..25043b50cb30 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -58,6 +58,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) #define STACK_SLOT_DAWR (SFS-56) #define STACK_SLOT_DAWRX (SFS-64) #define STACK_SLOT_HFSCR (SFS-72) +#define STACK_SLOT_AMR (SFS-80) +#define STACK_SLOT_UAMOR (SFS-88) /* the following is used by the P9 short path */ #define STACK_SLOT_NVGPRS (SFS-152) /* 18 gprs */ @@ -726,11 +728,9 @@ BEGIN_FTR_SECTION mfspr r5, SPRN_TIDR mfspr r6, SPRN_PSSCR mfspr r7, SPRN_PID - mfspr r8, SPRN_IAMR std r5, STACK_SLOT_TID(r1) std r6, STACK_SLOT_PSSCR(r1) std r7, STACK_SLOT_PID(r1) - std r8, STACK_SLOT_IAMR(r1) mfspr r5, SPRN_HFSCR std r5, STACK_SLOT_HFSCR(r1) END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) @@ -738,11 +738,18 @@ BEGIN_FTR_SECTION mfspr r5, SPRN_CIABR mfspr r6, SPRN_DAWR mfspr r7, SPRN_DAWRX + mfspr r8, SPRN_IAMR std r5, STACK_SLOT_CIABR(r1) std r6, STACK_SLOT_DAWR(r1) std r7, STACK_SLOT_DAWRX(r1) + std r8, STACK_SLOT_IAMR(r1) END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + mfspr r5, SPRN_AMR + std r5, STACK_SLOT_AMR(r1) + mfspr r6, SPRN_UAMOR + std r6, STACK_SLOT_UAMOR(r1) + BEGIN_FTR_SECTION /* Set partition DABR */ /* Do this before re-enabling PMU to avoid P7 DABR corruption bug */ @@ -1631,22 +1638,25 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) mtspr SPRN_PSPB, r0 mtspr SPRN_WORT, r0 BEGIN_FTR_SECTION - mtspr SPRN_IAMR, r0 mtspr SPRN_TCSCR, r0 /* Set MMCRS to 1<<31 to freeze and disable the SPMC counters */ li r0, 1 sldi r0, r0, 31 mtspr SPRN_MMCRS, r0 END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) -8: - /* Save and reset AMR and UAMOR before turning on the MMU */ + /* Save and restore AMR, IAMR and UAMOR before turning on the MMU */ + ld r8, STACK_SLOT_IAMR(r1) + mtspr SPRN_IAMR, r8 + +8: /* Power7 jumps back in here */ mfspr r5,SPRN_AMR mfspr r6,SPRN_UAMOR std r5,VCPU_AMR(r9) std r6,VCPU_UAMOR(r9) - li r6,0 - mtspr SPRN_AMR,r6 + ld r5,STACK_SLOT_AMR(r1) + ld r6,STACK_SLOT_UAMOR(r1) + mtspr SPRN_AMR, r5 mtspr SPRN_UAMOR, r6 /* Switch DSCR back to host value */ @@ -1746,11 +1756,9 @@ BEGIN_FTR_SECTION ld r5, STACK_SLOT_TID(r1) ld r6, STACK_SLOT_PSSCR(r1) ld r7, STACK_SLOT_PID(r1) - ld r8, STACK_SLOT_IAMR(r1) mtspr SPRN_TIDR, r5 mtspr SPRN_PSSCR, r6 mtspr SPRN_PID, r7 - mtspr SPRN_IAMR, r8 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) #ifdef CONFIG_PPC_RADIX_MMU @@ -2826,49 +2834,15 @@ kvm_cede_exit: #endif /* CONFIG_KVM_XICS */ 3: b guest_exit_cont - /* Try to handle a machine check in real mode */ + /* Try to do machine check recovery in real mode */ machine_check_realmode: mr r3, r9 /* get vcpu pointer */ bl kvmppc_realmode_machine_check nop + /* all machine checks go to virtual mode for further handling */ ld r9, HSTATE_KVM_VCPU(r13) li r12, BOOK3S_INTERRUPT_MACHINE_CHECK - /* - * For the guest that is FWNMI capable, deliver all the MCE errors - * (handled/unhandled) by exiting the guest with KVM_EXIT_NMI exit - * reason. This new approach injects machine check errors in guest - * address space to guest with additional information in the form - * of RTAS event, thus enabling guest kernel to suitably handle - * such errors. - * - * For the guest that is not FWNMI capable (old QEMU) fallback - * to old behaviour for backward compatibility: - * Deliver unhandled/fatal (e.g. UE) MCE errors to guest either - * through machine check interrupt (set HSRR0 to 0x200). - * For handled errors (no-fatal), just go back to guest execution - * with current HSRR0. - * if we receive machine check with MSR(RI=0) then deliver it to - * guest as machine check causing guest to crash. - */ - ld r11, VCPU_MSR(r9) - rldicl. r0, r11, 64-MSR_HV_LG, 63 /* check if it happened in HV mode */ - bne guest_exit_cont /* if so, exit to host */ - /* Check if guest is capable of handling NMI exit */ - ld r10, VCPU_KVM(r9) - lbz r10, KVM_FWNMI(r10) - cmpdi r10, 1 /* FWNMI capable? */ - beq guest_exit_cont /* if so, exit with KVM_EXIT_NMI. */ - - /* if not, fall through for backward compatibility. */ - andi. r10, r11, MSR_RI /* check for unrecoverable exception */ - beq 1f /* Deliver a machine check to guest */ - ld r10, VCPU_PC(r9) - cmpdi r3, 0 /* Did we handle MCE ? */ - bne 2f /* Continue guest execution. */ - /* If not, deliver a machine check. SRR0/1 are already set */ -1: li r10, BOOK3S_INTERRUPT_MACHINE_CHECK - bl kvmppc_msr_interrupt -2: b fast_interrupt_c_return + b guest_exit_cont /* * Call C code to handle a HMI in real mode. diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index 3bf9fc6fd36c..79396e184bca 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -30,7 +30,8 @@ obj64-y += copypage_64.o copyuser_64.o mem_64.o hweight_64.o \ obj64-$(CONFIG_SMP) += locks.o obj64-$(CONFIG_ALTIVEC) += vmx-helper.o -obj64-$(CONFIG_KPROBES_SANITY_TEST) += test_emulate_step.o +obj64-$(CONFIG_KPROBES_SANITY_TEST) += test_emulate_step.o \ + test_emulate_step_exec_instr.o obj-y += checksum_$(BITS).o checksum_wrappers.o \ string_$(BITS).o memcmp_$(BITS).o diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index d81568f783e5..3d33fb509ef4 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -1169,7 +1169,7 @@ static nokprobe_inline int trap_compare(long v1, long v2) int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, unsigned int instr) { - unsigned int opcode, ra, rb, rd, spr, u; + unsigned int opcode, ra, rb, rc, rd, spr, u; unsigned long int imm; unsigned long int val, val2; unsigned int mb, me, sh; @@ -1292,6 +1292,7 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, rd = (instr >> 21) & 0x1f; ra = (instr >> 16) & 0x1f; rb = (instr >> 11) & 0x1f; + rc = (instr >> 6) & 0x1f; switch (opcode) { #ifdef __powerpc64__ @@ -1305,6 +1306,38 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, goto trap; return 1; +#ifdef __powerpc64__ + case 4: + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + return -1; + + switch (instr & 0x3f) { + case 48: /* maddhd */ + asm volatile(PPC_MADDHD(%0, %1, %2, %3) : + "=r" (op->val) : "r" (regs->gpr[ra]), + "r" (regs->gpr[rb]), "r" (regs->gpr[rc])); + goto compute_done; + + case 49: /* maddhdu */ + asm volatile(PPC_MADDHDU(%0, %1, %2, %3) : + "=r" (op->val) : "r" (regs->gpr[ra]), + "r" (regs->gpr[rb]), "r" (regs->gpr[rc])); + goto compute_done; + + case 51: /* maddld */ + asm volatile(PPC_MADDLD(%0, %1, %2, %3) : + "=r" (op->val) : "r" (regs->gpr[ra]), + "r" (regs->gpr[rb]), "r" (regs->gpr[rc])); + goto compute_done; + } + + /* + * There are other instructions from ISA 3.0 with the same + * primary opcode which do not have emulation support yet. + */ + return -1; +#endif + case 7: /* mulli */ op->val = regs->gpr[ra] * (short) instr; goto compute_done; @@ -1671,10 +1704,23 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, (int) regs->gpr[rb]; goto arith_done; - +#ifdef __powerpc64__ + case 265: /* modud */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + return -1; + op->val = regs->gpr[ra] % regs->gpr[rb]; + goto compute_done; +#endif case 266: /* add */ op->val = regs->gpr[ra] + regs->gpr[rb]; goto arith_done; + + case 267: /* moduw */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + return -1; + op->val = (unsigned int) regs->gpr[ra] % + (unsigned int) regs->gpr[rb]; + goto compute_done; #ifdef __powerpc64__ case 457: /* divdu */ op->val = regs->gpr[ra] / regs->gpr[rb]; @@ -1695,6 +1741,42 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, (int) regs->gpr[rb]; goto arith_done; + case 755: /* darn */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + return -1; + switch (ra & 0x3) { + case 0: + /* 32-bit conditioned */ + asm volatile(PPC_DARN(%0, 0) : "=r" (op->val)); + goto compute_done; + + case 1: + /* 64-bit conditioned */ + asm volatile(PPC_DARN(%0, 1) : "=r" (op->val)); + goto compute_done; + + case 2: + /* 64-bit raw */ + asm volatile(PPC_DARN(%0, 2) : "=r" (op->val)); + goto compute_done; + } + + return -1; +#ifdef __powerpc64__ + case 777: /* modsd */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + return -1; + op->val = (long int) regs->gpr[ra] % + (long int) regs->gpr[rb]; + goto compute_done; +#endif + case 779: /* modsw */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + return -1; + op->val = (int) regs->gpr[ra] % + (int) regs->gpr[rb]; + goto compute_done; + /* * Logical instructions @@ -1765,6 +1847,20 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, do_popcnt(regs, op, regs->gpr[rd], 64); goto logical_done_nocc; #endif + case 538: /* cnttzw */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + return -1; + val = (unsigned int) regs->gpr[rd]; + op->val = (val ? __builtin_ctz(val) : 32); + goto logical_done; +#ifdef __powerpc64__ + case 570: /* cnttzd */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + return -1; + val = regs->gpr[rd]; + op->val = (val ? __builtin_ctzl(val) : 64); + goto logical_done; +#endif case 922: /* extsh */ op->val = (signed short) regs->gpr[rd]; goto logical_done; @@ -1866,6 +1962,20 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, op->xerval &= ~XER_CA; set_ca32(op, op->xerval & XER_CA); goto logical_done; + + case 890: /* extswsli with sh_5 = 0 */ + case 891: /* extswsli with sh_5 = 1 */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + return -1; + op->type = COMPUTE + SETREG; + sh = rb | ((instr & 2) << 4); + val = (signed int) regs->gpr[rd]; + if (sh) + op->val = ROTATE(val, sh) & MASK64(0, 63 - sh); + else + op->val = val; + goto logical_done; + #endif /* __powerpc64__ */ /* diff --git a/arch/powerpc/lib/test_emulate_step.c b/arch/powerpc/lib/test_emulate_step.c index 6c47daa61614..9992c1ea7a1d 100644 --- a/arch/powerpc/lib/test_emulate_step.c +++ b/arch/powerpc/lib/test_emulate_step.c @@ -1,5 +1,5 @@ /* - * Simple sanity test for emulate_step load/store instructions. + * Simple sanity tests for instruction emulation infrastructure. * * Copyright IBM Corp. 2016 * @@ -14,6 +14,7 @@ #include <linux/ptrace.h> #include <asm/sstep.h> #include <asm/ppc-opcode.h> +#include <asm/code-patching.h> #define IMM_L(i) ((uintptr_t)(i) & 0xffff) @@ -48,7 +49,20 @@ ___PPC_RA(a) | ___PPC_RB(b)) #define TEST_LXVD2X(s, a, b) (PPC_INST_LXVD2X | VSX_XX1((s), R##a, R##b)) #define TEST_STXVD2X(s, a, b) (PPC_INST_STXVD2X | VSX_XX1((s), R##a, R##b)) +#define TEST_ADD(t, a, b) (PPC_INST_ADD | ___PPC_RT(t) | \ + ___PPC_RA(a) | ___PPC_RB(b)) +#define TEST_ADD_DOT(t, a, b) (PPC_INST_ADD | ___PPC_RT(t) | \ + ___PPC_RA(a) | ___PPC_RB(b) | 0x1) +#define TEST_ADDC(t, a, b) (PPC_INST_ADDC | ___PPC_RT(t) | \ + ___PPC_RA(a) | ___PPC_RB(b)) +#define TEST_ADDC_DOT(t, a, b) (PPC_INST_ADDC | ___PPC_RT(t) | \ + ___PPC_RA(a) | ___PPC_RB(b) | 0x1) + +#define MAX_SUBTESTS 16 +#define IGNORE_GPR(n) (0x1UL << (n)) +#define IGNORE_XER (0x1UL << 32) +#define IGNORE_CCR (0x1UL << 33) static void __init init_pt_regs(struct pt_regs *regs) { @@ -72,9 +86,15 @@ static void __init init_pt_regs(struct pt_regs *regs) msr_cached = true; } -static void __init show_result(char *ins, char *result) +static void __init show_result(char *mnemonic, char *result) { - pr_info("%-14s : %s\n", ins, result); + pr_info("%-14s : %s\n", mnemonic, result); +} + +static void __init show_result_with_descr(char *mnemonic, char *descr, + char *result) +{ + pr_info("%-14s : %-50s %s\n", mnemonic, descr, result); } static void __init test_ld(void) @@ -426,7 +446,7 @@ static void __init test_lxvd2x_stxvd2x(void) } #endif /* CONFIG_VSX */ -static int __init test_emulate_step(void) +static void __init run_tests_load_store(void) { test_ld(); test_lwz(); @@ -437,6 +457,513 @@ static int __init test_emulate_step(void) test_lfdx_stfdx(); test_lvx_stvx(); test_lxvd2x_stxvd2x(); +} + +struct compute_test { + char *mnemonic; + struct { + char *descr; + unsigned long flags; + unsigned int instr; + struct pt_regs regs; + } subtests[MAX_SUBTESTS + 1]; +}; + +static struct compute_test compute_tests[] = { + { + .mnemonic = "nop", + .subtests = { + { + .descr = "R0 = LONG_MAX", + .instr = PPC_INST_NOP, + .regs = { + .gpr[0] = LONG_MAX, + } + } + } + }, + { + .mnemonic = "add", + .subtests = { + { + .descr = "RA = LONG_MIN, RB = LONG_MIN", + .instr = TEST_ADD(20, 21, 22), + .regs = { + .gpr[21] = LONG_MIN, + .gpr[22] = LONG_MIN, + } + }, + { + .descr = "RA = LONG_MIN, RB = LONG_MAX", + .instr = TEST_ADD(20, 21, 22), + .regs = { + .gpr[21] = LONG_MIN, + .gpr[22] = LONG_MAX, + } + }, + { + .descr = "RA = LONG_MAX, RB = LONG_MAX", + .instr = TEST_ADD(20, 21, 22), + .regs = { + .gpr[21] = LONG_MAX, + .gpr[22] = LONG_MAX, + } + }, + { + .descr = "RA = ULONG_MAX, RB = ULONG_MAX", + .instr = TEST_ADD(20, 21, 22), + .regs = { + .gpr[21] = ULONG_MAX, + .gpr[22] = ULONG_MAX, + } + }, + { + .descr = "RA = ULONG_MAX, RB = 0x1", + .instr = TEST_ADD(20, 21, 22), + .regs = { + .gpr[21] = ULONG_MAX, + .gpr[22] = 0x1, + } + }, + { + .descr = "RA = INT_MIN, RB = INT_MIN", + .instr = TEST_ADD(20, 21, 22), + .regs = { + .gpr[21] = INT_MIN, + .gpr[22] = INT_MIN, + } + }, + { + .descr = "RA = INT_MIN, RB = INT_MAX", + .instr = TEST_ADD(20, 21, 22), + .regs = { + .gpr[21] = INT_MIN, + .gpr[22] = INT_MAX, + } + }, + { + .descr = "RA = INT_MAX, RB = INT_MAX", + .instr = TEST_ADD(20, 21, 22), + .regs = { + .gpr[21] = INT_MAX, + .gpr[22] = INT_MAX, + } + }, + { + .descr = "RA = UINT_MAX, RB = UINT_MAX", + .instr = TEST_ADD(20, 21, 22), + .regs = { + .gpr[21] = UINT_MAX, + .gpr[22] = UINT_MAX, + } + }, + { + .descr = "RA = UINT_MAX, RB = 0x1", + .instr = TEST_ADD(20, 21, 22), + .regs = { + .gpr[21] = UINT_MAX, + .gpr[22] = 0x1, + } + } + } + }, + { + .mnemonic = "add.", + .subtests = { + { + .descr = "RA = LONG_MIN, RB = LONG_MIN", + .flags = IGNORE_CCR, + .instr = TEST_ADD_DOT(20, 21, 22), + .regs = { + .gpr[21] = LONG_MIN, + .gpr[22] = LONG_MIN, + } + }, + { + .descr = "RA = LONG_MIN, RB = LONG_MAX", + .instr = TEST_ADD_DOT(20, 21, 22), + .regs = { + .gpr[21] = LONG_MIN, + .gpr[22] = LONG_MAX, + } + }, + { + .descr = "RA = LONG_MAX, RB = LONG_MAX", + .flags = IGNORE_CCR, + .instr = TEST_ADD_DOT(20, 21, 22), + .regs = { + .gpr[21] = LONG_MAX, + .gpr[22] = LONG_MAX, + } + }, + { + .descr = "RA = ULONG_MAX, RB = ULONG_MAX", + .instr = TEST_ADD_DOT(20, 21, 22), + .regs = { + .gpr[21] = ULONG_MAX, + .gpr[22] = ULONG_MAX, + } + }, + { + .descr = "RA = ULONG_MAX, RB = 0x1", + .instr = TEST_ADD_DOT(20, 21, 22), + .regs = { + .gpr[21] = ULONG_MAX, + .gpr[22] = 0x1, + } + }, + { + .descr = "RA = INT_MIN, RB = INT_MIN", + .instr = TEST_ADD_DOT(20, 21, 22), + .regs = { + .gpr[21] = INT_MIN, + .gpr[22] = INT_MIN, + } + }, + { + .descr = "RA = INT_MIN, RB = INT_MAX", + .instr = TEST_ADD_DOT(20, 21, 22), + .regs = { + .gpr[21] = INT_MIN, + .gpr[22] = INT_MAX, + } + }, + { + .descr = "RA = INT_MAX, RB = INT_MAX", + .instr = TEST_ADD_DOT(20, 21, 22), + .regs = { + .gpr[21] = INT_MAX, + .gpr[22] = INT_MAX, + } + }, + { + .descr = "RA = UINT_MAX, RB = UINT_MAX", + .instr = TEST_ADD_DOT(20, 21, 22), + .regs = { + .gpr[21] = UINT_MAX, + .gpr[22] = UINT_MAX, + } + }, + { + .descr = "RA = UINT_MAX, RB = 0x1", + .instr = TEST_ADD_DOT(20, 21, 22), + .regs = { + .gpr[21] = UINT_MAX, + .gpr[22] = 0x1, + } + } + } + }, + { + .mnemonic = "addc", + .subtests = { + { + .descr = "RA = LONG_MIN, RB = LONG_MIN", + .instr = TEST_ADDC(20, 21, 22), + .regs = { + .gpr[21] = LONG_MIN, + .gpr[22] = LONG_MIN, + } + }, + { + .descr = "RA = LONG_MIN, RB = LONG_MAX", + .instr = TEST_ADDC(20, 21, 22), + .regs = { + .gpr[21] = LONG_MIN, + .gpr[22] = LONG_MAX, + } + }, + { + .descr = "RA = LONG_MAX, RB = LONG_MAX", + .instr = TEST_ADDC(20, 21, 22), + .regs = { + .gpr[21] = LONG_MAX, + .gpr[22] = LONG_MAX, + } + }, + { + .descr = "RA = ULONG_MAX, RB = ULONG_MAX", + .instr = TEST_ADDC(20, 21, 22), + .regs = { + .gpr[21] = ULONG_MAX, + .gpr[22] = ULONG_MAX, + } + }, + { + .descr = "RA = ULONG_MAX, RB = 0x1", + .instr = TEST_ADDC(20, 21, 22), + .regs = { + .gpr[21] = ULONG_MAX, + .gpr[22] = 0x1, + } + }, + { + .descr = "RA = INT_MIN, RB = INT_MIN", + .instr = TEST_ADDC(20, 21, 22), + .regs = { + .gpr[21] = INT_MIN, + .gpr[22] = INT_MIN, + } + }, + { + .descr = "RA = INT_MIN, RB = INT_MAX", + .instr = TEST_ADDC(20, 21, 22), + .regs = { + .gpr[21] = INT_MIN, + .gpr[22] = INT_MAX, + } + }, + { + .descr = "RA = INT_MAX, RB = INT_MAX", + .instr = TEST_ADDC(20, 21, 22), + .regs = { + .gpr[21] = INT_MAX, + .gpr[22] = INT_MAX, + } + }, + { + .descr = "RA = UINT_MAX, RB = UINT_MAX", + .instr = TEST_ADDC(20, 21, 22), + .regs = { + .gpr[21] = UINT_MAX, + .gpr[22] = UINT_MAX, + } + }, + { + .descr = "RA = UINT_MAX, RB = 0x1", + .instr = TEST_ADDC(20, 21, 22), + .regs = { + .gpr[21] = UINT_MAX, + .gpr[22] = 0x1, + } + }, + { + .descr = "RA = LONG_MIN | INT_MIN, RB = LONG_MIN | INT_MIN", + .instr = TEST_ADDC(20, 21, 22), + .regs = { + .gpr[21] = LONG_MIN | (uint)INT_MIN, + .gpr[22] = LONG_MIN | (uint)INT_MIN, + } + } + } + }, + { + .mnemonic = "addc.", + .subtests = { + { + .descr = "RA = LONG_MIN, RB = LONG_MIN", + .flags = IGNORE_CCR, + .instr = TEST_ADDC_DOT(20, 21, 22), + .regs = { + .gpr[21] = LONG_MIN, + .gpr[22] = LONG_MIN, + } + }, + { + .descr = "RA = LONG_MIN, RB = LONG_MAX", + .instr = TEST_ADDC_DOT(20, 21, 22), + .regs = { + .gpr[21] = LONG_MIN, + .gpr[22] = LONG_MAX, + } + }, + { + .descr = "RA = LONG_MAX, RB = LONG_MAX", + .flags = IGNORE_CCR, + .instr = TEST_ADDC_DOT(20, 21, 22), + .regs = { + .gpr[21] = LONG_MAX, + .gpr[22] = LONG_MAX, + } + }, + { + .descr = "RA = ULONG_MAX, RB = ULONG_MAX", + .instr = TEST_ADDC_DOT(20, 21, 22), + .regs = { + .gpr[21] = ULONG_MAX, + .gpr[22] = ULONG_MAX, + } + }, + { + .descr = "RA = ULONG_MAX, RB = 0x1", + .instr = TEST_ADDC_DOT(20, 21, 22), + .regs = { + .gpr[21] = ULONG_MAX, + .gpr[22] = 0x1, + } + }, + { + .descr = "RA = INT_MIN, RB = INT_MIN", + .instr = TEST_ADDC_DOT(20, 21, 22), + .regs = { + .gpr[21] = INT_MIN, + .gpr[22] = INT_MIN, + } + }, + { + .descr = "RA = INT_MIN, RB = INT_MAX", + .instr = TEST_ADDC_DOT(20, 21, 22), + .regs = { + .gpr[21] = INT_MIN, + .gpr[22] = INT_MAX, + } + }, + { + .descr = "RA = INT_MAX, RB = INT_MAX", + .instr = TEST_ADDC_DOT(20, 21, 22), + .regs = { + .gpr[21] = INT_MAX, + .gpr[22] = INT_MAX, + } + }, + { + .descr = "RA = UINT_MAX, RB = UINT_MAX", + .instr = TEST_ADDC_DOT(20, 21, 22), + .regs = { + .gpr[21] = UINT_MAX, + .gpr[22] = UINT_MAX, + } + }, + { + .descr = "RA = UINT_MAX, RB = 0x1", + .instr = TEST_ADDC_DOT(20, 21, 22), + .regs = { + .gpr[21] = UINT_MAX, + .gpr[22] = 0x1, + } + }, + { + .descr = "RA = LONG_MIN | INT_MIN, RB = LONG_MIN | INT_MIN", + .instr = TEST_ADDC_DOT(20, 21, 22), + .regs = { + .gpr[21] = LONG_MIN | (uint)INT_MIN, + .gpr[22] = LONG_MIN | (uint)INT_MIN, + } + } + } + } +}; + +static int __init emulate_compute_instr(struct pt_regs *regs, + unsigned int instr) +{ + struct instruction_op op; + + if (!regs || !instr) + return -EINVAL; + + if (analyse_instr(&op, regs, instr) != 1 || + GETTYPE(op.type) != COMPUTE) { + pr_info("emulation failed, instruction = 0x%08x\n", instr); + return -EFAULT; + } + + emulate_update_regs(regs, &op); + return 0; +} + +static int __init execute_compute_instr(struct pt_regs *regs, + unsigned int instr) +{ + extern int exec_instr(struct pt_regs *regs); + extern s32 patch__exec_instr; + + if (!regs || !instr) + return -EINVAL; + + /* Patch the NOP with the actual instruction */ + patch_instruction_site(&patch__exec_instr, instr); + if (exec_instr(regs)) { + pr_info("execution failed, instruction = 0x%08x\n", instr); + return -EFAULT; + } + + return 0; +} + +#define gpr_mismatch(gprn, exp, got) \ + pr_info("GPR%u mismatch, exp = 0x%016lx, got = 0x%016lx\n", \ + gprn, exp, got) + +#define reg_mismatch(name, exp, got) \ + pr_info("%s mismatch, exp = 0x%016lx, got = 0x%016lx\n", \ + name, exp, got) + +static void __init run_tests_compute(void) +{ + unsigned long flags; + struct compute_test *test; + struct pt_regs *regs, exp, got; + unsigned int i, j, k, instr; + bool ignore_gpr, ignore_xer, ignore_ccr, passed; + + for (i = 0; i < ARRAY_SIZE(compute_tests); i++) { + test = &compute_tests[i]; + + for (j = 0; j < MAX_SUBTESTS && test->subtests[j].descr; j++) { + instr = test->subtests[j].instr; + flags = test->subtests[j].flags; + regs = &test->subtests[j].regs; + ignore_xer = flags & IGNORE_XER; + ignore_ccr = flags & IGNORE_CCR; + passed = true; + + memcpy(&exp, regs, sizeof(struct pt_regs)); + memcpy(&got, regs, sizeof(struct pt_regs)); + + /* + * Set a compatible MSR value explicitly to ensure + * that XER and CR bits are updated appropriately + */ + exp.msr = MSR_KERNEL; + got.msr = MSR_KERNEL; + + if (emulate_compute_instr(&got, instr) || + execute_compute_instr(&exp, instr)) { + passed = false; + goto print; + } + + /* Verify GPR values */ + for (k = 0; k < 32; k++) { + ignore_gpr = flags & IGNORE_GPR(k); + if (!ignore_gpr && exp.gpr[k] != got.gpr[k]) { + passed = false; + gpr_mismatch(k, exp.gpr[k], got.gpr[k]); + } + } + + /* Verify LR value */ + if (exp.link != got.link) { + passed = false; + reg_mismatch("LR", exp.link, got.link); + } + + /* Verify XER value */ + if (!ignore_xer && exp.xer != got.xer) { + passed = false; + reg_mismatch("XER", exp.xer, got.xer); + } + + /* Verify CR value */ + if (!ignore_ccr && exp.ccr != got.ccr) { + passed = false; + reg_mismatch("CR", exp.ccr, got.ccr); + } + +print: + show_result_with_descr(test->mnemonic, + test->subtests[j].descr, + passed ? "PASS" : "FAIL"); + } + } +} + +static int __init test_emulate_step(void) +{ + printk(KERN_INFO "Running instruction emulation self-tests ...\n"); + run_tests_load_store(); + run_tests_compute(); return 0; } diff --git a/arch/powerpc/lib/test_emulate_step_exec_instr.S b/arch/powerpc/lib/test_emulate_step_exec_instr.S new file mode 100644 index 000000000000..1580f34f4f4f --- /dev/null +++ b/arch/powerpc/lib/test_emulate_step_exec_instr.S @@ -0,0 +1,150 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Non-emulated single-stepping support (currently limited to basic integer + * computations) used to validate the instruction emulation infrastructure. + * + * Copyright (C) 2019 IBM Corporation + */ + +#include <asm/asm-offsets.h> +#include <asm/ppc_asm.h> +#include <asm/code-patching-asm.h> +#include <linux/errno.h> + +/* int exec_instr(struct pt_regs *regs) */ +_GLOBAL(exec_instr) + + /* + * Stack frame layout (INT_FRAME_SIZE bytes) + * In-memory pt_regs (SP + STACK_FRAME_OVERHEAD) + * Scratch space (SP + 8) + * Back chain (SP + 0) + */ + + /* + * Allocate a new stack frame with enough space to hold the register + * states in an in-memory pt_regs and also create the back chain to + * the caller's stack frame. + */ + stdu r1, -INT_FRAME_SIZE(r1) + + /* + * Save non-volatile GPRs on stack. This includes TOC pointer (GPR2) + * and local variables (GPR14 to GPR31). The register for the pt_regs + * parameter (GPR3) is saved additionally to ensure that the resulting + * register state can still be saved even if GPR3 gets overwritten + * when loading the initial register state for the test instruction. + * The stack pointer (GPR1) and the thread pointer (GPR13) are not + * saved as these should not be modified anyway. + */ + SAVE_2GPRS(2, r1) + SAVE_NVGPRS(r1) + + /* + * Save LR on stack to ensure that the return address is available + * even if it gets overwritten by the test instruction. + */ + mflr r0 + std r0, _LINK(r1) + + /* + * Save CR on stack. For simplicity, the entire register is saved + * even though only fields 2 to 4 are non-volatile. + */ + mfcr r0 + std r0, _CCR(r1) + + /* + * Load register state for the test instruction without touching the + * critical non-volatile registers. The register state is passed as a + * pointer to a pt_regs instance. + */ + subi r31, r3, GPR0 + + /* Load LR from pt_regs */ + ld r0, _LINK(r31) + mtlr r0 + + /* Load CR from pt_regs */ + ld r0, _CCR(r31) + mtcr r0 + + /* Load XER from pt_regs */ + ld r0, _XER(r31) + mtxer r0 + + /* Load GPRs from pt_regs */ + REST_GPR(0, r31) + REST_10GPRS(2, r31) + REST_GPR(12, r31) + REST_NVGPRS(r31) + + /* Placeholder for the test instruction */ +1: nop + patch_site 1b patch__exec_instr + + /* + * Since GPR3 is overwritten, temporarily restore it back to its + * original state, i.e. the pointer to pt_regs, to ensure that the + * resulting register state can be saved. Before doing this, a copy + * of it is created in the scratch space which is used later on to + * save it to pt_regs. + */ + std r3, 8(r1) + REST_GPR(3, r1) + + /* Save resulting GPR state to pt_regs */ + subi r3, r3, GPR0 + SAVE_GPR(0, r3) + SAVE_GPR(2, r3) + SAVE_8GPRS(4, r3) + SAVE_GPR(12, r3) + SAVE_NVGPRS(r3) + + /* Save resulting LR to pt_regs */ + mflr r0 + std r0, _LINK(r3) + + /* Save resulting CR to pt_regs */ + mfcr r0 + std r0, _CCR(r3) + + /* Save resulting XER to pt_regs */ + mfxer r0 + std r0, _XER(r3) + + /* Restore resulting GPR3 from scratch space and save it to pt_regs */ + ld r0, 8(r1) + std r0, GPR3(r3) + + /* Set return value to denote execution success */ + li r3, 0 + + /* Continue */ + b 3f + + /* Set return value to denote execution failure */ +2: li r3, -EFAULT + + /* Restore the non-volatile GPRs from stack */ +3: REST_GPR(2, r1) + REST_NVGPRS(r1) + + /* Restore LR from stack to be able to return */ + ld r0, _LINK(r1) + mtlr r0 + + /* Restore CR from stack */ + ld r0, _CCR(r1) + mtcr r0 + + /* Tear down stack frame */ + addi r1, r1, INT_FRAME_SIZE + + /* Return */ + blr + + /* Setup exception table */ + EX_TABLE(1b, 2b) + +_ASM_NOKPROBE_SYMBOL(exec_instr) diff --git a/arch/powerpc/math-emu/Makefile b/arch/powerpc/math-emu/Makefile index 494df26c5988..a8794032f15f 100644 --- a/arch/powerpc/math-emu/Makefile +++ b/arch/powerpc/math-emu/Makefile @@ -17,4 +17,4 @@ obj-$(CONFIG_SPE) += math_efp.o CFLAGS_fabs.o = -fno-builtin-fabs CFLAGS_math.o = -fno-builtin-fabs -ccflags-y = -I. -Iinclude/math-emu -w +ccflags-y = -w diff --git a/arch/powerpc/mm/40x_mmu.c b/arch/powerpc/mm/40x_mmu.c index 61ac468c87c6..b9cf6f8764b0 100644 --- a/arch/powerpc/mm/40x_mmu.c +++ b/arch/powerpc/mm/40x_mmu.c @@ -93,7 +93,7 @@ void __init MMU_init_hw(void) #define LARGE_PAGE_SIZE_16M (1<<24) #define LARGE_PAGE_SIZE_4M (1<<22) -unsigned long __init mmu_mapin_ram(unsigned long top) +unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) { unsigned long v, s, mapped; phys_addr_t p; diff --git a/arch/powerpc/mm/44x_mmu.c b/arch/powerpc/mm/44x_mmu.c index ea2b9af08a48..aad127acdbaa 100644 --- a/arch/powerpc/mm/44x_mmu.c +++ b/arch/powerpc/mm/44x_mmu.c @@ -170,7 +170,7 @@ void __init MMU_init_hw(void) flush_instruction_cache(); } -unsigned long __init mmu_mapin_ram(unsigned long top) +unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) { unsigned long addr; unsigned long memstart = memstart_addr & ~(PPC_PIN_SIZE - 1); diff --git a/arch/powerpc/mm/8xx_mmu.c b/arch/powerpc/mm/8xx_mmu.c index bfa503cff351..fe1f6443d57f 100644 --- a/arch/powerpc/mm/8xx_mmu.c +++ b/arch/powerpc/mm/8xx_mmu.c @@ -66,26 +66,22 @@ unsigned long p_block_mapped(phys_addr_t pa) void __init MMU_init_hw(void) { /* PIN up to the 3 first 8Mb after IMMR in DTLB table */ -#ifdef CONFIG_PIN_TLB_DATA - unsigned long ctr = mfspr(SPRN_MD_CTR) & 0xfe000000; - unsigned long flags = 0xf0 | MD_SPS16K | _PAGE_SH | _PAGE_DIRTY; -#ifdef CONFIG_PIN_TLB_IMMR - int i = 29; -#else - int i = 28; -#endif - unsigned long addr = 0; - unsigned long mem = total_lowmem; - - for (; i < 32 && mem >= LARGE_PAGE_SIZE_8M; i++) { - mtspr(SPRN_MD_CTR, ctr | (i << 8)); - mtspr(SPRN_MD_EPN, (unsigned long)__va(addr) | MD_EVALID); - mtspr(SPRN_MD_TWC, MD_PS8MEG | MD_SVALID); - mtspr(SPRN_MD_RPN, addr | flags | _PAGE_PRESENT); - addr += LARGE_PAGE_SIZE_8M; - mem -= LARGE_PAGE_SIZE_8M; + if (IS_ENABLED(CONFIG_PIN_TLB_DATA)) { + unsigned long ctr = mfspr(SPRN_MD_CTR) & 0xfe000000; + unsigned long flags = 0xf0 | MD_SPS16K | _PAGE_SH | _PAGE_DIRTY; + int i = IS_ENABLED(CONFIG_PIN_TLB_IMMR) ? 29 : 28; + unsigned long addr = 0; + unsigned long mem = total_lowmem; + + for (; i < 32 && mem >= LARGE_PAGE_SIZE_8M; i++) { + mtspr(SPRN_MD_CTR, ctr | (i << 8)); + mtspr(SPRN_MD_EPN, (unsigned long)__va(addr) | MD_EVALID); + mtspr(SPRN_MD_TWC, MD_PS8MEG | MD_SVALID); + mtspr(SPRN_MD_RPN, addr | flags | _PAGE_PRESENT); + addr += LARGE_PAGE_SIZE_8M; + mem -= LARGE_PAGE_SIZE_8M; + } } -#endif } static void __init mmu_mapin_immr(void) @@ -98,26 +94,36 @@ static void __init mmu_mapin_immr(void) map_kernel_page(v + offset, p + offset, PAGE_KERNEL_NCG); } -static void __init mmu_patch_cmp_limit(s32 *site, unsigned long mapped) +static void mmu_patch_cmp_limit(s32 *site, unsigned long mapped) { modify_instruction_site(site, 0xffff, (unsigned long)__va(mapped) >> 16); } -unsigned long __init mmu_mapin_ram(unsigned long top) +static void mmu_patch_addis(s32 *site, long simm) +{ + unsigned int instr = *(unsigned int *)patch_site_addr(site); + + instr &= 0xffff0000; + instr |= ((unsigned long)simm) >> 16; + patch_instruction_site(site, instr); +} + +unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) { unsigned long mapped; if (__map_without_ltlbs) { mapped = 0; mmu_mapin_immr(); -#ifndef CONFIG_PIN_TLB_IMMR - patch_instruction_site(&patch__dtlbmiss_immr_jmp, PPC_INST_NOP); -#endif -#ifndef CONFIG_PIN_TLB_TEXT - mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, 0); -#endif + if (!IS_ENABLED(CONFIG_PIN_TLB_IMMR)) + patch_instruction_site(&patch__dtlbmiss_immr_jmp, PPC_INST_NOP); + if (!IS_ENABLED(CONFIG_PIN_TLB_TEXT)) + mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, 0); } else { mapped = top & ~(LARGE_PAGE_SIZE_8M - 1); + if (!IS_ENABLED(CONFIG_PIN_TLB_TEXT)) + mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, + _ALIGN(__pa(_einittext), 8 << 20)); } mmu_patch_cmp_limit(&patch__dtlbmiss_linmem_top, mapped); @@ -138,6 +144,26 @@ unsigned long __init mmu_mapin_ram(unsigned long top) return mapped; } +void mmu_mark_initmem_nx(void) +{ + if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX) && CONFIG_ETEXT_SHIFT < 23) + mmu_patch_addis(&patch__itlbmiss_linmem_top8, + -((long)_etext & ~(LARGE_PAGE_SIZE_8M - 1))); + if (!IS_ENABLED(CONFIG_PIN_TLB_TEXT)) + mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, __pa(_etext)); +} + +#ifdef CONFIG_STRICT_KERNEL_RWX +void mmu_mark_rodata_ro(void) +{ + if (CONFIG_DATA_SHIFT < 23) + mmu_patch_addis(&patch__dtlbmiss_romem_top8, + -__pa(((unsigned long)_sinittext) & + ~(LARGE_PAGE_SIZE_8M - 1))); + mmu_patch_addis(&patch__dtlbmiss_romem_top, -__pa(_sinittext)); +} +#endif + void __init setup_initial_memory_limit(phys_addr_t first_memblock_base, phys_addr_t first_memblock_size) { @@ -146,8 +172,8 @@ void __init setup_initial_memory_limit(phys_addr_t first_memblock_base, */ BUG_ON(first_memblock_base != 0); - /* 8xx can only access 24MB at the moment */ - memblock_set_current_limit(min_t(u64, first_memblock_size, 0x01800000)); + /* 8xx can only access 32MB at the moment */ + memblock_set_current_limit(min_t(u64, first_memblock_size, 0x02000000)); } /* @@ -162,14 +188,11 @@ void set_context(unsigned long id, pgd_t *pgd) { s16 offset = (s16)(__pa(swapper_pg_dir)); -#ifdef CONFIG_BDI_SWITCH - pgd_t **ptr = *(pgd_t ***)(KERNELBASE + 0xf0); - /* Context switch the PTE pointer for the Abatron BDI2000. * The PGDIR is passed as second argument. */ - *(ptr + 1) = pgd; -#endif + if (IS_ENABLED(CONFIG_BDI_SWITCH)) + abatron_pteptrs[1] = pgd; /* Register M_TWB will contain base address of level 1 table minus the * lower part of the kernel PGDIR base address, so that all accesses to diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index f965fc33a8b7..d52ec118e09d 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -45,13 +45,10 @@ obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o obj-$(CONFIG_HIGHMEM) += highmem.o obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o obj-$(CONFIG_SPAPR_TCE_IOMMU) += mmu_context_iommu.o -obj-$(CONFIG_PPC_PTDUMP) += dump_linuxpagetables.o -ifdef CONFIG_PPC_PTDUMP -obj-$(CONFIG_4xx) += dump_linuxpagetables-generic.o -obj-$(CONFIG_PPC_8xx) += dump_linuxpagetables-8xx.o -obj-$(CONFIG_PPC_BOOK3E_MMU) += dump_linuxpagetables-generic.o -obj-$(CONFIG_PPC_BOOK3S_32) += dump_linuxpagetables-generic.o dump_bats.o dump_sr.o -obj-$(CONFIG_PPC_BOOK3S_64) += dump_linuxpagetables-book3s64.o -endif -obj-$(CONFIG_PPC_HTDUMP) += dump_hashpagetable.o +obj-$(CONFIG_PPC_PTDUMP) += ptdump/ obj-$(CONFIG_PPC_MEM_KEYS) += pkeys.o + +# Disable kcov instrumentation on sensitive code +# This is necessary for booting with kcov enabled on book3e machines +KCOV_INSTRUMENT_tlb_nohash.o := n +KCOV_INSTRUMENT_fsl_booke_mmu.o := n diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c index e955539686a4..b5d2658c26af 100644 --- a/arch/powerpc/mm/dma-noncoherent.c +++ b/arch/powerpc/mm/dma-noncoherent.c @@ -30,6 +30,7 @@ #include <linux/types.h> #include <linux/highmem.h> #include <linux/dma-direct.h> +#include <linux/dma-noncoherent.h> #include <linux/export.h> #include <asm/tlbflush.h> @@ -151,8 +152,8 @@ static struct ppc_vm_region *ppc_vm_region_find(struct ppc_vm_region *head, unsi * Allocate DMA-coherent memory space and return both the kernel remapped * virtual and bus address for that space. */ -void *__dma_nommu_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) +void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, + gfp_t gfp, unsigned long attrs) { struct page *page; struct ppc_vm_region *c; @@ -253,7 +254,7 @@ void *__dma_nommu_alloc_coherent(struct device *dev, size_t size, /* * free a page as defined by the above mapping. */ -void __dma_nommu_free_coherent(struct device *dev, size_t size, void *vaddr, +void arch_dma_free(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle, unsigned long attrs) { struct ppc_vm_region *c; @@ -313,7 +314,7 @@ void __dma_nommu_free_coherent(struct device *dev, size_t size, void *vaddr, /* * make an area consistent. */ -void __dma_sync(void *vaddr, size_t size, int direction) +static void __dma_sync(void *vaddr, size_t size, int direction) { unsigned long start = (unsigned long)vaddr; unsigned long end = start + size; @@ -339,7 +340,6 @@ void __dma_sync(void *vaddr, size_t size, int direction) break; } } -EXPORT_SYMBOL(__dma_sync); #ifdef CONFIG_HIGHMEM /* @@ -386,28 +386,42 @@ static inline void __dma_sync_page_highmem(struct page *page, * __dma_sync_page makes memory consistent. identical to __dma_sync, but * takes a struct page instead of a virtual address */ -void __dma_sync_page(struct page *page, unsigned long offset, - size_t size, int direction) +static void __dma_sync_page(phys_addr_t paddr, size_t size, int dir) { + struct page *page = pfn_to_page(paddr >> PAGE_SHIFT); + unsigned offset = paddr & ~PAGE_MASK; + #ifdef CONFIG_HIGHMEM - __dma_sync_page_highmem(page, offset, size, direction); + __dma_sync_page_highmem(page, offset, size, dir); #else unsigned long start = (unsigned long)page_address(page) + offset; - __dma_sync((void *)start, size, direction); + __dma_sync((void *)start, size, dir); #endif } -EXPORT_SYMBOL(__dma_sync_page); + +void arch_sync_dma_for_device(struct device *dev, phys_addr_t paddr, + size_t size, enum dma_data_direction dir) +{ + __dma_sync_page(paddr, size, dir); +} + +void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, + size_t size, enum dma_data_direction dir) +{ + __dma_sync_page(paddr, size, dir); +} /* - * Return the PFN for a given cpu virtual address returned by - * __dma_nommu_alloc_coherent. This is used by dma_mmap_coherent() + * Return the PFN for a given cpu virtual address returned by arch_dma_alloc. */ -unsigned long __dma_get_coherent_pfn(unsigned long cpu_addr) +long arch_dma_coherent_to_pfn(struct device *dev, void *vaddr, + dma_addr_t dma_addr) { /* This should always be populated, so we don't test every * level. If that fails, we'll have a nice crash which * will be as good as a BUG_ON() */ + unsigned long cpu_addr = (unsigned long)vaddr; pgd_t *pgd = pgd_offset_k(cpu_addr); pud_t *pud = pud_offset(pgd, cpu_addr); pmd_t *pmd = pmd_offset(pud, cpu_addr); diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c index 080d49b26c3a..210cbc1faf63 100644 --- a/arch/powerpc/mm/fsl_booke_mmu.c +++ b/arch/powerpc/mm/fsl_booke_mmu.c @@ -221,7 +221,7 @@ unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx, bool dryrun) #error "LOWMEM_CAM_NUM must be less than NUM_TLBCAMS" #endif -unsigned long __init mmu_mapin_ram(unsigned long top) +unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) { return tlbcam_addrs[tlbcam_index - 1].limit - PAGE_OFFSET + 1; } diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S index 1e2df3e9f9ea..1f13494efb2b 100644 --- a/arch/powerpc/mm/hash_low_32.S +++ b/arch/powerpc/mm/hash_low_32.S @@ -47,14 +47,13 @@ mmu_hash_lock: * Returns to the caller if the access is illegal or there is no * mapping for the address. Otherwise it places an appropriate PTE * in the hash table and returns from the exception. - * Uses r0, r3 - r8, r10, ctr, lr. + * Uses r0, r3 - r6, r8, r10, ctr, lr. */ .text _GLOBAL(hash_page) - tophys(r7,0) /* gets -KERNELBASE into r7 */ #ifdef CONFIG_SMP - addis r8,r7,mmu_hash_lock@h - ori r8,r8,mmu_hash_lock@l + lis r8, (mmu_hash_lock - PAGE_OFFSET)@h + ori r8, r8, (mmu_hash_lock - PAGE_OFFSET)@l lis r0,0x0fff b 10f 11: lwz r6,0(r8) @@ -70,14 +69,13 @@ _GLOBAL(hash_page) /* Get PTE (linux-style) and check access */ lis r0,KERNELBASE@h /* check if kernel address */ cmplw 0,r4,r0 - mfspr r8,SPRN_SPRG_THREAD /* current task's THREAD (phys) */ ori r3,r3,_PAGE_USER|_PAGE_PRESENT /* test low addresses as user */ - lwz r5,PGDIR(r8) /* virt page-table root */ + mfspr r5, SPRN_SPRG_PGDIR /* virt page-table root */ blt+ 112f /* assume user more likely */ lis r5,swapper_pg_dir@ha /* if kernel address, use */ addi r5,r5,swapper_pg_dir@l /* kernel page table */ rlwimi r3,r9,32-12,29,29 /* MSR_PR -> _PAGE_USER */ -112: add r5,r5,r7 /* convert to phys addr */ +112: tophys(r5, r5) #ifndef CONFIG_PTE_64BIT rlwimi r5,r4,12,20,29 /* insert top 10 bits of address */ lwz r8,0(r5) /* get pmd entry */ @@ -144,25 +142,24 @@ retry: #ifdef CONFIG_SMP eieio - addis r8,r7,mmu_hash_lock@ha + lis r8, (mmu_hash_lock - PAGE_OFFSET)@ha li r0,0 - stw r0,mmu_hash_lock@l(r8) + stw r0, (mmu_hash_lock - PAGE_OFFSET)@l(r8) #endif /* Return from the exception */ lwz r5,_CTR(r11) mtctr r5 lwz r0,GPR0(r11) - lwz r7,GPR7(r11) lwz r8,GPR8(r11) b fast_exception_return #ifdef CONFIG_SMP hash_page_out: eieio - addis r8,r7,mmu_hash_lock@ha + lis r8, (mmu_hash_lock - PAGE_OFFSET)@ha li r0,0 - stw r0,mmu_hash_lock@l(r8) + stw r0, (mmu_hash_lock - PAGE_OFFSET)@l(r8) blr #endif /* CONFIG_SMP */ @@ -186,8 +183,7 @@ _GLOBAL(add_hash_page) add r3,r3,r0 /* note create_hpte trims to 24 bits */ #ifdef CONFIG_SMP - CURRENT_THREAD_INFO(r8, r1) /* use cpu number to make tag */ - lwz r8,TI_CPU(r8) /* to go in mmu_hash_lock */ + lwz r8,TASK_CPU(r2) /* to go in mmu_hash_lock */ oris r8,r8,12 #endif /* CONFIG_SMP */ @@ -208,11 +204,9 @@ _GLOBAL(add_hash_page) SYNC_601 isync - tophys(r7,0) - #ifdef CONFIG_SMP - addis r6,r7,mmu_hash_lock@ha - addi r6,r6,mmu_hash_lock@l + lis r6, (mmu_hash_lock - PAGE_OFFSET)@ha + addi r6, r6, (mmu_hash_lock - PAGE_OFFSET)@l 10: lwarx r0,0,r6 /* take the mmu_hash_lock */ cmpi 0,r0,0 bne- 11f @@ -257,8 +251,8 @@ _GLOBAL(add_hash_page) 9: #ifdef CONFIG_SMP - addis r6,r7,mmu_hash_lock@ha - addi r6,r6,mmu_hash_lock@l + lis r6, (mmu_hash_lock - PAGE_OFFSET)@ha + addi r6, r6, (mmu_hash_lock - PAGE_OFFSET)@l eieio li r0,0 stw r0,0(r6) /* clear mmu_hash_lock */ @@ -278,10 +272,8 @@ _GLOBAL(add_hash_page) * It is designed to be called with the MMU either on or off. * r3 contains the VSID, r4 contains the virtual address, * r5 contains the linux PTE, r6 contains the old value of the - * linux PTE (before setting _PAGE_HASHPTE) and r7 contains the - * offset to be added to addresses (0 if the MMU is on, - * -KERNELBASE if it is off). r10 contains the upper half of - * the PTE if CONFIG_PTE_64BIT. + * linux PTE (before setting _PAGE_HASHPTE). r10 contains the + * upper half of the PTE if CONFIG_PTE_64BIT. * On SMP, the caller should have the mmu_hash_lock held. * We assume that the caller has (or will) set the _PAGE_HASHPTE * bit in the linux PTE in memory. The value passed in r6 should @@ -342,7 +334,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) patch_site 1f, patch__hash_page_A1 patch_site 2f, patch__hash_page_A2 /* Get the address of the primary PTE group in the hash table (r3) */ -0: addis r0,r7,Hash_base@h /* base address of hash table */ +0: lis r0, (Hash_base - PAGE_OFFSET)@h /* base address of hash table */ 1: rlwimi r0,r3,LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* VSID -> hash */ 2: rlwinm r3,r4,20+LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* PI -> hash */ xor r3,r3,r0 /* make primary hash */ @@ -356,10 +348,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) beq+ 10f /* no PTE: go look for an empty slot */ tlbie r4 - addis r4,r7,htab_hash_searches@ha - lwz r6,htab_hash_searches@l(r4) + lis r4, (htab_hash_searches - PAGE_OFFSET)@ha + lwz r6, (htab_hash_searches - PAGE_OFFSET)@l(r4) addi r6,r6,1 /* count how many searches we do */ - stw r6,htab_hash_searches@l(r4) + stw r6, (htab_hash_searches - PAGE_OFFSET)@l(r4) /* Search the primary PTEG for a PTE whose 1st (d)word matches r5 */ mtctr r0 @@ -391,10 +383,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) beq+ found_empty /* update counter of times that the primary PTEG is full */ - addis r4,r7,primary_pteg_full@ha - lwz r6,primary_pteg_full@l(r4) + lis r4, (primary_pteg_full - PAGE_OFFSET)@ha + lwz r6, (primary_pteg_full - PAGE_OFFSET)@l(r4) addi r6,r6,1 - stw r6,primary_pteg_full@l(r4) + stw r6, (primary_pteg_full - PAGE_OFFSET)@l(r4) patch_site 0f, patch__hash_page_C /* Search the secondary PTEG for an empty slot */ @@ -428,8 +420,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) * lockup here but that shouldn't happen */ -1: addis r4,r7,next_slot@ha /* get next evict slot */ - lwz r6,next_slot@l(r4) +1: lis r4, (next_slot - PAGE_OFFSET)@ha /* get next evict slot */ + lwz r6, (next_slot - PAGE_OFFSET)@l(r4) addi r6,r6,HPTE_SIZE /* search for candidate */ andi. r6,r6,7*HPTE_SIZE stw r6,next_slot@l(r4) @@ -501,8 +493,6 @@ htab_hash_searches: * We assume that there is a hash table in use (Hash != 0). */ _GLOBAL(flush_hash_pages) - tophys(r7,0) - /* * We disable interrupts here, even on UP, because we want * the _PAGE_HASHPTE bit to be a reliable indication of @@ -547,11 +537,9 @@ _GLOBAL(flush_hash_pages) SET_V(r11) /* set V (valid) bit */ #ifdef CONFIG_SMP - addis r9,r7,mmu_hash_lock@ha - addi r9,r9,mmu_hash_lock@l - CURRENT_THREAD_INFO(r8, r1) - add r8,r8,r7 - lwz r8,TI_CPU(r8) + lis r9, (mmu_hash_lock - PAGE_OFFSET)@ha + addi r9, r9, (mmu_hash_lock - PAGE_OFFSET)@l + lwz r8,TASK_CPU(r2) oris r8,r8,9 10: lwarx r0,0,r9 cmpi 0,r0,0 @@ -584,7 +572,7 @@ _GLOBAL(flush_hash_pages) patch_site 1f, patch__flush_hash_A1 patch_site 2f, patch__flush_hash_A2 /* Get the address of the primary PTE group in the hash table (r3) */ -0: addis r8,r7,Hash_base@h /* base address of hash table */ +0: lis r8, (Hash_base - PAGE_OFFSET)@h /* base address of hash table */ 1: rlwimi r8,r3,LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* VSID -> hash */ 2: rlwinm r0,r4,20+LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* PI -> hash */ xor r8,r0,r8 /* make primary hash */ @@ -646,8 +634,7 @@ EXPORT_SYMBOL(flush_hash_pages) */ _GLOBAL(_tlbie) #ifdef CONFIG_SMP - CURRENT_THREAD_INFO(r8, r1) - lwz r8,TI_CPU(r8) + lwz r8,TASK_CPU(r2) oris r8,r8,11 mfmsr r10 SYNC @@ -684,8 +671,7 @@ _GLOBAL(_tlbie) */ _GLOBAL(_tlbia) #if defined(CONFIG_SMP) - CURRENT_THREAD_INFO(r8, r1) - lwz r8,TI_CPU(r8) + lwz r8,TASK_CPU(r2) oris r8,r8,10 mfmsr r10 SYNC diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index bc6be44913d4..3d4b2399192f 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -1889,12 +1889,12 @@ static int hpt_order_set(void *data, u64 val) return mmu_hash_ops.resize_hpt(val); } -DEFINE_SIMPLE_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n"); static int __init hash64_debugfs(void) { - if (!debugfs_create_file("hpt_order", 0600, powerpc_debugfs_root, - NULL, &fops_hpt_order)) { + if (!debugfs_create_file_unsafe("hpt_order", 0600, powerpc_debugfs_root, + NULL, &fops_hpt_order)) { pr_err("lpar: unable to create hpt_order debugsfs file\n"); } diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c index 367ce3a4a503..b0d9209d9a86 100644 --- a/arch/powerpc/mm/hugetlbpage-hash64.c +++ b/arch/powerpc/mm/hugetlbpage-hash64.c @@ -26,7 +26,7 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, real_pte_t rpte; unsigned long vpn; unsigned long old_pte, new_pte; - unsigned long rflags, pa, sz; + unsigned long rflags, pa; long slot, offset; BUG_ON(shift != mmu_psize_defs[mmu_psize].shift); @@ -73,7 +73,6 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, offset = PTRS_PER_PMD; rpte = __real_pte(__pte(old_pte), ptep, offset); - sz = ((1UL) << shift); if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) /* No CPU has hugepages but lacks no execute, so we * don't need to worry about that case */ diff --git a/arch/powerpc/mm/hugetlbpage-radix.c b/arch/powerpc/mm/hugetlbpage-radix.c index 11d9ea28a816..cab06331c0c0 100644 --- a/arch/powerpc/mm/hugetlbpage-radix.c +++ b/arch/powerpc/mm/hugetlbpage-radix.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/mm.h> #include <linux/hugetlb.h> +#include <linux/security.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> #include <asm/cacheflush.h> @@ -73,7 +74,7 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr, if (addr) { addr = ALIGN(addr, huge_page_size(h)); vma = find_vma(mm, addr); - if (high_limit - len >= addr && + if (high_limit - len >= addr && addr >= mmap_min_addr && (!vma || addr + len <= vm_start_gap(vma))) return addr; } @@ -83,7 +84,7 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr, */ info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; - info.low_limit = PAGE_SIZE; + info.low_limit = max(PAGE_SIZE, mmap_min_addr); info.high_limit = mm->mmap_base + (high_limit - DEFAULT_MAP_WINDOW); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index 3e59e5d64b01..41a3513cadc9 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -108,12 +108,8 @@ static void __init MMU_setup(void) __map_without_bats = 1; __map_without_ltlbs = 1; } -#ifdef CONFIG_STRICT_KERNEL_RWX - if (rodata_enabled) { - __map_without_bats = 1; + if (strict_kernel_rwx_enabled() && !IS_ENABLED(CONFIG_PPC_8xx)) __map_without_ltlbs = 1; - } -#endif } /* diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index a5091c034747..a4c155af1597 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -274,7 +274,6 @@ void __ref vmemmap_free(unsigned long start, unsigned long end, for (; start < end; start += page_size) { unsigned long nr_pages, addr; - struct page *section_base; struct page *page; /* @@ -290,7 +289,6 @@ void __ref vmemmap_free(unsigned long start, unsigned long end, continue; page = pfn_to_page(addr >> PAGE_SHIFT); - section_base = pfn_to_page(vmemmap_section_start(start)); nr_pages = 1 << page_order; base_pfn = PHYS_PFN(addr); diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 33cc6f676fa6..f6787f90e158 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -69,22 +69,14 @@ pte_t *kmap_pte; EXPORT_SYMBOL(kmap_pte); pgprot_t kmap_prot; EXPORT_SYMBOL(kmap_prot); -#define TOP_ZONE ZONE_HIGHMEM static inline pte_t *virt_to_kpte(unsigned long vaddr) { return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), vaddr), vaddr); } -#else -#define TOP_ZONE ZONE_NORMAL #endif -int page_is_ram(unsigned long pfn) -{ - return memblock_is_memory(__pfn_to_phys(pfn)); -} - pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot) { @@ -176,34 +168,6 @@ int __meminit arch_remove_memory(int nid, u64 start, u64 size, #endif #endif /* CONFIG_MEMORY_HOTPLUG */ -/* - * walk_memory_resource() needs to make sure there is no holes in a given - * memory range. PPC64 does not maintain the memory layout in /proc/iomem. - * Instead it maintains it in memblock.memory structures. Walk through the - * memory regions, find holes and callback for contiguous regions. - */ -int -walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, - void *arg, int (*func)(unsigned long, unsigned long, void *)) -{ - struct memblock_region *reg; - unsigned long end_pfn = start_pfn + nr_pages; - unsigned long tstart, tend; - int ret = -1; - - for_each_memblock(memory, reg) { - tstart = max(start_pfn, memblock_region_memory_base_pfn(reg)); - tend = min(end_pfn, memblock_region_memory_end_pfn(reg)); - if (tstart >= tend) - continue; - ret = (*func)(tstart, tend - tstart, arg); - if (ret) - break; - } - return ret; -} -EXPORT_SYMBOL_GPL(walk_system_ram_range); - #ifndef CONFIG_NEED_MULTIPLE_NODES void __init mem_topology_setup(void) { @@ -262,25 +226,6 @@ static int __init mark_nonram_nosave(void) static unsigned long max_zone_pfns[MAX_NR_ZONES]; /* - * Find the least restrictive zone that is entirely below the - * specified pfn limit. Returns < 0 if no suitable zone is found. - * - * pfn_limit must be u64 because it can exceed 32 bits even on 32-bit - * systems -- the DMA limit can be higher than any possible real pfn. - */ -int dma_pfn_limit_to_zone(u64 pfn_limit) -{ - int i; - - for (i = TOP_ZONE; i >= 0; i--) { - if (max_zone_pfns[i] <= pfn_limit) - return i; - } - - return -EPERM; -} - -/* * paging_init() sets up the page tables - in fact we've already done this. */ void __init paging_init(void) @@ -585,3 +530,9 @@ int devmem_is_allowed(unsigned long pfn) return 0; } #endif /* CONFIG_STRICT_DEVMEM */ + +/* + * This is defined in kernel/resource.c but only powerpc needs to export it, for + * the EHEA driver. Drop this when drivers/net/ethernet/ibm/ehea is removed. + */ +EXPORT_SYMBOL_GPL(walk_system_ram_range); diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index c4a717da65eb..74ff61dabcb1 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -130,7 +130,7 @@ extern void wii_memory_fixups(void); */ #ifdef CONFIG_PPC32 extern void MMU_init_hw(void); -extern unsigned long mmu_mapin_ram(unsigned long top); +unsigned long mmu_mapin_ram(unsigned long base, unsigned long top); #endif #ifdef CONFIG_PPC_FSL_BOOK3E @@ -165,3 +165,11 @@ unsigned long p_block_mapped(phys_addr_t pa); static inline phys_addr_t v_block_mapped(unsigned long va) { return 0; } static inline unsigned long p_block_mapped(phys_addr_t pa) { return 0; } #endif + +#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_PPC_8xx) +void mmu_mark_initmem_nx(void); +void mmu_mark_rodata_ro(void); +#else +static inline void mmu_mark_initmem_nx(void) { } +static inline void mmu_mark_rodata_ro(void) { } +#endif diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index df1e11ebbabb..ac49e4158e50 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1460,13 +1460,6 @@ static void reset_topology_timer(void) #ifdef CONFIG_SMP -static void stage_topology_update(int core_id) -{ - cpumask_or(&cpu_associativity_changes_mask, - &cpu_associativity_changes_mask, cpu_sibling_mask(core_id)); - reset_topology_timer(); -} - static int dt_update_callback(struct notifier_block *nb, unsigned long action, void *data) { @@ -1479,7 +1472,7 @@ static int dt_update_callback(struct notifier_block *nb, !of_prop_cmp(update->prop->name, "ibm,associativity")) { u32 core_id; of_property_read_u32(update->dn, "reg", &core_id); - stage_topology_update(core_id); + rc = dlpar_cpu_readd(core_id); rc = NOTIFY_OK; } break; diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index ded71126ce4c..6e56a6240bfa 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -254,26 +254,20 @@ static void __init __mapin_ram_chunk(unsigned long offset, unsigned long top) void __init mapin_ram(void) { - unsigned long s, top; - -#ifndef CONFIG_WII - top = total_lowmem; - s = mmu_mapin_ram(top); - __mapin_ram_chunk(s, top); -#else - if (!wii_hole_size) { - s = mmu_mapin_ram(total_lowmem); - __mapin_ram_chunk(s, total_lowmem); - } else { - top = wii_hole_start; - s = mmu_mapin_ram(top); - __mapin_ram_chunk(s, top); - - top = memblock_end_of_DRAM(); - s = wii_mmu_mapin_mem2(top); - __mapin_ram_chunk(s, top); + struct memblock_region *reg; + + for_each_memblock(memory, reg) { + phys_addr_t base = reg->base; + phys_addr_t top = min(base + reg->size, total_lowmem); + + if (base >= top) + continue; + base = mmu_mapin_ram(base, top); + if (IS_ENABLED(CONFIG_BDI_SWITCH)) + __mapin_ram_chunk(reg->base, top); + else + __mapin_ram_chunk(base, top); } -#endif } /* Scan the real Linux page tables and return a PTE pointer for @@ -359,7 +353,10 @@ void mark_initmem_nx(void) unsigned long numpages = PFN_UP((unsigned long)_einittext) - PFN_DOWN((unsigned long)_sinittext); - change_page_attr(page, numpages, PAGE_KERNEL); + if (v_block_mapped((unsigned long)_stext) + 1) + mmu_mark_initmem_nx(); + else + change_page_attr(page, numpages, PAGE_KERNEL); } #ifdef CONFIG_STRICT_KERNEL_RWX @@ -368,6 +365,11 @@ void mark_rodata_ro(void) struct page *page; unsigned long numpages; + if (v_block_mapped((unsigned long)_sinittext)) { + mmu_mark_rodata_ro(); + return; + } + page = virt_to_page(_stext); numpages = PFN_UP((unsigned long)_etext) - PFN_DOWN((unsigned long)_stext); diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c index 36a664f06c65..6c8a60b1e31d 100644 --- a/arch/powerpc/mm/ppc_mmu_32.c +++ b/arch/powerpc/mm/ppc_mmu_32.c @@ -32,6 +32,7 @@ #include <asm/mmu.h> #include <asm/machdep.h> #include <asm/code-patching.h> +#include <asm/sections.h> #include "mmu_decl.h" @@ -73,45 +74,171 @@ unsigned long p_block_mapped(phys_addr_t pa) return 0; } -unsigned long __init mmu_mapin_ram(unsigned long top) +static int find_free_bat(void) { - unsigned long tot, bl, done; - unsigned long max_size = (256<<20); + int b; + + if (cpu_has_feature(CPU_FTR_601)) { + for (b = 0; b < 4; b++) { + struct ppc_bat *bat = BATS[b]; + + if (!(bat[0].batl & 0x40)) + return b; + } + } else { + int n = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4; + + for (b = 0; b < n; b++) { + struct ppc_bat *bat = BATS[b]; + + if (!(bat[1].batu & 3)) + return b; + } + } + return -1; +} + +static unsigned int block_size(unsigned long base, unsigned long top) +{ + unsigned int max_size = (cpu_has_feature(CPU_FTR_601) ? 8 : 256) << 20; + unsigned int base_shift = (fls(base) - 1) & 31; + unsigned int block_shift = (fls(top - base) - 1) & 31; + + return min3(max_size, 1U << base_shift, 1U << block_shift); +} + +/* + * Set up one of the IBAT (block address translation) register pairs. + * The parameters are not checked; in particular size must be a power + * of 2 between 128k and 256M. + * Only for 603+ ... + */ +static void setibat(int index, unsigned long virt, phys_addr_t phys, + unsigned int size, pgprot_t prot) +{ + unsigned int bl = (size >> 17) - 1; + int wimgxpp; + struct ppc_bat *bat = BATS[index]; + unsigned long flags = pgprot_val(prot); + + if (!cpu_has_feature(CPU_FTR_NEED_COHERENT)) + flags &= ~_PAGE_COHERENT; + + wimgxpp = (flags & _PAGE_COHERENT) | (_PAGE_EXEC ? BPP_RX : BPP_XX); + bat[0].batu = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */ + bat[0].batl = BAT_PHYS_ADDR(phys) | wimgxpp; + if (flags & _PAGE_USER) + bat[0].batu |= 1; /* Vp = 1 */ +} + +static void clearibat(int index) +{ + struct ppc_bat *bat = BATS[index]; + + bat[0].batu = 0; + bat[0].batl = 0; +} + +static unsigned long __init __mmu_mapin_ram(unsigned long base, unsigned long top) +{ + int idx; + + while ((idx = find_free_bat()) != -1 && base != top) { + unsigned int size = block_size(base, top); + + if (size < 128 << 10) + break; + setbat(idx, PAGE_OFFSET + base, base, size, PAGE_KERNEL_X); + base += size; + } + + return base; +} + +unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) +{ + int done; + unsigned long border = (unsigned long)__init_begin - PAGE_OFFSET; if (__map_without_bats) { - printk(KERN_DEBUG "RAM mapped without BATs\n"); - return 0; + pr_debug("RAM mapped without BATs\n"); + return base; + } + + if (!strict_kernel_rwx_enabled() || base >= border || top <= border) + return __mmu_mapin_ram(base, top); + + done = __mmu_mapin_ram(base, border); + if (done != border - base) + return done; + + return done + __mmu_mapin_ram(border, top); +} + +void mmu_mark_initmem_nx(void) +{ + int nb = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4; + int i; + unsigned long base = (unsigned long)_stext - PAGE_OFFSET; + unsigned long top = (unsigned long)_etext - PAGE_OFFSET; + unsigned long size; + + if (cpu_has_feature(CPU_FTR_601)) + return; + + for (i = 0; i < nb - 1 && base < top && top - base > (128 << 10);) { + size = block_size(base, top); + setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_TEXT); + base += size; } + if (base < top) { + size = block_size(base, top); + size = max(size, 128UL << 10); + if ((top - base) > size) { + if (strict_kernel_rwx_enabled()) + pr_warn("Kernel _etext not properly aligned\n"); + size <<= 1; + } + setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_TEXT); + base += size; + } + for (; i < nb; i++) + clearibat(i); - /* Set up BAT2 and if necessary BAT3 to cover RAM. */ + update_bats(); - /* Make sure we don't map a block larger than the - smallest alignment of the physical address. */ - tot = top; - for (bl = 128<<10; bl < max_size; bl <<= 1) { - if (bl * 2 > tot) + for (i = TASK_SIZE >> 28; i < 16; i++) { + /* Do not set NX on VM space for modules */ + if (IS_ENABLED(CONFIG_MODULES) && + (VMALLOC_START & 0xf0000000) == i << 28) break; + mtsrin(mfsrin(i << 28) | 0x10000000, i << 28); } +} + +void mmu_mark_rodata_ro(void) +{ + int nb = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4; + int i; + + if (cpu_has_feature(CPU_FTR_601)) + return; + + for (i = 0; i < nb; i++) { + struct ppc_bat *bat = BATS[i]; - setbat(2, PAGE_OFFSET, 0, bl, PAGE_KERNEL_X); - done = (unsigned long)bat_addrs[2].limit - PAGE_OFFSET + 1; - if ((done < tot) && !bat_addrs[3].limit) { - /* use BAT3 to cover a bit more */ - tot -= done; - for (bl = 128<<10; bl < max_size; bl <<= 1) - if (bl * 2 > tot) - break; - setbat(3, PAGE_OFFSET+done, done, bl, PAGE_KERNEL_X); - done = (unsigned long)bat_addrs[3].limit - PAGE_OFFSET + 1; + if (bat_addrs[i].start < (unsigned long)__init_begin) + bat[1].batl = (bat[1].batl & ~BPP_RW) | BPP_RX; } - return done; + update_bats(); } /* * Set up one of the I/D BAT (block address translation) register pairs. * The parameters are not checked; in particular size must be a power * of 2 between 128k and 256M. + * On 603+, only set IBAT when _PAGE_EXEC is set */ void __init setbat(int index, unsigned long virt, phys_addr_t phys, unsigned int size, pgprot_t prot) @@ -138,11 +265,12 @@ void __init setbat(int index, unsigned long virt, phys_addr_t phys, bat[1].batu |= 1; /* Vp = 1 */ if (flags & _PAGE_GUARDED) { /* G bit must be zero in IBATs */ - bat[0].batu = bat[0].batl = 0; - } else { - /* make IBAT same as DBAT */ - bat[0] = bat[1]; + flags &= ~_PAGE_EXEC; } + if (flags & _PAGE_EXEC) + bat[0] = bat[1]; + else + bat[0].batu = bat[0].batl = 0; } else { /* 601 cpu */ if (bl > BL_8M) @@ -230,7 +358,8 @@ void __init MMU_init_hw(void) if (lg_n_hpteg > 16) mb2 = 16 - LG_HPTEG_SIZE; - modify_instruction_site(&patch__hash_page_A0, 0xffff, (unsigned int)Hash >> 16); + modify_instruction_site(&patch__hash_page_A0, 0xffff, + ((unsigned int)Hash - PAGE_OFFSET) >> 16); modify_instruction_site(&patch__hash_page_A1, 0x7c0, mb << 6); modify_instruction_site(&patch__hash_page_A2, 0x7c0, mb2 << 6); modify_instruction_site(&patch__hash_page_B, 0xffff, hmask); @@ -239,7 +368,8 @@ void __init MMU_init_hw(void) /* * Patch up the instructions in hashtable.S:flush_hash_page */ - modify_instruction_site(&patch__flush_hash_A0, 0xffff, (unsigned int)Hash >> 16); + modify_instruction_site(&patch__flush_hash_A0, 0xffff, + ((unsigned int)Hash - PAGE_OFFSET) >> 16); modify_instruction_site(&patch__flush_hash_A1, 0x7c0, mb << 6); modify_instruction_site(&patch__flush_hash_A2, 0x7c0, mb2 << 6); modify_instruction_site(&patch__flush_hash_B, 0xffff, hmask); diff --git a/arch/powerpc/mm/dump_linuxpagetables-8xx.c b/arch/powerpc/mm/ptdump/8xx.c index ab9e3f24db2f..9e2d8e847d6e 100644 --- a/arch/powerpc/mm/dump_linuxpagetables-8xx.c +++ b/arch/powerpc/mm/ptdump/8xx.c @@ -7,7 +7,7 @@ #include <linux/kernel.h> #include <asm/pgtable.h> -#include "dump_linuxpagetables.h" +#include "ptdump.h" static const struct flag_info flag_array[] = { { diff --git a/arch/powerpc/mm/ptdump/Makefile b/arch/powerpc/mm/ptdump/Makefile new file mode 100644 index 000000000000..712762be3cb1 --- /dev/null +++ b/arch/powerpc/mm/ptdump/Makefile @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-y += ptdump.o + +obj-$(CONFIG_4xx) += shared.o +obj-$(CONFIG_PPC_8xx) += 8xx.o +obj-$(CONFIG_PPC_BOOK3E_MMU) += shared.o +obj-$(CONFIG_PPC_BOOK3S_32) += shared.o bats.o segment_regs.o +obj-$(CONFIG_PPC_BOOK3S_64) += book3s64.o hashpagetable.o diff --git a/arch/powerpc/mm/dump_bats.c b/arch/powerpc/mm/ptdump/bats.c index a0d23e96e841..a0d23e96e841 100644 --- a/arch/powerpc/mm/dump_bats.c +++ b/arch/powerpc/mm/ptdump/bats.c diff --git a/arch/powerpc/mm/dump_linuxpagetables-book3s64.c b/arch/powerpc/mm/ptdump/book3s64.c index ed6fcf78256e..0dfca72cb9bd 100644 --- a/arch/powerpc/mm/dump_linuxpagetables-book3s64.c +++ b/arch/powerpc/mm/ptdump/book3s64.c @@ -7,7 +7,7 @@ #include <linux/kernel.h> #include <asm/pgtable.h> -#include "dump_linuxpagetables.h" +#include "ptdump.h" static const struct flag_info flag_array[] = { { diff --git a/arch/powerpc/mm/dump_hashpagetable.c b/arch/powerpc/mm/ptdump/hashpagetable.c index 869294695048..b430e4e08af6 100644 --- a/arch/powerpc/mm/dump_hashpagetable.c +++ b/arch/powerpc/mm/ptdump/hashpagetable.c @@ -342,7 +342,7 @@ static unsigned long hpte_find(struct pg_state *st, unsigned long ea, int psize) /* Look in secondary table */ if (slot == -1) - slot = base_hpte_find(ea, psize, true, &v, &r); + slot = base_hpte_find(ea, psize, false, &v, &r); /* No entry found */ if (slot == -1) diff --git a/arch/powerpc/mm/dump_linuxpagetables.c b/arch/powerpc/mm/ptdump/ptdump.c index 6aa41669ac1a..37138428ab55 100644 --- a/arch/powerpc/mm/dump_linuxpagetables.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -28,7 +28,7 @@ #include <asm/page.h> #include <asm/pgalloc.h> -#include "dump_linuxpagetables.h" +#include "ptdump.h" #ifdef CONFIG_PPC32 #define KERN_VIRT_START 0 @@ -143,14 +143,19 @@ static void dump_addr(struct pg_state *st, unsigned long addr) unsigned long delta; #ifdef CONFIG_PPC64 - seq_printf(st->seq, "0x%016lx-0x%016lx ", st->start_address, addr-1); - seq_printf(st->seq, "0x%016lx ", st->start_pa); +#define REG "0x%016lx" #else - seq_printf(st->seq, "0x%08lx-0x%08lx ", st->start_address, addr - 1); - seq_printf(st->seq, "0x%08lx ", st->start_pa); +#define REG "0x%08lx" #endif - delta = (addr - st->start_address) >> 10; + seq_printf(st->seq, REG "-" REG " ", st->start_address, addr - 1); + if (st->start_pa == st->last_pa && st->start_address + PAGE_SIZE != addr) { + seq_printf(st->seq, "[" REG "]", st->start_pa); + delta = PAGE_SIZE >> 10; + } else { + seq_printf(st->seq, " " REG " ", st->start_pa); + delta = (addr - st->start_address) >> 10; + } /* Work out what appropriate unit to use */ while (!(delta & 1023) && unit[1]) { delta >>= 10; @@ -184,7 +189,8 @@ static void note_page(struct pg_state *st, unsigned long addr, */ } else if (flag != st->current_flags || level != st->level || addr >= st->marker[1].start_address || - pa != st->last_pa + PAGE_SIZE) { + (pa != st->last_pa + PAGE_SIZE && + (pa != st->start_pa || st->start_pa != st->last_pa))) { /* Check the PTE flags */ if (st->current_flags) { diff --git a/arch/powerpc/mm/dump_linuxpagetables.h b/arch/powerpc/mm/ptdump/ptdump.h index 5d513636de73..5d513636de73 100644 --- a/arch/powerpc/mm/dump_linuxpagetables.h +++ b/arch/powerpc/mm/ptdump/ptdump.h diff --git a/arch/powerpc/mm/dump_sr.c b/arch/powerpc/mm/ptdump/segment_regs.c index 501843664bb9..501843664bb9 100644 --- a/arch/powerpc/mm/dump_sr.c +++ b/arch/powerpc/mm/ptdump/segment_regs.c diff --git a/arch/powerpc/mm/dump_linuxpagetables-generic.c b/arch/powerpc/mm/ptdump/shared.c index 3fe98a0974c6..f7ed2f187cb0 100644 --- a/arch/powerpc/mm/dump_linuxpagetables-generic.c +++ b/arch/powerpc/mm/ptdump/shared.c @@ -7,7 +7,7 @@ #include <linux/kernel.h> #include <asm/pgtable.h> -#include "dump_linuxpagetables.h" +#include "ptdump.h" static const struct flag_info flag_array[] = { { diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index bc3914d54e26..5986df48359b 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -69,6 +69,11 @@ static void assert_slb_presence(bool present, unsigned long ea) if (!cpu_has_feature(CPU_FTR_ARCH_206)) return; + /* + * slbfee. requires bit 24 (PPC bit 39) be clear in RB. Hardware + * ignores all other bits from 0-27, so just clear them all. + */ + ea &= ~((1UL << 28) - 1); asm volatile(__PPC_SLBFEE_DOT(%0, %1) : "=r"(tmp) : "r"(ea) : "cr0"); WARN_ON(present == (tmp == 0)); diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 06898c13901d..aec91dbcdc0b 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -32,6 +32,7 @@ #include <linux/export.h> #include <linux/hugetlb.h> #include <linux/sched/mm.h> +#include <linux/security.h> #include <asm/mman.h> #include <asm/mmu.h> #include <asm/copro.h> @@ -377,6 +378,7 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm, int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); unsigned long addr, found, prev; struct vm_unmapped_area_info info; + unsigned long min_addr = max(PAGE_SIZE, mmap_min_addr); info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; @@ -393,7 +395,7 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm, if (high_limit > DEFAULT_MAP_WINDOW) addr += mm->context.slb_addr_limit - DEFAULT_MAP_WINDOW; - while (addr > PAGE_SIZE) { + while (addr > min_addr) { info.high_limit = addr; if (!slice_scan_available(addr - 1, available, 0, &addr)) continue; @@ -405,8 +407,8 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm, * Check if we need to reduce the range, or if we can * extend it to cover the previous available slice. */ - if (addr < PAGE_SIZE) - addr = PAGE_SIZE; + if (addr < min_addr) + addr = min_addr; else if (slice_scan_available(addr - 1, available, 0, &prev)) { addr = prev; goto prev_slice; @@ -528,7 +530,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, addr = _ALIGN_UP(addr, page_size); slice_dbg(" aligned addr=%lx\n", addr); /* Ignore hint if it's too large or overlaps a VMA */ - if (addr > high_limit - len || + if (addr > high_limit - len || addr < mmap_min_addr || !slice_area_is_free(mm, addr, len)) addr = 0; } diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c index ae5d568e267f..ac23dc1c6535 100644 --- a/arch/powerpc/mm/tlb_nohash.c +++ b/arch/powerpc/mm/tlb_nohash.c @@ -302,7 +302,7 @@ void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, * This function as well as __local_flush_tlb_page() must only be called * for user contexts. */ - if (unlikely(WARN_ON(!mm))) + if (WARN_ON(!mm)) return; preempt_disable(); diff --git a/arch/powerpc/net/bpf_jit32.h b/arch/powerpc/net/bpf_jit32.h index 6f4daacad296..dc50a8d4b3b9 100644 --- a/arch/powerpc/net/bpf_jit32.h +++ b/arch/powerpc/net/bpf_jit32.h @@ -106,9 +106,8 @@ DECLARE_LOAD_FUNC(sk_load_byte_msh); } while (0) #else #define PPC_BPF_LOAD_CPU(r) \ - do { BUILD_BUG_ON(FIELD_SIZEOF(struct thread_info, cpu) != 4); \ - PPC_LHZ_OFFS(r, (1 & ~(THREAD_SIZE - 1)), \ - offsetof(struct thread_info, cpu)); \ + do { BUILD_BUG_ON(FIELD_SIZEOF(struct task_struct, cpu) != 4); \ + PPC_LHZ_OFFS(r, 2, offsetof(struct task_struct, cpu)); \ } while(0) #endif #else diff --git a/arch/powerpc/perf/power9-events-list.h b/arch/powerpc/perf/power9-events-list.h index 7de344b7d9cc..063c9d9f2516 100644 --- a/arch/powerpc/perf/power9-events-list.h +++ b/arch/powerpc/perf/power9-events-list.h @@ -97,3 +97,27 @@ EVENT(PM_MRK_DTLB_MISS_64K, 0x3d156) EVENT(PM_DTLB_MISS_16M, 0x4c056) EVENT(PM_DTLB_MISS_1G, 0x4c05a) EVENT(PM_MRK_DTLB_MISS_16M, 0x4c15e) + +/* + * Memory Access Events + * + * Primary PMU event used here is PM_MRK_INST_CMPL (0x401e0) + * To enable capturing of memory profiling, these MMCRA bits + * needs to be programmed and corresponding raw event format + * encoding. + * + * MMCRA bits encoding needed are + * SM (Sampling Mode) + * EM (Eligibility for Random Sampling) + * TECE (Threshold Event Counter Event) + * TS (Threshold Start Event) + * TE (Threshold End Event) + * + * Corresponding Raw Encoding bits: + * sample [EM,SM] + * thresh_sel (TECE) + * thresh start (TS) + * thresh end (TE) + */ +EVENT(MEM_LOADS, 0x34340401e0) +EVENT(MEM_STORES, 0x343c0401e0) diff --git a/arch/powerpc/perf/power9-pmu.c b/arch/powerpc/perf/power9-pmu.c index 0ff9c43733e9..030544e35959 100644 --- a/arch/powerpc/perf/power9-pmu.c +++ b/arch/powerpc/perf/power9-pmu.c @@ -160,6 +160,8 @@ GENERIC_EVENT_ATTR(branch-instructions, PM_BR_CMPL); GENERIC_EVENT_ATTR(branch-misses, PM_BR_MPRED_CMPL); GENERIC_EVENT_ATTR(cache-references, PM_LD_REF_L1); GENERIC_EVENT_ATTR(cache-misses, PM_LD_MISS_L1_FIN); +GENERIC_EVENT_ATTR(mem-loads, MEM_LOADS); +GENERIC_EVENT_ATTR(mem-stores, MEM_STORES); CACHE_EVENT_ATTR(L1-dcache-load-misses, PM_LD_MISS_L1_FIN); CACHE_EVENT_ATTR(L1-dcache-loads, PM_LD_REF_L1); @@ -185,6 +187,8 @@ static struct attribute *power9_events_attr[] = { GENERIC_EVENT_PTR(PM_BR_MPRED_CMPL), GENERIC_EVENT_PTR(PM_LD_REF_L1), GENERIC_EVENT_PTR(PM_LD_MISS_L1_FIN), + GENERIC_EVENT_PTR(MEM_LOADS), + GENERIC_EVENT_PTR(MEM_STORES), CACHE_EVENT_PTR(PM_LD_MISS_L1_FIN), CACHE_EVENT_PTR(PM_LD_REF_L1), CACHE_EVENT_PTR(PM_L1_PREF), diff --git a/arch/powerpc/platforms/44x/Kconfig b/arch/powerpc/platforms/44x/Kconfig index 4a9a72d01c3c..35be81fd2dc2 100644 --- a/arch/powerpc/platforms/44x/Kconfig +++ b/arch/powerpc/platforms/44x/Kconfig @@ -180,6 +180,7 @@ config CURRITUCK depends on PPC_47x select SWIOTLB select 476FPE + select FORCE_PCI select PPC4xx_PCI_EXPRESS help This option enables support for the IBM Currituck (476fpe) evaluation board diff --git a/arch/powerpc/platforms/44x/ppc476.c b/arch/powerpc/platforms/44x/ppc476.c index e55933f9cd55..a5e61e5c16e2 100644 --- a/arch/powerpc/platforms/44x/ppc476.c +++ b/arch/powerpc/platforms/44x/ppc476.c @@ -34,6 +34,7 @@ #include <asm/ppc4xx.h> #include <asm/mpic.h> #include <asm/mmu.h> +#include <asm/swiotlb.h> #include <linux/pci.h> #include <linux/i2c.h> diff --git a/arch/powerpc/platforms/44x/warp.c b/arch/powerpc/platforms/44x/warp.c index f467247fd1c4..18422dbd061a 100644 --- a/arch/powerpc/platforms/44x/warp.c +++ b/arch/powerpc/platforms/44x/warp.c @@ -47,7 +47,7 @@ static int __init warp_probe(void) if (!of_machine_is_compatible("pika,warp")) return 0; - /* For __dma_nommu_alloc_coherent */ + /* For arch_dma_alloc */ ISA_DMA_THRESHOLD = ~0L; return 1; diff --git a/arch/powerpc/platforms/83xx/suspend-asm.S b/arch/powerpc/platforms/83xx/suspend-asm.S index 3d1ecd211776..8137f77abad5 100644 --- a/arch/powerpc/platforms/83xx/suspend-asm.S +++ b/arch/powerpc/platforms/83xx/suspend-asm.S @@ -26,13 +26,13 @@ #define SS_MSR 0x74 #define SS_SDR1 0x78 #define SS_LR 0x7c -#define SS_SPRG 0x80 /* 4 SPRGs */ -#define SS_DBAT 0x90 /* 8 DBATs */ -#define SS_IBAT 0xd0 /* 8 IBATs */ -#define SS_TB 0x110 -#define SS_CR 0x118 -#define SS_GPREG 0x11c /* r12-r31 */ -#define STATE_SAVE_SIZE 0x16c +#define SS_SPRG 0x80 /* 8 SPRGs */ +#define SS_DBAT 0xa0 /* 8 DBATs */ +#define SS_IBAT 0xe0 /* 8 IBATs */ +#define SS_TB 0x120 +#define SS_CR 0x128 +#define SS_GPREG 0x12c /* r12-r31 */ +#define STATE_SAVE_SIZE 0x17c .section .data .align 5 @@ -103,6 +103,16 @@ _GLOBAL(mpc83xx_enter_deep_sleep) stw r7, SS_SPRG+12(r3) stw r8, SS_SDR1(r3) + mfspr r4, SPRN_SPRG4 + mfspr r5, SPRN_SPRG5 + mfspr r6, SPRN_SPRG6 + mfspr r7, SPRN_SPRG7 + + stw r4, SS_SPRG+16(r3) + stw r5, SS_SPRG+20(r3) + stw r6, SS_SPRG+24(r3) + stw r7, SS_SPRG+28(r3) + mfspr r4, SPRN_DBAT0U mfspr r5, SPRN_DBAT0L mfspr r6, SPRN_DBAT1U @@ -493,6 +503,16 @@ mpc83xx_deep_resume: mtspr SPRN_IBAT7U, r6 mtspr SPRN_IBAT7L, r7 + lwz r4, SS_SPRG+16(r3) + lwz r5, SS_SPRG+20(r3) + lwz r6, SS_SPRG+24(r3) + lwz r7, SS_SPRG+28(r3) + + mtspr SPRN_SPRG4, r4 + mtspr SPRN_SPRG5, r5 + mtspr SPRN_SPRG6, r6 + mtspr SPRN_SPRG7, r7 + lwz r4, SS_SPRG+0(r3) lwz r5, SS_SPRG+4(r3) lwz r6, SS_SPRG+8(r3) diff --git a/arch/powerpc/platforms/85xx/corenet_generic.c b/arch/powerpc/platforms/85xx/corenet_generic.c index b0dac307bebf..785e9641220d 100644 --- a/arch/powerpc/platforms/85xx/corenet_generic.c +++ b/arch/powerpc/platforms/85xx/corenet_generic.c @@ -27,6 +27,7 @@ #include <asm/udbg.h> #include <asm/mpic.h> #include <asm/ehv_pic.h> +#include <asm/swiotlb.h> #include <soc/fsl/qe/qe_ic.h> #include <linux/of_platform.h> @@ -223,7 +224,3 @@ define_machine(corenet_generic) { }; machine_arch_initcall(corenet_generic, corenet_gen_publish_devices); - -#ifdef CONFIG_SWIOTLB -machine_arch_initcall(corenet_generic, swiotlb_setup_bus_notifier); -#endif diff --git a/arch/powerpc/platforms/85xx/ge_imp3a.c b/arch/powerpc/platforms/85xx/ge_imp3a.c index f29c6f0909f3..c64fa2483ea9 100644 --- a/arch/powerpc/platforms/85xx/ge_imp3a.c +++ b/arch/powerpc/platforms/85xx/ge_imp3a.c @@ -202,8 +202,6 @@ static int __init ge_imp3a_probe(void) machine_arch_initcall(ge_imp3a, mpc85xx_common_publish_devices); -machine_arch_initcall(ge_imp3a, swiotlb_setup_bus_notifier); - define_machine(ge_imp3a) { .name = "GE_IMP3A", .probe = ge_imp3a_probe, diff --git a/arch/powerpc/platforms/85xx/mpc8536_ds.c b/arch/powerpc/platforms/85xx/mpc8536_ds.c index 94a7f92c858f..94194bad4954 100644 --- a/arch/powerpc/platforms/85xx/mpc8536_ds.c +++ b/arch/powerpc/platforms/85xx/mpc8536_ds.c @@ -57,8 +57,6 @@ static void __init mpc8536_ds_setup_arch(void) machine_arch_initcall(mpc8536_ds, mpc85xx_common_publish_devices); -machine_arch_initcall(mpc8536_ds, swiotlb_setup_bus_notifier); - /* * Called very early, device-tree isn't unflattened */ diff --git a/arch/powerpc/platforms/85xx/mpc85xx_ds.c b/arch/powerpc/platforms/85xx/mpc85xx_ds.c index dc9e035cc637..b7e29ce1f266 100644 --- a/arch/powerpc/platforms/85xx/mpc85xx_ds.c +++ b/arch/powerpc/platforms/85xx/mpc85xx_ds.c @@ -174,10 +174,6 @@ machine_arch_initcall(mpc8544_ds, mpc85xx_common_publish_devices); machine_arch_initcall(mpc8572_ds, mpc85xx_common_publish_devices); machine_arch_initcall(p2020_ds, mpc85xx_common_publish_devices); -machine_arch_initcall(mpc8544_ds, swiotlb_setup_bus_notifier); -machine_arch_initcall(mpc8572_ds, swiotlb_setup_bus_notifier); -machine_arch_initcall(p2020_ds, swiotlb_setup_bus_notifier); - /* * Called very early, device-tree isn't unflattened */ diff --git a/arch/powerpc/platforms/85xx/mpc85xx_mds.c b/arch/powerpc/platforms/85xx/mpc85xx_mds.c index d7e440e6dba3..80939a425de5 100644 --- a/arch/powerpc/platforms/85xx/mpc85xx_mds.c +++ b/arch/powerpc/platforms/85xx/mpc85xx_mds.c @@ -367,10 +367,6 @@ machine_arch_initcall(mpc8568_mds, mpc85xx_publish_devices); machine_arch_initcall(mpc8569_mds, mpc85xx_publish_devices); machine_arch_initcall(p1021_mds, mpc85xx_common_publish_devices); -machine_arch_initcall(mpc8568_mds, swiotlb_setup_bus_notifier); -machine_arch_initcall(mpc8569_mds, swiotlb_setup_bus_notifier); -machine_arch_initcall(p1021_mds, swiotlb_setup_bus_notifier); - static void __init mpc85xx_mds_pic_init(void) { struct mpic *mpic = mpic_alloc(NULL, 0, MPIC_BIG_ENDIAN | diff --git a/arch/powerpc/platforms/85xx/p1010rdb.c b/arch/powerpc/platforms/85xx/p1010rdb.c index 78d13b364cd6..33ca373322e1 100644 --- a/arch/powerpc/platforms/85xx/p1010rdb.c +++ b/arch/powerpc/platforms/85xx/p1010rdb.c @@ -55,7 +55,6 @@ static void __init p1010_rdb_setup_arch(void) } machine_arch_initcall(p1010_rdb, mpc85xx_common_publish_devices); -machine_arch_initcall(p1010_rdb, swiotlb_setup_bus_notifier); /* * Called very early, device-tree isn't unflattened diff --git a/arch/powerpc/platforms/85xx/p1022_ds.c b/arch/powerpc/platforms/85xx/p1022_ds.c index 9fb57f78cdbe..1f1af0557470 100644 --- a/arch/powerpc/platforms/85xx/p1022_ds.c +++ b/arch/powerpc/platforms/85xx/p1022_ds.c @@ -548,8 +548,6 @@ static void __init p1022_ds_setup_arch(void) machine_arch_initcall(p1022_ds, mpc85xx_common_publish_devices); -machine_arch_initcall(p1022_ds, swiotlb_setup_bus_notifier); - /* * Called very early, device-tree isn't unflattened */ diff --git a/arch/powerpc/platforms/85xx/p1022_rdk.c b/arch/powerpc/platforms/85xx/p1022_rdk.c index 276e00ab3dde..fd9e3e7ef234 100644 --- a/arch/powerpc/platforms/85xx/p1022_rdk.c +++ b/arch/powerpc/platforms/85xx/p1022_rdk.c @@ -128,8 +128,6 @@ static void __init p1022_rdk_setup_arch(void) machine_arch_initcall(p1022_rdk, mpc85xx_common_publish_devices); -machine_arch_initcall(p1022_rdk, swiotlb_setup_bus_notifier); - /* * Called very early, device-tree isn't unflattened */ diff --git a/arch/powerpc/platforms/85xx/qemu_e500.c b/arch/powerpc/platforms/85xx/qemu_e500.c index 27631c607f3d..c52c8f9e8385 100644 --- a/arch/powerpc/platforms/85xx/qemu_e500.c +++ b/arch/powerpc/platforms/85xx/qemu_e500.c @@ -22,6 +22,7 @@ #include <asm/time.h> #include <asm/udbg.h> #include <asm/mpic.h> +#include <asm/swiotlb.h> #include <sysdev/fsl_soc.h> #include <sysdev/fsl_pci.h> #include "smp.h" diff --git a/arch/powerpc/platforms/86xx/mpc86xx_hpcn.c b/arch/powerpc/platforms/86xx/mpc86xx_hpcn.c index 17c6cd3d02e6..775a92353c83 100644 --- a/arch/powerpc/platforms/86xx/mpc86xx_hpcn.c +++ b/arch/powerpc/platforms/86xx/mpc86xx_hpcn.c @@ -121,7 +121,6 @@ static int __init declare_of_platform_devices(void) return 0; } machine_arch_initcall(mpc86xx_hpcn, declare_of_platform_devices); -machine_arch_initcall(mpc86xx_hpcn, swiotlb_setup_bus_notifier); define_machine(mpc86xx_hpcn) { .name = "MPC86xx HPCN", diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 8c7464c3f27f..842b2c7e156a 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -153,6 +153,11 @@ config E300C3_CPU bool "e300c3 (831x)" depends on PPC_BOOK3S_32 +config G4_CPU + bool "G4 (74xx)" + depends on PPC_BOOK3S_32 + select ALTIVEC + endchoice config TARGET_CPU_BOOL @@ -171,6 +176,7 @@ config TARGET_CPU default "860" if 860_CPU default "e300c2" if E300C2_CPU default "e300c3" if E300C3_CPU + default "G4" if G4_CPU config PPC_BOOK3S def_bool y @@ -402,6 +408,9 @@ config NOT_COHERENT_CACHE bool depends on 4xx || PPC_8xx || E200 || PPC_MPC512x || \ GAMECUBE_COMMON || AMIGAONE + select ARCH_HAS_DMA_COHERENT_TO_PFN + select ARCH_HAS_SYNC_DMA_FOR_DEVICE + select ARCH_HAS_SYNC_DMA_FOR_CPU default n if PPC_47x default y diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c index af2a3c15e0ec..54e012e1f720 100644 --- a/arch/powerpc/platforms/cell/iommu.c +++ b/arch/powerpc/platforms/cell/iommu.c @@ -544,9 +544,10 @@ static struct cbe_iommu *cell_iommu_for_node(int nid) static unsigned long cell_dma_nommu_offset; static unsigned long dma_iommu_fixed_base; +static bool cell_iommu_enabled; /* iommu_fixed_is_weak is set if booted with iommu_fixed=weak */ -static int iommu_fixed_is_weak; +bool iommu_fixed_is_weak; static struct iommu_table *cell_get_iommu_table(struct device *dev) { @@ -568,102 +569,19 @@ static struct iommu_table *cell_get_iommu_table(struct device *dev) return &window->table; } -/* A coherent allocation implies strong ordering */ - -static void *dma_fixed_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag, - unsigned long attrs) -{ - if (iommu_fixed_is_weak) - return iommu_alloc_coherent(dev, cell_get_iommu_table(dev), - size, dma_handle, - device_to_mask(dev), flag, - dev_to_node(dev)); - else - return dma_nommu_ops.alloc(dev, size, dma_handle, flag, - attrs); -} - -static void dma_fixed_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle, - unsigned long attrs) -{ - if (iommu_fixed_is_weak) - iommu_free_coherent(cell_get_iommu_table(dev), size, vaddr, - dma_handle); - else - dma_nommu_ops.free(dev, size, vaddr, dma_handle, attrs); -} - -static dma_addr_t dma_fixed_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction direction, - unsigned long attrs) -{ - if (iommu_fixed_is_weak == (attrs & DMA_ATTR_WEAK_ORDERING)) - return dma_nommu_ops.map_page(dev, page, offset, size, - direction, attrs); - else - return iommu_map_page(dev, cell_get_iommu_table(dev), page, - offset, size, device_to_mask(dev), - direction, attrs); -} - -static void dma_fixed_unmap_page(struct device *dev, dma_addr_t dma_addr, - size_t size, enum dma_data_direction direction, - unsigned long attrs) -{ - if (iommu_fixed_is_weak == (attrs & DMA_ATTR_WEAK_ORDERING)) - dma_nommu_ops.unmap_page(dev, dma_addr, size, direction, - attrs); - else - iommu_unmap_page(cell_get_iommu_table(dev), dma_addr, size, - direction, attrs); -} - -static int dma_fixed_map_sg(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction direction, - unsigned long attrs) -{ - if (iommu_fixed_is_weak == (attrs & DMA_ATTR_WEAK_ORDERING)) - return dma_nommu_ops.map_sg(dev, sg, nents, direction, attrs); - else - return ppc_iommu_map_sg(dev, cell_get_iommu_table(dev), sg, - nents, device_to_mask(dev), - direction, attrs); -} - -static void dma_fixed_unmap_sg(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction direction, - unsigned long attrs) -{ - if (iommu_fixed_is_weak == (attrs & DMA_ATTR_WEAK_ORDERING)) - dma_nommu_ops.unmap_sg(dev, sg, nents, direction, attrs); - else - ppc_iommu_unmap_sg(cell_get_iommu_table(dev), sg, nents, - direction, attrs); -} - -static int dma_suported_and_switch(struct device *dev, u64 dma_mask); - -static const struct dma_map_ops dma_iommu_fixed_ops = { - .alloc = dma_fixed_alloc_coherent, - .free = dma_fixed_free_coherent, - .map_sg = dma_fixed_map_sg, - .unmap_sg = dma_fixed_unmap_sg, - .dma_supported = dma_suported_and_switch, - .map_page = dma_fixed_map_page, - .unmap_page = dma_fixed_unmap_page, -}; +static u64 cell_iommu_get_fixed_address(struct device *dev); static void cell_dma_dev_setup(struct device *dev) { - if (get_pci_dma_ops() == &dma_iommu_ops) + if (cell_iommu_enabled) { + u64 addr = cell_iommu_get_fixed_address(dev); + + if (addr != OF_BAD_ADDR) + dev->archdata.dma_offset = addr + dma_iommu_fixed_base; set_iommu_table_base(dev, cell_get_iommu_table(dev)); - else if (get_pci_dma_ops() == &dma_nommu_ops) - set_dma_offset(dev, cell_dma_nommu_offset); - else - BUG(); + } else { + dev->archdata.dma_offset = cell_dma_nommu_offset; + } } static void cell_pci_dma_dev_setup(struct pci_dev *dev) @@ -680,11 +598,9 @@ static int cell_of_bus_notify(struct notifier_block *nb, unsigned long action, if (action != BUS_NOTIFY_ADD_DEVICE) return 0; - /* We use the PCI DMA ops */ - dev->dma_ops = get_pci_dma_ops(); - + if (cell_iommu_enabled) + dev->dma_ops = &dma_iommu_ops; cell_dma_dev_setup(dev); - return 0; } @@ -809,7 +725,6 @@ static int __init cell_iommu_init_disabled(void) unsigned long base = 0, size; /* When no iommu is present, we use direct DMA ops */ - set_pci_dma_ops(&dma_nommu_ops); /* First make sure all IOC translation is turned off */ cell_disable_iommus(); @@ -894,7 +809,11 @@ static u64 cell_iommu_get_fixed_address(struct device *dev) const u32 *ranges = NULL; int i, len, best, naddr, nsize, pna, range_size; + /* We can be called for platform devices that have no of_node */ np = of_node_get(dev->of_node); + if (!np) + goto out; + while (1) { naddr = of_n_addr_cells(np); nsize = of_n_size_cells(np); @@ -945,27 +864,10 @@ out: return dev_addr; } -static int dma_suported_and_switch(struct device *dev, u64 dma_mask) +static bool cell_pci_iommu_bypass_supported(struct pci_dev *pdev, u64 mask) { - if (dma_mask == DMA_BIT_MASK(64) && - cell_iommu_get_fixed_address(dev) != OF_BAD_ADDR) { - u64 addr = cell_iommu_get_fixed_address(dev) + - dma_iommu_fixed_base; - dev_dbg(dev, "iommu: 64-bit OK, using fixed ops\n"); - dev_dbg(dev, "iommu: fixed addr = %llx\n", addr); - set_dma_ops(dev, &dma_iommu_fixed_ops); - set_dma_offset(dev, addr); - return 1; - } - - if (dma_iommu_dma_supported(dev, dma_mask)) { - dev_dbg(dev, "iommu: not 64-bit, using default ops\n"); - set_dma_ops(dev, get_pci_dma_ops()); - cell_dma_dev_setup(dev); - return 1; - } - - return 0; + return mask == DMA_BIT_MASK(64) && + cell_iommu_get_fixed_address(&pdev->dev) != OF_BAD_ADDR; } static void insert_16M_pte(unsigned long addr, unsigned long *ptab, @@ -1119,9 +1021,8 @@ static int __init cell_iommu_fixed_mapping_init(void) cell_iommu_setup_window(iommu, np, dbase, dsize, 0); } - dma_iommu_ops.dma_supported = dma_suported_and_switch; - set_pci_dma_ops(&dma_iommu_ops); - + cell_pci_controller_ops.iommu_bypass_supported = + cell_pci_iommu_bypass_supported; return 0; } @@ -1142,7 +1043,7 @@ static int __init setup_iommu_fixed(char *str) pciep = of_find_node_by_type(NULL, "pcie-endpoint"); if (strcmp(str, "weak") == 0 || (pciep && strcmp(str, "strong") != 0)) - iommu_fixed_is_weak = DMA_ATTR_WEAK_ORDERING; + iommu_fixed_is_weak = true; of_node_put(pciep); @@ -1150,26 +1051,6 @@ static int __init setup_iommu_fixed(char *str) } __setup("iommu_fixed=", setup_iommu_fixed); -static u64 cell_dma_get_required_mask(struct device *dev) -{ - const struct dma_map_ops *dma_ops; - - if (!dev->dma_mask) - return 0; - - if (!iommu_fixed_disabled && - cell_iommu_get_fixed_address(dev) != OF_BAD_ADDR) - return DMA_BIT_MASK(64); - - dma_ops = get_dma_ops(dev); - if (dma_ops->get_required_mask) - return dma_ops->get_required_mask(dev); - - WARN_ONCE(1, "no get_required_mask in %p ops", dma_ops); - - return DMA_BIT_MASK(64); -} - static int __init cell_iommu_init(void) { struct device_node *np; @@ -1186,10 +1067,9 @@ static int __init cell_iommu_init(void) /* Setup various callbacks */ cell_pci_controller_ops.dma_dev_setup = cell_pci_dma_dev_setup; - ppc_md.dma_get_required_mask = cell_dma_get_required_mask; if (!iommu_fixed_disabled && cell_iommu_fixed_mapping_init() == 0) - goto bail; + goto done; /* Create an iommu for each /axon node. */ for_each_node_by_name(np, "axon") { @@ -1206,10 +1086,10 @@ static int __init cell_iommu_init(void) continue; cell_iommu_init_one(np, SPIDER_DMA_OFFSET); } - + done: /* Setup default PCI iommu ops */ set_pci_dma_ops(&dma_iommu_ops); - + cell_iommu_enabled = true; bail: /* Register callbacks on OF platform device addition/removal * to handle linking them to the right DMA operations diff --git a/arch/powerpc/platforms/cell/spu_callbacks.c b/arch/powerpc/platforms/cell/spu_callbacks.c index 125f2a5f02de..b5f35cbe9e21 100644 --- a/arch/powerpc/platforms/cell/spu_callbacks.c +++ b/arch/powerpc/platforms/cell/spu_callbacks.c @@ -34,7 +34,7 @@ */ static void *spu_syscall_table[] = { -#define __SYSCALL(nr, entry, nargs) entry, +#define __SYSCALL(nr, entry) entry, #include <asm/syscall_table_spu.h> #undef __SYSCALL }; diff --git a/arch/powerpc/platforms/cell/spu_syscalls.c b/arch/powerpc/platforms/cell/spu_syscalls.c index 263413a34823..b95d6afc39b5 100644 --- a/arch/powerpc/platforms/cell/spu_syscalls.c +++ b/arch/powerpc/platforms/cell/spu_syscalls.c @@ -26,7 +26,6 @@ #include <linux/syscalls.h> #include <linux/rcupdate.h> #include <linux/binfmts.h> -#include <linux/syscalls.h> #include <asm/spu.h> diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c index ae8123edddc6..48c2477e7e2a 100644 --- a/arch/powerpc/platforms/cell/spufs/file.c +++ b/arch/powerpc/platforms/cell/spufs/file.c @@ -2338,9 +2338,8 @@ static int spufs_switch_log_open(struct inode *inode, struct file *file) goto out; } - ctx->switch_log = kmalloc(sizeof(struct switch_log) + - SWITCH_LOG_BUFSIZE * sizeof(struct switch_log_entry), - GFP_KERNEL); + ctx->switch_log = kmalloc(struct_size(ctx->switch_log, log, + SWITCH_LOG_BUFSIZE), GFP_KERNEL); if (!ctx->switch_log) { rc = -ENOMEM; diff --git a/arch/powerpc/platforms/embedded6xx/wii.c b/arch/powerpc/platforms/embedded6xx/wii.c index ecf703ee3a76..235fe81aa2b1 100644 --- a/arch/powerpc/platforms/embedded6xx/wii.c +++ b/arch/powerpc/platforms/embedded6xx/wii.c @@ -54,10 +54,6 @@ static void __iomem *hw_ctrl; static void __iomem *hw_gpio; -unsigned long wii_hole_start; -unsigned long wii_hole_size; - - static int __init page_aligned(unsigned long x) { return !(x & (PAGE_SIZE-1)); @@ -69,26 +65,6 @@ void __init wii_memory_fixups(void) BUG_ON(memblock.memory.cnt != 2); BUG_ON(!page_aligned(p[0].base) || !page_aligned(p[1].base)); - - /* determine hole */ - wii_hole_start = ALIGN(p[0].base + p[0].size, PAGE_SIZE); - wii_hole_size = p[1].base - wii_hole_start; -} - -unsigned long __init wii_mmu_mapin_mem2(unsigned long top) -{ - unsigned long delta, size, bl; - unsigned long max_size = (256<<20); - - /* MEM2 64MB@0x10000000 */ - delta = wii_hole_start + wii_hole_size; - size = top - delta; - for (bl = 128<<10; bl < max_size; bl <<= 1) { - if (bl * 2 > size) - break; - } - setbat(4, PAGE_OFFSET+delta, delta, bl, PAGE_KERNEL_X); - return delta + bl; } static void __noreturn wii_spin(void) diff --git a/arch/powerpc/platforms/pasemi/iommu.c b/arch/powerpc/platforms/pasemi/iommu.c index f62930f839ca..86368e238f6e 100644 --- a/arch/powerpc/platforms/pasemi/iommu.c +++ b/arch/powerpc/platforms/pasemi/iommu.c @@ -186,7 +186,7 @@ static void pci_dma_dev_setup_pasemi(struct pci_dev *dev) */ if (dev->vendor == 0x1959 && dev->device == 0xa007 && !firmware_has_feature(FW_FEATURE_LPAR)) { - dev->dev.dma_ops = &dma_nommu_ops; + dev->dev.dma_ops = NULL; /* * Set the coherent DMA mask to prevent the iommu * being used unnecessarily diff --git a/arch/powerpc/platforms/pasemi/setup.c b/arch/powerpc/platforms/pasemi/setup.c index c0532999f854..46dd463faaa7 100644 --- a/arch/powerpc/platforms/pasemi/setup.c +++ b/arch/powerpc/platforms/pasemi/setup.c @@ -411,55 +411,6 @@ out: return !!(srr1 & 0x2); } -#ifdef CONFIG_PCMCIA -static int pcmcia_notify(struct notifier_block *nb, unsigned long action, - void *data) -{ - struct device *dev = data; - struct device *parent; - struct pcmcia_device *pdev = to_pcmcia_dev(dev); - - /* We are only intereted in device addition */ - if (action != BUS_NOTIFY_ADD_DEVICE) - return 0; - - parent = pdev->socket->dev.parent; - - /* We know electra_cf devices will always have of_node set, since - * electra_cf is an of_platform driver. - */ - if (!parent->of_node) - return 0; - - if (!of_device_is_compatible(parent->of_node, "electra-cf")) - return 0; - - /* We use the direct ops for localbus */ - dev->dma_ops = &dma_nommu_ops; - - return 0; -} - -static struct notifier_block pcmcia_notifier = { - .notifier_call = pcmcia_notify, -}; - -static inline void pasemi_pcmcia_init(void) -{ - extern struct bus_type pcmcia_bus_type; - - bus_register_notifier(&pcmcia_bus_type, &pcmcia_notifier); -} - -#else - -static inline void pasemi_pcmcia_init(void) -{ -} - -#endif - - static const struct of_device_id pasemi_bus_ids[] = { /* Unfortunately needed for legacy firmwares */ { .type = "localbus", }, @@ -472,8 +423,6 @@ static const struct of_device_id pasemi_bus_ids[] = { static int __init pasemi_publish_devices(void) { - pasemi_pcmcia_init(); - /* Publish OF platform devices for SDC and other non-PCI devices */ of_platform_bus_probe(NULL, pasemi_bus_ids, NULL); diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index b540ce8eec55..da2e99efbd04 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y += setup.o opal-wrappers.o opal.o opal-async.o idle.o -obj-y += opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o +obj-y += setup.o opal-call.o opal-wrappers.o opal.o opal-async.o +obj-y += idle.o opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o obj-y += rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o obj-y += opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o obj-y += opal-kmsg.o opal-powercap.o opal-psr.o opal-sensor-groups.o @@ -11,7 +11,6 @@ obj-$(CONFIG_CXL_BASE) += pci-cxl.o obj-$(CONFIG_EEH) += eeh-powernv.o obj-$(CONFIG_PPC_SCOM) += opal-xscom.o obj-$(CONFIG_MEMORY_FAILURE) += opal-memory-errors.o -obj-$(CONFIG_TRACEPOINTS) += opal-tracepoints.o obj-$(CONFIG_OPAL_PRD) += opal-prd.o obj-$(CONFIG_PERF_EVENTS) += opal-imc.o obj-$(CONFIG_PPC_MEMTRACE) += memtrace.o diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 35f699ebb662..e52f9b06dd9c 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -458,7 +458,8 @@ EXPORT_SYMBOL_GPL(pnv_power9_force_smt4_release); #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ #ifdef CONFIG_HOTPLUG_CPU -static void pnv_program_cpu_hotplug_lpcr(unsigned int cpu, u64 lpcr_val) + +void pnv_program_cpu_hotplug_lpcr(unsigned int cpu, u64 lpcr_val) { u64 pir = get_hard_smp_processor_id(cpu); @@ -481,20 +482,6 @@ unsigned long pnv_cpu_offline(unsigned int cpu) { unsigned long srr1; u32 idle_states = pnv_get_supported_cpuidle_states(); - u64 lpcr_val; - - /* - * We don't want to take decrementer interrupts while we are - * offline, so clear LPCR:PECE1. We keep PECE2 (and - * LPCR_PECE_HVEE on P9) enabled as to let IPIs in. - * - * If the CPU gets woken up by a special wakeup, ensure that - * the SLW engine sets LPCR with decrementer bit cleared, else - * the CPU will come back to the kernel due to a spurious - * wakeup. - */ - lpcr_val = mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1; - pnv_program_cpu_hotplug_lpcr(cpu, lpcr_val); __ppc64_runlatch_off(); @@ -526,16 +513,6 @@ unsigned long pnv_cpu_offline(unsigned int cpu) __ppc64_runlatch_on(); - /* - * Re-enable decrementer interrupts in LPCR. - * - * Further, we want stop states to be woken up by decrementer - * for non-hotplug cases. So program the LPCR via stop api as - * well. - */ - lpcr_val = mfspr(SPRN_LPCR) | (u64)LPCR_PECE1; - pnv_program_cpu_hotplug_lpcr(cpu, lpcr_val); - return srr1; } #endif diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index 3f58c7dbd581..dc23d9d2a7d9 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -28,10 +28,6 @@ */ static DEFINE_SPINLOCK(npu_context_lock); -/* - * Other types of TCE cache invalidation are not functional in the - * hardware. - */ static struct pci_dev *get_pci_dev(struct device_node *dn) { struct pci_dn *pdn = PCI_DN(dn); @@ -220,7 +216,7 @@ static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe) * their parent device so drivers shouldn't be doing DMA * operations directly on these devices. */ - set_dma_ops(&npe->pdev->dev, NULL); + set_dma_ops(&npe->pdev->dev, &dma_dummy_ops); } /* @@ -917,15 +913,6 @@ static void pnv_npu2_mn_release(struct mmu_notifier *mn, mmio_invalidate(npu_context, 0, ~0UL); } -static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long address, - pte_t pte) -{ - struct npu_context *npu_context = mn_to_npu_context(mn); - mmio_invalidate(npu_context, address, PAGE_SIZE); -} - static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, unsigned long end) @@ -936,7 +923,6 @@ static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn, static const struct mmu_notifier_ops nv_nmmu_notifier_ops = { .release = pnv_npu2_mn_release, - .change_pte = pnv_npu2_mn_change_pte, .invalidate_range = pnv_npu2_mn_invalidate_range, }; diff --git a/arch/powerpc/platforms/powernv/opal-call.c b/arch/powerpc/platforms/powernv/opal-call.c new file mode 100644 index 000000000000..578757d403ab --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-call.c @@ -0,0 +1,283 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/percpu.h> +#include <linux/jump_label.h> +#include <asm/opal-api.h> +#include <asm/trace.h> +#include <asm/asm-prototypes.h> + +#ifdef CONFIG_TRACEPOINTS +/* + * Since the tracing code might execute OPAL calls we need to guard against + * recursion. + */ +static DEFINE_PER_CPU(unsigned int, opal_trace_depth); + +static void __trace_opal_entry(s64 a0, s64 a1, s64 a2, s64 a3, + s64 a4, s64 a5, s64 a6, s64 a7, + unsigned long opcode) +{ + unsigned int *depth; + unsigned long args[8]; + + depth = this_cpu_ptr(&opal_trace_depth); + + if (*depth) + return; + + args[0] = a0; + args[1] = a1; + args[2] = a2; + args[3] = a3; + args[4] = a4; + args[5] = a5; + args[6] = a6; + args[7] = a7; + + (*depth)++; + trace_opal_entry(opcode, &args[0]); + (*depth)--; +} + +static void __trace_opal_exit(unsigned long opcode, unsigned long retval) +{ + unsigned int *depth; + + depth = this_cpu_ptr(&opal_trace_depth); + + if (*depth) + return; + + (*depth)++; + trace_opal_exit(opcode, retval); + (*depth)--; +} + +static DEFINE_STATIC_KEY_FALSE(opal_tracepoint_key); + +int opal_tracepoint_regfunc(void) +{ + static_branch_inc(&opal_tracepoint_key); + return 0; +} + +void opal_tracepoint_unregfunc(void) +{ + static_branch_dec(&opal_tracepoint_key); +} + +static s64 __opal_call_trace(s64 a0, s64 a1, s64 a2, s64 a3, + s64 a4, s64 a5, s64 a6, s64 a7, + unsigned long opcode, unsigned long msr) +{ + s64 ret; + + __trace_opal_entry(a0, a1, a2, a3, a4, a5, a6, a7, opcode); + ret = __opal_call(a0, a1, a2, a3, a4, a5, a6, a7, opcode, msr); + __trace_opal_exit(opcode, ret); + + return ret; +} + +#define DO_TRACE (static_branch_unlikely(&opal_tracepoint_key)) + +#else /* CONFIG_TRACEPOINTS */ + +static s64 __opal_call_trace(s64 a0, s64 a1, s64 a2, s64 a3, + s64 a4, s64 a5, s64 a6, s64 a7, + unsigned long opcode, unsigned long msr) +{ +} + +#define DO_TRACE false +#endif /* CONFIG_TRACEPOINTS */ + +static int64_t opal_call(int64_t a0, int64_t a1, int64_t a2, int64_t a3, + int64_t a4, int64_t a5, int64_t a6, int64_t a7, int64_t opcode) +{ + unsigned long flags; + unsigned long msr = mfmsr(); + bool mmu = (msr & (MSR_IR|MSR_DR)); + int64_t ret; + + msr &= ~MSR_EE; + + if (unlikely(!mmu)) + return __opal_call(a0, a1, a2, a3, a4, a5, a6, a7, opcode, msr); + + local_save_flags(flags); + hard_irq_disable(); + + if (DO_TRACE) { + ret = __opal_call_trace(a0, a1, a2, a3, a4, a5, a6, a7, opcode, msr); + } else { + ret = __opal_call(a0, a1, a2, a3, a4, a5, a6, a7, opcode, msr); + } + + local_irq_restore(flags); + + return ret; +} + +#define OPAL_CALL(name, opcode) \ +int64_t name(int64_t a0, int64_t a1, int64_t a2, int64_t a3, \ + int64_t a4, int64_t a5, int64_t a6, int64_t a7) \ +{ \ + return opal_call(a0, a1, a2, a3, a4, a5, a6, a7, opcode); \ +} + +OPAL_CALL(opal_invalid_call, OPAL_INVALID_CALL); +OPAL_CALL(opal_console_write, OPAL_CONSOLE_WRITE); +OPAL_CALL(opal_console_read, OPAL_CONSOLE_READ); +OPAL_CALL(opal_console_write_buffer_space, OPAL_CONSOLE_WRITE_BUFFER_SPACE); +OPAL_CALL(opal_rtc_read, OPAL_RTC_READ); +OPAL_CALL(opal_rtc_write, OPAL_RTC_WRITE); +OPAL_CALL(opal_cec_power_down, OPAL_CEC_POWER_DOWN); +OPAL_CALL(opal_cec_reboot, OPAL_CEC_REBOOT); +OPAL_CALL(opal_cec_reboot2, OPAL_CEC_REBOOT2); +OPAL_CALL(opal_read_nvram, OPAL_READ_NVRAM); +OPAL_CALL(opal_write_nvram, OPAL_WRITE_NVRAM); +OPAL_CALL(opal_handle_interrupt, OPAL_HANDLE_INTERRUPT); +OPAL_CALL(opal_poll_events, OPAL_POLL_EVENTS); +OPAL_CALL(opal_pci_set_hub_tce_memory, OPAL_PCI_SET_HUB_TCE_MEMORY); +OPAL_CALL(opal_pci_set_phb_tce_memory, OPAL_PCI_SET_PHB_TCE_MEMORY); +OPAL_CALL(opal_pci_config_read_byte, OPAL_PCI_CONFIG_READ_BYTE); +OPAL_CALL(opal_pci_config_read_half_word, OPAL_PCI_CONFIG_READ_HALF_WORD); +OPAL_CALL(opal_pci_config_read_word, OPAL_PCI_CONFIG_READ_WORD); +OPAL_CALL(opal_pci_config_write_byte, OPAL_PCI_CONFIG_WRITE_BYTE); +OPAL_CALL(opal_pci_config_write_half_word, OPAL_PCI_CONFIG_WRITE_HALF_WORD); +OPAL_CALL(opal_pci_config_write_word, OPAL_PCI_CONFIG_WRITE_WORD); +OPAL_CALL(opal_set_xive, OPAL_SET_XIVE); +OPAL_CALL(opal_get_xive, OPAL_GET_XIVE); +OPAL_CALL(opal_register_exception_handler, OPAL_REGISTER_OPAL_EXCEPTION_HANDLER); +OPAL_CALL(opal_pci_eeh_freeze_status, OPAL_PCI_EEH_FREEZE_STATUS); +OPAL_CALL(opal_pci_eeh_freeze_clear, OPAL_PCI_EEH_FREEZE_CLEAR); +OPAL_CALL(opal_pci_eeh_freeze_set, OPAL_PCI_EEH_FREEZE_SET); +OPAL_CALL(opal_pci_err_inject, OPAL_PCI_ERR_INJECT); +OPAL_CALL(opal_pci_shpc, OPAL_PCI_SHPC); +OPAL_CALL(opal_pci_phb_mmio_enable, OPAL_PCI_PHB_MMIO_ENABLE); +OPAL_CALL(opal_pci_set_phb_mem_window, OPAL_PCI_SET_PHB_MEM_WINDOW); +OPAL_CALL(opal_pci_map_pe_mmio_window, OPAL_PCI_MAP_PE_MMIO_WINDOW); +OPAL_CALL(opal_pci_set_phb_table_memory, OPAL_PCI_SET_PHB_TABLE_MEMORY); +OPAL_CALL(opal_pci_set_pe, OPAL_PCI_SET_PE); +OPAL_CALL(opal_pci_set_peltv, OPAL_PCI_SET_PELTV); +OPAL_CALL(opal_pci_set_mve, OPAL_PCI_SET_MVE); +OPAL_CALL(opal_pci_set_mve_enable, OPAL_PCI_SET_MVE_ENABLE); +OPAL_CALL(opal_pci_get_xive_reissue, OPAL_PCI_GET_XIVE_REISSUE); +OPAL_CALL(opal_pci_set_xive_reissue, OPAL_PCI_SET_XIVE_REISSUE); +OPAL_CALL(opal_pci_set_xive_pe, OPAL_PCI_SET_XIVE_PE); +OPAL_CALL(opal_get_xive_source, OPAL_GET_XIVE_SOURCE); +OPAL_CALL(opal_get_msi_32, OPAL_GET_MSI_32); +OPAL_CALL(opal_get_msi_64, OPAL_GET_MSI_64); +OPAL_CALL(opal_start_cpu, OPAL_START_CPU); +OPAL_CALL(opal_query_cpu_status, OPAL_QUERY_CPU_STATUS); +OPAL_CALL(opal_write_oppanel, OPAL_WRITE_OPPANEL); +OPAL_CALL(opal_pci_map_pe_dma_window, OPAL_PCI_MAP_PE_DMA_WINDOW); +OPAL_CALL(opal_pci_map_pe_dma_window_real, OPAL_PCI_MAP_PE_DMA_WINDOW_REAL); +OPAL_CALL(opal_pci_reset, OPAL_PCI_RESET); +OPAL_CALL(opal_pci_get_hub_diag_data, OPAL_PCI_GET_HUB_DIAG_DATA); +OPAL_CALL(opal_pci_get_phb_diag_data, OPAL_PCI_GET_PHB_DIAG_DATA); +OPAL_CALL(opal_pci_fence_phb, OPAL_PCI_FENCE_PHB); +OPAL_CALL(opal_pci_reinit, OPAL_PCI_REINIT); +OPAL_CALL(opal_pci_mask_pe_error, OPAL_PCI_MASK_PE_ERROR); +OPAL_CALL(opal_set_slot_led_status, OPAL_SET_SLOT_LED_STATUS); +OPAL_CALL(opal_get_epow_status, OPAL_GET_EPOW_STATUS); +OPAL_CALL(opal_get_dpo_status, OPAL_GET_DPO_STATUS); +OPAL_CALL(opal_set_system_attention_led, OPAL_SET_SYSTEM_ATTENTION_LED); +OPAL_CALL(opal_pci_next_error, OPAL_PCI_NEXT_ERROR); +OPAL_CALL(opal_pci_poll, OPAL_PCI_POLL); +OPAL_CALL(opal_pci_msi_eoi, OPAL_PCI_MSI_EOI); +OPAL_CALL(opal_pci_get_phb_diag_data2, OPAL_PCI_GET_PHB_DIAG_DATA2); +OPAL_CALL(opal_xscom_read, OPAL_XSCOM_READ); +OPAL_CALL(opal_xscom_write, OPAL_XSCOM_WRITE); +OPAL_CALL(opal_lpc_read, OPAL_LPC_READ); +OPAL_CALL(opal_lpc_write, OPAL_LPC_WRITE); +OPAL_CALL(opal_return_cpu, OPAL_RETURN_CPU); +OPAL_CALL(opal_reinit_cpus, OPAL_REINIT_CPUS); +OPAL_CALL(opal_read_elog, OPAL_ELOG_READ); +OPAL_CALL(opal_send_ack_elog, OPAL_ELOG_ACK); +OPAL_CALL(opal_get_elog_size, OPAL_ELOG_SIZE); +OPAL_CALL(opal_resend_pending_logs, OPAL_ELOG_RESEND); +OPAL_CALL(opal_write_elog, OPAL_ELOG_WRITE); +OPAL_CALL(opal_validate_flash, OPAL_FLASH_VALIDATE); +OPAL_CALL(opal_manage_flash, OPAL_FLASH_MANAGE); +OPAL_CALL(opal_update_flash, OPAL_FLASH_UPDATE); +OPAL_CALL(opal_resync_timebase, OPAL_RESYNC_TIMEBASE); +OPAL_CALL(opal_check_token, OPAL_CHECK_TOKEN); +OPAL_CALL(opal_dump_init, OPAL_DUMP_INIT); +OPAL_CALL(opal_dump_info, OPAL_DUMP_INFO); +OPAL_CALL(opal_dump_info2, OPAL_DUMP_INFO2); +OPAL_CALL(opal_dump_read, OPAL_DUMP_READ); +OPAL_CALL(opal_dump_ack, OPAL_DUMP_ACK); +OPAL_CALL(opal_get_msg, OPAL_GET_MSG); +OPAL_CALL(opal_write_oppanel_async, OPAL_WRITE_OPPANEL_ASYNC); +OPAL_CALL(opal_check_completion, OPAL_CHECK_ASYNC_COMPLETION); +OPAL_CALL(opal_dump_resend_notification, OPAL_DUMP_RESEND); +OPAL_CALL(opal_sync_host_reboot, OPAL_SYNC_HOST_REBOOT); +OPAL_CALL(opal_sensor_read, OPAL_SENSOR_READ); +OPAL_CALL(opal_get_param, OPAL_GET_PARAM); +OPAL_CALL(opal_set_param, OPAL_SET_PARAM); +OPAL_CALL(opal_handle_hmi, OPAL_HANDLE_HMI); +OPAL_CALL(opal_config_cpu_idle_state, OPAL_CONFIG_CPU_IDLE_STATE); +OPAL_CALL(opal_slw_set_reg, OPAL_SLW_SET_REG); +OPAL_CALL(opal_register_dump_region, OPAL_REGISTER_DUMP_REGION); +OPAL_CALL(opal_unregister_dump_region, OPAL_UNREGISTER_DUMP_REGION); +OPAL_CALL(opal_pci_set_phb_cxl_mode, OPAL_PCI_SET_PHB_CAPI_MODE); +OPAL_CALL(opal_tpo_write, OPAL_WRITE_TPO); +OPAL_CALL(opal_tpo_read, OPAL_READ_TPO); +OPAL_CALL(opal_ipmi_send, OPAL_IPMI_SEND); +OPAL_CALL(opal_ipmi_recv, OPAL_IPMI_RECV); +OPAL_CALL(opal_i2c_request, OPAL_I2C_REQUEST); +OPAL_CALL(opal_flash_read, OPAL_FLASH_READ); +OPAL_CALL(opal_flash_write, OPAL_FLASH_WRITE); +OPAL_CALL(opal_flash_erase, OPAL_FLASH_ERASE); +OPAL_CALL(opal_prd_msg, OPAL_PRD_MSG); +OPAL_CALL(opal_leds_get_ind, OPAL_LEDS_GET_INDICATOR); +OPAL_CALL(opal_leds_set_ind, OPAL_LEDS_SET_INDICATOR); +OPAL_CALL(opal_console_flush, OPAL_CONSOLE_FLUSH); +OPAL_CALL(opal_get_device_tree, OPAL_GET_DEVICE_TREE); +OPAL_CALL(opal_pci_get_presence_state, OPAL_PCI_GET_PRESENCE_STATE); +OPAL_CALL(opal_pci_get_power_state, OPAL_PCI_GET_POWER_STATE); +OPAL_CALL(opal_pci_set_power_state, OPAL_PCI_SET_POWER_STATE); +OPAL_CALL(opal_int_get_xirr, OPAL_INT_GET_XIRR); +OPAL_CALL(opal_int_set_cppr, OPAL_INT_SET_CPPR); +OPAL_CALL(opal_int_eoi, OPAL_INT_EOI); +OPAL_CALL(opal_int_set_mfrr, OPAL_INT_SET_MFRR); +OPAL_CALL(opal_pci_tce_kill, OPAL_PCI_TCE_KILL); +OPAL_CALL(opal_nmmu_set_ptcr, OPAL_NMMU_SET_PTCR); +OPAL_CALL(opal_xive_reset, OPAL_XIVE_RESET); +OPAL_CALL(opal_xive_get_irq_info, OPAL_XIVE_GET_IRQ_INFO); +OPAL_CALL(opal_xive_get_irq_config, OPAL_XIVE_GET_IRQ_CONFIG); +OPAL_CALL(opal_xive_set_irq_config, OPAL_XIVE_SET_IRQ_CONFIG); +OPAL_CALL(opal_xive_get_queue_info, OPAL_XIVE_GET_QUEUE_INFO); +OPAL_CALL(opal_xive_set_queue_info, OPAL_XIVE_SET_QUEUE_INFO); +OPAL_CALL(opal_xive_donate_page, OPAL_XIVE_DONATE_PAGE); +OPAL_CALL(opal_xive_alloc_vp_block, OPAL_XIVE_ALLOCATE_VP_BLOCK); +OPAL_CALL(opal_xive_free_vp_block, OPAL_XIVE_FREE_VP_BLOCK); +OPAL_CALL(opal_xive_allocate_irq, OPAL_XIVE_ALLOCATE_IRQ); +OPAL_CALL(opal_xive_free_irq, OPAL_XIVE_FREE_IRQ); +OPAL_CALL(opal_xive_get_vp_info, OPAL_XIVE_GET_VP_INFO); +OPAL_CALL(opal_xive_set_vp_info, OPAL_XIVE_SET_VP_INFO); +OPAL_CALL(opal_xive_sync, OPAL_XIVE_SYNC); +OPAL_CALL(opal_xive_dump, OPAL_XIVE_DUMP); +OPAL_CALL(opal_signal_system_reset, OPAL_SIGNAL_SYSTEM_RESET); +OPAL_CALL(opal_npu_init_context, OPAL_NPU_INIT_CONTEXT); +OPAL_CALL(opal_npu_destroy_context, OPAL_NPU_DESTROY_CONTEXT); +OPAL_CALL(opal_npu_map_lpar, OPAL_NPU_MAP_LPAR); +OPAL_CALL(opal_imc_counters_init, OPAL_IMC_COUNTERS_INIT); +OPAL_CALL(opal_imc_counters_start, OPAL_IMC_COUNTERS_START); +OPAL_CALL(opal_imc_counters_stop, OPAL_IMC_COUNTERS_STOP); +OPAL_CALL(opal_pci_set_p2p, OPAL_PCI_SET_P2P); +OPAL_CALL(opal_get_powercap, OPAL_GET_POWERCAP); +OPAL_CALL(opal_set_powercap, OPAL_SET_POWERCAP); +OPAL_CALL(opal_get_power_shift_ratio, OPAL_GET_POWER_SHIFT_RATIO); +OPAL_CALL(opal_set_power_shift_ratio, OPAL_SET_POWER_SHIFT_RATIO); +OPAL_CALL(opal_sensor_group_clear, OPAL_SENSOR_GROUP_CLEAR); +OPAL_CALL(opal_quiesce, OPAL_QUIESCE); +OPAL_CALL(opal_npu_spa_setup, OPAL_NPU_SPA_SETUP); +OPAL_CALL(opal_npu_spa_clear_cache, OPAL_NPU_SPA_CLEAR_CACHE); +OPAL_CALL(opal_npu_tl_set, OPAL_NPU_TL_SET); +OPAL_CALL(opal_pci_get_pbcq_tunnel_bar, OPAL_PCI_GET_PBCQ_TUNNEL_BAR); +OPAL_CALL(opal_pci_set_pbcq_tunnel_bar, OPAL_PCI_SET_PBCQ_TUNNEL_BAR); +OPAL_CALL(opal_sensor_read_u64, OPAL_SENSOR_READ_U64); +OPAL_CALL(opal_sensor_group_enable, OPAL_SENSOR_GROUP_ENABLE); +OPAL_CALL(opal_nx_coproc_init, OPAL_NX_COPROC_INIT); diff --git a/arch/powerpc/platforms/powernv/opal-msglog.c b/arch/powerpc/platforms/powernv/opal-msglog.c index acd3206dfae3..06628c71cef6 100644 --- a/arch/powerpc/platforms/powernv/opal-msglog.c +++ b/arch/powerpc/platforms/powernv/opal-msglog.c @@ -98,7 +98,7 @@ static ssize_t opal_msglog_read(struct file *file, struct kobject *kobj, } static struct bin_attribute opal_msglog_attr = { - .attr = {.name = "msglog", .mode = 0444}, + .attr = {.name = "msglog", .mode = 0400}, .read = opal_msglog_read }; diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index f4875fe3f8ff..7d2052d8af9d 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -17,317 +17,51 @@ #include <asm/asm-compat.h> #include <asm/feature-fixups.h> - .section ".text" - -#ifdef CONFIG_TRACEPOINTS -#ifdef CONFIG_JUMP_LABEL -#define OPAL_BRANCH(LABEL) \ - ARCH_STATIC_BRANCH(LABEL, opal_tracepoint_key) -#else - - .section ".toc","aw" - - .globl opal_tracepoint_refcount -opal_tracepoint_refcount: - .8byte 0 - - .section ".text" - -/* - * We branch around this in early init by using an unconditional cpu - * feature. - */ -#define OPAL_BRANCH(LABEL) \ -BEGIN_FTR_SECTION; \ - b 1f; \ -END_FTR_SECTION(0, 1); \ - ld r11,opal_tracepoint_refcount@toc(r2); \ - cmpdi r11,0; \ - bne- LABEL; \ -1: - -#endif - -#else -#define OPAL_BRANCH(LABEL) -#endif + .section ".text" /* - * DO_OPAL_CALL assumes: - * r0 = opal call token - * r12 = msr - * LR has been saved + * r3-r10 - OPAL call arguments + * STK_PARAM(R11) - OPAL opcode + * STK_PARAM(R12) - MSR to restore */ -#define DO_OPAL_CALL() \ - mfcr r11; \ - stw r11,8(r1); \ - li r11,0; \ - ori r11,r11,MSR_EE; \ - std r12,PACASAVEDMSR(r13); \ - andc r12,r12,r11; \ - mtmsrd r12,1; \ - LOAD_REG_ADDR(r11,opal_return); \ - mtlr r11; \ - li r11,MSR_DR|MSR_IR|MSR_LE;\ - andc r12,r12,r11; \ - mtspr SPRN_HSRR1,r12; \ - LOAD_REG_ADDR(r11,opal); \ - ld r12,8(r11); \ - ld r2,0(r11); \ - mtspr SPRN_HSRR0,r12; \ +_GLOBAL_TOC(__opal_call) + mflr r0 + std r0,PPC_LR_STKOFF(r1) + ld r12,STK_PARAM(R12)(r1) + li r0,MSR_IR|MSR_DR|MSR_LE + andc r12,r12,r0 + LOAD_REG_ADDR(r11, opal_return) + mtlr r11 + LOAD_REG_ADDR(r11, opal) + ld r2,0(r11) + ld r11,8(r11) + mtspr SPRN_HSRR0,r11 + mtspr SPRN_HSRR1,r12 + /* set token to r0 */ + ld r0,STK_PARAM(R11)(r1) hrfid - -#define OPAL_CALL(name, token) \ - _GLOBAL_TOC(name); \ - mfmsr r12; \ - mflr r0; \ - andi. r11,r12,MSR_IR|MSR_DR; \ - std r0,PPC_LR_STKOFF(r1); \ - li r0,token; \ - beq opal_real_call; \ - OPAL_BRANCH(opal_tracepoint_entry) \ - DO_OPAL_CALL() - - opal_return: /* - * Fixup endian on OPAL return... we should be able to simplify - * this by instead converting the below trampoline to a set of - * bytes (always BE) since MSR:LE will end up fixed up as a side - * effect of the rfid. + * Restore MSR on OPAL return. The MSR is set to big-endian. */ - FIXUP_ENDIAN_HV - ld r2,PACATOC(r13); - lwz r4,8(r1); - ld r5,PPC_LR_STKOFF(r1); - ld r6,PACASAVEDMSR(r13); - mtcr r4; - mtspr SPRN_HSRR0,r5; - mtspr SPRN_HSRR1,r6; - hrfid - -opal_real_call: - mfcr r11 - stw r11,8(r1) - /* Set opal return address */ - LOAD_REG_ADDR(r11, opal_return_realmode) - mtlr r11 - li r11,MSR_LE - andc r12,r12,r11 - mtspr SPRN_HSRR1,r12 - LOAD_REG_ADDR(r11,opal) - ld r12,8(r11) - ld r2,0(r11) - mtspr SPRN_HSRR0,r12 - hrfid - -opal_return_realmode: - FIXUP_ENDIAN_HV - ld r2,PACATOC(r13); - lwz r11,8(r1); - ld r12,PPC_LR_STKOFF(r1) - mtcr r11; - mtlr r12 - blr - -#ifdef CONFIG_TRACEPOINTS -opal_tracepoint_entry: - stdu r1,-STACKFRAMESIZE(r1) - std r0,STK_REG(R23)(r1) - std r3,STK_REG(R24)(r1) - std r4,STK_REG(R25)(r1) - std r5,STK_REG(R26)(r1) - std r6,STK_REG(R27)(r1) - std r7,STK_REG(R28)(r1) - std r8,STK_REG(R29)(r1) - std r9,STK_REG(R30)(r1) - std r10,STK_REG(R31)(r1) - mr r3,r0 - addi r4,r1,STK_REG(R24) - bl __trace_opal_entry - ld r0,STK_REG(R23)(r1) - ld r3,STK_REG(R24)(r1) - ld r4,STK_REG(R25)(r1) - ld r5,STK_REG(R26)(r1) - ld r6,STK_REG(R27)(r1) - ld r7,STK_REG(R28)(r1) - ld r8,STK_REG(R29)(r1) - ld r9,STK_REG(R30)(r1) - ld r10,STK_REG(R31)(r1) - - /* setup LR so we return via tracepoint_return */ - LOAD_REG_ADDR(r11,opal_tracepoint_return) - std r11,16(r1) - - mfmsr r12 - DO_OPAL_CALL() - -opal_tracepoint_return: - std r3,STK_REG(R31)(r1) - mr r4,r3 - ld r3,STK_REG(R23)(r1) - bl __trace_opal_exit - ld r3,STK_REG(R31)(r1) - addi r1,r1,STACKFRAMESIZE - ld r0,16(r1) +#ifdef __BIG_ENDIAN__ + ld r11,STK_PARAM(R12)(r1) + mtmsrd r11 +#else + /* Endian can only be switched with rfi, must byte reverse MSR load */ + .short 0x4039 /* li r10,STK_PARAM(R12) */ + .byte (STK_PARAM(R12) >> 8) & 0xff + .byte STK_PARAM(R12) & 0xff + + .long 0x280c6a7d /* ldbrx r11,r10,r1 */ + .long 0x05009f42 /* bcl 20,31,$+4 */ + .long 0xa602487d /* mflr r10 */ + .long 0x14004a39 /* addi r10,r10,20 */ + .long 0xa64b5a7d /* mthsrr0 r10 */ + .long 0xa64b7b7d /* mthsrr1 r11 */ + .long 0x2402004c /* hrfid */ +#endif + ld r2,PACATOC(r13) + ld r0,PPC_LR_STKOFF(r1) mtlr r0 blr -#endif - - -OPAL_CALL(opal_invalid_call, OPAL_INVALID_CALL); -OPAL_CALL(opal_console_write, OPAL_CONSOLE_WRITE); -OPAL_CALL(opal_console_read, OPAL_CONSOLE_READ); -OPAL_CALL(opal_console_write_buffer_space, OPAL_CONSOLE_WRITE_BUFFER_SPACE); -OPAL_CALL(opal_rtc_read, OPAL_RTC_READ); -OPAL_CALL(opal_rtc_write, OPAL_RTC_WRITE); -OPAL_CALL(opal_cec_power_down, OPAL_CEC_POWER_DOWN); -OPAL_CALL(opal_cec_reboot, OPAL_CEC_REBOOT); -OPAL_CALL(opal_cec_reboot2, OPAL_CEC_REBOOT2); -OPAL_CALL(opal_read_nvram, OPAL_READ_NVRAM); -OPAL_CALL(opal_write_nvram, OPAL_WRITE_NVRAM); -OPAL_CALL(opal_handle_interrupt, OPAL_HANDLE_INTERRUPT); -OPAL_CALL(opal_poll_events, OPAL_POLL_EVENTS); -OPAL_CALL(opal_pci_set_hub_tce_memory, OPAL_PCI_SET_HUB_TCE_MEMORY); -OPAL_CALL(opal_pci_set_phb_tce_memory, OPAL_PCI_SET_PHB_TCE_MEMORY); -OPAL_CALL(opal_pci_config_read_byte, OPAL_PCI_CONFIG_READ_BYTE); -OPAL_CALL(opal_pci_config_read_half_word, OPAL_PCI_CONFIG_READ_HALF_WORD); -OPAL_CALL(opal_pci_config_read_word, OPAL_PCI_CONFIG_READ_WORD); -OPAL_CALL(opal_pci_config_write_byte, OPAL_PCI_CONFIG_WRITE_BYTE); -OPAL_CALL(opal_pci_config_write_half_word, OPAL_PCI_CONFIG_WRITE_HALF_WORD); -OPAL_CALL(opal_pci_config_write_word, OPAL_PCI_CONFIG_WRITE_WORD); -OPAL_CALL(opal_set_xive, OPAL_SET_XIVE); -OPAL_CALL(opal_get_xive, OPAL_GET_XIVE); -OPAL_CALL(opal_register_exception_handler, OPAL_REGISTER_OPAL_EXCEPTION_HANDLER); -OPAL_CALL(opal_pci_eeh_freeze_status, OPAL_PCI_EEH_FREEZE_STATUS); -OPAL_CALL(opal_pci_eeh_freeze_clear, OPAL_PCI_EEH_FREEZE_CLEAR); -OPAL_CALL(opal_pci_eeh_freeze_set, OPAL_PCI_EEH_FREEZE_SET); -OPAL_CALL(opal_pci_err_inject, OPAL_PCI_ERR_INJECT); -OPAL_CALL(opal_pci_shpc, OPAL_PCI_SHPC); -OPAL_CALL(opal_pci_phb_mmio_enable, OPAL_PCI_PHB_MMIO_ENABLE); -OPAL_CALL(opal_pci_set_phb_mem_window, OPAL_PCI_SET_PHB_MEM_WINDOW); -OPAL_CALL(opal_pci_map_pe_mmio_window, OPAL_PCI_MAP_PE_MMIO_WINDOW); -OPAL_CALL(opal_pci_set_phb_table_memory, OPAL_PCI_SET_PHB_TABLE_MEMORY); -OPAL_CALL(opal_pci_set_pe, OPAL_PCI_SET_PE); -OPAL_CALL(opal_pci_set_peltv, OPAL_PCI_SET_PELTV); -OPAL_CALL(opal_pci_set_mve, OPAL_PCI_SET_MVE); -OPAL_CALL(opal_pci_set_mve_enable, OPAL_PCI_SET_MVE_ENABLE); -OPAL_CALL(opal_pci_get_xive_reissue, OPAL_PCI_GET_XIVE_REISSUE); -OPAL_CALL(opal_pci_set_xive_reissue, OPAL_PCI_SET_XIVE_REISSUE); -OPAL_CALL(opal_pci_set_xive_pe, OPAL_PCI_SET_XIVE_PE); -OPAL_CALL(opal_get_xive_source, OPAL_GET_XIVE_SOURCE); -OPAL_CALL(opal_get_msi_32, OPAL_GET_MSI_32); -OPAL_CALL(opal_get_msi_64, OPAL_GET_MSI_64); -OPAL_CALL(opal_start_cpu, OPAL_START_CPU); -OPAL_CALL(opal_query_cpu_status, OPAL_QUERY_CPU_STATUS); -OPAL_CALL(opal_write_oppanel, OPAL_WRITE_OPPANEL); -OPAL_CALL(opal_pci_map_pe_dma_window, OPAL_PCI_MAP_PE_DMA_WINDOW); -OPAL_CALL(opal_pci_map_pe_dma_window_real, OPAL_PCI_MAP_PE_DMA_WINDOW_REAL); -OPAL_CALL(opal_pci_reset, OPAL_PCI_RESET); -OPAL_CALL(opal_pci_get_hub_diag_data, OPAL_PCI_GET_HUB_DIAG_DATA); -OPAL_CALL(opal_pci_get_phb_diag_data, OPAL_PCI_GET_PHB_DIAG_DATA); -OPAL_CALL(opal_pci_fence_phb, OPAL_PCI_FENCE_PHB); -OPAL_CALL(opal_pci_reinit, OPAL_PCI_REINIT); -OPAL_CALL(opal_pci_mask_pe_error, OPAL_PCI_MASK_PE_ERROR); -OPAL_CALL(opal_set_slot_led_status, OPAL_SET_SLOT_LED_STATUS); -OPAL_CALL(opal_get_epow_status, OPAL_GET_EPOW_STATUS); -OPAL_CALL(opal_get_dpo_status, OPAL_GET_DPO_STATUS); -OPAL_CALL(opal_set_system_attention_led, OPAL_SET_SYSTEM_ATTENTION_LED); -OPAL_CALL(opal_pci_next_error, OPAL_PCI_NEXT_ERROR); -OPAL_CALL(opal_pci_poll, OPAL_PCI_POLL); -OPAL_CALL(opal_pci_msi_eoi, OPAL_PCI_MSI_EOI); -OPAL_CALL(opal_pci_get_phb_diag_data2, OPAL_PCI_GET_PHB_DIAG_DATA2); -OPAL_CALL(opal_xscom_read, OPAL_XSCOM_READ); -OPAL_CALL(opal_xscom_write, OPAL_XSCOM_WRITE); -OPAL_CALL(opal_lpc_read, OPAL_LPC_READ); -OPAL_CALL(opal_lpc_write, OPAL_LPC_WRITE); -OPAL_CALL(opal_return_cpu, OPAL_RETURN_CPU); -OPAL_CALL(opal_reinit_cpus, OPAL_REINIT_CPUS); -OPAL_CALL(opal_read_elog, OPAL_ELOG_READ); -OPAL_CALL(opal_send_ack_elog, OPAL_ELOG_ACK); -OPAL_CALL(opal_get_elog_size, OPAL_ELOG_SIZE); -OPAL_CALL(opal_resend_pending_logs, OPAL_ELOG_RESEND); -OPAL_CALL(opal_write_elog, OPAL_ELOG_WRITE); -OPAL_CALL(opal_validate_flash, OPAL_FLASH_VALIDATE); -OPAL_CALL(opal_manage_flash, OPAL_FLASH_MANAGE); -OPAL_CALL(opal_update_flash, OPAL_FLASH_UPDATE); -OPAL_CALL(opal_resync_timebase, OPAL_RESYNC_TIMEBASE); -OPAL_CALL(opal_check_token, OPAL_CHECK_TOKEN); -OPAL_CALL(opal_dump_init, OPAL_DUMP_INIT); -OPAL_CALL(opal_dump_info, OPAL_DUMP_INFO); -OPAL_CALL(opal_dump_info2, OPAL_DUMP_INFO2); -OPAL_CALL(opal_dump_read, OPAL_DUMP_READ); -OPAL_CALL(opal_dump_ack, OPAL_DUMP_ACK); -OPAL_CALL(opal_get_msg, OPAL_GET_MSG); -OPAL_CALL(opal_write_oppanel_async, OPAL_WRITE_OPPANEL_ASYNC); -OPAL_CALL(opal_check_completion, OPAL_CHECK_ASYNC_COMPLETION); -OPAL_CALL(opal_dump_resend_notification, OPAL_DUMP_RESEND); -OPAL_CALL(opal_sync_host_reboot, OPAL_SYNC_HOST_REBOOT); -OPAL_CALL(opal_sensor_read, OPAL_SENSOR_READ); -OPAL_CALL(opal_get_param, OPAL_GET_PARAM); -OPAL_CALL(opal_set_param, OPAL_SET_PARAM); -OPAL_CALL(opal_handle_hmi, OPAL_HANDLE_HMI); -OPAL_CALL(opal_config_cpu_idle_state, OPAL_CONFIG_CPU_IDLE_STATE); -OPAL_CALL(opal_slw_set_reg, OPAL_SLW_SET_REG); -OPAL_CALL(opal_register_dump_region, OPAL_REGISTER_DUMP_REGION); -OPAL_CALL(opal_unregister_dump_region, OPAL_UNREGISTER_DUMP_REGION); -OPAL_CALL(opal_pci_set_phb_cxl_mode, OPAL_PCI_SET_PHB_CAPI_MODE); -OPAL_CALL(opal_tpo_write, OPAL_WRITE_TPO); -OPAL_CALL(opal_tpo_read, OPAL_READ_TPO); -OPAL_CALL(opal_ipmi_send, OPAL_IPMI_SEND); -OPAL_CALL(opal_ipmi_recv, OPAL_IPMI_RECV); -OPAL_CALL(opal_i2c_request, OPAL_I2C_REQUEST); -OPAL_CALL(opal_flash_read, OPAL_FLASH_READ); -OPAL_CALL(opal_flash_write, OPAL_FLASH_WRITE); -OPAL_CALL(opal_flash_erase, OPAL_FLASH_ERASE); -OPAL_CALL(opal_prd_msg, OPAL_PRD_MSG); -OPAL_CALL(opal_leds_get_ind, OPAL_LEDS_GET_INDICATOR); -OPAL_CALL(opal_leds_set_ind, OPAL_LEDS_SET_INDICATOR); -OPAL_CALL(opal_console_flush, OPAL_CONSOLE_FLUSH); -OPAL_CALL(opal_get_device_tree, OPAL_GET_DEVICE_TREE); -OPAL_CALL(opal_pci_get_presence_state, OPAL_PCI_GET_PRESENCE_STATE); -OPAL_CALL(opal_pci_get_power_state, OPAL_PCI_GET_POWER_STATE); -OPAL_CALL(opal_pci_set_power_state, OPAL_PCI_SET_POWER_STATE); -OPAL_CALL(opal_int_get_xirr, OPAL_INT_GET_XIRR); -OPAL_CALL(opal_int_set_cppr, OPAL_INT_SET_CPPR); -OPAL_CALL(opal_int_eoi, OPAL_INT_EOI); -OPAL_CALL(opal_int_set_mfrr, OPAL_INT_SET_MFRR); -OPAL_CALL(opal_pci_tce_kill, OPAL_PCI_TCE_KILL); -OPAL_CALL(opal_nmmu_set_ptcr, OPAL_NMMU_SET_PTCR); -OPAL_CALL(opal_xive_reset, OPAL_XIVE_RESET); -OPAL_CALL(opal_xive_get_irq_info, OPAL_XIVE_GET_IRQ_INFO); -OPAL_CALL(opal_xive_get_irq_config, OPAL_XIVE_GET_IRQ_CONFIG); -OPAL_CALL(opal_xive_set_irq_config, OPAL_XIVE_SET_IRQ_CONFIG); -OPAL_CALL(opal_xive_get_queue_info, OPAL_XIVE_GET_QUEUE_INFO); -OPAL_CALL(opal_xive_set_queue_info, OPAL_XIVE_SET_QUEUE_INFO); -OPAL_CALL(opal_xive_donate_page, OPAL_XIVE_DONATE_PAGE); -OPAL_CALL(opal_xive_alloc_vp_block, OPAL_XIVE_ALLOCATE_VP_BLOCK); -OPAL_CALL(opal_xive_free_vp_block, OPAL_XIVE_FREE_VP_BLOCK); -OPAL_CALL(opal_xive_allocate_irq, OPAL_XIVE_ALLOCATE_IRQ); -OPAL_CALL(opal_xive_free_irq, OPAL_XIVE_FREE_IRQ); -OPAL_CALL(opal_xive_get_vp_info, OPAL_XIVE_GET_VP_INFO); -OPAL_CALL(opal_xive_set_vp_info, OPAL_XIVE_SET_VP_INFO); -OPAL_CALL(opal_xive_sync, OPAL_XIVE_SYNC); -OPAL_CALL(opal_xive_dump, OPAL_XIVE_DUMP); -OPAL_CALL(opal_signal_system_reset, OPAL_SIGNAL_SYSTEM_RESET); -OPAL_CALL(opal_npu_init_context, OPAL_NPU_INIT_CONTEXT); -OPAL_CALL(opal_npu_destroy_context, OPAL_NPU_DESTROY_CONTEXT); -OPAL_CALL(opal_npu_map_lpar, OPAL_NPU_MAP_LPAR); -OPAL_CALL(opal_imc_counters_init, OPAL_IMC_COUNTERS_INIT); -OPAL_CALL(opal_imc_counters_start, OPAL_IMC_COUNTERS_START); -OPAL_CALL(opal_imc_counters_stop, OPAL_IMC_COUNTERS_STOP); -OPAL_CALL(opal_pci_set_p2p, OPAL_PCI_SET_P2P); -OPAL_CALL(opal_get_powercap, OPAL_GET_POWERCAP); -OPAL_CALL(opal_set_powercap, OPAL_SET_POWERCAP); -OPAL_CALL(opal_get_power_shift_ratio, OPAL_GET_POWER_SHIFT_RATIO); -OPAL_CALL(opal_set_power_shift_ratio, OPAL_SET_POWER_SHIFT_RATIO); -OPAL_CALL(opal_sensor_group_clear, OPAL_SENSOR_GROUP_CLEAR); -OPAL_CALL(opal_quiesce, OPAL_QUIESCE); -OPAL_CALL(opal_npu_spa_setup, OPAL_NPU_SPA_SETUP); -OPAL_CALL(opal_npu_spa_clear_cache, OPAL_NPU_SPA_CLEAR_CACHE); -OPAL_CALL(opal_npu_tl_set, OPAL_NPU_TL_SET); -OPAL_CALL(opal_pci_get_pbcq_tunnel_bar, OPAL_PCI_GET_PBCQ_TUNNEL_BAR); -OPAL_CALL(opal_pci_set_pbcq_tunnel_bar, OPAL_PCI_SET_PBCQ_TUNNEL_BAR); -OPAL_CALL(opal_sensor_read_u64, OPAL_SENSOR_READ_U64); -OPAL_CALL(opal_sensor_group_enable, OPAL_SENSOR_GROUP_ENABLE); -OPAL_CALL(opal_nx_coproc_init, OPAL_NX_COPROC_INIT); diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 8e157f9f1ff2..727a7de08635 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -26,7 +26,6 @@ #include <linux/memblock.h> #include <linux/kthread.h> #include <linux/freezer.h> -#include <linux/printk.h> #include <linux/kmsg_dump.h> #include <linux/console.h> #include <linux/sched/debug.h> @@ -586,7 +585,7 @@ int opal_machine_check(struct pt_regs *regs) evt.version); return 0; } - machine_check_print_event_info(&evt, user_mode(regs)); + machine_check_print_event_info(&evt, user_mode(regs), false); if (opal_recover_mce(regs, &evt)) return 1; diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c index 697449afb3f7..e28f03e1eb5e 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c +++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c @@ -313,7 +313,6 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, page_shift); tbl->it_level_size = 1ULL << (level_shift - 3); tbl->it_indirect_levels = levels - 1; - tbl->it_allocated_size = total_allocated; tbl->it_userspace = uas; tbl->it_nid = nid; diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 145373f0e5dc..fa6af52b5219 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1748,7 +1748,7 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev pe = &phb->ioda.pe_array[pdn->pe_number]; WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops); - set_dma_offset(&pdev->dev, pe->tce_bypass_base); + pdev->dev.archdata.dma_offset = pe->tce_bypass_base; set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]); /* * Note: iommu_add_device() will fail here as @@ -1758,31 +1758,6 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev */ } -static bool pnv_pci_ioda_pe_single_vendor(struct pnv_ioda_pe *pe) -{ - unsigned short vendor = 0; - struct pci_dev *pdev; - - if (pe->device_count == 1) - return true; - - /* pe->pdev should be set if it's a single device, pe->pbus if not */ - if (!pe->pbus) - return true; - - list_for_each_entry(pdev, &pe->pbus->devices, bus_list) { - if (!vendor) { - vendor = pdev->vendor; - continue; - } - - if (pdev->vendor != vendor) - return false; - } - - return true; -} - /* * Reconfigure TVE#0 to be usable as 64-bit DMA space. * @@ -1852,88 +1827,45 @@ err: return -EIO; } -static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) +static bool pnv_pci_ioda_iommu_bypass_supported(struct pci_dev *pdev, + u64 dma_mask) { struct pci_controller *hose = pci_bus_to_host(pdev->bus); struct pnv_phb *phb = hose->private_data; struct pci_dn *pdn = pci_get_pdn(pdev); struct pnv_ioda_pe *pe; - uint64_t top; - bool bypass = false; - s64 rc; if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) return -ENODEV; pe = &phb->ioda.pe_array[pdn->pe_number]; if (pe->tce_bypass_enabled) { - top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1; - bypass = (dma_mask >= top); + u64 top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1; + if (dma_mask >= top) + return true; } - if (bypass) { - dev_info(&pdev->dev, "Using 64-bit DMA iommu bypass\n"); - set_dma_ops(&pdev->dev, &dma_nommu_ops); - } else { - /* - * If the device can't set the TCE bypass bit but still wants - * to access 4GB or more, on PHB3 we can reconfigure TVE#0 to - * bypass the 32-bit region and be usable for 64-bit DMAs. - * The device needs to be able to address all of this space. - */ - if (dma_mask >> 32 && - dma_mask > (memory_hotplug_max() + (1ULL << 32)) && - pnv_pci_ioda_pe_single_vendor(pe) && - phb->model == PNV_PHB_MODEL_PHB3) { - /* Configure the bypass mode */ - rc = pnv_pci_ioda_dma_64bit_bypass(pe); - if (rc) - return rc; - /* 4GB offset bypasses 32-bit space */ - set_dma_offset(&pdev->dev, (1ULL << 32)); - set_dma_ops(&pdev->dev, &dma_nommu_ops); - } else if (dma_mask >> 32 && dma_mask != DMA_BIT_MASK(64)) { - /* - * Fail the request if a DMA mask between 32 and 64 bits - * was requested but couldn't be fulfilled. Ideally we - * would do this for 64-bits but historically we have - * always fallen back to 32-bits. - */ - return -ENOMEM; - } else { - dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n"); - set_dma_ops(&pdev->dev, &dma_iommu_ops); - } + /* + * If the device can't set the TCE bypass bit but still wants + * to access 4GB or more, on PHB3 we can reconfigure TVE#0 to + * bypass the 32-bit region and be usable for 64-bit DMAs. + * The device needs to be able to address all of this space. + */ + if (dma_mask >> 32 && + dma_mask > (memory_hotplug_max() + (1ULL << 32)) && + /* pe->pdev should be set if it's a single device, pe->pbus if not */ + (pe->device_count == 1 || !pe->pbus) && + phb->model == PNV_PHB_MODEL_PHB3) { + /* Configure the bypass mode */ + s64 rc = pnv_pci_ioda_dma_64bit_bypass(pe); + if (rc) + return rc; + /* 4GB offset bypasses 32-bit space */ + pdev->dev.archdata.dma_offset = (1ULL << 32); + return true; } - *pdev->dev.dma_mask = dma_mask; - /* Update peer npu devices */ - pnv_npu_try_dma_set_bypass(pdev, bypass); - - return 0; -} - -static u64 pnv_pci_ioda_dma_get_required_mask(struct pci_dev *pdev) -{ - struct pci_controller *hose = pci_bus_to_host(pdev->bus); - struct pnv_phb *phb = hose->private_data; - struct pci_dn *pdn = pci_get_pdn(pdev); - struct pnv_ioda_pe *pe; - u64 end, mask; - - if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) - return 0; - - pe = &phb->ioda.pe_array[pdn->pe_number]; - if (!pe->tce_bypass_enabled) - return __dma_get_required_mask(&pdev->dev); - - - end = pe->tce_bypass_base + memblock_end_of_DRAM(); - mask = 1ULL << (fls64(end) - 1); - mask += mask - 1; - - return mask; + return false; } static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus) @@ -1942,7 +1874,7 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus) list_for_each_entry(dev, &bus->devices, bus_list) { set_iommu_table_base(&dev->dev, pe->table_group.tables[0]); - set_dma_offset(&dev->dev, pe->tce_bypass_base); + dev->dev.archdata.dma_offset = pe->tce_bypass_base; if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate) pnv_ioda_setup_bus_dma(pe, dev->subordinate); @@ -2594,8 +2526,13 @@ static long pnv_pci_ioda2_create_table_userspace( int num, __u32 page_shift, __u64 window_size, __u32 levels, struct iommu_table **ptbl) { - return pnv_pci_ioda2_create_table(table_group, + long ret = pnv_pci_ioda2_create_table(table_group, num, page_shift, window_size, levels, true, ptbl); + + if (!ret) + (*ptbl)->it_allocated_size = pnv_pci_ioda2_get_table_size( + page_shift, window_size, levels); + return ret; } static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group) @@ -3661,6 +3598,7 @@ static void pnv_pci_ioda_shutdown(struct pci_controller *hose) static const struct pci_controller_ops pnv_pci_ioda_controller_ops = { .dma_dev_setup = pnv_pci_dma_dev_setup, .dma_bus_setup = pnv_pci_dma_bus_setup, + .iommu_bypass_supported = pnv_pci_ioda_iommu_bypass_supported, .setup_msi_irqs = pnv_setup_msi_irqs, .teardown_msi_irqs = pnv_teardown_msi_irqs, .enable_device_hook = pnv_pci_enable_device_hook, @@ -3668,19 +3606,9 @@ static const struct pci_controller_ops pnv_pci_ioda_controller_ops = { .window_alignment = pnv_pci_window_alignment, .setup_bridge = pnv_pci_setup_bridge, .reset_secondary_bus = pnv_pci_reset_secondary_bus, - .dma_set_mask = pnv_pci_ioda_dma_set_mask, - .dma_get_required_mask = pnv_pci_ioda_dma_get_required_mask, .shutdown = pnv_pci_ioda_shutdown, }; -static int pnv_npu_dma_set_mask(struct pci_dev *npdev, u64 dma_mask) -{ - dev_err_once(&npdev->dev, - "%s operation unsupported for NVLink devices\n", - __func__); - return -EPERM; -} - static const struct pci_controller_ops pnv_npu_ioda_controller_ops = { .dma_dev_setup = pnv_pci_dma_dev_setup, .setup_msi_irqs = pnv_setup_msi_irqs, @@ -3688,7 +3616,6 @@ static const struct pci_controller_ops pnv_npu_ioda_controller_ops = { .enable_device_hook = pnv_pci_enable_device_hook, .window_alignment = pnv_pci_window_alignment, .reset_secondary_bus = pnv_pci_reset_secondary_bus, - .dma_set_mask = pnv_npu_dma_set_mask, .shutdown = pnv_pci_ioda_shutdown, .disable_device = pnv_npu_disable_device, }; @@ -3946,9 +3873,12 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, * shutdown PCI devices correctly. We already got IODA table * cleaned out. So we have to issue PHB reset to stop all PCI * transactions from previous kernel. The ppc_pci_reset_phbs - * kernel parameter will force this reset too. + * kernel parameter will force this reset too. Additionally, + * if the IODA reset above failed then use a bigger hammer. + * This can happen if we get a PHB fatal error in very early + * boot. */ - if (is_kdump_kernel() || pci_reset_phbs) { + if (is_kdump_kernel() || pci_reset_phbs || rc) { pr_info(" Issue PHB reset ...\n"); pnv_eeh_phb_reset(hose, EEH_RESET_FUNDAMENTAL); pnv_eeh_phb_reset(hose, EEH_RESET_DEACTIVATE); diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index 0d354e19ef92..db09c7022635 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -39,6 +39,7 @@ #include <asm/cpuidle.h> #include <asm/kexec.h> #include <asm/reg.h> +#include <asm/powernv.h> #include "powernv.h" @@ -153,6 +154,7 @@ static void pnv_smp_cpu_kill_self(void) { unsigned int cpu; unsigned long srr1, wmask; + u64 lpcr_val; /* Standard hot unplug procedure */ /* @@ -174,6 +176,19 @@ static void pnv_smp_cpu_kill_self(void) if (cpu_has_feature(CPU_FTR_ARCH_207S)) wmask = SRR1_WAKEMASK_P8; + /* + * We don't want to take decrementer interrupts while we are + * offline, so clear LPCR:PECE1. We keep PECE2 (and + * LPCR_PECE_HVEE on P9) enabled so as to let IPIs in. + * + * If the CPU gets woken up by a special wakeup, ensure that + * the SLW engine sets LPCR with decrementer bit cleared, else + * the CPU will come back to the kernel due to a spurious + * wakeup. + */ + lpcr_val = mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1; + pnv_program_cpu_hotplug_lpcr(cpu, lpcr_val); + while (!generic_check_cpu_restart(cpu)) { /* * Clear IPI flag, since we don't handle IPIs while @@ -246,6 +261,16 @@ static void pnv_smp_cpu_kill_self(void) } + /* + * Re-enable decrementer interrupts in LPCR. + * + * Further, we want stop states to be woken up by decrementer + * for non-hotplug cases. So program the LPCR via stop api as + * well. + */ + lpcr_val = mfspr(SPRN_LPCR) | (u64)LPCR_PECE1; + pnv_program_cpu_hotplug_lpcr(cpu, lpcr_val); + DBG("CPU%d coming online...\n", cpu); } diff --git a/arch/powerpc/platforms/ps3/device-init.c b/arch/powerpc/platforms/ps3/device-init.c index e7075aaff1bb..59587b75493d 100644 --- a/arch/powerpc/platforms/ps3/device-init.c +++ b/arch/powerpc/platforms/ps3/device-init.c @@ -354,9 +354,7 @@ static int ps3_setup_storage_dev(const struct ps3_repository_device *repo, repo->dev_index, repo->dev_type, port, blk_size, num_blocks, num_regions); - p = kzalloc(sizeof(struct ps3_storage_device) + - num_regions * sizeof(struct ps3_storage_region), - GFP_KERNEL); + p = kzalloc(struct_size(p, regions, num_regions), GFP_KERNEL); if (!p) { result = -ENOMEM; goto fail_malloc; diff --git a/arch/powerpc/platforms/ps3/os-area.c b/arch/powerpc/platforms/ps3/os-area.c index f5387ad82279..4d65c5380020 100644 --- a/arch/powerpc/platforms/ps3/os-area.c +++ b/arch/powerpc/platforms/ps3/os-area.c @@ -205,11 +205,11 @@ static const struct os_area_db_id os_area_db_id_rtc_diff = { * 3) The number of seconds from 1970 to 2000. */ -struct saved_params { +static struct saved_params { unsigned int valid; s64 rtc_diff; unsigned int av_multi_out; -} static saved_params; +} saved_params; static struct property property_rtc_diff = { .name = "linux,rtc_diff", diff --git a/arch/powerpc/platforms/ps3/system-bus.c b/arch/powerpc/platforms/ps3/system-bus.c index 5cc35d6b94b6..7c227e784247 100644 --- a/arch/powerpc/platforms/ps3/system-bus.c +++ b/arch/powerpc/platforms/ps3/system-bus.c @@ -37,12 +37,12 @@ static struct device ps3_system_bus = { }; /* FIXME: need device usage counters! */ -struct { +static struct { struct mutex mutex; int sb_11; /* usb 0 */ int sb_12; /* usb 0 */ int gpu; -} static usage_hack; +} usage_hack; static int ps3_is_device(struct ps3_system_bus_device *dev, u64 bus_id, u64 dev_id) diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index 2f8e62163602..97feb6e79f1a 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -802,6 +802,25 @@ static int dlpar_cpu_add_by_count(u32 cpus_to_add) return rc; } +int dlpar_cpu_readd(int cpu) +{ + struct device_node *dn; + struct device *dev; + u32 drc_index; + int rc; + + dev = get_cpu_device(cpu); + dn = dev->of_node; + + rc = of_property_read_u32(dn, "ibm,my-drc-index", &drc_index); + + rc = dlpar_cpu_remove_by_index(drc_index); + if (!rc) + rc = dlpar_cpu_add(drc_index); + + return rc; +} + int dlpar_cpu(struct pseries_hp_errorlog *hp_elog) { u32 count, drc_index; diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 8fc8fe0b9848..36eb1ddbac69 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -978,7 +978,7 @@ static phys_addr_t ddw_memory_hotplug_max(void) * pdn: the parent pe node with the ibm,dma_window property * Future: also check if we can remap the base window for our base page size * - * returns the dma offset for use by dma_set_mask + * returns the dma offset for use by the direct mapped DMA code. */ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) { @@ -1198,87 +1198,37 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) iommu_add_device(pci->table_group, &dev->dev); } -static int dma_set_mask_pSeriesLP(struct device *dev, u64 dma_mask) +static bool iommu_bypass_supported_pSeriesLP(struct pci_dev *pdev, u64 dma_mask) { - bool ddw_enabled = false; - struct device_node *pdn, *dn; - struct pci_dev *pdev; + struct device_node *dn = pci_device_to_OF_node(pdev), *pdn; const __be32 *dma_window = NULL; - u64 dma_offset; - - if (!dev->dma_mask) - return -EIO; - - if (!dev_is_pci(dev)) - goto check_mask; - - pdev = to_pci_dev(dev); /* only attempt to use a new window if 64-bit DMA is requested */ - if (!disable_ddw && dma_mask == DMA_BIT_MASK(64)) { - dn = pci_device_to_OF_node(pdev); - dev_dbg(dev, "node is %pOF\n", dn); + if (dma_mask < DMA_BIT_MASK(64)) + return false; - /* - * the device tree might contain the dma-window properties - * per-device and not necessarily for the bus. So we need to - * search upwards in the tree until we either hit a dma-window - * property, OR find a parent with a table already allocated. - */ - for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group; - pdn = pdn->parent) { - dma_window = of_get_property(pdn, "ibm,dma-window", NULL); - if (dma_window) - break; - } - if (pdn && PCI_DN(pdn)) { - dma_offset = enable_ddw(pdev, pdn); - if (dma_offset != 0) { - dev_info(dev, "Using 64-bit direct DMA at offset %llx\n", dma_offset); - set_dma_offset(dev, dma_offset); - set_dma_ops(dev, &dma_nommu_ops); - ddw_enabled = true; - } - } - } + dev_dbg(&pdev->dev, "node is %pOF\n", dn); - /* fall back on iommu ops */ - if (!ddw_enabled && get_dma_ops(dev) != &dma_iommu_ops) { - dev_info(dev, "Restoring 32-bit DMA via iommu\n"); - set_dma_ops(dev, &dma_iommu_ops); + /* + * the device tree might contain the dma-window properties + * per-device and not necessarily for the bus. So we need to + * search upwards in the tree until we either hit a dma-window + * property, OR find a parent with a table already allocated. + */ + for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group; + pdn = pdn->parent) { + dma_window = of_get_property(pdn, "ibm,dma-window", NULL); + if (dma_window) + break; } -check_mask: - if (!dma_supported(dev, dma_mask)) - return -EIO; - - *dev->dma_mask = dma_mask; - return 0; -} - -static u64 dma_get_required_mask_pSeriesLP(struct device *dev) -{ - if (!dev->dma_mask) - return 0; - - if (!disable_ddw && dev_is_pci(dev)) { - struct pci_dev *pdev = to_pci_dev(dev); - struct device_node *dn; - - dn = pci_device_to_OF_node(pdev); - - /* search upwards for ibm,dma-window */ - for (; dn && PCI_DN(dn) && !PCI_DN(dn)->table_group; - dn = dn->parent) - if (of_get_property(dn, "ibm,dma-window", NULL)) - break; - /* if there is a ibm,ddw-applicable property require 64 bits */ - if (dn && PCI_DN(dn) && - of_get_property(dn, "ibm,ddw-applicable", NULL)) - return DMA_BIT_MASK(64); + if (pdn && PCI_DN(pdn)) { + pdev->dev.archdata.dma_offset = enable_ddw(pdev, pdn); + if (pdev->dev.archdata.dma_offset) + return true; } - return dma_iommu_ops.get_required_mask(dev); + return false; } static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action, @@ -1373,8 +1323,9 @@ void iommu_init_early_pSeries(void) if (firmware_has_feature(FW_FEATURE_LPAR)) { pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeriesLP; pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeriesLP; - ppc_md.dma_set_mask = dma_set_mask_pSeriesLP; - ppc_md.dma_get_required_mask = dma_get_required_mask_pSeriesLP; + if (!disable_ddw) + pseries_pci_controller_ops.iommu_bypass_supported = + iommu_bypass_supported_pSeriesLP; } else { pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeries; pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeries; diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c index 794487313cc8..e73c7e30efe6 100644 --- a/arch/powerpc/platforms/pseries/lparcfg.c +++ b/arch/powerpc/platforms/pseries/lparcfg.c @@ -475,6 +475,7 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v) splpar_dispatch_data(m); seq_printf(m, "purr=%ld\n", get_purr()); + seq_printf(m, "tbr=%ld\n", mftb()); } else { /* non SPLPAR case */ seq_printf(m, "system_active_processors=%d\n", diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c index 1fad4649735b..141795275ccb 100644 --- a/arch/powerpc/platforms/pseries/vio.c +++ b/arch/powerpc/platforms/pseries/vio.c @@ -492,7 +492,9 @@ static void *vio_dma_iommu_alloc_coherent(struct device *dev, size_t size, return NULL; } - ret = dma_iommu_ops.alloc(dev, size, dma_handle, flag, attrs); + ret = iommu_alloc_coherent(dev, get_iommu_table_base(dev), size, + dma_handle, dev->coherent_dma_mask, flag, + dev_to_node(dev)); if (unlikely(ret == NULL)) { vio_cmo_dealloc(viodev, roundup(size, PAGE_SIZE)); atomic_inc(&viodev->cmo.allocs_failed); @@ -507,8 +509,7 @@ static void vio_dma_iommu_free_coherent(struct device *dev, size_t size, { struct vio_dev *viodev = to_vio_dev(dev); - dma_iommu_ops.free(dev, size, vaddr, dma_handle, attrs); - + iommu_free_coherent(get_iommu_table_base(dev), size, vaddr, dma_handle); vio_cmo_dealloc(viodev, roundup(size, PAGE_SIZE)); } @@ -518,22 +519,22 @@ static dma_addr_t vio_dma_iommu_map_page(struct device *dev, struct page *page, unsigned long attrs) { struct vio_dev *viodev = to_vio_dev(dev); - struct iommu_table *tbl; + struct iommu_table *tbl = get_iommu_table_base(dev); dma_addr_t ret = DMA_MAPPING_ERROR; - tbl = get_iommu_table_base(dev); - if (vio_cmo_alloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl)))) { - atomic_inc(&viodev->cmo.allocs_failed); - return ret; - } - - ret = dma_iommu_ops.map_page(dev, page, offset, size, direction, attrs); - if (unlikely(dma_mapping_error(dev, ret))) { - vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl))); - atomic_inc(&viodev->cmo.allocs_failed); - } - + if (vio_cmo_alloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl)))) + goto out_fail; + ret = iommu_map_page(dev, tbl, page, offset, size, device_to_mask(dev), + direction, attrs); + if (unlikely(ret == DMA_MAPPING_ERROR)) + goto out_deallocate; return ret; + +out_deallocate: + vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl))); +out_fail: + atomic_inc(&viodev->cmo.allocs_failed); + return DMA_MAPPING_ERROR; } static void vio_dma_iommu_unmap_page(struct device *dev, dma_addr_t dma_handle, @@ -542,11 +543,9 @@ static void vio_dma_iommu_unmap_page(struct device *dev, dma_addr_t dma_handle, unsigned long attrs) { struct vio_dev *viodev = to_vio_dev(dev); - struct iommu_table *tbl; - - tbl = get_iommu_table_base(dev); - dma_iommu_ops.unmap_page(dev, dma_handle, size, direction, attrs); + struct iommu_table *tbl = get_iommu_table_base(dev); + iommu_unmap_page(tbl, dma_handle, size, direction, attrs); vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl))); } @@ -555,34 +554,32 @@ static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist, unsigned long attrs) { struct vio_dev *viodev = to_vio_dev(dev); - struct iommu_table *tbl; + struct iommu_table *tbl = get_iommu_table_base(dev); struct scatterlist *sgl; int ret, count; size_t alloc_size = 0; - tbl = get_iommu_table_base(dev); for_each_sg(sglist, sgl, nelems, count) alloc_size += roundup(sgl->length, IOMMU_PAGE_SIZE(tbl)); - if (vio_cmo_alloc(viodev, alloc_size)) { - atomic_inc(&viodev->cmo.allocs_failed); - return 0; - } - - ret = dma_iommu_ops.map_sg(dev, sglist, nelems, direction, attrs); - - if (unlikely(!ret)) { - vio_cmo_dealloc(viodev, alloc_size); - atomic_inc(&viodev->cmo.allocs_failed); - return ret; - } + if (vio_cmo_alloc(viodev, alloc_size)) + goto out_fail; + ret = ppc_iommu_map_sg(dev, tbl, sglist, nelems, device_to_mask(dev), + direction, attrs); + if (unlikely(!ret)) + goto out_deallocate; for_each_sg(sglist, sgl, ret, count) alloc_size -= roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl)); if (alloc_size) vio_cmo_dealloc(viodev, alloc_size); - return ret; + +out_deallocate: + vio_cmo_dealloc(viodev, alloc_size); +out_fail: + atomic_inc(&viodev->cmo.allocs_failed); + return 0; } static void vio_dma_iommu_unmap_sg(struct device *dev, @@ -591,40 +588,27 @@ static void vio_dma_iommu_unmap_sg(struct device *dev, unsigned long attrs) { struct vio_dev *viodev = to_vio_dev(dev); - struct iommu_table *tbl; + struct iommu_table *tbl = get_iommu_table_base(dev); struct scatterlist *sgl; size_t alloc_size = 0; int count; - tbl = get_iommu_table_base(dev); for_each_sg(sglist, sgl, nelems, count) alloc_size += roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl)); - dma_iommu_ops.unmap_sg(dev, sglist, nelems, direction, attrs); - + ppc_iommu_unmap_sg(tbl, sglist, nelems, direction, attrs); vio_cmo_dealloc(viodev, alloc_size); } -static int vio_dma_iommu_dma_supported(struct device *dev, u64 mask) -{ - return dma_iommu_ops.dma_supported(dev, mask); -} - -static u64 vio_dma_get_required_mask(struct device *dev) -{ - return dma_iommu_ops.get_required_mask(dev); -} - static const struct dma_map_ops vio_dma_mapping_ops = { .alloc = vio_dma_iommu_alloc_coherent, .free = vio_dma_iommu_free_coherent, - .mmap = dma_nommu_mmap_coherent, .map_sg = vio_dma_iommu_map_sg, .unmap_sg = vio_dma_iommu_unmap_sg, .map_page = vio_dma_iommu_map_page, .unmap_page = vio_dma_iommu_unmap_page, - .dma_supported = vio_dma_iommu_dma_supported, - .get_required_mask = vio_dma_get_required_mask, + .dma_supported = dma_iommu_dma_supported, + .get_required_mask = dma_iommu_get_required_mask, }; /** @@ -1715,3 +1699,10 @@ int vio_disable_interrupts(struct vio_dev *dev) } EXPORT_SYMBOL(vio_disable_interrupts); #endif /* CONFIG_PPC_PSERIES */ + +static int __init vio_init(void) +{ + dma_debug_add_bus(&vio_bus_type); + return 0; +} +fs_initcall(vio_init); diff --git a/arch/powerpc/sysdev/6xx-suspend.S b/arch/powerpc/sysdev/6xx-suspend.S index cf48e9cb2575..6c4aec25c4ba 100644 --- a/arch/powerpc/sysdev/6xx-suspend.S +++ b/arch/powerpc/sysdev/6xx-suspend.S @@ -29,10 +29,9 @@ _GLOBAL(mpc6xx_enter_standby) ori r5, r5, ret_from_standby@l mtlr r5 - CURRENT_THREAD_INFO(r5, r1) - lwz r6, TI_LOCAL_FLAGS(r5) + lwz r6, TI_LOCAL_FLAGS(r2) ori r6, r6, _TLF_SLEEPING - stw r6, TI_LOCAL_FLAGS(r5) + stw r6, TI_LOCAL_FLAGS(r2) mfmsr r5 ori r5, r5, MSR_EE diff --git a/arch/powerpc/sysdev/dart_iommu.c b/arch/powerpc/sysdev/dart_iommu.c index 25bc25fe0d93..fc5c5c23303e 100644 --- a/arch/powerpc/sysdev/dart_iommu.c +++ b/arch/powerpc/sysdev/dart_iommu.c @@ -363,13 +363,6 @@ static void iommu_table_dart_setup(void) set_bit(iommu_table_dart.it_size - 1, iommu_table_dart.it_map); } -static void pci_dma_dev_setup_dart(struct pci_dev *dev) -{ - if (dart_is_u4) - set_dma_offset(&dev->dev, DART_U4_BYPASS_BASE); - set_iommu_table_base(&dev->dev, &iommu_table_dart); -} - static void pci_dma_bus_setup_dart(struct pci_bus *bus) { if (!iommu_table_dart_inited) { @@ -393,27 +386,18 @@ static bool dart_device_on_pcie(struct device *dev) return false; } -static int dart_dma_set_mask(struct device *dev, u64 dma_mask) +static void pci_dma_dev_setup_dart(struct pci_dev *dev) { - if (!dev->dma_mask || !dma_supported(dev, dma_mask)) - return -EIO; - - /* U4 supports a DART bypass, we use it for 64-bit capable - * devices to improve performances. However, that only works - * for devices connected to U4 own PCIe interface, not bridged - * through hypertransport. We need the device to support at - * least 40 bits of addresses. - */ - if (dart_device_on_pcie(dev) && dma_mask >= DMA_BIT_MASK(40)) { - dev_info(dev, "Using 64-bit DMA iommu bypass\n"); - set_dma_ops(dev, &dma_nommu_ops); - } else { - dev_info(dev, "Using 32-bit DMA via iommu\n"); - set_dma_ops(dev, &dma_iommu_ops); - } + if (dart_is_u4 && dart_device_on_pcie(&dev->dev)) + dev->dev.archdata.dma_offset = DART_U4_BYPASS_BASE; + set_iommu_table_base(&dev->dev, &iommu_table_dart); +} - *dev->dma_mask = dma_mask; - return 0; +static bool iommu_bypass_supported_dart(struct pci_dev *dev, u64 mask) +{ + return dart_is_u4 && + dart_device_on_pcie(&dev->dev) && + mask >= DMA_BIT_MASK(40); } void __init iommu_init_early_dart(struct pci_controller_ops *controller_ops) @@ -431,26 +415,20 @@ void __init iommu_init_early_dart(struct pci_controller_ops *controller_ops) /* Initialize the DART HW */ if (dart_init(dn) != 0) - goto bail; - - /* Setup bypass if supported */ - if (dart_is_u4) - ppc_md.dma_set_mask = dart_dma_set_mask; + return; + /* + * U4 supports a DART bypass, we use it for 64-bit capable devices to + * improve performance. However, that only works for devices connected + * to the U4 own PCIe interface, not bridged through hypertransport. + * We need the device to support at least 40 bits of addresses. + */ controller_ops->dma_dev_setup = pci_dma_dev_setup_dart; controller_ops->dma_bus_setup = pci_dma_bus_setup_dart; + controller_ops->iommu_bypass_supported = iommu_bypass_supported_dart; /* Setup pci_dma ops */ set_pci_dma_ops(&dma_iommu_ops); - return; - - bail: - /* If init failed, use direct iommu and null setup functions */ - controller_ops->dma_dev_setup = NULL; - controller_ops->dma_bus_setup = NULL; - - /* Setup pci_dma ops */ - set_pci_dma_ops(&dma_nommu_ops); } #ifdef CONFIG_PM diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c index 918be816b097..f49aec251a5a 100644 --- a/arch/powerpc/sysdev/fsl_pci.c +++ b/arch/powerpc/sysdev/fsl_pci.c @@ -40,6 +40,7 @@ #include <asm/mpc85xx.h> #include <asm/disassemble.h> #include <asm/ppc-opcode.h> +#include <asm/swiotlb.h> #include <sysdev/fsl_soc.h> #include <sysdev/fsl_pci.h> @@ -114,33 +115,33 @@ static struct pci_ops fsl_indirect_pcie_ops = static u64 pci64_dma_offset; #ifdef CONFIG_SWIOTLB +static void pci_dma_dev_setup_swiotlb(struct pci_dev *pdev) +{ + struct pci_controller *hose = pci_bus_to_host(pdev->bus); + + pdev->dev.bus_dma_mask = + hose->dma_window_base_cur + hose->dma_window_size; +} + static void setup_swiotlb_ops(struct pci_controller *hose) { - if (ppc_swiotlb_enable) { + if (ppc_swiotlb_enable) hose->controller_ops.dma_dev_setup = pci_dma_dev_setup_swiotlb; - set_pci_dma_ops(&powerpc_swiotlb_dma_ops); - } } #else static inline void setup_swiotlb_ops(struct pci_controller *hose) {} #endif -static int fsl_pci_dma_set_mask(struct device *dev, u64 dma_mask) +static void fsl_pci_dma_set_mask(struct device *dev, u64 dma_mask) { - if (!dev->dma_mask || !dma_supported(dev, dma_mask)) - return -EIO; - /* * Fix up PCI devices that are able to DMA to the large inbound * mapping that allows addressing any RAM address from across PCI. */ if (dev_is_pci(dev) && dma_mask >= pci64_dma_offset * 2 - 1) { - set_dma_ops(dev, &dma_nommu_ops); - set_dma_offset(dev, pci64_dma_offset); + dev->bus_dma_mask = 0; + dev->archdata.dma_offset = pci64_dma_offset; } - - *dev->dma_mask = dma_mask; - return 0; } static int setup_one_atmu(struct ccsr_pci __iomem *pci, diff --git a/arch/powerpc/sysdev/ipic.c b/arch/powerpc/sysdev/ipic.c index 8030a0f55e96..fd129c8ecceb 100644 --- a/arch/powerpc/sysdev/ipic.c +++ b/arch/powerpc/sysdev/ipic.c @@ -771,21 +771,6 @@ struct ipic * __init ipic_init(struct device_node *node, unsigned int flags) return ipic; } -void ipic_set_highest_priority(unsigned int virq) -{ - struct ipic *ipic = ipic_from_irq(virq); - unsigned int src = virq_to_hw(virq); - u32 temp; - - temp = ipic_read(ipic->regs, IPIC_SICFR); - - /* clear and set HPI */ - temp &= 0x7f000000; - temp |= (src & 0x7f) << 24; - - ipic_write(ipic->regs, IPIC_SICFR, temp); -} - void ipic_set_default_priority(void) { ipic_write(primary_ipic->regs, IPIC_SIPRR_A, IPIC_PRIORITY_DEFAULT); @@ -796,26 +781,6 @@ void ipic_set_default_priority(void) ipic_write(primary_ipic->regs, IPIC_SMPRR_B, IPIC_PRIORITY_DEFAULT); } -void ipic_enable_mcp(enum ipic_mcp_irq mcp_irq) -{ - struct ipic *ipic = primary_ipic; - u32 temp; - - temp = ipic_read(ipic->regs, IPIC_SERMR); - temp |= (1 << (31 - mcp_irq)); - ipic_write(ipic->regs, IPIC_SERMR, temp); -} - -void ipic_disable_mcp(enum ipic_mcp_irq mcp_irq) -{ - struct ipic *ipic = primary_ipic; - u32 temp; - - temp = ipic_read(ipic->regs, IPIC_SERMR); - temp &= (1 << (31 - mcp_irq)); - ipic_write(ipic->regs, IPIC_SERMR, temp); -} - u32 ipic_get_mcp_status(void) { return primary_ipic ? ipic_read(primary_ipic->regs, IPIC_SERSR) : 0; diff --git a/arch/powerpc/sysdev/tsi108_dev.c b/arch/powerpc/sysdev/tsi108_dev.c index 1fd0717ade02..1f1af12f23e2 100644 --- a/arch/powerpc/sysdev/tsi108_dev.c +++ b/arch/powerpc/sysdev/tsi108_dev.c @@ -51,7 +51,7 @@ phys_addr_t get_csrbase(void) const void *prop = of_get_property(tsi, "reg", &size); tsi108_csr_base = of_translate_address(tsi, prop); of_node_put(tsi); - }; + } return tsi108_csr_base; } diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index 94a69a62f5db..70a8f9e31a2d 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -442,7 +442,7 @@ static void xive_dec_target_count(int cpu) struct xive_cpu *xc = per_cpu(xive_cpu, cpu); struct xive_q *q = &xc->queue[xive_irq_priority]; - if (unlikely(WARN_ON(cpu < 0 || !xc))) { + if (WARN_ON(cpu < 0 || !xc)) { pr_err("%s: cpu=%d xc=%p\n", __func__, cpu, xc); return; } diff --git a/arch/powerpc/xmon/Makefile b/arch/powerpc/xmon/Makefile index 878f9c1d3615..3050f9323254 100644 --- a/arch/powerpc/xmon/Makefile +++ b/arch/powerpc/xmon/Makefile @@ -5,6 +5,7 @@ subdir-ccflags-y := $(call cc-disable-warning, builtin-requires-header) GCOV_PROFILE := n +KCOV_INSTRUMENT := n UBSAN_SANITIZE := n # Disable ftrace for the entire directory diff --git a/arch/powerpc/xmon/ppc-dis.c b/arch/powerpc/xmon/ppc-dis.c index 9deea5ee13f6..27f1e6415036 100644 --- a/arch/powerpc/xmon/ppc-dis.c +++ b/arch/powerpc/xmon/ppc-dis.c @@ -158,7 +158,7 @@ int print_insn_powerpc (unsigned long insn, unsigned long memaddr) dialect |= (PPC_OPCODE_POWER5 | PPC_OPCODE_POWER6 | PPC_OPCODE_POWER7 | PPC_OPCODE_POWER8 | PPC_OPCODE_POWER9 | PPC_OPCODE_HTM | PPC_OPCODE_ALTIVEC | PPC_OPCODE_ALTIVEC2 - | PPC_OPCODE_VSX | PPC_OPCODE_VSX3), + | PPC_OPCODE_VSX | PPC_OPCODE_VSX3); /* Get the major opcode of the insn. */ opcode = NULL; diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 757b8499aba2..a0f44f992360 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -2997,7 +2997,7 @@ static void show_task(struct task_struct *tsk) printf("%px %016lx %6d %6d %c %2d %s\n", tsk, tsk->thread.ksp, tsk->pid, rcu_dereference(tsk->parent)->pid, - state, task_thread_info(tsk)->cpu, + state, task_cpu(tsk), tsk->comm); } diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index bd149905a5b5..b41311f6a94f 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -90,14 +90,14 @@ config GENERIC_CSUM config GENERIC_HWEIGHT def_bool y +config FIX_EARLYCON_MEM + def_bool y + config PGTABLE_LEVELS int default 3 if 64BIT default 2 -config HAVE_KPROBES - def_bool n - menu "Platform type" choice diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h new file mode 100644 index 000000000000..57afe604b495 --- /dev/null +++ b/arch/riscv/include/asm/fixmap.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2019 Western Digital Corporation or its affiliates. + */ + +#ifndef _ASM_RISCV_FIXMAP_H +#define _ASM_RISCV_FIXMAP_H + +#include <linux/kernel.h> +#include <linux/sizes.h> +#include <asm/page.h> +#include <asm/pgtable.h> + +/* + * Here we define all the compile-time 'special' virtual addresses. + * The point is to have a constant address at compile time, but to + * set the physical address only in the boot process. + * + * These 'compile-time allocated' memory buffers are page-sized. Use + * set_fixmap(idx,phys) to associate physical memory with fixmap indices. + */ +enum fixed_addresses { + FIX_HOLE, + FIX_EARLYCON_MEM_BASE, + __end_of_fixed_addresses +}; + +#define FIXADDR_SIZE (__end_of_fixed_addresses * PAGE_SIZE) +#define FIXADDR_TOP (PAGE_OFFSET) +#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) + +#define FIXMAP_PAGE_IO PAGE_KERNEL + +#define __early_set_fixmap __set_fixmap + +#define __late_set_fixmap __set_fixmap +#define __late_clear_fixmap(idx) __set_fixmap((idx), 0, FIXMAP_PAGE_CLEAR) + +extern void __set_fixmap(enum fixed_addresses idx, + phys_addr_t phys, pgprot_t prot); + +#include <asm-generic/fixmap.h> + +#endif /* _ASM_RISCV_FIXMAP_H */ diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index a8179a8c1491..1141364d990e 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -404,6 +404,7 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma, #define kern_addr_valid(addr) (1) /* FIXME */ #endif +extern void setup_bootmem(void); extern void paging_init(void); static inline void pgtable_cache_init(void) diff --git a/arch/riscv/include/asm/smp.h b/arch/riscv/include/asm/smp.h index 41aa73b476f4..636a934f013a 100644 --- a/arch/riscv/include/asm/smp.h +++ b/arch/riscv/include/asm/smp.h @@ -19,16 +19,17 @@ #include <linux/thread_info.h> #define INVALID_HARTID ULONG_MAX + +struct seq_file; +extern unsigned long boot_cpu_hartid; + +#ifdef CONFIG_SMP /* * Mapping between linux logical cpu index and hartid. */ extern unsigned long __cpuid_to_hartid_map[NR_CPUS]; #define cpuid_to_hartid_map(cpu) __cpuid_to_hartid_map[cpu] -struct seq_file; - -#ifdef CONFIG_SMP - /* print IPI stats */ void show_ipi_stats(struct seq_file *p, int prec); @@ -58,7 +59,14 @@ static inline void show_ipi_stats(struct seq_file *p, int prec) static inline int riscv_hartid_to_cpuid(int hartid) { - return 0; + if (hartid == boot_cpu_hartid) + return 0; + + return -1; +} +static inline unsigned long cpuid_to_hartid_map(int cpu) +{ + return boot_cpu_hartid; } static inline void riscv_cpuid_to_hartid_mask(const struct cpumask *in, diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c index f8fa2c63aa89..cf2fca12414a 100644 --- a/arch/riscv/kernel/cpu.c +++ b/arch/riscv/kernel/cpu.c @@ -17,44 +17,36 @@ #include <asm/smp.h> /* - * Returns the hart ID of the given device tree node, or -1 if the device tree - * node isn't a RISC-V hart. + * Returns the hart ID of the given device tree node, or -ENODEV if the node + * isn't an enabled and valid RISC-V hart node. */ int riscv_of_processor_hartid(struct device_node *node) { - const char *isa, *status; + const char *isa; u32 hart; if (!of_device_is_compatible(node, "riscv")) { pr_warn("Found incompatible CPU\n"); - return -(ENODEV); + return -ENODEV; } if (of_property_read_u32(node, "reg", &hart)) { pr_warn("Found CPU without hart ID\n"); - return -(ENODEV); - } - if (hart >= NR_CPUS) { - pr_info("Found hart ID %d, which is above NR_CPUs. Disabling this hart\n", hart); - return -(ENODEV); + return -ENODEV; } - if (of_property_read_string(node, "status", &status)) { - pr_warn("CPU with hartid=%d has no \"status\" property\n", hart); - return -(ENODEV); - } - if (strcmp(status, "okay")) { - pr_info("CPU with hartid=%d has a non-okay status of \"%s\"\n", hart, status); - return -(ENODEV); + if (!of_device_is_available(node)) { + pr_info("CPU with hartid=%d is not available\n", hart); + return -ENODEV; } if (of_property_read_string(node, "riscv,isa", &isa)) { pr_warn("CPU with hartid=%d has no \"riscv,isa\" property\n", hart); - return -(ENODEV); + return -ENODEV; } if (isa[0] != 'r' || isa[1] != 'v') { pr_warn("CPU with hartid=%d has an invalid ISA of \"%s\"\n", hart, isa); - return -(ENODEV); + return -ENODEV; } return hart; @@ -106,7 +98,7 @@ static void print_isa(struct seq_file *f, const char *orig_isa) * a bit of info describing what went wrong. */ if (isa[0] != '\0') - pr_info("unsupported ISA \"%s\" in device tree", orig_isa); + pr_info("unsupported ISA \"%s\" in device tree\n", orig_isa); } static void print_mmu(struct seq_file *f, const char *mmu_type) diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index a6e369edbbd7..bc29b010b722 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -20,6 +20,7 @@ #include <linux/of.h> #include <asm/processor.h> #include <asm/hwcap.h> +#include <asm/smp.h> unsigned long elf_hwcap __read_mostly; #ifdef CONFIG_FPU @@ -28,7 +29,7 @@ bool has_fpu __read_mostly; void riscv_fill_hwcap(void) { - struct device_node *node = NULL; + struct device_node *node; const char *isa; size_t i; static unsigned long isa2hwcap[256] = {0}; @@ -42,36 +43,39 @@ void riscv_fill_hwcap(void) elf_hwcap = 0; - /* - * We don't support running Linux on hertergenous ISA systems. For - * now, we just check the ISA of the first "okay" processor. - */ - while ((node = of_find_node_by_type(node, "cpu"))) - if (riscv_of_processor_hartid(node) >= 0) - break; - if (!node) { - pr_warning("Unable to find \"cpu\" devicetree entry"); - return; - } + for_each_of_cpu_node(node) { + unsigned long this_hwcap = 0; - if (of_property_read_string(node, "riscv,isa", &isa)) { - pr_warning("Unable to find \"riscv,isa\" devicetree entry"); - of_node_put(node); - return; - } - of_node_put(node); + if (riscv_of_processor_hartid(node) < 0) + continue; + + if (of_property_read_string(node, "riscv,isa", &isa)) { + pr_warn("Unable to find \"riscv,isa\" devicetree entry\n"); + continue; + } - for (i = 0; i < strlen(isa); ++i) - elf_hwcap |= isa2hwcap[(unsigned char)(isa[i])]; + for (i = 0; i < strlen(isa); ++i) + this_hwcap |= isa2hwcap[(unsigned char)(isa[i])]; + + /* + * All "okay" hart should have same isa. Set HWCAP based on + * common capabilities of every "okay" hart, in case they don't + * have. + */ + if (elf_hwcap) + elf_hwcap &= this_hwcap; + else + elf_hwcap = this_hwcap; + } /* We don't support systems with F but without D, so mask those out * here. */ if ((elf_hwcap & COMPAT_HWCAP_ISA_F) && !(elf_hwcap & COMPAT_HWCAP_ISA_D)) { - pr_info("This kernel does not support systems with F but not D"); + pr_info("This kernel does not support systems with F but not D\n"); elf_hwcap &= ~COMPAT_HWCAP_ISA_F; } - pr_info("elf_hwcap is 0x%lx", elf_hwcap); + pr_info("elf_hwcap is 0x%lx\n", elf_hwcap); #ifdef CONFIG_FPU if (elf_hwcap & (COMPAT_HWCAP_ISA_F | COMPAT_HWCAP_ISA_D)) diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c index a840b7d074f7..b94d8db5ddcc 100644 --- a/arch/riscv/kernel/ftrace.c +++ b/arch/riscv/kernel/ftrace.c @@ -32,7 +32,7 @@ static int ftrace_check_current_call(unsigned long hook_pos, * return must be -EINVAL on failed comparison */ if (memcmp(expected, replaced, sizeof(replaced))) { - pr_err("%p: expected (%08x %08x) but get (%08x %08x)", + pr_err("%p: expected (%08x %08x) but got (%08x %08x)\n", (void *)hook_pos, expected[0], expected[1], replaced[0], replaced[1]); return -EINVAL; diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index 77564310235f..ecb654f6a79e 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -23,7 +23,6 @@ #include <linux/mm.h> #include <linux/memblock.h> #include <linux/sched.h> -#include <linux/initrd.h> #include <linux/console.h> #include <linux/screen_info.h> #include <linux/of_fdt.h> @@ -61,95 +60,9 @@ EXPORT_SYMBOL(empty_zero_page); atomic_t hart_lottery; unsigned long boot_cpu_hartid; -unsigned long __cpuid_to_hartid_map[NR_CPUS] = { - [0 ... NR_CPUS-1] = INVALID_HARTID -}; - -void __init smp_setup_processor_id(void) -{ - cpuid_to_hartid_map(0) = boot_cpu_hartid; -} - -#ifdef CONFIG_BLK_DEV_INITRD -static void __init setup_initrd(void) -{ - unsigned long size; - - if (initrd_start >= initrd_end) { - printk(KERN_INFO "initrd not found or empty"); - goto disable; - } - if (__pa(initrd_end) > PFN_PHYS(max_low_pfn)) { - printk(KERN_ERR "initrd extends beyond end of memory"); - goto disable; - } - - size = initrd_end - initrd_start; - memblock_reserve(__pa(initrd_start), size); - initrd_below_start_ok = 1; - - printk(KERN_INFO "Initial ramdisk at: 0x%p (%lu bytes)\n", - (void *)(initrd_start), size); - return; -disable: - pr_cont(" - disabling initrd\n"); - initrd_start = 0; - initrd_end = 0; -} -#endif /* CONFIG_BLK_DEV_INITRD */ - -pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned_bss; -pgd_t trampoline_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE); - -#ifndef __PAGETABLE_PMD_FOLDED -#define NUM_SWAPPER_PMDS ((uintptr_t)-PAGE_OFFSET >> PGDIR_SHIFT) -pmd_t swapper_pmd[PTRS_PER_PMD*((-PAGE_OFFSET)/PGDIR_SIZE)] __page_aligned_bss; -pmd_t trampoline_pmd[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE); -#endif - -asmlinkage void __init setup_vm(void) -{ - extern char _start; - uintptr_t i; - uintptr_t pa = (uintptr_t) &_start; - pgprot_t prot = __pgprot(pgprot_val(PAGE_KERNEL) | _PAGE_EXEC); - - va_pa_offset = PAGE_OFFSET - pa; - pfn_base = PFN_DOWN(pa); - - /* Sanity check alignment and size */ - BUG_ON((PAGE_OFFSET % PGDIR_SIZE) != 0); - BUG_ON((pa % (PAGE_SIZE * PTRS_PER_PTE)) != 0); - -#ifndef __PAGETABLE_PMD_FOLDED - trampoline_pg_dir[(PAGE_OFFSET >> PGDIR_SHIFT) % PTRS_PER_PGD] = - pfn_pgd(PFN_DOWN((uintptr_t)trampoline_pmd), - __pgprot(_PAGE_TABLE)); - trampoline_pmd[0] = pfn_pmd(PFN_DOWN(pa), prot); - - for (i = 0; i < (-PAGE_OFFSET)/PGDIR_SIZE; ++i) { - size_t o = (PAGE_OFFSET >> PGDIR_SHIFT) % PTRS_PER_PGD + i; - swapper_pg_dir[o] = - pfn_pgd(PFN_DOWN((uintptr_t)swapper_pmd) + i, - __pgprot(_PAGE_TABLE)); - } - for (i = 0; i < ARRAY_SIZE(swapper_pmd); i++) - swapper_pmd[i] = pfn_pmd(PFN_DOWN(pa + i * PMD_SIZE), prot); -#else - trampoline_pg_dir[(PAGE_OFFSET >> PGDIR_SHIFT) % PTRS_PER_PGD] = - pfn_pgd(PFN_DOWN(pa), prot); - - for (i = 0; i < (-PAGE_OFFSET)/PGDIR_SIZE; ++i) { - size_t o = (PAGE_OFFSET >> PGDIR_SHIFT) % PTRS_PER_PGD + i; - swapper_pg_dir[o] = - pfn_pgd(PFN_DOWN(pa + i * PGDIR_SIZE), prot); - } -#endif -} - void __init parse_dtb(unsigned int hartid, void *dtb) { - if (!early_init_dt_scan(__va(dtb))) + if (early_init_dt_scan(__va(dtb))) return; pr_err("No DTB passed to the kernel\n"); @@ -159,60 +72,17 @@ void __init parse_dtb(unsigned int hartid, void *dtb) #endif } -static void __init setup_bootmem(void) -{ - struct memblock_region *reg; - phys_addr_t mem_size = 0; - - /* Find the memory region containing the kernel */ - for_each_memblock(memory, reg) { - phys_addr_t vmlinux_end = __pa(_end); - phys_addr_t end = reg->base + reg->size; - - if (reg->base <= vmlinux_end && vmlinux_end <= end) { - /* - * Reserve from the start of the region to the end of - * the kernel - */ - memblock_reserve(reg->base, vmlinux_end - reg->base); - mem_size = min(reg->size, (phys_addr_t)-PAGE_OFFSET); - } - } - BUG_ON(mem_size == 0); - - set_max_mapnr(PFN_DOWN(mem_size)); - max_low_pfn = PFN_DOWN(memblock_end_of_DRAM()); - -#ifdef CONFIG_BLK_DEV_INITRD - setup_initrd(); -#endif /* CONFIG_BLK_DEV_INITRD */ - - early_init_fdt_reserve_self(); - early_init_fdt_scan_reserved_mem(); - memblock_allow_resize(); - memblock_dump_all(); - - for_each_memblock(memory, reg) { - unsigned long start_pfn = memblock_region_memory_base_pfn(reg); - unsigned long end_pfn = memblock_region_memory_end_pfn(reg); - - memblock_set_node(PFN_PHYS(start_pfn), - PFN_PHYS(end_pfn - start_pfn), - &memblock.memory, 0); - } -} - void __init setup_arch(char **cmdline_p) { - *cmdline_p = boot_command_line; - - parse_early_param(); - init_mm.start_code = (unsigned long) _stext; init_mm.end_code = (unsigned long) _etext; init_mm.end_data = (unsigned long) _edata; init_mm.brk = (unsigned long) _end; + *cmdline_p = boot_command_line; + + parse_early_param(); + setup_bootmem(); paging_init(); unflatten_device_tree(); @@ -231,4 +101,3 @@ void __init setup_arch(char **cmdline_p) riscv_fill_hwcap(); } - diff --git a/arch/riscv/kernel/smp.c b/arch/riscv/kernel/smp.c index 246635eac7bb..0c41d07ec281 100644 --- a/arch/riscv/kernel/smp.c +++ b/arch/riscv/kernel/smp.c @@ -36,6 +36,15 @@ enum ipi_message_type { IPI_MAX }; +unsigned long __cpuid_to_hartid_map[NR_CPUS] = { + [0 ... NR_CPUS-1] = INVALID_HARTID +}; + +void __init smp_setup_processor_id(void) +{ + cpuid_to_hartid_map(0) = boot_cpu_hartid; +} + /* A collection of single bit ipi messages. */ static struct { unsigned long stats[IPI_MAX] ____cacheline_aligned; @@ -51,7 +60,6 @@ int riscv_hartid_to_cpuid(int hartid) return i; pr_err("Couldn't find cpu id for hartid [%d]\n", hartid); - BUG(); return i; } diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c index 18cda0e8cf94..eb533b5c2c8c 100644 --- a/arch/riscv/kernel/smpboot.c +++ b/arch/riscv/kernel/smpboot.c @@ -39,6 +39,7 @@ void *__cpu_up_stack_pointer[NR_CPUS]; void *__cpu_up_task_pointer[NR_CPUS]; +static DECLARE_COMPLETION(cpu_running); void __init smp_prepare_boot_cpu(void) { @@ -50,12 +51,12 @@ void __init smp_prepare_cpus(unsigned int max_cpus) void __init setup_smp(void) { - struct device_node *dn = NULL; + struct device_node *dn; int hart; bool found_boot_cpu = false; int cpuid = 1; - while ((dn = of_find_node_by_type(dn, "cpu"))) { + for_each_of_cpu_node(dn) { hart = riscv_of_processor_hartid(dn); if (hart < 0) continue; @@ -65,6 +66,11 @@ void __init setup_smp(void) found_boot_cpu = 1; continue; } + if (cpuid >= NR_CPUS) { + pr_warn("Invalid cpuid [%d] for hartid [%d]\n", + cpuid, hart); + break; + } cpuid_to_hartid_map(cpuid) = hart; set_cpu_possible(cpuid, true); @@ -77,6 +83,7 @@ void __init setup_smp(void) int __cpu_up(unsigned int cpu, struct task_struct *tidle) { + int ret = 0; int hartid = cpuid_to_hartid_map(cpu); tidle->thread_info.cpu = cpu; @@ -92,10 +99,16 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle) task_stack_page(tidle) + THREAD_SIZE); WRITE_ONCE(__cpu_up_task_pointer[hartid], tidle); - while (!cpu_online(cpu)) - cpu_relax(); + lockdep_assert_held(&cpu_running); + wait_for_completion_timeout(&cpu_running, + msecs_to_jiffies(1000)); + + if (!cpu_online(cpu)) { + pr_crit("CPU%u: failed to come online\n", cpu); + ret = -EIO; + } - return 0; + return ret; } void __init smp_cpus_done(unsigned int max_cpus) @@ -121,6 +134,7 @@ asmlinkage void __init smp_callin(void) * a local TLB flush right now just in case. */ local_flush_tlb_all(); + complete(&cpu_running); /* * Disable preemption before enabling interrupts, so we don't try to * schedule a CPU that hasn't actually started yet. diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 658ebf645f42..b379a75ac6a6 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -17,7 +17,9 @@ #include <linux/initrd.h> #include <linux/swap.h> #include <linux/sizes.h> +#include <linux/of_fdt.h> +#include <asm/fixmap.h> #include <asm/tlbflush.h> #include <asm/sections.h> #include <asm/pgtable.h> @@ -66,7 +68,159 @@ void free_initmem(void) } #ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) +static void __init setup_initrd(void) { + unsigned long size; + + if (initrd_start >= initrd_end) { + pr_info("initrd not found or empty"); + goto disable; + } + if (__pa(initrd_end) > PFN_PHYS(max_low_pfn)) { + pr_err("initrd extends beyond end of memory"); + goto disable; + } + + size = initrd_end - initrd_start; + memblock_reserve(__pa(initrd_start), size); + initrd_below_start_ok = 1; + + pr_info("Initial ramdisk at: 0x%p (%lu bytes)\n", + (void *)(initrd_start), size); + return; +disable: + pr_cont(" - disabling initrd\n"); + initrd_start = 0; + initrd_end = 0; +} + +void __init free_initrd_mem(unsigned long start, unsigned long end) +{ + free_reserved_area((void *)start, (void *)end, -1, "initrd"); } #endif /* CONFIG_BLK_DEV_INITRD */ + +void __init setup_bootmem(void) +{ + struct memblock_region *reg; + phys_addr_t mem_size = 0; + + /* Find the memory region containing the kernel */ + for_each_memblock(memory, reg) { + phys_addr_t vmlinux_end = __pa(_end); + phys_addr_t end = reg->base + reg->size; + + if (reg->base <= vmlinux_end && vmlinux_end <= end) { + /* + * Reserve from the start of the region to the end of + * the kernel + */ + memblock_reserve(reg->base, vmlinux_end - reg->base); + mem_size = min(reg->size, (phys_addr_t)-PAGE_OFFSET); + } + } + BUG_ON(mem_size == 0); + + set_max_mapnr(PFN_DOWN(mem_size)); + max_low_pfn = PFN_DOWN(memblock_end_of_DRAM()); + +#ifdef CONFIG_BLK_DEV_INITRD + setup_initrd(); +#endif /* CONFIG_BLK_DEV_INITRD */ + + early_init_fdt_reserve_self(); + early_init_fdt_scan_reserved_mem(); + memblock_allow_resize(); + memblock_dump_all(); + + for_each_memblock(memory, reg) { + unsigned long start_pfn = memblock_region_memory_base_pfn(reg); + unsigned long end_pfn = memblock_region_memory_end_pfn(reg); + + memblock_set_node(PFN_PHYS(start_pfn), + PFN_PHYS(end_pfn - start_pfn), + &memblock.memory, 0); + } +} + +pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned_bss; +pgd_t trampoline_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE); + +#ifndef __PAGETABLE_PMD_FOLDED +#define NUM_SWAPPER_PMDS ((uintptr_t)-PAGE_OFFSET >> PGDIR_SHIFT) +pmd_t swapper_pmd[PTRS_PER_PMD*((-PAGE_OFFSET)/PGDIR_SIZE)] __page_aligned_bss; +pmd_t trampoline_pmd[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE); +pmd_t fixmap_pmd[PTRS_PER_PMD] __page_aligned_bss; +#endif + +pte_t fixmap_pte[PTRS_PER_PTE] __page_aligned_bss; + +void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t prot) +{ + unsigned long addr = __fix_to_virt(idx); + pte_t *ptep; + + BUG_ON(idx <= FIX_HOLE || idx >= __end_of_fixed_addresses); + + ptep = &fixmap_pte[pte_index(addr)]; + + if (pgprot_val(prot)) { + set_pte(ptep, pfn_pte(phys >> PAGE_SHIFT, prot)); + } else { + pte_clear(&init_mm, addr, ptep); + local_flush_tlb_page(addr); + } +} + +asmlinkage void __init setup_vm(void) +{ + extern char _start; + uintptr_t i; + uintptr_t pa = (uintptr_t) &_start; + pgprot_t prot = __pgprot(pgprot_val(PAGE_KERNEL) | _PAGE_EXEC); + + va_pa_offset = PAGE_OFFSET - pa; + pfn_base = PFN_DOWN(pa); + + /* Sanity check alignment and size */ + BUG_ON((PAGE_OFFSET % PGDIR_SIZE) != 0); + BUG_ON((pa % (PAGE_SIZE * PTRS_PER_PTE)) != 0); + +#ifndef __PAGETABLE_PMD_FOLDED + trampoline_pg_dir[(PAGE_OFFSET >> PGDIR_SHIFT) % PTRS_PER_PGD] = + pfn_pgd(PFN_DOWN((uintptr_t)trampoline_pmd), + __pgprot(_PAGE_TABLE)); + trampoline_pmd[0] = pfn_pmd(PFN_DOWN(pa), prot); + + for (i = 0; i < (-PAGE_OFFSET)/PGDIR_SIZE; ++i) { + size_t o = (PAGE_OFFSET >> PGDIR_SHIFT) % PTRS_PER_PGD + i; + + swapper_pg_dir[o] = + pfn_pgd(PFN_DOWN((uintptr_t)swapper_pmd) + i, + __pgprot(_PAGE_TABLE)); + } + for (i = 0; i < ARRAY_SIZE(swapper_pmd); i++) + swapper_pmd[i] = pfn_pmd(PFN_DOWN(pa + i * PMD_SIZE), prot); + + swapper_pg_dir[(FIXADDR_START >> PGDIR_SHIFT) % PTRS_PER_PGD] = + pfn_pgd(PFN_DOWN((uintptr_t)fixmap_pmd), + __pgprot(_PAGE_TABLE)); + fixmap_pmd[(FIXADDR_START >> PMD_SHIFT) % PTRS_PER_PMD] = + pfn_pmd(PFN_DOWN((uintptr_t)fixmap_pte), + __pgprot(_PAGE_TABLE)); +#else + trampoline_pg_dir[(PAGE_OFFSET >> PGDIR_SHIFT) % PTRS_PER_PGD] = + pfn_pgd(PFN_DOWN(pa), prot); + + for (i = 0; i < (-PAGE_OFFSET)/PGDIR_SIZE; ++i) { + size_t o = (PAGE_OFFSET >> PGDIR_SHIFT) % PTRS_PER_PGD + i; + + swapper_pg_dir[o] = + pfn_pgd(PFN_DOWN(pa + i * PGDIR_SIZE), prot); + } + + swapper_pg_dir[(FIXADDR_START >> PGDIR_SHIFT) % PTRS_PER_PGD] = + pfn_pgd(PFN_DOWN((uintptr_t)fixmap_pte), + __pgprot(_PAGE_TABLE)); +#endif +} diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 9c5a67d1b9c1..2d8b9d8ca4f8 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -187,7 +187,6 @@ cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc, cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTIONS=1) # does binutils support specific instructions? -asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1) asinstr += $(call as-instr,pshufb %xmm0$(comma)%xmm0,-DCONFIG_AS_SSSE3=1) avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1) avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1) @@ -217,6 +216,11 @@ KBUILD_CFLAGS += -fno-asynchronous-unwind-tables # Avoid indirect branches in kernel to deal with Spectre ifdef CONFIG_RETPOLINE KBUILD_CFLAGS += $(RETPOLINE_CFLAGS) + # Additionally, avoid generating expensive indirect jumps which + # are subject to retpolines for small number of switch cases. + # clang turns off jump table generation by default when under + # retpoline builds, however, gcc does not for x86. + KBUILD_CFLAGS += $(call cc-option,--param=case-values-threshold=20) endif archscripts: scripts_basic diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 9b5adae9cc40..e2839b5c246c 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -100,7 +100,7 @@ $(obj)/zoffset.h: $(obj)/compressed/vmlinux FORCE AFLAGS_header.o += -I$(objtree)/$(obj) $(obj)/header.o: $(obj)/zoffset.h -LDFLAGS_setup.elf := -T +LDFLAGS_setup.elf := -m elf_i386 -T $(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE $(call if_changed,ld) diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index f0515ac895a4..6b84afdd7538 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -84,6 +84,8 @@ ifdef CONFIG_X86_64 vmlinux-objs-y += $(obj)/pgtable_64.o endif +vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o + $(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone vmlinux-objs-$(CONFIG_EFI_STUB) += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o \ diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c new file mode 100644 index 000000000000..0ef4ad55b29b --- /dev/null +++ b/arch/x86/boot/compressed/acpi.c @@ -0,0 +1,338 @@ +// SPDX-License-Identifier: GPL-2.0 +#define BOOT_CTYPE_H +#include "misc.h" +#include "error.h" +#include "../string.h" + +#include <linux/numa.h> +#include <linux/efi.h> +#include <asm/efi.h> + +/* + * Longest parameter of 'acpi=' is 'copy_dsdt', plus an extra '\0' + * for termination. + */ +#define MAX_ACPI_ARG_LENGTH 10 + +/* + * Immovable memory regions representation. Max amount of memory regions is + * MAX_NUMNODES*2. + */ +struct mem_vector immovable_mem[MAX_NUMNODES*2]; + +/* + * Max length of 64-bit hex address string is 19, prefix "0x" + 16 hex + * digits, and '\0' for termination. + */ +#define MAX_ADDR_LEN 19 + +static acpi_physical_address get_acpi_rsdp(void) +{ + acpi_physical_address addr = 0; + +#ifdef CONFIG_KEXEC + char val[MAX_ADDR_LEN] = { }; + int ret; + + ret = cmdline_find_option("acpi_rsdp", val, MAX_ADDR_LEN); + if (ret < 0) + return 0; + + if (kstrtoull(val, 16, &addr)) + return 0; +#endif + return addr; +} + +/* Search EFI system tables for RSDP. */ +static acpi_physical_address efi_get_rsdp_addr(void) +{ + acpi_physical_address rsdp_addr = 0; + +#ifdef CONFIG_EFI + unsigned long systab, systab_tables, config_tables; + unsigned int nr_tables; + struct efi_info *ei; + bool efi_64; + int size, i; + char *sig; + + ei = &boot_params->efi_info; + sig = (char *)&ei->efi_loader_signature; + + if (!strncmp(sig, EFI64_LOADER_SIGNATURE, 4)) { + efi_64 = true; + } else if (!strncmp(sig, EFI32_LOADER_SIGNATURE, 4)) { + efi_64 = false; + } else { + debug_putstr("Wrong EFI loader signature.\n"); + return 0; + } + + /* Get systab from boot params. */ +#ifdef CONFIG_X86_64 + systab = ei->efi_systab | ((__u64)ei->efi_systab_hi << 32); +#else + if (ei->efi_systab_hi || ei->efi_memmap_hi) { + debug_putstr("Error getting RSDP address: EFI system table located above 4GB.\n"); + return 0; + } + systab = ei->efi_systab; +#endif + if (!systab) + error("EFI system table not found."); + + /* Handle EFI bitness properly */ + if (efi_64) { + efi_system_table_64_t *stbl = (efi_system_table_64_t *)systab; + + config_tables = stbl->tables; + nr_tables = stbl->nr_tables; + size = sizeof(efi_config_table_64_t); + } else { + efi_system_table_32_t *stbl = (efi_system_table_32_t *)systab; + + config_tables = stbl->tables; + nr_tables = stbl->nr_tables; + size = sizeof(efi_config_table_32_t); + } + + if (!config_tables) + error("EFI config tables not found."); + + /* Get EFI tables from systab. */ + for (i = 0; i < nr_tables; i++) { + acpi_physical_address table; + efi_guid_t guid; + + config_tables += size; + + if (efi_64) { + efi_config_table_64_t *tbl = (efi_config_table_64_t *)config_tables; + + guid = tbl->guid; + table = tbl->table; + + if (!IS_ENABLED(CONFIG_X86_64) && table >> 32) { + debug_putstr("Error getting RSDP address: EFI config table located above 4GB.\n"); + return 0; + } + } else { + efi_config_table_32_t *tbl = (efi_config_table_32_t *)config_tables; + + guid = tbl->guid; + table = tbl->table; + } + + if (!(efi_guidcmp(guid, ACPI_TABLE_GUID))) + rsdp_addr = table; + else if (!(efi_guidcmp(guid, ACPI_20_TABLE_GUID))) + return table; + } +#endif + return rsdp_addr; +} + +static u8 compute_checksum(u8 *buffer, u32 length) +{ + u8 *end = buffer + length; + u8 sum = 0; + + while (buffer < end) + sum += *(buffer++); + + return sum; +} + +/* Search a block of memory for the RSDP signature. */ +static u8 *scan_mem_for_rsdp(u8 *start, u32 length) +{ + struct acpi_table_rsdp *rsdp; + u8 *address, *end; + + end = start + length; + + /* Search from given start address for the requested length */ + for (address = start; address < end; address += ACPI_RSDP_SCAN_STEP) { + /* + * Both RSDP signature and checksum must be correct. + * Note: Sometimes there exists more than one RSDP in memory; + * the valid RSDP has a valid checksum, all others have an + * invalid checksum. + */ + rsdp = (struct acpi_table_rsdp *)address; + + /* BAD Signature */ + if (!ACPI_VALIDATE_RSDP_SIG(rsdp->signature)) + continue; + + /* Check the standard checksum */ + if (compute_checksum((u8 *)rsdp, ACPI_RSDP_CHECKSUM_LENGTH)) + continue; + + /* Check extended checksum if table version >= 2 */ + if ((rsdp->revision >= 2) && + (compute_checksum((u8 *)rsdp, ACPI_RSDP_XCHECKSUM_LENGTH))) + continue; + + /* Signature and checksum valid, we have found a real RSDP */ + return address; + } + return NULL; +} + +/* Search RSDP address in EBDA. */ +static acpi_physical_address bios_get_rsdp_addr(void) +{ + unsigned long address; + u8 *rsdp; + + /* Get the location of the Extended BIOS Data Area (EBDA) */ + address = *(u16 *)ACPI_EBDA_PTR_LOCATION; + address <<= 4; + + /* + * Search EBDA paragraphs (EBDA is required to be a minimum of + * 1K length) + */ + if (address > 0x400) { + rsdp = scan_mem_for_rsdp((u8 *)address, ACPI_EBDA_WINDOW_SIZE); + if (rsdp) + return (acpi_physical_address)(unsigned long)rsdp; + } + + /* Search upper memory: 16-byte boundaries in E0000h-FFFFFh */ + rsdp = scan_mem_for_rsdp((u8 *) ACPI_HI_RSDP_WINDOW_BASE, + ACPI_HI_RSDP_WINDOW_SIZE); + if (rsdp) + return (acpi_physical_address)(unsigned long)rsdp; + + return 0; +} + +/* Return RSDP address on success, otherwise 0. */ +acpi_physical_address get_rsdp_addr(void) +{ + acpi_physical_address pa; + + pa = get_acpi_rsdp(); + + if (!pa) + pa = boot_params->acpi_rsdp_addr; + + if (!pa) + pa = efi_get_rsdp_addr(); + + if (!pa) + pa = bios_get_rsdp_addr(); + + return pa; +} + +#if defined(CONFIG_RANDOMIZE_BASE) && defined(CONFIG_MEMORY_HOTREMOVE) +/* Compute SRAT address from RSDP. */ +static unsigned long get_acpi_srat_table(void) +{ + unsigned long root_table, acpi_table; + struct acpi_table_header *header; + struct acpi_table_rsdp *rsdp; + u32 num_entries, size, len; + char arg[10]; + u8 *entry; + + rsdp = (struct acpi_table_rsdp *)(long)boot_params->acpi_rsdp_addr; + if (!rsdp) + return 0; + + /* Get ACPI root table from RSDP.*/ + if (!(cmdline_find_option("acpi", arg, sizeof(arg)) == 4 && + !strncmp(arg, "rsdt", 4)) && + rsdp->xsdt_physical_address && + rsdp->revision > 1) { + root_table = rsdp->xsdt_physical_address; + size = ACPI_XSDT_ENTRY_SIZE; + } else { + root_table = rsdp->rsdt_physical_address; + size = ACPI_RSDT_ENTRY_SIZE; + } + + if (!root_table) + return 0; + + header = (struct acpi_table_header *)root_table; + len = header->length; + if (len < sizeof(struct acpi_table_header) + size) + return 0; + + num_entries = (len - sizeof(struct acpi_table_header)) / size; + entry = (u8 *)(root_table + sizeof(struct acpi_table_header)); + + while (num_entries--) { + if (size == ACPI_RSDT_ENTRY_SIZE) + acpi_table = *(u32 *)entry; + else + acpi_table = *(u64 *)entry; + + if (acpi_table) { + header = (struct acpi_table_header *)acpi_table; + + if (ACPI_COMPARE_NAME(header->signature, ACPI_SIG_SRAT)) + return acpi_table; + } + entry += size; + } + return 0; +} + +/** + * count_immovable_mem_regions - Parse SRAT and cache the immovable + * memory regions into the immovable_mem array. + * + * Return the number of immovable memory regions on success, 0 on failure: + * + * - Too many immovable memory regions + * - ACPI off or no SRAT found + * - No immovable memory region found. + */ +int count_immovable_mem_regions(void) +{ + unsigned long table_addr, table_end, table; + struct acpi_subtable_header *sub_table; + struct acpi_table_header *table_header; + char arg[MAX_ACPI_ARG_LENGTH]; + int num = 0; + + if (cmdline_find_option("acpi", arg, sizeof(arg)) == 3 && + !strncmp(arg, "off", 3)) + return 0; + + table_addr = get_acpi_srat_table(); + if (!table_addr) + return 0; + + table_header = (struct acpi_table_header *)table_addr; + table_end = table_addr + table_header->length; + table = table_addr + sizeof(struct acpi_table_srat); + + while (table + sizeof(struct acpi_subtable_header) < table_end) { + sub_table = (struct acpi_subtable_header *)table; + if (sub_table->type == ACPI_SRAT_TYPE_MEMORY_AFFINITY) { + struct acpi_srat_mem_affinity *ma; + + ma = (struct acpi_srat_mem_affinity *)sub_table; + if (!(ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && ma->length) { + immovable_mem[num].start = ma->base_address; + immovable_mem[num].size = ma->length; + num++; + } + + if (num >= MAX_NUMNODES*2) { + debug_putstr("Too many immovable memory regions, aborting.\n"); + return 0; + } + } + table += sub_table->length; + } + return num; +} +#endif /* CONFIG_RANDOMIZE_BASE && CONFIG_MEMORY_HOTREMOVE */ diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c index af6cda0b7900..f1add5d85da9 100644 --- a/arch/x86/boot/compressed/cmdline.c +++ b/arch/x86/boot/compressed/cmdline.c @@ -1,8 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include "misc.h" -#if CONFIG_EARLY_PRINTK || CONFIG_RANDOMIZE_BASE || CONFIG_X86_5LEVEL - static unsigned long fs; static inline void set_fs(unsigned long seg) { @@ -30,5 +28,3 @@ int cmdline_find_option_bool(const char *option) { return __cmdline_find_option_bool(get_cmd_line_ptr(), option); } - -#endif diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index f62e347862cc..fafb75c6c592 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -358,8 +358,11 @@ ENTRY(startup_64) * paging_prepare() sets up the trampoline and checks if we need to * enable 5-level paging. * - * Address of the trampoline is returned in RAX. - * Non zero RDX on return means we need to enable 5-level paging. + * paging_prepare() returns a two-quadword structure which lands + * into RDX:RAX: + * - Address of the trampoline is returned in RAX. + * - Non zero RDX means trampoline needs to enable 5-level + * paging. * * RSI holds real mode data and needs to be preserved across * this function call. @@ -565,7 +568,7 @@ adjust_got: * * RDI contains the return address (might be above 4G). * ECX contains the base address of the trampoline memory. - * Non zero RDX on return means we need to enable 5-level paging. + * Non zero RDX means trampoline needs to enable 5-level paging. */ ENTRY(trampoline_32bit_src) /* Set up data and stack segments */ @@ -655,8 +658,6 @@ no_longmode: .data gdt64: .word gdt_end - gdt - .long 0 - .word 0 .quad 0 gdt: .word gdt_end - gdt diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 9ed9709d9947..fa0332dda9f2 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -87,10 +87,6 @@ static unsigned long get_boot_seed(void) #define KASLR_COMPRESSED_BOOT #include "../../lib/kaslr.c" -struct mem_vector { - unsigned long long start; - unsigned long long size; -}; /* Only supporting at most 4 unusable memmap regions with kaslr */ #define MAX_MEMMAP_REGIONS 4 @@ -101,6 +97,8 @@ static bool memmap_too_large; /* Store memory limit specified by "mem=nn[KMG]" or "memmap=nn[KMG]" */ static unsigned long long mem_limit = ULLONG_MAX; +/* Number of immovable memory regions */ +static int num_immovable_mem; enum mem_avoid_index { MEM_AVOID_ZO_RANGE = 0, @@ -417,6 +415,9 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size, /* Mark the memmap regions we need to avoid */ handle_mem_options(); + /* Enumerate the immovable memory regions */ + num_immovable_mem = count_immovable_mem_regions(); + #ifdef CONFIG_X86_VERBOSE_BOOTUP /* Make sure video RAM can be used. */ add_identity_map(0, PMD_SIZE); @@ -572,9 +573,9 @@ static unsigned long slots_fetch_random(void) return 0; } -static void process_mem_region(struct mem_vector *entry, - unsigned long minimum, - unsigned long image_size) +static void __process_mem_region(struct mem_vector *entry, + unsigned long minimum, + unsigned long image_size) { struct mem_vector region, overlap; unsigned long start_orig, end; @@ -650,6 +651,56 @@ static void process_mem_region(struct mem_vector *entry, } } +static bool process_mem_region(struct mem_vector *region, + unsigned long long minimum, + unsigned long long image_size) +{ + int i; + /* + * If no immovable memory found, or MEMORY_HOTREMOVE disabled, + * use @region directly. + */ + if (!num_immovable_mem) { + __process_mem_region(region, minimum, image_size); + + if (slot_area_index == MAX_SLOT_AREA) { + debug_putstr("Aborted e820/efi memmap scan (slot_areas full)!\n"); + return 1; + } + return 0; + } + +#if defined(CONFIG_MEMORY_HOTREMOVE) && defined(CONFIG_ACPI) + /* + * If immovable memory found, filter the intersection between + * immovable memory and @region. + */ + for (i = 0; i < num_immovable_mem; i++) { + unsigned long long start, end, entry_end, region_end; + struct mem_vector entry; + + if (!mem_overlaps(region, &immovable_mem[i])) + continue; + + start = immovable_mem[i].start; + end = start + immovable_mem[i].size; + region_end = region->start + region->size; + + entry.start = clamp(region->start, start, end); + entry_end = clamp(region_end, start, end); + entry.size = entry_end - entry.start; + + __process_mem_region(&entry, minimum, image_size); + + if (slot_area_index == MAX_SLOT_AREA) { + debug_putstr("Aborted e820/efi memmap scan when walking immovable regions(slot_areas full)!\n"); + return 1; + } + } + return 0; +#endif +} + #ifdef CONFIG_EFI /* * Returns true if mirror region found (and must have been processed @@ -715,11 +766,8 @@ process_efi_entries(unsigned long minimum, unsigned long image_size) region.start = md->phys_addr; region.size = md->num_pages << EFI_PAGE_SHIFT; - process_mem_region(®ion, minimum, image_size); - if (slot_area_index == MAX_SLOT_AREA) { - debug_putstr("Aborted EFI scan (slot_areas full)!\n"); + if (process_mem_region(®ion, minimum, image_size)) break; - } } return true; } @@ -746,11 +794,8 @@ static void process_e820_entries(unsigned long minimum, continue; region.start = entry->addr; region.size = entry->size; - process_mem_region(®ion, minimum, image_size); - if (slot_area_index == MAX_SLOT_AREA) { - debug_putstr("Aborted e820 scan (slot_areas full)!\n"); + if (process_mem_region(®ion, minimum, image_size)) break; - } } } diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 8dd1d5ccae58..c0d6c560df69 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -351,6 +351,9 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, /* Clear flags intended for solely in-kernel use. */ boot_params->hdr.loadflags &= ~KASLR_FLAG; + /* Save RSDP address for later use. */ + boot_params->acpi_rsdp_addr = get_rsdp_addr(); + sanitize_boot_params(boot_params); if (boot_params->screen_info.orig_video_mode == 7) { diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index a1d5918765f3..fd13655e0f9b 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -25,6 +25,9 @@ #include <asm/bootparam.h> #include <asm/bootparam_utils.h> +#define BOOT_CTYPE_H +#include <linux/acpi.h> + #define BOOT_BOOT_H #include "../ctype.h" @@ -63,12 +66,14 @@ static inline void debug_puthex(const char *s) #endif -#if CONFIG_EARLY_PRINTK || CONFIG_RANDOMIZE_BASE /* cmdline.c */ int cmdline_find_option(const char *option, char *buffer, int bufsize); int cmdline_find_option_bool(const char *option); -#endif +struct mem_vector { + unsigned long long start; + unsigned long long size; +}; #if CONFIG_RANDOMIZE_BASE /* kaslr.c */ @@ -116,3 +121,17 @@ static inline void console_init(void) void set_sev_encryption_mask(void); #endif + +/* acpi.c */ +#ifdef CONFIG_ACPI +acpi_physical_address get_rsdp_addr(void); +#else +static inline acpi_physical_address get_rsdp_addr(void) { return 0; } +#endif + +#if defined(CONFIG_RANDOMIZE_BASE) && defined(CONFIG_MEMORY_HOTREMOVE) && defined(CONFIG_ACPI) +extern struct mem_vector immovable_mem[MAX_NUMNODES*2]; +int count_immovable_mem_regions(void); +#else +static inline int count_immovable_mem_regions(void) { return 0; } +#endif diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c index 9e2157371491..f8debf7aeb4c 100644 --- a/arch/x86/boot/compressed/pgtable_64.c +++ b/arch/x86/boot/compressed/pgtable_64.c @@ -1,5 +1,7 @@ +#include <linux/efi.h> #include <asm/e820/types.h> #include <asm/processor.h> +#include <asm/efi.h> #include "pgtable.h" #include "../string.h" @@ -37,9 +39,10 @@ int cmdline_find_option_bool(const char *option); static unsigned long find_trampoline_placement(void) { - unsigned long bios_start, ebda_start; + unsigned long bios_start = 0, ebda_start = 0; unsigned long trampoline_start; struct boot_e820_entry *entry; + char *signature; int i; /* @@ -47,8 +50,18 @@ static unsigned long find_trampoline_placement(void) * This code is based on reserve_bios_regions(). */ - ebda_start = *(unsigned short *)0x40e << 4; - bios_start = *(unsigned short *)0x413 << 10; + /* + * EFI systems may not provide legacy ROM. The memory may not be mapped + * at all. + * + * Only look for values in the legacy ROM for non-EFI system. + */ + signature = (char *)&boot_params->efi_info.efi_loader_signature; + if (strncmp(signature, EFI32_LOADER_SIGNATURE, 4) && + strncmp(signature, EFI64_LOADER_SIGNATURE, 4)) { + ebda_start = *(unsigned short *)0x40e << 4; + bios_start = *(unsigned short *)0x413 << 10; + } if (bios_start < BIOS_START_MIN || bios_start > BIOS_START_MAX) bios_start = BIOS_START_MAX; diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S index f491bbde8493..508cfa6828c5 100644 --- a/arch/x86/boot/compressed/vmlinux.lds.S +++ b/arch/x86/boot/compressed/vmlinux.lds.S @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include <asm-generic/vmlinux.lds.h> -OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT) +OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT) #undef i386 diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld index 96a6c7563538..0149e41d42c2 100644 --- a/arch/x86/boot/setup.ld +++ b/arch/x86/boot/setup.ld @@ -3,7 +3,7 @@ * * Linker script for the i386 setup code */ -OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") +OUTPUT_FORMAT("elf32-i386") OUTPUT_ARCH(i386) ENTRY(_start) diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index c4428a176973..315a67b8896b 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -13,10 +13,14 @@ */ #include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> #include <asm/asm.h> #include "ctype.h" #include "string.h" +#define KSTRTOX_OVERFLOW (1U << 31) + /* * Undef these macros so that the functions that we provide * here will have the correct names regardless of how string.h @@ -187,3 +191,140 @@ char *strchr(const char *s, int c) return NULL; return (char *)s; } + +static inline u64 __div_u64_rem(u64 dividend, u32 divisor, u32 *remainder) +{ + union { + u64 v64; + u32 v32[2]; + } d = { dividend }; + u32 upper; + + upper = d.v32[1]; + d.v32[1] = 0; + if (upper >= divisor) { + d.v32[1] = upper / divisor; + upper %= divisor; + } + asm ("divl %2" : "=a" (d.v32[0]), "=d" (*remainder) : + "rm" (divisor), "0" (d.v32[0]), "1" (upper)); + return d.v64; +} + +static inline u64 __div_u64(u64 dividend, u32 divisor) +{ + u32 remainder; + + return __div_u64_rem(dividend, divisor, &remainder); +} + +static inline char _tolower(const char c) +{ + return c | 0x20; +} + +static const char *_parse_integer_fixup_radix(const char *s, unsigned int *base) +{ + if (*base == 0) { + if (s[0] == '0') { + if (_tolower(s[1]) == 'x' && isxdigit(s[2])) + *base = 16; + else + *base = 8; + } else + *base = 10; + } + if (*base == 16 && s[0] == '0' && _tolower(s[1]) == 'x') + s += 2; + return s; +} + +/* + * Convert non-negative integer string representation in explicitly given radix + * to an integer. + * Return number of characters consumed maybe or-ed with overflow bit. + * If overflow occurs, result integer (incorrect) is still returned. + * + * Don't you dare use this function. + */ +static unsigned int _parse_integer(const char *s, + unsigned int base, + unsigned long long *p) +{ + unsigned long long res; + unsigned int rv; + + res = 0; + rv = 0; + while (1) { + unsigned int c = *s; + unsigned int lc = c | 0x20; /* don't tolower() this line */ + unsigned int val; + + if ('0' <= c && c <= '9') + val = c - '0'; + else if ('a' <= lc && lc <= 'f') + val = lc - 'a' + 10; + else + break; + + if (val >= base) + break; + /* + * Check for overflow only if we are within range of + * it in the max base we support (16) + */ + if (unlikely(res & (~0ull << 60))) { + if (res > __div_u64(ULLONG_MAX - val, base)) + rv |= KSTRTOX_OVERFLOW; + } + res = res * base + val; + rv++; + s++; + } + *p = res; + return rv; +} + +static int _kstrtoull(const char *s, unsigned int base, unsigned long long *res) +{ + unsigned long long _res; + unsigned int rv; + + s = _parse_integer_fixup_radix(s, &base); + rv = _parse_integer(s, base, &_res); + if (rv & KSTRTOX_OVERFLOW) + return -ERANGE; + if (rv == 0) + return -EINVAL; + s += rv; + if (*s == '\n') + s++; + if (*s) + return -EINVAL; + *res = _res; + return 0; +} + +/** + * kstrtoull - convert a string to an unsigned long long + * @s: The start of the string. The string must be null-terminated, and may also + * include a single newline before its terminating null. The first character + * may also be a plus sign, but not a minus sign. + * @base: The number base to use. The maximum supported base is 16. If base is + * given as 0, then the base of the string is automatically detected with the + * conventional semantics - If it begins with 0x the number will be parsed as a + * hexadecimal (case insensitive), if it otherwise begins with 0, it will be + * parsed as an octal number. Otherwise it will be parsed as a decimal. + * @res: Where to write the result of the conversion on success. + * + * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error. + * Used as a replacement for the obsolete simple_strtoull. Return code must + * be checked. + */ +int kstrtoull(const char *s, unsigned int base, unsigned long long *res) +{ + if (s[0] == '+') + s++; + return _kstrtoull(s, base, res); +} diff --git a/arch/x86/boot/string.h b/arch/x86/boot/string.h index 3d78e27077f4..38d8f2f5e47e 100644 --- a/arch/x86/boot/string.h +++ b/arch/x86/boot/string.h @@ -29,4 +29,5 @@ extern unsigned int atou(const char *s); extern unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base); +int kstrtoull(const char *s, unsigned int base, unsigned long long *res); #endif /* BOOT_STRING_H */ diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 7ca67c482f4c..9f908112bbb9 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -309,3 +309,5 @@ CONFIG_SECURITY_SELINUX_BOOTPARAM=y CONFIG_SECURITY_SELINUX_DISABLE=y CONFIG_CRYPTO_AES_586=y # CONFIG_CRYPTO_ANSI_CPRNG is not set +CONFIG_EFI_STUB=y +CONFIG_ACPI_BGRT=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 5d42a20e0986..1d3badfda09e 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -307,3 +307,6 @@ CONFIG_SECURITY_SELINUX=y CONFIG_SECURITY_SELINUX_BOOTPARAM=y CONFIG_SECURITY_SELINUX_DISABLE=y # CONFIG_CRYPTO_ANSI_CPRNG is not set +CONFIG_EFI_STUB=y +CONFIG_EFI_MIXED=y +CONFIG_ACPI_BGRT=y diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 17096d3cd616..7ec265bacb6a 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4220,6 +4220,8 @@ __init int intel_pmu_init(void) case INTEL_FAM6_CORE2_MEROM: x86_add_quirk(intel_clovertown_quirk); + /* fall through */ + case INTEL_FAM6_CORE2_MEROM_L: case INTEL_FAM6_CORE2_PENRYN: case INTEL_FAM6_CORE2_DUNNINGTON: diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index c88ed39582a1..580c1b91c454 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -931,6 +931,7 @@ static int branch_type(unsigned long from, unsigned long to, int abort) ret = X86_BR_ZERO_CALL; break; } + /* fall through */ case 0x9a: /* call far absolute */ ret = X86_BR_CALL; break; diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index 1908214b9125..ce92c4acc913 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -7,7 +7,6 @@ #include <asm-generic/asm-prototypes.h> -#include <asm/page.h> #include <asm/pgtable.h> #include <asm/special_insns.h> #include <asm/preempt.h> diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index fa2c93cb42a2..fb04a3ded7dd 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -137,37 +137,25 @@ static inline int copy_fxregs_to_user(struct fxregs_state __user *fx) { if (IS_ENABLED(CONFIG_X86_32)) return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx)); - else if (IS_ENABLED(CONFIG_AS_FXSAVEQ)) + else return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx)); - /* See comment in copy_fxregs_to_kernel() below. */ - return user_insn(rex64/fxsave (%[fx]), "=m" (*fx), [fx] "R" (fx)); } static inline void copy_kernel_to_fxregs(struct fxregs_state *fx) { - if (IS_ENABLED(CONFIG_X86_32)) { + if (IS_ENABLED(CONFIG_X86_32)) kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); - } else { - if (IS_ENABLED(CONFIG_AS_FXSAVEQ)) { - kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); - } else { - /* See comment in copy_fxregs_to_kernel() below. */ - kernel_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), "m" (*fx)); - } - } + else + kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); } static inline int copy_user_to_fxregs(struct fxregs_state __user *fx) { if (IS_ENABLED(CONFIG_X86_32)) return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); - else if (IS_ENABLED(CONFIG_AS_FXSAVEQ)) + else return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); - - /* See comment in copy_fxregs_to_kernel() below. */ - return user_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), - "m" (*fx)); } static inline void copy_kernel_to_fregs(struct fregs_state *fx) @@ -184,34 +172,8 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu) { if (IS_ENABLED(CONFIG_X86_32)) asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state.fxsave)); - else if (IS_ENABLED(CONFIG_AS_FXSAVEQ)) + else asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state.fxsave)); - else { - /* Using "rex64; fxsave %0" is broken because, if the memory - * operand uses any extended registers for addressing, a second - * REX prefix will be generated (to the assembler, rex64 - * followed by semicolon is a separate instruction), and hence - * the 64-bitness is lost. - * - * Using "fxsaveq %0" would be the ideal choice, but is only - * supported starting with gas 2.16. - * - * Using, as a workaround, the properly prefixed form below - * isn't accepted by any binutils version so far released, - * complaining that the same type of prefix is used twice if - * an extended register is needed for addressing (fix submitted - * to mainline 2005-11-21). - * - * asm volatile("rex64/fxsave %0" : "=m" (fpu->state.fxsave)); - * - * This, however, we can work around by forcing the compiler to - * select an addressing mode that doesn't require extended - * registers. - */ - asm volatile( "rex64/fxsave (%[fx])" - : "=m" (fpu->state.fxsave) - : [fx] "R" (&fpu->state.fxsave)); - } } /* These macros all use (%edi)/(%rdi) as the single memory argument. */ @@ -414,6 +376,13 @@ static inline int copy_fpregs_to_fpstate(struct fpu *fpu) { if (likely(use_xsave())) { copy_xregs_to_kernel(&fpu->state.xsave); + + /* + * AVX512 state is tracked here because its use is + * known to slow the max clock speed of the core. + */ + if (fpu->state.xsave.header.xfeatures & XFEATURE_MASK_AVX512) + fpu->avx512_timestamp = jiffies; return 1; } diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 202c53918ecf..2e32e178e064 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -303,6 +303,13 @@ struct fpu { unsigned char initialized; /* + * @avx512_timestamp: + * + * Records the timestamp of AVX512 use during last context switch. + */ + unsigned long avx512_timestamp; + + /* * @state: * * In-memory copy of all FPU registers that we save/restore diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 9c85b54bf03c..0bb566315621 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -259,8 +259,7 @@ extern void init_extra_mapping_uc(unsigned long phys, unsigned long size); extern void init_extra_mapping_wb(unsigned long phys, unsigned long size); #define gup_fast_permitted gup_fast_permitted -static inline bool gup_fast_permitted(unsigned long start, int nr_pages, - int write) +static inline bool gup_fast_permitted(unsigned long start, int nr_pages) { unsigned long len, end; diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 33051436c864..2bb3a648fc12 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -742,7 +742,6 @@ enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT, extern void enable_sep_cpu(void); extern int sysenter_setup(void); -void early_trap_pf_init(void); /* Defined in head.S */ extern struct desc_ptr early_gdt_descr; diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 62004d22524a..1954dd5552a2 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -34,10 +34,7 @@ static inline void set_fs(mm_segment_t fs) } #define segment_eq(a, b) ((a).seg == (b).seg) - #define user_addr_max() (current->thread.addr_limit.seg) -#define __addr_ok(addr) \ - ((unsigned long __force)(addr) < user_addr_max()) /* * Test whether a block of memory is a valid user space address. diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h index 3f697a9e3f59..8cfccc3cbbf4 100644 --- a/arch/x86/include/asm/uv/bios.h +++ b/arch/x86/include/asm/uv/bios.h @@ -141,7 +141,6 @@ enum uv_memprotect { */ extern s64 uv_bios_call(enum uv_bios_cmd, u64, u64, u64, u64, u64); extern s64 uv_bios_call_irqsave(enum uv_bios_cmd, u64, u64, u64, u64, u64); -extern s64 uv_bios_call_reentrant(enum uv_bios_cmd, u64, u64, u64, u64, u64); extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *, long *); extern s64 uv_bios_freq_base(u64, u64 *); @@ -152,11 +151,7 @@ extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect); extern s64 uv_bios_reserved_page_pa(u64, u64 *, u64 *, u64 *); extern int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus); -#ifdef CONFIG_EFI extern void uv_bios_init(void); -#else -void uv_bios_init(void) { } -#endif extern unsigned long sn_rtc_cycles_per_second; extern int uv_type; diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S index 0c26b1b44e51..4203d4f0c68d 100644 --- a/arch/x86/kernel/acpi/wakeup_32.S +++ b/arch/x86/kernel/acpi/wakeup_32.S @@ -90,7 +90,7 @@ ret_point: .data ALIGN ENTRY(saved_magic) .long 0 -ENTRY(saved_eip) .long 0 +saved_eip: .long 0 # saved registers saved_idt: .long 0,0 diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index 50b8ed0317a3..510fa12aab73 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -125,12 +125,12 @@ ENTRY(do_suspend_lowlevel) ENDPROC(do_suspend_lowlevel) .data -ENTRY(saved_rbp) .quad 0 -ENTRY(saved_rsi) .quad 0 -ENTRY(saved_rdi) .quad 0 -ENTRY(saved_rbx) .quad 0 +saved_rbp: .quad 0 +saved_rsi: .quad 0 +saved_rdi: .quad 0 +saved_rbx: .quad 0 -ENTRY(saved_rip) .quad 0 -ENTRY(saved_rsp) .quad 0 +saved_rip: .quad 0 +saved_rsp: .quad 0 ENTRY(saved_magic) .quad 0 diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 2953bbf05c08..264e3221d923 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -812,6 +812,7 @@ static int irq_polarity(int idx) return IOAPIC_POL_HIGH; case MP_IRQPOL_RESERVED: pr_warn("IOAPIC: Invalid polarity: 2, defaulting to low\n"); + /* fall through */ case MP_IRQPOL_ACTIVE_LOW: default: /* Pointless default required due to do gcc stupidity */ return IOAPIC_POL_LOW; @@ -859,6 +860,7 @@ static int irq_trigger(int idx) return IOAPIC_EDGE; case MP_IRQTRIG_RESERVED: pr_warn("IOAPIC: Invalid trigger mode 2 defaulting to level\n"); + /* fall through */ case MP_IRQTRIG_LEVEL: default: /* Pointless default required due to do gcc stupidity */ return IOAPIC_LEVEL; diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index c4d1023fb0ab..395d46f78582 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -248,6 +248,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, switch (leaf) { case 1: l1 = &l1i; + /* fall through */ case 0: if (!l1->val) return; diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 3668c5df90c6..5bd011737272 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -296,7 +296,7 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, unsigned long sizek) { unsigned long hole_basek, hole_sizek; - unsigned long second_basek, second_sizek; + unsigned long second_sizek; unsigned long range0_basek, range0_sizek; unsigned long range_basek, range_sizek; unsigned long chunk_sizek; @@ -304,7 +304,6 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, hole_basek = 0; hole_sizek = 0; - second_basek = 0; second_sizek = 0; chunk_sizek = state->chunk_sizek; gran_sizek = state->gran_sizek; diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 14bed6af8377..604c0e3bcc83 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -34,13 +34,6 @@ #include "pseudo_lock_event.h" /* - * MSR_MISC_FEATURE_CONTROL register enables the modification of hardware - * prefetcher state. Details about this register can be found in the MSR - * tables for specific platforms found in Intel's SDM. - */ -#define MSR_MISC_FEATURE_CONTROL 0x000001a4 - -/* * The bits needed to disable hardware prefetching varies based on the * platform. During initialization we will discover which bits to use. */ diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 50895c2f937d..a687d10da417 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -671,21 +671,18 @@ __init void e820__reallocate_tables(void) int size; size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table->nr_entries; - n = kmalloc(size, GFP_KERNEL); + n = kmemdup(e820_table, size, GFP_KERNEL); BUG_ON(!n); - memcpy(n, e820_table, size); e820_table = n; size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_kexec->nr_entries; - n = kmalloc(size, GFP_KERNEL); + n = kmemdup(e820_table_kexec, size, GFP_KERNEL); BUG_ON(!n); - memcpy(n, e820_table_kexec, size); e820_table_kexec = n; size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_firmware->nr_entries; - n = kmalloc(size, GFP_KERNEL); + n = kmemdup(e820_table_firmware, size, GFP_KERNEL); BUG_ON(!n); - memcpy(n, e820_table_firmware, size); e820_table_firmware = n; } diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 9cc108456d0b..d7432c2b1051 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -669,7 +669,7 @@ static bool is_supported_xstate_size(unsigned int test_xstate_size) return false; } -static int init_xstate_size(void) +static int __init init_xstate_size(void) { /* Recompute the context size for enabled features: */ unsigned int possible_xstate_size; diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 34a5c1715148..ff9bfd40429e 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -261,12 +261,8 @@ static int arch_build_bp_info(struct perf_event *bp, * allow kernel breakpoints at all. */ if (attr->bp_addr >= TASK_SIZE_MAX) { -#ifdef CONFIG_KPROBES if (within_kprobe_blacklist(attr->bp_addr)) return -EINVAL; -#else - return -EINVAL; -#endif } hw->type = X86_BREAKPOINT_EXECUTE; @@ -279,6 +275,7 @@ static int arch_build_bp_info(struct perf_event *bp, hw->len = X86_BREAKPOINT_LEN_X; return 0; } + /* fall through */ default: return -EINVAL; } diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 53917a3ebf94..1f3b77367948 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -218,6 +218,9 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params, params->screen_info.ext_mem_k = 0; params->alt_mem_k = 0; + /* Always fill in RSDP: it is either 0 or a valid value */ + params->acpi_rsdp_addr = boot_params.acpi_rsdp_addr; + /* Default APM info */ memset(¶ms->apm_bios_info, 0, sizeof(params->apm_bios_info)); @@ -256,7 +259,6 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params, setup_efi_state(params, params_load_addr, efi_map_offset, efi_map_sz, efi_setup_data_offset); #endif - /* Setup EDD info */ memcpy(params->eddbuf, boot_params.eddbuf, EDDMAXNR * sizeof(struct edd_info)); diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 5db08425063e..4ff6b4cdb941 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -467,6 +467,7 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, ptr = &remcomInBuffer[1]; if (kgdb_hex2long(&ptr, &addr)) linux_regs->ip = addr; + /* fall through */ case 'D': case 'k': /* clear the trace bit */ diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 4c8acdfdc5a7..ceba408ea982 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -352,6 +352,8 @@ void machine_kexec(struct kimage *image) void arch_crash_save_vmcoreinfo(void) { + u64 sme_mask = sme_me_mask; + VMCOREINFO_NUMBER(phys_base); VMCOREINFO_SYMBOL(init_top_pgt); vmcoreinfo_append_str("NUMBER(pgtable_l5_enabled)=%d\n", @@ -364,6 +366,7 @@ void arch_crash_save_vmcoreinfo(void) vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE); + VMCOREINFO_NUMBER(sme_mask); } /* arch-dependent functionality related to kexec file-based syscall */ diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c91ff9f9fe8a..ce1a67b70168 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -150,7 +150,7 @@ static inline void smpboot_restore_warm_reset_vector(void) */ static void smp_callin(void) { - int cpuid, phys_id; + int cpuid; /* * If waken up by an INIT in an 82489DX configuration @@ -161,11 +161,6 @@ static void smp_callin(void) cpuid = smp_processor_id(); /* - * (This works even if the APIC is not enabled.) - */ - phys_id = read_apic_id(); - - /* * the boot CPU has finished the init stage and is spinning * on callin_map until we finish. We are free to set up this * CPU, first the APIC. (this is probably redundant on most diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index e289ce1332ab..d26f9e9c3d83 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -881,12 +881,12 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) dotraplinkage void do_device_not_available(struct pt_regs *regs, long error_code) { - unsigned long cr0; + unsigned long cr0 = read_cr0(); RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); #ifdef CONFIG_MATH_EMULATION - if (!boot_cpu_has(X86_FEATURE_FPU) && (read_cr0() & X86_CR0_EM)) { + if (!boot_cpu_has(X86_FEATURE_FPU) && (cr0 & X86_CR0_EM)) { struct math_emu_info info = { }; cond_local_irq_enable(regs); @@ -898,7 +898,6 @@ do_device_not_available(struct pt_regs *regs, long error_code) #endif /* This should not happen. */ - cr0 = read_cr0(); if (WARN(cr0 & X86_CR0_TS, "CR0.TS was set")) { /* Try to fix it up and carry on. */ write_cr0(cr0 & ~X86_CR0_TS); diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 843feb94a950..ccf03416e434 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -745,6 +745,7 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) * OPCODE1() of the "short" jmp which checks the same condition. */ opc1 = OPCODE2(insn) - 0x10; + /* fall through */ default: if (!is_cond_jmp_opcode(opc1)) return -ENOSYS; diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 0d618ee634ac..bad8c51fee6e 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -31,7 +31,7 @@ #undef i386 /* in case the preprocessor is a 32bit one */ -OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT) +OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT) #ifdef CONFIG_X86_32 OUTPUT_ARCH(i386) @@ -401,7 +401,7 @@ SECTIONS * Per-cpu symbols which need to be offset from __per_cpu_load * for the boot processor. */ -#define INIT_PER_CPU(x) init_per_cpu__##x = x + __per_cpu_load +#define INIT_PER_CPU(x) init_per_cpu__##x = ABSOLUTE(x) + __per_cpu_load INIT_PER_CPU(gdt_page); INIT_PER_CPU(irq_stack_union); diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index 9119d8e41f1f..cf00ab6c6621 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -179,6 +179,8 @@ static int resolve_default_seg(struct insn *insn, struct pt_regs *regs, int off) if (insn->addr_bytes == 2) return -EINVAL; + /* fall through */ + case -EDOM: case offsetof(struct pt_regs, bx): case offsetof(struct pt_regs, si): diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index 12d7e7fb4efd..19c6abf9ea31 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -52,7 +52,7 @@ cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot) cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot); } -static void percpu_setup_debug_store(int cpu) +static void __init percpu_setup_debug_store(int cpu) { #ifdef CONFIG_CPU_SUP_INTEL int npages; diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index e3cdc85ce5b6..ee8f8ab46941 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -444,7 +444,6 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, int i; pud_t *start, *pud_start; pgprotval_t prot, eff; - pud_t *prev_pud = NULL; pud_start = start = (pud_t *)p4d_page_vaddr(addr); @@ -462,7 +461,6 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, } else note_page(m, st, __pgprot(0), 0, 3); - prev_pud = start; start++; } } diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 999d6d8f0bef..bc4bc7b2f075 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -685,9 +685,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask, * that UV should be updated so that smp_call_function_many(), * etc, are optimal on UV. */ - unsigned int cpu; - - cpu = smp_processor_id(); cpumask = uv_flush_tlb_others(cpumask, info); if (cpumask) smp_call_function_many(cpumask, flush_tlb_func_remote, diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c index eb33432f2f24..ef60d789c76e 100644 --- a/arch/x86/platform/uv/bios_uv.c +++ b/arch/x86/platform/uv/bios_uv.c @@ -45,7 +45,7 @@ static s64 __uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, * If EFI_OLD_MEMMAP is set, we need to fall back to using our old EFI * callback method, which uses efi_call() directly, with the kernel page tables: */ - if (unlikely(test_bit(EFI_OLD_MEMMAP, &efi.flags))) + if (unlikely(efi_enabled(EFI_OLD_MEMMAP))) ret = efi_call((void *)__va(tab->function), (u64)which, a1, a2, a3, a4, a5); else ret = efi_call_virt_pointer(tab, function, (u64)which, a1, a2, a3, a4, a5); @@ -85,18 +85,6 @@ s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, return ret; } -s64 uv_bios_call_reentrant(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, - u64 a4, u64 a5) -{ - s64 ret; - - preempt_disable(); - ret = uv_bios_call(which, a1, a2, a3, a4, a5); - preempt_enable(); - - return ret; -} - long sn_partition_id; EXPORT_SYMBOL_GPL(sn_partition_id); @@ -207,7 +195,6 @@ int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus) } EXPORT_SYMBOL_GPL(uv_bios_set_legacy_vga_target); -#ifdef CONFIG_EFI void uv_bios_init(void) { uv_systab = NULL; @@ -237,4 +224,3 @@ void uv_bios_init(void) } pr_info("UV: UVsystab: Revision:%x\n", uv_systab->revision); } -#endif diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index a4130b84d1ff..2c53b0f19329 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -2010,8 +2010,7 @@ static void make_per_cpu_thp(struct bau_control *smaster) int cpu; size_t hpsz = sizeof(struct hub_and_pnode) * num_possible_cpus(); - smaster->thp = kmalloc_node(hpsz, GFP_KERNEL, smaster->osnode); - memset(smaster->thp, 0, hpsz); + smaster->thp = kzalloc_node(hpsz, GFP_KERNEL, smaster->osnode); for_each_present_cpu(cpu) { smaster->thp[cpu].pnode = uv_cpu_hub_info(cpu)->pnode; smaster->thp[cpu].uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; @@ -2135,15 +2134,12 @@ static int __init summarize_uvhub_sockets(int nuvhubs, static int __init init_per_cpu(int nuvhubs, int base_part_pnode) { unsigned char *uvhub_mask; - void *vp; struct uvhub_desc *uvhub_descs; if (is_uv3_hub() || is_uv2_hub() || is_uv1_hub()) timeout_us = calculate_destination_timeout(); - vp = kmalloc_array(nuvhubs, sizeof(struct uvhub_desc), GFP_KERNEL); - uvhub_descs = (struct uvhub_desc *)vp; - memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc)); + uvhub_descs = kcalloc(nuvhubs, sizeof(struct uvhub_desc), GFP_KERNEL); uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL); if (get_cpu_topology(base_part_pnode, uvhub_descs, uvhub_mask)) diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index 4463fa72db94..96cb20de08af 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -47,7 +47,7 @@ $(obj)/pasyms.h: $(REALMODE_OBJS) FORCE targets += realmode.lds $(obj)/realmode.lds: $(obj)/pasyms.h -LDFLAGS_realmode.elf := --emit-relocs -T +LDFLAGS_realmode.elf := -m elf_i386 --emit-relocs -T CPPFLAGS_realmode.lds += -P -C -I$(objtree)/$(obj) targets += realmode.elf diff --git a/arch/x86/realmode/rm/realmode.lds.S b/arch/x86/realmode/rm/realmode.lds.S index df8e11e26bc3..3bb980800c58 100644 --- a/arch/x86/realmode/rm/realmode.lds.S +++ b/arch/x86/realmode/rm/realmode.lds.S @@ -9,7 +9,7 @@ #undef i386 -OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") +OUTPUT_FORMAT("elf32-i386") OUTPUT_ARCH(i386) SECTIONS diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 963986a48c62..bacf87ee7975 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -5,6 +5,8 @@ config XTENSA select ARCH_HAS_SYNC_DMA_FOR_CPU select ARCH_HAS_SYNC_DMA_FOR_DEVICE select ARCH_NO_COHERENT_DMA_MMAP if !MMU + select ARCH_USE_QUEUED_RWLOCKS + select ARCH_USE_QUEUED_SPINLOCKS select ARCH_WANT_FRAME_POINTERS select ARCH_WANT_IPC_PARSE_VERSION select BUILDTIME_EXTABLE_SORT diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild index 809f39ce08c0..d939e13e8d84 100644 --- a/arch/xtensa/include/asm/Kbuild +++ b/arch/xtensa/include/asm/Kbuild @@ -23,6 +23,8 @@ generic-y += mm-arch-hooks.h generic-y += param.h generic-y += percpu.h generic-y += preempt.h +generic-y += qrwlock.h +generic-y += qspinlock.h generic-y += rwsem.h generic-y += sections.h generic-y += socket.h diff --git a/arch/xtensa/include/asm/cmpxchg.h b/arch/xtensa/include/asm/cmpxchg.h index 201e9009efd8..22a10c715c1f 100644 --- a/arch/xtensa/include/asm/cmpxchg.h +++ b/arch/xtensa/include/asm/cmpxchg.h @@ -13,6 +13,7 @@ #ifndef __ASSEMBLY__ +#include <linux/bits.h> #include <linux/stringify.h> /* @@ -138,6 +139,28 @@ static inline unsigned long xchg_u32(volatile int * m, unsigned long val) #define xchg(ptr,x) \ ((__typeof__(*(ptr)))__xchg((unsigned long)(x),(ptr),sizeof(*(ptr)))) +static inline u32 xchg_small(volatile void *ptr, u32 x, int size) +{ + int off = (unsigned long)ptr % sizeof(u32); + volatile u32 *p = ptr - off; +#ifdef __BIG_ENDIAN + int bitoff = (sizeof(u32) - size - off) * BITS_PER_BYTE; +#else + int bitoff = off * BITS_PER_BYTE; +#endif + u32 bitmask = ((0x1 << size * BITS_PER_BYTE) - 1) << bitoff; + u32 oldv, newv; + u32 ret; + + do { + oldv = READ_ONCE(*p); + ret = (oldv & bitmask) >> bitoff; + newv = (oldv & ~bitmask) | (x << bitoff); + } while (__cmpxchg_u32(p, oldv, newv) != oldv); + + return ret; +} + /* * This only works if the compiler isn't horribly bad at optimizing. * gcc-2.5.8 reportedly can't handle this, but I define that one to @@ -150,11 +173,16 @@ static __inline__ unsigned long __xchg(unsigned long x, volatile void * ptr, int size) { switch (size) { - case 4: - return xchg_u32(ptr, x); + case 1: + return xchg_small(ptr, x, 1); + case 2: + return xchg_small(ptr, x, 2); + case 4: + return xchg_u32(ptr, x); + default: + __xchg_called_with_bad_pointer(); + return x; } - __xchg_called_with_bad_pointer(); - return x; } #endif /* __ASSEMBLY__ */ diff --git a/arch/xtensa/include/asm/spinlock.h b/arch/xtensa/include/asm/spinlock.h index c6e1290dcbb7..584b0de6f2ca 100644 --- a/arch/xtensa/include/asm/spinlock.h +++ b/arch/xtensa/include/asm/spinlock.h @@ -12,188 +12,9 @@ #define _XTENSA_SPINLOCK_H #include <asm/barrier.h> -#include <asm/processor.h> +#include <asm/qrwlock.h> +#include <asm/qspinlock.h> -/* - * spinlock - * - * There is at most one owner of a spinlock. There are not different - * types of spinlock owners like there are for rwlocks (see below). - * - * When trying to obtain a spinlock, the function "spins" forever, or busy- - * waits, until the lock is obtained. When spinning, presumably some other - * owner will soon give up the spinlock making it available to others. Use - * the trylock functions to avoid spinning forever. - * - * possible values: - * - * 0 nobody owns the spinlock - * 1 somebody owns the spinlock - */ - -#define arch_spin_is_locked(x) ((x)->slock != 0) - -static inline void arch_spin_lock(arch_spinlock_t *lock) -{ - unsigned long tmp; - - __asm__ __volatile__( - " movi %0, 0\n" - " wsr %0, scompare1\n" - "1: movi %0, 1\n" - " s32c1i %0, %1, 0\n" - " bnez %0, 1b\n" - : "=&a" (tmp) - : "a" (&lock->slock) - : "memory"); -} - -/* Returns 1 if the lock is obtained, 0 otherwise. */ - -static inline int arch_spin_trylock(arch_spinlock_t *lock) -{ - unsigned long tmp; - - __asm__ __volatile__( - " movi %0, 0\n" - " wsr %0, scompare1\n" - " movi %0, 1\n" - " s32c1i %0, %1, 0\n" - : "=&a" (tmp) - : "a" (&lock->slock) - : "memory"); - - return tmp == 0 ? 1 : 0; -} - -static inline void arch_spin_unlock(arch_spinlock_t *lock) -{ - unsigned long tmp; - - __asm__ __volatile__( - " movi %0, 0\n" - " s32ri %0, %1, 0\n" - : "=&a" (tmp) - : "a" (&lock->slock) - : "memory"); -} - -/* - * rwlock - * - * Read-write locks are really a more flexible spinlock. They allow - * multiple readers but only one writer. Write ownership is exclusive - * (i.e., all other readers and writers are blocked from ownership while - * there is a write owner). These rwlocks are unfair to writers. Writers - * can be starved for an indefinite time by readers. - * - * possible values: - * - * 0 nobody owns the rwlock - * >0 one or more readers own the rwlock - * (the positive value is the actual number of readers) - * 0x80000000 one writer owns the rwlock, no other writers, no readers - */ - -static inline void arch_write_lock(arch_rwlock_t *rw) -{ - unsigned long tmp; - - __asm__ __volatile__( - " movi %0, 0\n" - " wsr %0, scompare1\n" - "1: movi %0, 1\n" - " slli %0, %0, 31\n" - " s32c1i %0, %1, 0\n" - " bnez %0, 1b\n" - : "=&a" (tmp) - : "a" (&rw->lock) - : "memory"); -} - -/* Returns 1 if the lock is obtained, 0 otherwise. */ - -static inline int arch_write_trylock(arch_rwlock_t *rw) -{ - unsigned long tmp; - - __asm__ __volatile__( - " movi %0, 0\n" - " wsr %0, scompare1\n" - " movi %0, 1\n" - " slli %0, %0, 31\n" - " s32c1i %0, %1, 0\n" - : "=&a" (tmp) - : "a" (&rw->lock) - : "memory"); - - return tmp == 0 ? 1 : 0; -} - -static inline void arch_write_unlock(arch_rwlock_t *rw) -{ - unsigned long tmp; - - __asm__ __volatile__( - " movi %0, 0\n" - " s32ri %0, %1, 0\n" - : "=&a" (tmp) - : "a" (&rw->lock) - : "memory"); -} - -static inline void arch_read_lock(arch_rwlock_t *rw) -{ - unsigned long tmp; - unsigned long result; - - __asm__ __volatile__( - "1: l32i %1, %2, 0\n" - " bltz %1, 1b\n" - " wsr %1, scompare1\n" - " addi %0, %1, 1\n" - " s32c1i %0, %2, 0\n" - " bne %0, %1, 1b\n" - : "=&a" (result), "=&a" (tmp) - : "a" (&rw->lock) - : "memory"); -} - -/* Returns 1 if the lock is obtained, 0 otherwise. */ - -static inline int arch_read_trylock(arch_rwlock_t *rw) -{ - unsigned long result; - unsigned long tmp; - - __asm__ __volatile__( - " l32i %1, %2, 0\n" - " addi %0, %1, 1\n" - " bltz %0, 1f\n" - " wsr %1, scompare1\n" - " s32c1i %0, %2, 0\n" - " sub %0, %0, %1\n" - "1:\n" - : "=&a" (result), "=&a" (tmp) - : "a" (&rw->lock) - : "memory"); - - return result == 0; -} - -static inline void arch_read_unlock(arch_rwlock_t *rw) -{ - unsigned long tmp1, tmp2; - - __asm__ __volatile__( - "1: l32i %1, %2, 0\n" - " addi %0, %1, -1\n" - " wsr %1, scompare1\n" - " s32c1i %0, %2, 0\n" - " bne %0, %1, 1b\n" - : "=&a" (tmp1), "=&a" (tmp2) - : "a" (&rw->lock) - : "memory"); -} +#define smp_mb__after_spinlock() smp_mb() #endif /* _XTENSA_SPINLOCK_H */ diff --git a/arch/xtensa/include/asm/spinlock_types.h b/arch/xtensa/include/asm/spinlock_types.h index bb1fe6c1816e..64c9389254f1 100644 --- a/arch/xtensa/include/asm/spinlock_types.h +++ b/arch/xtensa/include/asm/spinlock_types.h @@ -2,20 +2,11 @@ #ifndef __ASM_SPINLOCK_TYPES_H #define __ASM_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H +#if !defined(__LINUX_SPINLOCK_TYPES_H) && !defined(__ASM_SPINLOCK_H) # error "please don't include this file directly" #endif -typedef struct { - volatile unsigned int slock; -} arch_spinlock_t; - -#define __ARCH_SPIN_LOCK_UNLOCKED { 0 } - -typedef struct { - volatile unsigned int lock; -} arch_rwlock_t; - -#define __ARCH_RW_LOCK_UNLOCKED { 0 } +#include <asm-generic/qspinlock_types.h> +#include <asm-generic/qrwlock_types.h> #endif diff --git a/arch/xtensa/include/asm/thread_info.h b/arch/xtensa/include/asm/thread_info.h index f333f10a7650..f092cc3f4e66 100644 --- a/arch/xtensa/include/asm/thread_info.h +++ b/arch/xtensa/include/asm/thread_info.h @@ -121,15 +121,6 @@ static inline struct thread_info *current_thread_info(void) #define _TIF_WORK_MASK (_TIF_SYSCALL_TRACE | _TIF_SINGLESTEP | \ _TIF_SYSCALL_TRACEPOINT) -/* - * Thread-synchronous status. - * - * This is different from the flags in that nobody else - * ever touches our thread-synchronous status, so we don't - * have to worry about atomic accesses. - */ -#define TS_USEDFPU 0x0001 /* FPU was used by this task this quantum (SMP) */ - #define THREAD_SIZE KERNEL_STACK_SIZE #define THREAD_SIZE_ORDER (KERNEL_STACK_SHIFT - PAGE_SHIFT) diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c index 74969a437a37..db278a9e80c7 100644 --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@ -52,8 +52,6 @@ extern void ret_from_fork(void); extern void ret_from_kernel_thread(void); -struct task_struct *current_set[NR_CPUS] = {&init_task, }; - void (*pm_power_off)(void) = NULL; EXPORT_SYMBOL(pm_power_off); @@ -321,8 +319,8 @@ unsigned long get_wchan(struct task_struct *p) /* Stack layout: sp-4: ra, sp-3: sp' */ - pc = MAKE_PC_FROM_RA(*(unsigned long*)sp - 4, sp); - sp = *(unsigned long *)sp - 3; + pc = MAKE_PC_FROM_RA(SPILL_SLOT(sp, 0), sp); + sp = SPILL_SLOT(sp, 1); } while (count++ < 16); return 0; } diff --git a/arch/xtensa/kernel/smp.c b/arch/xtensa/kernel/smp.c index be1f280c322c..3699d6d3e479 100644 --- a/arch/xtensa/kernel/smp.c +++ b/arch/xtensa/kernel/smp.c @@ -372,8 +372,7 @@ static void send_ipi_message(const struct cpumask *callmask, unsigned long mask = 0; for_each_cpu(index, callmask) - if (index != smp_processor_id()) - mask |= 1 << index; + mask |= 1 << index; set_er(mask, MIPISET(msg_id)); } @@ -412,22 +411,31 @@ irqreturn_t ipi_interrupt(int irq, void *dev_id) { unsigned int cpu = smp_processor_id(); struct ipi_data *ipi = &per_cpu(ipi_data, cpu); - unsigned int msg; - unsigned i; - msg = get_er(MIPICAUSE(cpu)); - for (i = 0; i < IPI_MAX; i++) - if (msg & (1 << i)) { - set_er(1 << i, MIPICAUSE(cpu)); - ++ipi->ipi_count[i]; + for (;;) { + unsigned int msg; + + msg = get_er(MIPICAUSE(cpu)); + set_er(msg, MIPICAUSE(cpu)); + + if (!msg) + break; + + if (msg & (1 << IPI_CALL_FUNC)) { + ++ipi->ipi_count[IPI_CALL_FUNC]; + generic_smp_call_function_interrupt(); } - if (msg & (1 << IPI_RESCHEDULE)) - scheduler_ipi(); - if (msg & (1 << IPI_CALL_FUNC)) - generic_smp_call_function_interrupt(); - if (msg & (1 << IPI_CPU_STOP)) - ipi_cpu_stop(cpu); + if (msg & (1 << IPI_RESCHEDULE)) { + ++ipi->ipi_count[IPI_RESCHEDULE]; + scheduler_ipi(); + } + + if (msg & (1 << IPI_CPU_STOP)) { + ++ipi->ipi_count[IPI_CPU_STOP]; + ipi_cpu_stop(cpu); + } + } return IRQ_HANDLED; } diff --git a/arch/xtensa/kernel/time.c b/arch/xtensa/kernel/time.c index 378186b5eb40..69db8c93c1f9 100644 --- a/arch/xtensa/kernel/time.c +++ b/arch/xtensa/kernel/time.c @@ -52,14 +52,11 @@ static struct clocksource ccount_clocksource = { .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; -static int ccount_timer_set_next_event(unsigned long delta, - struct clock_event_device *dev); struct ccount_timer { struct clock_event_device evt; int irq_enabled; char name[24]; }; -static DEFINE_PER_CPU(struct ccount_timer, ccount_timer); static int ccount_timer_set_next_event(unsigned long delta, struct clock_event_device *dev) @@ -107,7 +104,30 @@ static int ccount_timer_set_oneshot(struct clock_event_device *evt) return 0; } -static irqreturn_t timer_interrupt(int irq, void *dev_id); +static DEFINE_PER_CPU(struct ccount_timer, ccount_timer) = { + .evt = { + .features = CLOCK_EVT_FEAT_ONESHOT, + .rating = 300, + .set_next_event = ccount_timer_set_next_event, + .set_state_shutdown = ccount_timer_shutdown, + .set_state_oneshot = ccount_timer_set_oneshot, + .tick_resume = ccount_timer_set_oneshot, + }, +}; + +static irqreturn_t timer_interrupt(int irq, void *dev_id) +{ + struct clock_event_device *evt = &this_cpu_ptr(&ccount_timer)->evt; + + set_linux_timer(get_linux_timer()); + evt->event_handler(evt); + + /* Allow platform to do something useful (Wdog). */ + platform_heartbeat(); + + return IRQ_HANDLED; +} + static struct irqaction timer_irqaction = { .handler = timer_interrupt, .flags = IRQF_TIMER, @@ -120,14 +140,8 @@ void local_timer_setup(unsigned cpu) struct clock_event_device *clockevent = &timer->evt; timer->irq_enabled = 1; - clockevent->name = timer->name; snprintf(timer->name, sizeof(timer->name), "ccount_clockevent_%u", cpu); - clockevent->features = CLOCK_EVT_FEAT_ONESHOT; - clockevent->rating = 300; - clockevent->set_next_event = ccount_timer_set_next_event; - clockevent->set_state_shutdown = ccount_timer_shutdown; - clockevent->set_state_oneshot = ccount_timer_set_oneshot; - clockevent->tick_resume = ccount_timer_set_oneshot; + clockevent->name = timer->name; clockevent->cpumask = cpumask_of(cpu); clockevent->irq = irq_create_mapping(NULL, LINUX_TIMER_INT); if (WARN(!clockevent->irq, "error: can't map timer irq")) @@ -190,23 +204,6 @@ void __init time_init(void) timer_probe(); } -/* - * The timer interrupt is called HZ times per second. - */ - -irqreturn_t timer_interrupt(int irq, void *dev_id) -{ - struct clock_event_device *evt = &this_cpu_ptr(&ccount_timer)->evt; - - set_linux_timer(get_linux_timer()); - evt->event_handler(evt); - - /* Allow platform to do something useful (Wdog). */ - platform_heartbeat(); - - return IRQ_HANDLED; -} - #ifndef CONFIG_GENERIC_CALIBRATE_DELAY void calibrate_delay(void) { diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c index e6fa55aa1ccb..454d53096bc9 100644 --- a/arch/xtensa/kernel/traps.c +++ b/arch/xtensa/kernel/traps.c @@ -420,16 +420,15 @@ void __init trap_init(void) /* Setup specific handlers. */ for(i = 0; dispatch_init_table[i].cause >= 0; i++) { - int fast = dispatch_init_table[i].fast; int cause = dispatch_init_table[i].cause; void *handler = dispatch_init_table[i].handler; if (fast == 0) set_handler(default_handler, cause, handler); - if (fast && fast & USER) + if ((fast & USER) != 0) set_handler(fast_user_handler, cause, handler); - if (fast && fast & KRNL) + if ((fast & KRNL) != 0) set_handler(fast_kernel_handler, cause, handler); } |