diff options
Diffstat (limited to 'arch/powerpc')
71 files changed, 2295 insertions, 919 deletions
diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index a314cb024c8b..05f464eb6952 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -66,7 +66,10 @@ endif UTS_MACHINE := $(OLDARCH) ifeq ($(CONFIG_CPU_LITTLE_ENDIAN),y) -override CC += -mlittle-endian -mno-strict-align +override CC += -mlittle-endian +ifneq ($(COMPILER),clang) +override CC += -mno-strict-align +endif override AS += -mlittle-endian override LD += -EL override CROSS32CC += -mlittle-endian @@ -113,14 +116,14 @@ else endif endif -CFLAGS-$(CONFIG_PPC64) := -mtraceback=no +CFLAGS-$(CONFIG_PPC64) := $(call cc-option,-mtraceback=no) ifeq ($(CONFIG_CPU_LITTLE_ENDIAN),y) -CFLAGS-$(CONFIG_PPC64) += $(call cc-option,-mabi=elfv2,-mcall-aixdesc) +CFLAGS-$(CONFIG_PPC64) += $(call cc-option,-mabi=elfv2,$(call cc-option,-mcall-aixdesc)) AFLAGS-$(CONFIG_PPC64) += $(call cc-option,-mabi=elfv2) else -CFLAGS-$(CONFIG_PPC64) += -mcall-aixdesc +CFLAGS-$(CONFIG_PPC64) += $(call cc-option,-mcall-aixdesc) endif -CFLAGS-$(CONFIG_PPC64) += $(call cc-option,-mcmodel=medium,-mminimal-toc) +CFLAGS-$(CONFIG_PPC64) += $(call cc-option,-mcmodel=medium,$(call cc-option,-mminimal-toc)) CFLAGS-$(CONFIG_PPC64) += $(call cc-option,-mno-pointers-to-nested-functions) CFLAGS-$(CONFIG_PPC32) := -ffixed-r2 $(MULTIPLEWORD) @@ -160,7 +163,8 @@ asinstr := $(call as-instr,lis 9$(comma)foo@high,-DHAVE_AS_ATHIGH=1) KBUILD_CPPFLAGS += -Iarch/$(ARCH) $(asinstr) KBUILD_AFLAGS += -Iarch/$(ARCH) $(AFLAGS-y) -KBUILD_CFLAGS += -msoft-float -pipe -Iarch/$(ARCH) $(CFLAGS-y) +KBUILD_CFLAGS += $(call cc-option,-msoft-float) +KBUILD_CFLAGS += -pipe -Iarch/$(ARCH) $(CFLAGS-y) CPP = $(CC) -E $(KBUILD_CFLAGS) CHECKFLAGS += -m$(CONFIG_WORD_SIZE) -D__powerpc__ -D__powerpc$(CONFIG_WORD_SIZE)__ @@ -192,7 +196,7 @@ KBUILD_CFLAGS += $(call cc-option,-fno-dwarf2-cfi-asm) # Never use string load/store instructions as they are # often slow when they are implemented at all -KBUILD_CFLAGS += -mno-string +KBUILD_CFLAGS += $(call cc-option,-mno-string) 
ifeq ($(CONFIG_6xx),y) KBUILD_CFLAGS += -mcpu=powerpc @@ -269,6 +273,21 @@ bootwrapper_install: %.dtb: scripts $(Q)$(MAKE) ARCH=ppc64 $(build)=$(boot) $(patsubst %,$(boot)/%,$@) +# Used to create 'merged defconfigs' +# To use it $(call) it with the first argument as the base defconfig +# and the second argument as a space separated list of .config files to merge, +# without the .config suffix. +define merge_into_defconfig + $(Q)$(CONFIG_SHELL) $(srctree)/scripts/kconfig/merge_config.sh \ + -m -O $(objtree) $(srctree)/arch/$(ARCH)/configs/$(1) \ + $(foreach config,$(2),$(srctree)/arch/$(ARCH)/configs/$(config).config) + +$(Q)$(MAKE) -f $(srctree)/Makefile olddefconfig +endef + +PHONY += pseries_le_defconfig +pseries_le_defconfig: + $(call merge_into_defconfig,pseries_defconfig,le) + define archhelp @echo '* zImage - Build default images selected by kernel config' @echo ' zImage.* - Compressed kernel image (arch/$(ARCH)/boot/zImage.*)' diff --git a/arch/powerpc/configs/le.config b/arch/powerpc/configs/le.config new file mode 100644 index 000000000000..ee43fdb3b8f4 --- /dev/null +++ b/arch/powerpc/configs/le.config @@ -0,0 +1 @@ +CONFIG_CPU_LITTLE_ENDIAN=y diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig index aad501ae3834..a97efc2146fd 100644 --- a/arch/powerpc/configs/ppc64_defconfig +++ b/arch/powerpc/configs/ppc64_defconfig @@ -155,6 +155,7 @@ CONFIG_ACENIC=m CONFIG_ACENIC_OMIT_TIGON_I=y CONFIG_PCNET32=y CONFIG_TIGON3=y +CONFIG_BNX2X=m CONFIG_CHELSIO_T1=m CONFIG_BE2NET=m CONFIG_S2IO=m diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig index c2e39f66b182..0d9efcedaf34 100644 --- a/arch/powerpc/configs/pseries_defconfig +++ b/arch/powerpc/configs/pseries_defconfig @@ -154,6 +154,7 @@ CONFIG_ACENIC=m CONFIG_ACENIC_OMIT_TIGON_I=y CONFIG_PCNET32=y CONFIG_TIGON3=y +CONFIG_BNX2X=m CONFIG_CHELSIO_T1=m CONFIG_BE2NET=m CONFIG_S2IO=m @@ -297,7 +298,6 @@ CONFIG_CODE_PATCHING_SELFTEST=y 
CONFIG_FTR_FIXUP_SELFTEST=y CONFIG_MSI_BITMAP_SELFTEST=y CONFIG_XMON=y -CONFIG_XMON_DEFAULT=y CONFIG_CRYPTO_TEST=m CONFIG_CRYPTO_PCBC=m CONFIG_CRYPTO_HMAC=y diff --git a/arch/powerpc/configs/pseries_le_defconfig b/arch/powerpc/configs/pseries_le_defconfig deleted file mode 100644 index 09bc96e792cd..000000000000 --- a/arch/powerpc/configs/pseries_le_defconfig +++ /dev/null @@ -1,319 +0,0 @@ -CONFIG_PPC64=y -CONFIG_SMP=y -CONFIG_NR_CPUS=2048 -CONFIG_CPU_LITTLE_ENDIAN=y -CONFIG_SYSVIPC=y -CONFIG_POSIX_MQUEUE=y -CONFIG_FHANDLE=y -CONFIG_AUDIT=y -CONFIG_AUDITSYSCALL=y -CONFIG_IRQ_DOMAIN_DEBUG=y -CONFIG_NO_HZ=y -CONFIG_HIGH_RES_TIMERS=y -CONFIG_TASKSTATS=y -CONFIG_TASK_DELAY_ACCT=y -CONFIG_TASK_XACCT=y -CONFIG_TASK_IO_ACCOUNTING=y -CONFIG_IKCONFIG=y -CONFIG_IKCONFIG_PROC=y -CONFIG_NUMA_BALANCING=y -CONFIG_NUMA_BALANCING_DEFAULT_ENABLED=y -CONFIG_CGROUPS=y -CONFIG_CGROUP_FREEZER=y -CONFIG_CGROUP_DEVICE=y -CONFIG_CPUSETS=y -CONFIG_CGROUP_CPUACCT=y -CONFIG_MEMCG=y -CONFIG_MEMCG_SWAP=y -CONFIG_CGROUP_PERF=y -CONFIG_CGROUP_SCHED=y -CONFIG_USER_NS=y -CONFIG_BLK_DEV_INITRD=y -# CONFIG_COMPAT_BRK is not set -CONFIG_PROFILING=y -CONFIG_OPROFILE=y -CONFIG_KPROBES=y -CONFIG_JUMP_LABEL=y -CONFIG_MODULES=y -CONFIG_MODULE_UNLOAD=y -CONFIG_MODVERSIONS=y -CONFIG_MODULE_SRCVERSION_ALL=y -CONFIG_PARTITION_ADVANCED=y -CONFIG_PPC_SPLPAR=y -CONFIG_SCANLOG=m -CONFIG_PPC_SMLPAR=y -CONFIG_DTL=y -# CONFIG_PPC_PMAC is not set -CONFIG_RTAS_FLASH=m -CONFIG_IBMEBUS=y -CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y -CONFIG_HZ_100=y -CONFIG_BINFMT_MISC=m -CONFIG_PPC_TRANSACTIONAL_MEM=y -CONFIG_KEXEC=y -CONFIG_IRQ_ALL_CPUS=y -CONFIG_MEMORY_HOTPLUG=y -CONFIG_MEMORY_HOTREMOVE=y -CONFIG_KSM=y -CONFIG_TRANSPARENT_HUGEPAGE=y -CONFIG_PPC_64K_PAGES=y -CONFIG_PPC_SUBPAGE_PROT=y -CONFIG_SCHED_SMT=y -CONFIG_HOTPLUG_PCI=y -CONFIG_HOTPLUG_PCI_RPA=m -CONFIG_HOTPLUG_PCI_RPA_DLPAR=m -CONFIG_NET=y -CONFIG_PACKET=y -CONFIG_UNIX=y -CONFIG_XFRM_USER=m -CONFIG_NET_KEY=m -CONFIG_INET=y -CONFIG_IP_MULTICAST=y -CONFIG_NET_IPIP=y 
-CONFIG_SYN_COOKIES=y -CONFIG_INET_AH=m -CONFIG_INET_ESP=m -CONFIG_INET_IPCOMP=m -# CONFIG_IPV6 is not set -CONFIG_NETFILTER=y -# CONFIG_NETFILTER_ADVANCED is not set -CONFIG_BRIDGE=m -CONFIG_VLAN_8021Q=m -CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" -CONFIG_DEVTMPFS=y -CONFIG_DEVTMPFS_MOUNT=y -CONFIG_PARPORT=m -CONFIG_PARPORT_PC=m -CONFIG_BLK_DEV_FD=m -CONFIG_BLK_DEV_LOOP=y -CONFIG_BLK_DEV_NBD=m -CONFIG_BLK_DEV_RAM=y -CONFIG_BLK_DEV_RAM_SIZE=65536 -CONFIG_VIRTIO_BLK=m -CONFIG_IDE=y -CONFIG_BLK_DEV_IDECD=y -CONFIG_BLK_DEV_GENERIC=y -CONFIG_BLK_DEV_AMD74XX=y -CONFIG_BLK_DEV_SD=y -CONFIG_CHR_DEV_ST=y -CONFIG_BLK_DEV_SR=y -CONFIG_BLK_DEV_SR_VENDOR=y -CONFIG_CHR_DEV_SG=y -CONFIG_SCSI_CONSTANTS=y -CONFIG_SCSI_FC_ATTRS=y -CONFIG_SCSI_CXGB3_ISCSI=m -CONFIG_SCSI_CXGB4_ISCSI=m -CONFIG_SCSI_BNX2_ISCSI=m -CONFIG_BE2ISCSI=m -CONFIG_SCSI_MPT2SAS=m -CONFIG_SCSI_IBMVSCSI=y -CONFIG_SCSI_IBMVFC=m -CONFIG_SCSI_SYM53C8XX_2=y -CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=0 -CONFIG_SCSI_IPR=y -CONFIG_SCSI_QLA_FC=m -CONFIG_SCSI_QLA_ISCSI=m -CONFIG_SCSI_LPFC=m -CONFIG_SCSI_VIRTIO=m -CONFIG_SCSI_DH=m -CONFIG_SCSI_DH_RDAC=m -CONFIG_SCSI_DH_ALUA=m -CONFIG_ATA=y -CONFIG_SATA_AHCI=y -# CONFIG_ATA_SFF is not set -CONFIG_MD=y -CONFIG_BLK_DEV_MD=y -CONFIG_MD_LINEAR=y -CONFIG_MD_RAID0=y -CONFIG_MD_RAID1=y -CONFIG_MD_RAID10=m -CONFIG_MD_RAID456=m -CONFIG_MD_MULTIPATH=m -CONFIG_MD_FAULTY=m -CONFIG_BLK_DEV_DM=y -CONFIG_DM_CRYPT=m -CONFIG_DM_SNAPSHOT=m -CONFIG_DM_THIN_PROVISIONING=m -CONFIG_DM_MIRROR=m -CONFIG_DM_ZERO=m -CONFIG_DM_MULTIPATH=m -CONFIG_DM_MULTIPATH_QL=m -CONFIG_DM_MULTIPATH_ST=m -CONFIG_DM_UEVENT=y -CONFIG_BONDING=m -CONFIG_DUMMY=m -CONFIG_MACVLAN=m -CONFIG_MACVTAP=m -CONFIG_VXLAN=m -CONFIG_NETCONSOLE=y -CONFIG_TUN=m -CONFIG_VETH=m -CONFIG_VIRTIO_NET=m -CONFIG_VHOST_NET=m -CONFIG_VORTEX=y -CONFIG_ACENIC=m -CONFIG_ACENIC_OMIT_TIGON_I=y -CONFIG_PCNET32=y -CONFIG_TIGON3=y -CONFIG_CHELSIO_T1=m -CONFIG_BE2NET=m -CONFIG_S2IO=m -CONFIG_IBMVETH=y -CONFIG_EHEA=y -CONFIG_E100=y -CONFIG_E1000=y 
-CONFIG_E1000E=y -CONFIG_IXGB=m -CONFIG_IXGBE=m -CONFIG_MLX4_EN=m -CONFIG_MYRI10GE=m -CONFIG_QLGE=m -CONFIG_NETXEN_NIC=m -CONFIG_PPP=m -CONFIG_PPP_BSDCOMP=m -CONFIG_PPP_DEFLATE=m -CONFIG_PPPOE=m -CONFIG_PPP_ASYNC=m -CONFIG_PPP_SYNC_TTY=m -# CONFIG_INPUT_MOUSEDEV_PSAUX is not set -CONFIG_INPUT_EVDEV=m -CONFIG_INPUT_MISC=y -CONFIG_INPUT_PCSPKR=m -# CONFIG_SERIO_SERPORT is not set -CONFIG_DEVPTS_MULTIPLE_INSTANCES=y -CONFIG_SERIAL_8250=y -CONFIG_SERIAL_8250_CONSOLE=y -CONFIG_SERIAL_ICOM=m -CONFIG_SERIAL_JSM=m -CONFIG_HVC_CONSOLE=y -CONFIG_HVC_RTAS=y -CONFIG_HVCS=m -CONFIG_VIRTIO_CONSOLE=m -CONFIG_IBM_BSR=m -CONFIG_GEN_RTC=y -CONFIG_RAW_DRIVER=y -CONFIG_MAX_RAW_DEVS=1024 -CONFIG_FB=y -CONFIG_FIRMWARE_EDID=y -CONFIG_FB_OF=y -CONFIG_FB_MATROX=y -CONFIG_FB_MATROX_MILLENIUM=y -CONFIG_FB_MATROX_MYSTIQUE=y -CONFIG_FB_MATROX_G=y -CONFIG_FB_RADEON=y -CONFIG_FB_IBM_GXT4500=y -CONFIG_LCD_PLATFORM=m -# CONFIG_VGA_CONSOLE is not set -CONFIG_FRAMEBUFFER_CONSOLE=y -CONFIG_LOGO=y -CONFIG_HID_GYRATION=y -CONFIG_HID_PANTHERLORD=y -CONFIG_HID_PETALYNX=y -CONFIG_HID_SAMSUNG=y -CONFIG_HID_SUNPLUS=y -CONFIG_USB_HIDDEV=y -CONFIG_USB=y -CONFIG_USB_MON=m -CONFIG_USB_EHCI_HCD=y -# CONFIG_USB_EHCI_HCD_PPC_OF is not set -CONFIG_USB_OHCI_HCD=y -CONFIG_USB_STORAGE=m -CONFIG_INFINIBAND=m -CONFIG_INFINIBAND_USER_MAD=m -CONFIG_INFINIBAND_USER_ACCESS=m -CONFIG_INFINIBAND_MTHCA=m -CONFIG_INFINIBAND_EHCA=m -CONFIG_INFINIBAND_CXGB3=m -CONFIG_INFINIBAND_CXGB4=m -CONFIG_MLX4_INFINIBAND=m -CONFIG_INFINIBAND_IPOIB=m -CONFIG_INFINIBAND_IPOIB_CM=y -CONFIG_INFINIBAND_SRP=m -CONFIG_INFINIBAND_ISER=m -CONFIG_VIRTIO_PCI=m -CONFIG_VIRTIO_BALLOON=m -CONFIG_EXT2_FS=y -CONFIG_EXT2_FS_XATTR=y -CONFIG_EXT2_FS_POSIX_ACL=y -CONFIG_EXT2_FS_SECURITY=y -CONFIG_EXT2_FS_XIP=y -CONFIG_EXT3_FS=y -CONFIG_EXT3_FS_POSIX_ACL=y -CONFIG_EXT3_FS_SECURITY=y -CONFIG_EXT4_FS=y -CONFIG_EXT4_FS_POSIX_ACL=y -CONFIG_EXT4_FS_SECURITY=y -CONFIG_REISERFS_FS=y -CONFIG_REISERFS_FS_XATTR=y -CONFIG_REISERFS_FS_POSIX_ACL=y 
-CONFIG_REISERFS_FS_SECURITY=y -CONFIG_JFS_FS=m -CONFIG_JFS_POSIX_ACL=y -CONFIG_JFS_SECURITY=y -CONFIG_XFS_FS=m -CONFIG_XFS_POSIX_ACL=y -CONFIG_BTRFS_FS=m -CONFIG_BTRFS_FS_POSIX_ACL=y -CONFIG_NILFS2_FS=m -CONFIG_AUTOFS4_FS=m -CONFIG_FUSE_FS=m -CONFIG_OVERLAY_FS=m -CONFIG_ISO9660_FS=y -CONFIG_UDF_FS=m -CONFIG_MSDOS_FS=y -CONFIG_VFAT_FS=y -CONFIG_PROC_KCORE=y -CONFIG_TMPFS=y -CONFIG_TMPFS_POSIX_ACL=y -CONFIG_HUGETLBFS=y -CONFIG_CRAMFS=m -CONFIG_SQUASHFS=m -CONFIG_SQUASHFS_XATTR=y -CONFIG_SQUASHFS_LZO=y -CONFIG_SQUASHFS_XZ=y -CONFIG_PSTORE=y -CONFIG_NFS_FS=y -CONFIG_NFS_V3_ACL=y -CONFIG_NFS_V4=y -CONFIG_NFSD=m -CONFIG_NFSD_V3_ACL=y -CONFIG_NFSD_V4=y -CONFIG_CIFS=m -CONFIG_CIFS_XATTR=y -CONFIG_CIFS_POSIX=y -CONFIG_NLS_DEFAULT="utf8" -CONFIG_NLS_CODEPAGE_437=y -CONFIG_NLS_ASCII=y -CONFIG_NLS_ISO8859_1=y -CONFIG_NLS_UTF8=y -CONFIG_MAGIC_SYSRQ=y -CONFIG_DEBUG_KERNEL=y -CONFIG_DEBUG_STACK_USAGE=y -CONFIG_DEBUG_STACKOVERFLOW=y -CONFIG_LOCKUP_DETECTOR=y -CONFIG_LATENCYTOP=y -CONFIG_SCHED_TRACER=y -CONFIG_BLK_DEV_IO_TRACE=y -CONFIG_CODE_PATCHING_SELFTEST=y -CONFIG_FTR_FIXUP_SELFTEST=y -CONFIG_MSI_BITMAP_SELFTEST=y -CONFIG_XMON=y -CONFIG_CRYPTO_TEST=m -CONFIG_CRYPTO_PCBC=m -CONFIG_CRYPTO_HMAC=y -CONFIG_CRYPTO_MICHAEL_MIC=m -CONFIG_CRYPTO_TGR192=m -CONFIG_CRYPTO_WP512=m -CONFIG_CRYPTO_ANUBIS=m -CONFIG_CRYPTO_BLOWFISH=m -CONFIG_CRYPTO_CAST6=m -CONFIG_CRYPTO_KHAZAD=m -CONFIG_CRYPTO_SALSA20=m -CONFIG_CRYPTO_SERPENT=m -CONFIG_CRYPTO_TEA=m -CONFIG_CRYPTO_TWOFISH=m -CONFIG_CRYPTO_LZO=m -# CONFIG_CRYPTO_ANSI_CPRNG is not set -CONFIG_VIRTUALIZATION=y -CONFIG_KVM_BOOK3S_64=m -CONFIG_KVM_BOOK3S_64_HV=m diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index ae1fa65bb26d..b118072670fb 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h @@ -242,11 +242,13 @@ enum { /* We only set the TM feature if the kernel was compiled with TM supprt */ #ifdef CONFIG_PPC_TRANSACTIONAL_MEM -#define CPU_FTR_TM_COMP CPU_FTR_TM 
-#define PPC_FEATURE2_HTM_COMP PPC_FEATURE2_HTM +#define CPU_FTR_TM_COMP CPU_FTR_TM +#define PPC_FEATURE2_HTM_COMP PPC_FEATURE2_HTM +#define PPC_FEATURE2_HTM_NOSC_COMP PPC_FEATURE2_HTM_NOSC #else -#define CPU_FTR_TM_COMP 0 -#define PPC_FEATURE2_HTM_COMP 0 +#define CPU_FTR_TM_COMP 0 +#define PPC_FEATURE2_HTM_COMP 0 +#define PPC_FEATURE2_HTM_NOSC_COMP 0 #endif /* We need to mark all pages as being coherent if we're SMP or we have a diff --git a/arch/powerpc/include/asm/device.h b/arch/powerpc/include/asm/device.h index 9f1371bab5fc..e9bdda88f1fb 100644 --- a/arch/powerpc/include/asm/device.h +++ b/arch/powerpc/include/asm/device.h @@ -46,6 +46,9 @@ struct dev_archdata { #ifdef CONFIG_FAIL_IOMMU int fail_iommu; #endif +#ifdef CONFIG_CXL_BASE + struct cxl_context *cxl_ctx; +#endif }; struct pdev_archdata { diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 1e27d6338565..ca18cff90900 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -44,6 +44,39 @@ extern int iommu_is_off; extern int iommu_force_on; +struct iommu_table_ops { + /* + * When called with direction==DMA_NONE, it is equal to clear(). + * uaddr is a linear map address. + */ + int (*set)(struct iommu_table *tbl, + long index, long npages, + unsigned long uaddr, + enum dma_data_direction direction, + struct dma_attrs *attrs); +#ifdef CONFIG_IOMMU_API + /* + * Exchanges existing TCE with new TCE plus direction bits; + * returns old TCE and DMA direction mask. + * @tce is a physical address. 
+ */ + int (*exchange)(struct iommu_table *tbl, + long index, + unsigned long *hpa, + enum dma_data_direction *direction); +#endif + void (*clear)(struct iommu_table *tbl, + long index, long npages); + /* get() returns a physical address */ + unsigned long (*get)(struct iommu_table *tbl, long index); + void (*flush)(struct iommu_table *tbl); + void (*free)(struct iommu_table *tbl); +}; + +/* These are used by VIO */ +extern struct iommu_table_ops iommu_table_lpar_multi_ops; +extern struct iommu_table_ops iommu_table_pseries_ops; + /* * IOMAP_MAX_ORDER defines the largest contiguous block * of dma space we can get. IOMAP_MAX_ORDER = 13 @@ -64,6 +97,9 @@ struct iommu_pool { struct iommu_table { unsigned long it_busno; /* Bus number this table belongs to */ unsigned long it_size; /* Size of iommu table in entries */ + unsigned long it_indirect_levels; + unsigned long it_level_size; + unsigned long it_allocated_size; unsigned long it_offset; /* Offset into global table */ unsigned long it_base; /* mapped address of tce table */ unsigned long it_index; /* which iommu table this is */ @@ -75,15 +111,16 @@ struct iommu_table { struct iommu_pool pools[IOMMU_NR_POOLS]; unsigned long *it_map; /* A simple allocation bitmap for now */ unsigned long it_page_shift;/* table iommu page size */ -#ifdef CONFIG_IOMMU_API - struct iommu_group *it_group; -#endif - void (*set_bypass)(struct iommu_table *tbl, bool enable); -#ifdef CONFIG_PPC_POWERNV - void *data; -#endif + struct list_head it_group_list;/* List of iommu_table_group_link */ + unsigned long *it_userspace; /* userspace view of the table */ + struct iommu_table_ops *it_ops; }; +#define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \ + ((tbl)->it_userspace ? 
\ + &((tbl)->it_userspace[(entry) - (tbl)->it_offset]) : \ + NULL) + /* Pure 2^n version of get_order */ static inline __attribute_const__ int get_iommu_order(unsigned long size, struct iommu_table *tbl) @@ -112,14 +149,62 @@ extern void iommu_free_table(struct iommu_table *tbl, const char *node_name); */ extern struct iommu_table *iommu_init_table(struct iommu_table * tbl, int nid); +#define IOMMU_TABLE_GROUP_MAX_TABLES 2 + +struct iommu_table_group; + +struct iommu_table_group_ops { + unsigned long (*get_table_size)( + __u32 page_shift, + __u64 window_size, + __u32 levels); + long (*create_table)(struct iommu_table_group *table_group, + int num, + __u32 page_shift, + __u64 window_size, + __u32 levels, + struct iommu_table **ptbl); + long (*set_window)(struct iommu_table_group *table_group, + int num, + struct iommu_table *tblnew); + long (*unset_window)(struct iommu_table_group *table_group, + int num); + /* Switch ownership from platform code to external user (e.g. VFIO) */ + void (*take_ownership)(struct iommu_table_group *table_group); + /* Switch ownership from external user (e.g. 
VFIO) back to core */ + void (*release_ownership)(struct iommu_table_group *table_group); +}; + +struct iommu_table_group_link { + struct list_head next; + struct rcu_head rcu; + struct iommu_table_group *table_group; +}; + +struct iommu_table_group { + /* IOMMU properties */ + __u32 tce32_start; + __u32 tce32_size; + __u64 pgsizes; /* Bitmap of supported page sizes */ + __u32 max_dynamic_windows_supported; + __u32 max_levels; + + struct iommu_group *group; + struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES]; + struct iommu_table_group_ops *ops; +}; + #ifdef CONFIG_IOMMU_API -extern void iommu_register_group(struct iommu_table *tbl, + +extern void iommu_register_group(struct iommu_table_group *table_group, int pci_domain_number, unsigned long pe_num); extern int iommu_add_device(struct device *dev); extern void iommu_del_device(struct device *dev); extern int __init tce_iommu_bus_notifier_init(void); +extern long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry, + unsigned long *hpa, enum dma_data_direction *direction); #else -static inline void iommu_register_group(struct iommu_table *tbl, +static inline void iommu_register_group(struct iommu_table_group *table_group, int pci_domain_number, unsigned long pe_num) { @@ -140,13 +225,6 @@ static inline int __init tce_iommu_bus_notifier_init(void) } #endif /* !CONFIG_IOMMU_API */ -static inline void set_iommu_table_base_and_group(struct device *dev, - void *base) -{ - set_iommu_table_base(dev, base); - iommu_add_device(dev); -} - extern int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl, struct scatterlist *sglist, int nelems, unsigned long mask, @@ -197,20 +275,13 @@ extern int iommu_tce_clear_param_check(struct iommu_table *tbl, unsigned long npages); extern int iommu_tce_put_param_check(struct iommu_table *tbl, unsigned long ioba, unsigned long tce); -extern int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, - unsigned long hwaddr, enum dma_data_direction direction); 
-extern unsigned long iommu_clear_tce(struct iommu_table *tbl, - unsigned long entry); -extern int iommu_clear_tces_and_put_pages(struct iommu_table *tbl, - unsigned long entry, unsigned long pages); -extern int iommu_put_tce_user_mode(struct iommu_table *tbl, - unsigned long entry, unsigned long tce); extern void iommu_flush_tce(struct iommu_table *tbl); extern int iommu_take_ownership(struct iommu_table *tbl); extern void iommu_release_ownership(struct iommu_table *tbl); extern enum dma_data_direction iommu_tce_direction(unsigned long tce); +extern unsigned long iommu_direction_to_tce_perm(enum dma_data_direction dir); #endif /* __KERNEL__ */ #endif /* _ASM_IOMMU_H */ diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index 0d387d0570cd..952579f5e79a 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -65,31 +65,6 @@ struct machdep_calls { * destroyed as well */ void (*hpte_clear_all)(void); - int (*tce_build)(struct iommu_table *tbl, - long index, - long npages, - unsigned long uaddr, - enum dma_data_direction direction, - struct dma_attrs *attrs); - void (*tce_free)(struct iommu_table *tbl, - long index, - long npages); - unsigned long (*tce_get)(struct iommu_table *tbl, - long index); - void (*tce_flush)(struct iommu_table *tbl); - - /* _rm versions are for real mode use only */ - int (*tce_build_rm)(struct iommu_table *tbl, - long index, - long npages, - unsigned long uaddr, - enum dma_data_direction direction, - struct dma_attrs *attrs); - void (*tce_free_rm)(struct iommu_table *tbl, - long index, - long npages); - void (*tce_flush_rm)(struct iommu_table *tbl); - void __iomem * (*ioremap)(phys_addr_t addr, unsigned long size, unsigned long flags, void *caller); void (*iounmap)(volatile void __iomem *token); diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h index 1da6a81ce541..a82f5347540a 100644 --- a/arch/powerpc/include/asm/mmu-hash64.h +++ 
b/arch/powerpc/include/asm/mmu-hash64.h @@ -536,6 +536,9 @@ typedef struct { /* for 4K PTE fragment support */ void *pte_frag; #endif +#ifdef CONFIG_SPAPR_TCE_IOMMU + struct list_head iommu_group_mem_list; +#endif } mm_context_t; diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index 73382eba02dc..3e5184210d9b 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -16,6 +16,24 @@ */ extern int init_new_context(struct task_struct *tsk, struct mm_struct *mm); extern void destroy_context(struct mm_struct *mm); +#ifdef CONFIG_SPAPR_TCE_IOMMU +struct mm_iommu_table_group_mem_t; + +extern bool mm_iommu_preregistered(void); +extern long mm_iommu_get(unsigned long ua, unsigned long entries, + struct mm_iommu_table_group_mem_t **pmem); +extern long mm_iommu_put(struct mm_iommu_table_group_mem_t *mem); +extern void mm_iommu_init(mm_context_t *ctx); +extern void mm_iommu_cleanup(mm_context_t *ctx); +extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup(unsigned long ua, + unsigned long size); +extern struct mm_iommu_table_group_mem_t *mm_iommu_find(unsigned long ua, + unsigned long entries); +extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem, + unsigned long ua, unsigned long *hpa); +extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem); +extern void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem); +#endif extern void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next); extern void switch_slb(struct task_struct *tsk, struct mm_struct *mm); diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index a49e5fa6f939..e9e4c52f3685 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -153,7 +153,8 @@ #define OPAL_FLASH_READ 110 #define OPAL_FLASH_WRITE 111 #define OPAL_FLASH_ERASE 112 -#define OPAL_LAST 112 +#define OPAL_PRD_MSG 113 +#define OPAL_LAST 113 
/* Device tree flags */ @@ -359,6 +360,7 @@ enum opal_msg_type { OPAL_MSG_SHUTDOWN, /* params[0] = 1 reboot, 0 shutdown */ OPAL_MSG_HMI_EVT, OPAL_MSG_DPO, + OPAL_MSG_PRD, OPAL_MSG_TYPE_MAX, }; @@ -681,6 +683,23 @@ typedef struct oppanel_line { __be64 line_len; } oppanel_line_t; +enum opal_prd_msg_type { + OPAL_PRD_MSG_TYPE_INIT = 0, /* HBRT --> OPAL */ + OPAL_PRD_MSG_TYPE_FINI, /* HBRT/kernel --> OPAL */ + OPAL_PRD_MSG_TYPE_ATTN, /* HBRT <-- OPAL */ + OPAL_PRD_MSG_TYPE_ATTN_ACK, /* HBRT --> OPAL */ + OPAL_PRD_MSG_TYPE_OCC_ERROR, /* HBRT <-- OPAL */ + OPAL_PRD_MSG_TYPE_OCC_RESET, /* HBRT <-- OPAL */ +}; + +struct opal_prd_msg_header { + uint8_t type; + uint8_t pad[1]; + __be16 size; +}; + +struct opal_prd_msg; + /* * SG entries * diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index 1412814347ba..958e941c0cda 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -194,6 +194,7 @@ int64_t opal_ipmi_recv(uint64_t interface, struct opal_ipmi_msg *msg, uint64_t *msg_len); int64_t opal_i2c_request(uint64_t async_token, uint32_t bus_id, struct opal_i2c_request *oreq); +int64_t opal_prd_msg(struct opal_prd_msg *msg); int64_t opal_flash_read(uint64_t id, uint64_t offset, uint64_t buf, uint64_t size, uint64_t token); diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 6d17bb8498bf..712add590445 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -27,6 +27,10 @@ struct pci_controller_ops { * allow assignment/enabling of the device. 
*/ bool (*enable_device_hook)(struct pci_dev *); + void (*disable_device)(struct pci_dev *); + + void (*release_device)(struct pci_dev *); + /* Called during PCI resource reassignment */ resource_size_t (*window_alignment)(struct pci_bus *, unsigned long type); void (*reset_secondary_bus)(struct pci_dev *dev); @@ -38,6 +42,8 @@ struct pci_controller_ops { #endif int (*dma_set_mask)(struct pci_dev *dev, u64 dma_mask); + + void (*shutdown)(struct pci_controller *); }; /* @@ -193,7 +199,7 @@ struct pci_dn { struct pci_dn *parent; struct pci_controller *phb; /* for pci devices */ - struct iommu_table *iommu_table; /* for phb's or bridges */ + struct iommu_table_group *table_group; /* for phb's or bridges */ struct device_node *node; /* back-pointer to the device_node */ int pci_ext_config_space; /* for pci devices */ diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index f951d9cf358a..f890f7ce1593 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -347,11 +347,27 @@ static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry) pr_err("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) /* Encode and de-code a swap entry */ -#define __swp_type(entry) (((entry).val >> 1) & 0x3f) -#define __swp_offset(entry) ((entry).val >> 8) -#define __swp_entry(type, offset) ((swp_entry_t){((type)<< 1)|((offset)<<8)}) -#define __pte_to_swp_entry(pte) ((swp_entry_t){pte_val(pte) >> PTE_RPN_SHIFT}) -#define __swp_entry_to_pte(x) ((pte_t) { (x).val << PTE_RPN_SHIFT }) +#define MAX_SWAPFILES_CHECK() do { \ + BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS); \ + /* \ + * Don't have overlapping bits with _PAGE_HPTEFLAGS \ + * We filter HPTEFLAGS on set_pte. 
\ + */ \ + BUILD_BUG_ON(_PAGE_HPTEFLAGS & (0x1f << _PAGE_BIT_SWAP_TYPE)); \ + } while (0) +/* + * on pte we don't need handle RADIX_TREE_EXCEPTIONAL_SHIFT; + */ +#define SWP_TYPE_BITS 5 +#define __swp_type(x) (((x).val >> _PAGE_BIT_SWAP_TYPE) \ + & ((1UL << SWP_TYPE_BITS) - 1)) +#define __swp_offset(x) ((x).val >> PTE_RPN_SHIFT) +#define __swp_entry(type, offset) ((swp_entry_t) { \ + ((type) << _PAGE_BIT_SWAP_TYPE) \ + | ((offset) << PTE_RPN_SHIFT) }) + +#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) }) +#define __swp_entry_to_pte(x) __pte((x).val) void pgtable_cache_add(unsigned shift, void (*ctor)(void *)); void pgtable_cache_init(void); diff --git a/arch/powerpc/include/asm/pnv-pci.h b/arch/powerpc/include/asm/pnv-pci.h index f9b498292a5c..6f77f71ee964 100644 --- a/arch/powerpc/include/asm/pnv-pci.h +++ b/arch/powerpc/include/asm/pnv-pci.h @@ -11,7 +11,7 @@ #define _ASM_PNV_PCI_H #include <linux/pci.h> -#include <misc/cxl.h> +#include <misc/cxl-base.h> int pnv_phb_to_cxl_mode(struct pci_dev *dev, uint64_t mode); int pnv_cxl_ioda_msi_setup(struct pci_dev *dev, unsigned int hwirq, diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index bf117d8fb45f..28ded5d9b579 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -295,6 +295,15 @@ struct thread_struct { #endif #ifdef CONFIG_PPC64 unsigned long dscr; + /* + * This member element dscr_inherit indicates that the process + * has explicitly attempted and changed the DSCR register value + * for itself. Hence kernel wont use the default CPU DSCR value + * contained in the PACA structure anymore during process context + * switch. Once this variable is set, this behaviour will also be + * inherited to all the children of this process from that point + * onwards. 
+ */ int dscr_inherit; unsigned long ppr; /* used to save/restore SMT priority */ #endif diff --git a/arch/powerpc/include/asm/pte-book3e.h b/arch/powerpc/include/asm/pte-book3e.h index 91a704952ca1..8d8473278d91 100644 --- a/arch/powerpc/include/asm/pte-book3e.h +++ b/arch/powerpc/include/asm/pte-book3e.h @@ -11,6 +11,7 @@ /* Architected bits */ #define _PAGE_PRESENT 0x000001 /* software: pte contains a translation */ #define _PAGE_SW1 0x000002 +#define _PAGE_BIT_SWAP_TYPE 2 #define _PAGE_BAP_SR 0x000004 #define _PAGE_BAP_UR 0x000008 #define _PAGE_BAP_SW 0x000010 diff --git a/arch/powerpc/include/asm/pte-common.h b/arch/powerpc/include/asm/pte-common.h index c5a755ef7011..b7c8d079c121 100644 --- a/arch/powerpc/include/asm/pte-common.h +++ b/arch/powerpc/include/asm/pte-common.h @@ -85,10 +85,8 @@ extern unsigned long bad_call_to_PMD_PAGE_SIZE(void); * 64-bit PTEs */ #if defined(CONFIG_PPC32) && defined(CONFIG_PTE_64BIT) -#define PTE_RPN_MAX (1ULL << (64 - PTE_RPN_SHIFT)) #define PTE_RPN_MASK (~((1ULL<<PTE_RPN_SHIFT)-1)) #else -#define PTE_RPN_MAX (1UL << (32 - PTE_RPN_SHIFT)) #define PTE_RPN_MASK (~((1UL<<PTE_RPN_SHIFT)-1)) #endif diff --git a/arch/powerpc/include/asm/pte-hash64.h b/arch/powerpc/include/asm/pte-hash64.h index fc852f7e7b3a..ef612c160da7 100644 --- a/arch/powerpc/include/asm/pte-hash64.h +++ b/arch/powerpc/include/asm/pte-hash64.h @@ -16,6 +16,7 @@ */ #define _PAGE_PRESENT 0x0001 /* software: pte contains a translation */ #define _PAGE_USER 0x0002 /* matches one of the PP bits */ +#define _PAGE_BIT_SWAP_TYPE 2 #define _PAGE_EXEC 0x0004 /* No execute on POWER4 and newer (we invert) */ #define _PAGE_GUARDED 0x0008 /* We can derive Memory coherence from _PAGE_NO_CACHE */ diff --git a/arch/powerpc/include/asm/trace.h b/arch/powerpc/include/asm/trace.h index c15da6073cb8..8e86b48d0369 100644 --- a/arch/powerpc/include/asm/trace.h +++ b/arch/powerpc/include/asm/trace.h @@ -144,6 +144,26 @@ TRACE_EVENT_FN(opal_exit, ); #endif +TRACE_EVENT(hash_fault, + + 
TP_PROTO(unsigned long addr, unsigned long access, unsigned long trap), + TP_ARGS(addr, access, trap), + TP_STRUCT__entry( + __field(unsigned long, addr) + __field(unsigned long, access) + __field(unsigned long, trap) + ), + + TP_fast_assign( + __entry->addr = addr; + __entry->access = access; + __entry->trap = trap; + ), + + TP_printk("hash fault with addr 0x%lx and access = 0x%lx trap = 0x%lx", + __entry->addr, __entry->access, __entry->trap) +); + #endif /* _TRACE_POWERPC_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index a0c071d24e0e..2a8ebae0936b 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -265,7 +265,7 @@ do { \ ({ \ long __gu_err; \ unsigned long __gu_val; \ - const __typeof__(*(ptr)) __user *__gu_addr = (ptr); \ + __typeof__(*(ptr)) __user *__gu_addr = (ptr); \ __chk_user_ptr(ptr); \ if (!is_kernel_addr((unsigned long)__gu_addr)) \ might_fault(); \ @@ -279,7 +279,7 @@ do { \ ({ \ long __gu_err; \ long long __gu_val; \ - const __typeof__(*(ptr)) __user *__gu_addr = (ptr); \ + __typeof__(*(ptr)) __user *__gu_addr = (ptr); \ __chk_user_ptr(ptr); \ if (!is_kernel_addr((unsigned long)__gu_addr)) \ might_fault(); \ @@ -293,7 +293,7 @@ do { \ ({ \ long __gu_err = -EFAULT; \ unsigned long __gu_val = 0; \ - const __typeof__(*(ptr)) __user *__gu_addr = (ptr); \ + __typeof__(*(ptr)) __user *__gu_addr = (ptr); \ might_fault(); \ if (access_ok(VERIFY_READ, __gu_addr, (size))) \ __get_user_size(__gu_val, __gu_addr, (size), __gu_err); \ @@ -305,7 +305,7 @@ do { \ ({ \ long __gu_err; \ unsigned long __gu_val; \ - const __typeof__(*(ptr)) __user *__gu_addr = (ptr); \ + __typeof__(*(ptr)) __user *__gu_addr = (ptr); \ __chk_user_ptr(ptr); \ __get_user_size(__gu_val, __gu_addr, (size), __gu_err); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ diff --git a/arch/powerpc/include/uapi/asm/Kbuild b/arch/powerpc/include/uapi/asm/Kbuild index 
79c4068be278..f44a027818af 100644 --- a/arch/powerpc/include/uapi/asm/Kbuild +++ b/arch/powerpc/include/uapi/asm/Kbuild @@ -18,6 +18,7 @@ header-y += kvm_para.h header-y += mman.h header-y += msgbuf.h header-y += nvram.h +header-y += opal-prd.h header-y += param.h header-y += perf_event.h header-y += poll.h diff --git a/arch/powerpc/include/uapi/asm/cputable.h b/arch/powerpc/include/uapi/asm/cputable.h index de2c0e4ee1aa..43686043e297 100644 --- a/arch/powerpc/include/uapi/asm/cputable.h +++ b/arch/powerpc/include/uapi/asm/cputable.h @@ -42,5 +42,6 @@ #define PPC_FEATURE2_ISEL 0x08000000 #define PPC_FEATURE2_TAR 0x04000000 #define PPC_FEATURE2_VEC_CRYPTO 0x02000000 +#define PPC_FEATURE2_HTM_NOSC 0x01000000 #endif /* _UAPI__ASM_POWERPC_CPUTABLE_H */ diff --git a/arch/powerpc/include/uapi/asm/opal-prd.h b/arch/powerpc/include/uapi/asm/opal-prd.h new file mode 100644 index 000000000000..319ff4a26158 --- /dev/null +++ b/arch/powerpc/include/uapi/asm/opal-prd.h @@ -0,0 +1,58 @@ +/* + * OPAL Runtime Diagnostics interface driver + * Supported on POWERNV platform + * + * (C) Copyright IBM 2015 + * + * Author: Vaidyanathan Srinivasan <svaidy at linux.vnet.ibm.com> + * Author: Jeremy Kerr <jk@ozlabs.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _UAPI_ASM_POWERPC_OPAL_PRD_H_ +#define _UAPI_ASM_POWERPC_OPAL_PRD_H_ + +#include <linux/types.h> + +/** + * The version of the kernel interface of the PRD system. This describes the + * interface available for the /dev/opal-prd device. 
The actual PRD message + * layout and content is private to the firmware <--> userspace interface, so + * is not covered by this versioning. + * + * Future interface versions are backwards-compatible; if a later kernel + * version is encountered, functionality provided in earlier versions + * will work. + */ +#define OPAL_PRD_KERNEL_VERSION 1 + +#define OPAL_PRD_GET_INFO _IOR('o', 0x01, struct opal_prd_info) +#define OPAL_PRD_SCOM_READ _IOR('o', 0x02, struct opal_prd_scom) +#define OPAL_PRD_SCOM_WRITE _IOW('o', 0x03, struct opal_prd_scom) + +#ifndef __ASSEMBLY__ + +struct opal_prd_info { + __u64 version; + __u64 reserved[3]; +}; + +struct opal_prd_scom { + __u64 chip; + __u64 addr; + __u64 data; + __s64 rc; +}; + +#endif /* __ASSEMBLY__ */ + +#endif /* _UAPI_ASM_POWERPC_OPAL_PRD_H */ diff --git a/arch/powerpc/include/uapi/asm/tm.h b/arch/powerpc/include/uapi/asm/tm.h index 5d836b7c1176..5047659815a5 100644 --- a/arch/powerpc/include/uapi/asm/tm.h +++ b/arch/powerpc/include/uapi/asm/tm.h @@ -11,7 +11,7 @@ #define TM_CAUSE_RESCHED 0xde #define TM_CAUSE_TLBI 0xdc #define TM_CAUSE_FAC_UNAV 0xda -#define TM_CAUSE_SYSCALL 0xd8 /* future use */ +#define TM_CAUSE_SYSCALL 0xd8 #define TM_CAUSE_MISC 0xd6 /* future use */ #define TM_CAUSE_SIGNAL 0xd4 #define TM_CAUSE_ALIGNMENT 0xd2 diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 0034b6b3556a..98230579d99c 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -247,7 +247,7 @@ int main(void) #endif DEFINE(PACAHWCPUID, offsetof(struct paca_struct, hw_cpu_id)); DEFINE(PACAKEXECSTATE, offsetof(struct paca_struct, kexec_state)); - DEFINE(PACA_DSCR, offsetof(struct paca_struct, dscr_default)); + DEFINE(PACA_DSCR_DEFAULT, offsetof(struct paca_struct, dscr_default)); DEFINE(PACA_STARTTIME, offsetof(struct paca_struct, starttime)); DEFINE(PACA_STARTTIME_USER, offsetof(struct paca_struct, starttime_user)); DEFINE(PACA_USER_TIME, offsetof(struct paca_struct, 
user_time)); diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c index 60262fdf35ba..7d80bfdfb15e 100644 --- a/arch/powerpc/kernel/cputable.c +++ b/arch/powerpc/kernel/cputable.c @@ -108,7 +108,9 @@ extern void __restore_cpu_e6500(void); PPC_FEATURE_TRUE_LE | \ PPC_FEATURE_PSERIES_PERFMON_COMPAT) #define COMMON_USER2_POWER8 (PPC_FEATURE2_ARCH_2_07 | \ - PPC_FEATURE2_HTM_COMP | PPC_FEATURE2_DSCR | \ + PPC_FEATURE2_HTM_COMP | \ + PPC_FEATURE2_HTM_NOSC_COMP | \ + PPC_FEATURE2_DSCR | \ PPC_FEATURE2_ISEL | PPC_FEATURE2_TAR | \ PPC_FEATURE2_VEC_CRYPTO) #define COMMON_USER_PA6T (COMMON_USER_PPC64 | PPC_FEATURE_PA6T |\ diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 23e0aa773643..af9b597b10af 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -717,7 +717,7 @@ static void *eeh_restore_dev_state(void *data, void *userdata) /* The caller should restore state for the specified device */ if (pdev != dev) - pci_save_state(pdev); + pci_restore_state(pdev); return NULL; } @@ -1410,13 +1410,11 @@ static int dev_has_iommu_table(struct device *dev, void *data) { struct pci_dev *pdev = to_pci_dev(dev); struct pci_dev **ppdev = data; - struct iommu_table *tbl; if (!dev) return 0; - tbl = get_iommu_table_base(dev); - if (tbl && tbl->it_group) { + if (dev->iommu_group) { *ppdev = pdev; return 1; } diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index afbc20019c2e..579e0f9a2d57 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -34,6 +34,7 @@ #include <asm/ftrace.h> #include <asm/hw_irq.h> #include <asm/context_tracking.h> +#include <asm/tm.h> /* * System calls. @@ -51,6 +52,12 @@ exception_marker: .globl system_call_common system_call_common: +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +BEGIN_FTR_SECTION + extrdi. r10, r12, 1, (63-MSR_TS_T_LG) /* transaction active? */ + bne tabort_syscall +END_FTR_SECTION_IFSET(CPU_FTR_TM) +#endif andi. 
r10,r12,MSR_PR mr r10,r1 addi r1,r1,-INT_FRAME_SIZE @@ -311,6 +318,34 @@ syscall_exit_work: bl do_syscall_trace_leave b ret_from_except +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +tabort_syscall: + /* Firstly we need to enable TM in the kernel */ + mfmsr r10 + li r13, 1 + rldimi r10, r13, MSR_TM_LG, 63-MSR_TM_LG + mtmsrd r10, 0 + + /* tabort, this dooms the transaction, nothing else */ + li r13, (TM_CAUSE_SYSCALL|TM_CAUSE_PERSISTENT) + TABORT(R13) + + /* + * Return directly to userspace. We have corrupted user register state, + * but userspace will never see that register state. Execution will + * resume after the tbegin of the aborted transaction with the + * checkpointed register state. + */ + li r13, MSR_RI + andc r10, r10, r13 + mtmsrd r10, 1 + mtspr SPRN_SRR0, r11 + mtspr SPRN_SRR1, r12 + + rfid + b . /* prevent speculative execution */ +#endif + /* Save non-volatile GPRs, if not already saved. */ _GLOBAL(save_nvgprs) ld r11,_TRAP(r1) @@ -556,7 +591,7 @@ BEGIN_FTR_SECTION ld r0,THREAD_DSCR(r4) cmpwi r6,0 bne 1f - ld r0,PACA_DSCR(r13) + ld r0,PACA_DSCR_DEFAULT(r13) 1: BEGIN_FTR_SECTION_NESTED(70) mfspr r8, SPRN_FSCR diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 9519e6bdc6d7..0a0399c2af11 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -59,14 +59,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ #if defined(CONFIG_RELOCATABLE) /* - * We can't branch directly; in the direct case we use LR - * and system_call_entry restores LR. (We thus need to move - * LR to r10 in the RFID case too.) + * We can't branch directly so we do it via the CTR which + * is volatile across system calls. */ #define SYSCALL_PSERIES_2_DIRECT \ mflr r10 ; \ ld r12,PACAKBASE(r13) ; \ - LOAD_HANDLER(r12, system_call_entry_direct) ; \ + LOAD_HANDLER(r12, system_call_entry) ; \ mtctr r12 ; \ mfspr r12,SPRN_SRR1 ; \ /* Re-use of r13... 
No spare regs to do this */ \ @@ -80,7 +79,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ mfspr r12,SPRN_SRR1 ; \ li r10,MSR_RI ; \ mtmsrd r10,1 ; /* Set RI (EE=0) */ \ - b system_call_entry_direct ; + b system_call_common ; #endif /* @@ -969,13 +968,6 @@ hv_facility_unavailable_relon_trampoline: __end_interrupts: .align 7 -system_call_entry_direct: -#if defined(CONFIG_RELOCATABLE) - /* The first level prologue may have used LR to get here, saving - * orig in r10. To save hacking/ifdeffing common code, restore here. - */ - mtlr r10 -#endif system_call_entry: b system_call_common diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index b054f33ab1fb..a8e3490b54e3 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -322,11 +322,11 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, ret = entry << tbl->it_page_shift; /* Set the return dma address */ /* Put the TCEs in the HW table */ - build_fail = ppc_md.tce_build(tbl, entry, npages, + build_fail = tbl->it_ops->set(tbl, entry, npages, (unsigned long)page & IOMMU_PAGE_MASK(tbl), direction, attrs); - /* ppc_md.tce_build() only returns non-zero for transient errors. + /* tbl->it_ops->set() only returns non-zero for transient errors. * Clean up the table bitmap in this case and return * DMA_ERROR_CODE. For all other errors the functionality is * not altered. 
@@ -337,8 +337,8 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, } /* Flush/invalidate TLB caches if necessary */ - if (ppc_md.tce_flush) - ppc_md.tce_flush(tbl); + if (tbl->it_ops->flush) + tbl->it_ops->flush(tbl); /* Make sure updates are seen by hardware */ mb(); @@ -408,7 +408,7 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, if (!iommu_free_check(tbl, dma_addr, npages)) return; - ppc_md.tce_free(tbl, entry, npages); + tbl->it_ops->clear(tbl, entry, npages); spin_lock_irqsave(&(pool->lock), flags); bitmap_clear(tbl->it_map, free_entry, npages); @@ -424,8 +424,8 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, * not do an mb() here on purpose, it is not needed on any of * the current platforms. */ - if (ppc_md.tce_flush) - ppc_md.tce_flush(tbl); + if (tbl->it_ops->flush) + tbl->it_ops->flush(tbl); } int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl, @@ -495,7 +495,7 @@ int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl, npages, entry, dma_addr); /* Insert into HW table */ - build_fail = ppc_md.tce_build(tbl, entry, npages, + build_fail = tbl->it_ops->set(tbl, entry, npages, vaddr & IOMMU_PAGE_MASK(tbl), direction, attrs); if(unlikely(build_fail)) @@ -534,8 +534,8 @@ int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl, } /* Flush/invalidate TLB caches if necessary */ - if (ppc_md.tce_flush) - ppc_md.tce_flush(tbl); + if (tbl->it_ops->flush) + tbl->it_ops->flush(tbl); DBG("mapped %d elements:\n", outcount); @@ -600,8 +600,8 @@ void ppc_iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist, * do not do an mb() here, the affected platforms do not need it * when freeing. 
*/ - if (ppc_md.tce_flush) - ppc_md.tce_flush(tbl); + if (tbl->it_ops->flush) + tbl->it_ops->flush(tbl); } static void iommu_table_clear(struct iommu_table *tbl) @@ -613,17 +613,17 @@ static void iommu_table_clear(struct iommu_table *tbl) */ if (!is_kdump_kernel() || is_fadump_active()) { /* Clear the table in case firmware left allocations in it */ - ppc_md.tce_free(tbl, tbl->it_offset, tbl->it_size); + tbl->it_ops->clear(tbl, tbl->it_offset, tbl->it_size); return; } #ifdef CONFIG_CRASH_DUMP - if (ppc_md.tce_get) { + if (tbl->it_ops->get) { unsigned long index, tceval, tcecount = 0; /* Reserve the existing mappings left by the first kernel. */ for (index = 0; index < tbl->it_size; index++) { - tceval = ppc_md.tce_get(tbl, index + tbl->it_offset); + tceval = tbl->it_ops->get(tbl, index + tbl->it_offset); /* * Freed TCE entry contains 0x7fffffffffffffff on JS20 */ @@ -657,6 +657,8 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid) unsigned int i; struct iommu_pool *p; + BUG_ON(!tbl->it_ops); + /* number of bytes needed for the bitmap */ sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long); @@ -713,9 +715,11 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name) unsigned long bitmap_sz; unsigned int order; - if (!tbl || !tbl->it_map) { - printk(KERN_ERR "%s: expected TCE map for %s\n", __func__, - node_name); + if (!tbl) + return; + + if (!tbl->it_map) { + kfree(tbl); return; } @@ -726,13 +730,6 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name) if (tbl->it_offset == 0) clear_bit(0, tbl->it_map); -#ifdef CONFIG_IOMMU_API - if (tbl->it_group) { - iommu_group_put(tbl->it_group); - BUG_ON(tbl->it_group); - } -#endif - /* verify that table contains no entries */ if (!bitmap_empty(tbl->it_map, tbl->it_size)) pr_warn("%s: Unexpected TCEs for %s\n", __func__, node_name); @@ -871,17 +868,33 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size, } } +unsigned long iommu_direction_to_tce_perm(enum 
dma_data_direction dir) +{ + switch (dir) { + case DMA_BIDIRECTIONAL: + return TCE_PCI_READ | TCE_PCI_WRITE; + case DMA_FROM_DEVICE: + return TCE_PCI_WRITE; + case DMA_TO_DEVICE: + return TCE_PCI_READ; + default: + return 0; + } +} +EXPORT_SYMBOL_GPL(iommu_direction_to_tce_perm); + #ifdef CONFIG_IOMMU_API /* * SPAPR TCE API */ static void group_release(void *iommu_data) { - struct iommu_table *tbl = iommu_data; - tbl->it_group = NULL; + struct iommu_table_group *table_group = iommu_data; + + table_group->group = NULL; } -void iommu_register_group(struct iommu_table *tbl, +void iommu_register_group(struct iommu_table_group *table_group, int pci_domain_number, unsigned long pe_num) { struct iommu_group *grp; @@ -893,8 +906,8 @@ void iommu_register_group(struct iommu_table *tbl, PTR_ERR(grp)); return; } - tbl->it_group = grp; - iommu_group_set_iommudata(grp, tbl, group_release); + table_group->group = grp; + iommu_group_set_iommudata(grp, table_group, group_release); name = kasprintf(GFP_KERNEL, "domain%d-pe%lx", pci_domain_number, pe_num); if (!name) @@ -919,8 +932,8 @@ EXPORT_SYMBOL_GPL(iommu_tce_direction); void iommu_flush_tce(struct iommu_table *tbl) { /* Flush/invalidate TLB caches if necessary */ - if (ppc_md.tce_flush) - ppc_md.tce_flush(tbl); + if (tbl->it_ops->flush) + tbl->it_ops->flush(tbl); /* Make sure updates are seen by hardware */ mb(); @@ -931,7 +944,7 @@ int iommu_tce_clear_param_check(struct iommu_table *tbl, unsigned long ioba, unsigned long tce_value, unsigned long npages) { - /* ppc_md.tce_free() does not support any value but 0 */ + /* tbl->it_ops->clear() does not support any value but 0 */ if (tce_value) return -EINVAL; @@ -952,10 +965,7 @@ EXPORT_SYMBOL_GPL(iommu_tce_clear_param_check); int iommu_tce_put_param_check(struct iommu_table *tbl, unsigned long ioba, unsigned long tce) { - if (!(tce & (TCE_PCI_WRITE | TCE_PCI_READ))) - return -EINVAL; - - if (tce & ~(IOMMU_PAGE_MASK(tbl) | TCE_PCI_WRITE | TCE_PCI_READ)) + if (tce & 
~IOMMU_PAGE_MASK(tbl)) return -EINVAL; if (ioba & ~IOMMU_PAGE_MASK(tbl)) @@ -972,68 +982,16 @@ int iommu_tce_put_param_check(struct iommu_table *tbl, } EXPORT_SYMBOL_GPL(iommu_tce_put_param_check); -unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry) -{ - unsigned long oldtce; - struct iommu_pool *pool = get_pool(tbl, entry); - - spin_lock(&(pool->lock)); - - oldtce = ppc_md.tce_get(tbl, entry); - if (oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)) - ppc_md.tce_free(tbl, entry, 1); - else - oldtce = 0; - - spin_unlock(&(pool->lock)); - - return oldtce; -} -EXPORT_SYMBOL_GPL(iommu_clear_tce); - -int iommu_clear_tces_and_put_pages(struct iommu_table *tbl, - unsigned long entry, unsigned long pages) -{ - unsigned long oldtce; - struct page *page; - - for ( ; pages; --pages, ++entry) { - oldtce = iommu_clear_tce(tbl, entry); - if (!oldtce) - continue; - - page = pfn_to_page(oldtce >> PAGE_SHIFT); - WARN_ON(!page); - if (page) { - if (oldtce & TCE_PCI_WRITE) - SetPageDirty(page); - put_page(page); - } - } - - return 0; -} -EXPORT_SYMBOL_GPL(iommu_clear_tces_and_put_pages); - -/* - * hwaddr is a kernel virtual address here (0xc... bazillion), - * tce_build converts it to a physical address. 
- */ -int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, - unsigned long hwaddr, enum dma_data_direction direction) +long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry, + unsigned long *hpa, enum dma_data_direction *direction) { - int ret = -EBUSY; - unsigned long oldtce; - struct iommu_pool *pool = get_pool(tbl, entry); - - spin_lock(&(pool->lock)); + long ret; - oldtce = ppc_md.tce_get(tbl, entry); - /* Add new entry if it is not busy */ - if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ))) - ret = ppc_md.tce_build(tbl, entry, 1, hwaddr, direction, NULL); + ret = tbl->it_ops->exchange(tbl, entry, hpa, direction); - spin_unlock(&(pool->lock)); + if (!ret && ((*direction == DMA_FROM_DEVICE) || + (*direction == DMA_BIDIRECTIONAL))) + SetPageDirty(pfn_to_page(*hpa >> PAGE_SHIFT)); /* if (unlikely(ret)) pr_err("iommu_tce: %s failed on hwaddr=%lx ioba=%lx kva=%lx ret=%d\n", @@ -1042,84 +1000,72 @@ int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, return ret; } -EXPORT_SYMBOL_GPL(iommu_tce_build); +EXPORT_SYMBOL_GPL(iommu_tce_xchg); -int iommu_put_tce_user_mode(struct iommu_table *tbl, unsigned long entry, - unsigned long tce) +int iommu_take_ownership(struct iommu_table *tbl) { - int ret; - struct page *page = NULL; - unsigned long hwaddr, offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK; - enum dma_data_direction direction = iommu_tce_direction(tce); - - ret = get_user_pages_fast(tce & PAGE_MASK, 1, - direction != DMA_TO_DEVICE, &page); - if (unlikely(ret != 1)) { - /* pr_err("iommu_tce: get_user_pages_fast failed tce=%lx ioba=%lx ret=%d\n", - tce, entry << tbl->it_page_shift, ret); */ - return -EFAULT; - } - hwaddr = (unsigned long) page_address(page) + offset; - - ret = iommu_tce_build(tbl, entry, hwaddr, direction); - if (ret) - put_page(page); - - if (ret < 0) - pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%d\n", - __func__, entry << tbl->it_page_shift, tce, ret); + unsigned long flags, i, sz = (tbl->it_size + 7) >> 
3; + int ret = 0; - return ret; -} -EXPORT_SYMBOL_GPL(iommu_put_tce_user_mode); + /* + * VFIO does not control TCE entries allocation and the guest + * can write new TCEs on top of existing ones so iommu_tce_build() + * must be able to release old pages. This functionality + * requires exchange() callback defined so if it is not + * implemented, we disallow taking ownership over the table. + */ + if (!tbl->it_ops->exchange) + return -EINVAL; -int iommu_take_ownership(struct iommu_table *tbl) -{ - unsigned long sz = (tbl->it_size + 7) >> 3; + spin_lock_irqsave(&tbl->large_pool.lock, flags); + for (i = 0; i < tbl->nr_pools; i++) + spin_lock(&tbl->pools[i].lock); if (tbl->it_offset == 0) clear_bit(0, tbl->it_map); if (!bitmap_empty(tbl->it_map, tbl->it_size)) { pr_err("iommu_tce: it_map is not empty"); - return -EBUSY; + ret = -EBUSY; + /* Restore bit#0 set by iommu_init_table() */ + if (tbl->it_offset == 0) + set_bit(0, tbl->it_map); + } else { + memset(tbl->it_map, 0xff, sz); } - memset(tbl->it_map, 0xff, sz); - iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size); + for (i = 0; i < tbl->nr_pools; i++) + spin_unlock(&tbl->pools[i].lock); + spin_unlock_irqrestore(&tbl->large_pool.lock, flags); - /* - * Disable iommu bypass, otherwise the user can DMA to all of - * our physical memory via the bypass window instead of just - * the pages that has been explicitly mapped into the iommu - */ - if (tbl->set_bypass) - tbl->set_bypass(tbl, false); - - return 0; + return ret; } EXPORT_SYMBOL_GPL(iommu_take_ownership); void iommu_release_ownership(struct iommu_table *tbl) { - unsigned long sz = (tbl->it_size + 7) >> 3; + unsigned long flags, i, sz = (tbl->it_size + 7) >> 3; + + spin_lock_irqsave(&tbl->large_pool.lock, flags); + for (i = 0; i < tbl->nr_pools; i++) + spin_lock(&tbl->pools[i].lock); - iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size); memset(tbl->it_map, 0, sz); /* Restore bit#0 set by iommu_init_table() */ if (tbl->it_offset == 0) 
set_bit(0, tbl->it_map); - /* The kernel owns the device now, we can restore the iommu bypass */ - if (tbl->set_bypass) - tbl->set_bypass(tbl, true); + for (i = 0; i < tbl->nr_pools; i++) + spin_unlock(&tbl->pools[i].lock); + spin_unlock_irqrestore(&tbl->large_pool.lock, flags); } EXPORT_SYMBOL_GPL(iommu_release_ownership); int iommu_add_device(struct device *dev) { struct iommu_table *tbl; + struct iommu_table_group_link *tgl; /* * The sysfs entries should be populated before @@ -1137,15 +1083,22 @@ int iommu_add_device(struct device *dev) } tbl = get_iommu_table_base(dev); - if (!tbl || !tbl->it_group) { + if (!tbl) { pr_debug("%s: Skipping device %s with no tbl\n", __func__, dev_name(dev)); return 0; } + tgl = list_first_entry_or_null(&tbl->it_group_list, + struct iommu_table_group_link, next); + if (!tgl) { + pr_debug("%s: Skipping device %s with no group\n", + __func__, dev_name(dev)); + return 0; + } pr_debug("%s: Adding %s to iommu group %d\n", __func__, dev_name(dev), - iommu_group_id(tbl->it_group)); + iommu_group_id(tgl->table_group->group)); if (PAGE_SIZE < IOMMU_PAGE_SIZE(tbl)) { pr_err("%s: Invalid IOMMU page size %lx (%lx) on %s\n", @@ -1154,7 +1107,7 @@ int iommu_add_device(struct device *dev) return -EINVAL; } - return iommu_group_add_device(tbl->it_group, dev); + return iommu_group_add_device(tgl->table_group->group, dev); } EXPORT_SYMBOL_GPL(iommu_add_device); diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 0d054068a21d..b9de34d44fcb 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -89,6 +89,7 @@ struct pci_controller *pcibios_alloc_controller(struct device_node *dev) #endif return phb; } +EXPORT_SYMBOL_GPL(pcibios_alloc_controller); void pcibios_free_controller(struct pci_controller *phb) { @@ -1447,6 +1448,7 @@ void pcibios_claim_one_bus(struct pci_bus *bus) list_for_each_entry(child_bus, &bus->children, node) pcibios_claim_one_bus(child_bus); } 
+EXPORT_SYMBOL_GPL(pcibios_claim_one_bus); /* pcibios_finish_adding_to_bus @@ -1488,6 +1490,14 @@ int pcibios_enable_device(struct pci_dev *dev, int mask) return pci_enable_resources(dev, mask); } +void pcibios_disable_device(struct pci_dev *dev) +{ + struct pci_controller *phb = pci_bus_to_host(dev->bus); + + if (phb->controller_ops.disable_device) + phb->controller_ops.disable_device(dev); +} + resource_size_t pcibios_io_space_offset(struct pci_controller *hose) { return (unsigned long) hose->io_base_virt - _IO_BASE; @@ -1680,6 +1690,7 @@ void pcibios_scan_phb(struct pci_controller *hose) pcie_bus_configure_settings(child); } } +EXPORT_SYMBOL_GPL(pcibios_scan_phb); static void fixup_hide_host_resource_fsl(struct pci_dev *dev) { diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c index 7ed85a69a9c2..7f9ed0c1f6b9 100644 --- a/arch/powerpc/kernel/pci-hotplug.c +++ b/arch/powerpc/kernel/pci-hotplug.c @@ -29,7 +29,12 @@ */ void pcibios_release_device(struct pci_dev *dev) { + struct pci_controller *phb = pci_bus_to_host(dev->bus); + eeh_remove_device(dev); + + if (phb->controller_ops.release_device) + phb->controller_ops.release_device(dev); } /** diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index febb50dd5328..8005e18d1b40 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1112,7 +1112,6 @@ static void setup_ksp_vsid(struct task_struct *p, unsigned long sp) /* * Copy a thread.. 
*/ -extern unsigned long dscr_default; /* defined in arch/powerpc/kernel/sysfs.c */ /* * Copy architecture-specific thread state diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 9fe3dcdbfca7..bdcbb716f4d6 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -687,6 +687,9 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_PPC_64K_PAGES init_mm.context.pte_frag = NULL; #endif +#ifdef CONFIG_SPAPR_TCE_IOMMU + mm_iommu_init(&init_mm.context); +#endif irqstack_early_init(); exc_lvl_early_init(); emergency_stack_init(); diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c index fa1fd8a0c867..692873bff334 100644 --- a/arch/powerpc/kernel/sysfs.c +++ b/arch/powerpc/kernel/sysfs.c @@ -496,13 +496,34 @@ static DEVICE_ATTR(spurr, 0400, show_spurr, NULL); static DEVICE_ATTR(purr, 0400, show_purr, store_purr); static DEVICE_ATTR(pir, 0400, show_pir, NULL); +/* + * This is the system wide DSCR register default value. Any + * change to this default value through the sysfs interface + * will update all per cpu DSCR default values across the + * system stored in their respective PACA structures. + */ static unsigned long dscr_default; +/** + * read_dscr() - Fetch the cpu specific DSCR default + * @val: Returned cpu specific DSCR default value + * + * This function returns the per cpu DSCR default value + * for any cpu which is contained in it's PACA structure. + */ static void read_dscr(void *val) { *(unsigned long *)val = get_paca()->dscr_default; } + +/** + * write_dscr() - Update the cpu specific DSCR default + * @val: New cpu specific DSCR default value to update + * + * This function updates the per cpu DSCR default value + * for any cpu which is contained in it's PACA structure. 
+ */ static void write_dscr(void *val) { get_paca()->dscr_default = *(unsigned long *)val; @@ -520,12 +541,29 @@ static void add_write_permission_dev_attr(struct device_attribute *attr) attr->attr.mode |= 0200; } +/** + * show_dscr_default() - Fetch the system wide DSCR default + * @dev: Device structure + * @attr: Device attribute structure + * @buf: Interface buffer + * + * This function returns the system wide DSCR default value. + */ static ssize_t show_dscr_default(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%lx\n", dscr_default); } +/** + * store_dscr_default() - Update the system wide DSCR default + * @dev: Device structure + * @attr: Device attribute structure + * @buf: Interface buffer + * @count: Size of the update + * + * This function updates the system wide DSCR default value. + */ static ssize_t __used store_dscr_default(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S index 5754b226da7e..bf8f34a58670 100644 --- a/arch/powerpc/kernel/tm.S +++ b/arch/powerpc/kernel/tm.S @@ -293,7 +293,7 @@ dont_backup_fp: ld r2, STK_GOT(r1) /* Load CPU's default DSCR */ - ld r0, PACA_DSCR(r13) + ld r0, PACA_DSCR_DEFAULT(r13) mtspr SPRN_DSCR, r0 blr @@ -473,7 +473,7 @@ restore_gprs: ld r2, STK_GOT(r1) /* Load CPU's default DSCR */ - ld r0, PACA_DSCR(r13) + ld r0, PACA_DSCR_DEFAULT(r13) mtspr SPRN_DSCR, r0 blr diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 19e4744b6eba..6530f1b8874d 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -1377,6 +1377,7 @@ void facility_unavailable_exception(struct pt_regs *regs) }; char *facility = "unknown"; u64 value; + u32 instword, rd; u8 status; bool hv; @@ -1388,12 +1389,46 @@ void facility_unavailable_exception(struct pt_regs *regs) status = value >> 56; if (status == FSCR_DSCR_LG) { - /* User is acessing the DSCR. 
Set the inherit bit and allow - * the user to set it directly in future by setting via the - * FSCR DSCR bit. We always leave HFSCR DSCR set. + /* + * User is accessing the DSCR register using the problem + * state only SPR number (0x03) either through a mfspr or + * a mtspr instruction. If it is a write attempt through + * a mtspr, then we set the inherit bit. This also allows + * the user to write or read the register directly in the + * future by setting via the FSCR DSCR bit. But in case it + * is a read DSCR attempt through a mfspr instruction, we + * just emulate the instruction instead. This code path will + * always emulate all the mfspr instructions till the user + * has attempted atleast one mtspr instruction. This way it + * preserves the same behaviour when the user is accessing + * the DSCR through privilege level only SPR number (0x11) + * which is emulated through illegal instruction exception. + * We always leave HFSCR DSCR set. */ - current->thread.dscr_inherit = 1; - mtspr(SPRN_FSCR, value | FSCR_DSCR); + if (get_user(instword, (u32 __user *)(regs->nip))) { + pr_err("Failed to fetch the user instruction\n"); + return; + } + + /* Write into DSCR (mtspr 0x03, RS) */ + if ((instword & PPC_INST_MTSPR_DSCR_USER_MASK) + == PPC_INST_MTSPR_DSCR_USER) { + rd = (instword >> 21) & 0x1f; + current->thread.dscr = regs->gpr[rd]; + current->thread.dscr_inherit = 1; + mtspr(SPRN_FSCR, value | FSCR_DSCR); + } + + /* Read from DSCR (mfspr RT, 0x03) */ + if ((instword & PPC_INST_MFSPR_DSCR_USER_MASK) + == PPC_INST_MFSPR_DSCR_USER) { + if (emulate_instruction(regs)) { + pr_err("DSCR based mfspr emulation failed\n"); + return; + } + regs->nip += 4; + emulate_single_step(regs); + } return; } diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c index 5bfdab9047be..b41426c60ef6 100644 --- a/arch/powerpc/kernel/vio.c +++ b/arch/powerpc/kernel/vio.c @@ -1196,6 +1196,11 @@ static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev) tbl->it_type = 
TCE_VB; tbl->it_blocksize = 16; + if (firmware_has_feature(FW_FEATURE_LPAR)) + tbl->it_ops = &iommu_table_lpar_multi_ops; + else + tbl->it_ops = &iommu_table_pseries_ops; + return iommu_init_table(tbl, -1); } diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 4d70df26c402..faa86e9c0551 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -324,7 +324,7 @@ kvm_start_guest: kvm_secondary_got_guest: /* Set HSTATE_DSCR(r13) to something sensible */ - ld r6, PACA_DSCR(r13) + ld r6, PACA_DSCR_DEFAULT(r13) std r6, HSTATE_DSCR(r13) /* Order load of vcore, ptid etc. after load of vcpu */ diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index 7902802a19a5..a47e14277fd8 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -33,6 +33,6 @@ obj-$(CONFIG_PPC_LIB_RHEAP) += rheap.o obj-$(CONFIG_FTR_FIXUP_SELFTEST) += feature-fixups-test.o obj-$(CONFIG_ALTIVEC) += xor_vmx.o -CFLAGS_xor_vmx.o += -maltivec -mabi=altivec +CFLAGS_xor_vmx.o += -maltivec $(call cc-option,-mabi=altivec) obj-$(CONFIG_PPC64) += $(obj64-y) diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 9c8770b5f96f..3eb73a38220d 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -36,3 +36,4 @@ obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o obj-$(CONFIG_HIGHMEM) += highmem.o obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o +obj-$(CONFIG_SPAPR_TCE_IOMMU) += mmu_context_iommu.o diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c index f031a47d7701..6527882ce05e 100644 --- a/arch/powerpc/mm/copro_fault.c +++ b/arch/powerpc/mm/copro_fault.c @@ -26,7 +26,7 @@ #include <asm/reg.h> #include <asm/copro.h> #include <asm/spu.h> -#include <misc/cxl.h> +#include <misc/cxl-base.h> /* * This ought to be kept in sync with the powerpc specific do_page_fault @@ -100,7 +100,7 @@ 
EXPORT_SYMBOL_GPL(copro_handle_mm_fault); int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb) { - u64 vsid; + u64 vsid, vsidkey; int psize, ssize; switch (REGION_ID(ea)) { @@ -109,6 +109,7 @@ int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb) psize = get_slice_psize(mm, ea); ssize = user_segment_size(ea); vsid = get_vsid(mm->context.id, ea, ssize); + vsidkey = SLB_VSID_USER; break; case VMALLOC_REGION_ID: pr_devel("%s: 0x%llx -- VMALLOC_REGION_ID\n", __func__, ea); @@ -118,19 +119,21 @@ int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb) psize = mmu_io_psize; ssize = mmu_kernel_ssize; vsid = get_kernel_vsid(ea, mmu_kernel_ssize); + vsidkey = SLB_VSID_KERNEL; break; case KERNEL_REGION_ID: pr_devel("%s: 0x%llx -- KERNEL_REGION_ID\n", __func__, ea); psize = mmu_linear_psize; ssize = mmu_kernel_ssize; vsid = get_kernel_vsid(ea, mmu_kernel_ssize); + vsidkey = SLB_VSID_KERNEL; break; default: pr_debug("%s: invalid region access at %016llx\n", __func__, ea); return 1; } - vsid = (vsid << slb_vsid_shift(ssize)) | SLB_VSID_USER; + vsid = (vsid << slb_vsid_shift(ssize)) | vsidkey; vsid |= mmu_psize_defs[psize].sllp | ((ssize == MMU_SEGSIZE_1T) ? SLB_VSID_B_1T : 0); diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c index 9c4880ddecd6..13befa35d8a8 100644 --- a/arch/powerpc/mm/hash_native_64.c +++ b/arch/powerpc/mm/hash_native_64.c @@ -29,7 +29,7 @@ #include <asm/kexec.h> #include <asm/ppc-opcode.h> -#include <misc/cxl.h> +#include <misc/cxl-base.h> #ifdef DEBUG_LOW #define DBG_LOW(fmt...) udbg_printf(fmt) diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index fda236f908eb..5ec987f65b2c 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -57,6 +57,7 @@ #include <asm/fadump.h> #include <asm/firmware.h> #include <asm/tm.h> +#include <asm/trace.h> #ifdef DEBUG #define DBG(fmt...) 
udbg_printf(fmt) @@ -1004,6 +1005,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, DBG_LOW("hash_page(ea=%016lx, access=%lx, trap=%lx\n", ea, access, trap); + trace_hash_fault(ea, access, trap); /* Get region & vsid */ switch (REGION_ID(ea)) { @@ -1475,7 +1477,7 @@ static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi) unsigned long hash; unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize); unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize); - unsigned long mode = htab_convert_pte_flags(PAGE_KERNEL); + unsigned long mode = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL)); long ret; hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize); diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c index 178876aef40f..4e4efbc2658e 100644 --- a/arch/powerpc/mm/mmu_context_hash64.c +++ b/arch/powerpc/mm/mmu_context_hash64.c @@ -89,6 +89,9 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) #ifdef CONFIG_PPC_64K_PAGES mm->context.pte_frag = NULL; #endif +#ifdef CONFIG_SPAPR_TCE_IOMMU + mm_iommu_init(&mm->context); +#endif return 0; } @@ -132,6 +135,9 @@ static inline void destroy_pagetable_page(struct mm_struct *mm) void destroy_context(struct mm_struct *mm) { +#ifdef CONFIG_SPAPR_TCE_IOMMU + mm_iommu_cleanup(&mm->context); +#endif #ifdef CONFIG_PPC_ICSWX drop_cop(mm->context.acop, mm); diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c new file mode 100644 index 000000000000..da6a2168ae9e --- /dev/null +++ b/arch/powerpc/mm/mmu_context_iommu.c @@ -0,0 +1,316 @@ +/* + * IOMMU helpers in MMU context. + * + * Copyright (C) 2015 IBM Corp. <aik@ozlabs.ru> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/rculist.h> +#include <linux/vmalloc.h> +#include <linux/mutex.h> +#include <asm/mmu_context.h> + +static DEFINE_MUTEX(mem_list_mutex); + +struct mm_iommu_table_group_mem_t { + struct list_head next; + struct rcu_head rcu; + unsigned long used; + atomic64_t mapped; + u64 ua; /* userspace address */ + u64 entries; /* number of entries in hpas[] */ + u64 *hpas; /* vmalloc'ed */ +}; + +static long mm_iommu_adjust_locked_vm(struct mm_struct *mm, + unsigned long npages, bool incr) +{ + long ret = 0, locked, lock_limit; + + if (!npages) + return 0; + + down_write(&mm->mmap_sem); + + if (incr) { + locked = mm->locked_vm + npages; + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) + ret = -ENOMEM; + else + mm->locked_vm += npages; + } else { + if (WARN_ON_ONCE(npages > mm->locked_vm)) + npages = mm->locked_vm; + mm->locked_vm -= npages; + } + + pr_debug("[%d] RLIMIT_MEMLOCK HASH64 %c%ld %ld/%ld\n", + current->pid, + incr ? '+' : '-', + npages << PAGE_SHIFT, + mm->locked_vm << PAGE_SHIFT, + rlimit(RLIMIT_MEMLOCK)); + up_write(&mm->mmap_sem); + + return ret; +} + +bool mm_iommu_preregistered(void) +{ + if (!current || !current->mm) + return false; + + return !list_empty(&current->mm->context.iommu_group_mem_list); +} +EXPORT_SYMBOL_GPL(mm_iommu_preregistered); + +long mm_iommu_get(unsigned long ua, unsigned long entries, + struct mm_iommu_table_group_mem_t **pmem) +{ + struct mm_iommu_table_group_mem_t *mem; + long i, j, ret = 0, locked_entries = 0; + struct page *page = NULL; + + if (!current || !current->mm) + return -ESRCH; /* process exited */ + + mutex_lock(&mem_list_mutex); + + list_for_each_entry_rcu(mem, &current->mm->context.iommu_group_mem_list, + next) { + if ((mem->ua == ua) && (mem->entries == entries)) { + ++mem->used; + *pmem = mem; + goto unlock_exit; + } + + /* Overlap? 
*/ + if ((mem->ua < (ua + (entries << PAGE_SHIFT))) && + (ua < (mem->ua + + (mem->entries << PAGE_SHIFT)))) { + ret = -EINVAL; + goto unlock_exit; + } + + } + + ret = mm_iommu_adjust_locked_vm(current->mm, entries, true); + if (ret) + goto unlock_exit; + + locked_entries = entries; + + mem = kzalloc(sizeof(*mem), GFP_KERNEL); + if (!mem) { + ret = -ENOMEM; + goto unlock_exit; + } + + mem->hpas = vzalloc(entries * sizeof(mem->hpas[0])); + if (!mem->hpas) { + kfree(mem); + ret = -ENOMEM; + goto unlock_exit; + } + + for (i = 0; i < entries; ++i) { + if (1 != get_user_pages_fast(ua + (i << PAGE_SHIFT), + 1/* pages */, 1/* iswrite */, &page)) { + for (j = 0; j < i; ++j) + put_page(pfn_to_page( + mem->hpas[j] >> PAGE_SHIFT)); + vfree(mem->hpas); + kfree(mem); + ret = -EFAULT; + goto unlock_exit; + } + + mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT; + } + + atomic64_set(&mem->mapped, 1); + mem->used = 1; + mem->ua = ua; + mem->entries = entries; + *pmem = mem; + + list_add_rcu(&mem->next, &current->mm->context.iommu_group_mem_list); + +unlock_exit: + if (locked_entries && ret) + mm_iommu_adjust_locked_vm(current->mm, locked_entries, false); + + mutex_unlock(&mem_list_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(mm_iommu_get); + +static void mm_iommu_unpin(struct mm_iommu_table_group_mem_t *mem) +{ + long i; + struct page *page = NULL; + + for (i = 0; i < mem->entries; ++i) { + if (!mem->hpas[i]) + continue; + + page = pfn_to_page(mem->hpas[i] >> PAGE_SHIFT); + if (!page) + continue; + + put_page(page); + mem->hpas[i] = 0; + } +} + +static void mm_iommu_do_free(struct mm_iommu_table_group_mem_t *mem) +{ + + mm_iommu_unpin(mem); + vfree(mem->hpas); + kfree(mem); +} + +static void mm_iommu_free(struct rcu_head *head) +{ + struct mm_iommu_table_group_mem_t *mem = container_of(head, + struct mm_iommu_table_group_mem_t, rcu); + + mm_iommu_do_free(mem); +} + +static void mm_iommu_release(struct mm_iommu_table_group_mem_t *mem) +{ + list_del_rcu(&mem->next); + 
mm_iommu_adjust_locked_vm(current->mm, mem->entries, false); + call_rcu(&mem->rcu, mm_iommu_free); +} + +long mm_iommu_put(struct mm_iommu_table_group_mem_t *mem) +{ + long ret = 0; + + if (!current || !current->mm) + return -ESRCH; /* process exited */ + + mutex_lock(&mem_list_mutex); + + if (mem->used == 0) { + ret = -ENOENT; + goto unlock_exit; + } + + --mem->used; + /* There are still users, exit */ + if (mem->used) + goto unlock_exit; + + /* Are there still mappings? */ + if (atomic_cmpxchg(&mem->mapped, 1, 0) != 1) { + ++mem->used; + ret = -EBUSY; + goto unlock_exit; + } + + /* @mapped became 0 so now mappings are disabled, release the region */ + mm_iommu_release(mem); + +unlock_exit: + mutex_unlock(&mem_list_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(mm_iommu_put); + +struct mm_iommu_table_group_mem_t *mm_iommu_lookup(unsigned long ua, + unsigned long size) +{ + struct mm_iommu_table_group_mem_t *mem, *ret = NULL; + + list_for_each_entry_rcu(mem, + &current->mm->context.iommu_group_mem_list, + next) { + if ((mem->ua <= ua) && + (ua + size <= mem->ua + + (mem->entries << PAGE_SHIFT))) { + ret = mem; + break; + } + } + + return ret; +} +EXPORT_SYMBOL_GPL(mm_iommu_lookup); + +struct mm_iommu_table_group_mem_t *mm_iommu_find(unsigned long ua, + unsigned long entries) +{ + struct mm_iommu_table_group_mem_t *mem, *ret = NULL; + + list_for_each_entry_rcu(mem, + &current->mm->context.iommu_group_mem_list, + next) { + if ((mem->ua == ua) && (mem->entries == entries)) { + ret = mem; + break; + } + } + + return ret; +} +EXPORT_SYMBOL_GPL(mm_iommu_find); + +long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem, + unsigned long ua, unsigned long *hpa) +{ + const long entry = (ua - mem->ua) >> PAGE_SHIFT; + u64 *va = &mem->hpas[entry]; + + if (entry >= mem->entries) + return -EFAULT; + + *hpa = *va | (ua & ~PAGE_MASK); + + return 0; +} +EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa); + +long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem) +{ + if 
(atomic64_inc_not_zero(&mem->mapped)) + return 0; + + /* Last mm_iommu_put() has been called, no more mappings allowed() */ + return -ENXIO; +} +EXPORT_SYMBOL_GPL(mm_iommu_mapped_inc); + +void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem) +{ + atomic64_add_unless(&mem->mapped, -1, 1); +} +EXPORT_SYMBOL_GPL(mm_iommu_mapped_dec); + +void mm_iommu_init(mm_context_t *ctx) +{ + INIT_LIST_HEAD_RCU(&ctx->iommu_group_mem_list); +} + +void mm_iommu_cleanup(mm_context_t *ctx) +{ + struct mm_iommu_table_group_mem_t *mem, *tmp; + + list_for_each_entry_safe(mem, tmp, &ctx->iommu_group_mem_list, next) { + list_del_rcu(&mem->next); + mm_iommu_do_free(mem); + } +} diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 12b638425bb9..d90893b76e7c 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -131,7 +131,16 @@ static void pmao_restore_workaround(bool ebb) { } static bool regs_use_siar(struct pt_regs *regs) { - return !!regs->result; + /* + * When we take a performance monitor exception the regs are setup + * using perf_read_regs() which overloads some fields, in particular + * regs->result to tell us whether to use SIAR. + * + * However if the regs are from another exception, eg. a syscall, then + * they have not been setup using perf_read_regs() and so regs->result + * is something random. 
+ */ + return ((TRAP(regs) == 0xf00) && regs->result); } /* diff --git a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c index c949ca055712..63016621aff8 100644 --- a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c +++ b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c @@ -193,7 +193,7 @@ static struct irq_chip mpc52xx_gpt_irq_chip = { void mpc52xx_gpt_irq_cascade(unsigned int virq, struct irq_desc *desc) { - struct mpc52xx_gpt_priv *gpt = irq_get_handler_data(virq); + struct mpc52xx_gpt_priv *gpt = irq_desc_get_handler_data(desc); int sub_virq; u32 status; diff --git a/arch/powerpc/platforms/cell/axon_msi.c b/arch/powerpc/platforms/cell/axon_msi.c index d9b8a443458f..fe51de4fcf13 100644 --- a/arch/powerpc/platforms/cell/axon_msi.c +++ b/arch/powerpc/platforms/cell/axon_msi.c @@ -96,7 +96,7 @@ static void msic_dcr_write(struct axon_msic *msic, unsigned int dcr_n, u32 val) static void axon_msi_cascade(unsigned int irq, struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); - struct axon_msic *msic = irq_get_handler_data(irq); + struct axon_msic *msic = irq_desc_get_handler_data(desc); u32 write_offset, msi; int idx; int retry = 0; diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c index 21b502398bf3..14a582b21274 100644 --- a/arch/powerpc/platforms/cell/iommu.c +++ b/arch/powerpc/platforms/cell/iommu.c @@ -466,6 +466,11 @@ static inline u32 cell_iommu_get_ioid(struct device_node *np) return *ioid; } +static struct iommu_table_ops cell_iommu_ops = { + .set = tce_build_cell, + .clear = tce_free_cell +}; + static struct iommu_window * __init cell_iommu_setup_window(struct cbe_iommu *iommu, struct device_node *np, unsigned long offset, unsigned long size, @@ -492,6 +497,7 @@ cell_iommu_setup_window(struct cbe_iommu *iommu, struct device_node *np, window->table.it_offset = (offset >> window->table.it_page_shift) + pte_offset; window->table.it_size = size >> window->table.it_page_shift; + 
window->table.it_ops = &cell_iommu_ops; iommu_init_table(&window->table, iommu->nid); @@ -1201,8 +1207,6 @@ static int __init cell_iommu_init(void) /* Setup various callbacks */ cell_pci_controller_ops.dma_dev_setup = cell_pci_dma_dev_setup; ppc_md.dma_get_required_mask = cell_dma_get_required_mask; - ppc_md.tce_build = tce_build_cell; - ppc_md.tce_free = tce_free_cell; if (!iommu_fixed_disabled && cell_iommu_fixed_mapping_init() == 0) goto bail; diff --git a/arch/powerpc/platforms/embedded6xx/hlwd-pic.c b/arch/powerpc/platforms/embedded6xx/hlwd-pic.c index c269caee58f9..9dd154d6f89a 100644 --- a/arch/powerpc/platforms/embedded6xx/hlwd-pic.c +++ b/arch/powerpc/platforms/embedded6xx/hlwd-pic.c @@ -124,7 +124,7 @@ static void hlwd_pic_irq_cascade(unsigned int cascade_virq, struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); - struct irq_domain *irq_domain = irq_get_handler_data(cascade_virq); + struct irq_domain *irq_domain = irq_desc_get_handler_data(desc); unsigned int virq; raw_spin_lock(&desc->lock); diff --git a/arch/powerpc/platforms/pasemi/iommu.c b/arch/powerpc/platforms/pasemi/iommu.c index b8f567b2ea19..c929644e74a6 100644 --- a/arch/powerpc/platforms/pasemi/iommu.c +++ b/arch/powerpc/platforms/pasemi/iommu.c @@ -134,6 +134,10 @@ static void iobmap_free(struct iommu_table *tbl, long index, } } +static struct iommu_table_ops iommu_table_iobmap_ops = { + .set = iobmap_build, + .clear = iobmap_free +}; static void iommu_table_iobmap_setup(void) { @@ -153,6 +157,7 @@ static void iommu_table_iobmap_setup(void) * Should probably be 8 (64 bytes) */ iommu_table_iobmap.it_blocksize = 4; + iommu_table_iobmap.it_ops = &iommu_table_iobmap_ops; iommu_init_table(&iommu_table_iobmap, 0); pr_debug(" <- %s\n", __func__); } @@ -252,8 +257,6 @@ void __init iommu_init_early_pasemi(void) pasemi_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pasemi; pasemi_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pasemi; - ppc_md.tce_build = iobmap_build; 
- ppc_md.tce_free = iobmap_free; set_pci_dma_ops(&dma_iommu_ops); } diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig index 4b044d8cb49a..604190cab522 100644 --- a/arch/powerpc/platforms/powernv/Kconfig +++ b/arch/powerpc/platforms/powernv/Kconfig @@ -19,3 +19,10 @@ config PPC_POWERNV select CPU_FREQ_GOV_CONSERVATIVE select PPC_DOORBELL default y + +config OPAL_PRD + tristate 'OPAL PRD driver' + depends on PPC_POWERNV + help + This enables the opal-prd driver, a facility to run processor + recovery diagnostics on OpenPower machines diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index 26bd7e417b7c..1c8cdb6250e7 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -9,3 +9,4 @@ obj-$(CONFIG_EEH) += eeh-powernv.o obj-$(CONFIG_PPC_SCOM) += opal-xscom.o obj-$(CONFIG_MEMORY_FAILURE) += opal-memory-errors.o obj-$(CONFIG_TRACEPOINTS) += opal-tracepoints.o +obj-$(CONFIG_OPAL_PRD) += opal-prd.o diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index bd39a120bd60..59d735d2e5c0 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -17,6 +17,7 @@ #include <linux/cpu.h> #include <asm/firmware.h> +#include <asm/machdep.h> #include <asm/opal.h> #include <asm/cputhreads.h> #include <asm/cpuidle.h> @@ -289,5 +290,4 @@ out_free: out: return 0; } - -subsys_initcall(pnv_init_idle_states); +machine_subsys_initcall(powernv, pnv_init_idle_states); diff --git a/arch/powerpc/platforms/powernv/opal-irqchip.c b/arch/powerpc/platforms/powernv/opal-irqchip.c index 841135f48981..e2e7d75f52f3 100644 --- a/arch/powerpc/platforms/powernv/opal-irqchip.c +++ b/arch/powerpc/platforms/powernv/opal-irqchip.c @@ -231,6 +231,7 @@ out: of_node_put(opal_node); return rc; } +machine_arch_initcall(powernv, opal_event_init); /** * opal_event_request(unsigned int opal_event_nr) - Request an 
event @@ -244,6 +245,9 @@ out: */ int opal_event_request(unsigned int opal_event_nr) { + if (WARN_ON_ONCE(!opal_event_irqchip.domain)) + return NO_IRQ; + return irq_create_mapping(opal_event_irqchip.domain, opal_event_nr); } EXPORT_SYMBOL(opal_event_request); diff --git a/arch/powerpc/platforms/powernv/opal-prd.c b/arch/powerpc/platforms/powernv/opal-prd.c new file mode 100644 index 000000000000..46cb3feb0a13 --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-prd.c @@ -0,0 +1,449 @@ +/* + * OPAL Runtime Diagnostics interface driver + * Supported on POWERNV platform + * + * Copyright IBM Corporation 2015 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#define pr_fmt(fmt) "opal-prd: " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/platform_device.h> +#include <linux/miscdevice.h> +#include <linux/fs.h> +#include <linux/of.h> +#include <linux/of_address.h> +#include <linux/poll.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <asm/opal-prd.h> +#include <asm/opal.h> +#include <asm/io.h> +#include <asm/uaccess.h> + + +/** + * The msg member must be at the end of the struct, as it's followed by the + * message data. 
+ */ +struct opal_prd_msg_queue_item { + struct list_head list; + struct opal_prd_msg_header msg; +}; + +static struct device_node *prd_node; +static LIST_HEAD(opal_prd_msg_queue); +static DEFINE_SPINLOCK(opal_prd_msg_queue_lock); +static DECLARE_WAIT_QUEUE_HEAD(opal_prd_msg_wait); +static atomic_t prd_usage; + +static bool opal_prd_range_is_valid(uint64_t addr, uint64_t size) +{ + struct device_node *parent, *node; + bool found; + + if (addr + size < addr) + return false; + + parent = of_find_node_by_path("/reserved-memory"); + if (!parent) + return false; + + found = false; + + for_each_child_of_node(parent, node) { + uint64_t range_addr, range_size, range_end; + const __be32 *addrp; + const char *label; + + addrp = of_get_address(node, 0, &range_size, NULL); + + range_addr = of_read_number(addrp, 2); + range_end = range_addr + range_size; + + label = of_get_property(node, "ibm,prd-label", NULL); + + /* PRD ranges need a label */ + if (!label) + continue; + + if (range_end <= range_addr) + continue; + + if (addr >= range_addr && addr + size <= range_end) { + found = true; + of_node_put(node); + break; + } + } + + of_node_put(parent); + return found; +} + +static int opal_prd_open(struct inode *inode, struct file *file) +{ + /* + * Prevent multiple (separate) processes from concurrent interactions + * with the FW PRD channel + */ + if (atomic_xchg(&prd_usage, 1) == 1) + return -EBUSY; + + return 0; +} + +/* + * opal_prd_mmap - maps firmware-provided ranges into userspace + * @file: file structure for the device + * @vma: VMA to map the registers into + */ + +static int opal_prd_mmap(struct file *file, struct vm_area_struct *vma) +{ + size_t addr, size; + int rc; + + pr_devel("opal_prd_mmap(0x%016lx, 0x%016lx, 0x%lx, 0x%lx)\n", + vma->vm_start, vma->vm_end, vma->vm_pgoff, + vma->vm_flags); + + addr = vma->vm_pgoff << PAGE_SHIFT; + size = vma->vm_end - vma->vm_start; + + /* ensure we're mapping within one of the allowable ranges */ + if 
(!opal_prd_range_is_valid(addr, size)) + return -EINVAL; + + vma->vm_page_prot = __pgprot(pgprot_val(phys_mem_access_prot(file, + vma->vm_pgoff, + size, vma->vm_page_prot)) + | _PAGE_SPECIAL); + + rc = remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, size, + vma->vm_page_prot); + + return rc; +} + +static bool opal_msg_queue_empty(void) +{ + unsigned long flags; + bool ret; + + spin_lock_irqsave(&opal_prd_msg_queue_lock, flags); + ret = list_empty(&opal_prd_msg_queue); + spin_unlock_irqrestore(&opal_prd_msg_queue_lock, flags); + + return ret; +} + +static unsigned int opal_prd_poll(struct file *file, + struct poll_table_struct *wait) +{ + poll_wait(file, &opal_prd_msg_wait, wait); + + if (!opal_msg_queue_empty()) + return POLLIN | POLLRDNORM; + + return 0; +} + +static ssize_t opal_prd_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct opal_prd_msg_queue_item *item; + unsigned long flags; + ssize_t size, err; + int rc; + + /* we need at least a header's worth of data */ + if (count < sizeof(item->msg)) + return -EINVAL; + + if (*ppos) + return -ESPIPE; + + item = NULL; + + for (;;) { + + spin_lock_irqsave(&opal_prd_msg_queue_lock, flags); + if (!list_empty(&opal_prd_msg_queue)) { + item = list_first_entry(&opal_prd_msg_queue, + struct opal_prd_msg_queue_item, list); + list_del(&item->list); + } + spin_unlock_irqrestore(&opal_prd_msg_queue_lock, flags); + + if (item) + break; + + if (file->f_flags & O_NONBLOCK) + return -EAGAIN; + + rc = wait_event_interruptible(opal_prd_msg_wait, + !opal_msg_queue_empty()); + if (rc) + return -EINTR; + } + + size = be16_to_cpu(item->msg.size); + if (size > count) { + err = -EINVAL; + goto err_requeue; + } + + rc = copy_to_user(buf, &item->msg, size); + if (rc) { + err = -EFAULT; + goto err_requeue; + } + + kfree(item); + + return size; + +err_requeue: + /* eep! 
re-queue at the head of the list */ + spin_lock_irqsave(&opal_prd_msg_queue_lock, flags); + list_add(&item->list, &opal_prd_msg_queue); + spin_unlock_irqrestore(&opal_prd_msg_queue_lock, flags); + return err; +} + +static ssize_t opal_prd_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct opal_prd_msg_header hdr; + ssize_t size; + void *msg; + int rc; + + size = sizeof(hdr); + + if (count < size) + return -EINVAL; + + /* grab the header */ + rc = copy_from_user(&hdr, buf, sizeof(hdr)); + if (rc) + return -EFAULT; + + size = be16_to_cpu(hdr.size); + + msg = kmalloc(size, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + rc = copy_from_user(msg, buf, size); + if (rc) { + size = -EFAULT; + goto out_free; + } + + rc = opal_prd_msg(msg); + if (rc) { + pr_warn("write: opal_prd_msg returned %d\n", rc); + size = -EIO; + } + +out_free: + kfree(msg); + + return size; +} + +static int opal_prd_release(struct inode *inode, struct file *file) +{ + struct opal_prd_msg_header msg; + + msg.size = cpu_to_be16(sizeof(msg)); + msg.type = OPAL_PRD_MSG_TYPE_FINI; + + opal_prd_msg((struct opal_prd_msg *)&msg); + + atomic_xchg(&prd_usage, 0); + + return 0; +} + +static long opal_prd_ioctl(struct file *file, unsigned int cmd, + unsigned long param) +{ + struct opal_prd_info info; + struct opal_prd_scom scom; + int rc = 0; + + switch (cmd) { + case OPAL_PRD_GET_INFO: + memset(&info, 0, sizeof(info)); + info.version = OPAL_PRD_KERNEL_VERSION; + rc = copy_to_user((void __user *)param, &info, sizeof(info)); + if (rc) + return -EFAULT; + break; + + case OPAL_PRD_SCOM_READ: + rc = copy_from_user(&scom, (void __user *)param, sizeof(scom)); + if (rc) + return -EFAULT; + + scom.rc = opal_xscom_read(scom.chip, scom.addr, + (__be64 *)&scom.data); + scom.data = be64_to_cpu(scom.data); + pr_devel("ioctl SCOM_READ: chip %llx addr %016llx data %016llx rc %lld\n", + scom.chip, scom.addr, scom.data, scom.rc); + + rc = copy_to_user((void __user *)param, &scom, 
sizeof(scom)); + if (rc) + return -EFAULT; + break; + + case OPAL_PRD_SCOM_WRITE: + rc = copy_from_user(&scom, (void __user *)param, sizeof(scom)); + if (rc) + return -EFAULT; + + scom.rc = opal_xscom_write(scom.chip, scom.addr, scom.data); + pr_devel("ioctl SCOM_WRITE: chip %llx addr %016llx data %016llx rc %lld\n", + scom.chip, scom.addr, scom.data, scom.rc); + + rc = copy_to_user((void __user *)param, &scom, sizeof(scom)); + if (rc) + return -EFAULT; + break; + + default: + rc = -EINVAL; + } + + return rc; +} + +static const struct file_operations opal_prd_fops = { + .open = opal_prd_open, + .mmap = opal_prd_mmap, + .poll = opal_prd_poll, + .read = opal_prd_read, + .write = opal_prd_write, + .unlocked_ioctl = opal_prd_ioctl, + .release = opal_prd_release, + .owner = THIS_MODULE, +}; + +static struct miscdevice opal_prd_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "opal-prd", + .fops = &opal_prd_fops, +}; + +/* opal interface */ +static int opal_prd_msg_notifier(struct notifier_block *nb, + unsigned long msg_type, void *_msg) +{ + struct opal_prd_msg_queue_item *item; + struct opal_prd_msg_header *hdr; + struct opal_msg *msg = _msg; + int msg_size, item_size; + unsigned long flags; + + if (msg_type != OPAL_MSG_PRD) + return 0; + + /* Calculate total size of the message and item we need to store. The + * 'size' field in the header includes the header itself. 
*/ + hdr = (void *)msg->params; + msg_size = be16_to_cpu(hdr->size); + item_size = msg_size + sizeof(*item) - sizeof(item->msg); + + item = kzalloc(item_size, GFP_ATOMIC); + if (!item) + return -ENOMEM; + + memcpy(&item->msg, msg->params, msg_size); + + spin_lock_irqsave(&opal_prd_msg_queue_lock, flags); + list_add_tail(&item->list, &opal_prd_msg_queue); + spin_unlock_irqrestore(&opal_prd_msg_queue_lock, flags); + + wake_up_interruptible(&opal_prd_msg_wait); + + return 0; +} + +static struct notifier_block opal_prd_event_nb = { + .notifier_call = opal_prd_msg_notifier, + .next = NULL, + .priority = 0, +}; + +static int opal_prd_probe(struct platform_device *pdev) +{ + int rc; + + if (!pdev || !pdev->dev.of_node) + return -ENODEV; + + /* We should only have one prd driver instance per machine; ensure + * that we only get a valid probe on a single OF node. + */ + if (prd_node) + return -EBUSY; + + prd_node = pdev->dev.of_node; + + rc = opal_message_notifier_register(OPAL_MSG_PRD, &opal_prd_event_nb); + if (rc) { + pr_err("Couldn't register event notifier\n"); + return rc; + } + + rc = misc_register(&opal_prd_dev); + if (rc) { + pr_err("failed to register miscdev\n"); + opal_message_notifier_unregister(OPAL_MSG_PRD, + &opal_prd_event_nb); + return rc; + } + + return 0; +} + +static int opal_prd_remove(struct platform_device *pdev) +{ + misc_deregister(&opal_prd_dev); + opal_message_notifier_unregister(OPAL_MSG_PRD, &opal_prd_event_nb); + return 0; +} + +static const struct of_device_id opal_prd_match[] = { + { .compatible = "ibm,opal-prd" }, + { }, +}; + +static struct platform_driver opal_prd_driver = { + .driver = { + .name = "opal-prd", + .owner = THIS_MODULE, + .of_match_table = opal_prd_match, + }, + .probe = opal_prd_probe, + .remove = opal_prd_remove, +}; + +module_platform_driver(opal_prd_driver); + +MODULE_DEVICE_TABLE(of, opal_prd_match); +MODULE_DESCRIPTION("PowerNV OPAL runtime diagnostic driver"); +MODULE_LICENSE("GPL"); diff --git 
a/arch/powerpc/platforms/powernv/opal-sysparam.c b/arch/powerpc/platforms/powernv/opal-sysparam.c index 2e52b47393e7..afe66c576a38 100644 --- a/arch/powerpc/platforms/powernv/opal-sysparam.c +++ b/arch/powerpc/platforms/powernv/opal-sysparam.c @@ -55,8 +55,10 @@ static ssize_t opal_get_sys_param(u32 param_id, u32 length, void *buffer) } ret = opal_get_param(token, param_id, (u64)buffer, length); - if (ret != OPAL_ASYNC_COMPLETION) + if (ret != OPAL_ASYNC_COMPLETION) { + ret = opal_error_code(ret); goto out_token; + } ret = opal_async_wait_response(token, &msg); if (ret) { @@ -65,7 +67,7 @@ static ssize_t opal_get_sys_param(u32 param_id, u32 length, void *buffer) goto out_token; } - ret = be64_to_cpu(msg.params[1]); + ret = opal_error_code(be64_to_cpu(msg.params[1])); out_token: opal_async_release_token(token); @@ -89,8 +91,10 @@ static int opal_set_sys_param(u32 param_id, u32 length, void *buffer) ret = opal_set_param(token, param_id, (u64)buffer, length); - if (ret != OPAL_ASYNC_COMPLETION) + if (ret != OPAL_ASYNC_COMPLETION) { + ret = opal_error_code(ret); goto out_token; + } ret = opal_async_wait_response(token, &msg); if (ret) { @@ -99,7 +103,7 @@ static int opal_set_sys_param(u32 param_id, u32 length, void *buffer) goto out_token; } - ret = be64_to_cpu(msg.params[1]); + ret = opal_error_code(be64_to_cpu(msg.params[1])); out_token: opal_async_release_token(token); diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index bf15ead00e12..d6a7b8252e4d 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -296,3 +296,4 @@ OPAL_CALL(opal_i2c_request, OPAL_I2C_REQUEST); OPAL_CALL(opal_flash_read, OPAL_FLASH_READ); OPAL_CALL(opal_flash_write, OPAL_FLASH_WRITE); OPAL_CALL(opal_flash_erase, OPAL_FLASH_ERASE); +OPAL_CALL(opal_prd_msg, OPAL_PRD_MSG); diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 
8403307c5362..f084afa0e3ba 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -235,6 +235,7 @@ int opal_message_notifier_register(enum opal_msg_type msg_type, return atomic_notifier_chain_register( &opal_msg_notifier_head[msg_type], nb); } +EXPORT_SYMBOL_GPL(opal_message_notifier_register); int opal_message_notifier_unregister(enum opal_msg_type msg_type, struct notifier_block *nb) @@ -242,6 +243,7 @@ int opal_message_notifier_unregister(enum opal_msg_type msg_type, return atomic_notifier_chain_unregister( &opal_msg_notifier_head[msg_type], nb); } +EXPORT_SYMBOL_GPL(opal_message_notifier_unregister); static void opal_message_do_notify(uint32_t msg_type, void *msg) { @@ -600,21 +602,13 @@ static void __init opal_dump_region_init(void) "rc = %d\n", rc); } -static void opal_flash_init(struct device_node *opal_node) +static void opal_pdev_init(struct device_node *opal_node, + const char *compatible) { struct device_node *np; for_each_child_of_node(opal_node, np) - if (of_device_is_compatible(np, "ibm,opal-flash")) - of_platform_device_create(np, NULL, NULL); -} - -static void opal_ipmi_init(struct device_node *opal_node) -{ - struct device_node *np; - - for_each_child_of_node(opal_node, np) - if (of_device_is_compatible(np, "ibm,opal-ipmi")) + if (of_device_is_compatible(np, compatible)) of_platform_device_create(np, NULL, NULL); } @@ -663,9 +657,6 @@ static int __init opal_init(void) return -ENODEV; } - /* Initialise OPAL events */ - opal_event_init(); - /* Register OPAL consoles if any ports */ if (firmware_has_feature(FW_FEATURE_OPALv2)) consoles = of_find_node_by_path("/ibm,opal/consoles"); @@ -717,10 +708,10 @@ static int __init opal_init(void) opal_msglog_init(); } - /* Initialize OPAL IPMI backend */ - opal_ipmi_init(opal_node); - - opal_flash_init(opal_node); + /* Initialize platform devices: IPMI backend, PRD & flash interface */ + opal_pdev_init(opal_node, "ibm,opal-ipmi"); + opal_pdev_init(opal_node, 
"ibm,opal-flash"); + opal_pdev_init(opal_node, "ibm,opal-prd"); return 0; } @@ -752,11 +743,14 @@ void opal_shutdown(void) /* Export this so that test modules can use it */ EXPORT_SYMBOL_GPL(opal_invalid_call); +EXPORT_SYMBOL_GPL(opal_xscom_read); +EXPORT_SYMBOL_GPL(opal_xscom_write); EXPORT_SYMBOL_GPL(opal_ipmi_send); EXPORT_SYMBOL_GPL(opal_ipmi_recv); EXPORT_SYMBOL_GPL(opal_flash_read); EXPORT_SYMBOL_GPL(opal_flash_write); EXPORT_SYMBOL_GPL(opal_flash_erase); +EXPORT_SYMBOL_GPL(opal_prd_msg); /* Convert a region of vmalloc memory to an opal sg list */ struct opal_sg_list *opal_vmalloc_to_sg_list(void *vmalloc_addr, @@ -830,6 +824,7 @@ int opal_error_code(int rc) case OPAL_ASYNC_COMPLETION: return -EINPROGRESS; case OPAL_BUSY_EVENT: return -EBUSY; case OPAL_NO_MEM: return -ENOMEM; + case OPAL_PERMISSION: return -EPERM; case OPAL_UNSUPPORTED: return -EIO; case OPAL_HARDWARE: return -EIO; diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 508f530e367b..5738d315248b 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -23,6 +23,9 @@ #include <linux/io.h> #include <linux/msi.h> #include <linux/memblock.h> +#include <linux/iommu.h> +#include <linux/rculist.h> +#include <linux/sizes.h> #include <asm/sections.h> #include <asm/io.h> @@ -38,8 +41,9 @@ #include <asm/debug.h> #include <asm/firmware.h> #include <asm/pnv-pci.h> +#include <asm/mmzone.h> -#include <misc/cxl.h> +#include <misc/cxl-base.h> #include "powernv.h" #include "pci.h" @@ -47,6 +51,11 @@ /* 256M DMA window, 4K TCE pages, 8 bytes TCE */ #define TCE32_TABLE_SIZE ((0x10000000 / 0x1000) * 8) +#define POWERNV_IOMMU_DEFAULT_LEVELS 1 +#define POWERNV_IOMMU_MAX_LEVELS 5 + +static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl); + static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, const char *fmt, ...) 
{ @@ -1086,10 +1095,6 @@ static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all) return; } - pe->tce32_table = kzalloc_node(sizeof(struct iommu_table), - GFP_KERNEL, hose->node); - pe->tce32_table->data = pe; - /* Associate it with all child devices */ pnv_ioda_setup_same_PE(bus, pe); @@ -1283,36 +1288,27 @@ m64_failed: return -EBUSY; } +static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group, + int num); +static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable); + static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe) { - struct pci_bus *bus; - struct pci_controller *hose; - struct pnv_phb *phb; struct iommu_table *tbl; - unsigned long addr; int64_t rc; - bus = dev->bus; - hose = pci_bus_to_host(bus); - phb = hose->private_data; - tbl = pe->tce32_table; - addr = tbl->it_base; - - opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number, - pe->pe_number << 1, 1, __pa(addr), - 0, 0x1000); - - rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id, - pe->pe_number, - (pe->pe_number << 1) + 1, - pe->tce_bypass_base, - 0); + tbl = pe->table_group.tables[0]; + rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0); if (rc) pe_warn(pe, "OPAL error %ld release DMA window\n", rc); + pnv_pci_ioda2_set_bypass(pe, false); + if (pe->table_group.group) { + iommu_group_put(pe->table_group.group); + BUG_ON(pe->table_group.group); + } + pnv_pci_ioda2_table_free_pages(tbl); iommu_free_table(tbl, of_node_full_name(dev->dev.of_node)); - free_pages(addr, get_order(TCE32_TABLE_SIZE)); - pe->tce32_table = NULL; } static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs) @@ -1460,10 +1456,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) continue; } - pe->tce32_table = kzalloc_node(sizeof(struct iommu_table), - GFP_KERNEL, hose->node); - pe->tce32_table->data = pe; - /* Put PE to the list */ mutex_lock(&phb->ioda.pe_list_mutex); list_add_tail(&pe->list, &phb->ioda.pe_list); @@ 
-1598,7 +1590,13 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev pe = &phb->ioda.pe_array[pdn->pe_number]; WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops); - set_iommu_table_base_and_group(&pdev->dev, pe->tce32_table); + set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]); + /* + * Note: iommu_add_device() will fail here as + * for physical PE: the device is already added by now; + * for virtual PE: sysfs entries are not ready yet and + * tce_iommu_bus_notifier will add the device to a group later. + */ } static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) @@ -1626,7 +1624,7 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) } else { dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n"); set_dma_ops(&pdev->dev, &dma_iommu_ops); - set_iommu_table_base(&pdev->dev, pe->tce32_table); + set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]); } *pdev->dev.dma_mask = dma_mask; return 0; @@ -1655,36 +1653,36 @@ static u64 pnv_pci_ioda_dma_get_required_mask(struct pnv_phb *phb, } static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, - struct pci_bus *bus, - bool add_to_iommu_group) + struct pci_bus *bus) { struct pci_dev *dev; list_for_each_entry(dev, &bus->devices, bus_list) { - if (add_to_iommu_group) - set_iommu_table_base_and_group(&dev->dev, - pe->tce32_table); - else - set_iommu_table_base(&dev->dev, pe->tce32_table); + set_iommu_table_base(&dev->dev, pe->table_group.tables[0]); + iommu_add_device(&dev->dev); - if (dev->subordinate) - pnv_ioda_setup_bus_dma(pe, dev->subordinate, - add_to_iommu_group); + if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate) + pnv_ioda_setup_bus_dma(pe, dev->subordinate); } } -static void pnv_pci_ioda1_tce_invalidate(struct pnv_ioda_pe *pe, - struct iommu_table *tbl, - __be64 *startp, __be64 *endp, bool rm) +static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl, + unsigned long index, unsigned long npages, bool rm) { + 
struct iommu_table_group_link *tgl = list_first_entry_or_null( + &tbl->it_group_list, struct iommu_table_group_link, + next); + struct pnv_ioda_pe *pe = container_of(tgl->table_group, + struct pnv_ioda_pe, table_group); __be64 __iomem *invalidate = rm ? - (__be64 __iomem *)pe->tce_inval_reg_phys : - (__be64 __iomem *)tbl->it_index; + (__be64 __iomem *)pe->phb->ioda.tce_inval_reg_phys : + pe->phb->ioda.tce_inval_reg; unsigned long start, end, inc; const unsigned shift = tbl->it_page_shift; - start = __pa(startp); - end = __pa(endp); + start = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset); + end = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset + + npages - 1); /* BML uses this case for p6/p7/galaxy2: Shift addr and put in node */ if (tbl->it_busno) { @@ -1720,26 +1718,79 @@ static void pnv_pci_ioda1_tce_invalidate(struct pnv_ioda_pe *pe, */ } -static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe, - struct iommu_table *tbl, - __be64 *startp, __be64 *endp, bool rm) +static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index, + long npages, unsigned long uaddr, + enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + int ret = pnv_tce_build(tbl, index, npages, uaddr, direction, + attrs); + + if (!ret && (tbl->it_type & TCE_PCI_SWINV_CREATE)) + pnv_pci_ioda1_tce_invalidate(tbl, index, npages, false); + + return ret; +} + +#ifdef CONFIG_IOMMU_API +static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index, + unsigned long *hpa, enum dma_data_direction *direction) +{ + long ret = pnv_tce_xchg(tbl, index, hpa, direction); + + if (!ret && (tbl->it_type & + (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE))) + pnv_pci_ioda1_tce_invalidate(tbl, index, 1, false); + + return ret; +} +#endif + +static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index, + long npages) +{ + pnv_tce_free(tbl, index, npages); + + if (tbl->it_type & TCE_PCI_SWINV_FREE) + pnv_pci_ioda1_tce_invalidate(tbl, index, npages, false); +} + +static 
struct iommu_table_ops pnv_ioda1_iommu_ops = { + .set = pnv_ioda1_tce_build, +#ifdef CONFIG_IOMMU_API + .exchange = pnv_ioda1_tce_xchg, +#endif + .clear = pnv_ioda1_tce_free, + .get = pnv_tce_get, +}; + +static inline void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_ioda_pe *pe) +{ + /* 01xb - invalidate TCEs that match the specified PE# */ + unsigned long val = (0x4ull << 60) | (pe->pe_number & 0xFF); + struct pnv_phb *phb = pe->phb; + + if (!phb->ioda.tce_inval_reg) + return; + + mb(); /* Ensure above stores are visible */ + __raw_writeq(cpu_to_be64(val), phb->ioda.tce_inval_reg); +} + +static void pnv_pci_ioda2_do_tce_invalidate(unsigned pe_number, bool rm, + __be64 __iomem *invalidate, unsigned shift, + unsigned long index, unsigned long npages) { unsigned long start, end, inc; - __be64 __iomem *invalidate = rm ? - (__be64 __iomem *)pe->tce_inval_reg_phys : - (__be64 __iomem *)tbl->it_index; - const unsigned shift = tbl->it_page_shift; /* We'll invalidate DMA address in PE scope */ start = 0x2ull << 60; - start |= (pe->pe_number & 0xFF); + start |= (pe_number & 0xFF); end = start; /* Figure out the start, end and step */ - inc = tbl->it_offset + (((u64)startp - tbl->it_base) / sizeof(u64)); - start |= (inc << shift); - inc = tbl->it_offset + (((u64)endp - tbl->it_base) / sizeof(u64)); - end |= (inc << shift); + start |= (index << shift); + end |= ((index + npages - 1) << shift); inc = (0x1ull << shift); mb(); @@ -1752,25 +1803,83 @@ static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe, } } -void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl, - __be64 *startp, __be64 *endp, bool rm) +static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl, + unsigned long index, unsigned long npages, bool rm) { - struct pnv_ioda_pe *pe = tbl->data; - struct pnv_phb *phb = pe->phb; + struct iommu_table_group_link *tgl; - if (phb->type == PNV_PHB_IODA1) - pnv_pci_ioda1_tce_invalidate(pe, tbl, startp, endp, rm); - else - pnv_pci_ioda2_tce_invalidate(pe, 
tbl, startp, endp, rm); + list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) { + struct pnv_ioda_pe *pe = container_of(tgl->table_group, + struct pnv_ioda_pe, table_group); + __be64 __iomem *invalidate = rm ? + (__be64 __iomem *)pe->phb->ioda.tce_inval_reg_phys : + pe->phb->ioda.tce_inval_reg; + + pnv_pci_ioda2_do_tce_invalidate(pe->pe_number, rm, + invalidate, tbl->it_page_shift, + index, npages); + } +} + +static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index, + long npages, unsigned long uaddr, + enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + int ret = pnv_tce_build(tbl, index, npages, uaddr, direction, + attrs); + + if (!ret && (tbl->it_type & TCE_PCI_SWINV_CREATE)) + pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false); + + return ret; +} + +#ifdef CONFIG_IOMMU_API +static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index, + unsigned long *hpa, enum dma_data_direction *direction) +{ + long ret = pnv_tce_xchg(tbl, index, hpa, direction); + + if (!ret && (tbl->it_type & + (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE))) + pnv_pci_ioda2_tce_invalidate(tbl, index, 1, false); + + return ret; +} +#endif + +static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index, + long npages) +{ + pnv_tce_free(tbl, index, npages); + + if (tbl->it_type & TCE_PCI_SWINV_FREE) + pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false); +} + +static void pnv_ioda2_table_free(struct iommu_table *tbl) +{ + pnv_pci_ioda2_table_free_pages(tbl); + iommu_free_table(tbl, "pnv"); } +static struct iommu_table_ops pnv_ioda2_iommu_ops = { + .set = pnv_ioda2_tce_build, +#ifdef CONFIG_IOMMU_API + .exchange = pnv_ioda2_tce_xchg, +#endif + .clear = pnv_ioda2_tce_free, + .get = pnv_tce_get, + .free = pnv_ioda2_table_free, +}; + static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe, unsigned int base, unsigned int segs) { struct page *tce_mem = NULL; - const __be64 *swinvp; struct iommu_table *tbl; unsigned int i; 
int64_t rc; @@ -1784,6 +1893,11 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, if (WARN_ON(pe->tce32_seg >= 0)) return; + tbl = pnv_pci_table_alloc(phb->hose->node); + iommu_register_group(&pe->table_group, phb->hose->global_number, + pe->pe_number); + pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group); + /* Grab a 32-bit TCE table */ pe->tce32_seg = base; pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n", @@ -1818,39 +1932,30 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, } /* Setup linux iommu table */ - tbl = pe->tce32_table; pnv_pci_setup_iommu_table(tbl, addr, TCE32_TABLE_SIZE * segs, base << 28, IOMMU_PAGE_SHIFT_4K); /* OPAL variant of P7IOC SW invalidated TCEs */ - swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL); - if (swinvp) { - /* We need a couple more fields -- an address and a data - * to or. Since the bus is only printed out on table free - * errors, and on the first pass the data will be a relative - * bus number, print that out instead. 
- */ - pe->tce_inval_reg_phys = be64_to_cpup(swinvp); - tbl->it_index = (unsigned long)ioremap(pe->tce_inval_reg_phys, - 8); + if (phb->ioda.tce_inval_reg) tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE | TCE_PCI_SWINV_PAIR); - } + + tbl->it_ops = &pnv_ioda1_iommu_ops; + pe->table_group.tce32_start = tbl->it_offset << tbl->it_page_shift; + pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift; iommu_init_table(tbl, phb->hose->node); if (pe->flags & PNV_IODA_PE_DEV) { - iommu_register_group(tbl, phb->hose->global_number, - pe->pe_number); - set_iommu_table_base_and_group(&pe->pdev->dev, tbl); - } else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) { - iommu_register_group(tbl, phb->hose->global_number, - pe->pe_number); - pnv_ioda_setup_bus_dma(pe, pe->pbus, true); - } else if (pe->flags & PNV_IODA_PE_VF) { - iommu_register_group(tbl, phb->hose->global_number, - pe->pe_number); - } + /* + * Setting table base here only for carrying iommu_group + * further down to let iommu_add_device() do the job. + * pnv_pci_ioda_dma_dev_setup will override it later anyway. + */ + set_iommu_table_base(&pe->pdev->dev, tbl); + iommu_add_device(&pe->pdev->dev); + } else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) + pnv_ioda_setup_bus_dma(pe, pe->pbus); return; fail: @@ -1859,11 +1964,53 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, pe->tce32_seg = -1; if (tce_mem) __free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs)); + if (tbl) { + pnv_pci_unlink_table_and_group(tbl, &pe->table_group); + iommu_free_table(tbl, "pnv"); + } } -static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable) +static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group, + int num, struct iommu_table *tbl) +{ + struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe, + table_group); + struct pnv_phb *phb = pe->phb; + int64_t rc; + const unsigned long size = tbl->it_indirect_levels ? 
+ tbl->it_level_size : tbl->it_size; + const __u64 start_addr = tbl->it_offset << tbl->it_page_shift; + const __u64 win_size = tbl->it_size << tbl->it_page_shift; + + pe_info(pe, "Setting up window#%d %llx..%llx pg=%x\n", num, + start_addr, start_addr + win_size - 1, + IOMMU_PAGE_SIZE(tbl)); + + /* + * Map TCE table through TVT. The TVE index is the PE number + * shifted by 1 bit for 32-bits DMA space. + */ + rc = opal_pci_map_pe_dma_window(phb->opal_id, + pe->pe_number, + (pe->pe_number << 1) + num, + tbl->it_indirect_levels + 1, + __pa(tbl->it_base), + size << 3, + IOMMU_PAGE_SIZE(tbl)); + if (rc) { + pe_err(pe, "Failed to configure TCE table, err %ld\n", rc); + return rc; + } + + pnv_pci_link_table_and_group(phb->hose->node, num, + tbl, &pe->table_group); + pnv_pci_ioda2_tce_invalidate_entire(pe); + + return 0; +} + +static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable) { - struct pnv_ioda_pe *pe = tbl->data; uint16_t window_id = (pe->pe_number << 1 ) + 1; int64_t rc; @@ -1883,17 +2030,6 @@ static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable) window_id, pe->tce_bypass_base, 0); - - /* - * EEH needs the mapping between IOMMU table and group - * of those VFIO/KVM pass-through devices. We can postpone - * resetting DMA ops until the DMA mask is configured in - * host side. 
- */ - if (pe->pdev) - set_iommu_table_base(&pe->pdev->dev, tbl); - else - pnv_ioda_setup_bus_dma(pe, pe->pbus, false); } if (rc) pe_err(pe, "OPAL error %lld configuring bypass window\n", rc); @@ -1901,106 +2037,363 @@ static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable) pe->tce_bypass_enabled = enable; } -static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb, - struct pnv_ioda_pe *pe) +static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, + __u32 page_shift, __u64 window_size, __u32 levels, + struct iommu_table *tbl); + +static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group, + int num, __u32 page_shift, __u64 window_size, __u32 levels, + struct iommu_table **ptbl) { - /* TVE #1 is selected by PCI address bit 59 */ - pe->tce_bypass_base = 1ull << 59; + struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe, + table_group); + int nid = pe->phb->hose->node; + __u64 bus_offset = num ? pe->tce_bypass_base : table_group->tce32_start; + long ret; + struct iommu_table *tbl; + + tbl = pnv_pci_table_alloc(nid); + if (!tbl) + return -ENOMEM; - /* Install set_bypass callback for VFIO */ - pe->tce32_table->set_bypass = pnv_pci_ioda2_set_bypass; + ret = pnv_pci_ioda2_table_alloc_pages(nid, + bus_offset, page_shift, window_size, + levels, tbl); + if (ret) { + iommu_free_table(tbl, "pnv"); + return ret; + } - /* Enable bypass by default */ - pnv_pci_ioda2_set_bypass(pe->tce32_table, true); + tbl->it_ops = &pnv_ioda2_iommu_ops; + if (pe->phb->ioda.tce_inval_reg) + tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE); + + *ptbl = tbl; + + return 0; } -static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, - struct pnv_ioda_pe *pe) +static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe) +{ + struct iommu_table *tbl = NULL; + long rc; + + rc = pnv_pci_ioda2_create_table(&pe->table_group, 0, + IOMMU_PAGE_SHIFT_4K, + pe->table_group.tce32_size, + 
POWERNV_IOMMU_DEFAULT_LEVELS, &tbl); + if (rc) { + pe_err(pe, "Failed to create 32-bit TCE table, err %ld", + rc); + return rc; + } + + iommu_init_table(tbl, pe->phb->hose->node); + + rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl); + if (rc) { + pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n", + rc); + pnv_ioda2_table_free(tbl); + return rc; + } + + if (!pnv_iommu_bypass_disabled) + pnv_pci_ioda2_set_bypass(pe, true); + + /* OPAL variant of PHB3 invalidated TCEs */ + if (pe->phb->ioda.tce_inval_reg) + tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE); + + /* + * Setting table base here only for carrying iommu_group + * further down to let iommu_add_device() do the job. + * pnv_pci_ioda_dma_dev_setup will override it later anyway. + */ + if (pe->flags & PNV_IODA_PE_DEV) + set_iommu_table_base(&pe->pdev->dev, tbl); + + return 0; +} + +#if defined(CONFIG_IOMMU_API) || defined(CONFIG_PCI_IOV) +static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group, + int num) +{ + struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe, + table_group); + struct pnv_phb *phb = pe->phb; + long ret; + + pe_info(pe, "Removing DMA window #%d\n", num); + + ret = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number, + (pe->pe_number << 1) + num, + 0/* levels */, 0/* table address */, + 0/* table size */, 0/* page size */); + if (ret) + pe_warn(pe, "Unmapping failed, ret = %ld\n", ret); + else + pnv_pci_ioda2_tce_invalidate_entire(pe); + + pnv_pci_unlink_table_and_group(table_group->tables[num], table_group); + + return ret; +} +#endif + +#ifdef CONFIG_IOMMU_API +static unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift, + __u64 window_size, __u32 levels) +{ + unsigned long bytes = 0; + const unsigned window_shift = ilog2(window_size); + unsigned entries_shift = window_shift - page_shift; + unsigned table_shift = entries_shift + 3; + unsigned long tce_table_size = max(0x1000UL, 1UL << table_shift); + unsigned long 
direct_table_size; + + if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS) || + (window_size > memory_hotplug_max()) || + !is_power_of_2(window_size)) + return 0; + + /* Calculate a direct table size from window_size and levels */ + entries_shift = (entries_shift + levels - 1) / levels; + table_shift = entries_shift + 3; + table_shift = max_t(unsigned, table_shift, PAGE_SHIFT); + direct_table_size = 1UL << table_shift; + + for ( ; levels; --levels) { + bytes += _ALIGN_UP(tce_table_size, direct_table_size); + + tce_table_size /= direct_table_size; + tce_table_size <<= 3; + tce_table_size = _ALIGN_UP(tce_table_size, direct_table_size); + } + + return bytes; +} + +static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group) +{ + struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe, + table_group); + /* Store @tbl as pnv_pci_ioda2_unset_window() resets it */ + struct iommu_table *tbl = pe->table_group.tables[0]; + + pnv_pci_ioda2_set_bypass(pe, false); + pnv_pci_ioda2_unset_window(&pe->table_group, 0); + pnv_ioda2_table_free(tbl); +} + +static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group) +{ + struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe, + table_group); + + pnv_pci_ioda2_setup_default_config(pe); +} + +static struct iommu_table_group_ops pnv_pci_ioda2_ops = { + .get_table_size = pnv_pci_ioda2_get_table_size, + .create_table = pnv_pci_ioda2_create_table, + .set_window = pnv_pci_ioda2_set_window, + .unset_window = pnv_pci_ioda2_unset_window, + .take_ownership = pnv_ioda2_take_ownership, + .release_ownership = pnv_ioda2_release_ownership, +}; +#endif + +static void pnv_pci_ioda_setup_opal_tce_kill(struct pnv_phb *phb) { - struct page *tce_mem = NULL; - void *addr; const __be64 *swinvp; - struct iommu_table *tbl; - unsigned int tce_table_size, end; - int64_t rc; - /* We shouldn't already have a 32-bit DMA associated */ - if (WARN_ON(pe->tce32_seg >= 0)) + /* OPAL variant of PHB3 invalidated 
TCEs */ + swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL); + if (!swinvp) return; - /* The PE will reserve all possible 32-bits space */ - pe->tce32_seg = 0; - end = (1 << ilog2(phb->ioda.m32_pci_base)); - tce_table_size = (end / 0x1000) * 8; - pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n", - end); + phb->ioda.tce_inval_reg_phys = be64_to_cpup(swinvp); + phb->ioda.tce_inval_reg = ioremap(phb->ioda.tce_inval_reg_phys, 8); +} - /* Allocate TCE table */ - tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL, - get_order(tce_table_size)); +static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned shift, + unsigned levels, unsigned long limit, + unsigned long *current_offset) +{ + struct page *tce_mem = NULL; + __be64 *addr, *tmp; + unsigned order = max_t(unsigned, shift, PAGE_SHIFT) - PAGE_SHIFT; + unsigned long allocated = 1UL << (order + PAGE_SHIFT); + unsigned entries = 1UL << (shift - 3); + long i; + + tce_mem = alloc_pages_node(nid, GFP_KERNEL, order); if (!tce_mem) { - pe_err(pe, "Failed to allocate a 32-bit TCE memory\n"); - goto fail; + pr_err("Failed to allocate a TCE memory, order=%d\n", order); + return NULL; } addr = page_address(tce_mem); - memset(addr, 0, tce_table_size); + memset(addr, 0, allocated); + + --levels; + if (!levels) { + *current_offset += allocated; + return addr; + } + + for (i = 0; i < entries; ++i) { + tmp = pnv_pci_ioda2_table_do_alloc_pages(nid, shift, + levels, limit, current_offset); + if (!tmp) + break; + + addr[i] = cpu_to_be64(__pa(tmp) | + TCE_PCI_READ | TCE_PCI_WRITE); + + if (*current_offset >= limit) + break; + } + + return addr; +} + +static void pnv_pci_ioda2_table_do_free_pages(__be64 *addr, + unsigned long size, unsigned level); + +static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, + __u32 page_shift, __u64 window_size, __u32 levels, + struct iommu_table *tbl) +{ + void *addr; + unsigned long offset = 0, level_shift; + const unsigned window_shift = 
ilog2(window_size); + unsigned entries_shift = window_shift - page_shift; + unsigned table_shift = max_t(unsigned, entries_shift + 3, PAGE_SHIFT); + const unsigned long tce_table_size = 1UL << table_shift; + + if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS)) + return -EINVAL; + + if ((window_size > memory_hotplug_max()) || !is_power_of_2(window_size)) + return -EINVAL; + + /* Adjust direct table size from window_size and levels */ + entries_shift = (entries_shift + levels - 1) / levels; + level_shift = entries_shift + 3; + level_shift = max_t(unsigned, level_shift, PAGE_SHIFT); + + /* Allocate TCE table */ + addr = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift, + levels, tce_table_size, &offset); + + /* addr==NULL means that the first level allocation failed */ + if (!addr) + return -ENOMEM; /* - * Map TCE table through TVT. The TVE index is the PE number - * shifted by 1 bit for 32-bits DMA space. + * First level was allocated but some lower level failed as + * we did not allocate as much as we wanted, + * release partially allocated table. */ - rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number, - pe->pe_number << 1, 1, __pa(addr), - tce_table_size, 0x1000); - if (rc) { - pe_err(pe, "Failed to configure 32-bit TCE table," - " err %ld\n", rc); - goto fail; + if (offset < tce_table_size) { + pnv_pci_ioda2_table_do_free_pages(addr, + 1ULL << (level_shift - 3), levels - 1); + return -ENOMEM; } /* Setup linux iommu table */ - tbl = pe->tce32_table; - pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0, - IOMMU_PAGE_SHIFT_4K); + pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, bus_offset, + page_shift); + tbl->it_level_size = 1ULL << (level_shift - 3); + tbl->it_indirect_levels = levels - 1; + tbl->it_allocated_size = offset; - /* OPAL variant of PHB3 invalidated TCEs */ - swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL); - if (swinvp) { - /* We need a couple more fields -- an address and a data - * to or. 
Since the bus is only printed out on table free - * errors, and on the first pass the data will be a relative - * bus number, print that out instead. - */ - pe->tce_inval_reg_phys = be64_to_cpup(swinvp); - tbl->it_index = (unsigned long)ioremap(pe->tce_inval_reg_phys, - 8); - tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE); + pr_devel("Created TCE table: ws=%08llx ts=%lx @%08llx\n", + window_size, tce_table_size, bus_offset); + + return 0; +} + +static void pnv_pci_ioda2_table_do_free_pages(__be64 *addr, + unsigned long size, unsigned level) +{ + const unsigned long addr_ul = (unsigned long) addr & + ~(TCE_PCI_READ | TCE_PCI_WRITE); + + if (level) { + long i; + u64 *tmp = (u64 *) addr_ul; + + for (i = 0; i < size; ++i) { + unsigned long hpa = be64_to_cpu(tmp[i]); + + if (!(hpa & (TCE_PCI_READ | TCE_PCI_WRITE))) + continue; + + pnv_pci_ioda2_table_do_free_pages(__va(hpa), size, + level - 1); + } } - iommu_init_table(tbl, phb->hose->node); - if (pe->flags & PNV_IODA_PE_DEV) { - iommu_register_group(tbl, phb->hose->global_number, - pe->pe_number); - set_iommu_table_base_and_group(&pe->pdev->dev, tbl); - } else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) { - iommu_register_group(tbl, phb->hose->global_number, - pe->pe_number); - pnv_ioda_setup_bus_dma(pe, pe->pbus, true); - } else if (pe->flags & PNV_IODA_PE_VF) { - iommu_register_group(tbl, phb->hose->global_number, - pe->pe_number); - } - - /* Also create a bypass window */ - if (!pnv_iommu_bypass_disabled) - pnv_pci_ioda2_setup_bypass_pe(phb, pe); + free_pages(addr_ul, get_order(size << 3)); +} - return; -fail: - if (pe->tce32_seg >= 0) - pe->tce32_seg = -1; - if (tce_mem) - __free_pages(tce_mem, get_order(tce_table_size)); +static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl) +{ + const unsigned long size = tbl->it_indirect_levels ? 
+ tbl->it_level_size : tbl->it_size; + + if (!tbl->it_size) + return; + + pnv_pci_ioda2_table_do_free_pages((__be64 *)tbl->it_base, size, + tbl->it_indirect_levels); +} + +static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, + struct pnv_ioda_pe *pe) +{ + int64_t rc; + + /* We shouldn't already have a 32-bit DMA associated */ + if (WARN_ON(pe->tce32_seg >= 0)) + return; + + /* TVE #1 is selected by PCI address bit 59 */ + pe->tce_bypass_base = 1ull << 59; + + iommu_register_group(&pe->table_group, phb->hose->global_number, + pe->pe_number); + + /* The PE will reserve all possible 32-bits space */ + pe->tce32_seg = 0; + pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n", + phb->ioda.m32_pci_base); + + /* Setup linux iommu table */ + pe->table_group.tce32_start = 0; + pe->table_group.tce32_size = phb->ioda.m32_pci_base; + pe->table_group.max_dynamic_windows_supported = + IOMMU_TABLE_GROUP_MAX_TABLES; + pe->table_group.max_levels = POWERNV_IOMMU_MAX_LEVELS; + pe->table_group.pgsizes = SZ_4K | SZ_64K | SZ_16M; +#ifdef CONFIG_IOMMU_API + pe->table_group.ops = &pnv_pci_ioda2_ops; +#endif + + rc = pnv_pci_ioda2_setup_default_config(pe); + if (rc) { + if (pe->tce32_seg >= 0) + pe->tce32_seg = -1; + return; + } + + if (pe->flags & PNV_IODA_PE_DEV) + iommu_add_device(&pe->pdev->dev); + else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) + pnv_ioda_setup_bus_dma(pe, pe->pbus); } static void pnv_ioda_setup_dma(struct pnv_phb *phb) @@ -2025,6 +2418,8 @@ static void pnv_ioda_setup_dma(struct pnv_phb *phb) pr_info("PCI: %d PE# for a total weight of %d\n", phb->ioda.dma_pe_count, phb->ioda.dma_weight); + pnv_pci_ioda_setup_opal_tce_kill(phb); + /* Walk our PE list and configure their DMA segments, hand them * out one base segment plus any residual segments based on * weight @@ -2643,8 +3038,10 @@ static u32 pnv_ioda_bdfn_to_pe(struct pnv_phb *phb, struct pci_bus *bus, return phb->ioda.pe_rmap[(bus->number << 8) | devfn]; } -static void 
pnv_pci_ioda_shutdown(struct pnv_phb *phb) +static void pnv_pci_ioda_shutdown(struct pci_controller *hose) { + struct pnv_phb *phb = hose->private_data; + opal_pci_reset(phb->opal_id, OPAL_RESET_PCI_IODA_TABLE, OPAL_ASSERT_RESET); } @@ -2659,6 +3056,7 @@ static const struct pci_controller_ops pnv_pci_ioda_controller_ops = { .window_alignment = pnv_pci_window_alignment, .reset_secondary_bus = pnv_pci_reset_secondary_bus, .dma_set_mask = pnv_pci_ioda_dma_set_mask, + .shutdown = pnv_pci_ioda_shutdown, }; static void __init pnv_pci_init_ioda_phb(struct device_node *np, @@ -2806,9 +3204,6 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup; phb->dma_get_required_mask = pnv_pci_ioda_dma_get_required_mask; - /* Setup shutdown function for kexec */ - phb->shutdown = pnv_pci_ioda_shutdown; - /* Setup MSI support */ pnv_pci_init_ioda_msis(phb); diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c index 7f826e8cc1d4..f2bdfea3b68d 100644 --- a/arch/powerpc/platforms/powernv/pci-p5ioc2.c +++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c @@ -83,16 +83,32 @@ static void pnv_pci_init_p5ioc2_msis(struct pnv_phb *phb) static void pnv_pci_init_p5ioc2_msis(struct pnv_phb *phb) { } #endif /* CONFIG_PCI_MSI */ +static struct iommu_table_ops pnv_p5ioc2_iommu_ops = { + .set = pnv_tce_build, +#ifdef CONFIG_IOMMU_API + .exchange = pnv_tce_xchg, +#endif + .clear = pnv_tce_free, + .get = pnv_tce_get, +}; + static void pnv_pci_p5ioc2_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev) { - if (phb->p5ioc2.iommu_table.it_map == NULL) { - iommu_init_table(&phb->p5ioc2.iommu_table, phb->hose->node); - iommu_register_group(&phb->p5ioc2.iommu_table, + struct iommu_table *tbl = phb->p5ioc2.table_group.tables[0]; + + if (!tbl->it_map) { + tbl->it_ops = &pnv_p5ioc2_iommu_ops; + iommu_init_table(tbl, phb->hose->node); + iommu_register_group(&phb->p5ioc2.table_group, 
pci_domain_nr(phb->hose->bus), phb->opal_id); + INIT_LIST_HEAD_RCU(&tbl->it_group_list); + pnv_pci_link_table_and_group(phb->hose->node, 0, + tbl, &phb->p5ioc2.table_group); } - set_iommu_table_base_and_group(&pdev->dev, &phb->p5ioc2.iommu_table); + set_iommu_table_base(&pdev->dev, tbl); + iommu_add_device(&pdev->dev); } static const struct pci_controller_ops pnv_pci_p5ioc2_controller_ops = { @@ -111,6 +127,8 @@ static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, u64 hub_id, u64 phb_id; int64_t rc; static int primary = 1; + struct iommu_table_group *table_group; + struct iommu_table *tbl; pr_info(" Initializing p5ioc2 PHB %s\n", np->full_name); @@ -180,6 +198,15 @@ static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, u64 hub_id, pnv_pci_setup_iommu_table(&phb->p5ioc2.iommu_table, tce_mem, tce_size, 0, IOMMU_PAGE_SHIFT_4K); + /* + * We do not allocate iommu_table as we do not support + * hotplug or SRIOV on P5IOC2 and therefore iommu_free_table() + * should not be called for phb->p5ioc2.table_group.tables[0] ever. 
+ */ + tbl = phb->p5ioc2.table_group.tables[0] = &phb->p5ioc2.iommu_table; + table_group = &phb->p5ioc2.table_group; + table_group->tce32_start = tbl->it_offset << tbl->it_page_shift; + table_group->tce32_size = tbl->it_size << tbl->it_page_shift; } void __init pnv_pci_init_p5ioc2_hub(struct device_node *np) diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 8a557b5aabf7..765d8ed558d0 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -572,80 +572,152 @@ struct pci_ops pnv_pci_ops = { .write = pnv_pci_write_config, }; -static int pnv_tce_build(struct iommu_table *tbl, long index, long npages, - unsigned long uaddr, enum dma_data_direction direction, - struct dma_attrs *attrs, bool rm) +static __be64 *pnv_tce(struct iommu_table *tbl, long idx) { - u64 proto_tce; - __be64 *tcep, *tces; - u64 rpn; - - proto_tce = TCE_PCI_READ; // Read allowed + __be64 *tmp = ((__be64 *)tbl->it_base); + int level = tbl->it_indirect_levels; + const long shift = ilog2(tbl->it_level_size); + unsigned long mask = (tbl->it_level_size - 1) << (level * shift); + + while (level) { + int n = (idx & mask) >> (level * shift); + unsigned long tce = be64_to_cpu(tmp[n]); + + tmp = __va(tce & ~(TCE_PCI_READ | TCE_PCI_WRITE)); + idx &= ~mask; + mask >>= shift; + --level; + } - if (direction != DMA_TO_DEVICE) - proto_tce |= TCE_PCI_WRITE; + return tmp + idx; +} - tces = tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset; - rpn = __pa(uaddr) >> tbl->it_page_shift; +int pnv_tce_build(struct iommu_table *tbl, long index, long npages, + unsigned long uaddr, enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + u64 proto_tce = iommu_direction_to_tce_perm(direction); + u64 rpn = __pa(uaddr) >> tbl->it_page_shift; + long i; - while (npages--) - *(tcep++) = cpu_to_be64(proto_tce | - (rpn++ << tbl->it_page_shift)); + for (i = 0; i < npages; i++) { + unsigned long newtce = proto_tce | + ((rpn + i) << 
tbl->it_page_shift); + unsigned long idx = index - tbl->it_offset + i; - /* Some implementations won't cache invalid TCEs and thus may not - * need that flush. We'll probably turn it_type into a bit mask - * of flags if that becomes the case - */ - if (tbl->it_type & TCE_PCI_SWINV_CREATE) - pnv_pci_ioda_tce_invalidate(tbl, tces, tcep - 1, rm); + *(pnv_tce(tbl, idx)) = cpu_to_be64(newtce); + } return 0; } -static int pnv_tce_build_vm(struct iommu_table *tbl, long index, long npages, - unsigned long uaddr, - enum dma_data_direction direction, - struct dma_attrs *attrs) +#ifdef CONFIG_IOMMU_API +int pnv_tce_xchg(struct iommu_table *tbl, long index, + unsigned long *hpa, enum dma_data_direction *direction) { - return pnv_tce_build(tbl, index, npages, uaddr, direction, attrs, - false); + u64 proto_tce = iommu_direction_to_tce_perm(*direction); + unsigned long newtce = *hpa | proto_tce, oldtce; + unsigned long idx = index - tbl->it_offset; + + BUG_ON(*hpa & ~IOMMU_PAGE_MASK(tbl)); + + oldtce = xchg(pnv_tce(tbl, idx), cpu_to_be64(newtce)); + *hpa = be64_to_cpu(oldtce) & ~(TCE_PCI_READ | TCE_PCI_WRITE); + *direction = iommu_tce_direction(oldtce); + + return 0; } +#endif -static void pnv_tce_free(struct iommu_table *tbl, long index, long npages, - bool rm) +void pnv_tce_free(struct iommu_table *tbl, long index, long npages) { - __be64 *tcep, *tces; + long i; - tces = tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset; + for (i = 0; i < npages; i++) { + unsigned long idx = index - tbl->it_offset + i; - while (npages--) - *(tcep++) = cpu_to_be64(0); + *(pnv_tce(tbl, idx)) = cpu_to_be64(0); + } +} - if (tbl->it_type & TCE_PCI_SWINV_FREE) - pnv_pci_ioda_tce_invalidate(tbl, tces, tcep - 1, rm); +unsigned long pnv_tce_get(struct iommu_table *tbl, long index) +{ + return *(pnv_tce(tbl, index - tbl->it_offset)); } -static void pnv_tce_free_vm(struct iommu_table *tbl, long index, long npages) +struct iommu_table *pnv_pci_table_alloc(int nid) { - pnv_tce_free(tbl, index, 
npages, false); + struct iommu_table *tbl; + + tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, nid); + INIT_LIST_HEAD_RCU(&tbl->it_group_list); + + return tbl; } -static unsigned long pnv_tce_get(struct iommu_table *tbl, long index) +long pnv_pci_link_table_and_group(int node, int num, + struct iommu_table *tbl, + struct iommu_table_group *table_group) { - return ((u64 *)tbl->it_base)[index - tbl->it_offset]; + struct iommu_table_group_link *tgl = NULL; + + if (WARN_ON(!tbl || !table_group)) + return -EINVAL; + + tgl = kzalloc_node(sizeof(struct iommu_table_group_link), GFP_KERNEL, + node); + if (!tgl) + return -ENOMEM; + + tgl->table_group = table_group; + list_add_rcu(&tgl->next, &tbl->it_group_list); + + table_group->tables[num] = tbl; + + return 0; } -static int pnv_tce_build_rm(struct iommu_table *tbl, long index, long npages, - unsigned long uaddr, - enum dma_data_direction direction, - struct dma_attrs *attrs) +static void pnv_iommu_table_group_link_free(struct rcu_head *head) { - return pnv_tce_build(tbl, index, npages, uaddr, direction, attrs, true); + struct iommu_table_group_link *tgl = container_of(head, + struct iommu_table_group_link, rcu); + + kfree(tgl); } -static void pnv_tce_free_rm(struct iommu_table *tbl, long index, long npages) +void pnv_pci_unlink_table_and_group(struct iommu_table *tbl, + struct iommu_table_group *table_group) { - pnv_tce_free(tbl, index, npages, true); + long i; + bool found; + struct iommu_table_group_link *tgl; + + if (!tbl || !table_group) + return; + + /* Remove link to a group from table's list of attached groups */ + found = false; + list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) { + if (tgl->table_group == table_group) { + list_del_rcu(&tgl->next); + call_rcu(&tgl->rcu, pnv_iommu_table_group_link_free); + found = true; + break; + } + } + if (WARN_ON(!found)) + return; + + /* Clean a pointer to iommu_table in iommu_table_group::tables[] */ + found = false; + for (i = 0; i < 
IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { + if (table_group->tables[i] == tbl) { + table_group->tables[i] = NULL; + found = true; + break; + } + } + WARN_ON(!found); } void pnv_pci_setup_iommu_table(struct iommu_table *tbl, @@ -704,12 +776,9 @@ void pnv_pci_shutdown(void) { struct pci_controller *hose; - list_for_each_entry(hose, &hose_list, list_node) { - struct pnv_phb *phb = hose->private_data; - - if (phb && phb->shutdown) - phb->shutdown(phb); - } + list_for_each_entry(hose, &hose_list, list_node) + if (hose->controller_ops.shutdown) + hose->controller_ops.shutdown(hose); } /* Fixup wrong class code in p7ioc and p8 root complex */ @@ -752,11 +821,6 @@ void __init pnv_pci_init(void) pci_devs_phb_init(); /* Configure IOMMU DMA hooks */ - ppc_md.tce_build = pnv_tce_build_vm; - ppc_md.tce_free = pnv_tce_free_vm; - ppc_md.tce_build_rm = pnv_tce_build_rm; - ppc_md.tce_free_rm = pnv_tce_free_rm; - ppc_md.tce_get = pnv_tce_get; set_pci_dma_ops(&dma_iommu_ops); } diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index ac8686c853e6..8ef2d28aded0 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -57,8 +57,7 @@ struct pnv_ioda_pe { /* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */ int tce32_seg; int tce32_segcount; - struct iommu_table *tce32_table; - phys_addr_t tce_inval_reg_phys; + struct iommu_table_group table_group; /* 64-bit TCE bypass region */ bool tce_bypass_enabled; @@ -110,7 +109,6 @@ struct pnv_phb { struct pci_dev *pdev); void (*fixup_phb)(struct pci_controller *hose); u32 (*bdfn_to_pe)(struct pnv_phb *phb, struct pci_bus *bus, u32 devfn); - void (*shutdown)(struct pnv_phb *phb); int (*init_m64)(struct pnv_phb *phb); void (*reserve_m64_pe)(struct pnv_phb *phb); int (*pick_m64_pe)(struct pnv_phb *phb, struct pci_bus *bus, int all); @@ -121,6 +119,7 @@ struct pnv_phb { union { struct { struct iommu_table iommu_table; + struct iommu_table_group table_group; } p5ioc2; struct { @@ 
-184,6 +183,12 @@ struct pnv_phb { * boot for resource allocation purposes */ struct list_head pe_dma_list; + + /* TCE cache invalidate registers (physical and + * remapped) + */ + phys_addr_t tce_inval_reg_phys; + __be64 __iomem *tce_inval_reg; } ioda; }; @@ -198,6 +203,13 @@ struct pnv_phb { }; extern struct pci_ops pnv_pci_ops; +extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages, + unsigned long uaddr, enum dma_data_direction direction, + struct dma_attrs *attrs); +extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages); +extern int pnv_tce_xchg(struct iommu_table *tbl, long index, + unsigned long *hpa, enum dma_data_direction *direction); +extern unsigned long pnv_tce_get(struct iommu_table *tbl, long index); void pnv_pci_dump_phb_diag_data(struct pci_controller *hose, unsigned char *log_buff); @@ -205,6 +217,13 @@ int pnv_pci_cfg_read(struct pci_dn *pdn, int where, int size, u32 *val); int pnv_pci_cfg_write(struct pci_dn *pdn, int where, int size, u32 val); +extern struct iommu_table *pnv_pci_table_alloc(int nid); + +extern long pnv_pci_link_table_and_group(int node, int num, + struct iommu_table *tbl, + struct iommu_table_group *table_group); +extern void pnv_pci_unlink_table_and_group(struct iommu_table *tbl, + struct iommu_table_group *table_group); extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl, void *tce_mem, u64 tce_size, u64 dma_offset, unsigned page_shift); diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 61d5a17f45c0..10510dea16b3 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -36,6 +36,8 @@ #include <linux/crash_dump.h> #include <linux/memory.h> #include <linux/of.h> +#include <linux/iommu.h> +#include <linux/rculist.h> #include <asm/io.h> #include <asm/prom.h> #include <asm/rtas.h> @@ -51,6 +53,73 @@ #include "pseries.h" +static struct iommu_table_group *iommu_pseries_alloc_group(int node) +{ 
+ struct iommu_table_group *table_group = NULL; + struct iommu_table *tbl = NULL; + struct iommu_table_group_link *tgl = NULL; + + table_group = kzalloc_node(sizeof(struct iommu_table_group), GFP_KERNEL, + node); + if (!table_group) + goto fail_exit; + + tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node); + if (!tbl) + goto fail_exit; + + tgl = kzalloc_node(sizeof(struct iommu_table_group_link), GFP_KERNEL, + node); + if (!tgl) + goto fail_exit; + + INIT_LIST_HEAD_RCU(&tbl->it_group_list); + tgl->table_group = table_group; + list_add_rcu(&tgl->next, &tbl->it_group_list); + + table_group->tables[0] = tbl; + + return table_group; + +fail_exit: + kfree(tgl); + kfree(table_group); + kfree(tbl); + + return NULL; +} + +static void iommu_pseries_free_group(struct iommu_table_group *table_group, + const char *node_name) +{ + struct iommu_table *tbl; +#ifdef CONFIG_IOMMU_API + struct iommu_table_group_link *tgl; +#endif + + if (!table_group) + return; + + tbl = table_group->tables[0]; +#ifdef CONFIG_IOMMU_API + tgl = list_first_entry_or_null(&tbl->it_group_list, + struct iommu_table_group_link, next); + + WARN_ON_ONCE(!tgl); + if (tgl) { + list_del_rcu(&tgl->next); + kfree(tgl); + } + if (table_group->group) { + iommu_group_put(table_group->group); + BUG_ON(table_group->group); + } +#endif + iommu_free_table(tbl, node_name); + + kfree(table_group); +} + static void tce_invalidate_pSeries_sw(struct iommu_table *tbl, __be64 *startp, __be64 *endp) { @@ -193,7 +262,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, int ret = 0; unsigned long flags; - if (npages == 1) { + if ((npages == 1) || !firmware_has_feature(FW_FEATURE_MULTITCE)) { return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr, direction, attrs); } @@ -285,6 +354,9 @@ static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long n { u64 rc; + if (!firmware_has_feature(FW_FEATURE_MULTITCE)) + return tce_free_pSeriesLP(tbl, tcenum, npages); + rc = 
plpar_tce_stuff((u64)tbl->it_index, (u64)tcenum << 12, 0, npages); if (rc && printk_ratelimit()) { @@ -460,7 +532,6 @@ static int tce_setrange_multi_pSeriesLP_walk(unsigned long start_pfn, return tce_setrange_multi_pSeriesLP(start_pfn, num_pfn, arg); } - #ifdef CONFIG_PCI static void iommu_table_setparms(struct pci_controller *phb, struct device_node *dn, @@ -546,6 +617,12 @@ static void iommu_table_setparms_lpar(struct pci_controller *phb, tbl->it_size = size >> tbl->it_page_shift; } +struct iommu_table_ops iommu_table_pseries_ops = { + .set = tce_build_pSeries, + .clear = tce_free_pSeries, + .get = tce_get_pseries +}; + static void pci_dma_bus_setup_pSeries(struct pci_bus *bus) { struct device_node *dn; @@ -610,12 +687,13 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus) pci->phb->dma_window_size = 0x8000000ul; pci->phb->dma_window_base_cur = 0x8000000ul; - tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, - pci->phb->node); + pci->table_group = iommu_pseries_alloc_group(pci->phb->node); + tbl = pci->table_group->tables[0]; iommu_table_setparms(pci->phb, dn, tbl); - pci->iommu_table = iommu_init_table(tbl, pci->phb->node); - iommu_register_group(tbl, pci_domain_nr(bus), 0); + tbl->it_ops = &iommu_table_pseries_ops; + iommu_init_table(tbl, pci->phb->node); + iommu_register_group(pci->table_group, pci_domain_nr(bus), 0); /* Divide the rest (1.75GB) among the children */ pci->phb->dma_window_size = 0x80000000ul; @@ -625,6 +703,11 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus) pr_debug("ISA/IDE, window size is 0x%llx\n", pci->phb->dma_window_size); } +struct iommu_table_ops iommu_table_lpar_multi_ops = { + .set = tce_buildmulti_pSeriesLP, + .clear = tce_freemulti_pSeriesLP, + .get = tce_get_pSeriesLP +}; static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus) { @@ -653,15 +736,17 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus) ppci = PCI_DN(pdn); pr_debug(" parent is %s, iommu_table: 0x%p\n", - pdn->full_name, 
ppci->iommu_table); + pdn->full_name, ppci->table_group); - if (!ppci->iommu_table) { - tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, - ppci->phb->node); + if (!ppci->table_group) { + ppci->table_group = iommu_pseries_alloc_group(ppci->phb->node); + tbl = ppci->table_group->tables[0]; iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window); - ppci->iommu_table = iommu_init_table(tbl, ppci->phb->node); - iommu_register_group(tbl, pci_domain_nr(bus), 0); - pr_debug(" created table: %p\n", ppci->iommu_table); + tbl->it_ops = &iommu_table_lpar_multi_ops; + iommu_init_table(tbl, ppci->phb->node); + iommu_register_group(ppci->table_group, + pci_domain_nr(bus), 0); + pr_debug(" created table: %p\n", ppci->table_group); } } @@ -683,13 +768,15 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev) struct pci_controller *phb = PCI_DN(dn)->phb; pr_debug(" --> first child, no bridge. Allocating iommu table.\n"); - tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, - phb->node); + PCI_DN(dn)->table_group = iommu_pseries_alloc_group(phb->node); + tbl = PCI_DN(dn)->table_group->tables[0]; iommu_table_setparms(phb, dn, tbl); - PCI_DN(dn)->iommu_table = iommu_init_table(tbl, phb->node); - iommu_register_group(tbl, pci_domain_nr(phb->bus), 0); - set_iommu_table_base_and_group(&dev->dev, - PCI_DN(dn)->iommu_table); + tbl->it_ops = &iommu_table_pseries_ops; + iommu_init_table(tbl, phb->node); + iommu_register_group(PCI_DN(dn)->table_group, + pci_domain_nr(phb->bus), 0); + set_iommu_table_base(&dev->dev, tbl); + iommu_add_device(&dev->dev); return; } @@ -697,13 +784,14 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev) * an already allocated iommu table is found and use that. 
*/ - while (dn && PCI_DN(dn) && PCI_DN(dn)->iommu_table == NULL) + while (dn && PCI_DN(dn) && PCI_DN(dn)->table_group == NULL) dn = dn->parent; - if (dn && PCI_DN(dn)) - set_iommu_table_base_and_group(&dev->dev, - PCI_DN(dn)->iommu_table); - else + if (dn && PCI_DN(dn)) { + set_iommu_table_base(&dev->dev, + PCI_DN(dn)->table_group->tables[0]); + iommu_add_device(&dev->dev); + } else printk(KERN_WARNING "iommu: Device %s has no iommu table\n", pci_name(dev)); } @@ -1088,7 +1176,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) dn = pci_device_to_OF_node(dev); pr_debug(" node is %s\n", dn->full_name); - for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->iommu_table; + for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group; pdn = pdn->parent) { dma_window = of_get_property(pdn, "ibm,dma-window", NULL); if (dma_window) @@ -1104,18 +1192,21 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) pr_debug(" parent is %s\n", pdn->full_name); pci = PCI_DN(pdn); - if (!pci->iommu_table) { - tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, - pci->phb->node); + if (!pci->table_group) { + pci->table_group = iommu_pseries_alloc_group(pci->phb->node); + tbl = pci->table_group->tables[0]; iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window); - pci->iommu_table = iommu_init_table(tbl, pci->phb->node); - iommu_register_group(tbl, pci_domain_nr(pci->phb->bus), 0); - pr_debug(" created table: %p\n", pci->iommu_table); + tbl->it_ops = &iommu_table_lpar_multi_ops; + iommu_init_table(tbl, pci->phb->node); + iommu_register_group(pci->table_group, + pci_domain_nr(pci->phb->bus), 0); + pr_debug(" created table: %p\n", pci->table_group); } else { - pr_debug(" found DMA window, table: %p\n", pci->iommu_table); + pr_debug(" found DMA window, table: %p\n", pci->table_group); } - set_iommu_table_base_and_group(&dev->dev, pci->iommu_table); + set_iommu_table_base(&dev->dev, pci->table_group->tables[0]); + iommu_add_device(&dev->dev); } static int 
dma_set_mask_pSeriesLP(struct device *dev, u64 dma_mask) @@ -1145,7 +1236,7 @@ static int dma_set_mask_pSeriesLP(struct device *dev, u64 dma_mask) * search upwards in the tree until we either hit a dma-window * property, OR find a parent with a table already allocated. */ - for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->iommu_table; + for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group; pdn = pdn->parent) { dma_window = of_get_property(pdn, "ibm,dma-window", NULL); if (dma_window) @@ -1189,7 +1280,7 @@ static u64 dma_get_required_mask_pSeriesLP(struct device *dev) dn = pci_device_to_OF_node(pdev); /* search upwards for ibm,dma-window */ - for (; dn && PCI_DN(dn) && !PCI_DN(dn)->iommu_table; + for (; dn && PCI_DN(dn) && !PCI_DN(dn)->table_group; dn = dn->parent) if (of_get_property(dn, "ibm,dma-window", NULL)) break; @@ -1269,8 +1360,9 @@ static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long acti * the device node. */ remove_ddw(np, false); - if (pci && pci->iommu_table) - iommu_free_table(pci->iommu_table, np->full_name); + if (pci && pci->table_group) + iommu_pseries_free_group(pci->table_group, + np->full_name); spin_lock(&direct_window_list_lock); list_for_each_entry(window, &direct_window_list, list) { @@ -1300,22 +1392,11 @@ void iommu_init_early_pSeries(void) return; if (firmware_has_feature(FW_FEATURE_LPAR)) { - if (firmware_has_feature(FW_FEATURE_MULTITCE)) { - ppc_md.tce_build = tce_buildmulti_pSeriesLP; - ppc_md.tce_free = tce_freemulti_pSeriesLP; - } else { - ppc_md.tce_build = tce_build_pSeriesLP; - ppc_md.tce_free = tce_free_pSeriesLP; - } - ppc_md.tce_get = tce_get_pSeriesLP; pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeriesLP; pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeriesLP; ppc_md.dma_set_mask = dma_set_mask_pSeriesLP; ppc_md.dma_get_required_mask = dma_get_required_mask_pSeriesLP; } else { - ppc_md.tce_build = tce_build_pSeries; - ppc_md.tce_free = tce_free_pSeries; - 
ppc_md.tce_get = tce_get_pseries; pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeries; pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeries; } @@ -1333,8 +1414,6 @@ static int __init disable_multitce(char *str) firmware_has_feature(FW_FEATURE_LPAR) && firmware_has_feature(FW_FEATURE_MULTITCE)) { printk(KERN_INFO "Disabling MULTITCE firmware feature\n"); - ppc_md.tce_build = tce_build_pSeriesLP; - ppc_md.tce_free = tce_free_pSeriesLP; powerpc_firmware_features &= ~FW_FEATURE_MULTITCE; } return 1; diff --git a/arch/powerpc/sysdev/dart_iommu.c b/arch/powerpc/sysdev/dart_iommu.c index d00a5663e312..90bcdfeedf48 100644 --- a/arch/powerpc/sysdev/dart_iommu.c +++ b/arch/powerpc/sysdev/dart_iommu.c @@ -286,6 +286,12 @@ static int __init dart_init(struct device_node *dart_node) return 0; } +static struct iommu_table_ops iommu_dart_ops = { + .set = dart_build, + .clear = dart_free, + .flush = dart_flush, +}; + static void iommu_table_dart_setup(void) { iommu_table_dart.it_busno = 0; @@ -298,6 +304,7 @@ static void iommu_table_dart_setup(void) iommu_table_dart.it_base = (unsigned long)dart_vbase; iommu_table_dart.it_index = 0; iommu_table_dart.it_blocksize = 1; + iommu_table_dart.it_ops = &iommu_dart_ops; iommu_init_table(&iommu_table_dart, -1); /* Reserve the last page of the DART to avoid possible prefetch @@ -386,11 +393,6 @@ void __init iommu_init_early_dart(struct pci_controller_ops *controller_ops) if (dart_init(dn) != 0) goto bail; - /* Setup low level TCE operations for the core IOMMU code */ - ppc_md.tce_build = dart_build; - ppc_md.tce_free = dart_free; - ppc_md.tce_flush = dart_flush; - /* Setup bypass if supported */ if (dart_is_u4) ppc_md.dma_set_mask = dart_dma_set_mask; diff --git a/arch/powerpc/sysdev/uic.c b/arch/powerpc/sysdev/uic.c index 1cd057f11725..d77345338671 100644 --- a/arch/powerpc/sysdev/uic.c +++ b/arch/powerpc/sysdev/uic.c @@ -198,7 +198,7 @@ void uic_irq_cascade(unsigned int virq, struct irq_desc *desc) { struct 
irq_chip *chip = irq_desc_get_chip(desc); struct irq_data *idata = irq_desc_get_irq_data(desc); - struct uic *uic = irq_get_handler_data(virq); + struct uic *uic = irq_desc_get_handler_data(desc); u32 msr; int src; int subvirq; diff --git a/arch/powerpc/sysdev/xics/icp-native.c b/arch/powerpc/sysdev/xics/icp-native.c index 2fc4cf1b7557..eae32654bdf2 100644 --- a/arch/powerpc/sysdev/xics/icp-native.c +++ b/arch/powerpc/sysdev/xics/icp-native.c @@ -147,12 +147,16 @@ static void icp_native_cause_ipi(int cpu, unsigned long data) { kvmppc_set_host_ipi(cpu, 1); #ifdef CONFIG_PPC_DOORBELL - if (cpu_has_feature(CPU_FTR_DBELL) && - (cpumask_test_cpu(cpu, cpu_sibling_mask(smp_processor_id())))) - doorbell_cause_ipi(cpu, data); - else + if (cpu_has_feature(CPU_FTR_DBELL)) { + if (cpumask_test_cpu(cpu, cpu_sibling_mask(get_cpu()))) { + doorbell_cause_ipi(cpu, data); + put_cpu(); + return; + } + put_cpu(); + } #endif - icp_native_set_qirr(cpu, IPI_PRIORITY); + icp_native_set_qirr(cpu, IPI_PRIORITY); } /* diff --git a/arch/powerpc/sysdev/xics/xics-common.c b/arch/powerpc/sysdev/xics/xics-common.c index 5bc5889d4acc..08c248eb491b 100644 --- a/arch/powerpc/sysdev/xics/xics-common.c +++ b/arch/powerpc/sysdev/xics/xics-common.c @@ -227,7 +227,7 @@ void xics_migrate_irqs_away(void) /* Locate interrupt server */ server = -1; - ics = irq_get_chip_data(virq); + ics = irq_desc_get_chip_data(desc); if (ics) server = ics->get_server(ics, irq); if (server < 0) { |