From e836a256e8fd579c9d7a3685f22981225a1ca451 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 12 Aug 2015 18:42:56 -0400 Subject: pmem: convert to generic memremap Kill arch_memremap_pmem() and just let the architecture specify the flags to be passed to memremap(). Default to writethrough by default. Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Reviewed-by: Ross Zwisler Signed-off-by: Dan Williams --- arch/x86/include/asm/io.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index cc9c61bc1abe..d241fbd5c87b 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -248,11 +248,7 @@ static inline void flush_write_buffers(void) #endif } -static inline void __pmem *arch_memremap_pmem(resource_size_t offset, - unsigned long size) -{ - return (void __force __pmem *) ioremap_cache(offset, size); -} +#define ARCH_MEMREMAP_PMEM MEMREMAP_WB #endif /* __KERNEL__ */ -- cgit v1.2.3-58-ga151 From 7a67832c7e44c20935c5d6f2264035a0f7bf0d8f Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 19 Aug 2015 00:34:34 -0400 Subject: libnvdimm, e820: make CONFIG_X86_PMEM_LEGACY a tristate option We currently register a platform device for e820 type-12 memory and register a nvdimm bus beneath it. Registering the platform device triggers the device-core machinery to probe for a driver, but that search currently comes up empty. Building the nvdimm-bus registration into the e820_pmem platform device registration in this way forces libnvdimm to be built-in. Instead, convert the built-in portion of CONFIG_X86_PMEM_LEGACY to simply register a platform device and move the rest of the logic to the driver for e820_pmem, for the following reasons: 1/ Letting e820_pmem support be a module allows building and testing libnvdimm.ko changes without rebooting 2/ All the normal policy around modules can be applied to e820_pmem (unbind to disable and/or blacklisting the module from loading by default) 3/ Moving the driver to a generic location and converting it to scan "iomem_resource" rather than "e820.map" means any other architecture can take advantage of this simple nvdimm resource discovery mechanism by registering a resource named "Persistent Memory (legacy)" Cc: Christoph Hellwig Signed-off-by: Dan Williams --- arch/x86/Kconfig | 6 ++- arch/x86/include/uapi/asm/e820.h | 2 +- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/pmem.c | 79 ++++-------------------------------- drivers/nvdimm/Makefile | 3 ++ drivers/nvdimm/e820.c | 86 ++++++++++++++++++++++++++++++++++++++++ tools/testing/nvdimm/Kbuild | 4 ++ 7 files changed, 108 insertions(+), 74 deletions(-) create mode 100644 drivers/nvdimm/e820.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b3a1a5d77d92..76c61154ed50 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1426,10 +1426,14 @@ config ILLEGAL_POINTER_VALUE source "mm/Kconfig" +config X86_PMEM_LEGACY_DEVICE + bool + config X86_PMEM_LEGACY - bool "Support non-standard NVDIMMs and ADR protected memory" + tristate "Support non-standard NVDIMMs and ADR protected memory" depends on PHYS_ADDR_T_64BIT depends on BLK_DEV + select X86_PMEM_LEGACY_DEVICE select LIBNVDIMM help Treat memory marked using the non-standard e820 type of 12 as used diff --git a/arch/x86/include/uapi/asm/e820.h b/arch/x86/include/uapi/asm/e820.h index 0f457e6eab18..9dafe59cf6e2 100644 --- a/arch/x86/include/uapi/asm/e820.h +++ b/arch/x86/include/uapi/asm/e820.h @@ -37,7 +37,7 @@ /* * This is a non-standardized way to represent ADR or NVDIMM regions that * persist over a reboot. The kernel will ignore their special capabilities - * unless the CONFIG_X86_PMEM_LEGACY=y option is set. + * unless the CONFIG_X86_PMEM_LEGACY option is set. * * ( Note that older platforms also used 6 for the same type of memory, * but newer versions switched to 12 as 6 was assigned differently. Some diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0f15af41bd80..ac2bb7e28ba2 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -92,7 +92,7 @@ obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o -obj-$(CONFIG_X86_PMEM_LEGACY) += pmem.o +obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c index 64f90f53bb85..4f00b63d7ff3 100644 --- a/arch/x86/kernel/pmem.c +++ b/arch/x86/kernel/pmem.c @@ -3,80 +3,17 @@ * Copyright (c) 2015, Intel Corporation. */ #include -#include #include -#include - -static void e820_pmem_release(struct device *dev) -{ - struct nvdimm_bus *nvdimm_bus = dev->platform_data; - - if (nvdimm_bus) - nvdimm_bus_unregister(nvdimm_bus); -} - -static struct platform_device e820_pmem = { - .name = "e820_pmem", - .id = -1, - .dev = { - .release = e820_pmem_release, - }, -}; - -static const struct attribute_group *e820_pmem_attribute_groups[] = { - &nvdimm_bus_attribute_group, - NULL, -}; - -static const struct attribute_group *e820_pmem_region_attribute_groups[] = { - &nd_region_attribute_group, - &nd_device_attribute_group, - NULL, -}; static __init int register_e820_pmem(void) { - static struct nvdimm_bus_descriptor nd_desc; - struct device *dev = &e820_pmem.dev; - struct nvdimm_bus *nvdimm_bus; - int rc, i; - - rc = platform_device_register(&e820_pmem); - if (rc) - return rc; - - nd_desc.attr_groups = e820_pmem_attribute_groups; - nd_desc.provider_name = "e820"; - nvdimm_bus = nvdimm_bus_register(dev, &nd_desc); - if (!nvdimm_bus) - goto err; - dev->platform_data = nvdimm_bus; - - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - struct resource res = { - .flags = IORESOURCE_MEM, - .start = ei->addr, - .end = ei->addr + ei->size - 1, - }; - struct nd_region_desc ndr_desc; - - if (ei->type != E820_PRAM) - continue; - - memset(&ndr_desc, 0, sizeof(ndr_desc)); - ndr_desc.res = &res; - ndr_desc.attr_groups = e820_pmem_region_attribute_groups; - ndr_desc.numa_node = NUMA_NO_NODE; - if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc)) - goto err; - } - - return 0; - - err: - dev_err(dev, "failed to register legacy persistent memory ranges\n"); - platform_device_unregister(&e820_pmem); - return -ENXIO; + struct platform_device *pdev; + + /* + * See drivers/nvdimm/e820.c for the implementation, this is + * simply here to trigger the module to load on demand. + */ + pdev = platform_device_alloc("e820_pmem", -1); + return platform_device_add(pdev); } device_initcall(register_e820_pmem); diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile index 594bb97c867a..9bf15db52dee 100644 --- a/drivers/nvdimm/Makefile +++ b/drivers/nvdimm/Makefile @@ -2,6 +2,7 @@ obj-$(CONFIG_LIBNVDIMM) += libnvdimm.o obj-$(CONFIG_BLK_DEV_PMEM) += nd_pmem.o obj-$(CONFIG_ND_BTT) += nd_btt.o obj-$(CONFIG_ND_BLK) += nd_blk.o +obj-$(CONFIG_X86_PMEM_LEGACY) += nd_e820.o nd_pmem-y := pmem.o @@ -9,6 +10,8 @@ nd_btt-y := btt.o nd_blk-y := blk.o +nd_e820-y := e820.o + libnvdimm-y := core.o libnvdimm-y += bus.o libnvdimm-y += dimm_devs.o diff --git a/drivers/nvdimm/e820.c b/drivers/nvdimm/e820.c new file mode 100644 index 000000000000..1b5743ad92db --- /dev/null +++ b/drivers/nvdimm/e820.c @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2015, Christoph Hellwig. + * Copyright (c) 2015, Intel Corporation. + */ +#include +#include +#include + +static const struct attribute_group *e820_pmem_attribute_groups[] = { + &nvdimm_bus_attribute_group, + NULL, +}; + +static const struct attribute_group *e820_pmem_region_attribute_groups[] = { + &nd_region_attribute_group, + &nd_device_attribute_group, + NULL, +}; + +static int e820_pmem_remove(struct platform_device *pdev) +{ + struct nvdimm_bus *nvdimm_bus = platform_get_drvdata(pdev); + + nvdimm_bus_unregister(nvdimm_bus); + return 0; +} + +static int e820_pmem_probe(struct platform_device *pdev) +{ + static struct nvdimm_bus_descriptor nd_desc; + struct device *dev = &pdev->dev; + struct nvdimm_bus *nvdimm_bus; + struct resource *p; + + nd_desc.attr_groups = e820_pmem_attribute_groups; + nd_desc.provider_name = "e820"; + nvdimm_bus = nvdimm_bus_register(dev, &nd_desc); + if (!nvdimm_bus) + goto err; + platform_set_drvdata(pdev, nvdimm_bus); + + for (p = iomem_resource.child; p ; p = p->sibling) { + struct nd_region_desc ndr_desc; + + if (strncmp(p->name, "Persistent Memory (legacy)", 26) != 0) + continue; + + memset(&ndr_desc, 0, sizeof(ndr_desc)); + ndr_desc.res = p; + ndr_desc.attr_groups = e820_pmem_region_attribute_groups; + ndr_desc.numa_node = NUMA_NO_NODE; + if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc)) + goto err; + } + + return 0; + + err: + nvdimm_bus_unregister(nvdimm_bus); + dev_err(dev, "failed to register legacy persistent memory ranges\n"); + return -ENXIO; +} + +static struct platform_driver e820_pmem_driver = { + .probe = e820_pmem_probe, + .remove = e820_pmem_remove, + .driver = { + .name = "e820_pmem", + }, +}; + +static __init int e820_pmem_init(void) +{ + return platform_driver_register(&e820_pmem_driver); +} + +static __exit void e820_pmem_exit(void) +{ + platform_driver_unregister(&e820_pmem_driver); +} + +MODULE_ALIAS("platform:e820_pmem*"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Intel Corporation"); +module_init(e820_pmem_init); +module_exit(e820_pmem_exit); diff --git a/tools/testing/nvdimm/Kbuild b/tools/testing/nvdimm/Kbuild index f56914c7929b..d7c136a96346 100644 --- a/tools/testing/nvdimm/Kbuild +++ b/tools/testing/nvdimm/Kbuild @@ -15,6 +15,7 @@ obj-$(CONFIG_LIBNVDIMM) += libnvdimm.o obj-$(CONFIG_BLK_DEV_PMEM) += nd_pmem.o obj-$(CONFIG_ND_BTT) += nd_btt.o obj-$(CONFIG_ND_BLK) += nd_blk.o +obj-$(CONFIG_X86_PMEM_LEGACY) += nd_e820.o obj-$(CONFIG_ACPI_NFIT) += nfit.o nfit-y := $(ACPI_SRC)/nfit.o @@ -29,6 +30,9 @@ nd_btt-y += config_check.o nd_blk-y := $(NVDIMM_SRC)/blk.o nd_blk-y += config_check.o +nd_e820-y := $(NVDIMM_SRC)/e820.o +nd_e820-y += config_check.o + libnvdimm-y := $(NVDIMM_SRC)/core.o libnvdimm-y += $(NVDIMM_SRC)/bus.o libnvdimm-y += $(NVDIMM_SRC)/dimm_devs.o -- cgit v1.2.3-58-ga151 From 40603526569b304dd92f720f2f8ab11e828ea145 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Tue, 18 Aug 2015 13:55:36 -0600 Subject: pmem, x86: move x86 PMEM API to new pmem.h header Move the x86 PMEM API implementation out of asm/cacheflush.h and into its own header asm/pmem.h. This will allow members of the PMEM API to be more easily identified on this and other architectures. Signed-off-by: Ross Zwisler Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Signed-off-by: Dan Williams --- MAINTAINERS | 1 + arch/x86/include/asm/cacheflush.h | 71 ------------------------------ arch/x86/include/asm/pmem.h | 92 +++++++++++++++++++++++++++++++++++++++ include/linux/pmem.h | 2 +- 4 files changed, 94 insertions(+), 72 deletions(-) create mode 100644 arch/x86/include/asm/pmem.h (limited to 'arch/x86') diff --git a/MAINTAINERS b/MAINTAINERS index 9289ecb57b68..8fcde3717ab7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6161,6 +6161,7 @@ Q: https://patchwork.kernel.org/project/linux-nvdimm/list/ S: Supported F: drivers/nvdimm/pmem.c F: include/linux/pmem.h +F: arch/*/include/asm/pmem.h LINUX FOR IBM pSERIES (RS/6000) M: Paul Mackerras diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index 9bf3ea14b9f0..471418ac1ff9 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -109,75 +109,4 @@ static inline int rodata_test(void) } #endif -#ifdef ARCH_HAS_NOCACHE_UACCESS - -/** - * arch_memcpy_to_pmem - copy data to persistent memory - * @dst: destination buffer for the copy - * @src: source buffer for the copy - * @n: length of the copy in bytes - * - * Copy data to persistent memory media via non-temporal stores so that - * a subsequent arch_wmb_pmem() can flush cpu and memory controller - * write buffers to guarantee durability. - */ -static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src, - size_t n) -{ - int unwritten; - - /* - * We are copying between two kernel buffers, if - * __copy_from_user_inatomic_nocache() returns an error (page - * fault) we would have already reported a general protection fault - * before the WARN+BUG. - */ - unwritten = __copy_from_user_inatomic_nocache((void __force *) dst, - (void __user *) src, n); - if (WARN(unwritten, "%s: fault copying %p <- %p unwritten: %d\n", - __func__, dst, src, unwritten)) - BUG(); -} - -/** - * arch_wmb_pmem - synchronize writes to persistent memory - * - * After a series of arch_memcpy_to_pmem() operations this drains data - * from cpu write buffers and any platform (memory controller) buffers - * to ensure that written data is durable on persistent memory media. - */ -static inline void arch_wmb_pmem(void) -{ - /* - * wmb() to 'sfence' all previous writes such that they are - * architecturally visible to 'pcommit'. Note, that we've - * already arranged for pmem writes to avoid the cache via - * arch_memcpy_to_pmem(). - */ - wmb(); - pcommit_sfence(); -} - -static inline bool __arch_has_wmb_pmem(void) -{ -#ifdef CONFIG_X86_64 - /* - * We require that wmb() be an 'sfence', that is only guaranteed on - * 64-bit builds - */ - return static_cpu_has(X86_FEATURE_PCOMMIT); -#else - return false; -#endif -} -#else /* ARCH_HAS_NOCACHE_UACCESS i.e. ARCH=um */ -extern void arch_memcpy_to_pmem(void __pmem *dst, const void *src, size_t n); -extern void arch_wmb_pmem(void); - -static inline bool __arch_has_wmb_pmem(void) -{ - return false; -} -#endif - #endif /* _ASM_X86_CACHEFLUSH_H */ diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h new file mode 100644 index 000000000000..f43462cc91aa --- /dev/null +++ b/arch/x86/include/asm/pmem.h @@ -0,0 +1,92 @@ +/* + * Copyright(c) 2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#ifndef __ASM_X86_PMEM_H__ +#define __ASM_X86_PMEM_H__ + +#include +#include +#include +#include + +#ifdef ARCH_HAS_NOCACHE_UACCESS + +/** + * arch_memcpy_to_pmem - copy data to persistent memory + * @dst: destination buffer for the copy + * @src: source buffer for the copy + * @n: length of the copy in bytes + * + * Copy data to persistent memory media via non-temporal stores so that + * a subsequent arch_wmb_pmem() can flush cpu and memory controller + * write buffers to guarantee durability. + */ +static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src, + size_t n) +{ + int unwritten; + + /* + * We are copying between two kernel buffers, if + * __copy_from_user_inatomic_nocache() returns an error (page + * fault) we would have already reported a general protection fault + * before the WARN+BUG. + */ + unwritten = __copy_from_user_inatomic_nocache((void __force *) dst, + (void __user *) src, n); + if (WARN(unwritten, "%s: fault copying %p <- %p unwritten: %d\n", + __func__, dst, src, unwritten)) + BUG(); +} + +/** + * arch_wmb_pmem - synchronize writes to persistent memory + * + * After a series of arch_memcpy_to_pmem() operations this drains data + * from cpu write buffers and any platform (memory controller) buffers + * to ensure that written data is durable on persistent memory media. + */ +static inline void arch_wmb_pmem(void) +{ + /* + * wmb() to 'sfence' all previous writes such that they are + * architecturally visible to 'pcommit'. Note, that we've + * already arranged for pmem writes to avoid the cache via + * arch_memcpy_to_pmem(). + */ + wmb(); + pcommit_sfence(); +} + +static inline bool __arch_has_wmb_pmem(void) +{ +#ifdef CONFIG_X86_64 + /* + * We require that wmb() be an 'sfence', that is only guaranteed on + * 64-bit builds + */ + return static_cpu_has(X86_FEATURE_PCOMMIT); +#else + return false; +#endif +} +#else /* ARCH_HAS_NOCACHE_UACCESS i.e. ARCH=um */ +extern void arch_memcpy_to_pmem(void __pmem *dst, const void *src, size_t n); +extern void arch_wmb_pmem(void); + +static inline bool __arch_has_wmb_pmem(void) +{ + return false; +} +#endif + +#endif /* __ASM_X86_PMEM_H__ */ diff --git a/include/linux/pmem.h b/include/linux/pmem.h index 20c367cd76e6..c2af613ec297 100644 --- a/include/linux/pmem.h +++ b/include/linux/pmem.h @@ -16,7 +16,7 @@ #include #ifdef CONFIG_ARCH_HAS_PMEM_API -#include +#include #else static inline void arch_wmb_pmem(void) { -- cgit v1.2.3-58-ga151 From 18279b467a9d89afe44afbc19d768e834dbf4545 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Tue, 18 Aug 2015 13:55:37 -0600 Subject: pmem: remove layer when calling arch_has_wmb_pmem() Prior to this change arch_has_wmb_pmem() was only called by arch_has_pmem_api(). Both arch_has_wmb_pmem() and arch_has_pmem_api() checked to make sure that CONFIG_ARCH_HAS_PMEM_API was enabled. Instead, remove the old arch_has_wmb_pmem() wrapper to be rid of one extra layer of indirection and the redundant CONFIG_ARCH_HAS_PMEM_API check. Rename __arch_has_wmb_pmem() to arch_has_wmb_pmem() since we no longer have a wrapper, and just have arch_has_pmem_api() call the architecture specific arch_has_wmb_pmem() directly. Signed-off-by: Ross Zwisler Reviewed-by: Christoph Hellwig Signed-off-by: Dan Williams --- arch/x86/include/asm/pmem.h | 2 +- include/linux/pmem.h | 13 +++---------- 2 files changed, 4 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h index f43462cc91aa..1e8dbb72d6ee 100644 --- a/arch/x86/include/asm/pmem.h +++ b/arch/x86/include/asm/pmem.h @@ -67,7 +67,7 @@ static inline void arch_wmb_pmem(void) pcommit_sfence(); } -static inline bool __arch_has_wmb_pmem(void) +static inline bool arch_has_wmb_pmem(void) { #ifdef CONFIG_X86_64 /* diff --git a/include/linux/pmem.h b/include/linux/pmem.h index c2af613ec297..a0706ea04efd 100644 --- a/include/linux/pmem.h +++ b/include/linux/pmem.h @@ -23,7 +23,7 @@ static inline void arch_wmb_pmem(void) BUG(); } -static inline bool __arch_has_wmb_pmem(void) +static inline bool arch_has_wmb_pmem(void) { return false; } @@ -38,7 +38,7 @@ static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src, /* * Architectures that define ARCH_HAS_PMEM_API must provide * implementations for arch_memcpy_to_pmem(), arch_wmb_pmem(), and - * __arch_has_wmb_pmem(). + * arch_has_wmb_pmem(). */ static inline void memcpy_from_pmem(void *dst, void __pmem const *src, size_t size) @@ -52,7 +52,7 @@ static inline void memunmap_pmem(struct device *dev, void __pmem *addr) } /** - * arch_has_wmb_pmem - true if wmb_pmem() ensures durability + * arch_has_pmem_api - true if wmb_pmem() ensures durability * * For a given cpu implementation within an architecture it is possible * that wmb_pmem() resolves to a nop. In the case this returns @@ -60,13 +60,6 @@ static inline void memunmap_pmem(struct device *dev, void __pmem *addr) * fall back to a different data consistency model, or otherwise notify * the user. */ -static inline bool arch_has_wmb_pmem(void) -{ - if (IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API)) - return __arch_has_wmb_pmem(); - return false; -} - static inline bool arch_has_pmem_api(void) { return IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && arch_has_wmb_pmem(); -- cgit v1.2.3-58-ga151 From 4a370df5534ef727cba9a9d74bf22e0609f91d6e Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Tue, 18 Aug 2015 13:55:38 -0600 Subject: pmem, x86: clean up conditional pmem includes Prior to this change x86_64 used the pmem defines in arch/x86/include/asm/pmem.h, and UM used the default ones at the top of include/linux/pmem.h. The inclusion or exclusion in linux/pmem.h was controlled by CONFIG_ARCH_HAS_PMEM_API, but the ones in asm/pmem.h were controlled by ARCH_HAS_NOCACHE_UACCESS. Instead, control them both with CONFIG_ARCH_HAS_PMEM_API so that it's clear that they are related and we don't run into the possibility where they are both included or excluded. Also remove a bunch of stale function prototypes meant for UM in asm/pmem.h - these just conflicted with the inline defaults in linux/pmem.h and gave compile errors. Signed-off-by: Ross Zwisler Reviewed-by: Christoph Hellwig Signed-off-by: Dan Williams --- arch/x86/include/asm/pmem.h | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h index 1e8dbb72d6ee..7f3413fce46c 100644 --- a/arch/x86/include/asm/pmem.h +++ b/arch/x86/include/asm/pmem.h @@ -18,8 +18,7 @@ #include #include -#ifdef ARCH_HAS_NOCACHE_UACCESS - +#ifdef CONFIG_ARCH_HAS_PMEM_API /** * arch_memcpy_to_pmem - copy data to persistent memory * @dst: destination buffer for the copy @@ -79,14 +78,6 @@ static inline bool arch_has_wmb_pmem(void) return false; #endif } -#else /* ARCH_HAS_NOCACHE_UACCESS i.e. ARCH=um */ -extern void arch_memcpy_to_pmem(void __pmem *dst, const void *src, size_t n); -extern void arch_wmb_pmem(void); - -static inline bool __arch_has_wmb_pmem(void) -{ - return false; -} -#endif +#endif /* CONFIG_ARCH_HAS_PMEM_API */ #endif /* __ASM_X86_PMEM_H__ */ -- cgit v1.2.3-58-ga151 From 5de490daec8b6354b90d5c9d3e2415b195f5adb6 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Tue, 18 Aug 2015 13:55:39 -0600 Subject: pmem: add copy_from_iter_pmem() and clear_pmem() Add support for two new PMEM APIs, copy_from_iter_pmem() and clear_pmem(). copy_from_iter_pmem() is used to copy data from an iterator into a PMEM buffer. clear_pmem() zeros a PMEM memory range. Both of these new APIs must be explicitly ordered using a wmb_pmem() function call and are implemented in such a way that the wmb_pmem() will make the stores to PMEM durable. Because both APIs are unordered they can be called as needed without introducing any unwanted memory barriers. Signed-off-by: Ross Zwisler Signed-off-by: Dan Williams --- arch/x86/include/asm/pmem.h | 75 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/pmem.h | 64 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 137 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h index 7f3413fce46c..a3a0df6545ee 100644 --- a/arch/x86/include/asm/pmem.h +++ b/arch/x86/include/asm/pmem.h @@ -66,6 +66,81 @@ static inline void arch_wmb_pmem(void) pcommit_sfence(); } +/** + * __arch_wb_cache_pmem - write back a cache range with CLWB + * @vaddr: virtual start address + * @size: number of bytes to write back + * + * Write back a cache range using the CLWB (cache line write back) + * instruction. This function requires explicit ordering with an + * arch_wmb_pmem() call. This API is internal to the x86 PMEM implementation. + */ +static inline void __arch_wb_cache_pmem(void *vaddr, size_t size) +{ + u16 x86_clflush_size = boot_cpu_data.x86_clflush_size; + unsigned long clflush_mask = x86_clflush_size - 1; + void *vend = vaddr + size; + void *p; + + for (p = (void *)((unsigned long)vaddr & ~clflush_mask); + p < vend; p += x86_clflush_size) + clwb(p); +} + +/* + * copy_from_iter_nocache() on x86 only uses non-temporal stores for iovec + * iterators, so for other types (bvec & kvec) we must do a cache write-back. + */ +static inline bool __iter_needs_pmem_wb(struct iov_iter *i) +{ + return iter_is_iovec(i) == false; +} + +/** + * arch_copy_from_iter_pmem - copy data from an iterator to PMEM + * @addr: PMEM destination address + * @bytes: number of bytes to copy + * @i: iterator with source data + * + * Copy data from the iterator 'i' to the PMEM buffer starting at 'addr'. + * This function requires explicit ordering with an arch_wmb_pmem() call. + */ +static inline size_t arch_copy_from_iter_pmem(void __pmem *addr, size_t bytes, + struct iov_iter *i) +{ + void *vaddr = (void __force *)addr; + size_t len; + + /* TODO: skip the write-back by always using non-temporal stores */ + len = copy_from_iter_nocache(vaddr, bytes, i); + + if (__iter_needs_pmem_wb(i)) + __arch_wb_cache_pmem(vaddr, bytes); + + return len; +} + +/** + * arch_clear_pmem - zero a PMEM memory range + * @addr: virtual start address + * @size: number of bytes to zero + * + * Write zeros into the memory range starting at 'addr' for 'size' bytes. + * This function requires explicit ordering with an arch_wmb_pmem() call. + */ +static inline void arch_clear_pmem(void __pmem *addr, size_t size) +{ + void *vaddr = (void __force *)addr; + + /* TODO: implement the zeroing via non-temporal writes */ + if (size == PAGE_SIZE && ((unsigned long)vaddr & ~PAGE_MASK) == 0) + clear_page(vaddr); + else + memset(vaddr, 0, size); + + __arch_wb_cache_pmem(vaddr, size); +} + static inline bool arch_has_wmb_pmem(void) { #ifdef CONFIG_X86_64 diff --git a/include/linux/pmem.h b/include/linux/pmem.h index a0706ea04efd..a9d84bf335ee 100644 --- a/include/linux/pmem.h +++ b/include/linux/pmem.h @@ -14,6 +14,7 @@ #define __PMEM_H__ #include +#include #ifdef CONFIG_ARCH_HAS_PMEM_API #include @@ -33,12 +34,24 @@ static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src, { BUG(); } + +static inline size_t arch_copy_from_iter_pmem(void __pmem *addr, size_t bytes, + struct iov_iter *i) +{ + BUG(); + return 0; +} + +static inline void arch_clear_pmem(void __pmem *addr, size_t size) +{ + BUG(); +} #endif /* * Architectures that define ARCH_HAS_PMEM_API must provide - * implementations for arch_memcpy_to_pmem(), arch_wmb_pmem(), and - * arch_has_wmb_pmem(). + * implementations for arch_memcpy_to_pmem(), arch_wmb_pmem(), + * arch_copy_from_iter_pmem(), arch_clear_pmem() and arch_has_wmb_pmem(). */ static inline void memcpy_from_pmem(void *dst, void __pmem const *src, size_t size) @@ -78,6 +91,20 @@ static inline void default_memcpy_to_pmem(void __pmem *dst, const void *src, memcpy((void __force *) dst, src, size); } +static inline size_t default_copy_from_iter_pmem(void __pmem *addr, + size_t bytes, struct iov_iter *i) +{ + return copy_from_iter_nocache((void __force *)addr, bytes, i); +} + +static inline void default_clear_pmem(void __pmem *addr, size_t size) +{ + if (size == PAGE_SIZE && ((unsigned long)addr & ~PAGE_MASK) == 0) + clear_page((void __force *)addr); + else + memset((void __force *)addr, 0, size); +} + /** * memremap_pmem - map physical persistent memory for pmem api * @offset: physical address of persistent memory @@ -134,4 +161,37 @@ static inline void wmb_pmem(void) if (arch_has_pmem_api()) arch_wmb_pmem(); } + +/** + * copy_from_iter_pmem - copy data from an iterator to PMEM + * @addr: PMEM destination address + * @bytes: number of bytes to copy + * @i: iterator with source data + * + * Copy data from the iterator 'i' to the PMEM buffer starting at 'addr'. + * This function requires explicit ordering with a wmb_pmem() call. + */ +static inline size_t copy_from_iter_pmem(void __pmem *addr, size_t bytes, + struct iov_iter *i) +{ + if (arch_has_pmem_api()) + return arch_copy_from_iter_pmem(addr, bytes, i); + return default_copy_from_iter_pmem(addr, bytes, i); +} + +/** + * clear_pmem - zero a PMEM memory range + * @addr: virtual start address + * @size: number of bytes to zero + * + * Write zeros into the memory range starting at 'addr' for 'size' bytes. + * This function requires explicit ordering with a wmb_pmem() call. + */ +static inline void clear_pmem(void __pmem *addr, size_t size) +{ + if (arch_has_pmem_api()) + arch_clear_pmem(addr, size); + else + default_clear_pmem(addr, size); +} #endif /* __PMEM_H__ */ -- cgit v1.2.3-58-ga151 From 67a3e8fe90156d41cd480d3dfbb40f3bc007c262 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Thu, 27 Aug 2015 13:14:20 -0600 Subject: nd_blk: change aperture mapping from WC to WB This should result in a pretty sizeable performance gain for reads. For rough comparison I did some simple read testing using PMEM to compare reads of write combining (WC) mappings vs write-back (WB). This was done on a random lab machine. PMEM reads from a write combining mapping: # dd of=/dev/null if=/dev/pmem0 bs=4096 count=100000 100000+0 records in 100000+0 records out 409600000 bytes (410 MB) copied, 9.2855 s, 44.1 MB/s PMEM reads from a write-back mapping: # dd of=/dev/null if=/dev/pmem0 bs=4096 count=1000000 1000000+0 records in 1000000+0 records out 4096000000 bytes (4.1 GB) copied, 3.44034 s, 1.2 GB/s To be able to safely support a write-back aperture I needed to add support for the "read flush" _DSM flag, as outlined in the DSM spec: http://pmem.io/documents/NVDIMM_DSM_Interface_Example.pdf This flag tells the ND BLK driver that it needs to flush the cache lines associated with the aperture after the aperture is moved but before any new data is read. This ensures that any stale cache lines from the previous contents of the aperture will be discarded from the processor cache, and the new data will be read properly from the DIMM. We know that the cache lines are clean and will be discarded without any writeback because either a) the previous aperture operation was a read, and we never modified the contents of the aperture, or b) the previous aperture operation was a write and we must have written back the dirtied contents of the aperture to the DIMM before the I/O was completed. In order to add support for the "read flush" flag I needed to add a generic routine to invalidate cache lines, mmio_flush_range(). This is protected by the ARCH_HAS_MMIO_FLUSH Kconfig variable, and is currently only supported on x86. Signed-off-by: Ross Zwisler Signed-off-by: Dan Williams --- arch/x86/Kconfig | 1 + arch/x86/include/asm/cacheflush.h | 2 ++ arch/x86/include/asm/io.h | 2 -- arch/x86/include/asm/pmem.h | 2 ++ drivers/acpi/Kconfig | 1 + drivers/acpi/nfit.c | 55 ++++++++++++++++++++++----------------- drivers/acpi/nfit.h | 16 ++++++++---- lib/Kconfig | 3 +++ tools/testing/nvdimm/Kbuild | 2 ++ tools/testing/nvdimm/test/iomap.c | 30 +++++++++++++++++++-- tools/testing/nvdimm/test/nfit.c | 10 ++++--- 11 files changed, 88 insertions(+), 36 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b3a1a5d77d92..5d4980e6bc4f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -28,6 +28,7 @@ config X86 select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_PMEM_API + select ARCH_HAS_MMIO_FLUSH select ARCH_HAS_SG_CHAIN select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index 471418ac1ff9..e63aa38e85fb 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -89,6 +89,8 @@ int set_pages_rw(struct page *page, int numpages); void clflush_cache_range(void *addr, unsigned int size); +#define mmio_flush_range(addr, size) clflush_cache_range(addr, size) + #ifdef CONFIG_DEBUG_RODATA void mark_rodata_ro(void); extern const int rodata_test_data; diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index d241fbd5c87b..83ec9b1d77cc 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -248,8 +248,6 @@ static inline void flush_write_buffers(void) #endif } -#define ARCH_MEMREMAP_PMEM MEMREMAP_WB - #endif /* __KERNEL__ */ extern void native_io_delay(void); diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h index a3a0df6545ee..bb026c5adf8a 100644 --- a/arch/x86/include/asm/pmem.h +++ b/arch/x86/include/asm/pmem.h @@ -18,6 +18,8 @@ #include #include +#define ARCH_MEMREMAP_PMEM MEMREMAP_WB + #ifdef CONFIG_ARCH_HAS_PMEM_API /** * arch_memcpy_to_pmem - copy data to persistent memory diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index 114cf48085ab..4baeb853e0c3 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -410,6 +410,7 @@ config ACPI_NFIT tristate "ACPI NVDIMM Firmware Interface Table (NFIT)" depends on PHYS_ADDR_T_64BIT depends on BLK_DEV + depends on ARCH_HAS_MMIO_FLUSH select LIBNVDIMM help Infrastructure to probe ACPI 6 compliant platforms for diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c index 7c2638f914a9..56fff0141636 100644 --- a/drivers/acpi/nfit.c +++ b/drivers/acpi/nfit.c @@ -1017,7 +1017,7 @@ static u64 read_blk_stat(struct nfit_blk *nfit_blk, unsigned int bw) if (mmio->num_lines) offset = to_interleave_offset(offset, mmio); - return readq(mmio->base + offset); + return readq(mmio->addr.base + offset); } static void write_blk_ctl(struct nfit_blk *nfit_blk, unsigned int bw, @@ -1042,11 +1042,11 @@ static void write_blk_ctl(struct nfit_blk *nfit_blk, unsigned int bw, if (mmio->num_lines) offset = to_interleave_offset(offset, mmio); - writeq(cmd, mmio->base + offset); + writeq(cmd, mmio->addr.base + offset); wmb_blk(nfit_blk); if (nfit_blk->dimm_flags & ND_BLK_DCR_LATCH) - readq(mmio->base + offset); + readq(mmio->addr.base + offset); } static int acpi_nfit_blk_single_io(struct nfit_blk *nfit_blk, @@ -1078,11 +1078,16 @@ static int acpi_nfit_blk_single_io(struct nfit_blk *nfit_blk, } if (rw) - memcpy_to_pmem(mmio->aperture + offset, + memcpy_to_pmem(mmio->addr.aperture + offset, iobuf + copied, c); - else + else { + if (nfit_blk->dimm_flags & ND_BLK_READ_FLUSH) + mmio_flush_range((void __force *) + mmio->addr.aperture + offset, c); + memcpy_from_pmem(iobuf + copied, - mmio->aperture + offset, c); + mmio->addr.aperture + offset, c); + } copied += c; len -= c; @@ -1129,7 +1134,10 @@ static void nfit_spa_mapping_release(struct kref *kref) WARN_ON(!mutex_is_locked(&acpi_desc->spa_map_mutex)); dev_dbg(acpi_desc->dev, "%s: SPA%d\n", __func__, spa->range_index); - iounmap(spa_map->iomem); + if (spa_map->type == SPA_MAP_APERTURE) + memunmap((void __force *)spa_map->addr.aperture); + else + iounmap(spa_map->addr.base); release_mem_region(spa->address, spa->length); list_del(&spa_map->list); kfree(spa_map); @@ -1175,7 +1183,7 @@ static void __iomem *__nfit_spa_map(struct acpi_nfit_desc *acpi_desc, spa_map = find_spa_mapping(acpi_desc, spa); if (spa_map) { kref_get(&spa_map->kref); - return spa_map->iomem; + return spa_map->addr.base; } spa_map = kzalloc(sizeof(*spa_map), GFP_KERNEL); @@ -1191,20 +1199,19 @@ static void __iomem *__nfit_spa_map(struct acpi_nfit_desc *acpi_desc, if (!res) goto err_mem; - if (type == SPA_MAP_APERTURE) { - /* - * TODO: memremap_pmem() support, but that requires cache - * flushing when the aperture is moved. - */ - spa_map->iomem = ioremap_wc(start, n); - } else - spa_map->iomem = ioremap_nocache(start, n); + spa_map->type = type; + if (type == SPA_MAP_APERTURE) + spa_map->addr.aperture = (void __pmem *)memremap(start, n, + ARCH_MEMREMAP_PMEM); + else + spa_map->addr.base = ioremap_nocache(start, n); + - if (!spa_map->iomem) + if (!spa_map->addr.base) goto err_map; list_add_tail(&spa_map->list, &acpi_desc->spa_maps); - return spa_map->iomem; + return spa_map->addr.base; err_map: release_mem_region(start, n); @@ -1267,7 +1274,7 @@ static int acpi_nfit_blk_get_flags(struct nvdimm_bus_descriptor *nd_desc, nfit_blk->dimm_flags = flags.flags; else if (rc == -ENOTTY) { /* fall back to a conservative default */ - nfit_blk->dimm_flags = ND_BLK_DCR_LATCH; + nfit_blk->dimm_flags = ND_BLK_DCR_LATCH | ND_BLK_READ_FLUSH; rc = 0; } else rc = -ENXIO; @@ -1307,9 +1314,9 @@ static int acpi_nfit_blk_region_enable(struct nvdimm_bus *nvdimm_bus, /* map block aperture memory */ nfit_blk->bdw_offset = nfit_mem->bdw->offset; mmio = &nfit_blk->mmio[BDW]; - mmio->base = nfit_spa_map(acpi_desc, nfit_mem->spa_bdw, + mmio->addr.base = nfit_spa_map(acpi_desc, nfit_mem->spa_bdw, SPA_MAP_APERTURE); - if (!mmio->base) { + if (!mmio->addr.base) { dev_dbg(dev, "%s: %s failed to map bdw\n", __func__, nvdimm_name(nvdimm)); return -ENOMEM; @@ -1330,9 +1337,9 @@ static int acpi_nfit_blk_region_enable(struct nvdimm_bus *nvdimm_bus, nfit_blk->cmd_offset = nfit_mem->dcr->command_offset; nfit_blk->stat_offset = nfit_mem->dcr->status_offset; mmio = &nfit_blk->mmio[DCR]; - mmio->base = nfit_spa_map(acpi_desc, nfit_mem->spa_dcr, + mmio->addr.base = nfit_spa_map(acpi_desc, nfit_mem->spa_dcr, SPA_MAP_CONTROL); - if (!mmio->base) { + if (!mmio->addr.base) { dev_dbg(dev, "%s: %s failed to map dcr\n", __func__, nvdimm_name(nvdimm)); return -ENOMEM; @@ -1399,7 +1406,7 @@ static void acpi_nfit_blk_region_disable(struct nvdimm_bus *nvdimm_bus, for (i = 0; i < 2; i++) { struct nfit_blk_mmio *mmio = &nfit_blk->mmio[i]; - if (mmio->base) + if (mmio->addr.base) nfit_spa_unmap(acpi_desc, mmio->spa); } nd_blk_region_set_provider_data(ndbr, NULL); diff --git a/drivers/acpi/nfit.h b/drivers/acpi/nfit.h index f2c2bb751882..7e740156b9c2 100644 --- a/drivers/acpi/nfit.h +++ b/drivers/acpi/nfit.h @@ -41,6 +41,7 @@ enum nfit_uuids { }; enum { + ND_BLK_READ_FLUSH = 1, ND_BLK_DCR_LATCH = 2, }; @@ -117,12 +118,16 @@ enum nd_blk_mmio_selector { DCR, }; +struct nd_blk_addr { + union { + void __iomem *base; + void __pmem *aperture; + }; +}; + struct nfit_blk { struct nfit_blk_mmio { - union { - void __iomem *base; - void __pmem *aperture; - }; + struct nd_blk_addr addr; u64 size; u64 base_offset; u32 line_size; @@ -149,7 +154,8 @@ struct nfit_spa_mapping { struct acpi_nfit_system_address *spa; struct list_head list; struct kref kref; - void __iomem *iomem; + enum spa_map_type type; + struct nd_blk_addr addr; }; static inline struct nfit_spa_mapping *to_spa_map(struct kref *kref) diff --git a/lib/Kconfig b/lib/Kconfig index 3a2ef67db6c7..a938a39191b3 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -531,4 +531,7 @@ config ARCH_HAS_SG_CHAIN config ARCH_HAS_PMEM_API bool +config ARCH_HAS_MMIO_FLUSH + bool + endmenu diff --git a/tools/testing/nvdimm/Kbuild b/tools/testing/nvdimm/Kbuild index 04c5fc09576d..94a5e0eda2d2 100644 --- a/tools/testing/nvdimm/Kbuild +++ b/tools/testing/nvdimm/Kbuild @@ -1,8 +1,10 @@ ldflags-y += --wrap=ioremap_wc +ldflags-y += --wrap=memremap ldflags-y += --wrap=devm_ioremap_nocache ldflags-y += --wrap=devm_memremap ldflags-y += --wrap=ioremap_nocache ldflags-y += --wrap=iounmap +ldflags-y += --wrap=memunmap ldflags-y += --wrap=__devm_request_region ldflags-y += --wrap=__request_region ldflags-y += --wrap=__release_region diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c index ff1e00458864..179d2289f3a8 100644 --- a/tools/testing/nvdimm/test/iomap.c +++ b/tools/testing/nvdimm/test/iomap.c @@ -89,12 +89,25 @@ void *__wrap_devm_memremap(struct device *dev, resource_size_t offset, nfit_res = get_nfit_res(offset); rcu_read_unlock(); if (nfit_res) - return (void __iomem *) nfit_res->buf + offset - - nfit_res->res->start; + return nfit_res->buf + offset - nfit_res->res->start; return devm_memremap(dev, offset, size, flags); } EXPORT_SYMBOL(__wrap_devm_memremap); +void *__wrap_memremap(resource_size_t offset, size_t size, + unsigned long flags) +{ + struct nfit_test_resource *nfit_res; + + rcu_read_lock(); + nfit_res = get_nfit_res(offset); + rcu_read_unlock(); + if (nfit_res) + return nfit_res->buf + offset - nfit_res->res->start; + return memremap(offset, size, flags); +} +EXPORT_SYMBOL(__wrap_memremap); + void __iomem *__wrap_ioremap_nocache(resource_size_t offset, unsigned long size) { return __nfit_test_ioremap(offset, size, ioremap_nocache); @@ -120,6 +133,19 @@ void __wrap_iounmap(volatile void __iomem *addr) } EXPORT_SYMBOL(__wrap_iounmap); +void __wrap_memunmap(void *addr) +{ + struct nfit_test_resource *nfit_res; + + rcu_read_lock(); + nfit_res = get_nfit_res((unsigned long) addr); + rcu_read_unlock(); + if (nfit_res) + return; + return memunmap(addr); +} +EXPORT_SYMBOL(__wrap_memunmap); + static struct resource *nfit_test_request_region(struct device *dev, struct resource *parent, resource_size_t start, resource_size_t n, const char *name, int flags) diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c index 28dba918524e..021e6f97f33e 100644 --- a/tools/testing/nvdimm/test/nfit.c +++ b/tools/testing/nvdimm/test/nfit.c @@ -1029,9 +1029,13 @@ static int nfit_test_blk_do_io(struct nd_blk_region *ndbr, resource_size_t dpa, lane = nd_region_acquire_lane(nd_region); if (rw) - memcpy(mmio->base + dpa, iobuf, len); - else - memcpy(iobuf, mmio->base + dpa, len); + memcpy(mmio->addr.base + dpa, iobuf, len); + else { + memcpy(iobuf, mmio->addr.base + dpa, len); + + /* give us some some coverage of the mmio_flush_range() API */ + mmio_flush_range(mmio->addr.base + dpa, len); + } nd_region_release_lane(nd_region, lane); return 0; -- cgit v1.2.3-58-ga151 From 033fbae988fcb67e5077203512181890848b8e90 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 9 Aug 2015 15:29:06 -0400 Subject: mm: ZONE_DEVICE for "device memory" While pmem is usable as a block device or via DAX mappings to userspace there are several usage scenarios that can not target pmem due to its lack of struct page coverage. In preparation for "hot plugging" pmem into the vmemmap add ZONE_DEVICE as a new zone to tag these pages separately from the ones that are subject to standard page allocations. Importantly "device memory" can be removed at will by userspace unbinding the driver of the device. Having a separate zone prevents allocation and otherwise marks these pages that are distinct from typical uniform memory. Device memory has different lifetime and performance characteristics than RAM. However, since we have run out of ZONES_SHIFT bits this functionality currently depends on sacrificing ZONE_DMA. Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Dave Hansen Cc: Rik van Riel Cc: Mel Gorman Cc: Jerome Glisse [hch: various simplifications in the arch interface] Signed-off-by: Christoph Hellwig Signed-off-by: Dan Williams --- arch/ia64/mm/init.c | 4 ++-- arch/powerpc/mm/mem.c | 4 ++-- arch/s390/mm/init.c | 2 +- arch/sh/mm/init.c | 5 +++-- arch/tile/mm/init.c | 2 +- arch/x86/mm/init_32.c | 4 ++-- arch/x86/mm/init_64.c | 4 ++-- include/linux/memory_hotplug.h | 5 +++-- include/linux/mmzone.h | 23 +++++++++++++++++++++++ mm/Kconfig | 17 +++++++++++++++++ mm/memory_hotplug.c | 14 +++++++++++--- mm/page_alloc.c | 3 +++ 12 files changed, 70 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 97e48b0eefc7..1841ef69183d 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -645,7 +645,7 @@ mem_init (void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size) +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { pg_data_t *pgdat; struct zone *zone; @@ -656,7 +656,7 @@ int arch_add_memory(int nid, u64 start, u64 size) pgdat = NODE_DATA(nid); zone = pgdat->node_zones + - zone_for_memory(nid, start, size, ZONE_NORMAL); + zone_for_memory(nid, start, size, ZONE_NORMAL, for_device); ret = __add_pages(nid, zone, start_pfn, nr_pages); if (ret) diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 0f11819d8f1d..6571cfb05668 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -113,7 +113,7 @@ int memory_add_physaddr_to_nid(u64 start) } #endif -int arch_add_memory(int nid, u64 start, u64 size) +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { struct pglist_data *pgdata; struct zone *zone; @@ -128,7 +128,7 @@ int arch_add_memory(int nid, u64 start, u64 size) /* this should work for most non-highmem platforms */ zone = pgdata->node_zones + - zone_for_memory(nid, start, size, 0); + zone_for_memory(nid, start, size, 0, for_device); return __add_pages(nid, zone, start_pfn, nr_pages); } diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 76e873748b56..48ee78be88ba 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -168,7 +168,7 @@ void __init free_initrd_mem(unsigned long start, unsigned long end) #endif #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size) +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { unsigned long zone_start_pfn, zone_end_pfn, nr_pages; unsigned long start_pfn = PFN_DOWN(start); diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 2790b6a64157..c1490096b863 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -485,7 +485,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) #endif #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size) +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { pg_data_t *pgdat; unsigned long start_pfn = start >> PAGE_SHIFT; @@ -496,7 +496,8 @@ int arch_add_memory(int nid, u64 start, u64 size) /* We only have ZONE_NORMAL, so this is easy.. */ ret = __add_pages(nid, pgdat->node_zones + - zone_for_memory(nid, start, size, ZONE_NORMAL), + zone_for_memory(nid, start, size, ZONE_NORMAL, + for_device), start_pfn, nr_pages); if (unlikely(ret)) printk("%s: Failed, __add_pages() == %d\n", __func__, ret); diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c index 5bd252e3fdc5..d4e1fc41d06d 100644 --- a/arch/tile/mm/init.c +++ b/arch/tile/mm/init.c @@ -863,7 +863,7 @@ void __init mem_init(void) * memory to the highmem for now. */ #ifndef CONFIG_NEED_MULTIPLE_NODES -int arch_add_memory(u64 start, u64 size) +int arch_add_memory(u64 start, u64 size, bool for_device) { struct pglist_data *pgdata = &contig_page_data; struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 8340e45c891a..2a9237d20a70 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -822,11 +822,11 @@ void __init mem_init(void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size) +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { struct pglist_data *pgdata = NODE_DATA(nid); struct zone *zone = pgdata->node_zones + - zone_for_memory(nid, start, size, ZONE_HIGHMEM); + zone_for_memory(nid, start, size, ZONE_HIGHMEM, for_device); unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 3fba623e3ba5..30564e2752d3 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -687,11 +687,11 @@ static void update_end_of_memory_vars(u64 start, u64 size) * Memory is added always to NORMAL zone. This means you will never get * additional DMA/DMA32 memory. */ -int arch_add_memory(int nid, u64 start, u64 size) +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { struct pglist_data *pgdat = NODE_DATA(nid); struct zone *zone = pgdat->node_zones + - zone_for_memory(nid, start, size, ZONE_NORMAL); + zone_for_memory(nid, start, size, ZONE_NORMAL, for_device); unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 6ffa0ac7f7d6..8f60e899b33c 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -266,8 +266,9 @@ static inline void remove_memory(int nid, u64 start, u64 size) {} extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, void *arg, int (*func)(struct memory_block *, void *)); extern int add_memory(int nid, u64 start, u64 size); -extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default); -extern int arch_add_memory(int nid, u64 start, u64 size); +extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default, + bool for_device); +extern int arch_add_memory(int nid, u64 start, u64 size, bool for_device); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); extern bool is_memblock_offlined(struct memory_block *mem); extern void remove_memory(int nid, u64 start, u64 size); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 754c25966a0a..9217fd93c25b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -319,7 +319,11 @@ enum zone_type { ZONE_HIGHMEM, #endif ZONE_MOVABLE, +#ifdef CONFIG_ZONE_DEVICE + ZONE_DEVICE, +#endif __MAX_NR_ZONES + }; #ifndef __GENERATING_BOUNDS_H @@ -794,6 +798,25 @@ static inline bool pgdat_is_empty(pg_data_t *pgdat) return !pgdat->node_start_pfn && !pgdat->node_spanned_pages; } +static inline int zone_id(const struct zone *zone) +{ + struct pglist_data *pgdat = zone->zone_pgdat; + + return zone - pgdat->node_zones; +} + +#ifdef CONFIG_ZONE_DEVICE +static inline bool is_dev_zone(const struct zone *zone) +{ + return zone_id(zone) == ZONE_DEVICE; +} +#else +static inline bool is_dev_zone(const struct zone *zone) +{ + return false; +} +#endif + #include extern struct mutex zonelists_mutex; diff --git a/mm/Kconfig b/mm/Kconfig index e79de2bd12cd..a0cd086df16b 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -654,3 +654,20 @@ config DEFERRED_STRUCT_PAGE_INIT when kswapd starts. This has a potential performance impact on processes running early in the lifetime of the systemm until kswapd finishes the initialisation. + +config ZONE_DEVICE + bool "Device memory (pmem, etc...) hotplug support" if EXPERT + default !ZONE_DMA + depends on !ZONE_DMA + depends on MEMORY_HOTPLUG + depends on MEMORY_HOTREMOVE + depends on X86_64 #arch_add_memory() comprehends device memory + + help + Device memory hotplug support allows for establishing pmem, + or other device driver discovered memory regions, in the + memmap. This allows pfn_to_page() lookups of otherwise + "device-physical" addresses which is needed for using a DAX + mapping in an O_DIRECT operation, among other things. + + If FS_DAX is enabled, then say Y. diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 26fbba7d888f..24e4c76c951b 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -770,7 +770,10 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, start = phys_start_pfn << PAGE_SHIFT; size = nr_pages * PAGE_SIZE; - ret = release_mem_region_adjustable(&iomem_resource, start, size); + + /* in the ZONE_DEVICE case device driver owns the memory region */ + if (!is_dev_zone(zone)) + ret = release_mem_region_adjustable(&iomem_resource, start, size); if (ret) { resource_size_t endres = start + size - 1; @@ -1207,8 +1210,13 @@ static int should_add_memory_movable(int nid, u64 start, u64 size) return 0; } -int zone_for_memory(int nid, u64 start, u64 size, int zone_default) +int zone_for_memory(int nid, u64 start, u64 size, int zone_default, + bool for_device) { +#ifdef CONFIG_ZONE_DEVICE + if (for_device) + return ZONE_DEVICE; +#endif if (should_add_memory_movable(nid, start, size)) return ZONE_MOVABLE; @@ -1249,7 +1257,7 @@ int __ref add_memory(int nid, u64 start, u64 size) } /* call arch's memory hotadd */ - ret = arch_add_memory(nid, start, size); + ret = arch_add_memory(nid, start, size, false); if (ret < 0) goto error; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ef19f22b2b7d..0f19b4e18233 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -207,6 +207,9 @@ static char * const zone_names[MAX_NR_ZONES] = { "HighMem", #endif "Movable", +#ifdef CONFIG_ZONE_DEVICE + "Device", +#endif }; int min_free_kbytes = 1024; -- cgit v1.2.3-58-ga151 From 96601adb745186ccbcf5b078d4756f13381ec2af Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 24 Aug 2015 18:29:38 -0400 Subject: x86, pmem: clarify that ARCH_HAS_PMEM_API implies PMEM mapped WB Given that a write-back (WB) mapping plus non-temporal stores is expected to be the most efficient way to access PMEM, update the definition of ARCH_HAS_PMEM_API to imply arch support for WB-mapped-PMEM. This is needed as a pre-requisite for adding PMEM to the direct map and mapping it with struct page. The above clarification for X86_64 means that memcpy_to_pmem() is permitted to use the non-temporal arch_memcpy_to_pmem() rather than needlessly fall back to default_memcpy_to_pmem() when the pcommit instruction is not available. When arch_memcpy_to_pmem() is not guaranteed to flush writes out of cache, i.e. on older X86_32 implementations where non-temporal stores may just dirty cache, ARCH_HAS_PMEM_API is simply disabled. The default fall back for persistent memory handling remains. Namely, map it with the WT (write-through) cache-type and hope for the best. arch_has_pmem_api() is updated to only indicate whether the arch provides the proper helpers to meet the minimum "writes are visible outside the cache hierarchy after memcpy_to_pmem() + wmb_pmem()". Code that cares whether wmb_pmem() actually flushes writes to pmem must now call arch_has_wmb_pmem() directly. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Reviewed-by: Ross Zwisler [hch: set ARCH_HAS_PMEM_API=n on x86_32] Reviewed-by: Christoph Hellwig [toshi: x86_32 compile fixes] Signed-off-by: Toshi Kani Signed-off-by: Dan Williams --- arch/x86/Kconfig | 2 +- arch/x86/include/asm/pmem.h | 9 +-------- drivers/acpi/nfit.c | 3 ++- drivers/nvdimm/pmem.c | 2 +- include/linux/pmem.h | 36 ++++++++++++++++++++++-------------- 5 files changed, 27 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 03ab6122325a..ef4c6bbb3af1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -27,7 +27,7 @@ config X86 select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_GCOV_PROFILE_ALL - select ARCH_HAS_PMEM_API + select ARCH_HAS_PMEM_API if X86_64 select ARCH_HAS_MMIO_FLUSH select ARCH_HAS_SG_CHAIN select ARCH_HAVE_NMI_SAFE_CMPXCHG diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h index bb026c5adf8a..d8ce3ec816ab 100644 --- a/arch/x86/include/asm/pmem.h +++ b/arch/x86/include/asm/pmem.h @@ -18,8 +18,6 @@ #include #include -#define ARCH_MEMREMAP_PMEM MEMREMAP_WB - #ifdef CONFIG_ARCH_HAS_PMEM_API /** * arch_memcpy_to_pmem - copy data to persistent memory @@ -143,18 +141,13 @@ static inline void arch_clear_pmem(void __pmem *addr, size_t size) __arch_wb_cache_pmem(vaddr, size); } -static inline bool arch_has_wmb_pmem(void) +static inline bool __arch_has_wmb_pmem(void) { -#ifdef CONFIG_X86_64 /* * We require that wmb() be an 'sfence', that is only guaranteed on * 64-bit builds */ return static_cpu_has(X86_FEATURE_PCOMMIT); -#else - return false; -#endif } #endif /* CONFIG_ARCH_HAS_PMEM_API */ - #endif /* __ASM_X86_PMEM_H__ */ diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c index 56fff0141636..f61e69fa2ad1 100644 --- a/drivers/acpi/nfit.c +++ b/drivers/acpi/nfit.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "nfit.h" /* @@ -1371,7 +1372,7 @@ static int acpi_nfit_blk_region_enable(struct nvdimm_bus *nvdimm_bus, return -ENOMEM; } - if (!arch_has_pmem_api() && !nfit_blk->nvdimm_flush) + if (!arch_has_wmb_pmem() && !nfit_blk->nvdimm_flush) dev_warn(dev, "unable to guarantee persistence of writes\n"); if (mmio->line_size == 0) diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 3b5b9cb758b6..20bf122328da 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -125,7 +125,7 @@ static struct pmem_device *pmem_alloc(struct device *dev, pmem->phys_addr = res->start; pmem->size = resource_size(res); - if (!arch_has_pmem_api()) + if (!arch_has_wmb_pmem()) dev_warn(dev, "unable to guarantee persistence of writes\n"); if (!devm_request_mem_region(dev, pmem->phys_addr, pmem->size, diff --git a/include/linux/pmem.h b/include/linux/pmem.h index a9d84bf335ee..85f810b33917 100644 --- a/include/linux/pmem.h +++ b/include/linux/pmem.h @@ -17,16 +17,23 @@ #include #ifdef CONFIG_ARCH_HAS_PMEM_API +#define ARCH_MEMREMAP_PMEM MEMREMAP_WB #include #else -static inline void arch_wmb_pmem(void) +#define ARCH_MEMREMAP_PMEM MEMREMAP_WT +/* + * These are simply here to enable compilation, all call sites gate + * calling these symbols with arch_has_pmem_api() and redirect to the + * implementation in asm/pmem.h. + */ +static inline bool __arch_has_wmb_pmem(void) { - BUG(); + return false; } -static inline bool arch_has_wmb_pmem(void) +static inline void arch_wmb_pmem(void) { - return false; + BUG(); } static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src, @@ -53,7 +60,6 @@ static inline void arch_clear_pmem(void __pmem *addr, size_t size) * implementations for arch_memcpy_to_pmem(), arch_wmb_pmem(), * arch_copy_from_iter_pmem(), arch_clear_pmem() and arch_has_wmb_pmem(). */ - static inline void memcpy_from_pmem(void *dst, void __pmem const *src, size_t size) { memcpy(dst, (void __force const *) src, size); @@ -64,8 +70,13 @@ static inline void memunmap_pmem(struct device *dev, void __pmem *addr) devm_memunmap(dev, (void __force *) addr); } +static inline bool arch_has_pmem_api(void) +{ + return IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API); +} + /** - * arch_has_pmem_api - true if wmb_pmem() ensures durability + * arch_has_wmb_pmem - true if wmb_pmem() ensures durability * * For a given cpu implementation within an architecture it is possible * that wmb_pmem() resolves to a nop. In the case this returns @@ -73,9 +84,9 @@ static inline void memunmap_pmem(struct device *dev, void __pmem *addr) * fall back to a different data consistency model, or otherwise notify * the user. */ -static inline bool arch_has_pmem_api(void) +static inline bool arch_has_wmb_pmem(void) { - return IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && arch_has_wmb_pmem(); + return arch_has_pmem_api() && __arch_has_wmb_pmem(); } /* @@ -120,13 +131,8 @@ static inline void default_clear_pmem(void __pmem *addr, size_t size) static inline void __pmem *memremap_pmem(struct device *dev, resource_size_t offset, unsigned long size) { -#ifdef ARCH_MEMREMAP_PMEM return (void __pmem *) devm_memremap(dev, offset, size, ARCH_MEMREMAP_PMEM); -#else - return (void __pmem *) devm_memremap(dev, offset, size, - MEMREMAP_WT); -#endif } /** @@ -158,8 +164,10 @@ static inline void memcpy_to_pmem(void __pmem *dst, const void *src, size_t n) */ static inline void wmb_pmem(void) { - if (arch_has_pmem_api()) + if (arch_has_wmb_pmem()) arch_wmb_pmem(); + else + wmb(); } /** -- cgit v1.2.3-58-ga151