Merge tag 'cxl-for-6.0' of git://git.kernel.org/pub/scm/linux/kernel/git/cxl/cxl

Pull cxl updates from Dan Williams: "Compute Express Link (CXL) updates for 6.0: - Introduce a 'struct cxl_region' object with support for provisioning and assembling persistent memory regions. - Introduce alloc_free_mem_region() to accompany the existing request_free_mem_region() as a method to allocate physical memory capacity out of an existing resource. - Export insert_resource_expand_to_fit() for the CXL subsystem to late-publish CXL platform windows in iomem_resource. - Add a polled mode PCI DOE (Data Object Exchange) driver service and use it in cxl_pci to retrieve the CDAT (Coherent Device Attribute Table)" * tag 'cxl-for-6.0' of git://git.kernel.org/pub/scm/linux/kernel/git/cxl/cxl: (74 commits) cxl/hdm: Fix skip allocations vs multiple pmem allocations cxl/region: Disallow region granularity != window granularity cxl/region: Fix x1 interleave to greater than x1 interleave routing cxl/region: Move HPA setup to cxl_region_attach() cxl/region: Fix decoder interleave programming Documentation: cxl: remove dangling kernel-doc reference cxl/region: describe targets and nr_targets members of cxl_region_params cxl/regions: add padding for cxl_rr_ep_add nested lists cxl/region: Fix IS_ERR() vs NULL check cxl/region: Fix region reference target accounting cxl/region: Fix region commit uninitialized variable warning cxl/region: Fix port setup uninitialized variable warnings cxl/region: Stop initializing interleave granularity cxl/hdm: Fix DPA reservation vs cxl_endpoint_decoder lifetime cxl/acpi: Minimize granularity for x1 interleaves cxl/region: Delete 'region' attribute from root decoders cxl/acpi: Autoload driver for 'cxl_acpi' test devices cxl/region: decrement ->nr_targets on error in cxl_region_attach() cxl/region: prevent underflow in ways_to_cxl() cxl/region: uninitialized variable in alloc_hpa() ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2022-08-10 11:07:26 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2022-08-10 11:07:26 -0700
commit: c235698355fa94df7073b51befda7d4be00a0e23 (patch)
tree: 3cb9d2fa40862f2484eda9c148b4326505a8534d
parent: 5e2e7383b57fa03ec2b00c82bb7f49a4a707c1f7 (diff)
parent: 1cd8a2537eb07751d405ab7e2223f20338a90506 (diff)
39 files changed, 5498 insertions, 576 deletions
diff --git a/.clang-format b/.clang-format
index 9b87ea1fc16e..1247d54f9e49 100644
--- a/.clang-format
+++ b/.clang-format
@@ -516,6 +516,7 @@ ForEachMacros:
   - 'of_property_for_each_string'
   - 'of_property_for_each_u32'
   - 'pci_bus_for_each_resource'
+  - 'pci_doe_for_each_off'
   - 'pcl_for_each_chunk'
   - 'pcl_for_each_segment'
   - 'pcm_for_each_format'
diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl
index 7c2b846521f3..8494ef27e8d2 100644
--- a/Documentation/ABI/testing/sysfs-bus-cxl
+++ b/Documentation/ABI/testing/sysfs-bus-cxl
@@ -7,6 +7,7 @@ Description:
 		all descendant memdevs for unbind. Writing '1' to this attribute
 		flushes that work.
 
+
 What:		/sys/bus/cxl/devices/memX/firmware_version
 Date:		December, 2020
 KernelVersion:	v5.12
@@ -16,6 +17,7 @@ Description:
 		Memory Device Output Payload in the CXL-2.0
 		specification.
 
+
 What:		/sys/bus/cxl/devices/memX/ram/size
 Date:		December, 2020
 KernelVersion:	v5.12
@@ -25,6 +27,7 @@ Description:
 		identically named field in the Identify Memory Device Output
 		Payload in the CXL-2.0 specification.
 
+
 What:		/sys/bus/cxl/devices/memX/pmem/size
 Date:		December, 2020
 KernelVersion:	v5.12
@@ -34,6 +37,7 @@ Description:
 		identically named field in the Identify Memory Device Output
 		Payload in the CXL-2.0 specification.
 
+
 What:		/sys/bus/cxl/devices/memX/serial
 Date:		January, 2022
 KernelVersion:	v5.18
@@ -43,6 +47,7 @@ Description:
 		capability. Mandatory for CXL devices, see CXL 2.0 8.1.12.2
 		Memory Device PCIe Capabilities and Extended Capabilities.
 
+
 What:		/sys/bus/cxl/devices/memX/numa_node
 Date:		January, 2022
 KernelVersion:	v5.18
@@ -52,114 +57,334 @@ Description:
 		host PCI device for this memory device, emit the CPU node
 		affinity for this device.
 
+
 What:		/sys/bus/cxl/devices/*/devtype
 Date:		June, 2021
 KernelVersion:	v5.14
 Contact:	linux-cxl@vger.kernel.org
 Description:
-		CXL device objects export the devtype attribute which mirrors
-		the same value communicated in the DEVTYPE environment variable
-		for uevents for devices on the "cxl" bus.
+		(RO) CXL device objects export the devtype attribute which
+		mirrors the same value communicated in the DEVTYPE environment
+		variable for uevents for devices on the "cxl" bus.
+
 
 What:		/sys/bus/cxl/devices/*/modalias
 Date:		December, 2021
 KernelVersion:	v5.18
 Contact:	linux-cxl@vger.kernel.org
 Description:
-		CXL device objects export the modalias attribute which mirrors
-		the same value communicated in the MODALIAS environment variable
-		for uevents for devices on the "cxl" bus.
+		(RO) CXL device objects export the modalias attribute which
+		mirrors the same value communicated in the MODALIAS environment
+		variable for uevents for devices on the "cxl" bus.
+
 
 What:		/sys/bus/cxl/devices/portX/uport
 Date:		June, 2021
 KernelVersion:	v5.14
 Contact:	linux-cxl@vger.kernel.org
 Description:
-		CXL port objects are enumerated from either a platform firmware
-		device (ACPI0017 and ACPI0016) or PCIe switch upstream port with
-		CXL component registers. The 'uport' symlink connects the CXL
-		portX object to the device that published the CXL port
+		(RO) CXL port objects are enumerated from either a platform
+		firmware device (ACPI0017 and ACPI0016) or PCIe switch upstream
+		port with CXL component registers. The 'uport' symlink connects
+		the CXL portX object to the device that published the CXL port
 		capability.
 
+
 What:		/sys/bus/cxl/devices/portX/dportY
 Date:		June, 2021
 KernelVersion:	v5.14
 Contact:	linux-cxl@vger.kernel.org
 Description:
-		CXL port objects are enumerated from either a platform firmware
-		device (ACPI0017 and ACPI0016) or PCIe switch upstream port with
-		CXL component registers. The 'dportY' symlink identifies one or
-		more downstream ports that the upstream port may target in its
-		decode of CXL memory resources.  The 'Y' integer reflects the
-		hardware port unique-id used in the hardware decoder target
-		list.
+		(RO) CXL port objects are enumerated from either a platform
+		firmware device (ACPI0017 and ACPI0016) or PCIe switch upstream
+		port with CXL component registers. The 'dportY' symlink
+		identifies one or more downstream ports that the upstream port
+		may target in its decode of CXL memory resources.  The 'Y'
+		integer reflects the hardware port unique-id used in the
+		hardware decoder target list.
+
 
 What:		/sys/bus/cxl/devices/decoderX.Y
 Date:		June, 2021
 KernelVersion:	v5.14
 Contact:	linux-cxl@vger.kernel.org
 Description:
-		CXL decoder objects are enumerated from either a platform
+		(RO) CXL decoder objects are enumerated from either a platform
 		firmware description, or a CXL HDM decoder register set in a
 		PCIe device (see CXL 2.0 section 8.2.5.12 CXL HDM Decoder
 		Capability Structure). The 'X' in decoderX.Y represents the
 		cxl_port container of this decoder, and 'Y' represents the
 		instance id of a given decoder resource.
 
+
 What:		/sys/bus/cxl/devices/decoderX.Y/{start,size}
 Date:		June, 2021
 KernelVersion:	v5.14
 Contact:	linux-cxl@vger.kernel.org
 Description:
-		The 'start' and 'size' attributes together convey the physical
-		address base and number of bytes mapped in the decoder's decode
-		window. For decoders of devtype "cxl_decoder_root" the address
-		range is fixed. For decoders of devtype "cxl_decoder_switch" the
-		address is bounded by the decode range of the cxl_port ancestor
-		of the decoder's cxl_port, and dynamically updates based on the
-		active memory regions in that address space.
+		(RO) The 'start' and 'size' attributes together convey the
+		physical address base and number of bytes mapped in the
+		decoder's decode window. For decoders of devtype
+		"cxl_decoder_root" the address range is fixed. For decoders of
+		devtype "cxl_decoder_switch" the address is bounded by the
+		decode range of the cxl_port ancestor of the decoder's cxl_port,
+		and dynamically updates based on the active memory regions in
+		that address space.
+
 
 What:		/sys/bus/cxl/devices/decoderX.Y/locked
 Date:		June, 2021
 KernelVersion:	v5.14
 Contact:	linux-cxl@vger.kernel.org
 Description:
-		CXL HDM decoders have the capability to lock the configuration
-		until the next device reset. For decoders of devtype
-		"cxl_decoder_root" there is no standard facility to unlock them.
-		For decoders of devtype "cxl_decoder_switch" a secondary bus
-		reset, of the PCIe bridge that provides the bus for this
-		decoders uport, unlocks / resets the decoder.
+		(RO) CXL HDM decoders have the capability to lock the
+		configuration until the next device reset. For decoders of
+		devtype "cxl_decoder_root" there is no standard facility to
+		unlock them.  For decoders of devtype "cxl_decoder_switch" a
+		secondary bus reset, of the PCIe bridge that provides the bus
+		for this decoders uport, unlocks / resets the decoder.
+
 
 What:		/sys/bus/cxl/devices/decoderX.Y/target_list
 Date:		June, 2021
 KernelVersion:	v5.14
 Contact:	linux-cxl@vger.kernel.org
 Description:
-		Display a comma separated list of the current decoder target
-		configuration. The list is ordered by the current configured
-		interleave order of the decoder's dport instances. Each entry in
-		the list is a dport id.
+		(RO) Display a comma separated list of the current decoder
+		target configuration. The list is ordered by the current
+		configured interleave order of the decoder's dport instances.
+		Each entry in the list is a dport id.
+
 
 What:		/sys/bus/cxl/devices/decoderX.Y/cap_{pmem,ram,type2,type3}
 Date:		June, 2021
 KernelVersion:	v5.14
 Contact:	linux-cxl@vger.kernel.org
 Description:
-		When a CXL decoder is of devtype "cxl_decoder_root", it
+		(RO) When a CXL decoder is of devtype "cxl_decoder_root", it
 		represents a fixed memory window identified by platform
 		firmware. A fixed window may only support a subset of memory
 		types. The 'cap_*' attributes indicate whether persistent
 		memory, volatile memory, accelerator memory, and / or expander
 		memory may be mapped behind this decoder's memory window.
 
+
 What:		/sys/bus/cxl/devices/decoderX.Y/target_type
 Date:		June, 2021
 KernelVersion:	v5.14
 Contact:	linux-cxl@vger.kernel.org
 Description:
-		When a CXL decoder is of devtype "cxl_decoder_switch", it can
-		optionally decode either accelerator memory (type-2) or expander
-		memory (type-3). The 'target_type' attribute indicates the
-		current setting which may dynamically change based on what
+		(RO) When a CXL decoder is of devtype "cxl_decoder_switch", it
+		can optionally decode either accelerator memory (type-2) or
+		expander memory (type-3). The 'target_type' attribute indicates
+		the current setting which may dynamically change based on what
 		memory regions are activated in this decode hierarchy.
+
+
+What:		/sys/bus/cxl/devices/endpointX/CDAT
+Date:		July, 2022
+KernelVersion:	v5.20
+Contact:	linux-cxl@vger.kernel.org
+Description:
+		(RO) If this sysfs entry is not present no DOE mailbox was
+		found to support CDAT data.  If it is present and the length of
+		the data is 0 reading the CDAT data failed.  Otherwise the CDAT
+		data is reported.
+
+
+What:		/sys/bus/cxl/devices/decoderX.Y/mode
+Date:		May, 2022
+KernelVersion:	v5.20
+Contact:	linux-cxl@vger.kernel.org
+Description:
+		(RW) When a CXL decoder is of devtype "cxl_decoder_endpoint" it
+		translates from a host physical address range, to a device local
+		address range. Device-local address ranges are further split
+		into a 'ram' (volatile memory) range and 'pmem' (persistent
+		memory) range. The 'mode' attribute emits one of 'ram', 'pmem',
+		'mixed', or 'none'. The 'mixed' indication is for error cases
+		when a decoder straddles the volatile/persistent partition
+		boundary, and 'none' indicates the decoder is not actively
+		decoding, or no DPA allocation policy has been set.
+
+		'mode' can be written, when the decoder is in the 'disabled'
+		state, with either 'ram' or 'pmem' to set the boundaries for the
+		next allocation.
+
+
+What:		/sys/bus/cxl/devices/decoderX.Y/dpa_resource
+Date:		May, 2022
+KernelVersion:	v5.20
+Contact:	linux-cxl@vger.kernel.org
+Description:
+		(RO) When a CXL decoder is of devtype "cxl_decoder_endpoint",
+		and its 'dpa_size' attribute is non-zero, this attribute
+		indicates the device physical address (DPA) base address of the
+		allocation.
+
+
+What:		/sys/bus/cxl/devices/decoderX.Y/dpa_size
+Date:		May, 2022
+KernelVersion:	v5.20
+Contact:	linux-cxl@vger.kernel.org
+Description:
+		(RW) When a CXL decoder is of devtype "cxl_decoder_endpoint" it
+		translates from a host physical address range, to a device local
+		address range. The range, base address plus length in bytes, of
+		DPA allocated to this decoder is conveyed in these 2 attributes.
+		Allocations can be mutated as long as the decoder is in the
+		disabled state. A write to 'dpa_size' releases the previous DPA
+		allocation and then attempts to allocate from the free capacity
+		in the device partition referred to by 'decoderX.Y/mode'.
+		Allocate and free requests can only be performed on the highest
+		instance number disabled decoder with non-zero size. I.e.
+		allocations are enforced to occur in increasing 'decoderX.Y/id'
+		order and frees are enforced to occur in decreasing
+		'decoderX.Y/id' order.
+
+
+What:		/sys/bus/cxl/devices/decoderX.Y/interleave_ways
+Date:		May, 2022
+KernelVersion:	v5.20
+Contact:	linux-cxl@vger.kernel.org
+Description:
+		(RO) The number of targets across which this decoder's host
+		physical address (HPA) memory range is interleaved. The device
+		maps every Nth block of HPA (of size ==
+		'interleave_granularity') to consecutive DPA addresses. The
+		decoder's position in the interleave is determined by the
+		device's (endpoint or switch) switch ancestry. For root
+		decoders their interleave is specified by platform firmware and
+		they only specify a downstream target order for host bridges.
+
+
+What:		/sys/bus/cxl/devices/decoderX.Y/interleave_granularity
+Date:		May, 2022
+KernelVersion:	v5.20
+Contact:	linux-cxl@vger.kernel.org
+Description:
+		(RO) The number of consecutive bytes of host physical address
+		space this decoder claims at address N before the decode rotates
+		to the next target in the interleave at address N +
+		interleave_granularity (assuming N is aligned to
+		interleave_granularity).
+
+
+What:		/sys/bus/cxl/devices/decoderX.Y/create_pmem_region
+Date:		May, 2022
+KernelVersion:	v5.20
+Contact:	linux-cxl@vger.kernel.org
+Description:
+		(RW) Write a string in the form 'regionZ' to start the process
+		of defining a new persistent memory region (interleave-set)
+		within the decode range bounded by root decoder 'decoderX.Y'.
+		The value written must match the current value returned from
+		reading this attribute. An atomic compare exchange operation is
+		done on write to assign the requested id to a region and
+		allocate the region-id for the next creation attempt. EBUSY is
+		returned if the region name written does not match the current
+		cached value.
+
+
+What:		/sys/bus/cxl/devices/decoderX.Y/delete_region
+Date:		May, 2022
+KernelVersion:	v5.20
+Contact:	linux-cxl@vger.kernel.org
+Description:
+		(WO) Write a string in the form 'regionZ' to delete that region,
+		provided it is currently idle / not bound to a driver.
+
+
+What:		/sys/bus/cxl/devices/regionZ/uuid
+Date:		May, 2022
+KernelVersion:	v5.20
+Contact:	linux-cxl@vger.kernel.org
+Description:
+		(RW) Write a unique identifier for the region. This field must
+		be set for persistent regions and it must not conflict with the
+		UUID of another region.
+
+
+What:		/sys/bus/cxl/devices/regionZ/interleave_granularity
+Date:		May, 2022
+KernelVersion:	v5.20
+Contact:	linux-cxl@vger.kernel.org
+Description:
+		(RW) Set the number of consecutive bytes each device in the
+		interleave set will claim. The possible interleave granularity
+		values are determined by the CXL spec and the participating
+		devices.
+
+
+What:		/sys/bus/cxl/devices/regionZ/interleave_ways
+Date:		May, 2022
+KernelVersion:	v5.20
+Contact:	linux-cxl@vger.kernel.org
+Description:
+		(RW) Configures the number of devices participating in the
+		region is set by writing this value. Each device will provide
+		1/interleave_ways of storage for the region.
+
+
+What:		/sys/bus/cxl/devices/regionZ/size
+Date:		May, 2022
+KernelVersion:	v5.20
+Contact:	linux-cxl@vger.kernel.org
+Description:
+		(RW) System physical address space to be consumed by the region.
+		When written trigger the driver to allocate space out of the
+		parent root decoder's address space. When read the size of the
+		address space is reported and should match the span of the
+		region's resource attribute. Size shall be set after the
+		interleave configuration parameters. Once set it cannot be
+		changed, only freed by writing 0. The kernel makes no guarantees
+		that data is maintained over an address space freeing event, and
+		there is no guarantee that a free followed by an allocate
+		results in the same address being allocated.
+
+
+What:		/sys/bus/cxl/devices/regionZ/resource
+Date:		May, 2022
+KernelVersion:	v5.20
+Contact:	linux-cxl@vger.kernel.org
+Description:
+		(RO) A region is a contiguous partition of a CXL root decoder
+		address space. Region capacity is allocated by writing to the
+		size attribute, the resulting physical address space determined
+		by the driver is reflected here. It is therefore not useful to
+		read this before writing a value to the size attribute.
+
+
+What:		/sys/bus/cxl/devices/regionZ/target[0..N]
+Date:		May, 2022
+KernelVersion:	v5.20
+Contact:	linux-cxl@vger.kernel.org
+Description:
+		(RW) Write an endpoint decoder object name to 'targetX' where X
+		is the intended position of the endpoint device in the region
+		interleave and N is the 'interleave_ways' setting for the
+		region. ENXIO is returned if the write results in an impossible
+		to map decode scenario, like the endpoint is unreachable at that
+		position relative to the root decoder interleave. EBUSY is
+		returned if the position in the region is already occupied, or
+		if the region is not in a state to accept interleave
+		configuration changes. EINVAL is returned if the object name is
+		not an endpoint decoder. Once all positions have been
+		successfully written a final validation for decode conflicts is
+		performed before activating the region.
+
+
+What:		/sys/bus/cxl/devices/regionZ/commit
+Date:		May, 2022
+KernelVersion:	v5.20
+Contact:	linux-cxl@vger.kernel.org
+Description:
+		(RW) Write a boolean 'true' string value to this attribute to
+		trigger the region to transition from the software programmed
+		state to the actively decoding in hardware state. The commit
+		operation in addition to validating that the region is in proper
+		configured state, validates that the decoders are being
+		committed in spec mandated order (last committed decoder id +
+		1), and checks that the hardware accepts the commit request.
+		Reading this value indicates whether the region is committed or
+		not.
diff --git a/Documentation/driver-api/cxl/memory-devices.rst b/Documentation/driver-api/cxl/memory-devices.rst
index db476bb170b6..5149ecdc53c7 100644
--- a/Documentation/driver-api/cxl/memory-devices.rst
+++ b/Documentation/driver-api/cxl/memory-devices.rst
@@ -362,6 +362,14 @@ CXL Core
 .. kernel-doc:: drivers/cxl/core/mbox.c
    :doc: cxl mbox
 
+CXL Regions
+-----------
+.. kernel-doc:: drivers/cxl/core/region.c
+   :doc: cxl core region
+
+.. kernel-doc:: drivers/cxl/core/region.c
+   :identifiers:
+
 External Interfaces
 ===================
 
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 7b0d286bf9ba..01772e79fd93 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -55,6 +55,7 @@ int memory_add_physaddr_to_nid(u64 start)
 {
 	return hot_add_scn_to_nid(start);
 }
+EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
 #endif
 
 int __weak create_section_mapping(unsigned long start, unsigned long end,
diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig
index f64e3984689f..768ced3d6fe8 100644
--- a/drivers/cxl/Kconfig
+++ b/drivers/cxl/Kconfig
@@ -2,6 +2,7 @@
 menuconfig CXL_BUS
 	tristate "CXL (Compute Express Link) Devices Support"
 	depends on PCI
+	select PCI_DOE
 	help
 	  CXL is a bus that is electrically compatible with PCI Express, but
 	  layers three protocols on that signalling (CXL.io, CXL.cache, and
@@ -102,4 +103,12 @@ config CXL_SUSPEND
 	def_bool y
 	depends on SUSPEND && CXL_MEM
 
+config CXL_REGION
+	bool
+	default CXL_BUS
+	# For MAX_PHYSMEM_BITS
+	depends on SPARSEMEM
+	select MEMREGION
+	select GET_FREE_REGION
+
 endif
diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c
index 40286f5df812..fb649683dd3a 100644
--- a/drivers/cxl/acpi.c
+++ b/drivers/cxl/acpi.c
@@ -9,10 +9,6 @@
 #include "cxlpci.h"
 #include "cxl.h"
 
-/* Encode defined in CXL 2.0 8.2.5.12.7 HDM Decoder Control Register */
-#define CFMWS_INTERLEAVE_WAYS(x)	(1 << (x)->interleave_ways)
-#define CFMWS_INTERLEAVE_GRANULARITY(x)	((x)->granularity + 8)
-
 static unsigned long cfmws_to_decoder_flags(int restrictions)
 {
 	unsigned long flags = CXL_DECODER_F_ENABLE;
@@ -34,7 +30,8 @@ static unsigned long cfmws_to_decoder_flags(int restrictions)
 static int cxl_acpi_cfmws_verify(struct device *dev,
 				 struct acpi_cedt_cfmws *cfmws)
 {
-	int expected_len;
+	int rc, expected_len;
+	unsigned int ways;
 
 	if (cfmws->interleave_arithmetic != ACPI_CEDT_CFMWS_ARITHMETIC_MODULO) {
 		dev_err(dev, "CFMWS Unsupported Interleave Arithmetic\n");
@@ -51,14 +48,14 @@ static int cxl_acpi_cfmws_verify(struct device *dev,
 		return -EINVAL;
 	}
 
-	if (CFMWS_INTERLEAVE_WAYS(cfmws) > CXL_DECODER_MAX_INTERLEAVE) {
-		dev_err(dev, "CFMWS Interleave Ways (%d) too large\n",
-			CFMWS_INTERLEAVE_WAYS(cfmws));
+	rc = cxl_to_ways(cfmws->interleave_ways, &ways);
+	if (rc) {
+		dev_err(dev, "CFMWS Interleave Ways (%d) invalid\n",
+			cfmws->interleave_ways);
 		return -EINVAL;
 	}
 
-	expected_len = struct_size((cfmws), interleave_targets,
-				   CFMWS_INTERLEAVE_WAYS(cfmws));
+	expected_len = struct_size(cfmws, interleave_targets, ways);
 
 	if (cfmws->header.length < expected_len) {
 		dev_err(dev, "CFMWS length %d less than expected %d\n",
@@ -76,6 +73,8 @@ static int cxl_acpi_cfmws_verify(struct device *dev,
 struct cxl_cfmws_context {
 	struct device *dev;
 	struct cxl_port *root_port;
+	struct resource *cxl_res;
+	int id;
 };
 
 static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg,
@@ -84,10 +83,14 @@ static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg,
 	int target_map[CXL_DECODER_MAX_INTERLEAVE];
 	struct cxl_cfmws_context *ctx = arg;
 	struct cxl_port *root_port = ctx->root_port;
+	struct resource *cxl_res = ctx->cxl_res;
+	struct cxl_root_decoder *cxlrd;
 	struct device *dev = ctx->dev;
 	struct acpi_cedt_cfmws *cfmws;
 	struct cxl_decoder *cxld;
-	int rc, i;
+	unsigned int ways, i, ig;
+	struct resource *res;
+	int rc;
 
 	cfmws = (struct acpi_cedt_cfmws *) header;
 
@@ -99,19 +102,51 @@ static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg,
 		return 0;
 	}
 
-	for (i = 0; i < CFMWS_INTERLEAVE_WAYS(cfmws); i++)
+	rc = cxl_to_ways(cfmws->interleave_ways, &ways);
+	if (rc)
+		return rc;
+	rc = cxl_to_granularity(cfmws->granularity, &ig);
+	if (rc)
+		return rc;
+	for (i = 0; i < ways; i++)
 		target_map[i] = cfmws->interleave_targets[i];
 
-	cxld = cxl_root_decoder_alloc(root_port, CFMWS_INTERLEAVE_WAYS(cfmws));
-	if (IS_ERR(cxld))
+	res = kzalloc(sizeof(*res), GFP_KERNEL);
+	if (!res)
+		return -ENOMEM;
+
+	res->name = kasprintf(GFP_KERNEL, "CXL Window %d", ctx->id++);
+	if (!res->name)
+		goto err_name;
+
+	res->start = cfmws->base_hpa;
+	res->end = cfmws->base_hpa + cfmws->window_size - 1;
+	res->flags = IORESOURCE_MEM;
+
+	/* add to the local resource tracking to establish a sort order */
+	rc = insert_resource(cxl_res, res);
+	if (rc)
+		goto err_insert;
+
+	cxlrd = cxl_root_decoder_alloc(root_port, ways);
+	if (IS_ERR(cxlrd))
 		return 0;
 
+	cxld = &cxlrd->cxlsd.cxld;
 	cxld->flags = cfmws_to_decoder_flags(cfmws->restrictions);
 	cxld->target_type = CXL_DECODER_EXPANDER;
-	cxld->platform_res = (struct resource)DEFINE_RES_MEM(cfmws->base_hpa,
-							     cfmws->window_size);
-	cxld->interleave_ways = CFMWS_INTERLEAVE_WAYS(cfmws);
-	cxld->interleave_granularity = CFMWS_INTERLEAVE_GRANULARITY(cfmws);
+	cxld->hpa_range = (struct range) {
+		.start = res->start,
+		.end = res->end,
+	};
+	cxld->interleave_ways = ways;
+	/*
+	 * Minimize the x1 granularity to advertise support for any
+	 * valid region granularity
+	 */
+	if (ways == 1)
+		ig = CXL_DECODER_MIN_GRANULARITY;
+	cxld->interleave_granularity = ig;
 
 	rc = cxl_decoder_add(cxld, target_map);
 	if (rc)
@@ -119,15 +154,22 @@ static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg,
 	else
 		rc = cxl_decoder_autoremove(dev, cxld);
 	if (rc) {
-		dev_err(dev, "Failed to add decoder for %pr\n",
-			&cxld->platform_res);
+		dev_err(dev, "Failed to add decode range [%#llx - %#llx]\n",
+			cxld->hpa_range.start, cxld->hpa_range.end);
 		return 0;
 	}
-	dev_dbg(dev, "add: %s node: %d range %pr\n", dev_name(&cxld->dev),
-		phys_to_target_node(cxld->platform_res.start),
-		&cxld->platform_res);
+	dev_dbg(dev, "add: %s node: %d range [%#llx - %#llx]\n",
+		dev_name(&cxld->dev),
+		phys_to_target_node(cxld->hpa_range.start),
+		cxld->hpa_range.start, cxld->hpa_range.end);
 
 	return 0;
+
+err_insert:
+	kfree(res->name);
+err_name:
+	kfree(res);
+	return -ENOMEM;
 }
 
 __mock struct acpi_device *to_cxl_host_bridge(struct device *host,
@@ -175,8 +217,7 @@ static int add_host_bridge_uport(struct device *match, void *arg)
 	if (rc)
 		return rc;
 
-	port = devm_cxl_add_port(host, match, dport->component_reg_phys,
-				 root_port);
+	port = devm_cxl_add_port(host, match, dport->component_reg_phys, dport);
 	if (IS_ERR(port))
 		return PTR_ERR(port);
 	dev_dbg(host, "%s: add: %s\n", dev_name(match), dev_name(&port->dev));
@@ -282,9 +323,127 @@ static void cxl_acpi_lock_reset_class(void *dev)
 	device_lock_reset_class(dev);
 }
 
+static void del_cxl_resource(struct resource *res)
+{
+	kfree(res->name);
+	kfree(res);
+}
+
+static void cxl_set_public_resource(struct resource *priv, struct resource *pub)
+{
+	priv->desc = (unsigned long) pub;
+}
+
+static struct resource *cxl_get_public_resource(struct resource *priv)
+{
+	return (struct resource *) priv->desc;
+}
+
+static void remove_cxl_resources(void *data)
+{
+	struct resource *res, *next, *cxl = data;
+
+	for (res = cxl->child; res; res = next) {
+		struct resource *victim = cxl_get_public_resource(res);
+
+		next = res->sibling;
+		remove_resource(res);
+
+		if (victim) {
+			remove_resource(victim);
+			kfree(victim);
+		}
+
+		del_cxl_resource(res);
+	}
+}
+
+/**
+ * add_cxl_resources() - reflect CXL fixed memory windows in iomem_resource
+ * @cxl_res: A standalone resource tree where each CXL window is a sibling
+ *
+ * Walk each CXL window in @cxl_res and add it to iomem_resource potentially
+ * expanding its boundaries to ensure that any conflicting resources become
+ * children. If a window is expanded it may then conflict with a another window
+ * entry and require the window to be truncated or trimmed. Consider this
+ * situation:
+ *
+ * |-- "CXL Window 0" --||----- "CXL Window 1" -----|
+ * |--------------- "System RAM" -------------|
+ *
+ * ...where platform firmware has established as System RAM resource across 2
+ * windows, but has left some portion of window 1 for dynamic CXL region
+ * provisioning. In this case "Window 0" will span the entirety of the "System
+ * RAM" span, and "CXL Window 1" is truncated to the remaining tail past the end
+ * of that "System RAM" resource.
+ */
+static int add_cxl_resources(struct resource *cxl_res)
+{
+	struct resource *res, *new, *next;
+
+	for (res = cxl_res->child; res; res = next) {
+		new = kzalloc(sizeof(*new), GFP_KERNEL);
+		if (!new)
+			return -ENOMEM;
+		new->name = res->name;
+		new->start = res->start;
+		new->end = res->end;
+		new->flags = IORESOURCE_MEM;
+		new->desc = IORES_DESC_CXL;
+
+		/*
+		 * Record the public resource in the private cxl_res tree for
+		 * later removal.
+		 */
+		cxl_set_public_resource(res, new);
+
+		insert_resource_expand_to_fit(&iomem_resource, new);
+
+		next = res->sibling;
+		while (next && resource_overlaps(new, next)) {
+			if (resource_contains(new, next)) {
+				struct resource *_next = next->sibling;
+
+				remove_resource(next);
+				del_cxl_resource(next);
+				next = _next;
+			} else
+				next->start = new->end + 1;
+		}
+	}
+	return 0;
+}
+
+static int pair_cxl_resource(struct device *dev, void *data)
+{
+	struct resource *cxl_res = data;
+	struct resource *p;
+
+	if (!is_root_decoder(dev))
+		return 0;
+
+	for (p = cxl_res->child; p; p = p->sibling) {
+		struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev);
+		struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld;
+		struct resource res = {
+			.start = cxld->hpa_range.start,
+			.end = cxld->hpa_range.end,
+			.flags = IORESOURCE_MEM,
+		};
+
+		if (resource_contains(p, &res)) {
+			cxlrd->res = cxl_get_public_resource(p);
+			break;
+		}
+	}
+
+	return 0;
+}
+
 static int cxl_acpi_probe(struct platform_device *pdev)
 {
 	int rc;
+	struct resource *cxl_res;
 	struct cxl_port *root_port;
 	struct device *host = &pdev->dev;
 	struct acpi_device *adev = ACPI_COMPANION(host);
@@ -296,6 +455,14 @@ static int cxl_acpi_probe(struct platform_device *pdev)
 	if (rc)
 		return rc;
 
+	cxl_res = devm_kzalloc(host, sizeof(*cxl_res), GFP_KERNEL);
+	if (!cxl_res)
+		return -ENOMEM;
+	cxl_res->name = "CXL mem";
+	cxl_res->start = 0;
+	cxl_res->end = -1;
+	cxl_res->flags = IORESOURCE_MEM;
+
 	root_port = devm_cxl_add_port(host, host, CXL_RESOURCE_NONE, NULL);
 	if (IS_ERR(root_port))
 		return PTR_ERR(root_port);
@@ -306,11 +473,28 @@ static int cxl_acpi_probe(struct platform_device *pdev)
 	if (rc < 0)
 		return rc;
 
+	rc = devm_add_action_or_reset(host, remove_cxl_resources, cxl_res);
+	if (rc)
+		return rc;
+
 	ctx = (struct cxl_cfmws_context) {
 		.dev = host,
 		.root_port = root_port,
+		.cxl_res = cxl_res,
 	};
-	acpi_table_parse_cedt(ACPI_CEDT_TYPE_CFMWS, cxl_parse_cfmws, &ctx);
+	rc = acpi_table_parse_cedt(ACPI_CEDT_TYPE_CFMWS, cxl_parse_cfmws, &ctx);
+	if (rc < 0)
+		return -ENXIO;
+
+	rc = add_cxl_resources(cxl_res);
+	if (rc)
+		return rc;
+
+	/*
+	 * Populate the root decoders with their related iomem resource,
+	 * if present
+	 */
+	device_for_each_child(&root_port->dev, cxl_res, pair_cxl_resource);
 
 	/*
 	 * Root level scanned with host-bridge as dports, now scan host-bridges
@@ -337,12 +521,19 @@ static const struct acpi_device_id cxl_acpi_ids[] = {
 };
 MODULE_DEVICE_TABLE(acpi, cxl_acpi_ids);
 
+static const struct platform_device_id cxl_test_ids[] = {
+	{ "cxl_acpi" },
+	{ },
+};
+MODULE_DEVICE_TABLE(platform, cxl_test_ids);
+
 static struct platform_driver cxl_acpi_driver = {
 	.probe = cxl_acpi_probe,
 	.driver = {
 		.name = KBUILD_MODNAME,
 		.acpi_match_table = cxl_acpi_ids,
 	},
+	.id_table = cxl_test_ids,
 };
 
 module_platform_driver(cxl_acpi_driver);
diff --git a/drivers/cxl/core/Makefile b/drivers/cxl/core/Makefile
index 9d35085d25af..79c7257f4107 100644
--- a/drivers/cxl/core/Makefile
+++ b/drivers/cxl/core/Makefile
@@ -10,3 +10,4 @@ cxl_core-y += memdev.o
 cxl_core-y += mbox.o
 cxl_core-y += pci.o
 cxl_core-y += hdm.o
+cxl_core-$(CONFIG_CXL_REGION) += region.o
diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
index 1a50c0fc399c..1d8f87be283f 100644
--- a/drivers/cxl/core/core.h
+++ b/drivers/cxl/core/core.h
@@ -9,6 +9,36 @@ extern const struct device_type cxl_nvdimm_type;
 
 extern struct attribute_group cxl_base_attribute_group;
 
+#ifdef CONFIG_CXL_REGION
+extern struct device_attribute dev_attr_create_pmem_region;
+extern struct device_attribute dev_attr_delete_region;
+extern struct device_attribute dev_attr_region;
+extern const struct device_type cxl_pmem_region_type;
+extern const struct device_type cxl_region_type;
+void cxl_decoder_kill_region(struct cxl_endpoint_decoder *cxled);
+#define CXL_REGION_ATTR(x) (&dev_attr_##x.attr)
+#define CXL_REGION_TYPE(x) (&cxl_region_type)
+#define SET_CXL_REGION_ATTR(x) (&dev_attr_##x.attr),
+#define CXL_PMEM_REGION_TYPE(x) (&cxl_pmem_region_type)
+int cxl_region_init(void);
+void cxl_region_exit(void);
+#else
+static inline void cxl_decoder_kill_region(struct cxl_endpoint_decoder *cxled)
+{
+}
+static inline int cxl_region_init(void)
+{
+	return 0;
+}
+static inline void cxl_region_exit(void)
+{
+}
+#define CXL_REGION_ATTR(x) NULL
+#define CXL_REGION_TYPE(x) NULL
+#define SET_CXL_REGION_ATTR(x)
+#define CXL_PMEM_REGION_TYPE(x) NULL
+#endif
+
 struct cxl_send_command;
 struct cxl_mem_query_commands;
 int cxl_query_cmd(struct cxl_memdev *cxlmd,
@@ -17,9 +47,28 @@ int cxl_send_cmd(struct cxl_memdev *cxlmd, struct cxl_send_command __user *s);
 void __iomem *devm_cxl_iomap_block(struct device *dev, resource_size_t addr,
 				   resource_size_t length);
 
+struct dentry *cxl_debugfs_create_dir(const char *dir);
+int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled,
+		     enum cxl_decoder_mode mode);
+int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size);
+int cxl_dpa_free(struct cxl_endpoint_decoder *cxled);
+resource_size_t cxl_dpa_size(struct cxl_endpoint_decoder *cxled);
+resource_size_t cxl_dpa_resource_start(struct cxl_endpoint_decoder *cxled);
+extern struct rw_semaphore cxl_dpa_rwsem;
+
+bool is_switch_decoder(struct device *dev);
+struct cxl_switch_decoder *to_cxl_switch_decoder(struct device *dev);
+static inline struct cxl_ep *cxl_ep_load(struct cxl_port *port,
+					 struct cxl_memdev *cxlmd)
+{
+	if (!port)
+		return NULL;
+
+	return xa_load(&port->endpoints, (unsigned long)&cxlmd->dev);
+}
+
 int cxl_memdev_init(void);
 void cxl_memdev_exit(void);
 void cxl_mbox_init(void);
-void cxl_mbox_exit(void);
 
 #endif /* __CXL_CORE_H__ */
diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c
index bfc8ee876278..d1d2caea5c62 100644
--- a/drivers/cxl/core/hdm.c
+++ b/drivers/cxl/core/hdm.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /* Copyright(c) 2022 Intel Corporation. All rights reserved. */
 #include <linux/io-64-nonatomic-hi-lo.h>
+#include <linux/seq_file.h>
 #include <linux/device.h>
 #include <linux/delay.h>
 
@@ -16,6 +17,8 @@
  * for enumerating these registers and capabilities.
  */
 
+DECLARE_RWSEM(cxl_dpa_rwsem);
+
 static int add_hdm_decoder(struct cxl_port *port, struct cxl_decoder *cxld,
 			   int *target_map)
 {
@@ -46,20 +49,22 @@ static int add_hdm_decoder(struct cxl_port *port, struct cxl_decoder *cxld,
  */
 int devm_cxl_add_passthrough_decoder(struct cxl_port *port)
 {
-	struct cxl_decoder *cxld;
-	struct cxl_dport *dport;
+	struct cxl_switch_decoder *cxlsd;
+	struct cxl_dport *dport = NULL;
 	int single_port_map[1];
+	unsigned long index;
 
-	cxld = cxl_switch_decoder_alloc(port, 1);
-	if (IS_ERR(cxld))
-		return PTR_ERR(cxld);
+	cxlsd = cxl_switch_decoder_alloc(port, 1);
+	if (IS_ERR(cxlsd))
+		return PTR_ERR(cxlsd);
 
 	device_lock_assert(&port->dev);
 
-	dport = list_first_entry(&port->dports, typeof(*dport), list);
+	xa_for_each(&port->dports, index, dport)
+		break;
 	single_port_map[0] = dport->port_id;
 
-	return add_hdm_decoder(port, cxld, single_port_map);
+	return add_hdm_decoder(port, &cxlsd->cxld, single_port_map);
 }
 EXPORT_SYMBOL_NS_GPL(devm_cxl_add_passthrough_decoder, CXL);
 
@@ -124,47 +129,577 @@ struct cxl_hdm *devm_cxl_setup_hdm(struct cxl_port *port)
 		return ERR_PTR(-ENXIO);
 	}
 
+	dev_set_drvdata(dev, cxlhdm);
+
 	return cxlhdm;
 }
 EXPORT_SYMBOL_NS_GPL(devm_cxl_setup_hdm, CXL);
 
-static int to_interleave_granularity(u32 ctrl)
+static void __cxl_dpa_debug(struct seq_file *file, struct resource *r, int depth)
+{
+	unsigned long long start = r->start, end = r->end;
+
+	seq_printf(file, "%*s%08llx-%08llx : %s\n", depth * 2, "", start, end,
+		   r->name);
+}
+
+void cxl_dpa_debug(struct seq_file *file, struct cxl_dev_state *cxlds)
+{
+	struct resource *p1, *p2;
+
+	down_read(&cxl_dpa_rwsem);
+	for (p1 = cxlds->dpa_res.child; p1; p1 = p1->sibling) {
+		__cxl_dpa_debug(file, p1, 0);
+		for (p2 = p1->child; p2; p2 = p2->sibling)
+			__cxl_dpa_debug(file, p2, 1);
+	}
+	up_read(&cxl_dpa_rwsem);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_dpa_debug, CXL);
+
+/*
+ * Must be called in a context that synchronizes against this decoder's
+ * port ->remove() callback (like an endpoint decoder sysfs attribute)
+ */
+static void __cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
+{
+	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+	struct cxl_port *port = cxled_to_port(cxled);
+	struct cxl_dev_state *cxlds = cxlmd->cxlds;
+	struct resource *res = cxled->dpa_res;
+	resource_size_t skip_start;
+
+	lockdep_assert_held_write(&cxl_dpa_rwsem);
+
+	/* save @skip_start, before @res is released */
+	skip_start = res->start - cxled->skip;
+	__release_region(&cxlds->dpa_res, res->start, resource_size(res));
+	if (cxled->skip)
+		__release_region(&cxlds->dpa_res, skip_start, cxled->skip);
+	cxled->skip = 0;
+	cxled->dpa_res = NULL;
+	put_device(&cxled->cxld.dev);
+	port->hdm_end--;
+}
+
+static void cxl_dpa_release(void *cxled)
+{
+	down_write(&cxl_dpa_rwsem);
+	__cxl_dpa_release(cxled);
+	up_write(&cxl_dpa_rwsem);
+}
+
+/*
+ * Must be called from context that will not race port device
+ * unregistration, like decoder sysfs attribute methods
+ */
+static void devm_cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
+{
+	struct cxl_port *port = cxled_to_port(cxled);
+
+	lockdep_assert_held_write(&cxl_dpa_rwsem);
+	devm_remove_action(&port->dev, cxl_dpa_release, cxled);
+	__cxl_dpa_release(cxled);
+}
+
+static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
+			     resource_size_t base, resource_size_t len,
+			     resource_size_t skipped)
+{
+	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+	struct cxl_port *port = cxled_to_port(cxled);
+	struct cxl_dev_state *cxlds = cxlmd->cxlds;
+	struct device *dev = &port->dev;
+	struct resource *res;
+
+	lockdep_assert_held_write(&cxl_dpa_rwsem);
+
+	if (!len)
+		goto success;
+
+	if (cxled->dpa_res) {
+		dev_dbg(dev, "decoder%d.%d: existing allocation %pr assigned\n",
+			port->id, cxled->cxld.id, cxled->dpa_res);
+		return -EBUSY;
+	}
+
+	if (port->hdm_end + 1 != cxled->cxld.id) {
+		/*
+		 * Assumes alloc and commit order is always in hardware instance
+		 * order per expectations from 8.2.5.12.20 Committing Decoder
+		 * Programming that enforce decoder[m] committed before
+		 * decoder[m+1] commit start.
+		 */
+		dev_dbg(dev, "decoder%d.%d: expected decoder%d.%d\n", port->id,
+			cxled->cxld.id, port->id, port->hdm_end + 1);
+		return -EBUSY;
+	}
+
+	if (skipped) {
+		res = __request_region(&cxlds->dpa_res, base - skipped, skipped,
+				       dev_name(&cxled->cxld.dev), 0);
+		if (!res) {
+			dev_dbg(dev,
+				"decoder%d.%d: failed to reserve skipped space\n",
+				port->id, cxled->cxld.id);
+			return -EBUSY;
+		}
+	}
+	res = __request_region(&cxlds->dpa_res, base, len,
+			       dev_name(&cxled->cxld.dev), 0);
+	if (!res) {
+		dev_dbg(dev, "decoder%d.%d: failed to reserve allocation\n",
+			port->id, cxled->cxld.id);
+		if (skipped)
+			__release_region(&cxlds->dpa_res, base - skipped,
+					 skipped);
+		return -EBUSY;
+	}
+	cxled->dpa_res = res;
+	cxled->skip = skipped;
+
+	if (resource_contains(&cxlds->pmem_res, res))
+		cxled->mode = CXL_DECODER_PMEM;
+	else if (resource_contains(&cxlds->ram_res, res))
+		cxled->mode = CXL_DECODER_RAM;
+	else {
+		dev_dbg(dev, "decoder%d.%d: %pr mixed\n", port->id,
+			cxled->cxld.id, cxled->dpa_res);
+		cxled->mode = CXL_DECODER_MIXED;
+	}
+
+success:
+	port->hdm_end++;
+	get_device(&cxled->cxld.dev);
+	return 0;
+}
+
+static int devm_cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
+				resource_size_t base, resource_size_t len,
+				resource_size_t skipped)
 {
-	int val = FIELD_GET(CXL_HDM_DECODER0_CTRL_IG_MASK, ctrl);
+	struct cxl_port *port = cxled_to_port(cxled);
+	int rc;
+
+	down_write(&cxl_dpa_rwsem);
+	rc = __cxl_dpa_reserve(cxled, base, len, skipped);
+	up_write(&cxl_dpa_rwsem);
+
+	if (rc)
+		return rc;
 
-	return 256 << val;
+	return devm_add_action_or_reset(&port->dev, cxl_dpa_release, cxled);
 }
 
-static int to_interleave_ways(u32 ctrl)
+resource_size_t cxl_dpa_size(struct cxl_endpoint_decoder *cxled)
 {
-	int val = FIELD_GET(CXL_HDM_DECODER0_CTRL_IW_MASK, ctrl);
+	resource_size_t size = 0;
+
+	down_read(&cxl_dpa_rwsem);
+	if (cxled->dpa_res)
+		size = resource_size(cxled->dpa_res);
+	up_read(&cxl_dpa_rwsem);
 
-	switch (val) {
-	case 0 ... 4:
-		return 1 << val;
-	case 8 ... 10:
-		return 3 << (val - 8);
+	return size;
+}
+
+resource_size_t cxl_dpa_resource_start(struct cxl_endpoint_decoder *cxled)
+{
+	resource_size_t base = -1;
+
+	down_read(&cxl_dpa_rwsem);
+	if (cxled->dpa_res)
+		base = cxled->dpa_res->start;
+	up_read(&cxl_dpa_rwsem);
+
+	return base;
+}
+
+int cxl_dpa_free(struct cxl_endpoint_decoder *cxled)
+{
+	struct cxl_port *port = cxled_to_port(cxled);
+	struct device *dev = &cxled->cxld.dev;
+	int rc;
+
+	down_write(&cxl_dpa_rwsem);
+	if (!cxled->dpa_res) {
+		rc = 0;
+		goto out;
+	}
+	if (cxled->cxld.region) {
+		dev_dbg(dev, "decoder assigned to: %s\n",
+			dev_name(&cxled->cxld.region->dev));
+		rc = -EBUSY;
+		goto out;
+	}
+	if (cxled->cxld.flags & CXL_DECODER_F_ENABLE) {
+		dev_dbg(dev, "decoder enabled\n");
+		rc = -EBUSY;
+		goto out;
+	}
+	if (cxled->cxld.id != port->hdm_end) {
+		dev_dbg(dev, "expected decoder%d.%d\n", port->id,
+			port->hdm_end);
+		rc = -EBUSY;
+		goto out;
+	}
+	devm_cxl_dpa_release(cxled);
+	rc = 0;
+out:
+	up_write(&cxl_dpa_rwsem);
+	return rc;
+}
+
+int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled,
+		     enum cxl_decoder_mode mode)
+{
+	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+	struct cxl_dev_state *cxlds = cxlmd->cxlds;
+	struct device *dev = &cxled->cxld.dev;
+	int rc;
+
+	switch (mode) {
+	case CXL_DECODER_RAM:
+	case CXL_DECODER_PMEM:
+		break;
 	default:
+		dev_dbg(dev, "unsupported mode: %d\n", mode);
+		return -EINVAL;
+	}
+
+	down_write(&cxl_dpa_rwsem);
+	if (cxled->cxld.flags & CXL_DECODER_F_ENABLE) {
+		rc = -EBUSY;
+		goto out;
+	}
+
+	/*
+	 * Only allow modes that are supported by the current partition
+	 * configuration
+	 */
+	if (mode == CXL_DECODER_PMEM && !resource_size(&cxlds->pmem_res)) {
+		dev_dbg(dev, "no available pmem capacity\n");
+		rc = -ENXIO;
+		goto out;
+	}
+	if (mode == CXL_DECODER_RAM && !resource_size(&cxlds->ram_res)) {
+		dev_dbg(dev, "no available ram capacity\n");
+		rc = -ENXIO;
+		goto out;
+	}
+
+	cxled->mode = mode;
+	rc = 0;
+out:
+	up_write(&cxl_dpa_rwsem);
+
+	return rc;
+}
+
+int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size)
+{
+	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+	resource_size_t free_ram_start, free_pmem_start;
+	struct cxl_port *port = cxled_to_port(cxled);
+	struct cxl_dev_state *cxlds = cxlmd->cxlds;
+	struct device *dev = &cxled->cxld.dev;
+	resource_size_t start, avail, skip;
+	struct resource *p, *last;
+	int rc;
+
+	down_write(&cxl_dpa_rwsem);
+	if (cxled->cxld.region) {
+		dev_dbg(dev, "decoder attached to %s\n",
+			dev_name(&cxled->cxld.region->dev));
+		rc = -EBUSY;
+		goto out;
+	}
+
+	if (cxled->cxld.flags & CXL_DECODER_F_ENABLE) {
+		dev_dbg(dev, "decoder enabled\n");
+		rc = -EBUSY;
+		goto out;
+	}
+
+	for (p = cxlds->ram_res.child, last = NULL; p; p = p->sibling)
+		last = p;
+	if (last)
+		free_ram_start = last->end + 1;
+	else
+		free_ram_start = cxlds->ram_res.start;
+
+	for (p = cxlds->pmem_res.child, last = NULL; p; p = p->sibling)
+		last = p;
+	if (last)
+		free_pmem_start = last->end + 1;
+	else
+		free_pmem_start = cxlds->pmem_res.start;
+
+	if (cxled->mode == CXL_DECODER_RAM) {
+		start = free_ram_start;
+		avail = cxlds->ram_res.end - start + 1;
+		skip = 0;
+	} else if (cxled->mode == CXL_DECODER_PMEM) {
+		resource_size_t skip_start, skip_end;
+
+		start = free_pmem_start;
+		avail = cxlds->pmem_res.end - start + 1;
+		skip_start = free_ram_start;
+
+		/*
+		 * If some pmem is already allocated, then that allocation
+		 * already handled the skip.
+		 */
+		if (cxlds->pmem_res.child &&
+		    skip_start == cxlds->pmem_res.child->start)
+			skip_end = skip_start - 1;
+		else
+			skip_end = start - 1;
+		skip = skip_end - skip_start + 1;
+	} else {
+		dev_dbg(dev, "mode not set\n");
+		rc = -EINVAL;
+		goto out;
+	}
+
+	if (size > avail) {
+		dev_dbg(dev, "%pa exceeds available %s capacity: %pa\n", &size,
+			cxled->mode == CXL_DECODER_RAM ? "ram" : "pmem",
+			&avail);
+		rc = -ENOSPC;
+		goto out;
+	}
+
+	rc = __cxl_dpa_reserve(cxled, start, size, skip);
+out:
+	up_write(&cxl_dpa_rwsem);
+
+	if (rc)
+		return rc;
+
+	return devm_add_action_or_reset(&port->dev, cxl_dpa_release, cxled);
+}
+
+static void cxld_set_interleave(struct cxl_decoder *cxld, u32 *ctrl)
+{
+	u16 eig;
+	u8 eiw;
+
+	/*
+	 * Input validation ensures these warns never fire, but otherwise
+	 * suppress unititalized variable usage warnings.
+	 */
+	if (WARN_ONCE(ways_to_cxl(cxld->interleave_ways, &eiw),
+		      "invalid interleave_ways: %d\n", cxld->interleave_ways))
+		return;
+	if (WARN_ONCE(granularity_to_cxl(cxld->interleave_granularity, &eig),
+		      "invalid interleave_granularity: %d\n",
+		      cxld->interleave_granularity))
+		return;
+
+	u32p_replace_bits(ctrl, eig, CXL_HDM_DECODER0_CTRL_IG_MASK);
+	u32p_replace_bits(ctrl, eiw, CXL_HDM_DECODER0_CTRL_IW_MASK);
+	*ctrl |= CXL_HDM_DECODER0_CTRL_COMMIT;
+}
+
+static void cxld_set_type(struct cxl_decoder *cxld, u32 *ctrl)
+{
+	u32p_replace_bits(ctrl, !!(cxld->target_type == 3),
+			  CXL_HDM_DECODER0_CTRL_TYPE);
+}
+
+static int cxlsd_set_targets(struct cxl_switch_decoder *cxlsd, u64 *tgt)
+{
+	struct cxl_dport **t = &cxlsd->target[0];
+	int ways = cxlsd->cxld.interleave_ways;
+
+	if (dev_WARN_ONCE(&cxlsd->cxld.dev,
+			  ways > 8 || ways > cxlsd->nr_targets,
+			  "ways: %d overflows targets: %d\n", ways,
+			  cxlsd->nr_targets))
+		return -ENXIO;
+
+	*tgt = FIELD_PREP(GENMASK(7, 0), t[0]->port_id);
+	if (ways > 1)
+		*tgt |= FIELD_PREP(GENMASK(15, 8), t[1]->port_id);
+	if (ways > 2)
+		*tgt |= FIELD_PREP(GENMASK(23, 16), t[2]->port_id);
+	if (ways > 3)
+		*tgt |= FIELD_PREP(GENMASK(31, 24), t[3]->port_id);
+	if (ways > 4)
+		*tgt |= FIELD_PREP(GENMASK_ULL(39, 32), t[4]->port_id);
+	if (ways > 5)
+		*tgt |= FIELD_PREP(GENMASK_ULL(47, 40), t[5]->port_id);
+	if (ways > 6)
+		*tgt |= FIELD_PREP(GENMASK_ULL(55, 48), t[6]->port_id);
+	if (ways > 7)
+		*tgt |= FIELD_PREP(GENMASK_ULL(63, 56), t[7]->port_id);
+
+	return 0;
+}
+
+/*
+ * Per CXL 2.0 8.2.5.12.20 Committing Decoder Programming, hardware must set
+ * committed or error within 10ms, but just be generous with 20ms to account for
+ * clock skew and other marginal behavior
+ */
+#define COMMIT_TIMEOUT_MS 20
+static int cxld_await_commit(void __iomem *hdm, int id)
+{
+	u32 ctrl;
+	int i;
+
+	for (i = 0; i < COMMIT_TIMEOUT_MS; i++) {
+		ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(id));
+		if (FIELD_GET(CXL_HDM_DECODER0_CTRL_COMMIT_ERROR, ctrl)) {
+			ctrl &= ~CXL_HDM_DECODER0_CTRL_COMMIT;
+			writel(ctrl, hdm + CXL_HDM_DECODER0_CTRL_OFFSET(id));
+			return -EIO;
+		}
+		if (FIELD_GET(CXL_HDM_DECODER0_CTRL_COMMITTED, ctrl))
+			return 0;
+		fsleep(1000);
+	}
+
+	return -ETIMEDOUT;
+}
+
+static int cxl_decoder_commit(struct cxl_decoder *cxld)
+{
+	struct cxl_port *port = to_cxl_port(cxld->dev.parent);
+	struct cxl_hdm *cxlhdm = dev_get_drvdata(&port->dev);
+	void __iomem *hdm = cxlhdm->regs.hdm_decoder;
+	int id = cxld->id, rc;
+	u64 base, size;
+	u32 ctrl;
+
+	if (cxld->flags & CXL_DECODER_F_ENABLE)
+		return 0;
+
+	if (port->commit_end + 1 != id) {
+		dev_dbg(&port->dev,
+			"%s: out of order commit, expected decoder%d.%d\n",
+			dev_name(&cxld->dev), port->id, port->commit_end + 1);
+		return -EBUSY;
+	}
+
+	down_read(&cxl_dpa_rwsem);
+	/* common decoder settings */
+	ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(cxld->id));
+	cxld_set_interleave(cxld, &ctrl);
+	cxld_set_type(cxld, &ctrl);
+	base = cxld->hpa_range.start;
+	size = range_len(&cxld->hpa_range);
+
+	writel(upper_32_bits(base), hdm + CXL_HDM_DECODER0_BASE_HIGH_OFFSET(id));
+	writel(lower_32_bits(base), hdm + CXL_HDM_DECODER0_BASE_LOW_OFFSET(id));
+	writel(upper_32_bits(size), hdm + CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(id));
+	writel(lower_32_bits(size), hdm + CXL_HDM_DECODER0_SIZE_LOW_OFFSET(id));
+
+	if (is_switch_decoder(&cxld->dev)) {
+		struct cxl_switch_decoder *cxlsd =
+			to_cxl_switch_decoder(&cxld->dev);
+		void __iomem *tl_hi = hdm + CXL_HDM_DECODER0_TL_HIGH(id);
+		void __iomem *tl_lo = hdm + CXL_HDM_DECODER0_TL_LOW(id);
+		u64 targets;
+
+		rc = cxlsd_set_targets(cxlsd, &targets);
+		if (rc) {
+			dev_dbg(&port->dev, "%s: target configuration error\n",
+				dev_name(&cxld->dev));
+			goto err;
+		}
+
+		writel(upper_32_bits(targets), tl_hi);
+		writel(lower_32_bits(targets), tl_lo);
+	} else {
+		struct cxl_endpoint_decoder *cxled =
+			to_cxl_endpoint_decoder(&cxld->dev);
+		void __iomem *sk_hi = hdm + CXL_HDM_DECODER0_SKIP_HIGH(id);
+		void __iomem *sk_lo = hdm + CXL_HDM_DECODER0_SKIP_LOW(id);
+
+		writel(upper_32_bits(cxled->skip), sk_hi);
+		writel(lower_32_bits(cxled->skip), sk_lo);
+	}
+
+	writel(ctrl, hdm + CXL_HDM_DECODER0_CTRL_OFFSET(id));
+	up_read(&cxl_dpa_rwsem);
+
+	port->commit_end++;
+	rc = cxld_await_commit(hdm, cxld->id);
+err:
+	if (rc) {
+		dev_dbg(&port->dev, "%s: error %d committing decoder\n",
+			dev_name(&cxld->dev), rc);
+		cxld->reset(cxld);
+		return rc;
+	}
+	cxld->flags |= CXL_DECODER_F_ENABLE;
+
+	return 0;
+}
+
+static int cxl_decoder_reset(struct cxl_decoder *cxld)
+{
+	struct cxl_port *port = to_cxl_port(cxld->dev.parent);
+	struct cxl_hdm *cxlhdm = dev_get_drvdata(&port->dev);
+	void __iomem *hdm = cxlhdm->regs.hdm_decoder;
+	int id = cxld->id;
+	u32 ctrl;
+
+	if ((cxld->flags & CXL_DECODER_F_ENABLE) == 0)
 		return 0;
+
+	if (port->commit_end != id) {
+		dev_dbg(&port->dev,
+			"%s: out of order reset, expected decoder%d.%d\n",
+			dev_name(&cxld->dev), port->id, port->commit_end);
+		return -EBUSY;
 	}
+
+	down_read(&cxl_dpa_rwsem);
+	ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(id));
+	ctrl &= ~CXL_HDM_DECODER0_CTRL_COMMIT;
+	writel(ctrl, hdm + CXL_HDM_DECODER0_CTRL_OFFSET(id));
+
+	writel(0, hdm + CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(id));
+	writel(0, hdm + CXL_HDM_DECODER0_SIZE_LOW_OFFSET(id));
+	writel(0, hdm + CXL_HDM_DECODER0_BASE_HIGH_OFFSET(id));
+	writel(0, hdm + CXL_HDM_DECODER0_BASE_LOW_OFFSET(id));
+	up_read(&cxl_dpa_rwsem);
+
+	port->commit_end--;
+	cxld->flags &= ~CXL_DECODER_F_ENABLE;
+
+	return 0;
 }
 
 static int init_hdm_decoder(struct cxl_port *port, struct cxl_decoder *cxld,
-			    int *target_map, void __iomem *hdm, int which)
+			    int *target_map, void __iomem *hdm, int which,
+			    u64 *dpa_base)
 {
-	u64 size, base;
+	struct cxl_endpoint_decoder *cxled = NULL;
+	u64 size, base, skip, dpa_size;
+	bool committed;
+	u32 remainder;
+	int i, rc;
 	u32 ctrl;
-	int i;
 	union {
 		u64 value;
 		unsigned char target_id[8];
 	} target_list;
 
+	if (is_endpoint_decoder(&cxld->dev))
+		cxled = to_cxl_endpoint_decoder(&cxld->dev);
+
 	ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(which));
 	base = ioread64_hi_lo(hdm + CXL_HDM_DECODER0_BASE_LOW_OFFSET(which));
 	size = ioread64_hi_lo(hdm + CXL_HDM_DECODER0_SIZE_LOW_OFFSET(which));
+	committed = !!(ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED);
+	cxld->commit = cxl_decoder_commit;
+	cxld->reset = cxl_decoder_reset;
 
-	if (!(ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED))
+	if (!committed)
 		size = 0;
 	if (base == U64_MAX || size == U64_MAX) {
 		dev_warn(&port->dev, "decoder%d.%d: Invalid resource range\n",
@@ -172,39 +707,77 @@ static int init_hdm_decoder(struct cxl_port *port, struct cxl_decoder *cxld,
 		return -ENXIO;
 	}
 
-	cxld->decoder_range = (struct range) {
+	cxld->hpa_range = (struct range) {
 		.start = base,
 		.end = base + size - 1,
 	};
 
-	/* switch decoders are always enabled if committed */
-	if (ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED) {
+	/* decoders are enabled if committed */
+	if (committed) {
 		cxld->flags |= CXL_DECODER_F_ENABLE;
 		if (ctrl & CXL_HDM_DECODER0_CTRL_LOCK)
 			cxld->flags |= CXL_DECODER_F_LOCK;
+		if (FIELD_GET(CXL_HDM_DECODER0_CTRL_TYPE, ctrl))
+			cxld->target_type = CXL_DECODER_EXPANDER;
+		else
+			cxld->target_type = CXL_DECODER_ACCELERATOR;
+		if (cxld->id != port->commit_end + 1) {
+			dev_warn(&port->dev,
+				 "decoder%d.%d: Committed out of order\n",
+				 port->id, cxld->id);
+			return -ENXIO;
+		}
+		port->commit_end = cxld->id;
+	} else {
+		/* unless / until type-2 drivers arrive, assume type-3 */
+		if (FIELD_GET(CXL_HDM_DECODER0_CTRL_TYPE, ctrl) == 0) {
+			ctrl |= CXL_HDM_DECODER0_CTRL_TYPE;
+			writel(ctrl, hdm + CXL_HDM_DECODER0_CTRL_OFFSET(which));
+		}
+		cxld->target_type = CXL_DECODER_EXPANDER;
 	}
-	cxld->interleave_ways = to_interleave_ways(ctrl);
-	if (!cxld->interleave_ways) {
+	rc = cxl_to_ways(FIELD_GET(CXL_HDM_DECODER0_CTRL_IW_MASK, ctrl),
+			 &cxld->interleave_ways);
+	if (rc) {
 		dev_warn(&port->dev,
 			 "decoder%d.%d: Invalid interleave ways (ctrl: %#x)\n",
 			 port->id, cxld->id, ctrl);
-		return -ENXIO;
+		return rc;
 	}
-	cxld->interleave_granularity = to_interleave_granularity(ctrl);
+	rc = cxl_to_granularity(FIELD_GET(CXL_HDM_DECODER0_CTRL_IG_MASK, ctrl),
+				&cxld->interleave_granularity);
+	if (rc)
+		return rc;
 
-	if (FIELD_GET(CXL_HDM_DECODER0_CTRL_TYPE, ctrl))
-		cxld->target_type = CXL_DECODER_EXPANDER;
-	else
-		cxld->target_type = CXL_DECODER_ACCELERATOR;
+	if (!cxled) {
+		target_list.value =
+			ioread64_hi_lo(hdm + CXL_HDM_DECODER0_TL_LOW(which));
+		for (i = 0; i < cxld->interleave_ways; i++)
+			target_map[i] = target_list.target_id[i];
 
-	if (is_endpoint_decoder(&cxld->dev))
 		return 0;
+	}
 
-	target_list.value =
-		ioread64_hi_lo(hdm + CXL_HDM_DECODER0_TL_LOW(which));
-	for (i = 0; i < cxld->interleave_ways; i++)
-		target_map[i] = target_list.target_id[i];
+	if (!committed)
+		return 0;
 
+	dpa_size = div_u64_rem(size, cxld->interleave_ways, &remainder);
+	if (remainder) {
+		dev_err(&port->dev,
+			"decoder%d.%d: invalid committed configuration size: %#llx ways: %d\n",
+			port->id, cxld->id, size, cxld->interleave_ways);
+		return -ENXIO;
+	}
+	skip = ioread64_hi_lo(hdm + CXL_HDM_DECODER0_SKIP_LOW(which));
+	rc = devm_cxl_dpa_reserve(cxled, *dpa_base + skip, dpa_size, skip);
+	if (rc) {
+		dev_err(&port->dev,
+			"decoder%d.%d: Failed to reserve DPA range %#llx - %#llx\n (%d)",
+			port->id, cxld->id, *dpa_base,
+			*dpa_base + dpa_size + skip - 1, rc);
+		return rc;
+	}
+	*dpa_base += dpa_size + skip;
 	return 0;
 }
 
@@ -216,7 +789,8 @@ int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm)
 {
 	void __iomem *hdm = cxlhdm->regs.hdm_decoder;
 	struct cxl_port *port = cxlhdm->port;
-	int i, committed, failed;
+	int i, committed;
+	u64 dpa_base = 0;
 	u32 ctrl;
 
 	/*
@@ -236,27 +810,37 @@ int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm)
 	if (committed != cxlhdm->decoder_count)
 		msleep(20);
 
-	for (i = 0, failed = 0; i < cxlhdm->decoder_count; i++) {
+	for (i = 0; i < cxlhdm->decoder_count; i++) {
 		int target_map[CXL_DECODER_MAX_INTERLEAVE] = { 0 };
 		int rc, target_count = cxlhdm->target_count;
 		struct cxl_decoder *cxld;
 
-		if (is_cxl_endpoint(port))
-			cxld = cxl_endpoint_decoder_alloc(port);
-		else
-			cxld = cxl_switch_decoder_alloc(port, target_count);
-		if (IS_ERR(cxld)) {
-			dev_warn(&port->dev,
-				 "Failed to allocate the decoder\n");
-			return PTR_ERR(cxld);
+		if (is_cxl_endpoint(port)) {
+			struct cxl_endpoint_decoder *cxled;
+
+			cxled = cxl_endpoint_decoder_alloc(port);
+			if (IS_ERR(cxled)) {
+				dev_warn(&port->dev,
+					 "Failed to allocate the decoder\n");
+				return PTR_ERR(cxled);
+			}
+			cxld = &cxled->cxld;
+		} else {
+			struct cxl_switch_decoder *cxlsd;
+
+			cxlsd = cxl_switch_decoder_alloc(port, target_count);
+			if (IS_ERR(cxlsd)) {
+				dev_warn(&port->dev,
+					 "Failed to allocate the decoder\n");
+				return PTR_ERR(cxlsd);
+			}
+			cxld = &cxlsd->cxld;
 		}
 
-		rc = init_hdm_decoder(port, cxld, target_map,
-				      cxlhdm->regs.hdm_decoder, i);
+		rc = init_hdm_decoder(port, cxld, target_map, hdm, i, &dpa_base);
 		if (rc) {
 			put_device(&cxld->dev);
-			failed++;
-			continue;
+			return rc;
 		}
 		rc = add_hdm_decoder(port, cxld, target_map);
 		if (rc) {
@@ -266,11 +850,6 @@ int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm)
 		}
 	}
 
-	if (failed == cxlhdm->decoder_count) {
-		dev_err(&port->dev, "No valid decoders found\n");
-		return -ENXIO;
-	}
-
 	return 0;
 }
 EXPORT_SYMBOL_NS_GPL(devm_cxl_enumerate_decoders, CXL);
diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
index cbf23beebebe..16176b9278b4 100644
--- a/drivers/cxl/core/mbox.c
+++ b/drivers/cxl/core/mbox.c
@@ -718,12 +718,7 @@ EXPORT_SYMBOL_NS_GPL(cxl_enumerate_cmds, CXL);
  */
 static int cxl_mem_get_partition_info(struct cxl_dev_state *cxlds)
 {
-	struct cxl_mbox_get_partition_info {
-		__le64 active_volatile_cap;
-		__le64 active_persistent_cap;
-		__le64 next_volatile_cap;
-		__le64 next_persistent_cap;
-	} __packed pi;
+	struct cxl_mbox_get_partition_info pi;
 	int rc;
 
 	rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_GET_PARTITION_INFO, NULL, 0,
@@ -773,15 +768,6 @@ int cxl_dev_state_identify(struct cxl_dev_state *cxlds)
 	cxlds->partition_align_bytes =
 		le64_to_cpu(id.partition_align) * CXL_CAPACITY_MULTIPLIER;
 
-	dev_dbg(cxlds->dev,
-		"Identify Memory Device\n"
-		"     total_bytes = %#llx\n"
-		"     volatile_only_bytes = %#llx\n"
-		"     persistent_only_bytes = %#llx\n"
-		"     partition_align_bytes = %#llx\n",
-		cxlds->total_bytes, cxlds->volatile_only_bytes,
-		cxlds->persistent_only_bytes, cxlds->partition_align_bytes);
-
 	cxlds->lsa_size = le32_to_cpu(id.lsa_size);
 	memcpy(cxlds->firmware_version, id.fw_revision, sizeof(id.fw_revision));
 
@@ -789,42 +775,63 @@ int cxl_dev_state_identify(struct cxl_dev_state *cxlds)
 }
 EXPORT_SYMBOL_NS_GPL(cxl_dev_state_identify, CXL);
 
-int cxl_mem_create_range_info(struct cxl_dev_state *cxlds)
+static int add_dpa_res(struct device *dev, struct resource *parent,
+		       struct resource *res, resource_size_t start,
+		       resource_size_t size, const char *type)
 {
 	int rc;
 
-	if (cxlds->partition_align_bytes == 0) {
-		cxlds->ram_range.start = 0;
-		cxlds->ram_range.end = cxlds->volatile_only_bytes - 1;
-		cxlds->pmem_range.start = cxlds->volatile_only_bytes;
-		cxlds->pmem_range.end = cxlds->volatile_only_bytes +
-				       cxlds->persistent_only_bytes - 1;
+	res->name = type;
+	res->start = start;
+	res->end = start + size - 1;
+	res->flags = IORESOURCE_MEM;
+	if (resource_size(res) == 0) {
+		dev_dbg(dev, "DPA(%s): no capacity\n", res->name);
 		return 0;
 	}
-
-	rc = cxl_mem_get_partition_info(cxlds);
+	rc = request_resource(parent, res);
 	if (rc) {
-		dev_err(cxlds->dev, "Failed to query partition information\n");
+		dev_err(dev, "DPA(%s): failed to track %pr (%d)\n", res->name,
+			res, rc);
 		return rc;
 	}
 
-	dev_dbg(cxlds->dev,
-		"Get Partition Info\n"
-		"     active_volatile_bytes = %#llx\n"
-		"     active_persistent_bytes = %#llx\n"
-		"     next_volatile_bytes = %#llx\n"
-		"     next_persistent_bytes = %#llx\n",
-		cxlds->active_volatile_bytes, cxlds->active_persistent_bytes,
-		cxlds->next_volatile_bytes, cxlds->next_persistent_bytes);
+	dev_dbg(dev, "DPA(%s): %pr\n", res->name, res);
+
+	return 0;
+}
 
-	cxlds->ram_range.start = 0;
-	cxlds->ram_range.end = cxlds->active_volatile_bytes - 1;
+int cxl_mem_create_range_info(struct cxl_dev_state *cxlds)
+{
+	struct device *dev = cxlds->dev;
+	int rc;
 
-	cxlds->pmem_range.start = cxlds->active_volatile_bytes;
-	cxlds->pmem_range.end =
-		cxlds->active_volatile_bytes + cxlds->active_persistent_bytes - 1;
+	cxlds->dpa_res =
+		(struct resource)DEFINE_RES_MEM(0, cxlds->total_bytes);
 
-	return 0;
+	if (cxlds->partition_align_bytes == 0) {
+		rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->ram_res, 0,
+				 cxlds->volatile_only_bytes, "ram");
+		if (rc)
+			return rc;
+		return add_dpa_res(dev, &cxlds->dpa_res, &cxlds->pmem_res,
+				   cxlds->volatile_only_bytes,
+				   cxlds->persistent_only_bytes, "pmem");
+	}
+
+	rc = cxl_mem_get_partition_info(cxlds);
+	if (rc) {
+		dev_err(dev, "Failed to query partition information\n");
+		return rc;
+	}
+
+	rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->ram_res, 0,
+			 cxlds->active_volatile_bytes, "ram");
+	if (rc)
+		return rc;
+	return add_dpa_res(dev, &cxlds->dpa_res, &cxlds->pmem_res,
+			   cxlds->active_volatile_bytes,
+			   cxlds->active_persistent_bytes, "pmem");
 }
 EXPORT_SYMBOL_NS_GPL(cxl_mem_create_range_info, CXL);
 
@@ -845,19 +852,11 @@ struct cxl_dev_state *cxl_dev_state_create(struct device *dev)
 }
 EXPORT_SYMBOL_NS_GPL(cxl_dev_state_create, CXL);
 
-static struct dentry *cxl_debugfs;
-
 void __init cxl_mbox_init(void)
 {
 	struct dentry *mbox_debugfs;
 
-	cxl_debugfs = debugfs_create_dir("cxl", NULL);
-	mbox_debugfs = debugfs_create_dir("mbox", cxl_debugfs);
+	mbox_debugfs = cxl_debugfs_create_dir("mbox");
 	debugfs_create_bool("raw_allow_all", 0600, mbox_debugfs,
 			    &cxl_raw_allow_all);
 }
-
-void cxl_mbox_exit(void)
-{
-	debugfs_remove_recursive(cxl_debugfs);
-}
diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c
index f7cdcd33504a..20ce488a7754 100644
--- a/drivers/cxl/core/memdev.c
+++ b/drivers/cxl/core/memdev.c
@@ -68,7 +68,7 @@ static ssize_t ram_size_show(struct device *dev, struct device_attribute *attr,
 {
 	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
 	struct cxl_dev_state *cxlds = cxlmd->cxlds;
-	unsigned long long len = range_len(&cxlds->ram_range);
+	unsigned long long len = resource_size(&cxlds->ram_res);
 
 	return sysfs_emit(buf, "%#llx\n", len);
 }
@@ -81,7 +81,7 @@ static ssize_t pmem_size_show(struct device *dev, struct device_attribute *attr,
 {
 	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
 	struct cxl_dev_state *cxlds = cxlmd->cxlds;
-	unsigned long long len = range_len(&cxlds->pmem_range);
+	unsigned long long len = resource_size(&cxlds->pmem_res);
 
 	return sysfs_emit(buf, "%#llx\n", len);
 }
diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index c4c99ff7b55e..9240df53ed87 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -4,6 +4,7 @@
 #include <linux/device.h>
 #include <linux/delay.h>
 #include <linux/pci.h>
+#include <linux/pci-doe.h>
 #include <cxlpci.h>
 #include <cxlmem.h>
 #include <cxl.h>
@@ -225,7 +226,6 @@ static int dvsec_range_allowed(struct device *dev, void *arg)
 {
 	struct range *dev_range = arg;
 	struct cxl_decoder *cxld;
-	struct range root_range;
 
 	if (!is_root_decoder(dev))
 		return 0;
@@ -237,12 +237,7 @@ static int dvsec_range_allowed(struct device *dev, void *arg)
 	if (!(cxld->flags & CXL_DECODER_F_RAM))
 		return 0;
 
-	root_range = (struct range) {
-		.start = cxld->platform_res.start,
-		.end = cxld->platform_res.end,
-	};
-
-	return range_contains(&root_range, dev_range);
+	return range_contains(&cxld->hpa_range, dev_range);
 }
 
 static void disable_hdm(void *_cxlhdm)
@@ -458,3 +453,175 @@ hdm_init:
 	return 0;
 }
 EXPORT_SYMBOL_NS_GPL(cxl_hdm_decode_init, CXL);
+
+#define CXL_DOE_TABLE_ACCESS_REQ_CODE		0x000000ff
+#define   CXL_DOE_TABLE_ACCESS_REQ_CODE_READ	0
+#define CXL_DOE_TABLE_ACCESS_TABLE_TYPE		0x0000ff00
+#define   CXL_DOE_TABLE_ACCESS_TABLE_TYPE_CDATA	0
+#define CXL_DOE_TABLE_ACCESS_ENTRY_HANDLE	0xffff0000
+#define CXL_DOE_TABLE_ACCESS_LAST_ENTRY		0xffff
+#define CXL_DOE_PROTOCOL_TABLE_ACCESS 2
+
+static struct pci_doe_mb *find_cdat_doe(struct device *uport)
+{
+	struct cxl_memdev *cxlmd;
+	struct cxl_dev_state *cxlds;
+	unsigned long index;
+	void *entry;
+
+	cxlmd = to_cxl_memdev(uport);
+	cxlds = cxlmd->cxlds;
+
+	xa_for_each(&cxlds->doe_mbs, index, entry) {
+		struct pci_doe_mb *cur = entry;
+
+		if (pci_doe_supports_prot(cur, PCI_DVSEC_VENDOR_ID_CXL,
+					  CXL_DOE_PROTOCOL_TABLE_ACCESS))
+			return cur;
+	}
+
+	return NULL;
+}
+
+#define CDAT_DOE_REQ(entry_handle)					\
+	(FIELD_PREP(CXL_DOE_TABLE_ACCESS_REQ_CODE,			\
+		    CXL_DOE_TABLE_ACCESS_REQ_CODE_READ) |		\
+	 FIELD_PREP(CXL_DOE_TABLE_ACCESS_TABLE_TYPE,			\
+		    CXL_DOE_TABLE_ACCESS_TABLE_TYPE_CDATA) |		\
+	 FIELD_PREP(CXL_DOE_TABLE_ACCESS_ENTRY_HANDLE, (entry_handle)))
+
+static void cxl_doe_task_complete(struct pci_doe_task *task)
+{
+	complete(task->private);
+}
+
+struct cdat_doe_task {
+	u32 request_pl;
+	u32 response_pl[32];
+	struct completion c;
+	struct pci_doe_task task;
+};
+
+#define DECLARE_CDAT_DOE_TASK(req, cdt)                       \
+struct cdat_doe_task cdt = {                                  \
+	.c = COMPLETION_INITIALIZER_ONSTACK(cdt.c),           \
+	.request_pl = req,				      \
+	.task = {                                             \
+		.prot.vid = PCI_DVSEC_VENDOR_ID_CXL,        \
+		.prot.type = CXL_DOE_PROTOCOL_TABLE_ACCESS, \
+		.request_pl = &cdt.request_pl,                \
+		.request_pl_sz = sizeof(cdt.request_pl),      \
+		.response_pl = cdt.response_pl,               \
+		.response_pl_sz = sizeof(cdt.response_pl),    \
+		.complete = cxl_doe_task_complete,            \
+		.private = &cdt.c,                            \
+	}                                                     \
+}
+
+static int cxl_cdat_get_length(struct device *dev,
+			       struct pci_doe_mb *cdat_doe,
+			       size_t *length)
+{
+	DECLARE_CDAT_DOE_TASK(CDAT_DOE_REQ(0), t);
+	int rc;
+
+	rc = pci_doe_submit_task(cdat_doe, &t.task);
+	if (rc < 0) {
+		dev_err(dev, "DOE submit failed: %d", rc);
+		return rc;
+	}
+	wait_for_completion(&t.c);
+	if (t.task.rv < sizeof(u32))
+		return -EIO;
+
+	*length = t.response_pl[1];
+	dev_dbg(dev, "CDAT length %zu\n", *length);
+
+	return 0;
+}
+
+static int cxl_cdat_read_table(struct device *dev,
+			       struct pci_doe_mb *cdat_doe,
+			       struct cxl_cdat *cdat)
+{
+	size_t length = cdat->length;
+	u32 *data = cdat->table;
+	int entry_handle = 0;
+
+	do {
+		DECLARE_CDAT_DOE_TASK(CDAT_DOE_REQ(entry_handle), t);
+		size_t entry_dw;
+		u32 *entry;
+		int rc;
+
+		rc = pci_doe_submit_task(cdat_doe, &t.task);
+		if (rc < 0) {
+			dev_err(dev, "DOE submit failed: %d", rc);
+			return rc;
+		}
+		wait_for_completion(&t.c);
+		/* 1 DW header + 1 DW data min */
+		if (t.task.rv < (2 * sizeof(u32)))
+			return -EIO;
+
+		/* Get the CXL table access header entry handle */
+		entry_handle = FIELD_GET(CXL_DOE_TABLE_ACCESS_ENTRY_HANDLE,
+					 t.response_pl[0]);
+		entry = t.response_pl + 1;
+		entry_dw = t.task.rv / sizeof(u32);
+		/* Skip Header */
+		entry_dw -= 1;
+		entry_dw = min(length / sizeof(u32), entry_dw);
+		/* Prevent length < 1 DW from causing a buffer overflow */
+		if (entry_dw) {
+			memcpy(data, entry, entry_dw * sizeof(u32));
+			length -= entry_dw * sizeof(u32);
+			data += entry_dw;
+		}
+	} while (entry_handle != CXL_DOE_TABLE_ACCESS_LAST_ENTRY);
+
+	return 0;
+}
+
+/**
+ * read_cdat_data - Read the CDAT data on this port
+ * @port: Port to read data from
+ *
+ * This call will sleep waiting for responses from the DOE mailbox.
+ */
+void read_cdat_data(struct cxl_port *port)
+{
+	struct pci_doe_mb *cdat_doe;
+	struct device *dev = &port->dev;
+	struct device *uport = port->uport;
+	size_t cdat_length;
+	int rc;
+
+	cdat_doe = find_cdat_doe(uport);
+	if (!cdat_doe) {
+		dev_dbg(dev, "No CDAT mailbox\n");
+		return;
+	}
+
+	port->cdat_available = true;
+
+	if (cxl_cdat_get_length(dev, cdat_doe, &cdat_length)) {
+		dev_dbg(dev, "No CDAT length\n");
+		return;
+	}
+
+	port->cdat.table = devm_kzalloc(dev, cdat_length, GFP_KERNEL);
+	if (!port->cdat.table)
+		return;
+
+	port->cdat.length = cdat_length;
+	rc = cxl_cdat_read_table(dev, cdat_doe, &port->cdat);
+	if (rc) {
+		/* Don't leave table data allocated on error */
+		devm_kfree(dev, port->cdat.table);
+		port->cdat.table = NULL;
+		port->cdat.length = 0;
+		dev_err(dev, "CDAT data read error\n");
+	}
+}
+EXPORT_SYMBOL_NS_GPL(read_cdat_data, CXL);
diff --git a/drivers/cxl/core/pmem.c b/drivers/cxl/core/pmem.c
index bec7cfb54ebf..1d12a8206444 100644
--- a/drivers/cxl/core/pmem.c
+++ b/drivers/cxl/core/pmem.c
@@ -62,9 +62,9 @@ static int match_nvdimm_bridge(struct device *dev, void *data)
 	return is_cxl_nvdimm_bridge(dev);
 }
 
-struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct cxl_nvdimm *cxl_nvd)
+struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct device *start)
 {
-	struct cxl_port *port = find_cxl_root(&cxl_nvd->dev);
+	struct cxl_port *port = find_cxl_root(start);
 	struct device *dev;
 
 	if (!port)
diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c
index dbce99bdffab..bffde862de0b 100644
--- a/drivers/cxl/core/port.c
+++ b/drivers/cxl/core/port.c
@@ -1,7 +1,9 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /* Copyright(c) 2020 Intel Corporation. All rights reserved. */
 #include <linux/io-64-nonatomic-lo-hi.h>
+#include <linux/memregion.h>
 #include <linux/workqueue.h>
+#include <linux/debugfs.h>
 #include <linux/device.h>
 #include <linux/module.h>
 #include <linux/pci.h>
@@ -42,6 +44,8 @@ static int cxl_device_id(struct device *dev)
 		return CXL_DEVICE_NVDIMM_BRIDGE;
 	if (dev->type == &cxl_nvdimm_type)
 		return CXL_DEVICE_NVDIMM;
+	if (dev->type == CXL_PMEM_REGION_TYPE())
+		return CXL_DEVICE_PMEM_REGION;
 	if (is_cxl_port(dev)) {
 		if (is_cxl_root(to_cxl_port(dev)))
 			return CXL_DEVICE_ROOT;
@@ -49,6 +53,8 @@ static int cxl_device_id(struct device *dev)
 	}
 	if (is_cxl_memdev(dev))
 		return CXL_DEVICE_MEMORY_EXPANDER;
+	if (dev->type == CXL_REGION_TYPE())
+		return CXL_DEVICE_REGION;
 	return 0;
 }
 
@@ -73,14 +79,8 @@ static ssize_t start_show(struct device *dev, struct device_attribute *attr,
 			  char *buf)
 {
 	struct cxl_decoder *cxld = to_cxl_decoder(dev);
-	u64 start;
 
-	if (is_root_decoder(dev))
-		start = cxld->platform_res.start;
-	else
-		start = cxld->decoder_range.start;
-
-	return sysfs_emit(buf, "%#llx\n", start);
+	return sysfs_emit(buf, "%#llx\n", cxld->hpa_range.start);
 }
 static DEVICE_ATTR_ADMIN_RO(start);
 
@@ -88,14 +88,8 @@ static ssize_t size_show(struct device *dev, struct device_attribute *attr,
 			char *buf)
 {
 	struct cxl_decoder *cxld = to_cxl_decoder(dev);
-	u64 size;
-
-	if (is_root_decoder(dev))
-		size = resource_size(&cxld->platform_res);
-	else
-		size = range_len(&cxld->decoder_range);
 
-	return sysfs_emit(buf, "%#llx\n", size);
+	return sysfs_emit(buf, "%#llx\n", range_len(&cxld->hpa_range));
 }
 static DEVICE_ATTR_RO(size);
 
@@ -131,20 +125,21 @@ static ssize_t target_type_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(target_type);
 
-static ssize_t emit_target_list(struct cxl_decoder *cxld, char *buf)
+static ssize_t emit_target_list(struct cxl_switch_decoder *cxlsd, char *buf)
 {
+	struct cxl_decoder *cxld = &cxlsd->cxld;
 	ssize_t offset = 0;
 	int i, rc = 0;
 
 	for (i = 0; i < cxld->interleave_ways; i++) {
-		struct cxl_dport *dport = cxld->target[i];
+		struct cxl_dport *dport = cxlsd->target[i];
 		struct cxl_dport *next = NULL;
 
 		if (!dport)
 			break;
 
 		if (i + 1 < cxld->interleave_ways)
-			next = cxld->target[i + 1];
+			next = cxlsd->target[i + 1];
 		rc = sysfs_emit_at(buf, offset, "%d%s", dport->port_id,
 				   next ? "," : "");
 		if (rc < 0)
@@ -158,15 +153,15 @@ static ssize_t emit_target_list(struct cxl_decoder *cxld, char *buf)
 static ssize_t target_list_show(struct device *dev,
 				struct device_attribute *attr, char *buf)
 {
-	struct cxl_decoder *cxld = to_cxl_decoder(dev);
+	struct cxl_switch_decoder *cxlsd = to_cxl_switch_decoder(dev);
 	ssize_t offset;
 	unsigned int seq;
 	int rc;
 
 	do {
-		seq = read_seqbegin(&cxld->target_lock);
-		rc = emit_target_list(cxld, buf);
-	} while (read_seqretry(&cxld->target_lock, seq));
+		seq = read_seqbegin(&cxlsd->target_lock);
+		rc = emit_target_list(cxlsd, buf);
+	} while (read_seqretry(&cxlsd->target_lock, seq));
 
 	if (rc < 0)
 		return rc;
@@ -180,10 +175,121 @@ static ssize_t target_list_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(target_list);
 
+static ssize_t mode_show(struct device *dev, struct device_attribute *attr,
+			 char *buf)
+{
+	struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev);
+
+	switch (cxled->mode) {
+	case CXL_DECODER_RAM:
+		return sysfs_emit(buf, "ram\n");
+	case CXL_DECODER_PMEM:
+		return sysfs_emit(buf, "pmem\n");
+	case CXL_DECODER_NONE:
+		return sysfs_emit(buf, "none\n");
+	case CXL_DECODER_MIXED:
+	default:
+		return sysfs_emit(buf, "mixed\n");
+	}
+}
+
+static ssize_t mode_store(struct device *dev, struct device_attribute *attr,
+			  const char *buf, size_t len)
+{
+	struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev);
+	enum cxl_decoder_mode mode;
+	ssize_t rc;
+
+	if (sysfs_streq(buf, "pmem"))
+		mode = CXL_DECODER_PMEM;
+	else if (sysfs_streq(buf, "ram"))
+		mode = CXL_DECODER_RAM;
+	else
+		return -EINVAL;
+
+	rc = cxl_dpa_set_mode(cxled, mode);
+	if (rc)
+		return rc;
+
+	return len;
+}
+static DEVICE_ATTR_RW(mode);
+
+static ssize_t dpa_resource_show(struct device *dev, struct device_attribute *attr,
+			    char *buf)
+{
+	struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev);
+	u64 base = cxl_dpa_resource_start(cxled);
+
+	return sysfs_emit(buf, "%#llx\n", base);
+}
+static DEVICE_ATTR_RO(dpa_resource);
+
+static ssize_t dpa_size_show(struct device *dev, struct device_attribute *attr,
+			     char *buf)
+{
+	struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev);
+	resource_size_t size = cxl_dpa_size(cxled);
+
+	return sysfs_emit(buf, "%pa\n", &size);
+}
+
+static ssize_t dpa_size_store(struct device *dev, struct device_attribute *attr,
+			      const char *buf, size_t len)
+{
+	struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev);
+	unsigned long long size;
+	ssize_t rc;
+
+	rc = kstrtoull(buf, 0, &size);
+	if (rc)
+		return rc;
+
+	if (!IS_ALIGNED(size, SZ_256M))
+		return -EINVAL;
+
+	rc = cxl_dpa_free(cxled);
+	if (rc)
+		return rc;
+
+	if (size == 0)
+		return len;
+
+	rc = cxl_dpa_alloc(cxled, size);
+	if (rc)
+		return rc;
+
+	return len;
+}
+static DEVICE_ATTR_RW(dpa_size);
+
+static ssize_t interleave_granularity_show(struct device *dev,
+					   struct device_attribute *attr,
+					   char *buf)
+{
+	struct cxl_decoder *cxld = to_cxl_decoder(dev);
+
+	return sysfs_emit(buf, "%d\n", cxld->interleave_granularity);
+}
+
+static DEVICE_ATTR_RO(interleave_granularity);
+
+static ssize_t interleave_ways_show(struct device *dev,
+				    struct device_attribute *attr, char *buf)
+{
+	struct cxl_decoder *cxld = to_cxl_decoder(dev);
+
+	return sysfs_emit(buf, "%d\n", cxld->interleave_ways);
+}
+
+static DEVICE_ATTR_RO(interleave_ways);
+
 static struct attribute *cxl_decoder_base_attrs[] = {
 	&dev_attr_start.attr,
 	&dev_attr_size.attr,
 	&dev_attr_locked.attr,
+	&dev_attr_interleave_granularity.attr,
+	&dev_attr_interleave_ways.attr,
 	NULL,
 };
 
@@ -197,11 +303,35 @@ static struct attribute *cxl_decoder_root_attrs[] = {
 	&dev_attr_cap_type2.attr,
 	&dev_attr_cap_type3.attr,
 	&dev_attr_target_list.attr,
+	SET_CXL_REGION_ATTR(create_pmem_region)
+	SET_CXL_REGION_ATTR(delete_region)
 	NULL,
 };
 
+static bool can_create_pmem(struct cxl_root_decoder *cxlrd)
+{
+	unsigned long flags = CXL_DECODER_F_TYPE3 | CXL_DECODER_F_PMEM;
+
+	return (cxlrd->cxlsd.cxld.flags & flags) == flags;
+}
+
+static umode_t cxl_root_decoder_visible(struct kobject *kobj, struct attribute *a, int n)
+{
+	struct device *dev = kobj_to_dev(kobj);
+	struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev);
+
+	if (a == CXL_REGION_ATTR(create_pmem_region) && !can_create_pmem(cxlrd))
+		return 0;
+
+	if (a == CXL_REGION_ATTR(delete_region) && !can_create_pmem(cxlrd))
+		return 0;
+
+	return a->mode;
+}
+
 static struct attribute_group cxl_decoder_root_attribute_group = {
 	.attrs = cxl_decoder_root_attrs,
+	.is_visible = cxl_root_decoder_visible,
 };
 
 static const struct attribute_group *cxl_decoder_root_attribute_groups[] = {
@@ -214,6 +344,7 @@ static const struct attribute_group *cxl_decoder_root_attribute_groups[] = {
 static struct attribute *cxl_decoder_switch_attrs[] = {
 	&dev_attr_target_type.attr,
 	&dev_attr_target_list.attr,
+	SET_CXL_REGION_ATTR(region)
 	NULL,
 };
 
@@ -230,6 +361,10 @@ static const struct attribute_group *cxl_decoder_switch_attribute_groups[] = {
 
 static struct attribute *cxl_decoder_endpoint_attrs[] = {
 	&dev_attr_target_type.attr,
+	&dev_attr_mode.attr,
+	&dev_attr_dpa_size.attr,
+	&dev_attr_dpa_resource.attr,
+	SET_CXL_REGION_ATTR(region)
 	NULL,
 };
 
@@ -244,31 +379,64 @@ static const struct attribute_group *cxl_decoder_endpoint_attribute_groups[] = {
 	NULL,
 };
 
-static void cxl_decoder_release(struct device *dev)
+static void __cxl_decoder_release(struct cxl_decoder *cxld)
 {
-	struct cxl_decoder *cxld = to_cxl_decoder(dev);
-	struct cxl_port *port = to_cxl_port(dev->parent);
+	struct cxl_port *port = to_cxl_port(cxld->dev.parent);
 
 	ida_free(&port->decoder_ida, cxld->id);
-	kfree(cxld);
 	put_device(&port->dev);
 }
 
+static void cxl_endpoint_decoder_release(struct device *dev)
+{
+	struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev);
+
+	__cxl_decoder_release(&cxled->cxld);
+	kfree(cxled);
+}
+
+static void cxl_switch_decoder_release(struct device *dev)
+{
+	struct cxl_switch_decoder *cxlsd = to_cxl_switch_decoder(dev);
+
+	__cxl_decoder_release(&cxlsd->cxld);
+	kfree(cxlsd);
+}
+
+struct cxl_root_decoder *to_cxl_root_decoder(struct device *dev)
+{
+	if (dev_WARN_ONCE(dev, !is_root_decoder(dev),
+			  "not a cxl_root_decoder device\n"))
+		return NULL;
+	return container_of(dev, struct cxl_root_decoder, cxlsd.cxld.dev);
+}
+EXPORT_SYMBOL_NS_GPL(to_cxl_root_decoder, CXL);
+
+static void cxl_root_decoder_release(struct device *dev)
+{
+	struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev);
+
+	if (atomic_read(&cxlrd->region_id) >= 0)
+		memregion_free(atomic_read(&cxlrd->region_id));
+	__cxl_decoder_release(&cxlrd->cxlsd.cxld);
+	kfree(cxlrd);
+}
+
 static const struct device_type cxl_decoder_endpoint_type = {
 	.name = "cxl_decoder_endpoint",
-	.release = cxl_decoder_release,
+	.release = cxl_endpoint_decoder_release,
 	.groups = cxl_decoder_endpoint_attribute_groups,
 };
 
 static const struct device_type cxl_decoder_switch_type = {
 	.name = "cxl_decoder_switch",
-	.release = cxl_decoder_release,
+	.release = cxl_switch_decoder_release,
 	.groups = cxl_decoder_switch_attribute_groups,
 };
 
 static const struct device_type cxl_decoder_root_type = {
 	.name = "cxl_decoder_root",
-	.release = cxl_decoder_release,
+	.release = cxl_root_decoder_release,
 	.groups = cxl_decoder_root_attribute_groups,
 };
 
@@ -283,39 +451,63 @@ bool is_root_decoder(struct device *dev)
 }
 EXPORT_SYMBOL_NS_GPL(is_root_decoder, CXL);
 
-bool is_cxl_decoder(struct device *dev)
+bool is_switch_decoder(struct device *dev)
 {
-	return dev->type && dev->type->release == cxl_decoder_release;
+	return is_root_decoder(dev) || dev->type == &cxl_decoder_switch_type;
 }
-EXPORT_SYMBOL_NS_GPL(is_cxl_decoder, CXL);
 
 struct cxl_decoder *to_cxl_decoder(struct device *dev)
 {
-	if (dev_WARN_ONCE(dev, dev->type->release != cxl_decoder_release,
+	if (dev_WARN_ONCE(dev,
+			  !is_switch_decoder(dev) && !is_endpoint_decoder(dev),
 			  "not a cxl_decoder device\n"))
 		return NULL;
 	return container_of(dev, struct cxl_decoder, dev);
 }
 EXPORT_SYMBOL_NS_GPL(to_cxl_decoder, CXL);
 
+struct cxl_endpoint_decoder *to_cxl_endpoint_decoder(struct device *dev)
+{
+	if (dev_WARN_ONCE(dev, !is_endpoint_decoder(dev),
+			  "not a cxl_endpoint_decoder device\n"))
+		return NULL;
+	return container_of(dev, struct cxl_endpoint_decoder, cxld.dev);
+}
+EXPORT_SYMBOL_NS_GPL(to_cxl_endpoint_decoder, CXL);
+
+struct cxl_switch_decoder *to_cxl_switch_decoder(struct device *dev)
+{
+	if (dev_WARN_ONCE(dev, !is_switch_decoder(dev),
+			  "not a cxl_switch_decoder device\n"))
+		return NULL;
+	return container_of(dev, struct cxl_switch_decoder, cxld.dev);
+}
+
 static void cxl_ep_release(struct cxl_ep *ep)
 {
-	if (!ep)
-		return;
-	list_del(&ep->list);
 	put_device(ep->ep);
 	kfree(ep);
 }
 
+static void cxl_ep_remove(struct cxl_port *port, struct cxl_ep *ep)
+{
+	if (!ep)
+		return;
+	xa_erase(&port->endpoints, (unsigned long) ep->ep);
+	cxl_ep_release(ep);
+}
+
 static void cxl_port_release(struct device *dev)
 {
 	struct cxl_port *port = to_cxl_port(dev);
-	struct cxl_ep *ep, *_e;
+	unsigned long index;
+	struct cxl_ep *ep;
 
-	device_lock(dev);
-	list_for_each_entry_safe(ep, _e, &port->endpoints, list)
-		cxl_ep_release(ep);
-	device_unlock(dev);
+	xa_for_each(&port->endpoints, index, ep)
+		cxl_ep_remove(port, ep);
+	xa_destroy(&port->endpoints);
+	xa_destroy(&port->dports);
+	xa_destroy(&port->regions);
 	ida_free(&cxl_port_ida, port->id);
 	kfree(port);
 }
@@ -370,7 +562,7 @@ static void unregister_port(void *_port)
 		lock_dev = &parent->dev;
 
 	device_lock_assert(lock_dev);
-	port->uport = NULL;
+	port->dead = true;
 	device_unregister(&port->dev);
 }
 
@@ -395,7 +587,7 @@ static struct lock_class_key cxl_port_key;
 
 static struct cxl_port *cxl_port_alloc(struct device *uport,
 				       resource_size_t component_reg_phys,
-				       struct cxl_port *parent_port)
+				       struct cxl_dport *parent_dport)
 {
 	struct cxl_port *port;
 	struct device *dev;
@@ -409,6 +601,7 @@ static struct cxl_port *cxl_port_alloc(struct device *uport,
 	if (rc < 0)
 		goto err;
 	port->id = rc;
+	port->uport = uport;
 
 	/*
 	 * The top-level cxl_port "cxl_root" does not have a cxl_port as
@@ -417,17 +610,37 @@ static struct cxl_port *cxl_port_alloc(struct device *uport,
 	 * description.
 	 */
 	dev = &port->dev;
-	if (parent_port) {
+	if (parent_dport) {
+		struct cxl_port *parent_port = parent_dport->port;
+		struct cxl_port *iter;
+
 		dev->parent = &parent_port->dev;
 		port->depth = parent_port->depth + 1;
+		port->parent_dport = parent_dport;
+
+		/*
+		 * walk to the host bridge, or the first ancestor that knows
+		 * the host bridge
+		 */
+		iter = port;
+		while (!iter->host_bridge &&
+		       !is_cxl_root(to_cxl_port(iter->dev.parent)))
+			iter = to_cxl_port(iter->dev.parent);
+		if (iter->host_bridge)
+			port->host_bridge = iter->host_bridge;
+		else
+			port->host_bridge = iter->uport;
+		dev_dbg(uport, "host-bridge: %s\n", dev_name(port->host_bridge));
 	} else
 		dev->parent = uport;
 
-	port->uport = uport;
 	port->component_reg_phys = component_reg_phys;
 	ida_init(&port->decoder_ida);
-	INIT_LIST_HEAD(&port->dports);
-	INIT_LIST_HEAD(&port->endpoints);
+	port->hdm_end = -1;
+	port->commit_end = -1;
+	xa_init(&port->dports);
+	xa_init(&port->endpoints);
+	xa_init(&port->regions);
 
 	device_initialize(dev);
 	lockdep_set_class_and_subclass(&dev->mutex, &cxl_port_key, port->depth);
@@ -447,24 +660,24 @@ err:
  * @host: host device for devm operations
  * @uport: "physical" device implementing this upstream port
  * @component_reg_phys: (optional) for configurable cxl_port instances
- * @parent_port: next hop up in the CXL memory decode hierarchy
+ * @parent_dport: next hop up in the CXL memory decode hierarchy
  */
 struct cxl_port *devm_cxl_add_port(struct device *host, struct device *uport,
 				   resource_size_t component_reg_phys,
-				   struct cxl_port *parent_port)
+				   struct cxl_dport *parent_dport)
 {
 	struct cxl_port *port;
 	struct device *dev;
 	int rc;
 
-	port = cxl_port_alloc(uport, component_reg_phys, parent_port);
+	port = cxl_port_alloc(uport, component_reg_phys, parent_dport);
 	if (IS_ERR(port))
 		return port;
 
 	dev = &port->dev;
 	if (is_cxl_memdev(uport))
 		rc = dev_set_name(dev, "endpoint%d", port->id);
-	else if (parent_port)
+	else if (parent_dport)
 		rc = dev_set_name(dev, "port%d", port->id);
 	else
 		rc = dev_set_name(dev, "root%d", port->id);
@@ -556,17 +769,13 @@ static int match_root_child(struct device *dev, const void *match)
 		return 0;
 
 	port = to_cxl_port(dev);
-	device_lock(dev);
-	list_for_each_entry(dport, &port->dports, list) {
-		iter = match;
-		while (iter) {
-			if (iter == dport->dport)
-				goto out;
-			iter = iter->parent;
-		}
+	iter = match;
+	while (iter) {
+		dport = cxl_find_dport_by_dev(port, iter);
+		if (dport)
+			break;
+		iter = iter->parent;
 	}
-out:
-	device_unlock(dev);
 
 	return !!iter;
 }
@@ -590,9 +799,10 @@ EXPORT_SYMBOL_NS_GPL(find_cxl_root, CXL);
 static struct cxl_dport *find_dport(struct cxl_port *port, int id)
 {
 	struct cxl_dport *dport;
+	unsigned long index;
 
 	device_lock_assert(&port->dev);
-	list_for_each_entry (dport, &port->dports, list)
+	xa_for_each(&port->dports, index, dport)
 		if (dport->port_id == id)
 			return dport;
 	return NULL;
@@ -604,15 +814,15 @@ static int add_dport(struct cxl_port *port, struct cxl_dport *new)
 
 	device_lock_assert(&port->dev);
 	dup = find_dport(port, new->port_id);
-	if (dup)
+	if (dup) {
 		dev_err(&port->dev,
 			"unable to add dport%d-%s non-unique port id (%s)\n",
 			new->port_id, dev_name(new->dport),
 			dev_name(dup->dport));
-	else
-		list_add_tail(&new->list, &port->dports);
-
-	return dup ? -EEXIST : 0;
+		return -EBUSY;
+	}
+	return xa_insert(&port->dports, (unsigned long)new->dport, new,
+			 GFP_KERNEL);
 }
 
 /*
@@ -639,10 +849,8 @@ static void cxl_dport_remove(void *data)
 	struct cxl_dport *dport = data;
 	struct cxl_port *port = dport->port;
 
+	xa_erase(&port->dports, (unsigned long) dport->dport);
 	put_device(dport->dport);
-	cond_cxl_root_lock(port);
-	list_del(&dport->list);
-	cond_cxl_root_unlock(port);
 }
 
 static void cxl_dport_unlink(void *data)
@@ -694,7 +902,6 @@ struct cxl_dport *devm_cxl_add_dport(struct cxl_port *port,
 	if (!dport)
 		return ERR_PTR(-ENOMEM);
 
-	INIT_LIST_HEAD(&dport->list);
 	dport->dport = dport_dev;
 	dport->port_id = port_id;
 	dport->component_reg_phys = component_reg_phys;
@@ -723,44 +930,33 @@ struct cxl_dport *devm_cxl_add_dport(struct cxl_port *port,
 }
 EXPORT_SYMBOL_NS_GPL(devm_cxl_add_dport, CXL);
 
-static struct cxl_ep *find_ep(struct cxl_port *port, struct device *ep_dev)
-{
-	struct cxl_ep *ep;
-
-	device_lock_assert(&port->dev);
-	list_for_each_entry(ep, &port->endpoints, list)
-		if (ep->ep == ep_dev)
-			return ep;
-	return NULL;
-}
-
-static int add_ep(struct cxl_port *port, struct cxl_ep *new)
+static int add_ep(struct cxl_ep *new)
 {
-	struct cxl_ep *dup;
+	struct cxl_port *port = new->dport->port;
+	int rc;
 
 	device_lock(&port->dev);
 	if (port->dead) {
 		device_unlock(&port->dev);
 		return -ENXIO;
 	}
-	dup = find_ep(port, new->ep);
-	if (!dup)
-		list_add_tail(&new->list, &port->endpoints);
+	rc = xa_insert(&port->endpoints, (unsigned long)new->ep, new,
+		       GFP_KERNEL);
 	device_unlock(&port->dev);
 
-	return dup ? -EEXIST : 0;
+	return rc;
 }
 
 /**
  * cxl_add_ep - register an endpoint's interest in a port
- * @port: a port in the endpoint's topology ancestry
+ * @dport: the dport that routes to @ep_dev
  * @ep_dev: device representing the endpoint
  *
  * Intermediate CXL ports are scanned based on the arrival of endpoints.
  * When those endpoints depart the port can be destroyed once all
  * endpoints that care about that port have been removed.
  */
-static int cxl_add_ep(struct cxl_port *port, struct device *ep_dev)
+static int cxl_add_ep(struct cxl_dport *dport, struct device *ep_dev)
 {
 	struct cxl_ep *ep;
 	int rc;
@@ -769,10 +965,10 @@ static int cxl_add_ep(struct cxl_port *port, struct device *ep_dev)
 	if (!ep)
 		return -ENOMEM;
 
-	INIT_LIST_HEAD(&ep->list);
 	ep->ep = get_device(ep_dev);
+	ep->dport = dport;
 
-	rc = add_ep(port, ep);
+	rc = add_ep(ep);
 	if (rc)
 		cxl_ep_release(ep);
 	return rc;
@@ -781,11 +977,13 @@ static int cxl_add_ep(struct cxl_port *port, struct device *ep_dev)
 struct cxl_find_port_ctx {
 	const struct device *dport_dev;
 	const struct cxl_port *parent_port;
+	struct cxl_dport **dport;
 };
 
 static int match_port_by_dport(struct device *dev, const void *data)
 {
 	const struct cxl_find_port_ctx *ctx = data;
+	struct cxl_dport *dport;
 	struct cxl_port *port;
 
 	if (!is_cxl_port(dev))
@@ -794,7 +992,10 @@ static int match_port_by_dport(struct device *dev, const void *data)
 		return 0;
 
 	port = to_cxl_port(dev);
-	return cxl_find_dport_by_dev(port, ctx->dport_dev) != NULL;
+	dport = cxl_find_dport_by_dev(port, ctx->dport_dev);
+	if (ctx->dport)
+		*ctx->dport = dport;
+	return dport != NULL;
 }
 
 static struct cxl_port *__find_cxl_port(struct cxl_find_port_ctx *ctx)
@@ -810,24 +1011,32 @@ static struct cxl_port *__find_cxl_port(struct cxl_find_port_ctx *ctx)
 	return NULL;
 }
 
-static struct cxl_port *find_cxl_port(struct device *dport_dev)
+static struct cxl_port *find_cxl_port(struct device *dport_dev,
+				      struct cxl_dport **dport)
 {
 	struct cxl_find_port_ctx ctx = {
 		.dport_dev = dport_dev,
+		.dport = dport,
 	};
+	struct cxl_port *port;
 
-	return __find_cxl_port(&ctx);
+	port = __find_cxl_port(&ctx);
+	return port;
 }
 
 static struct cxl_port *find_cxl_port_at(struct cxl_port *parent_port,
-					 struct device *dport_dev)
+					 struct device *dport_dev,
+					 struct cxl_dport **dport)
 {
 	struct cxl_find_port_ctx ctx = {
 		.dport_dev = dport_dev,
 		.parent_port = parent_port,
+		.dport = dport,
 	};
+	struct cxl_port *port;
 
-	return __find_cxl_port(&ctx);
+	port = __find_cxl_port(&ctx);
+	return port;
 }
 
 /*
@@ -851,13 +1060,13 @@ static void delete_endpoint(void *data)
 	struct cxl_port *parent_port;
 	struct device *parent;
 
-	parent_port = cxl_mem_find_port(cxlmd);
+	parent_port = cxl_mem_find_port(cxlmd, NULL);
 	if (!parent_port)
 		goto out;
 	parent = &parent_port->dev;
 
 	device_lock(parent);
-	if (parent->driver && endpoint->uport) {
+	if (parent->driver && !endpoint->dead) {
 		devm_release_action(parent, cxl_unlink_uport, endpoint);
 		devm_release_action(parent, unregister_port, endpoint);
 	}
@@ -883,21 +1092,70 @@ EXPORT_SYMBOL_NS_GPL(cxl_endpoint_autoremove, CXL);
  * for a port to be unregistered is when all memdevs beneath that port have gone
  * through ->remove(). This "bottom-up" removal selectively removes individual
  * child ports manually. This depends on devm_cxl_add_port() to not change is
- * devm action registration order.
+ * devm action registration order, and for dports to have already been
+ * destroyed by reap_dports().
  */
-static void delete_switch_port(struct cxl_port *port, struct list_head *dports)
+static void delete_switch_port(struct cxl_port *port)
+{
+	devm_release_action(port->dev.parent, cxl_unlink_uport, port);
+	devm_release_action(port->dev.parent, unregister_port, port);
+}
+
+static void reap_dports(struct cxl_port *port)
 {
-	struct cxl_dport *dport, *_d;
+	struct cxl_dport *dport;
+	unsigned long index;
+
+	device_lock_assert(&port->dev);
 
-	list_for_each_entry_safe(dport, _d, dports, list) {
+	xa_for_each(&port->dports, index, dport) {
 		devm_release_action(&port->dev, cxl_dport_unlink, dport);
 		devm_release_action(&port->dev, cxl_dport_remove, dport);
 		devm_kfree(&port->dev, dport);
 	}
-	devm_release_action(port->dev.parent, cxl_unlink_uport, port);
-	devm_release_action(port->dev.parent, unregister_port, port);
 }
 
+int devm_cxl_add_endpoint(struct cxl_memdev *cxlmd,
+			  struct cxl_dport *parent_dport)
+{
+	struct cxl_port *parent_port = parent_dport->port;
+	struct cxl_dev_state *cxlds = cxlmd->cxlds;
+	struct cxl_port *endpoint, *iter, *down;
+	int rc;
+
+	/*
+	 * Now that the path to the root is established record all the
+	 * intervening ports in the chain.
+	 */
+	for (iter = parent_port, down = NULL; !is_cxl_root(iter);
+	     down = iter, iter = to_cxl_port(iter->dev.parent)) {
+		struct cxl_ep *ep;
+
+		ep = cxl_ep_load(iter, cxlmd);
+		ep->next = down;
+	}
+
+	endpoint = devm_cxl_add_port(&parent_port->dev, &cxlmd->dev,
+				     cxlds->component_reg_phys, parent_dport);
+	if (IS_ERR(endpoint))
+		return PTR_ERR(endpoint);
+
+	dev_dbg(&cxlmd->dev, "add: %s\n", dev_name(&endpoint->dev));
+
+	rc = cxl_endpoint_autoremove(cxlmd, endpoint);
+	if (rc)
+		return rc;
+
+	if (!endpoint->dev.driver) {
+		dev_err(&cxlmd->dev, "%s failed probe\n",
+			dev_name(&endpoint->dev));
+		return -ENXIO;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(devm_cxl_add_endpoint, CXL);
+
 static void cxl_detach_ep(void *data)
 {
 	struct cxl_memdev *cxlmd = data;
@@ -906,13 +1164,13 @@ static void cxl_detach_ep(void *data)
 	for (iter = &cxlmd->dev; iter; iter = grandparent(iter)) {
 		struct device *dport_dev = grandparent(iter);
 		struct cxl_port *port, *parent_port;
-		LIST_HEAD(reap_dports);
 		struct cxl_ep *ep;
+		bool died = false;
 
 		if (!dport_dev)
 			break;
 
-		port = find_cxl_port(dport_dev);
+		port = find_cxl_port(dport_dev, NULL);
 		if (!port)
 			continue;
 
@@ -936,26 +1194,27 @@ static void cxl_detach_ep(void *data)
 		}
 
 		device_lock(&port->dev);
-		ep = find_ep(port, &cxlmd->dev);
+		ep = cxl_ep_load(port, cxlmd);
 		dev_dbg(&cxlmd->dev, "disconnect %s from %s\n",
 			ep ? dev_name(ep->ep) : "", dev_name(&port->dev));
-		cxl_ep_release(ep);
-		if (ep && !port->dead && list_empty(&port->endpoints) &&
+		cxl_ep_remove(port, ep);
+		if (ep && !port->dead && xa_empty(&port->endpoints) &&
 		    !is_cxl_root(parent_port)) {
 			/*
 			 * This was the last ep attached to a dynamically
 			 * enumerated port. Block new cxl_add_ep() and garbage
 			 * collect the port.
 			 */
+			died = true;
 			port->dead = true;
-			list_splice_init(&port->dports, &reap_dports);
+			reap_dports(port);
 		}
 		device_unlock(&port->dev);
 
-		if (!list_empty(&reap_dports)) {
+		if (died) {
 			dev_dbg(&cxlmd->dev, "delete %s\n",
 				dev_name(&port->dev));
-			delete_switch_port(port, &reap_dports);
+			delete_switch_port(port);
 		}
 		put_device(&port->dev);
 		device_unlock(&parent_port->dev);
@@ -986,6 +1245,7 @@ static int add_port_attach_ep(struct cxl_memdev *cxlmd,
 {
 	struct device *dparent = grandparent(dport_dev);
 	struct cxl_port *port, *parent_port = NULL;
+	struct cxl_dport *dport, *parent_dport;
 	resource_size_t component_reg_phys;
 	int rc;
 
@@ -1000,7 +1260,7 @@ static int add_port_attach_ep(struct cxl_memdev *cxlmd,
 		return -ENXIO;
 	}
 
-	parent_port = find_cxl_port(dparent);
+	parent_port = find_cxl_port(dparent, &parent_dport);
 	if (!parent_port) {
 		/* iterate to create this parent_port */
 		return -EAGAIN;
@@ -1015,13 +1275,14 @@ static int add_port_attach_ep(struct cxl_memdev *cxlmd,
 		goto out;
 	}
 
-	port = find_cxl_port_at(parent_port, dport_dev);
+	port = find_cxl_port_at(parent_port, dport_dev, &dport);
 	if (!port) {
 		component_reg_phys = find_component_registers(uport_dev);
 		port = devm_cxl_add_port(&parent_port->dev, uport_dev,
-					 component_reg_phys, parent_port);
+					 component_reg_phys, parent_dport);
+		/* retry find to pick up the new dport information */
 		if (!IS_ERR(port))
-			get_device(&port->dev);
+			port = find_cxl_port_at(parent_port, dport_dev, &dport);
 	}
 out:
 	device_unlock(&parent_port->dev);
@@ -1031,8 +1292,8 @@ out:
 	else {
 		dev_dbg(&cxlmd->dev, "add to new port %s:%s\n",
 			dev_name(&port->dev), dev_name(port->uport));
-		rc = cxl_add_ep(port, &cxlmd->dev);
-		if (rc == -EEXIST) {
+		rc = cxl_add_ep(dport, &cxlmd->dev);
+		if (rc == -EBUSY) {
 			/*
 			 * "can't" happen, but this error code means
 			 * something to the caller, so translate it.
@@ -1065,6 +1326,7 @@ retry:
 	for (iter = dev; iter; iter = grandparent(iter)) {
 		struct device *dport_dev = grandparent(iter);
 		struct device *uport_dev;
+		struct cxl_dport *dport;
 		struct cxl_port *port;
 
 		if (!dport_dev)
@@ -1080,12 +1342,12 @@ retry:
 		dev_dbg(dev, "scan: iter: %s dport_dev: %s parent: %s\n",
 			dev_name(iter), dev_name(dport_dev),
 			dev_name(uport_dev));
-		port = find_cxl_port(dport_dev);
+		port = find_cxl_port(dport_dev, &dport);
 		if (port) {
 			dev_dbg(&cxlmd->dev,
 				"found already registered port %s:%s\n",
 				dev_name(&port->dev), dev_name(port->uport));
-			rc = cxl_add_ep(port, &cxlmd->dev);
+			rc = cxl_add_ep(dport, &cxlmd->dev);
 
 			/*
 			 * If the endpoint already exists in the port's list,
@@ -1094,7 +1356,7 @@ retry:
 			 * the parent_port lock as the current port may be being
 			 * reaped.
 			 */
-			if (rc && rc != -EEXIST) {
+			if (rc && rc != -EBUSY) {
 				put_device(&port->dev);
 				return rc;
 			}
@@ -1124,30 +1386,14 @@ retry:
 }
 EXPORT_SYMBOL_NS_GPL(devm_cxl_enumerate_ports, CXL);
 
-struct cxl_port *cxl_mem_find_port(struct cxl_memdev *cxlmd)
+struct cxl_port *cxl_mem_find_port(struct cxl_memdev *cxlmd,
+				   struct cxl_dport **dport)
 {
-	return find_cxl_port(grandparent(&cxlmd->dev));
+	return find_cxl_port(grandparent(&cxlmd->dev), dport);
 }
 EXPORT_SYMBOL_NS_GPL(cxl_mem_find_port, CXL);
 
-struct cxl_dport *cxl_find_dport_by_dev(struct cxl_port *port,
-					const struct device *dev)
-{
-	struct cxl_dport *dport;
-
-	device_lock(&port->dev);
-	list_for_each_entry(dport, &port->dports, list)
-		if (dport->dport == dev) {
-			device_unlock(&port->dev);
-			return dport;
-		}
-
-	device_unlock(&port->dev);
-	return NULL;
-}
-EXPORT_SYMBOL_NS_GPL(cxl_find_dport_by_dev, CXL);
-
-static int decoder_populate_targets(struct cxl_decoder *cxld,
+static int decoder_populate_targets(struct cxl_switch_decoder *cxlsd,
 				    struct cxl_port *port, int *target_map)
 {
 	int i, rc = 0;
@@ -1157,88 +1403,92 @@ static int decoder_populate_targets(struct cxl_decoder *cxld,
 
 	device_lock_assert(&port->dev);
 
-	if (list_empty(&port->dports))
+	if (xa_empty(&port->dports))
 		return -EINVAL;
 
-	write_seqlock(&cxld->target_lock);
-	for (i = 0; i < cxld->nr_targets; i++) {
+	write_seqlock(&cxlsd->target_lock);
+	for (i = 0; i < cxlsd->nr_targets; i++) {
 		struct cxl_dport *dport = find_dport(port, target_map[i]);
 
 		if (!dport) {
 			rc = -ENXIO;
 			break;
 		}
-		cxld->target[i] = dport;
+		cxlsd->target[i] = dport;
 	}
-	write_sequnlock(&cxld->target_lock);
+	write_sequnlock(&cxlsd->target_lock);
 
 	return rc;
 }
 
+static struct cxl_dport *cxl_hb_modulo(struct cxl_root_decoder *cxlrd, int pos)
+{
+	struct cxl_switch_decoder *cxlsd = &cxlrd->cxlsd;
+	struct cxl_decoder *cxld = &cxlsd->cxld;
+	int iw;
+
+	iw = cxld->interleave_ways;
+	if (dev_WARN_ONCE(&cxld->dev, iw != cxlsd->nr_targets,
+			  "misconfigured root decoder\n"))
+		return NULL;
+
+	return cxlrd->cxlsd.target[pos % iw];
+}
+
 static struct lock_class_key cxl_decoder_key;
 
 /**
- * cxl_decoder_alloc - Allocate a new CXL decoder
+ * cxl_decoder_init - Common decoder setup / initialization
  * @port: owning port of this decoder
- * @nr_targets: downstream targets accessible by this decoder. All upstream
- *		ports and root ports must have at least 1 target. Endpoint
- *		devices will have 0 targets. Callers wishing to register an
- *		endpoint device should specify 0.
- *
- * A port should contain one or more decoders. Each of those decoders enable
- * some address space for CXL.mem utilization. A decoder is expected to be
- * configured by the caller before registering.
+ * @cxld: common decoder properties to initialize
  *
- * Return: A new cxl decoder to be registered by cxl_decoder_add(). The decoder
- *	   is initialized to be a "passthrough" decoder.
+ * A port may contain one or more decoders. Each of those decoders
+ * enable some address space for CXL.mem utilization. A decoder is
+ * expected to be configured by the caller before registering via
+ * cxl_decoder_add()
  */
-static struct cxl_decoder *cxl_decoder_alloc(struct cxl_port *port,
-					     unsigned int nr_targets)
+static int cxl_decoder_init(struct cxl_port *port, struct cxl_decoder *cxld)
 {
-	struct cxl_decoder *cxld;
 	struct device *dev;
-	int rc = 0;
-
-	if (nr_targets > CXL_DECODER_MAX_INTERLEAVE)
-		return ERR_PTR(-EINVAL);
-
-	cxld = kzalloc(struct_size(cxld, target, nr_targets), GFP_KERNEL);
-	if (!cxld)
-		return ERR_PTR(-ENOMEM);
+	int rc;
 
 	rc = ida_alloc(&port->decoder_ida, GFP_KERNEL);
 	if (rc < 0)
-		goto err;
+		return rc;
 
 	/* need parent to stick around to release the id */
 	get_device(&port->dev);
 	cxld->id = rc;
 
-	cxld->nr_targets = nr_targets;
-	seqlock_init(&cxld->target_lock);
 	dev = &cxld->dev;
 	device_initialize(dev);
 	lockdep_set_class(&dev->mutex, &cxl_decoder_key);
 	device_set_pm_not_required(dev);
 	dev->parent = &port->dev;
 	dev->bus = &cxl_bus_type;
-	if (is_cxl_root(port))
-		cxld->dev.type = &cxl_decoder_root_type;
-	else if (is_cxl_endpoint(port))
-		cxld->dev.type = &cxl_decoder_endpoint_type;
-	else
-		cxld->dev.type = &cxl_decoder_switch_type;
 
 	/* Pre initialize an "empty" decoder */
 	cxld->interleave_ways = 1;
 	cxld->interleave_granularity = PAGE_SIZE;
 	cxld->target_type = CXL_DECODER_EXPANDER;
-	cxld->platform_res = (struct resource)DEFINE_RES_MEM(0, 0);
+	cxld->hpa_range = (struct range) {
+		.start = 0,
+		.end = -1,
+	};
 
-	return cxld;
-err:
-	kfree(cxld);
-	return ERR_PTR(rc);
+	return 0;
+}
+
+static int cxl_switch_decoder_init(struct cxl_port *port,
+				   struct cxl_switch_decoder *cxlsd,
+				   int nr_targets)
+{
+	if (nr_targets > CXL_DECODER_MAX_INTERLEAVE)
+		return -EINVAL;
+
+	cxlsd->nr_targets = nr_targets;
+	seqlock_init(&cxlsd->target_lock);
+	return cxl_decoder_init(port, &cxlsd->cxld);
 }
 
 /**
@@ -1251,13 +1501,46 @@ err:
  * firmware description of CXL resources into a CXL standard decode
  * topology.
  */
-struct cxl_decoder *cxl_root_decoder_alloc(struct cxl_port *port,
-					   unsigned int nr_targets)
+struct cxl_root_decoder *cxl_root_decoder_alloc(struct cxl_port *port,
+						unsigned int nr_targets)
 {
+	struct cxl_root_decoder *cxlrd;
+	struct cxl_switch_decoder *cxlsd;
+	struct cxl_decoder *cxld;
+	int rc;
+
 	if (!is_cxl_root(port))
 		return ERR_PTR(-EINVAL);
 
-	return cxl_decoder_alloc(port, nr_targets);
+	cxlrd = kzalloc(struct_size(cxlrd, cxlsd.target, nr_targets),
+			GFP_KERNEL);
+	if (!cxlrd)
+		return ERR_PTR(-ENOMEM);
+
+	cxlsd = &cxlrd->cxlsd;
+	rc = cxl_switch_decoder_init(port, cxlsd, nr_targets);
+	if (rc) {
+		kfree(cxlrd);
+		return ERR_PTR(rc);
+	}
+
+	cxlrd->calc_hb = cxl_hb_modulo;
+
+	cxld = &cxlsd->cxld;
+	cxld->dev.type = &cxl_decoder_root_type;
+	/*
+	 * cxl_root_decoder_release() special cases negative ids to
+	 * detect memregion_alloc() failures.
+	 */
+	atomic_set(&cxlrd->region_id, -1);
+	rc = memregion_alloc(GFP_KERNEL);
+	if (rc < 0) {
+		put_device(&cxld->dev);
+		return ERR_PTR(rc);
+	}
+
+	atomic_set(&cxlrd->region_id, rc);
+	return cxlrd;
 }
 EXPORT_SYMBOL_NS_GPL(cxl_root_decoder_alloc, CXL);
 
@@ -1272,13 +1555,29 @@ EXPORT_SYMBOL_NS_GPL(cxl_root_decoder_alloc, CXL);
  * that sit between Switch Upstream Ports / Switch Downstream Ports and
  * Host Bridges / Root Ports.
  */
-struct cxl_decoder *cxl_switch_decoder_alloc(struct cxl_port *port,
-					     unsigned int nr_targets)
+struct cxl_switch_decoder *cxl_switch_decoder_alloc(struct cxl_port *port,
+						    unsigned int nr_targets)
 {
+	struct cxl_switch_decoder *cxlsd;
+	struct cxl_decoder *cxld;
+	int rc;
+
 	if (is_cxl_root(port) || is_cxl_endpoint(port))
 		return ERR_PTR(-EINVAL);
 
-	return cxl_decoder_alloc(port, nr_targets);
+	cxlsd = kzalloc(struct_size(cxlsd, target, nr_targets), GFP_KERNEL);
+	if (!cxlsd)
+		return ERR_PTR(-ENOMEM);
+
+	rc = cxl_switch_decoder_init(port, cxlsd, nr_targets);
+	if (rc) {
+		kfree(cxlsd);
+		return ERR_PTR(rc);
+	}
+
+	cxld = &cxlsd->cxld;
+	cxld->dev.type = &cxl_decoder_switch_type;
+	return cxlsd;
 }
 EXPORT_SYMBOL_NS_GPL(cxl_switch_decoder_alloc, CXL);
 
@@ -1288,18 +1587,35 @@ EXPORT_SYMBOL_NS_GPL(cxl_switch_decoder_alloc, CXL);
  *
  * Return: A new cxl decoder to be registered by cxl_decoder_add()
  */
-struct cxl_decoder *cxl_endpoint_decoder_alloc(struct cxl_port *port)
+struct cxl_endpoint_decoder *cxl_endpoint_decoder_alloc(struct cxl_port *port)
 {
+	struct cxl_endpoint_decoder *cxled;
+	struct cxl_decoder *cxld;
+	int rc;
+
 	if (!is_cxl_endpoint(port))
 		return ERR_PTR(-EINVAL);
 
-	return cxl_decoder_alloc(port, 0);
+	cxled = kzalloc(sizeof(*cxled), GFP_KERNEL);
+	if (!cxled)
+		return ERR_PTR(-ENOMEM);
+
+	cxled->pos = -1;
+	cxld = &cxled->cxld;
+	rc = cxl_decoder_init(port, cxld);
+	if (rc)	 {
+		kfree(cxled);
+		return ERR_PTR(rc);
+	}
+
+	cxld->dev.type = &cxl_decoder_endpoint_type;
+	return cxled;
 }
 EXPORT_SYMBOL_NS_GPL(cxl_endpoint_decoder_alloc, CXL);
 
 /**
  * cxl_decoder_add_locked - Add a decoder with targets
- * @cxld: The cxl decoder allocated by cxl_decoder_alloc()
+ * @cxld: The cxl decoder allocated by cxl_<type>_decoder_alloc()
  * @target_map: A list of downstream ports that this decoder can direct memory
  *              traffic to. These numbers should correspond with the port number
  *              in the PCIe Link Capabilities structure.
@@ -1335,7 +1651,9 @@ int cxl_decoder_add_locked(struct cxl_decoder *cxld, int *target_map)
 
 	port = to_cxl_port(cxld->dev.parent);
 	if (!is_endpoint_decoder(dev)) {
-		rc = decoder_populate_targets(cxld, port, target_map);
+		struct cxl_switch_decoder *cxlsd = to_cxl_switch_decoder(dev);
+
+		rc = decoder_populate_targets(cxlsd, port, target_map);
 		if (rc && (cxld->flags & CXL_DECODER_F_ENABLE)) {
 			dev_err(&port->dev,
 				"Failed to populate active decoder targets\n");
@@ -1347,20 +1665,13 @@ int cxl_decoder_add_locked(struct cxl_decoder *cxld, int *target_map)
 	if (rc)
 		return rc;
 
-	/*
-	 * Platform decoder resources should show up with a reasonable name. All
-	 * other resources are just sub ranges within the main decoder resource.
-	 */
-	if (is_root_decoder(dev))
-		cxld->platform_res.name = dev_name(dev);
-
 	return device_add(dev);
 }
 EXPORT_SYMBOL_NS_GPL(cxl_decoder_add_locked, CXL);
 
 /**
  * cxl_decoder_add - Add a decoder with targets
- * @cxld: The cxl decoder allocated by cxl_decoder_alloc()
+ * @cxld: The cxl decoder allocated by cxl_<type>_decoder_alloc()
  * @target_map: A list of downstream ports that this decoder can direct memory
  *              traffic to. These numbers should correspond with the port number
  *              in the PCIe Link Capabilities structure.
@@ -1394,6 +1705,13 @@ EXPORT_SYMBOL_NS_GPL(cxl_decoder_add, CXL);
 
 static void cxld_unregister(void *dev)
 {
+	struct cxl_endpoint_decoder *cxled;
+
+	if (is_endpoint_decoder(dev)) {
+		cxled = to_cxl_endpoint_decoder(dev);
+		cxl_decoder_kill_region(cxled);
+	}
+
 	device_unregister(dev);
 }
 
@@ -1521,10 +1839,20 @@ struct bus_type cxl_bus_type = {
 };
 EXPORT_SYMBOL_NS_GPL(cxl_bus_type, CXL);
 
+static struct dentry *cxl_debugfs;
+
+struct dentry *cxl_debugfs_create_dir(const char *dir)
+{
+	return debugfs_create_dir(dir, cxl_debugfs);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_debugfs_create_dir, CXL);
+
 static __init int cxl_core_init(void)
 {
 	int rc;
 
+	cxl_debugfs = debugfs_create_dir("cxl", NULL);
+
 	cxl_mbox_init();
 
 	rc = cxl_memdev_init();
@@ -1541,22 +1869,28 @@ static __init int cxl_core_init(void)
 	if (rc)
 		goto err_bus;
 
+	rc = cxl_region_init();
+	if (rc)
+		goto err_region;
+
 	return 0;
 
+err_region:
+	bus_unregister(&cxl_bus_type);
 err_bus:
 	destroy_workqueue(cxl_bus_wq);
 err_wq:
 	cxl_memdev_exit();
-	cxl_mbox_exit();
 	return rc;
 }
 
 static void cxl_core_exit(void)
 {
+	cxl_region_exit();
 	bus_unregister(&cxl_bus_type);
 	destroy_workqueue(cxl_bus_wq);
 	cxl_memdev_exit();
-	cxl_mbox_exit();
+	debugfs_remove_recursive(cxl_debugfs);
 }
 
 module_init(cxl_core_init);
diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
new file mode 100644
index 000000000000..401148016978
--- /dev/null
+++ b/drivers/cxl/core/region.c
@@ -0,0 +1,1896 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2022 Intel Corporation. All rights reserved. */
+#include <linux/memregion.h>
+#include <linux/genalloc.h>
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/uuid.h>
+#include <linux/idr.h>
+#include <cxlmem.h>
+#include <cxl.h>
+#include "core.h"
+
+/**
+ * DOC: cxl core region
+ *
+ * CXL Regions represent mapped memory capacity in system physical address
+ * space. Whereas the CXL Root Decoders identify the bounds of potential CXL
+ * Memory ranges, Regions represent the active mapped capacity by the HDM
+ * Decoder Capability structures throughout the Host Bridges, Switches, and
+ * Endpoints in the topology.
+ *
+ * Region configuration has ordering constraints. UUID may be set at any time
+ * but is only visible for persistent regions.
+ * 1. Interleave granularity
+ * 2. Interleave size
+ * 3. Decoder targets
+ */
+
+/*
+ * All changes to the interleave configuration occur with this lock held
+ * for write.
+ */
+static DECLARE_RWSEM(cxl_region_rwsem);
+
+static struct cxl_region *to_cxl_region(struct device *dev);
+
+static ssize_t uuid_show(struct device *dev, struct device_attribute *attr,
+			 char *buf)
+{
+	struct cxl_region *cxlr = to_cxl_region(dev);
+	struct cxl_region_params *p = &cxlr->params;
+	ssize_t rc;
+
+	rc = down_read_interruptible(&cxl_region_rwsem);
+	if (rc)
+		return rc;
+	rc = sysfs_emit(buf, "%pUb\n", &p->uuid);
+	up_read(&cxl_region_rwsem);
+
+	return rc;
+}
+
+static int is_dup(struct device *match, void *data)
+{
+	struct cxl_region_params *p;
+	struct cxl_region *cxlr;
+	uuid_t *uuid = data;
+
+	if (!is_cxl_region(match))
+		return 0;
+
+	lockdep_assert_held(&cxl_region_rwsem);
+	cxlr = to_cxl_region(match);
+	p = &cxlr->params;
+
+	if (uuid_equal(&p->uuid, uuid)) {
+		dev_dbg(match, "already has uuid: %pUb\n", uuid);
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+static ssize_t uuid_store(struct device *dev, struct device_attribute *attr,
+			  const char *buf, size_t len)
+{
+	struct cxl_region *cxlr = to_cxl_region(dev);
+	struct cxl_region_params *p = &cxlr->params;
+	uuid_t temp;
+	ssize_t rc;
+
+	if (len != UUID_STRING_LEN + 1)
+		return -EINVAL;
+
+	rc = uuid_parse(buf, &temp);
+	if (rc)
+		return rc;
+
+	if (uuid_is_null(&temp))
+		return -EINVAL;
+
+	rc = down_write_killable(&cxl_region_rwsem);
+	if (rc)
+		return rc;
+
+	if (uuid_equal(&p->uuid, &temp))
+		goto out;
+
+	rc = -EBUSY;
+	if (p->state >= CXL_CONFIG_ACTIVE)
+		goto out;
+
+	rc = bus_for_each_dev(&cxl_bus_type, NULL, &temp, is_dup);
+	if (rc < 0)
+		goto out;
+
+	uuid_copy(&p->uuid, &temp);
+out:
+	up_write(&cxl_region_rwsem);
+
+	if (rc)
+		return rc;
+	return len;
+}
+static DEVICE_ATTR_RW(uuid);
+
+static struct cxl_region_ref *cxl_rr_load(struct cxl_port *port,
+					  struct cxl_region *cxlr)
+{
+	return xa_load(&port->regions, (unsigned long)cxlr);
+}
+
+static int cxl_region_decode_reset(struct cxl_region *cxlr, int count)
+{
+	struct cxl_region_params *p = &cxlr->params;
+	int i;
+
+	for (i = count - 1; i >= 0; i--) {
+		struct cxl_endpoint_decoder *cxled = p->targets[i];
+		struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+		struct cxl_port *iter = cxled_to_port(cxled);
+		struct cxl_ep *ep;
+		int rc;
+
+		while (!is_cxl_root(to_cxl_port(iter->dev.parent)))
+			iter = to_cxl_port(iter->dev.parent);
+
+		for (ep = cxl_ep_load(iter, cxlmd); iter;
+		     iter = ep->next, ep = cxl_ep_load(iter, cxlmd)) {
+			struct cxl_region_ref *cxl_rr;
+			struct cxl_decoder *cxld;
+
+			cxl_rr = cxl_rr_load(iter, cxlr);
+			cxld = cxl_rr->decoder;
+			rc = cxld->reset(cxld);
+			if (rc)
+				return rc;
+		}
+
+		rc = cxled->cxld.reset(&cxled->cxld);
+		if (rc)
+			return rc;
+	}
+
+	return 0;
+}
+
+static int cxl_region_decode_commit(struct cxl_region *cxlr)
+{
+	struct cxl_region_params *p = &cxlr->params;
+	int i, rc = 0;
+
+	for (i = 0; i < p->nr_targets; i++) {
+		struct cxl_endpoint_decoder *cxled = p->targets[i];
+		struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+		struct cxl_region_ref *cxl_rr;
+		struct cxl_decoder *cxld;
+		struct cxl_port *iter;
+		struct cxl_ep *ep;
+
+		/* commit bottom up */
+		for (iter = cxled_to_port(cxled); !is_cxl_root(iter);
+		     iter = to_cxl_port(iter->dev.parent)) {
+			cxl_rr = cxl_rr_load(iter, cxlr);
+			cxld = cxl_rr->decoder;
+			rc = cxld->commit(cxld);
+			if (rc)
+				break;
+		}
+
+		if (rc) {
+			/* programming @iter failed, teardown */
+			for (ep = cxl_ep_load(iter, cxlmd); ep && iter;
+			     iter = ep->next, ep = cxl_ep_load(iter, cxlmd)) {
+				cxl_rr = cxl_rr_load(iter, cxlr);
+				cxld = cxl_rr->decoder;
+				cxld->reset(cxld);
+			}
+
+			cxled->cxld.reset(&cxled->cxld);
+			goto err;
+		}
+	}
+
+	return 0;
+
+err:
+	/* undo the targets that were successfully committed */
+	cxl_region_decode_reset(cxlr, i);
+	return rc;
+}
+
+static ssize_t commit_store(struct device *dev, struct device_attribute *attr,
+			    const char *buf, size_t len)
+{
+	struct cxl_region *cxlr = to_cxl_region(dev);
+	struct cxl_region_params *p = &cxlr->params;
+	bool commit;
+	ssize_t rc;
+
+	rc = kstrtobool(buf, &commit);
+	if (rc)
+		return rc;
+
+	rc = down_write_killable(&cxl_region_rwsem);
+	if (rc)
+		return rc;
+
+	/* Already in the requested state? */
+	if (commit && p->state >= CXL_CONFIG_COMMIT)
+		goto out;
+	if (!commit && p->state < CXL_CONFIG_COMMIT)
+		goto out;
+
+	/* Not ready to commit? */
+	if (commit && p->state < CXL_CONFIG_ACTIVE) {
+		rc = -ENXIO;
+		goto out;
+	}
+
+	if (commit)
+		rc = cxl_region_decode_commit(cxlr);
+	else {
+		p->state = CXL_CONFIG_RESET_PENDING;
+		up_write(&cxl_region_rwsem);
+		device_release_driver(&cxlr->dev);
+		down_write(&cxl_region_rwsem);
+
+		/*
+		 * The lock was dropped, so need to revalidate that the reset is
+		 * still pending.
+		 */
+		if (p->state == CXL_CONFIG_RESET_PENDING)
+			rc = cxl_region_decode_reset(cxlr, p->interleave_ways);
+	}
+
+	if (rc)
+		goto out;
+
+	if (commit)
+		p->state = CXL_CONFIG_COMMIT;
+	else if (p->state == CXL_CONFIG_RESET_PENDING)
+		p->state = CXL_CONFIG_ACTIVE;
+
+out:
+	up_write(&cxl_region_rwsem);
+
+	if (rc)
+		return rc;
+	return len;
+}
+
+static ssize_t commit_show(struct device *dev, struct device_attribute *attr,
+			   char *buf)
+{
+	struct cxl_region *cxlr = to_cxl_region(dev);
+	struct cxl_region_params *p = &cxlr->params;
+	ssize_t rc;
+
+	rc = down_read_interruptible(&cxl_region_rwsem);
+	if (rc)
+		return rc;
+	rc = sysfs_emit(buf, "%d\n", p->state >= CXL_CONFIG_COMMIT);
+	up_read(&cxl_region_rwsem);
+
+	return rc;
+}
+static DEVICE_ATTR_RW(commit);
+
+static umode_t cxl_region_visible(struct kobject *kobj, struct attribute *a,
+				  int n)
+{
+	struct device *dev = kobj_to_dev(kobj);
+	struct cxl_region *cxlr = to_cxl_region(dev);
+
+	if (a == &dev_attr_uuid.attr && cxlr->mode != CXL_DECODER_PMEM)
+		return 0;
+	return a->mode;
+}
+
+static ssize_t interleave_ways_show(struct device *dev,
+				    struct device_attribute *attr, char *buf)
+{
+	struct cxl_region *cxlr = to_cxl_region(dev);
+	struct cxl_region_params *p = &cxlr->params;
+	ssize_t rc;
+
+	rc = down_read_interruptible(&cxl_region_rwsem);
+	if (rc)
+		return rc;
+	rc = sysfs_emit(buf, "%d\n", p->interleave_ways);
+	up_read(&cxl_region_rwsem);
+
+	return rc;
+}
+
+static const struct attribute_group *get_cxl_region_target_group(void);
+
+static ssize_t interleave_ways_store(struct device *dev,
+				     struct device_attribute *attr,
+				     const char *buf, size_t len)
+{
+	struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev->parent);
+	struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld;
+	struct cxl_region *cxlr = to_cxl_region(dev);
+	struct cxl_region_params *p = &cxlr->params;
+	unsigned int val, save;
+	int rc;
+	u8 iw;
+
+	rc = kstrtouint(buf, 0, &val);
+	if (rc)
+		return rc;
+
+	rc = ways_to_cxl(val, &iw);
+	if (rc)
+		return rc;
+
+	/*
+	 * Even for x3, x9, and x12 interleaves the region interleave must be a
+	 * power of 2 multiple of the host bridge interleave.
+	 */
+	if (!is_power_of_2(val / cxld->interleave_ways) ||
+	    (val % cxld->interleave_ways)) {
+		dev_dbg(&cxlr->dev, "invalid interleave: %d\n", val);
+		return -EINVAL;
+	}
+
+	rc = down_write_killable(&cxl_region_rwsem);
+	if (rc)
+		return rc;
+	if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE) {
+		rc = -EBUSY;
+		goto out;
+	}
+
+	save = p->interleave_ways;
+	p->interleave_ways = val;
+	rc = sysfs_update_group(&cxlr->dev.kobj, get_cxl_region_target_group());
+	if (rc)
+		p->interleave_ways = save;
+out:
+	up_write(&cxl_region_rwsem);
+	if (rc)
+		return rc;
+	return len;
+}
+static DEVICE_ATTR_RW(interleave_ways);
+
+static ssize_t interleave_granularity_show(struct device *dev,
+					   struct device_attribute *attr,
+					   char *buf)
+{
+	struct cxl_region *cxlr = to_cxl_region(dev);
+	struct cxl_region_params *p = &cxlr->params;
+	ssize_t rc;
+
+	rc = down_read_interruptible(&cxl_region_rwsem);
+	if (rc)
+		return rc;
+	rc = sysfs_emit(buf, "%d\n", p->interleave_granularity);
+	up_read(&cxl_region_rwsem);
+
+	return rc;
+}
+
+static ssize_t interleave_granularity_store(struct device *dev,
+					    struct device_attribute *attr,
+					    const char *buf, size_t len)
+{
+	struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev->parent);
+	struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld;
+	struct cxl_region *cxlr = to_cxl_region(dev);
+	struct cxl_region_params *p = &cxlr->params;
+	int rc, val;
+	u16 ig;
+
+	rc = kstrtoint(buf, 0, &val);
+	if (rc)
+		return rc;
+
+	rc = granularity_to_cxl(val, &ig);
+	if (rc)
+		return rc;
+
+	/*
+	 * When the host-bridge is interleaved, disallow region granularity !=
+	 * root granularity. Regions with a granularity less than the root
+	 * interleave result in needing multiple endpoints to support a single
+	 * slot in the interleave (possible to suport in the future). Regions
+	 * with a granularity greater than the root interleave result in invalid
+	 * DPA translations (invalid to support).
+	 */
+	if (cxld->interleave_ways > 1 && val != cxld->interleave_granularity)
+		return -EINVAL;
+
+	rc = down_write_killable(&cxl_region_rwsem);
+	if (rc)
+		return rc;
+	if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE) {
+		rc = -EBUSY;
+		goto out;
+	}
+
+	p->interleave_granularity = val;
+out:
+	up_write(&cxl_region_rwsem);
+	if (rc)
+		return rc;
+	return len;
+}
+static DEVICE_ATTR_RW(interleave_granularity);
+
+static ssize_t resource_show(struct device *dev, struct device_attribute *attr,
+			     char *buf)
+{
+	struct cxl_region *cxlr = to_cxl_region(dev);
+	struct cxl_region_params *p = &cxlr->params;
+	u64 resource = -1ULL;
+	ssize_t rc;
+
+	rc = down_read_interruptible(&cxl_region_rwsem);
+	if (rc)
+		return rc;
+	if (p->res)
+		resource = p->res->start;
+	rc = sysfs_emit(buf, "%#llx\n", resource);
+	up_read(&cxl_region_rwsem);
+
+	return rc;
+}
+static DEVICE_ATTR_RO(resource);
+
+static int alloc_hpa(struct cxl_region *cxlr, resource_size_t size)
+{
+	struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
+	struct cxl_region_params *p = &cxlr->params;
+	struct resource *res;
+	u32 remainder = 0;
+
+	lockdep_assert_held_write(&cxl_region_rwsem);
+
+	/* Nothing to do... */
+	if (p->res && resource_size(p->res) == size)
+		return 0;
+
+	/* To change size the old size must be freed first */
+	if (p->res)
+		return -EBUSY;
+
+	if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE)
+		return -EBUSY;
+
+	/* ways, granularity and uuid (if PMEM) need to be set before HPA */
+	if (!p->interleave_ways || !p->interleave_granularity ||
+	    (cxlr->mode == CXL_DECODER_PMEM && uuid_is_null(&p->uuid)))
+		return -ENXIO;
+
+	div_u64_rem(size, SZ_256M * p->interleave_ways, &remainder);
+	if (remainder)
+		return -EINVAL;
+
+	res = alloc_free_mem_region(cxlrd->res, size, SZ_256M,
+				    dev_name(&cxlr->dev));
+	if (IS_ERR(res)) {
+		dev_dbg(&cxlr->dev, "failed to allocate HPA: %ld\n",
+			PTR_ERR(res));
+		return PTR_ERR(res);
+	}
+
+	p->res = res;
+	p->state = CXL_CONFIG_INTERLEAVE_ACTIVE;
+
+	return 0;
+}
+
+static void cxl_region_iomem_release(struct cxl_region *cxlr)
+{
+	struct cxl_region_params *p = &cxlr->params;
+
+	if (device_is_registered(&cxlr->dev))
+		lockdep_assert_held_write(&cxl_region_rwsem);
+	if (p->res) {
+		remove_resource(p->res);
+		kfree(p->res);
+		p->res = NULL;
+	}
+}
+
+static int free_hpa(struct cxl_region *cxlr)
+{
+	struct cxl_region_params *p = &cxlr->params;
+
+	lockdep_assert_held_write(&cxl_region_rwsem);
+
+	if (!p->res)
+		return 0;
+
+	if (p->state >= CXL_CONFIG_ACTIVE)
+		return -EBUSY;
+
+	cxl_region_iomem_release(cxlr);
+	p->state = CXL_CONFIG_IDLE;
+	return 0;
+}
+
+static ssize_t size_store(struct device *dev, struct device_attribute *attr,
+			  const char *buf, size_t len)
+{
+	struct cxl_region *cxlr = to_cxl_region(dev);
+	u64 val;
+	int rc;
+
+	rc = kstrtou64(buf, 0, &val);
+	if (rc)
+		return rc;
+
+	rc = down_write_killable(&cxl_region_rwsem);
+	if (rc)
+		return rc;
+
+	if (val)
+		rc = alloc_hpa(cxlr, val);
+	else
+		rc = free_hpa(cxlr);
+	up_write(&cxl_region_rwsem);
+
+	if (rc)
+		return rc;
+
+	return len;
+}
+
+static ssize_t size_show(struct device *dev, struct device_attribute *attr,
+			 char *buf)
+{
+	struct cxl_region *cxlr = to_cxl_region(dev);
+	struct cxl_region_params *p = &cxlr->params;
+	u64 size = 0;
+	ssize_t rc;
+
+	rc = down_read_interruptible(&cxl_region_rwsem);
+	if (rc)
+		return rc;
+	if (p->res)
+		size = resource_size(p->res);
+	rc = sysfs_emit(buf, "%#llx\n", size);
+	up_read(&cxl_region_rwsem);
+
+	return rc;
+}
+static DEVICE_ATTR_RW(size);
+
+static struct attribute *cxl_region_attrs[] = {
+	&dev_attr_uuid.attr,
+	&dev_attr_commit.attr,
+	&dev_attr_interleave_ways.attr,
+	&dev_attr_interleave_granularity.attr,
+	&dev_attr_resource.attr,
+	&dev_attr_size.attr,
+	NULL,
+};
+
+static const struct attribute_group cxl_region_group = {
+	.attrs = cxl_region_attrs,
+	.is_visible = cxl_region_visible,
+};
+
+static size_t show_targetN(struct cxl_region *cxlr, char *buf, int pos)
+{
+	struct cxl_region_params *p = &cxlr->params;
+	struct cxl_endpoint_decoder *cxled;
+	int rc;
+
+	rc = down_read_interruptible(&cxl_region_rwsem);
+	if (rc)
+		return rc;
+
+	if (pos >= p->interleave_ways) {
+		dev_dbg(&cxlr->dev, "position %d out of range %d\n", pos,
+			p->interleave_ways);
+		rc = -ENXIO;
+		goto out;
+	}
+
+	cxled = p->targets[pos];
+	if (!cxled)
+		rc = sysfs_emit(buf, "\n");
+	else
+		rc = sysfs_emit(buf, "%s\n", dev_name(&cxled->cxld.dev));
+out:
+	up_read(&cxl_region_rwsem);
+
+	return rc;
+}
+
+static int match_free_decoder(struct device *dev, void *data)
+{
+	struct cxl_decoder *cxld;
+	int *id = data;
+
+	if (!is_switch_decoder(dev))
+		return 0;
+
+	cxld = to_cxl_decoder(dev);
+
+	/* enforce ordered allocation */
+	if (cxld->id != *id)
+		return 0;
+
+	if (!cxld->region)
+		return 1;
+
+	(*id)++;
+
+	return 0;
+}
+
+static struct cxl_decoder *cxl_region_find_decoder(struct cxl_port *port,
+						   struct cxl_region *cxlr)
+{
+	struct device *dev;
+	int id = 0;
+
+	dev = device_find_child(&port->dev, &id, match_free_decoder);
+	if (!dev)
+		return NULL;
+	/*
+	 * This decoder is pinned registered as long as the endpoint decoder is
+	 * registered, and endpoint decoder unregistration holds the
+	 * cxl_region_rwsem over unregister events, so no need to hold on to
+	 * this extra reference.
+	 */
+	put_device(dev);
+	return to_cxl_decoder(dev);
+}
+
+static struct cxl_region_ref *alloc_region_ref(struct cxl_port *port,
+					       struct cxl_region *cxlr)
+{
+	struct cxl_region_params *p = &cxlr->params;
+	struct cxl_region_ref *cxl_rr, *iter;
+	unsigned long index;
+	int rc;
+
+	xa_for_each(&port->regions, index, iter) {
+		struct cxl_region_params *ip = &iter->region->params;
+
+		if (ip->res->start > p->res->start) {
+			dev_dbg(&cxlr->dev,
+				"%s: HPA order violation %s:%pr vs %pr\n",
+				dev_name(&port->dev),
+				dev_name(&iter->region->dev), ip->res, p->res);
+			return ERR_PTR(-EBUSY);
+		}
+	}
+
+	cxl_rr = kzalloc(sizeof(*cxl_rr), GFP_KERNEL);
+	if (!cxl_rr)
+		return ERR_PTR(-ENOMEM);
+	cxl_rr->port = port;
+	cxl_rr->region = cxlr;
+	cxl_rr->nr_targets = 1;
+	xa_init(&cxl_rr->endpoints);
+
+	rc = xa_insert(&port->regions, (unsigned long)cxlr, cxl_rr, GFP_KERNEL);
+	if (rc) {
+		dev_dbg(&cxlr->dev,
+			"%s: failed to track region reference: %d\n",
+			dev_name(&port->dev), rc);
+		kfree(cxl_rr);
+		return ERR_PTR(rc);
+	}
+
+	return cxl_rr;
+}
+
+static void free_region_ref(struct cxl_region_ref *cxl_rr)
+{
+	struct cxl_port *port = cxl_rr->port;
+	struct cxl_region *cxlr = cxl_rr->region;
+	struct cxl_decoder *cxld = cxl_rr->decoder;
+
+	dev_WARN_ONCE(&cxlr->dev, cxld->region != cxlr, "region mismatch\n");
+	if (cxld->region == cxlr) {
+		cxld->region = NULL;
+		put_device(&cxlr->dev);
+	}
+
+	xa_erase(&port->regions, (unsigned long)cxlr);
+	xa_destroy(&cxl_rr->endpoints);
+	kfree(cxl_rr);
+}
+
+static int cxl_rr_ep_add(struct cxl_region_ref *cxl_rr,
+			 struct cxl_endpoint_decoder *cxled)
+{
+	int rc;
+	struct cxl_port *port = cxl_rr->port;
+	struct cxl_region *cxlr = cxl_rr->region;
+	struct cxl_decoder *cxld = cxl_rr->decoder;
+	struct cxl_ep *ep = cxl_ep_load(port, cxled_to_memdev(cxled));
+
+	if (ep) {
+		rc = xa_insert(&cxl_rr->endpoints, (unsigned long)cxled, ep,
+			       GFP_KERNEL);
+		if (rc)
+			return rc;
+	}
+	cxl_rr->nr_eps++;
+
+	if (!cxld->region) {
+		cxld->region = cxlr;
+		get_device(&cxlr->dev);
+	}
+
+	return 0;
+}
+
+/**
+ * cxl_port_attach_region() - track a region's interest in a port by endpoint
+ * @port: port to add a new region reference 'struct cxl_region_ref'
+ * @cxlr: region to attach to @port
+ * @cxled: endpoint decoder used to create or further pin a region reference
+ * @pos: interleave position of @cxled in @cxlr
+ *
+ * The attach event is an opportunity to validate CXL decode setup
+ * constraints and record metadata needed for programming HDM decoders,
+ * in particular decoder target lists.
+ *
+ * The steps are:
+ *
+ * - validate that there are no other regions with a higher HPA already
+ *   associated with @port
+ * - establish a region reference if one is not already present
+ *
+ *   - additionally allocate a decoder instance that will host @cxlr on
+ *     @port
+ *
+ * - pin the region reference by the endpoint
+ * - account for how many entries in @port's target list are needed to
+ *   cover all of the added endpoints.
+ */
+static int cxl_port_attach_region(struct cxl_port *port,
+				  struct cxl_region *cxlr,
+				  struct cxl_endpoint_decoder *cxled, int pos)
+{
+	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+	struct cxl_ep *ep = cxl_ep_load(port, cxlmd);
+	struct cxl_region_ref *cxl_rr;
+	bool nr_targets_inc = false;
+	struct cxl_decoder *cxld;
+	unsigned long index;
+	int rc = -EBUSY;
+
+	lockdep_assert_held_write(&cxl_region_rwsem);
+
+	cxl_rr = cxl_rr_load(port, cxlr);
+	if (cxl_rr) {
+		struct cxl_ep *ep_iter;
+		int found = 0;
+
+		/*
+		 * Walk the existing endpoints that have been attached to
+		 * @cxlr at @port and see if they share the same 'next' port
+		 * in the downstream direction. I.e. endpoints that share common
+		 * upstream switch.
+		 */
+		xa_for_each(&cxl_rr->endpoints, index, ep_iter) {
+			if (ep_iter == ep)
+				continue;
+			if (ep_iter->next == ep->next) {
+				found++;
+				break;
+			}
+		}
+
+		/*
+		 * New target port, or @port is an endpoint port that always
+		 * accounts its own local decode as a target.
+		 */
+		if (!found || !ep->next) {
+			cxl_rr->nr_targets++;
+			nr_targets_inc = true;
+		}
+
+		/*
+		 * The decoder for @cxlr was allocated when the region was first
+		 * attached to @port.
+		 */
+		cxld = cxl_rr->decoder;
+	} else {
+		cxl_rr = alloc_region_ref(port, cxlr);
+		if (IS_ERR(cxl_rr)) {
+			dev_dbg(&cxlr->dev,
+				"%s: failed to allocate region reference\n",
+				dev_name(&port->dev));
+			return PTR_ERR(cxl_rr);
+		}
+		nr_targets_inc = true;
+
+		if (port == cxled_to_port(cxled))
+			cxld = &cxled->cxld;
+		else
+			cxld = cxl_region_find_decoder(port, cxlr);
+		if (!cxld) {
+			dev_dbg(&cxlr->dev, "%s: no decoder available\n",
+				dev_name(&port->dev));
+			goto out_erase;
+		}
+
+		if (cxld->region) {
+			dev_dbg(&cxlr->dev, "%s: %s already attached to %s\n",
+				dev_name(&port->dev), dev_name(&cxld->dev),
+				dev_name(&cxld->region->dev));
+			rc = -EBUSY;
+			goto out_erase;
+		}
+
+		cxl_rr->decoder = cxld;
+	}
+
+	rc = cxl_rr_ep_add(cxl_rr, cxled);
+	if (rc) {
+		dev_dbg(&cxlr->dev,
+			"%s: failed to track endpoint %s:%s reference\n",
+			dev_name(&port->dev), dev_name(&cxlmd->dev),
+			dev_name(&cxld->dev));
+		goto out_erase;
+	}
+
+	dev_dbg(&cxlr->dev,
+		"%s:%s %s add: %s:%s @ %d next: %s nr_eps: %d nr_targets: %d\n",
+		dev_name(port->uport), dev_name(&port->dev),
+		dev_name(&cxld->dev), dev_name(&cxlmd->dev),
+		dev_name(&cxled->cxld.dev), pos,
+		ep ? ep->next ? dev_name(ep->next->uport) :
+				      dev_name(&cxlmd->dev) :
+			   "none",
+		cxl_rr->nr_eps, cxl_rr->nr_targets);
+
+	return 0;
+out_erase:
+	if (nr_targets_inc)
+		cxl_rr->nr_targets--;
+	if (cxl_rr->nr_eps == 0)
+		free_region_ref(cxl_rr);
+	return rc;
+}
+
+static void cxl_port_detach_region(struct cxl_port *port,
+				   struct cxl_region *cxlr,
+				   struct cxl_endpoint_decoder *cxled)
+{
+	struct cxl_region_ref *cxl_rr;
+	struct cxl_ep *ep = NULL;
+
+	lockdep_assert_held_write(&cxl_region_rwsem);
+
+	cxl_rr = cxl_rr_load(port, cxlr);
+	if (!cxl_rr)
+		return;
+
+	/*
+	 * Endpoint ports do not carry cxl_ep references, and they
+	 * never target more than one endpoint by definition
+	 */
+	if (cxl_rr->decoder == &cxled->cxld)
+		cxl_rr->nr_eps--;
+	else
+		ep = xa_erase(&cxl_rr->endpoints, (unsigned long)cxled);
+	if (ep) {
+		struct cxl_ep *ep_iter;
+		unsigned long index;
+		int found = 0;
+
+		cxl_rr->nr_eps--;
+		xa_for_each(&cxl_rr->endpoints, index, ep_iter) {
+			if (ep_iter->next == ep->next) {
+				found++;
+				break;
+			}
+		}
+		if (!found)
+			cxl_rr->nr_targets--;
+	}
+
+	if (cxl_rr->nr_eps == 0)
+		free_region_ref(cxl_rr);
+}
+
+static int check_last_peer(struct cxl_endpoint_decoder *cxled,
+			   struct cxl_ep *ep, struct cxl_region_ref *cxl_rr,
+			   int distance)
+{
+	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+	struct cxl_region *cxlr = cxl_rr->region;
+	struct cxl_region_params *p = &cxlr->params;
+	struct cxl_endpoint_decoder *cxled_peer;
+	struct cxl_port *port = cxl_rr->port;
+	struct cxl_memdev *cxlmd_peer;
+	struct cxl_ep *ep_peer;
+	int pos = cxled->pos;
+
+	/*
+	 * If this position wants to share a dport with the last endpoint mapped
+	 * then that endpoint, at index 'position - distance', must also be
+	 * mapped by this dport.
+	 */
+	if (pos < distance) {
+		dev_dbg(&cxlr->dev, "%s:%s: cannot host %s:%s at %d\n",
+			dev_name(port->uport), dev_name(&port->dev),
+			dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), pos);
+		return -ENXIO;
+	}
+	cxled_peer = p->targets[pos - distance];
+	cxlmd_peer = cxled_to_memdev(cxled_peer);
+	ep_peer = cxl_ep_load(port, cxlmd_peer);
+	if (ep->dport != ep_peer->dport) {
+		dev_dbg(&cxlr->dev,
+			"%s:%s: %s:%s pos %d mismatched peer %s:%s\n",
+			dev_name(port->uport), dev_name(&port->dev),
+			dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), pos,
+			dev_name(&cxlmd_peer->dev),
+			dev_name(&cxled_peer->cxld.dev));
+		return -ENXIO;
+	}
+
+	return 0;
+}
+
+static int cxl_port_setup_targets(struct cxl_port *port,
+				  struct cxl_region *cxlr,
+				  struct cxl_endpoint_decoder *cxled)
+{
+	struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
+	int parent_iw, parent_ig, ig, iw, rc, inc = 0, pos = cxled->pos;
+	struct cxl_port *parent_port = to_cxl_port(port->dev.parent);
+	struct cxl_region_ref *cxl_rr = cxl_rr_load(port, cxlr);
+	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+	struct cxl_ep *ep = cxl_ep_load(port, cxlmd);
+	struct cxl_region_params *p = &cxlr->params;
+	struct cxl_decoder *cxld = cxl_rr->decoder;
+	struct cxl_switch_decoder *cxlsd;
+	u16 eig, peig;
+	u8 eiw, peiw;
+
+	/*
+	 * While root level decoders support x3, x6, x12, switch level
+	 * decoders only support powers of 2 up to x16.
+	 */
+	if (!is_power_of_2(cxl_rr->nr_targets)) {
+		dev_dbg(&cxlr->dev, "%s:%s: invalid target count %d\n",
+			dev_name(port->uport), dev_name(&port->dev),
+			cxl_rr->nr_targets);
+		return -EINVAL;
+	}
+
+	cxlsd = to_cxl_switch_decoder(&cxld->dev);
+	if (cxl_rr->nr_targets_set) {
+		int i, distance;
+
+		distance = p->nr_targets / cxl_rr->nr_targets;
+		for (i = 0; i < cxl_rr->nr_targets_set; i++)
+			if (ep->dport == cxlsd->target[i]) {
+				rc = check_last_peer(cxled, ep, cxl_rr,
+						     distance);
+				if (rc)
+					return rc;
+				goto out_target_set;
+			}
+		goto add_target;
+	}
+
+	if (is_cxl_root(parent_port)) {
+		parent_ig = cxlrd->cxlsd.cxld.interleave_granularity;
+		parent_iw = cxlrd->cxlsd.cxld.interleave_ways;
+		/*
+		 * For purposes of address bit routing, use power-of-2 math for
+		 * switch ports.
+		 */
+		if (!is_power_of_2(parent_iw))
+			parent_iw /= 3;
+	} else {
+		struct cxl_region_ref *parent_rr;
+		struct cxl_decoder *parent_cxld;
+
+		parent_rr = cxl_rr_load(parent_port, cxlr);
+		parent_cxld = parent_rr->decoder;
+		parent_ig = parent_cxld->interleave_granularity;
+		parent_iw = parent_cxld->interleave_ways;
+	}
+
+	rc = granularity_to_cxl(parent_ig, &peig);
+	if (rc) {
+		dev_dbg(&cxlr->dev, "%s:%s: invalid parent granularity: %d\n",
+			dev_name(parent_port->uport),
+			dev_name(&parent_port->dev), parent_ig);
+		return rc;
+	}
+
+	rc = ways_to_cxl(parent_iw, &peiw);
+	if (rc) {
+		dev_dbg(&cxlr->dev, "%s:%s: invalid parent interleave: %d\n",
+			dev_name(parent_port->uport),
+			dev_name(&parent_port->dev), parent_iw);
+		return rc;
+	}
+
+	iw = cxl_rr->nr_targets;
+	rc = ways_to_cxl(iw, &eiw);
+	if (rc) {
+		dev_dbg(&cxlr->dev, "%s:%s: invalid port interleave: %d\n",
+			dev_name(port->uport), dev_name(&port->dev), iw);
+		return rc;
+	}
+
+	/*
+	 * If @parent_port is masking address bits, pick the next unused address
+	 * bit to route @port's targets.
+	 */
+	if (parent_iw > 1 && cxl_rr->nr_targets > 1) {
+		u32 address_bit = max(peig + peiw, eiw + peig);
+
+		eig = address_bit - eiw + 1;
+	} else {
+		eiw = peiw;
+		eig = peig;
+	}
+
+	rc = cxl_to_granularity(eig, &ig);
+	if (rc) {
+		dev_dbg(&cxlr->dev, "%s:%s: invalid interleave: %d\n",
+			dev_name(port->uport), dev_name(&port->dev),
+			256 << eig);
+		return rc;
+	}
+
+	cxld->interleave_ways = iw;
+	cxld->interleave_granularity = ig;
+	cxld->hpa_range = (struct range) {
+		.start = p->res->start,
+		.end = p->res->end,
+	};
+	dev_dbg(&cxlr->dev, "%s:%s iw: %d ig: %d\n", dev_name(port->uport),
+		dev_name(&port->dev), iw, ig);
+add_target:
+	if (cxl_rr->nr_targets_set == cxl_rr->nr_targets) {
+		dev_dbg(&cxlr->dev,
+			"%s:%s: targets full trying to add %s:%s at %d\n",
+			dev_name(port->uport), dev_name(&port->dev),
+			dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), pos);
+		return -ENXIO;
+	}
+	cxlsd->target[cxl_rr->nr_targets_set] = ep->dport;
+	inc = 1;
+out_target_set:
+	cxl_rr->nr_targets_set += inc;
+	dev_dbg(&cxlr->dev, "%s:%s target[%d] = %s for %s:%s @ %d\n",
+		dev_name(port->uport), dev_name(&port->dev),
+		cxl_rr->nr_targets_set - 1, dev_name(ep->dport->dport),
+		dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), pos);
+
+	return 0;
+}
+
+static void cxl_port_reset_targets(struct cxl_port *port,
+				   struct cxl_region *cxlr)
+{
+	struct cxl_region_ref *cxl_rr = cxl_rr_load(port, cxlr);
+	struct cxl_decoder *cxld;
+
+	/*
+	 * After the last endpoint has been detached the entire cxl_rr may now
+	 * be gone.
+	 */
+	if (!cxl_rr)
+		return;
+	cxl_rr->nr_targets_set = 0;
+
+	cxld = cxl_rr->decoder;
+	cxld->hpa_range = (struct range) {
+		.start = 0,
+		.end = -1,
+	};
+}
+
+static void cxl_region_teardown_targets(struct cxl_region *cxlr)
+{
+	struct cxl_region_params *p = &cxlr->params;
+	struct cxl_endpoint_decoder *cxled;
+	struct cxl_memdev *cxlmd;
+	struct cxl_port *iter;
+	struct cxl_ep *ep;
+	int i;
+
+	for (i = 0; i < p->nr_targets; i++) {
+		cxled = p->targets[i];
+		cxlmd = cxled_to_memdev(cxled);
+
+		iter = cxled_to_port(cxled);
+		while (!is_cxl_root(to_cxl_port(iter->dev.parent)))
+			iter = to_cxl_port(iter->dev.parent);
+
+		for (ep = cxl_ep_load(iter, cxlmd); iter;
+		     iter = ep->next, ep = cxl_ep_load(iter, cxlmd))
+			cxl_port_reset_targets(iter, cxlr);
+	}
+}
+
+static int cxl_region_setup_targets(struct cxl_region *cxlr)
+{
+	struct cxl_region_params *p = &cxlr->params;
+	struct cxl_endpoint_decoder *cxled;
+	struct cxl_memdev *cxlmd;
+	struct cxl_port *iter;
+	struct cxl_ep *ep;
+	int i, rc;
+
+	for (i = 0; i < p->nr_targets; i++) {
+		cxled = p->targets[i];
+		cxlmd = cxled_to_memdev(cxled);
+
+		iter = cxled_to_port(cxled);
+		while (!is_cxl_root(to_cxl_port(iter->dev.parent)))
+			iter = to_cxl_port(iter->dev.parent);
+
+		/*
+		 * Descend the topology tree programming targets while
+		 * looking for conflicts.
+		 */
+		for (ep = cxl_ep_load(iter, cxlmd); iter;
+		     iter = ep->next, ep = cxl_ep_load(iter, cxlmd)) {
+			rc = cxl_port_setup_targets(iter, cxlr, cxled);
+			if (rc) {
+				cxl_region_teardown_targets(cxlr);
+				return rc;
+			}
+		}
+	}
+
+	return 0;
+}
+
+static int cxl_region_attach(struct cxl_region *cxlr,
+			     struct cxl_endpoint_decoder *cxled, int pos)
+{
+	struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
+	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+	struct cxl_port *ep_port, *root_port, *iter;
+	struct cxl_region_params *p = &cxlr->params;
+	struct cxl_dport *dport;
+	int i, rc = -ENXIO;
+
+	if (cxled->mode == CXL_DECODER_DEAD) {
+		dev_dbg(&cxlr->dev, "%s dead\n", dev_name(&cxled->cxld.dev));
+		return -ENODEV;
+	}
+
+	/* all full of members, or interleave config not established? */
+	if (p->state > CXL_CONFIG_INTERLEAVE_ACTIVE) {
+		dev_dbg(&cxlr->dev, "region already active\n");
+		return -EBUSY;
+	} else if (p->state < CXL_CONFIG_INTERLEAVE_ACTIVE) {
+		dev_dbg(&cxlr->dev, "interleave config missing\n");
+		return -ENXIO;
+	}
+
+	if (pos < 0 || pos >= p->interleave_ways) {
+		dev_dbg(&cxlr->dev, "position %d out of range %d\n", pos,
+			p->interleave_ways);
+		return -ENXIO;
+	}
+
+	if (p->targets[pos] == cxled)
+		return 0;
+
+	if (p->targets[pos]) {
+		struct cxl_endpoint_decoder *cxled_target = p->targets[pos];
+		struct cxl_memdev *cxlmd_target = cxled_to_memdev(cxled_target);
+
+		dev_dbg(&cxlr->dev, "position %d already assigned to %s:%s\n",
+			pos, dev_name(&cxlmd_target->dev),
+			dev_name(&cxled_target->cxld.dev));
+		return -EBUSY;
+	}
+
+	for (i = 0; i < p->interleave_ways; i++) {
+		struct cxl_endpoint_decoder *cxled_target;
+		struct cxl_memdev *cxlmd_target;
+
+		cxled_target = p->targets[pos];
+		if (!cxled_target)
+			continue;
+
+		cxlmd_target = cxled_to_memdev(cxled_target);
+		if (cxlmd_target == cxlmd) {
+			dev_dbg(&cxlr->dev,
+				"%s already specified at position %d via: %s\n",
+				dev_name(&cxlmd->dev), pos,
+				dev_name(&cxled_target->cxld.dev));
+			return -EBUSY;
+		}
+	}
+
+	ep_port = cxled_to_port(cxled);
+	root_port = cxlrd_to_port(cxlrd);
+	dport = cxl_find_dport_by_dev(root_port, ep_port->host_bridge);
+	if (!dport) {
+		dev_dbg(&cxlr->dev, "%s:%s invalid target for %s\n",
+			dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
+			dev_name(cxlr->dev.parent));
+		return -ENXIO;
+	}
+
+	if (cxlrd->calc_hb(cxlrd, pos) != dport) {
+		dev_dbg(&cxlr->dev, "%s:%s invalid target position for %s\n",
+			dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
+			dev_name(&cxlrd->cxlsd.cxld.dev));
+		return -ENXIO;
+	}
+
+	if (cxled->cxld.target_type != cxlr->type) {
+		dev_dbg(&cxlr->dev, "%s:%s type mismatch: %d vs %d\n",
+			dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
+			cxled->cxld.target_type, cxlr->type);
+		return -ENXIO;
+	}
+
+	if (!cxled->dpa_res) {
+		dev_dbg(&cxlr->dev, "%s:%s: missing DPA allocation.\n",
+			dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev));
+		return -ENXIO;
+	}
+
+	if (resource_size(cxled->dpa_res) * p->interleave_ways !=
+	    resource_size(p->res)) {
+		dev_dbg(&cxlr->dev,
+			"%s:%s: decoder-size-%#llx * ways-%d != region-size-%#llx\n",
+			dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
+			(u64)resource_size(cxled->dpa_res), p->interleave_ways,
+			(u64)resource_size(p->res));
+		return -EINVAL;
+	}
+
+	for (iter = ep_port; !is_cxl_root(iter);
+	     iter = to_cxl_port(iter->dev.parent)) {
+		rc = cxl_port_attach_region(iter, cxlr, cxled, pos);
+		if (rc)
+			goto err;
+	}
+
+	p->targets[pos] = cxled;
+	cxled->pos = pos;
+	p->nr_targets++;
+
+	if (p->nr_targets == p->interleave_ways) {
+		rc = cxl_region_setup_targets(cxlr);
+		if (rc)
+			goto err_decrement;
+		p->state = CXL_CONFIG_ACTIVE;
+	}
+
+	cxled->cxld.interleave_ways = p->interleave_ways;
+	cxled->cxld.interleave_granularity = p->interleave_granularity;
+	cxled->cxld.hpa_range = (struct range) {
+		.start = p->res->start,
+		.end = p->res->end,
+	};
+
+	return 0;
+
+err_decrement:
+	p->nr_targets--;
+err:
+	for (iter = ep_port; !is_cxl_root(iter);
+	     iter = to_cxl_port(iter->dev.parent))
+		cxl_port_detach_region(iter, cxlr, cxled);
+	return rc;
+}
+
+static int cxl_region_detach(struct cxl_endpoint_decoder *cxled)
+{
+	struct cxl_port *iter, *ep_port = cxled_to_port(cxled);
+	struct cxl_region *cxlr = cxled->cxld.region;
+	struct cxl_region_params *p;
+	int rc = 0;
+
+	lockdep_assert_held_write(&cxl_region_rwsem);
+
+	if (!cxlr)
+		return 0;
+
+	p = &cxlr->params;
+	get_device(&cxlr->dev);
+
+	if (p->state > CXL_CONFIG_ACTIVE) {
+		/*
+		 * TODO: tear down all impacted regions if a device is
+		 * removed out of order
+		 */
+		rc = cxl_region_decode_reset(cxlr, p->interleave_ways);
+		if (rc)
+			goto out;
+		p->state = CXL_CONFIG_ACTIVE;
+	}
+
+	for (iter = ep_port; !is_cxl_root(iter);
+	     iter = to_cxl_port(iter->dev.parent))
+		cxl_port_detach_region(iter, cxlr, cxled);
+
+	if (cxled->pos < 0 || cxled->pos >= p->interleave_ways ||
+	    p->targets[cxled->pos] != cxled) {
+		struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+
+		dev_WARN_ONCE(&cxlr->dev, 1, "expected %s:%s at position %d\n",
+			      dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
+			      cxled->pos);
+		goto out;
+	}
+
+	if (p->state == CXL_CONFIG_ACTIVE) {
+		p->state = CXL_CONFIG_INTERLEAVE_ACTIVE;
+		cxl_region_teardown_targets(cxlr);
+	}
+	p->targets[cxled->pos] = NULL;
+	p->nr_targets--;
+	cxled->cxld.hpa_range = (struct range) {
+		.start = 0,
+		.end = -1,
+	};
+
+	/* notify the region driver that one of its targets has departed */
+	up_write(&cxl_region_rwsem);
+	device_release_driver(&cxlr->dev);
+	down_write(&cxl_region_rwsem);
+out:
+	put_device(&cxlr->dev);
+	return rc;
+}
+
+void cxl_decoder_kill_region(struct cxl_endpoint_decoder *cxled)
+{
+	down_write(&cxl_region_rwsem);
+	cxled->mode = CXL_DECODER_DEAD;
+	cxl_region_detach(cxled);
+	up_write(&cxl_region_rwsem);
+}
+
+static int attach_target(struct cxl_region *cxlr, const char *decoder, int pos)
+{
+	struct device *dev;
+	int rc;
+
+	dev = bus_find_device_by_name(&cxl_bus_type, NULL, decoder);
+	if (!dev)
+		return -ENODEV;
+
+	if (!is_endpoint_decoder(dev)) {
+		put_device(dev);
+		return -EINVAL;
+	}
+
+	rc = down_write_killable(&cxl_region_rwsem);
+	if (rc)
+		goto out;
+	down_read(&cxl_dpa_rwsem);
+	rc = cxl_region_attach(cxlr, to_cxl_endpoint_decoder(dev), pos);
+	up_read(&cxl_dpa_rwsem);
+	up_write(&cxl_region_rwsem);
+out:
+	put_device(dev);
+	return rc;
+}
+
+static int detach_target(struct cxl_region *cxlr, int pos)
+{
+	struct cxl_region_params *p = &cxlr->params;
+	int rc;
+
+	rc = down_write_killable(&cxl_region_rwsem);
+	if (rc)
+		return rc;
+
+	if (pos >= p->interleave_ways) {
+		dev_dbg(&cxlr->dev, "position %d out of range %d\n", pos,
+			p->interleave_ways);
+		rc = -ENXIO;
+		goto out;
+	}
+
+	if (!p->targets[pos]) {
+		rc = 0;
+		goto out;
+	}
+
+	rc = cxl_region_detach(p->targets[pos]);
+out:
+	up_write(&cxl_region_rwsem);
+	return rc;
+}
+
+static size_t store_targetN(struct cxl_region *cxlr, const char *buf, int pos,
+			    size_t len)
+{
+	int rc;
+
+	if (sysfs_streq(buf, "\n"))
+		rc = detach_target(cxlr, pos);
+	else
+		rc = attach_target(cxlr, buf, pos);
+
+	if (rc < 0)
+		return rc;
+	return len;
+}
+
+#define TARGET_ATTR_RW(n)                                              \
+static ssize_t target##n##_show(                                       \
+	struct device *dev, struct device_attribute *attr, char *buf)  \
+{                                                                      \
+	return show_targetN(to_cxl_region(dev), buf, (n));             \
+}                                                                      \
+static ssize_t target##n##_store(struct device *dev,                   \
+				 struct device_attribute *attr,        \
+				 const char *buf, size_t len)          \
+{                                                                      \
+	return store_targetN(to_cxl_region(dev), buf, (n), len);       \
+}                                                                      \
+static DEVICE_ATTR_RW(target##n)
+
+TARGET_ATTR_RW(0);
+TARGET_ATTR_RW(1);
+TARGET_ATTR_RW(2);
+TARGET_ATTR_RW(3);
+TARGET_ATTR_RW(4);
+TARGET_ATTR_RW(5);
+TARGET_ATTR_RW(6);
+TARGET_ATTR_RW(7);
+TARGET_ATTR_RW(8);
+TARGET_ATTR_RW(9);
+TARGET_ATTR_RW(10);
+TARGET_ATTR_RW(11);
+TARGET_ATTR_RW(12);
+TARGET_ATTR_RW(13);
+TARGET_ATTR_RW(14);
+TARGET_ATTR_RW(15);
+
+static struct attribute *target_attrs[] = {
+	&dev_attr_target0.attr,
+	&dev_attr_target1.attr,
+	&dev_attr_target2.attr,
+	&dev_attr_target3.attr,
+	&dev_attr_target4.attr,
+	&dev_attr_target5.attr,
+	&dev_attr_target6.attr,
+	&dev_attr_target7.attr,
+	&dev_attr_target8.attr,
+	&dev_attr_target9.attr,
+	&dev_attr_target10.attr,
+	&dev_attr_target11.attr,
+	&dev_attr_target12.attr,
+	&dev_attr_target13.attr,
+	&dev_attr_target14.attr,
+	&dev_attr_target15.attr,
+	NULL,
+};
+
+static umode_t cxl_region_target_visible(struct kobject *kobj,
+					 struct attribute *a, int n)
+{
+	struct device *dev = kobj_to_dev(kobj);
+	struct cxl_region *cxlr = to_cxl_region(dev);
+	struct cxl_region_params *p = &cxlr->params;
+
+	if (n < p->interleave_ways)
+		return a->mode;
+	return 0;
+}
+
+static const struct attribute_group cxl_region_target_group = {
+	.attrs = target_attrs,
+	.is_visible = cxl_region_target_visible,
+};
+
+static const struct attribute_group *get_cxl_region_target_group(void)
+{
+	return &cxl_region_target_group;
+}
+
+static const struct attribute_group *region_groups[] = {
+	&cxl_base_attribute_group,
+	&cxl_region_group,
+	&cxl_region_target_group,
+	NULL,
+};
+
+static void cxl_region_release(struct device *dev)
+{
+	struct cxl_region *cxlr = to_cxl_region(dev);
+
+	memregion_free(cxlr->id);
+	kfree(cxlr);
+}
+
+const struct device_type cxl_region_type = {
+	.name = "cxl_region",
+	.release = cxl_region_release,
+	.groups = region_groups
+};
+
+bool is_cxl_region(struct device *dev)
+{
+	return dev->type == &cxl_region_type;
+}
+EXPORT_SYMBOL_NS_GPL(is_cxl_region, CXL);
+
+static struct cxl_region *to_cxl_region(struct device *dev)
+{
+	if (dev_WARN_ONCE(dev, dev->type != &cxl_region_type,
+			  "not a cxl_region device\n"))
+		return NULL;
+
+	return container_of(dev, struct cxl_region, dev);
+}
+
+static void unregister_region(void *dev)
+{
+	struct cxl_region *cxlr = to_cxl_region(dev);
+
+	device_del(dev);
+	cxl_region_iomem_release(cxlr);
+	put_device(dev);
+}
+
+static struct lock_class_key cxl_region_key;
+
+static struct cxl_region *cxl_region_alloc(struct cxl_root_decoder *cxlrd, int id)
+{
+	struct cxl_region *cxlr;
+	struct device *dev;
+
+	cxlr = kzalloc(sizeof(*cxlr), GFP_KERNEL);
+	if (!cxlr) {
+		memregion_free(id);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	dev = &cxlr->dev;
+	device_initialize(dev);
+	lockdep_set_class(&dev->mutex, &cxl_region_key);
+	dev->parent = &cxlrd->cxlsd.cxld.dev;
+	device_set_pm_not_required(dev);
+	dev->bus = &cxl_bus_type;
+	dev->type = &cxl_region_type;
+	cxlr->id = id;
+
+	return cxlr;
+}
+
+/**
+ * devm_cxl_add_region - Adds a region to a decoder
+ * @cxlrd: root decoder
+ * @id: memregion id to create, or memregion_free() on failure
+ * @mode: mode for the endpoint decoders of this region
+ * @type: select whether this is an expander or accelerator (type-2 or type-3)
+ *
+ * This is the second step of region initialization. Regions exist within an
+ * address space which is mapped by a @cxlrd.
+ *
+ * Return: 0 if the region was added to the @cxlrd, else returns negative error
+ * code. The region will be named "regionZ" where Z is the unique region number.
+ */
+static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd,
+					      int id,
+					      enum cxl_decoder_mode mode,
+					      enum cxl_decoder_type type)
+{
+	struct cxl_port *port = to_cxl_port(cxlrd->cxlsd.cxld.dev.parent);
+	struct cxl_region *cxlr;
+	struct device *dev;
+	int rc;
+
+	cxlr = cxl_region_alloc(cxlrd, id);
+	if (IS_ERR(cxlr))
+		return cxlr;
+	cxlr->mode = mode;
+	cxlr->type = type;
+
+	dev = &cxlr->dev;
+	rc = dev_set_name(dev, "region%d", id);
+	if (rc)
+		goto err;
+
+	rc = device_add(dev);
+	if (rc)
+		goto err;
+
+	rc = devm_add_action_or_reset(port->uport, unregister_region, cxlr);
+	if (rc)
+		return ERR_PTR(rc);
+
+	dev_dbg(port->uport, "%s: created %s\n",
+		dev_name(&cxlrd->cxlsd.cxld.dev), dev_name(dev));
+	return cxlr;
+
+err:
+	put_device(dev);
+	return ERR_PTR(rc);
+}
+
+static ssize_t create_pmem_region_show(struct device *dev,
+				       struct device_attribute *attr, char *buf)
+{
+	struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev);
+
+	return sysfs_emit(buf, "region%u\n", atomic_read(&cxlrd->region_id));
+}
+
+static ssize_t create_pmem_region_store(struct device *dev,
+					struct device_attribute *attr,
+					const char *buf, size_t len)
+{
+	struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev);
+	struct cxl_region *cxlr;
+	int id, rc;
+
+	rc = sscanf(buf, "region%d\n", &id);
+	if (rc != 1)
+		return -EINVAL;
+
+	rc = memregion_alloc(GFP_KERNEL);
+	if (rc < 0)
+		return rc;
+
+	if (atomic_cmpxchg(&cxlrd->region_id, id, rc) != id) {
+		memregion_free(rc);
+		return -EBUSY;
+	}
+
+	cxlr = devm_cxl_add_region(cxlrd, id, CXL_DECODER_PMEM,
+				   CXL_DECODER_EXPANDER);
+	if (IS_ERR(cxlr))
+		return PTR_ERR(cxlr);
+
+	return len;
+}
+DEVICE_ATTR_RW(create_pmem_region);
+
+static ssize_t region_show(struct device *dev, struct device_attribute *attr,
+			   char *buf)
+{
+	struct cxl_decoder *cxld = to_cxl_decoder(dev);
+	ssize_t rc;
+
+	rc = down_read_interruptible(&cxl_region_rwsem);
+	if (rc)
+		return rc;
+
+	if (cxld->region)
+		rc = sysfs_emit(buf, "%s\n", dev_name(&cxld->region->dev));
+	else
+		rc = sysfs_emit(buf, "\n");
+	up_read(&cxl_region_rwsem);
+
+	return rc;
+}
+DEVICE_ATTR_RO(region);
+
+static struct cxl_region *
+cxl_find_region_by_name(struct cxl_root_decoder *cxlrd, const char *name)
+{
+	struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld;
+	struct device *region_dev;
+
+	region_dev = device_find_child_by_name(&cxld->dev, name);
+	if (!region_dev)
+		return ERR_PTR(-ENODEV);
+
+	return to_cxl_region(region_dev);
+}
+
+static ssize_t delete_region_store(struct device *dev,
+				   struct device_attribute *attr,
+				   const char *buf, size_t len)
+{
+	struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev);
+	struct cxl_port *port = to_cxl_port(dev->parent);
+	struct cxl_region *cxlr;
+
+	cxlr = cxl_find_region_by_name(cxlrd, buf);
+	if (IS_ERR(cxlr))
+		return PTR_ERR(cxlr);
+
+	devm_release_action(port->uport, unregister_region, cxlr);
+	put_device(&cxlr->dev);
+
+	return len;
+}
+DEVICE_ATTR_WO(delete_region);
+
+static void cxl_pmem_region_release(struct device *dev)
+{
+	struct cxl_pmem_region *cxlr_pmem = to_cxl_pmem_region(dev);
+	int i;
+
+	for (i = 0; i < cxlr_pmem->nr_mappings; i++) {
+		struct cxl_memdev *cxlmd = cxlr_pmem->mapping[i].cxlmd;
+
+		put_device(&cxlmd->dev);
+	}
+
+	kfree(cxlr_pmem);
+}
+
+static const struct attribute_group *cxl_pmem_region_attribute_groups[] = {
+	&cxl_base_attribute_group,
+	NULL,
+};
+
+const struct device_type cxl_pmem_region_type = {
+	.name = "cxl_pmem_region",
+	.release = cxl_pmem_region_release,
+	.groups = cxl_pmem_region_attribute_groups,
+};
+
+bool is_cxl_pmem_region(struct device *dev)
+{
+	return dev->type == &cxl_pmem_region_type;
+}
+EXPORT_SYMBOL_NS_GPL(is_cxl_pmem_region, CXL);
+
+struct cxl_pmem_region *to_cxl_pmem_region(struct device *dev)
+{
+	if (dev_WARN_ONCE(dev, !is_cxl_pmem_region(dev),
+			  "not a cxl_pmem_region device\n"))
+		return NULL;
+	return container_of(dev, struct cxl_pmem_region, dev);
+}
+EXPORT_SYMBOL_NS_GPL(to_cxl_pmem_region, CXL);
+
+static struct lock_class_key cxl_pmem_region_key;
+
+static struct cxl_pmem_region *cxl_pmem_region_alloc(struct cxl_region *cxlr)
+{
+	struct cxl_region_params *p = &cxlr->params;
+	struct cxl_pmem_region *cxlr_pmem;
+	struct device *dev;
+	int i;
+
+	down_read(&cxl_region_rwsem);
+	if (p->state != CXL_CONFIG_COMMIT) {
+		cxlr_pmem = ERR_PTR(-ENXIO);
+		goto out;
+	}
+
+	cxlr_pmem = kzalloc(struct_size(cxlr_pmem, mapping, p->nr_targets),
+			    GFP_KERNEL);
+	if (!cxlr_pmem) {
+		cxlr_pmem = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+
+	cxlr_pmem->hpa_range.start = p->res->start;
+	cxlr_pmem->hpa_range.end = p->res->end;
+
+	/* Snapshot the region configuration underneath the cxl_region_rwsem */
+	cxlr_pmem->nr_mappings = p->nr_targets;
+	for (i = 0; i < p->nr_targets; i++) {
+		struct cxl_endpoint_decoder *cxled = p->targets[i];
+		struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+		struct cxl_pmem_region_mapping *m = &cxlr_pmem->mapping[i];
+
+		m->cxlmd = cxlmd;
+		get_device(&cxlmd->dev);
+		m->start = cxled->dpa_res->start;
+		m->size = resource_size(cxled->dpa_res);
+		m->position = i;
+	}
+
+	dev = &cxlr_pmem->dev;
+	cxlr_pmem->cxlr = cxlr;
+	device_initialize(dev);
+	lockdep_set_class(&dev->mutex, &cxl_pmem_region_key);
+	device_set_pm_not_required(dev);
+	dev->parent = &cxlr->dev;
+	dev->bus = &cxl_bus_type;
+	dev->type = &cxl_pmem_region_type;
+out:
+	up_read(&cxl_region_rwsem);
+
+	return cxlr_pmem;
+}
+
+static void cxlr_pmem_unregister(void *dev)
+{
+	device_unregister(dev);
+}
+
+/**
+ * devm_cxl_add_pmem_region() - add a cxl_region-to-nd_region bridge
+ * @cxlr: parent CXL region for this pmem region bridge device
+ *
+ * Return: 0 on success negative error code on failure.
+ */
+static int devm_cxl_add_pmem_region(struct cxl_region *cxlr)
+{
+	struct cxl_pmem_region *cxlr_pmem;
+	struct device *dev;
+	int rc;
+
+	cxlr_pmem = cxl_pmem_region_alloc(cxlr);
+	if (IS_ERR(cxlr_pmem))
+		return PTR_ERR(cxlr_pmem);
+
+	dev = &cxlr_pmem->dev;
+	rc = dev_set_name(dev, "pmem_region%d", cxlr->id);
+	if (rc)
+		goto err;
+
+	rc = device_add(dev);
+	if (rc)
+		goto err;
+
+	dev_dbg(&cxlr->dev, "%s: register %s\n", dev_name(dev->parent),
+		dev_name(dev));
+
+	return devm_add_action_or_reset(&cxlr->dev, cxlr_pmem_unregister, dev);
+
+err:
+	put_device(dev);
+	return rc;
+}
+
+static int cxl_region_probe(struct device *dev)
+{
+	struct cxl_region *cxlr = to_cxl_region(dev);
+	struct cxl_region_params *p = &cxlr->params;
+	int rc;
+
+	rc = down_read_interruptible(&cxl_region_rwsem);
+	if (rc) {
+		dev_dbg(&cxlr->dev, "probe interrupted\n");
+		return rc;
+	}
+
+	if (p->state < CXL_CONFIG_COMMIT) {
+		dev_dbg(&cxlr->dev, "config state: %d\n", p->state);
+		rc = -ENXIO;
+	}
+
+	/*
+	 * From this point on any path that changes the region's state away from
+	 * CXL_CONFIG_COMMIT is also responsible for releasing the driver.
+	 */
+	up_read(&cxl_region_rwsem);
+
+	switch (cxlr->mode) {
+	case CXL_DECODER_PMEM:
+		return devm_cxl_add_pmem_region(cxlr);
+	default:
+		dev_dbg(&cxlr->dev, "unsupported region mode: %d\n",
+			cxlr->mode);
+		return -ENXIO;
+	}
+}
+
+static struct cxl_driver cxl_region_driver = {
+	.name = "cxl_region",
+	.probe = cxl_region_probe,
+	.id = CXL_DEVICE_REGION,
+};
+
+int cxl_region_init(void)
+{
+	return cxl_driver_register(&cxl_region_driver);
+}
+
+void cxl_region_exit(void)
+{
+	cxl_driver_unregister(&cxl_region_driver);
+}
+
+MODULE_IMPORT_NS(CXL);
+MODULE_ALIAS_CXL(CXL_DEVICE_REGION);
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index 6799b27c7db2..f680450f0b16 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -7,6 +7,7 @@
 #include <linux/libnvdimm.h>
 #include <linux/bitfield.h>
 #include <linux/bitops.h>
+#include <linux/log2.h>
 #include <linux/io.h>
 
 /**
@@ -53,9 +54,12 @@
 #define   CXL_HDM_DECODER0_CTRL_LOCK BIT(8)
 #define   CXL_HDM_DECODER0_CTRL_COMMIT BIT(9)
 #define   CXL_HDM_DECODER0_CTRL_COMMITTED BIT(10)
+#define   CXL_HDM_DECODER0_CTRL_COMMIT_ERROR BIT(11)
 #define   CXL_HDM_DECODER0_CTRL_TYPE BIT(12)
 #define CXL_HDM_DECODER0_TL_LOW(i) (0x20 * (i) + 0x24)
 #define CXL_HDM_DECODER0_TL_HIGH(i) (0x20 * (i) + 0x28)
+#define CXL_HDM_DECODER0_SKIP_LOW(i) CXL_HDM_DECODER0_TL_LOW(i)
+#define CXL_HDM_DECODER0_SKIP_HIGH(i) CXL_HDM_DECODER0_TL_HIGH(i)
 
 static inline int cxl_hdm_decoder_count(u32 cap_hdr)
 {
@@ -64,6 +68,57 @@ static inline int cxl_hdm_decoder_count(u32 cap_hdr)
 	return val ? val * 2 : 1;
 }
 
+/* Encode defined in CXL 2.0 8.2.5.12.7 HDM Decoder Control Register */
+static inline int cxl_to_granularity(u16 ig, unsigned int *val)
+{
+	if (ig > 6)
+		return -EINVAL;
+	*val = 256 << ig;
+	return 0;
+}
+
+/* Encode defined in CXL ECN "3, 6, 12 and 16-way memory Interleaving" */
+static inline int cxl_to_ways(u8 eniw, unsigned int *val)
+{
+	switch (eniw) {
+	case 0 ... 4:
+		*val = 1 << eniw;
+		break;
+	case 8 ... 10:
+		*val = 3 << (eniw - 8);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static inline int granularity_to_cxl(int g, u16 *ig)
+{
+	if (g > SZ_16K || g < 256 || !is_power_of_2(g))
+		return -EINVAL;
+	*ig = ilog2(g) - 8;
+	return 0;
+}
+
+static inline int ways_to_cxl(unsigned int ways, u8 *iw)
+{
+	if (ways > 16)
+		return -EINVAL;
+	if (is_power_of_2(ways)) {
+		*iw = ilog2(ways);
+		return 0;
+	}
+	if (ways % 3)
+		return -EINVAL;
+	ways /= 3;
+	if (!is_power_of_2(ways))
+		return -EINVAL;
+	*iw = ilog2(ways) + 8;
+	return 0;
+}
+
 /* CXL 2.0 8.2.8.1 Device Capabilities Array Register */
 #define CXLDEV_CAP_ARRAY_OFFSET 0x0
 #define   CXLDEV_CAP_ARRAY_CAP_ID 0
@@ -193,31 +248,77 @@ enum cxl_decoder_type {
  */
 #define CXL_DECODER_MAX_INTERLEAVE 16
 
+#define CXL_DECODER_MIN_GRANULARITY 256
+
 /**
- * struct cxl_decoder - CXL address range decode configuration
+ * struct cxl_decoder - Common CXL HDM Decoder Attributes
  * @dev: this decoder's device
  * @id: kernel device name id
- * @platform_res: address space resources considered by root decoder
- * @decoder_range: address space resources considered by midlevel decoder
+ * @hpa_range: Host physical address range mapped by this decoder
  * @interleave_ways: number of cxl_dports in this decode
  * @interleave_granularity: data stride per dport
  * @target_type: accelerator vs expander (type2 vs type3) selector
+ * @region: currently assigned region for this decoder
  * @flags: memory type capabilities and locking
- * @target_lock: coordinate coherent reads of the target list
- * @nr_targets: number of elements in @target
- * @target: active ordered target list in current decoder configuration
- */
+ * @commit: device/decoder-type specific callback to commit settings to hw
+ * @reset: device/decoder-type specific callback to reset hw settings
+*/
 struct cxl_decoder {
 	struct device dev;
 	int id;
-	union {
-		struct resource platform_res;
-		struct range decoder_range;
-	};
+	struct range hpa_range;
 	int interleave_ways;
 	int interleave_granularity;
 	enum cxl_decoder_type target_type;
+	struct cxl_region *region;
 	unsigned long flags;
+	int (*commit)(struct cxl_decoder *cxld);
+	int (*reset)(struct cxl_decoder *cxld);
+};
+
+/*
+ * CXL_DECODER_DEAD prevents endpoints from being reattached to regions
+ * while cxld_unregister() is running
+ */
+enum cxl_decoder_mode {
+	CXL_DECODER_NONE,
+	CXL_DECODER_RAM,
+	CXL_DECODER_PMEM,
+	CXL_DECODER_MIXED,
+	CXL_DECODER_DEAD,
+};
+
+/**
+ * struct cxl_endpoint_decoder - Endpoint  / SPA to DPA decoder
+ * @cxld: base cxl_decoder_object
+ * @dpa_res: actively claimed DPA span of this decoder
+ * @skip: offset into @dpa_res where @cxld.hpa_range maps
+ * @mode: which memory type / access-mode-partition this decoder targets
+ * @pos: interleave position in @cxld.region
+ */
+struct cxl_endpoint_decoder {
+	struct cxl_decoder cxld;
+	struct resource *dpa_res;
+	resource_size_t skip;
+	enum cxl_decoder_mode mode;
+	int pos;
+};
+
+/**
+ * struct cxl_switch_decoder - Switch specific CXL HDM Decoder
+ * @cxld: base cxl_decoder object
+ * @target_lock: coordinate coherent reads of the target list
+ * @nr_targets: number of elements in @target
+ * @target: active ordered target list in current decoder configuration
+ *
+ * The 'switch' decoder type represents the decoder instances of cxl_port's that
+ * route from the root of a CXL memory decode topology to the endpoints. They
+ * come in two flavors, root-level decoders, statically defined by platform
+ * firmware, and mid-level decoders, where interleave-granularity,
+ * interleave-width, and the target list are mutable.
+ */
+struct cxl_switch_decoder {
+	struct cxl_decoder cxld;
 	seqlock_t target_lock;
 	int nr_targets;
 	struct cxl_dport *target[];
@@ -225,6 +326,76 @@ struct cxl_decoder {
 
 
 /**
+ * struct cxl_root_decoder - Static platform CXL address decoder
+ * @res: host / parent resource for region allocations
+ * @region_id: region id for next region provisioning event
+ * @calc_hb: which host bridge covers the n'th position by granularity
+ * @cxlsd: base cxl switch decoder
+ */
+struct cxl_root_decoder {
+	struct resource *res;
+	atomic_t region_id;
+	struct cxl_dport *(*calc_hb)(struct cxl_root_decoder *cxlrd, int pos);
+	struct cxl_switch_decoder cxlsd;
+};
+
+/*
+ * enum cxl_config_state - State machine for region configuration
+ * @CXL_CONFIG_IDLE: Any sysfs attribute can be written freely
+ * @CXL_CONFIG_INTERLEAVE_ACTIVE: region size has been set, no more
+ * changes to interleave_ways or interleave_granularity
+ * @CXL_CONFIG_ACTIVE: All targets have been added the region is now
+ * active
+ * @CXL_CONFIG_RESET_PENDING: see commit_store()
+ * @CXL_CONFIG_COMMIT: Soft-config has been committed to hardware
+ */
+enum cxl_config_state {
+	CXL_CONFIG_IDLE,
+	CXL_CONFIG_INTERLEAVE_ACTIVE,
+	CXL_CONFIG_ACTIVE,
+	CXL_CONFIG_RESET_PENDING,
+	CXL_CONFIG_COMMIT,
+};
+
+/**
+ * struct cxl_region_params - region settings
+ * @state: allow the driver to lockdown further parameter changes
+ * @uuid: unique id for persistent regions
+ * @interleave_ways: number of endpoints in the region
+ * @interleave_granularity: capacity each endpoint contributes to a stripe
+ * @res: allocated iomem capacity for this region
+ * @targets: active ordered targets in current decoder configuration
+ * @nr_targets: number of targets
+ *
+ * State transitions are protected by the cxl_region_rwsem
+ */
+struct cxl_region_params {
+	enum cxl_config_state state;
+	uuid_t uuid;
+	int interleave_ways;
+	int interleave_granularity;
+	struct resource *res;
+	struct cxl_endpoint_decoder *targets[CXL_DECODER_MAX_INTERLEAVE];
+	int nr_targets;
+};
+
+/**
+ * struct cxl_region - CXL region
+ * @dev: This region's device
+ * @id: This region's id. Id is globally unique across all regions
+ * @mode: Endpoint decoder allocation / access mode
+ * @type: Endpoint decoder target type
+ * @params: active + config params for the region
+ */
+struct cxl_region {
+	struct device dev;
+	int id;
+	enum cxl_decoder_mode mode;
+	enum cxl_decoder_type type;
+	struct cxl_region_params params;
+};
+
+/**
  * enum cxl_nvdimm_brige_state - state machine for managing bus rescans
  * @CXL_NVB_NEW: Set at bridge create and after cxl_pmem_wq is destroyed
  * @CXL_NVB_DEAD: Set at brige unregistration to preclude async probing
@@ -251,7 +422,26 @@ struct cxl_nvdimm_bridge {
 struct cxl_nvdimm {
 	struct device dev;
 	struct cxl_memdev *cxlmd;
-	struct nvdimm *nvdimm;
+	struct cxl_nvdimm_bridge *bridge;
+	struct cxl_pmem_region *region;
+};
+
+struct cxl_pmem_region_mapping {
+	struct cxl_memdev *cxlmd;
+	struct cxl_nvdimm *cxl_nvd;
+	u64 start;
+	u64 size;
+	int position;
+};
+
+struct cxl_pmem_region {
+	struct device dev;
+	struct cxl_region *cxlr;
+	struct nd_region *nd_region;
+	struct cxl_nvdimm_bridge *bridge;
+	struct range hpa_range;
+	int nr_mappings;
+	struct cxl_pmem_region_mapping mapping[];
 };
 
 /**
@@ -260,50 +450,94 @@ struct cxl_nvdimm {
  *		     decode hierarchy.
  * @dev: this port's device
  * @uport: PCI or platform device implementing the upstream port capability
+ * @host_bridge: Shortcut to the platform attach point for this port
  * @id: id for port device-name
  * @dports: cxl_dport instances referenced by decoders
  * @endpoints: cxl_ep instances, endpoints that are a descendant of this port
+ * @regions: cxl_region_ref instances, regions mapped by this port
+ * @parent_dport: dport that points to this port in the parent
  * @decoder_ida: allocator for decoder ids
+ * @hdm_end: track last allocated HDM decoder instance for allocation ordering
+ * @commit_end: cursor to track highest committed decoder for commit ordering
  * @component_reg_phys: component register capability base address (optional)
  * @dead: last ep has been removed, force port re-creation
  * @depth: How deep this port is relative to the root. depth 0 is the root.
+ * @cdat: Cached CDAT data
+ * @cdat_available: Should a CDAT attribute be available in sysfs
  */
 struct cxl_port {
 	struct device dev;
 	struct device *uport;
+	struct device *host_bridge;
 	int id;
-	struct list_head dports;
-	struct list_head endpoints;
+	struct xarray dports;
+	struct xarray endpoints;
+	struct xarray regions;
+	struct cxl_dport *parent_dport;
 	struct ida decoder_ida;
+	int hdm_end;
+	int commit_end;
 	resource_size_t component_reg_phys;
 	bool dead;
 	unsigned int depth;
+	struct cxl_cdat {
+		void *table;
+		size_t length;
+	} cdat;
+	bool cdat_available;
 };
 
+static inline struct cxl_dport *
+cxl_find_dport_by_dev(struct cxl_port *port, const struct device *dport_dev)
+{
+	return xa_load(&port->dports, (unsigned long)dport_dev);
+}
+
 /**
  * struct cxl_dport - CXL downstream port
  * @dport: PCI bridge or firmware device representing the downstream link
  * @port_id: unique hardware identifier for dport in decoder target list
  * @component_reg_phys: downstream port component registers
  * @port: reference to cxl_port that contains this downstream port
- * @list: node for a cxl_port's list of cxl_dport instances
  */
 struct cxl_dport {
 	struct device *dport;
 	int port_id;
 	resource_size_t component_reg_phys;
 	struct cxl_port *port;
-	struct list_head list;
 };
 
 /**
  * struct cxl_ep - track an endpoint's interest in a port
  * @ep: device that hosts a generic CXL endpoint (expander or accelerator)
- * @list: node on port->endpoints list
+ * @dport: which dport routes to this endpoint on @port
+ * @next: cxl switch port across the link attached to @dport NULL if
+ *	  attached to an endpoint
  */
 struct cxl_ep {
 	struct device *ep;
-	struct list_head list;
+	struct cxl_dport *dport;
+	struct cxl_port *next;
+};
+
+/**
+ * struct cxl_region_ref - track a region's interest in a port
+ * @port: point in topology to install this reference
+ * @decoder: decoder assigned for @region in @port
+ * @region: region for this reference
+ * @endpoints: cxl_ep references for region members beneath @port
+ * @nr_targets_set: track how many targets have been programmed during setup
+ * @nr_eps: number of endpoints beneath @port
+ * @nr_targets: number of distinct targets needed to reach @nr_eps
+ */
+struct cxl_region_ref {
+	struct cxl_port *port;
+	struct cxl_decoder *decoder;
+	struct cxl_region *region;
+	struct xarray endpoints;
+	int nr_targets_set;
+	int nr_eps;
+	int nr_targets;
 };
 
 /*
@@ -325,29 +559,31 @@ int devm_cxl_register_pci_bus(struct device *host, struct device *uport,
 struct pci_bus *cxl_port_to_pci_bus(struct cxl_port *port);
 struct cxl_port *devm_cxl_add_port(struct device *host, struct device *uport,
 				   resource_size_t component_reg_phys,
-				   struct cxl_port *parent_port);
+				   struct cxl_dport *parent_dport);
+int devm_cxl_add_endpoint(struct cxl_memdev *cxlmd,
+			  struct cxl_dport *parent_dport);
 struct cxl_port *find_cxl_root(struct device *dev);
 int devm_cxl_enumerate_ports(struct cxl_memdev *cxlmd);
 int cxl_bus_rescan(void);
-struct cxl_port *cxl_mem_find_port(struct cxl_memdev *cxlmd);
+struct cxl_port *cxl_mem_find_port(struct cxl_memdev *cxlmd,
+				   struct cxl_dport **dport);
 bool schedule_cxl_memdev_detach(struct cxl_memdev *cxlmd);
 
 struct cxl_dport *devm_cxl_add_dport(struct cxl_port *port,
 				     struct device *dport, int port_id,
 				     resource_size_t component_reg_phys);
-struct cxl_dport *cxl_find_dport_by_dev(struct cxl_port *port,
-					const struct device *dev);
 
 struct cxl_decoder *to_cxl_decoder(struct device *dev);
+struct cxl_root_decoder *to_cxl_root_decoder(struct device *dev);
+struct cxl_endpoint_decoder *to_cxl_endpoint_decoder(struct device *dev);
 bool is_root_decoder(struct device *dev);
 bool is_endpoint_decoder(struct device *dev);
-bool is_cxl_decoder(struct device *dev);
-struct cxl_decoder *cxl_root_decoder_alloc(struct cxl_port *port,
-					   unsigned int nr_targets);
-struct cxl_decoder *cxl_switch_decoder_alloc(struct cxl_port *port,
-					     unsigned int nr_targets);
+struct cxl_root_decoder *cxl_root_decoder_alloc(struct cxl_port *port,
+						unsigned int nr_targets);
+struct cxl_switch_decoder *cxl_switch_decoder_alloc(struct cxl_port *port,
+						    unsigned int nr_targets);
 int cxl_decoder_add(struct cxl_decoder *cxld, int *target_map);
-struct cxl_decoder *cxl_endpoint_decoder_alloc(struct cxl_port *port);
+struct cxl_endpoint_decoder *cxl_endpoint_decoder_alloc(struct cxl_port *port);
 int cxl_decoder_add_locked(struct cxl_decoder *cxld, int *target_map);
 int cxl_decoder_autoremove(struct device *host, struct cxl_decoder *cxld);
 int cxl_endpoint_autoremove(struct cxl_memdev *cxlmd, struct cxl_port *endpoint);
@@ -357,6 +593,8 @@ struct cxl_hdm *devm_cxl_setup_hdm(struct cxl_port *port);
 int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm);
 int devm_cxl_add_passthrough_decoder(struct cxl_port *port);
 
+bool is_cxl_region(struct device *dev);
+
 extern struct bus_type cxl_bus_type;
 
 struct cxl_driver {
@@ -385,6 +623,8 @@ void cxl_driver_unregister(struct cxl_driver *cxl_drv);
 #define CXL_DEVICE_PORT			3
 #define CXL_DEVICE_ROOT			4
 #define CXL_DEVICE_MEMORY_EXPANDER	5
+#define CXL_DEVICE_REGION		6
+#define CXL_DEVICE_PMEM_REGION		7
 
 #define MODULE_ALIAS_CXL(type) MODULE_ALIAS("cxl:t" __stringify(type) "*")
 #define CXL_MODALIAS_FMT "cxl:t%d"
@@ -396,7 +636,21 @@ struct cxl_nvdimm *to_cxl_nvdimm(struct device *dev);
 bool is_cxl_nvdimm(struct device *dev);
 bool is_cxl_nvdimm_bridge(struct device *dev);
 int devm_cxl_add_nvdimm(struct device *host, struct cxl_memdev *cxlmd);
-struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct cxl_nvdimm *cxl_nvd);
+struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct device *dev);
+
+#ifdef CONFIG_CXL_REGION
+bool is_cxl_pmem_region(struct device *dev);
+struct cxl_pmem_region *to_cxl_pmem_region(struct device *dev);
+#else
+static inline bool is_cxl_pmem_region(struct device *dev)
+{
+	return false;
+}
+static inline struct cxl_pmem_region *to_cxl_pmem_region(struct device *dev)
+{
+	return NULL;
+}
+#endif
 
 /*
  * Unit test builds overrides this to __weak, find the 'strong' version
diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
index 7df0b053373a..88e3a8e54b6a 100644
--- a/drivers/cxl/cxlmem.h
+++ b/drivers/cxl/cxlmem.h
@@ -50,6 +50,24 @@ static inline struct cxl_memdev *to_cxl_memdev(struct device *dev)
 	return container_of(dev, struct cxl_memdev, dev);
 }
 
+static inline struct cxl_port *cxled_to_port(struct cxl_endpoint_decoder *cxled)
+{
+	return to_cxl_port(cxled->cxld.dev.parent);
+}
+
+static inline struct cxl_port *cxlrd_to_port(struct cxl_root_decoder *cxlrd)
+{
+	return to_cxl_port(cxlrd->cxlsd.cxld.dev.parent);
+}
+
+static inline struct cxl_memdev *
+cxled_to_memdev(struct cxl_endpoint_decoder *cxled)
+{
+	struct cxl_port *port = to_cxl_port(cxled->cxld.dev.parent);
+
+	return to_cxl_memdev(port->uport);
+}
+
 bool is_cxl_memdev(struct device *dev);
 static inline bool is_cxl_endpoint(struct cxl_port *port)
 {
@@ -178,8 +196,9 @@ struct cxl_endpoint_dvsec_info {
  * @firmware_version: Firmware version for the memory device.
  * @enabled_cmds: Hardware commands found enabled in CEL.
  * @exclusive_cmds: Commands that are kernel-internal only
- * @pmem_range: Active Persistent memory capacity configuration
- * @ram_range: Active Volatile memory capacity configuration
+ * @dpa_res: Overall DPA resource tree for the device
+ * @pmem_res: Active Persistent memory capacity configuration
+ * @ram_res: Active Volatile memory capacity configuration
  * @total_bytes: sum of all possible capacities
  * @volatile_only_bytes: hard volatile capacity
  * @persistent_only_bytes: hard persistent capacity
@@ -191,6 +210,7 @@ struct cxl_endpoint_dvsec_info {
  * @component_reg_phys: register base of component registers
  * @info: Cached DVSEC information about the device.
  * @serial: PCIe Device Serial Number
+ * @doe_mbs: PCI DOE mailbox array
  * @mbox_send: @dev specific transport for transmitting mailbox commands
  *
  * See section 8.2.9.5.2 Capacity Configuration and Label Storage for
@@ -209,8 +229,9 @@ struct cxl_dev_state {
 	DECLARE_BITMAP(enabled_cmds, CXL_MEM_COMMAND_ID_MAX);
 	DECLARE_BITMAP(exclusive_cmds, CXL_MEM_COMMAND_ID_MAX);
 
-	struct range pmem_range;
-	struct range ram_range;
+	struct resource dpa_res;
+	struct resource pmem_res;
+	struct resource ram_res;
 	u64 total_bytes;
 	u64 volatile_only_bytes;
 	u64 persistent_only_bytes;
@@ -224,6 +245,8 @@ struct cxl_dev_state {
 	resource_size_t component_reg_phys;
 	u64 serial;
 
+	struct xarray doe_mbs;
+
 	int (*mbox_send)(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd *cmd);
 };
 
@@ -299,6 +322,13 @@ struct cxl_mbox_identify {
 	u8 qos_telemetry_caps;
 } __packed;
 
+struct cxl_mbox_get_partition_info {
+	__le64 active_volatile_cap;
+	__le64 active_persistent_cap;
+	__le64 next_volatile_cap;
+	__le64 next_persistent_cap;
+} __packed;
+
 struct cxl_mbox_get_lsa {
 	__le32 offset;
 	__le32 length;
@@ -370,4 +400,8 @@ struct cxl_hdm {
 	unsigned int interleave_mask;
 	struct cxl_port *port;
 };
+
+struct seq_file;
+struct dentry *cxl_debugfs_create_dir(const char *dir);
+void cxl_dpa_debug(struct seq_file *file, struct cxl_dev_state *cxlds);
 #endif /* __CXL_MEM_H__ */
diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
index fce1c11729c2..eec597dbe763 100644
--- a/drivers/cxl/cxlpci.h
+++ b/drivers/cxl/cxlpci.h
@@ -74,4 +74,5 @@ static inline resource_size_t cxl_regmap_to_base(struct pci_dev *pdev,
 int devm_cxl_port_enumerate_dports(struct cxl_port *port);
 struct cxl_dev_state;
 int cxl_hdm_decode_init(struct cxl_dev_state *cxlds, struct cxl_hdm *cxlhdm);
+void read_cdat_data(struct cxl_port *port);
 #endif /* __CXL_PCI_H__ */
diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c
index a979d0b484d5..64ccf053d32c 100644
--- a/drivers/cxl/mem.c
+++ b/drivers/cxl/mem.c
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /* Copyright(c) 2022 Intel Corporation. All rights reserved. */
+#include <linux/debugfs.h>
 #include <linux/device.h>
 #include <linux/module.h>
 #include <linux/pci.h>
@@ -24,42 +25,32 @@
  * in higher level operations.
  */
 
-static int create_endpoint(struct cxl_memdev *cxlmd,
-			   struct cxl_port *parent_port)
+static void enable_suspend(void *data)
 {
-	struct cxl_dev_state *cxlds = cxlmd->cxlds;
-	struct cxl_port *endpoint;
-	int rc;
+	cxl_mem_active_dec();
+}
 
-	endpoint = devm_cxl_add_port(&parent_port->dev, &cxlmd->dev,
-				     cxlds->component_reg_phys, parent_port);
-	if (IS_ERR(endpoint))
-		return PTR_ERR(endpoint);
+static void remove_debugfs(void *dentry)
+{
+	debugfs_remove_recursive(dentry);
+}
 
-	dev_dbg(&cxlmd->dev, "add: %s\n", dev_name(&endpoint->dev));
+static int cxl_mem_dpa_show(struct seq_file *file, void *data)
+{
+	struct device *dev = file->private;
+	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
 
-	rc = cxl_endpoint_autoremove(cxlmd, endpoint);
-	if (rc)
-		return rc;
-
-	if (!endpoint->dev.driver) {
-		dev_err(&cxlmd->dev, "%s failed probe\n",
-			dev_name(&endpoint->dev));
-		return -ENXIO;
-	}
+	cxl_dpa_debug(file, cxlmd->cxlds);
 
 	return 0;
 }
 
-static void enable_suspend(void *data)
-{
-	cxl_mem_active_dec();
-}
-
 static int cxl_mem_probe(struct device *dev)
 {
 	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
 	struct cxl_port *parent_port;
+	struct cxl_dport *dport;
+	struct dentry *dentry;
 	int rc;
 
 	/*
@@ -73,11 +64,17 @@ static int cxl_mem_probe(struct device *dev)
 	if (work_pending(&cxlmd->detach_work))
 		return -EBUSY;
 
+	dentry = cxl_debugfs_create_dir(dev_name(dev));
+	debugfs_create_devm_seqfile(dev, "dpamem", dentry, cxl_mem_dpa_show);
+	rc = devm_add_action_or_reset(dev, remove_debugfs, dentry);
+	if (rc)
+		return rc;
+
 	rc = devm_cxl_enumerate_ports(cxlmd);
 	if (rc)
 		return rc;
 
-	parent_port = cxl_mem_find_port(cxlmd);
+	parent_port = cxl_mem_find_port(cxlmd, &dport);
 	if (!parent_port) {
 		dev_err(dev, "CXL port topology not found\n");
 		return -ENXIO;
@@ -91,7 +88,7 @@ static int cxl_mem_probe(struct device *dev)
 		goto unlock;
 	}
 
-	rc = create_endpoint(cxlmd, parent_port);
+	rc = devm_cxl_add_endpoint(cxlmd, dport);
 unlock:
 	device_unlock(&parent_port->dev);
 	put_device(&parent_port->dev);
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index 5a0ae46d4989..faeb5d9d7a7a 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -8,6 +8,7 @@
 #include <linux/mutex.h>
 #include <linux/list.h>
 #include <linux/pci.h>
+#include <linux/pci-doe.h>
 #include <linux/io.h>
 #include "cxlmem.h"
 #include "cxlpci.h"
@@ -386,6 +387,47 @@ static int cxl_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type,
 	return rc;
 }
 
+static void cxl_pci_destroy_doe(void *mbs)
+{
+	xa_destroy(mbs);
+}
+
+static void devm_cxl_pci_create_doe(struct cxl_dev_state *cxlds)
+{
+	struct device *dev = cxlds->dev;
+	struct pci_dev *pdev = to_pci_dev(dev);
+	u16 off = 0;
+
+	xa_init(&cxlds->doe_mbs);
+	if (devm_add_action(&pdev->dev, cxl_pci_destroy_doe, &cxlds->doe_mbs)) {
+		dev_err(dev, "Failed to create XArray for DOE's\n");
+		return;
+	}
+
+	/*
+	 * Mailbox creation is best effort.  Higher layers must determine if
+	 * the lack of a mailbox for their protocol is a device failure or not.
+	 */
+	pci_doe_for_each_off(pdev, off) {
+		struct pci_doe_mb *doe_mb;
+
+		doe_mb = pcim_doe_create_mb(pdev, off);
+		if (IS_ERR(doe_mb)) {
+			dev_err(dev, "Failed to create MB object for MB @ %x\n",
+				off);
+			continue;
+		}
+
+		if (xa_insert(&cxlds->doe_mbs, off, doe_mb, GFP_KERNEL)) {
+			dev_err(dev, "xa_insert failed to insert MB @ %x\n",
+				off);
+			continue;
+		}
+
+		dev_dbg(dev, "Created DOE mailbox @%x\n", off);
+	}
+}
+
 static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
 	struct cxl_register_map map;
@@ -434,6 +476,8 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 
 	cxlds->component_reg_phys = cxl_regmap_to_base(pdev, &map);
 
+	devm_cxl_pci_create_doe(cxlds);
+
 	rc = cxl_pci_setup_mailbox(cxlds);
 	if (rc)
 		return rc;
@@ -454,7 +498,7 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (IS_ERR(cxlmd))
 		return PTR_ERR(cxlmd);
 
-	if (range_len(&cxlds->pmem_range) && IS_ENABLED(CONFIG_CXL_PMEM))
+	if (resource_size(&cxlds->pmem_res) && IS_ENABLED(CONFIG_CXL_PMEM))
 		rc = devm_cxl_add_nvdimm(&pdev->dev, cxlmd);
 
 	return rc;
diff --git a/drivers/cxl/pmem.c b/drivers/cxl/pmem.c
index 0aaa70b4e0f7..7dc0a2fa1a6b 100644
--- a/drivers/cxl/pmem.c
+++ b/drivers/cxl/pmem.c
@@ -7,6 +7,7 @@
 #include <linux/ndctl.h>
 #include <linux/async.h>
 #include <linux/slab.h>
+#include <linux/nd.h>
 #include "cxlmem.h"
 #include "cxl.h"
 
@@ -26,7 +27,23 @@ static void clear_exclusive(void *cxlds)
 
 static void unregister_nvdimm(void *nvdimm)
 {
+	struct cxl_nvdimm *cxl_nvd = nvdimm_provider_data(nvdimm);
+	struct cxl_nvdimm_bridge *cxl_nvb = cxl_nvd->bridge;
+	struct cxl_pmem_region *cxlr_pmem;
+
+	device_lock(&cxl_nvb->dev);
+	cxlr_pmem = cxl_nvd->region;
+	dev_set_drvdata(&cxl_nvd->dev, NULL);
+	cxl_nvd->region = NULL;
+	device_unlock(&cxl_nvb->dev);
+
+	if (cxlr_pmem) {
+		device_release_driver(&cxlr_pmem->dev);
+		put_device(&cxlr_pmem->dev);
+	}
+
 	nvdimm_delete(nvdimm);
+	cxl_nvd->bridge = NULL;
 }
 
 static int cxl_nvdimm_probe(struct device *dev)
@@ -39,7 +56,7 @@ static int cxl_nvdimm_probe(struct device *dev)
 	struct nvdimm *nvdimm;
 	int rc;
 
-	cxl_nvb = cxl_find_nvdimm_bridge(cxl_nvd);
+	cxl_nvb = cxl_find_nvdimm_bridge(dev);
 	if (!cxl_nvb)
 		return -ENXIO;
 
@@ -66,6 +83,7 @@ static int cxl_nvdimm_probe(struct device *dev)
 	}
 
 	dev_set_drvdata(dev, nvdimm);
+	cxl_nvd->bridge = cxl_nvb;
 	rc = devm_add_action_or_reset(dev, unregister_nvdimm, nvdimm);
 out:
 	device_unlock(&cxl_nvb->dev);
@@ -204,15 +222,38 @@ static bool online_nvdimm_bus(struct cxl_nvdimm_bridge *cxl_nvb)
 	return cxl_nvb->nvdimm_bus != NULL;
 }
 
-static int cxl_nvdimm_release_driver(struct device *dev, void *data)
+static int cxl_nvdimm_release_driver(struct device *dev, void *cxl_nvb)
 {
+	struct cxl_nvdimm *cxl_nvd;
+
 	if (!is_cxl_nvdimm(dev))
 		return 0;
+
+	cxl_nvd = to_cxl_nvdimm(dev);
+	if (cxl_nvd->bridge != cxl_nvb)
+		return 0;
+
 	device_release_driver(dev);
 	return 0;
 }
 
-static void offline_nvdimm_bus(struct nvdimm_bus *nvdimm_bus)
+static int cxl_pmem_region_release_driver(struct device *dev, void *cxl_nvb)
+{
+	struct cxl_pmem_region *cxlr_pmem;
+
+	if (!is_cxl_pmem_region(dev))
+		return 0;
+
+	cxlr_pmem = to_cxl_pmem_region(dev);
+	if (cxlr_pmem->bridge != cxl_nvb)
+		return 0;
+
+	device_release_driver(dev);
+	return 0;
+}
+
+static void offline_nvdimm_bus(struct cxl_nvdimm_bridge *cxl_nvb,
+			       struct nvdimm_bus *nvdimm_bus)
 {
 	if (!nvdimm_bus)
 		return;
@@ -222,7 +263,10 @@ static void offline_nvdimm_bus(struct nvdimm_bus *nvdimm_bus)
 	 * nvdimm_bus_unregister() rips the nvdimm objects out from
 	 * underneath them.
 	 */
-	bus_for_each_dev(&cxl_bus_type, NULL, NULL, cxl_nvdimm_release_driver);
+	bus_for_each_dev(&cxl_bus_type, NULL, cxl_nvb,
+			 cxl_pmem_region_release_driver);
+	bus_for_each_dev(&cxl_bus_type, NULL, cxl_nvb,
+			 cxl_nvdimm_release_driver);
 	nvdimm_bus_unregister(nvdimm_bus);
 }
 
@@ -260,7 +304,7 @@ static void cxl_nvb_update_state(struct work_struct *work)
 
 		dev_dbg(&cxl_nvb->dev, "rescan: %d\n", rc);
 	}
-	offline_nvdimm_bus(victim_bus);
+	offline_nvdimm_bus(cxl_nvb, victim_bus);
 
 	put_device(&cxl_nvb->dev);
 }
@@ -315,6 +359,203 @@ static struct cxl_driver cxl_nvdimm_bridge_driver = {
 	.id = CXL_DEVICE_NVDIMM_BRIDGE,
 };
 
+static int match_cxl_nvdimm(struct device *dev, void *data)
+{
+	return is_cxl_nvdimm(dev);
+}
+
+static void unregister_nvdimm_region(void *nd_region)
+{
+	struct cxl_nvdimm_bridge *cxl_nvb;
+	struct cxl_pmem_region *cxlr_pmem;
+	int i;
+
+	cxlr_pmem = nd_region_provider_data(nd_region);
+	cxl_nvb = cxlr_pmem->bridge;
+	device_lock(&cxl_nvb->dev);
+	for (i = 0; i < cxlr_pmem->nr_mappings; i++) {
+		struct cxl_pmem_region_mapping *m = &cxlr_pmem->mapping[i];
+		struct cxl_nvdimm *cxl_nvd = m->cxl_nvd;
+
+		if (cxl_nvd->region) {
+			put_device(&cxlr_pmem->dev);
+			cxl_nvd->region = NULL;
+		}
+	}
+	device_unlock(&cxl_nvb->dev);
+
+	nvdimm_region_delete(nd_region);
+}
+
+static void cxlr_pmem_remove_resource(void *res)
+{
+	remove_resource(res);
+}
+
+struct cxl_pmem_region_info {
+	u64 offset;
+	u64 serial;
+};
+
+static int cxl_pmem_region_probe(struct device *dev)
+{
+	struct nd_mapping_desc mappings[CXL_DECODER_MAX_INTERLEAVE];
+	struct cxl_pmem_region *cxlr_pmem = to_cxl_pmem_region(dev);
+	struct cxl_region *cxlr = cxlr_pmem->cxlr;
+	struct cxl_pmem_region_info *info = NULL;
+	struct cxl_nvdimm_bridge *cxl_nvb;
+	struct nd_interleave_set *nd_set;
+	struct nd_region_desc ndr_desc;
+	struct cxl_nvdimm *cxl_nvd;
+	struct nvdimm *nvdimm;
+	struct resource *res;
+	int rc, i = 0;
+
+	cxl_nvb = cxl_find_nvdimm_bridge(&cxlr_pmem->mapping[0].cxlmd->dev);
+	if (!cxl_nvb) {
+		dev_dbg(dev, "bridge not found\n");
+		return -ENXIO;
+	}
+	cxlr_pmem->bridge = cxl_nvb;
+
+	device_lock(&cxl_nvb->dev);
+	if (!cxl_nvb->nvdimm_bus) {
+		dev_dbg(dev, "nvdimm bus not found\n");
+		rc = -ENXIO;
+		goto err;
+	}
+
+	memset(&mappings, 0, sizeof(mappings));
+	memset(&ndr_desc, 0, sizeof(ndr_desc));
+
+	res = devm_kzalloc(dev, sizeof(*res), GFP_KERNEL);
+	if (!res) {
+		rc = -ENOMEM;
+		goto err;
+	}
+
+	res->name = "Persistent Memory";
+	res->start = cxlr_pmem->hpa_range.start;
+	res->end = cxlr_pmem->hpa_range.end;
+	res->flags = IORESOURCE_MEM;
+	res->desc = IORES_DESC_PERSISTENT_MEMORY;
+
+	rc = insert_resource(&iomem_resource, res);
+	if (rc)
+		goto err;
+
+	rc = devm_add_action_or_reset(dev, cxlr_pmem_remove_resource, res);
+	if (rc)
+		goto err;
+
+	ndr_desc.res = res;
+	ndr_desc.provider_data = cxlr_pmem;
+
+	ndr_desc.numa_node = memory_add_physaddr_to_nid(res->start);
+	ndr_desc.target_node = phys_to_target_node(res->start);
+	if (ndr_desc.target_node == NUMA_NO_NODE) {
+		ndr_desc.target_node = ndr_desc.numa_node;
+		dev_dbg(&cxlr->dev, "changing target node from %d to %d",
+			NUMA_NO_NODE, ndr_desc.target_node);
+	}
+
+	nd_set = devm_kzalloc(dev, sizeof(*nd_set), GFP_KERNEL);
+	if (!nd_set) {
+		rc = -ENOMEM;
+		goto err;
+	}
+
+	ndr_desc.memregion = cxlr->id;
+	set_bit(ND_REGION_CXL, &ndr_desc.flags);
+	set_bit(ND_REGION_PERSIST_MEMCTRL, &ndr_desc.flags);
+
+	info = kmalloc_array(cxlr_pmem->nr_mappings, sizeof(*info), GFP_KERNEL);
+	if (!info) {
+		rc = -ENOMEM;
+		goto err;
+	}
+
+	for (i = 0; i < cxlr_pmem->nr_mappings; i++) {
+		struct cxl_pmem_region_mapping *m = &cxlr_pmem->mapping[i];
+		struct cxl_memdev *cxlmd = m->cxlmd;
+		struct cxl_dev_state *cxlds = cxlmd->cxlds;
+		struct device *d;
+
+		d = device_find_child(&cxlmd->dev, NULL, match_cxl_nvdimm);
+		if (!d) {
+			dev_dbg(dev, "[%d]: %s: no cxl_nvdimm found\n", i,
+				dev_name(&cxlmd->dev));
+			rc = -ENODEV;
+			goto err;
+		}
+
+		/* safe to drop ref now with bridge lock held */
+		put_device(d);
+
+		cxl_nvd = to_cxl_nvdimm(d);
+		nvdimm = dev_get_drvdata(&cxl_nvd->dev);
+		if (!nvdimm) {
+			dev_dbg(dev, "[%d]: %s: no nvdimm found\n", i,
+				dev_name(&cxlmd->dev));
+			rc = -ENODEV;
+			goto err;
+		}
+		cxl_nvd->region = cxlr_pmem;
+		get_device(&cxlr_pmem->dev);
+		m->cxl_nvd = cxl_nvd;
+		mappings[i] = (struct nd_mapping_desc) {
+			.nvdimm = nvdimm,
+			.start = m->start,
+			.size = m->size,
+			.position = i,
+		};
+		info[i].offset = m->start;
+		info[i].serial = cxlds->serial;
+	}
+	ndr_desc.num_mappings = cxlr_pmem->nr_mappings;
+	ndr_desc.mapping = mappings;
+
+	/*
+	 * TODO enable CXL labels which skip the need for 'interleave-set cookie'
+	 */
+	nd_set->cookie1 =
+		nd_fletcher64(info, sizeof(*info) * cxlr_pmem->nr_mappings, 0);
+	nd_set->cookie2 = nd_set->cookie1;
+	ndr_desc.nd_set = nd_set;
+
+	cxlr_pmem->nd_region =
+		nvdimm_pmem_region_create(cxl_nvb->nvdimm_bus, &ndr_desc);
+	if (!cxlr_pmem->nd_region) {
+		rc = -ENOMEM;
+		goto err;
+	}
+
+	rc = devm_add_action_or_reset(dev, unregister_nvdimm_region,
+				      cxlr_pmem->nd_region);
+out:
+	kfree(info);
+	device_unlock(&cxl_nvb->dev);
+	put_device(&cxl_nvb->dev);
+
+	return rc;
+
+err:
+	dev_dbg(dev, "failed to create nvdimm region\n");
+	for (i--; i >= 0; i--) {
+		nvdimm = mappings[i].nvdimm;
+		cxl_nvd = nvdimm_provider_data(nvdimm);
+		put_device(&cxl_nvd->region->dev);
+		cxl_nvd->region = NULL;
+	}
+	goto out;
+}
+
+static struct cxl_driver cxl_pmem_region_driver = {
+	.name = "cxl_pmem_region",
+	.probe = cxl_pmem_region_probe,
+	.id = CXL_DEVICE_PMEM_REGION,
+};
+
 /*
  * Return all bridges to the CXL_NVB_NEW state to invalidate any
  * ->state_work referring to the now destroyed cxl_pmem_wq.
@@ -359,8 +600,14 @@ static __init int cxl_pmem_init(void)
 	if (rc)
 		goto err_nvdimm;
 
+	rc = cxl_driver_register(&cxl_pmem_region_driver);
+	if (rc)
+		goto err_region;
+
 	return 0;
 
+err_region:
+	cxl_driver_unregister(&cxl_nvdimm_driver);
 err_nvdimm:
 	cxl_driver_unregister(&cxl_nvdimm_bridge_driver);
 err_bridge:
@@ -370,6 +617,7 @@ err_bridge:
 
 static __exit void cxl_pmem_exit(void)
 {
+	cxl_driver_unregister(&cxl_pmem_region_driver);
 	cxl_driver_unregister(&cxl_nvdimm_driver);
 	cxl_driver_unregister(&cxl_nvdimm_bridge_driver);
 	destroy_cxl_pmem_wq();
@@ -381,3 +629,4 @@ module_exit(cxl_pmem_exit);
 MODULE_IMPORT_NS(CXL);
 MODULE_ALIAS_CXL(CXL_DEVICE_NVDIMM_BRIDGE);
 MODULE_ALIAS_CXL(CXL_DEVICE_NVDIMM);
+MODULE_ALIAS_CXL(CXL_DEVICE_PMEM_REGION);
diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c
index 3cf308f114c4..5453771bf330 100644
--- a/drivers/cxl/port.c
+++ b/drivers/cxl/port.c
@@ -53,6 +53,9 @@ static int cxl_port_probe(struct device *dev)
 		struct cxl_memdev *cxlmd = to_cxl_memdev(port->uport);
 		struct cxl_dev_state *cxlds = cxlmd->cxlds;
 
+		/* Cache the data early to ensure is_visible() works */
+		read_cdat_data(port);
+
 		get_device(&cxlmd->dev);
 		rc = devm_add_action_or_reset(dev, schedule_detach, cxlmd);
 		if (rc)
@@ -78,10 +81,60 @@ static int cxl_port_probe(struct device *dev)
 	return 0;
 }
 
+static ssize_t CDAT_read(struct file *filp, struct kobject *kobj,
+			 struct bin_attribute *bin_attr, char *buf,
+			 loff_t offset, size_t count)
+{
+	struct device *dev = kobj_to_dev(kobj);
+	struct cxl_port *port = to_cxl_port(dev);
+
+	if (!port->cdat_available)
+		return -ENXIO;
+
+	if (!port->cdat.table)
+		return 0;
+
+	return memory_read_from_buffer(buf, count, &offset,
+				       port->cdat.table,
+				       port->cdat.length);
+}
+
+static BIN_ATTR_ADMIN_RO(CDAT, 0);
+
+static umode_t cxl_port_bin_attr_is_visible(struct kobject *kobj,
+					    struct bin_attribute *attr, int i)
+{
+	struct device *dev = kobj_to_dev(kobj);
+	struct cxl_port *port = to_cxl_port(dev);
+
+	if ((attr == &bin_attr_CDAT) && port->cdat_available)
+		return attr->attr.mode;
+
+	return 0;
+}
+
+static struct bin_attribute *cxl_cdat_bin_attributes[] = {
+	&bin_attr_CDAT,
+	NULL,
+};
+
+static struct attribute_group cxl_cdat_attribute_group = {
+	.bin_attrs = cxl_cdat_bin_attributes,
+	.is_bin_visible = cxl_port_bin_attr_is_visible,
+};
+
+static const struct attribute_group *cxl_port_attribute_groups[] = {
+	&cxl_cdat_attribute_group,
+	NULL,
+};
+
 static struct cxl_driver cxl_port_driver = {
 	.name = "cxl_port",
 	.probe = cxl_port_probe,
 	.id = CXL_DEVICE_PORT,
+	.drv = {
+		.dev_groups = cxl_port_attribute_groups,
+	},
 };
 
 module_cxl_driver(cxl_port_driver);
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index d976260eca7a..473a71bbd9c9 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -133,7 +133,8 @@ static void nd_region_release(struct device *dev)
 		put_device(&nvdimm->dev);
 	}
 	free_percpu(nd_region->lane);
-	memregion_free(nd_region->id);
+	if (!test_bit(ND_REGION_CXL, &nd_region->flags))
+		memregion_free(nd_region->id);
 	kfree(nd_region);
 }
 
@@ -982,9 +983,14 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
 
 	if (!nd_region)
 		return NULL;
-	nd_region->id = memregion_alloc(GFP_KERNEL);
-	if (nd_region->id < 0)
-		goto err_id;
+	/* CXL pre-assigns memregion ids before creating nvdimm regions */
+	if (test_bit(ND_REGION_CXL, &ndr_desc->flags)) {
+		nd_region->id = ndr_desc->memregion;
+	} else {
+		nd_region->id = memregion_alloc(GFP_KERNEL);
+		if (nd_region->id < 0)
+			goto err_id;
+	}
 
 	nd_region->lane = alloc_percpu(struct nd_percpu_lane);
 	if (!nd_region->lane)
@@ -1043,9 +1049,10 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
 
 	return nd_region;
 
- err_percpu:
-	memregion_free(nd_region->id);
- err_id:
+err_percpu:
+	if (!test_bit(ND_REGION_CXL, &ndr_desc->flags))
+		memregion_free(nd_region->id);
+err_id:
 	kfree(nd_region);
 	return NULL;
 }
@@ -1068,6 +1075,13 @@ struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus,
 }
 EXPORT_SYMBOL_GPL(nvdimm_volatile_region_create);
 
+void nvdimm_region_delete(struct nd_region *nd_region)
+{
+	if (nd_region)
+		nd_device_unregister(&nd_region->dev, ND_SYNC);
+}
+EXPORT_SYMBOL_GPL(nvdimm_region_delete);
+
 int nvdimm_flush(struct nd_region *nd_region, struct bio *bio)
 {
 	int rc = 0;
diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
index 5cc7cba1941f..55c028af4bd9 100644
--- a/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@ -121,6 +121,9 @@ config XEN_PCIDEV_FRONTEND
 config PCI_ATS
 	bool
 
+config PCI_DOE
+	bool
+
 config PCI_ECAM
 	bool
 
diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
index 0da6b1ebc694..2680e4c92f0a 100644
--- a/drivers/pci/Makefile
+++ b/drivers/pci/Makefile
@@ -31,6 +31,7 @@ obj-$(CONFIG_PCI_ECAM)		+= ecam.o
 obj-$(CONFIG_PCI_P2PDMA)	+= p2pdma.o
 obj-$(CONFIG_XEN_PCIDEV_FRONTEND) += xen-pcifront.o
 obj-$(CONFIG_VGA_ARB)		+= vgaarb.o
+obj-$(CONFIG_PCI_DOE)		+= doe.o
 
 # Endpoint library must be initialized before its users
 obj-$(CONFIG_PCI_ENDPOINT)	+= endpoint/
diff --git a/drivers/pci/doe.c b/drivers/pci/doe.c
new file mode 100644
index 000000000000..e402f05068a5
--- /dev/null
+++ b/drivers/pci/doe.c
@@ -0,0 +1,536 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Data Object Exchange
+ *	PCIe r6.0, sec 6.30 DOE
+ *
+ * Copyright (C) 2021 Huawei
+ *	Jonathan Cameron <Jonathan.Cameron@huawei.com>
+ *
+ * Copyright (C) 2022 Intel Corporation
+ *	Ira Weiny <ira.weiny@intel.com>
+ */
+
+#define dev_fmt(fmt) "DOE: " fmt
+
+#include <linux/bitfield.h>
+#include <linux/delay.h>
+#include <linux/jiffies.h>
+#include <linux/mutex.h>
+#include <linux/pci.h>
+#include <linux/pci-doe.h>
+#include <linux/workqueue.h>
+
+#define PCI_DOE_PROTOCOL_DISCOVERY 0
+
+/* Timeout of 1 second from 6.30.2 Operation, PCI Spec r6.0 */
+#define PCI_DOE_TIMEOUT HZ
+#define PCI_DOE_POLL_INTERVAL	(PCI_DOE_TIMEOUT / 128)
+
+#define PCI_DOE_FLAG_CANCEL	0
+#define PCI_DOE_FLAG_DEAD	1
+
+/**
+ * struct pci_doe_mb - State for a single DOE mailbox
+ *
+ * This state is used to manage a single DOE mailbox capability.  All fields
+ * should be considered opaque to the consumers and the structure passed into
+ * the helpers below after being created by devm_pci_doe_create()
+ *
+ * @pdev: PCI device this mailbox belongs to
+ * @cap_offset: Capability offset
+ * @prots: Array of protocols supported (encoded as long values)
+ * @wq: Wait queue for work item
+ * @work_queue: Queue of pci_doe_work items
+ * @flags: Bit array of PCI_DOE_FLAG_* flags
+ */
+struct pci_doe_mb {
+	struct pci_dev *pdev;
+	u16 cap_offset;
+	struct xarray prots;
+
+	wait_queue_head_t wq;
+	struct workqueue_struct *work_queue;
+	unsigned long flags;
+};
+
+static int pci_doe_wait(struct pci_doe_mb *doe_mb, unsigned long timeout)
+{
+	if (wait_event_timeout(doe_mb->wq,
+			       test_bit(PCI_DOE_FLAG_CANCEL, &doe_mb->flags),
+			       timeout))
+		return -EIO;
+	return 0;
+}
+
+static void pci_doe_write_ctrl(struct pci_doe_mb *doe_mb, u32 val)
+{
+	struct pci_dev *pdev = doe_mb->pdev;
+	int offset = doe_mb->cap_offset;
+
+	pci_write_config_dword(pdev, offset + PCI_DOE_CTRL, val);
+}
+
+static int pci_doe_abort(struct pci_doe_mb *doe_mb)
+{
+	struct pci_dev *pdev = doe_mb->pdev;
+	int offset = doe_mb->cap_offset;
+	unsigned long timeout_jiffies;
+
+	pci_dbg(pdev, "[%x] Issuing Abort\n", offset);
+
+	timeout_jiffies = jiffies + PCI_DOE_TIMEOUT;
+	pci_doe_write_ctrl(doe_mb, PCI_DOE_CTRL_ABORT);
+
+	do {
+		int rc;
+		u32 val;
+
+		rc = pci_doe_wait(doe_mb, PCI_DOE_POLL_INTERVAL);
+		if (rc)
+			return rc;
+		pci_read_config_dword(pdev, offset + PCI_DOE_STATUS, &val);
+
+		/* Abort success! */
+		if (!FIELD_GET(PCI_DOE_STATUS_ERROR, val) &&
+		    !FIELD_GET(PCI_DOE_STATUS_BUSY, val))
+			return 0;
+
+	} while (!time_after(jiffies, timeout_jiffies));
+
+	/* Abort has timed out and the MB is dead */
+	pci_err(pdev, "[%x] ABORT timed out\n", offset);
+	return -EIO;
+}
+
+static int pci_doe_send_req(struct pci_doe_mb *doe_mb,
+			    struct pci_doe_task *task)
+{
+	struct pci_dev *pdev = doe_mb->pdev;
+	int offset = doe_mb->cap_offset;
+	u32 val;
+	int i;
+
+	/*
+	 * Check the DOE busy bit is not set. If it is set, this could indicate
+	 * someone other than Linux (e.g. firmware) is using the mailbox. Note
+	 * it is expected that firmware and OS will negotiate access rights via
+	 * an, as yet to be defined, method.
+	 */
+	pci_read_config_dword(pdev, offset + PCI_DOE_STATUS, &val);
+	if (FIELD_GET(PCI_DOE_STATUS_BUSY, val))
+		return -EBUSY;
+
+	if (FIELD_GET(PCI_DOE_STATUS_ERROR, val))
+		return -EIO;
+
+	/* Write DOE Header */
+	val = FIELD_PREP(PCI_DOE_DATA_OBJECT_HEADER_1_VID, task->prot.vid) |
+		FIELD_PREP(PCI_DOE_DATA_OBJECT_HEADER_1_TYPE, task->prot.type);
+	pci_write_config_dword(pdev, offset + PCI_DOE_WRITE, val);
+	/* Length is 2 DW of header + length of payload in DW */
+	pci_write_config_dword(pdev, offset + PCI_DOE_WRITE,
+			       FIELD_PREP(PCI_DOE_DATA_OBJECT_HEADER_2_LENGTH,
+					  2 + task->request_pl_sz /
+						sizeof(u32)));
+	for (i = 0; i < task->request_pl_sz / sizeof(u32); i++)
+		pci_write_config_dword(pdev, offset + PCI_DOE_WRITE,
+				       task->request_pl[i]);
+
+	pci_doe_write_ctrl(doe_mb, PCI_DOE_CTRL_GO);
+
+	return 0;
+}
+
+static bool pci_doe_data_obj_ready(struct pci_doe_mb *doe_mb)
+{
+	struct pci_dev *pdev = doe_mb->pdev;
+	int offset = doe_mb->cap_offset;
+	u32 val;
+
+	pci_read_config_dword(pdev, offset + PCI_DOE_STATUS, &val);
+	if (FIELD_GET(PCI_DOE_STATUS_DATA_OBJECT_READY, val))
+		return true;
+	return false;
+}
+
+static int pci_doe_recv_resp(struct pci_doe_mb *doe_mb, struct pci_doe_task *task)
+{
+	struct pci_dev *pdev = doe_mb->pdev;
+	int offset = doe_mb->cap_offset;
+	size_t length, payload_length;
+	u32 val;
+	int i;
+
+	/* Read the first dword to get the protocol */
+	pci_read_config_dword(pdev, offset + PCI_DOE_READ, &val);
+	if ((FIELD_GET(PCI_DOE_DATA_OBJECT_HEADER_1_VID, val) != task->prot.vid) ||
+	    (FIELD_GET(PCI_DOE_DATA_OBJECT_HEADER_1_TYPE, val) != task->prot.type)) {
+		dev_err_ratelimited(&pdev->dev, "[%x] expected [VID, Protocol] = [%04x, %02x], got [%04x, %02x]\n",
+				    doe_mb->cap_offset, task->prot.vid, task->prot.type,
+				    FIELD_GET(PCI_DOE_DATA_OBJECT_HEADER_1_VID, val),
+				    FIELD_GET(PCI_DOE_DATA_OBJECT_HEADER_1_TYPE, val));
+		return -EIO;
+	}
+
+	pci_write_config_dword(pdev, offset + PCI_DOE_READ, 0);
+	/* Read the second dword to get the length */
+	pci_read_config_dword(pdev, offset + PCI_DOE_READ, &val);
+	pci_write_config_dword(pdev, offset + PCI_DOE_READ, 0);
+
+	length = FIELD_GET(PCI_DOE_DATA_OBJECT_HEADER_2_LENGTH, val);
+	if (length > SZ_1M || length < 2)
+		return -EIO;
+
+	/* First 2 dwords have already been read */
+	length -= 2;
+	payload_length = min(length, task->response_pl_sz / sizeof(u32));
+	/* Read the rest of the response payload */
+	for (i = 0; i < payload_length; i++) {
+		pci_read_config_dword(pdev, offset + PCI_DOE_READ,
+				      &task->response_pl[i]);
+		/* Prior to the last ack, ensure Data Object Ready */
+		if (i == (payload_length - 1) && !pci_doe_data_obj_ready(doe_mb))
+			return -EIO;
+		pci_write_config_dword(pdev, offset + PCI_DOE_READ, 0);
+	}
+
+	/* Flush excess length */
+	for (; i < length; i++) {
+		pci_read_config_dword(pdev, offset + PCI_DOE_READ, &val);
+		pci_write_config_dword(pdev, offset + PCI_DOE_READ, 0);
+	}
+
+	/* Final error check to pick up on any since Data Object Ready */
+	pci_read_config_dword(pdev, offset + PCI_DOE_STATUS, &val);
+	if (FIELD_GET(PCI_DOE_STATUS_ERROR, val))
+		return -EIO;
+
+	return min(length, task->response_pl_sz / sizeof(u32)) * sizeof(u32);
+}
+
+static void signal_task_complete(struct pci_doe_task *task, int rv)
+{
+	task->rv = rv;
+	task->complete(task);
+}
+
+static void signal_task_abort(struct pci_doe_task *task, int rv)
+{
+	struct pci_doe_mb *doe_mb = task->doe_mb;
+	struct pci_dev *pdev = doe_mb->pdev;
+
+	if (pci_doe_abort(doe_mb)) {
+		/*
+		 * If the device can't process an abort; set the mailbox dead
+		 *	- no more submissions
+		 */
+		pci_err(pdev, "[%x] Abort failed marking mailbox dead\n",
+			doe_mb->cap_offset);
+		set_bit(PCI_DOE_FLAG_DEAD, &doe_mb->flags);
+	}
+	signal_task_complete(task, rv);
+}
+
+static void doe_statemachine_work(struct work_struct *work)
+{
+	struct pci_doe_task *task = container_of(work, struct pci_doe_task,
+						 work);
+	struct pci_doe_mb *doe_mb = task->doe_mb;
+	struct pci_dev *pdev = doe_mb->pdev;
+	int offset = doe_mb->cap_offset;
+	unsigned long timeout_jiffies;
+	u32 val;
+	int rc;
+
+	if (test_bit(PCI_DOE_FLAG_DEAD, &doe_mb->flags)) {
+		signal_task_complete(task, -EIO);
+		return;
+	}
+
+	/* Send request */
+	rc = pci_doe_send_req(doe_mb, task);
+	if (rc) {
+		/*
+		 * The specification does not provide any guidance on how to
+		 * resolve conflicting requests from other entities.
+		 * Furthermore, it is likely that busy will not be detected
+		 * most of the time.  Flag any detection of status busy with an
+		 * error.
+		 */
+		if (rc == -EBUSY)
+			dev_err_ratelimited(&pdev->dev, "[%x] busy detected; another entity is sending conflicting requests\n",
+					    offset);
+		signal_task_abort(task, rc);
+		return;
+	}
+
+	timeout_jiffies = jiffies + PCI_DOE_TIMEOUT;
+	/* Poll for response */
+retry_resp:
+	pci_read_config_dword(pdev, offset + PCI_DOE_STATUS, &val);
+	if (FIELD_GET(PCI_DOE_STATUS_ERROR, val)) {
+		signal_task_abort(task, -EIO);
+		return;
+	}
+
+	if (!FIELD_GET(PCI_DOE_STATUS_DATA_OBJECT_READY, val)) {
+		if (time_after(jiffies, timeout_jiffies)) {
+			signal_task_abort(task, -EIO);
+			return;
+		}
+		rc = pci_doe_wait(doe_mb, PCI_DOE_POLL_INTERVAL);
+		if (rc) {
+			signal_task_abort(task, rc);
+			return;
+		}
+		goto retry_resp;
+	}
+
+	rc  = pci_doe_recv_resp(doe_mb, task);
+	if (rc < 0) {
+		signal_task_abort(task, rc);
+		return;
+	}
+
+	signal_task_complete(task, rc);
+}
+
+static void pci_doe_task_complete(struct pci_doe_task *task)
+{
+	complete(task->private);
+}
+
+static int pci_doe_discovery(struct pci_doe_mb *doe_mb, u8 *index, u16 *vid,
+			     u8 *protocol)
+{
+	u32 request_pl = FIELD_PREP(PCI_DOE_DATA_OBJECT_DISC_REQ_3_INDEX,
+				    *index);
+	u32 response_pl;
+	DECLARE_COMPLETION_ONSTACK(c);
+	struct pci_doe_task task = {
+		.prot.vid = PCI_VENDOR_ID_PCI_SIG,
+		.prot.type = PCI_DOE_PROTOCOL_DISCOVERY,
+		.request_pl = &request_pl,
+		.request_pl_sz = sizeof(request_pl),
+		.response_pl = &response_pl,
+		.response_pl_sz = sizeof(response_pl),
+		.complete = pci_doe_task_complete,
+		.private = &c,
+	};
+	int rc;
+
+	rc = pci_doe_submit_task(doe_mb, &task);
+	if (rc < 0)
+		return rc;
+
+	wait_for_completion(&c);
+
+	if (task.rv != sizeof(response_pl))
+		return -EIO;
+
+	*vid = FIELD_GET(PCI_DOE_DATA_OBJECT_DISC_RSP_3_VID, response_pl);
+	*protocol = FIELD_GET(PCI_DOE_DATA_OBJECT_DISC_RSP_3_PROTOCOL,
+			      response_pl);
+	*index = FIELD_GET(PCI_DOE_DATA_OBJECT_DISC_RSP_3_NEXT_INDEX,
+			   response_pl);
+
+	return 0;
+}
+
+static void *pci_doe_xa_prot_entry(u16 vid, u8 prot)
+{
+	return xa_mk_value((vid << 8) | prot);
+}
+
+static int pci_doe_cache_protocols(struct pci_doe_mb *doe_mb)
+{
+	u8 index = 0;
+	u8 xa_idx = 0;
+
+	do {
+		int rc;
+		u16 vid;
+		u8 prot;
+
+		rc = pci_doe_discovery(doe_mb, &index, &vid, &prot);
+		if (rc)
+			return rc;
+
+		pci_dbg(doe_mb->pdev,
+			"[%x] Found protocol %d vid: %x prot: %x\n",
+			doe_mb->cap_offset, xa_idx, vid, prot);
+
+		rc = xa_insert(&doe_mb->prots, xa_idx++,
+			       pci_doe_xa_prot_entry(vid, prot), GFP_KERNEL);
+		if (rc)
+			return rc;
+	} while (index);
+
+	return 0;
+}
+
+static void pci_doe_xa_destroy(void *mb)
+{
+	struct pci_doe_mb *doe_mb = mb;
+
+	xa_destroy(&doe_mb->prots);
+}
+
+static void pci_doe_destroy_workqueue(void *mb)
+{
+	struct pci_doe_mb *doe_mb = mb;
+
+	destroy_workqueue(doe_mb->work_queue);
+}
+
+static void pci_doe_flush_mb(void *mb)
+{
+	struct pci_doe_mb *doe_mb = mb;
+
+	/* Stop all pending work items from starting */
+	set_bit(PCI_DOE_FLAG_DEAD, &doe_mb->flags);
+
+	/* Cancel an in progress work item, if necessary */
+	set_bit(PCI_DOE_FLAG_CANCEL, &doe_mb->flags);
+	wake_up(&doe_mb->wq);
+
+	/* Flush all work items */
+	flush_workqueue(doe_mb->work_queue);
+}
+
+/**
+ * pcim_doe_create_mb() - Create a DOE mailbox object
+ *
+ * @pdev: PCI device to create the DOE mailbox for
+ * @cap_offset: Offset of the DOE mailbox
+ *
+ * Create a single mailbox object to manage the mailbox protocol at the
+ * cap_offset specified.
+ *
+ * RETURNS: created mailbox object on success
+ *	    ERR_PTR(-errno) on failure
+ */
+struct pci_doe_mb *pcim_doe_create_mb(struct pci_dev *pdev, u16 cap_offset)
+{
+	struct pci_doe_mb *doe_mb;
+	struct device *dev = &pdev->dev;
+	int rc;
+
+	doe_mb = devm_kzalloc(dev, sizeof(*doe_mb), GFP_KERNEL);
+	if (!doe_mb)
+		return ERR_PTR(-ENOMEM);
+
+	doe_mb->pdev = pdev;
+	doe_mb->cap_offset = cap_offset;
+	init_waitqueue_head(&doe_mb->wq);
+
+	xa_init(&doe_mb->prots);
+	rc = devm_add_action(dev, pci_doe_xa_destroy, doe_mb);
+	if (rc)
+		return ERR_PTR(rc);
+
+	doe_mb->work_queue = alloc_ordered_workqueue("%s %s DOE [%x]", 0,
+						dev_driver_string(&pdev->dev),
+						pci_name(pdev),
+						doe_mb->cap_offset);
+	if (!doe_mb->work_queue) {
+		pci_err(pdev, "[%x] failed to allocate work queue\n",
+			doe_mb->cap_offset);
+		return ERR_PTR(-ENOMEM);
+	}
+	rc = devm_add_action_or_reset(dev, pci_doe_destroy_workqueue, doe_mb);
+	if (rc)
+		return ERR_PTR(rc);
+
+	/* Reset the mailbox by issuing an abort */
+	rc = pci_doe_abort(doe_mb);
+	if (rc) {
+		pci_err(pdev, "[%x] failed to reset mailbox with abort command : %d\n",
+			doe_mb->cap_offset, rc);
+		return ERR_PTR(rc);
+	}
+
+	/*
+	 * The state machine and the mailbox should be in sync now;
+	 * Set up mailbox flush prior to using the mailbox to query protocols.
+	 */
+	rc = devm_add_action_or_reset(dev, pci_doe_flush_mb, doe_mb);
+	if (rc)
+		return ERR_PTR(rc);
+
+	rc = pci_doe_cache_protocols(doe_mb);
+	if (rc) {
+		pci_err(pdev, "[%x] failed to cache protocols : %d\n",
+			doe_mb->cap_offset, rc);
+		return ERR_PTR(rc);
+	}
+
+	return doe_mb;
+}
+EXPORT_SYMBOL_GPL(pcim_doe_create_mb);
+
+/**
+ * pci_doe_supports_prot() - Return if the DOE instance supports the given
+ *			     protocol
+ * @doe_mb: DOE mailbox capability to query
+ * @vid: Protocol Vendor ID
+ * @type: Protocol type
+ *
+ * RETURNS: True if the DOE mailbox supports the protocol specified
+ */
+bool pci_doe_supports_prot(struct pci_doe_mb *doe_mb, u16 vid, u8 type)
+{
+	unsigned long index;
+	void *entry;
+
+	/* The discovery protocol must always be supported */
+	if (vid == PCI_VENDOR_ID_PCI_SIG && type == PCI_DOE_PROTOCOL_DISCOVERY)
+		return true;
+
+	xa_for_each(&doe_mb->prots, index, entry)
+		if (entry == pci_doe_xa_prot_entry(vid, type))
+			return true;
+
+	return false;
+}
+EXPORT_SYMBOL_GPL(pci_doe_supports_prot);
+
+/**
+ * pci_doe_submit_task() - Submit a task to be processed by the state machine
+ *
+ * @doe_mb: DOE mailbox capability to submit to
+ * @task: task to be queued
+ *
+ * Submit a DOE task (request/response) to the DOE mailbox to be processed.
+ * Returns upon queueing the task object.  If the queue is full this function
+ * will sleep until there is room in the queue.
+ *
+ * task->complete will be called when the state machine is done processing this
+ * task.
+ *
+ * Excess data will be discarded.
+ *
+ * RETURNS: 0 when task has been successfully queued, -ERRNO on error
+ */
+int pci_doe_submit_task(struct pci_doe_mb *doe_mb, struct pci_doe_task *task)
+{
+	if (!pci_doe_supports_prot(doe_mb, task->prot.vid, task->prot.type))
+		return -EINVAL;
+
+	/*
+	 * DOE requests must be a whole number of DW and the response needs to
+	 * be big enough for at least 1 DW
+	 */
+	if (task->request_pl_sz % sizeof(u32) ||
+	    task->response_pl_sz < sizeof(u32))
+		return -EINVAL;
+
+	if (test_bit(PCI_DOE_FLAG_DEAD, &doe_mb->flags))
+		return -EIO;
+
+	task->doe_mb = doe_mb;
+	INIT_WORK(&task->work, doe_statemachine_work);
+	queue_work(doe_mb->work_queue, &task->work);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pci_doe_submit_task);
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 9884d8b29d3b..c5286b027f00 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -2315,7 +2315,7 @@ EXPORT_SYMBOL(pci_alloc_dev);
 
 static bool pci_bus_crs_vendor_id(u32 l)
 {
-	return (l & 0xffff) == 0x0001;
+	return (l & 0xffff) == PCI_VENDOR_ID_PCI_SIG;
 }
 
 static bool pci_bus_wait_crs(struct pci_bus *bus, int devfn, u32 *l,
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index ec5f71f7135b..616b683563a9 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -141,6 +141,7 @@ enum {
 	IORES_DESC_DEVICE_PRIVATE_MEMORY	= 6,
 	IORES_DESC_RESERVED			= 7,
 	IORES_DESC_SOFT_RESERVED		= 8,
+	IORES_DESC_CXL				= 9,
 };
 
 /*
@@ -329,6 +330,8 @@ struct resource *devm_request_free_mem_region(struct device *dev,
 		struct resource *base, unsigned long size);
 struct resource *request_free_mem_region(struct resource *base,
 		unsigned long size, const char *name);
+struct resource *alloc_free_mem_region(struct resource *base,
+		unsigned long size, unsigned long align, const char *name);
 
 static inline void irqresource_disabled(struct resource *res, u32 irq)
 {
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 0d61e07b6827..c74acfa1a3fe 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -59,6 +59,9 @@ enum {
 	/* Platform provides asynchronous flush mechanism */
 	ND_REGION_ASYNC = 3,
 
+	/* Region was created by CXL subsystem */
+	ND_REGION_CXL = 4,
+
 	/* mark newly adjusted resources as requiring a label update */
 	DPA_RESOURCE_ADJUSTED = 1 << 0,
 };
@@ -122,6 +125,7 @@ struct nd_region_desc {
 	int numa_node;
 	int target_node;
 	unsigned long flags;
+	int memregion;
 	struct device_node *of_node;
 	int (*flush)(struct nd_region *nd_region, struct bio *bio);
 };
@@ -259,6 +263,7 @@ static inline struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus,
 			cmd_mask, num_flush, flush_wpq, NULL, NULL, NULL);
 }
 void nvdimm_delete(struct nvdimm *nvdimm);
+void nvdimm_region_delete(struct nd_region *nd_region);
 
 const struct nd_cmd_desc *nd_cmd_dimm_desc(int cmd);
 const struct nd_cmd_desc *nd_cmd_bus_desc(int cmd);
diff --git a/include/linux/pci-doe.h b/include/linux/pci-doe.h
new file mode 100644
index 000000000000..ed9b4df792b8
--- /dev/null
+++ b/include/linux/pci-doe.h
@@ -0,0 +1,77 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Data Object Exchange
+ *	PCIe r6.0, sec 6.30 DOE
+ *
+ * Copyright (C) 2021 Huawei
+ *     Jonathan Cameron <Jonathan.Cameron@huawei.com>
+ *
+ * Copyright (C) 2022 Intel Corporation
+ *	Ira Weiny <ira.weiny@intel.com>
+ */
+
+#ifndef LINUX_PCI_DOE_H
+#define LINUX_PCI_DOE_H
+
+struct pci_doe_protocol {
+	u16 vid;
+	u8 type;
+};
+
+struct pci_doe_mb;
+
+/**
+ * struct pci_doe_task - represents a single query/response
+ *
+ * @prot: DOE Protocol
+ * @request_pl: The request payload
+ * @request_pl_sz: Size of the request payload (bytes)
+ * @response_pl: The response payload
+ * @response_pl_sz: Size of the response payload (bytes)
+ * @rv: Return value.  Length of received response or error (bytes)
+ * @complete: Called when task is complete
+ * @private: Private data for the consumer
+ * @work: Used internally by the mailbox
+ * @doe_mb: Used internally by the mailbox
+ *
+ * The payload sizes and rv are specified in bytes with the following
+ * restrictions concerning the protocol.
+ *
+ *	1) The request_pl_sz must be a multiple of double words (4 bytes)
+ *	2) The response_pl_sz must be >= a single double word (4 bytes)
+ *	3) rv is returned as bytes but it will be a multiple of double words
+ *
+ * NOTE there is no need for the caller to initialize work or doe_mb.
+ */
+struct pci_doe_task {
+	struct pci_doe_protocol prot;
+	u32 *request_pl;
+	size_t request_pl_sz;
+	u32 *response_pl;
+	size_t response_pl_sz;
+	int rv;
+	void (*complete)(struct pci_doe_task *task);
+	void *private;
+
+	/* No need for the user to initialize these fields */
+	struct work_struct work;
+	struct pci_doe_mb *doe_mb;
+};
+
+/**
+ * pci_doe_for_each_off - Iterate each DOE capability
+ * @pdev: struct pci_dev to iterate
+ * @off: u16 of config space offset of each mailbox capability found
+ */
+#define pci_doe_for_each_off(pdev, off) \
+	for (off = pci_find_next_ext_capability(pdev, off, \
+					PCI_EXT_CAP_ID_DOE); \
+		off > 0; \
+		off = pci_find_next_ext_capability(pdev, off, \
+					PCI_EXT_CAP_ID_DOE))
+
+struct pci_doe_mb *pcim_doe_create_mb(struct pci_dev *pdev, u16 cap_offset);
+bool pci_doe_supports_prot(struct pci_doe_mb *doe_mb, u16 vid, u8 type);
+int pci_doe_submit_task(struct pci_doe_mb *doe_mb, struct pci_doe_task *task);
+
+#endif
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 7fa460ccf7fa..6feade66efdb 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -151,6 +151,7 @@
 #define PCI_CLASS_OTHERS		0xff
 
 /* Vendors and devices.  Sort key: vendor first, device next. */
+#define PCI_VENDOR_ID_PCI_SIG		0x0001
 
 #define PCI_VENDOR_ID_LOONGSON		0x0014
 
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index e3f1e8ac1f85..fd3fe5c8c17f 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -235,6 +235,22 @@ struct bin_attribute bin_attr_##_name = __BIN_ATTR_WO(_name, _size)
 #define BIN_ATTR_RW(_name, _size)					\
 struct bin_attribute bin_attr_##_name = __BIN_ATTR_RW(_name, _size)
 
+
+#define __BIN_ATTR_ADMIN_RO(_name, _size) {					\
+	.attr	= { .name = __stringify(_name), .mode = 0400 },		\
+	.read	= _name##_read,						\
+	.size	= _size,						\
+}
+
+#define __BIN_ATTR_ADMIN_RW(_name, _size)					\
+	__BIN_ATTR(_name, 0600, _name##_read, _name##_write, _size)
+
+#define BIN_ATTR_ADMIN_RO(_name, _size)					\
+struct bin_attribute bin_attr_##_name = __BIN_ATTR_ADMIN_RO(_name, _size)
+
+#define BIN_ATTR_ADMIN_RW(_name, _size)					\
+struct bin_attribute bin_attr_##_name = __BIN_ATTR_ADMIN_RW(_name, _size)
+
 struct sysfs_ops {
 	ssize_t	(*show)(struct kobject *, struct attribute *, char *);
 	ssize_t	(*store)(struct kobject *, struct attribute *, const char *, size_t);
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index 108f8523fa04..57b8e2ffb1dd 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -737,7 +737,8 @@
 #define PCI_EXT_CAP_ID_DVSEC	0x23	/* Designated Vendor-Specific */
 #define PCI_EXT_CAP_ID_DLF	0x25	/* Data Link Feature */
 #define PCI_EXT_CAP_ID_PL_16GT	0x26	/* Physical Layer 16.0 GT/s */
-#define PCI_EXT_CAP_ID_MAX	PCI_EXT_CAP_ID_PL_16GT
+#define PCI_EXT_CAP_ID_DOE	0x2E	/* Data Object Exchange */
+#define PCI_EXT_CAP_ID_MAX	PCI_EXT_CAP_ID_DOE
 
 #define PCI_EXT_CAP_DSN_SIZEOF	12
 #define PCI_EXT_CAP_MCAST_ENDPOINT_SIZEOF 40
@@ -1103,4 +1104,30 @@
 #define  PCI_PL_16GT_LE_CTRL_USP_TX_PRESET_MASK		0x000000F0
 #define  PCI_PL_16GT_LE_CTRL_USP_TX_PRESET_SHIFT	4
 
+/* Data Object Exchange */
+#define PCI_DOE_CAP		0x04    /* DOE Capabilities Register */
+#define  PCI_DOE_CAP_INT_SUP			0x00000001  /* Interrupt Support */
+#define  PCI_DOE_CAP_INT_MSG_NUM		0x00000ffe  /* Interrupt Message Number */
+#define PCI_DOE_CTRL		0x08    /* DOE Control Register */
+#define  PCI_DOE_CTRL_ABORT			0x00000001  /* DOE Abort */
+#define  PCI_DOE_CTRL_INT_EN			0x00000002  /* DOE Interrupt Enable */
+#define  PCI_DOE_CTRL_GO			0x80000000  /* DOE Go */
+#define PCI_DOE_STATUS		0x0c    /* DOE Status Register */
+#define  PCI_DOE_STATUS_BUSY			0x00000001  /* DOE Busy */
+#define  PCI_DOE_STATUS_INT_STATUS		0x00000002  /* DOE Interrupt Status */
+#define  PCI_DOE_STATUS_ERROR			0x00000004  /* DOE Error */
+#define  PCI_DOE_STATUS_DATA_OBJECT_READY	0x80000000  /* Data Object Ready */
+#define PCI_DOE_WRITE		0x10    /* DOE Write Data Mailbox Register */
+#define PCI_DOE_READ		0x14    /* DOE Read Data Mailbox Register */
+
+/* DOE Data Object - note not actually registers */
+#define PCI_DOE_DATA_OBJECT_HEADER_1_VID		0x0000ffff
+#define PCI_DOE_DATA_OBJECT_HEADER_1_TYPE		0x00ff0000
+#define PCI_DOE_DATA_OBJECT_HEADER_2_LENGTH		0x0003ffff
+
+#define PCI_DOE_DATA_OBJECT_DISC_REQ_3_INDEX		0x000000ff
+#define PCI_DOE_DATA_OBJECT_DISC_RSP_3_VID		0x0000ffff
+#define PCI_DOE_DATA_OBJECT_DISC_RSP_3_PROTOCOL		0x00ff0000
+#define PCI_DOE_DATA_OBJECT_DISC_RSP_3_NEXT_INDEX	0xff000000
+
 #endif /* LINUX_PCI_REGS_H */
diff --git a/kernel/resource.c b/kernel/resource.c
index 34eaee179689..4c5e80b92f2f 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -489,8 +489,9 @@ int __weak page_is_ram(unsigned long pfn)
 }
 EXPORT_SYMBOL_GPL(page_is_ram);
 
-static int __region_intersects(resource_size_t start, size_t size,
-			unsigned long flags, unsigned long desc)
+static int __region_intersects(struct resource *parent, resource_size_t start,
+			       size_t size, unsigned long flags,
+			       unsigned long desc)
 {
 	struct resource res;
 	int type = 0; int other = 0;
@@ -499,7 +500,7 @@ static int __region_intersects(resource_size_t start, size_t size,
 	res.start = start;
 	res.end = start + size - 1;
 
-	for (p = iomem_resource.child; p ; p = p->sibling) {
+	for (p = parent->child; p ; p = p->sibling) {
 		bool is_type = (((p->flags & flags) == flags) &&
 				((desc == IORES_DESC_NONE) ||
 				 (desc == p->desc)));
@@ -543,7 +544,7 @@ int region_intersects(resource_size_t start, size_t size, unsigned long flags,
 	int ret;
 
 	read_lock(&resource_lock);
-	ret = __region_intersects(start, size, flags, desc);
+	ret = __region_intersects(&iomem_resource, start, size, flags, desc);
 	read_unlock(&resource_lock);
 
 	return ret;
@@ -891,6 +892,13 @@ void insert_resource_expand_to_fit(struct resource *root, struct resource *new)
 	}
 	write_unlock(&resource_lock);
 }
+/*
+ * Not for general consumption, only early boot memory map parsing, PCI
+ * resource discovery, and late discovery of CXL resources are expected
+ * to use this interface. The former are built-in and only the latter,
+ * CXL, is a module.
+ */
+EXPORT_SYMBOL_NS_GPL(insert_resource_expand_to_fit, CXL);
 
 /**
  * remove_resource - Remove a resource in the resource tree
@@ -1773,62 +1781,139 @@ void resource_list_free(struct list_head *head)
 }
 EXPORT_SYMBOL(resource_list_free);
 
-#ifdef CONFIG_DEVICE_PRIVATE
-static struct resource *__request_free_mem_region(struct device *dev,
-		struct resource *base, unsigned long size, const char *name)
+#ifdef CONFIG_GET_FREE_REGION
+#define GFR_DESCENDING		(1UL << 0)
+#define GFR_REQUEST_REGION	(1UL << 1)
+#define GFR_DEFAULT_ALIGN (1UL << PA_SECTION_SHIFT)
+
+static resource_size_t gfr_start(struct resource *base, resource_size_t size,
+				 resource_size_t align, unsigned long flags)
+{
+	if (flags & GFR_DESCENDING) {
+		resource_size_t end;
+
+		end = min_t(resource_size_t, base->end,
+			    (1ULL << MAX_PHYSMEM_BITS) - 1);
+		return end - size + 1;
+	}
+
+	return ALIGN(base->start, align);
+}
+
+static bool gfr_continue(struct resource *base, resource_size_t addr,
+			 resource_size_t size, unsigned long flags)
+{
+	if (flags & GFR_DESCENDING)
+		return addr > size && addr >= base->start;
+	/*
+	 * In the ascend case be careful that the last increment by
+	 * @size did not wrap 0.
+	 */
+	return addr > addr - size &&
+	       addr <= min_t(resource_size_t, base->end,
+			     (1ULL << MAX_PHYSMEM_BITS) - 1);
+}
+
+static resource_size_t gfr_next(resource_size_t addr, resource_size_t size,
+				unsigned long flags)
 {
-	resource_size_t end, addr;
+	if (flags & GFR_DESCENDING)
+		return addr - size;
+	return addr + size;
+}
+
+static void remove_free_mem_region(void *_res)
+{
+	struct resource *res = _res;
+
+	if (res->parent)
+		remove_resource(res);
+	free_resource(res);
+}
+
+static struct resource *
+get_free_mem_region(struct device *dev, struct resource *base,
+		    resource_size_t size, const unsigned long align,
+		    const char *name, const unsigned long desc,
+		    const unsigned long flags)
+{
+	resource_size_t addr;
 	struct resource *res;
 	struct region_devres *dr = NULL;
 
-	size = ALIGN(size, 1UL << PA_SECTION_SHIFT);
-	end = min_t(unsigned long, base->end, (1UL << MAX_PHYSMEM_BITS) - 1);
-	addr = end - size + 1UL;
+	size = ALIGN(size, align);
 
 	res = alloc_resource(GFP_KERNEL);
 	if (!res)
 		return ERR_PTR(-ENOMEM);
 
-	if (dev) {
+	if (dev && (flags & GFR_REQUEST_REGION)) {
 		dr = devres_alloc(devm_region_release,
 				sizeof(struct region_devres), GFP_KERNEL);
 		if (!dr) {
 			free_resource(res);
 			return ERR_PTR(-ENOMEM);
 		}
+	} else if (dev) {
+		if (devm_add_action_or_reset(dev, remove_free_mem_region, res))
+			return ERR_PTR(-ENOMEM);
 	}
 
 	write_lock(&resource_lock);
-	for (; addr > size && addr >= base->start; addr -= size) {
-		if (__region_intersects(addr, size, 0, IORES_DESC_NONE) !=
-				REGION_DISJOINT)
+	for (addr = gfr_start(base, size, align, flags);
+	     gfr_continue(base, addr, size, flags);
+	     addr = gfr_next(addr, size, flags)) {
+		if (__region_intersects(base, addr, size, 0, IORES_DESC_NONE) !=
+		    REGION_DISJOINT)
 			continue;
 
-		if (__request_region_locked(res, &iomem_resource, addr, size,
-						name, 0))
-			break;
+		if (flags & GFR_REQUEST_REGION) {
+			if (__request_region_locked(res, &iomem_resource, addr,
+						    size, name, 0))
+				break;
 
-		if (dev) {
-			dr->parent = &iomem_resource;
-			dr->start = addr;
-			dr->n = size;
-			devres_add(dev, dr);
-		}
+			if (dev) {
+				dr->parent = &iomem_resource;
+				dr->start = addr;
+				dr->n = size;
+				devres_add(dev, dr);
+			}
 
-		res->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
-		write_unlock(&resource_lock);
+			res->desc = desc;
+			write_unlock(&resource_lock);
+
+
+			/*
+			 * A driver is claiming this region so revoke any
+			 * mappings.
+			 */
+			revoke_iomem(res);
+		} else {
+			res->start = addr;
+			res->end = addr + size - 1;
+			res->name = name;
+			res->desc = desc;
+			res->flags = IORESOURCE_MEM;
+
+			/*
+			 * Only succeed if the resource hosts an exclusive
+			 * range after the insert
+			 */
+			if (__insert_resource(base, res) || res->child)
+				break;
+
+			write_unlock(&resource_lock);
+		}
 
-		/*
-		 * A driver is claiming this region so revoke any mappings.
-		 */
-		revoke_iomem(res);
 		return res;
 	}
 	write_unlock(&resource_lock);
 
-	free_resource(res);
-	if (dr)
+	if (flags & GFR_REQUEST_REGION) {
+		free_resource(res);
 		devres_free(dr);
+	} else if (dev)
+		devm_release_action(dev, remove_free_mem_region, res);
 
 	return ERR_PTR(-ERANGE);
 }
@@ -1847,18 +1932,48 @@ static struct resource *__request_free_mem_region(struct device *dev,
 struct resource *devm_request_free_mem_region(struct device *dev,
 		struct resource *base, unsigned long size)
 {
-	return __request_free_mem_region(dev, base, size, dev_name(dev));
+	unsigned long flags = GFR_DESCENDING | GFR_REQUEST_REGION;
+
+	return get_free_mem_region(dev, base, size, GFR_DEFAULT_ALIGN,
+				   dev_name(dev),
+				   IORES_DESC_DEVICE_PRIVATE_MEMORY, flags);
 }
 EXPORT_SYMBOL_GPL(devm_request_free_mem_region);
 
 struct resource *request_free_mem_region(struct resource *base,
 		unsigned long size, const char *name)
 {
-	return __request_free_mem_region(NULL, base, size, name);
+	unsigned long flags = GFR_DESCENDING | GFR_REQUEST_REGION;
+
+	return get_free_mem_region(NULL, base, size, GFR_DEFAULT_ALIGN, name,
+				   IORES_DESC_DEVICE_PRIVATE_MEMORY, flags);
 }
 EXPORT_SYMBOL_GPL(request_free_mem_region);
 
-#endif /* CONFIG_DEVICE_PRIVATE */
+/**
+ * alloc_free_mem_region - find a free region relative to @base
+ * @base: resource that will parent the new resource
+ * @size: size in bytes of memory to allocate from @base
+ * @align: alignment requirements for the allocation
+ * @name: resource name
+ *
+ * Buses like CXL, that can dynamically instantiate new memory regions,
+ * need a method to allocate physical address space for those regions.
+ * Allocate and insert a new resource to cover a free, unclaimed by a
+ * descendant of @base, range in the span of @base.
+ */
+struct resource *alloc_free_mem_region(struct resource *base,
+				       unsigned long size, unsigned long align,
+				       const char *name)
+{
+	/* Default of ascending direction and insert resource */
+	unsigned long flags = 0;
+
+	return get_free_mem_region(NULL, base, size, align, name,
+				   IORES_DESC_NONE, flags);
+}
+EXPORT_SYMBOL_NS_GPL(alloc_free_mem_region, CXL);
+#endif /* CONFIG_GET_FREE_REGION */
 
 static int __init strict_iomem(char *str)
 {
diff --git a/mm/Kconfig b/mm/Kconfig
index e59cf5fe5ce9..0331f1461f81 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -983,9 +983,14 @@ config HMM_MIRROR
 	bool
 	depends on MMU
 
+config GET_FREE_REGION
+	depends on SPARSEMEM
+	bool
+
 config DEVICE_PRIVATE
 	bool "Unaddressable device memory (GPU memory, ...)"
 	depends on ZONE_DEVICE
+	select GET_FREE_REGION
 
 	help
 	  Allows creation of struct pages to represent unaddressable device
diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild
index 33543231d453..500be85729cc 100644
--- a/tools/testing/cxl/Kbuild
+++ b/tools/testing/cxl/Kbuild
@@ -47,6 +47,7 @@ cxl_core-y += $(CXL_CORE_SRC)/memdev.o
 cxl_core-y += $(CXL_CORE_SRC)/mbox.o
 cxl_core-y += $(CXL_CORE_SRC)/pci.o
 cxl_core-y += $(CXL_CORE_SRC)/hdm.o
+cxl_core-$(CONFIG_CXL_REGION) += $(CXL_CORE_SRC)/region.o
 cxl_core-y += config_check.o
 
 obj-m += test/
diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c
index 431f2bddf6c8..a072b2d3e726 100644
--- a/tools/testing/cxl/test/cxl.c
+++ b/tools/testing/cxl/test/cxl.c
@@ -14,7 +14,7 @@
 #define NR_CXL_HOST_BRIDGES 2
 #define NR_CXL_ROOT_PORTS 2
 #define NR_CXL_SWITCH_PORTS 2
-#define NR_CXL_PORT_DECODERS 2
+#define NR_CXL_PORT_DECODERS 8
 
 static struct platform_device *cxl_acpi;
 static struct platform_device *cxl_host_bridge[NR_CXL_HOST_BRIDGES];
@@ -118,7 +118,7 @@ static struct {
 			.restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 |
 					ACPI_CEDT_CFMWS_RESTRICT_VOLATILE,
 			.qtg_id = 0,
-			.window_size = SZ_256M,
+			.window_size = SZ_256M * 4UL,
 		},
 		.target = { 0 },
 	},
@@ -133,7 +133,7 @@ static struct {
 			.restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 |
 					ACPI_CEDT_CFMWS_RESTRICT_VOLATILE,
 			.qtg_id = 1,
-			.window_size = SZ_256M * 2,
+			.window_size = SZ_256M * 8UL,
 		},
 		.target = { 0, 1, },
 	},
@@ -148,7 +148,7 @@ static struct {
 			.restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 |
 					ACPI_CEDT_CFMWS_RESTRICT_PMEM,
 			.qtg_id = 2,
-			.window_size = SZ_256M,
+			.window_size = SZ_256M * 4UL,
 		},
 		.target = { 0 },
 	},
@@ -163,7 +163,7 @@ static struct {
 			.restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 |
 					ACPI_CEDT_CFMWS_RESTRICT_PMEM,
 			.qtg_id = 3,
-			.window_size = SZ_256M * 2,
+			.window_size = SZ_256M * 8UL,
 		},
 		.target = { 0, 1, },
 	},
@@ -429,6 +429,50 @@ static int map_targets(struct device *dev, void *data)
 	return 0;
 }
 
+static int mock_decoder_commit(struct cxl_decoder *cxld)
+{
+	struct cxl_port *port = to_cxl_port(cxld->dev.parent);
+	int id = cxld->id;
+
+	if (cxld->flags & CXL_DECODER_F_ENABLE)
+		return 0;
+
+	dev_dbg(&port->dev, "%s commit\n", dev_name(&cxld->dev));
+	if (port->commit_end + 1 != id) {
+		dev_dbg(&port->dev,
+			"%s: out of order commit, expected decoder%d.%d\n",
+			dev_name(&cxld->dev), port->id, port->commit_end + 1);
+		return -EBUSY;
+	}
+
+	port->commit_end++;
+	cxld->flags |= CXL_DECODER_F_ENABLE;
+
+	return 0;
+}
+
+static int mock_decoder_reset(struct cxl_decoder *cxld)
+{
+	struct cxl_port *port = to_cxl_port(cxld->dev.parent);
+	int id = cxld->id;
+
+	if ((cxld->flags & CXL_DECODER_F_ENABLE) == 0)
+		return 0;
+
+	dev_dbg(&port->dev, "%s reset\n", dev_name(&cxld->dev));
+	if (port->commit_end != id) {
+		dev_dbg(&port->dev,
+			"%s: out of order reset, expected decoder%d.%d\n",
+			dev_name(&cxld->dev), port->id, port->commit_end);
+		return -EBUSY;
+	}
+
+	port->commit_end--;
+	cxld->flags &= ~CXL_DECODER_F_ENABLE;
+
+	return 0;
+}
+
 static int mock_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm)
 {
 	struct cxl_port *port = cxlhdm->port;
@@ -451,25 +495,39 @@ static int mock_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm)
 		struct cxl_decoder *cxld;
 		int rc;
 
-		if (target_count)
-			cxld = cxl_switch_decoder_alloc(port, target_count);
-		else
-			cxld = cxl_endpoint_decoder_alloc(port);
-		if (IS_ERR(cxld)) {
-			dev_warn(&port->dev,
-				 "Failed to allocate the decoder\n");
-			return PTR_ERR(cxld);
+		if (target_count) {
+			struct cxl_switch_decoder *cxlsd;
+
+			cxlsd = cxl_switch_decoder_alloc(port, target_count);
+			if (IS_ERR(cxlsd)) {
+				dev_warn(&port->dev,
+					 "Failed to allocate the decoder\n");
+				return PTR_ERR(cxlsd);
+			}
+			cxld = &cxlsd->cxld;
+		} else {
+			struct cxl_endpoint_decoder *cxled;
+
+			cxled = cxl_endpoint_decoder_alloc(port);
+
+			if (IS_ERR(cxled)) {
+				dev_warn(&port->dev,
+					 "Failed to allocate the decoder\n");
+				return PTR_ERR(cxled);
+			}
+			cxld = &cxled->cxld;
 		}
 
-		cxld->decoder_range = (struct range) {
+		cxld->hpa_range = (struct range) {
 			.start = 0,
 			.end = -1,
 		};
 
-		cxld->flags = CXL_DECODER_F_ENABLE;
 		cxld->interleave_ways = min_not_zero(target_count, 1);
 		cxld->interleave_granularity = SZ_4K;
 		cxld->target_type = CXL_DECODER_EXPANDER;
+		cxld->commit = mock_decoder_commit;
+		cxld->reset = mock_decoder_reset;
 
 		if (target_count) {
 			rc = device_for_each_child(port->uport, &ctx,
@@ -569,44 +627,6 @@ static void mock_companion(struct acpi_device *adev, struct device *dev)
 #define SZ_512G (SZ_64G * 8)
 #endif
 
-static struct platform_device *alloc_memdev(int id)
-{
-	struct resource res[] = {
-		[0] = {
-			.flags = IORESOURCE_MEM,
-		},
-		[1] = {
-			.flags = IORESOURCE_MEM,
-			.desc = IORES_DESC_PERSISTENT_MEMORY,
-		},
-	};
-	struct platform_device *pdev;
-	int i, rc;
-
-	for (i = 0; i < ARRAY_SIZE(res); i++) {
-		struct cxl_mock_res *r = alloc_mock_res(SZ_256M);
-
-		if (!r)
-			return NULL;
-		res[i].start = r->range.start;
-		res[i].end = r->range.end;
-	}
-
-	pdev = platform_device_alloc("cxl_mem", id);
-	if (!pdev)
-		return NULL;
-
-	rc = platform_device_add_resources(pdev, res, ARRAY_SIZE(res));
-	if (rc)
-		goto err;
-
-	return pdev;
-
-err:
-	platform_device_put(pdev);
-	return NULL;
-}
-
 static __init int cxl_test_init(void)
 {
 	int rc, i;
@@ -619,7 +639,8 @@ static __init int cxl_test_init(void)
 		goto err_gen_pool_create;
 	}
 
-	rc = gen_pool_add(cxl_mock_pool, SZ_512G, SZ_64G, NUMA_NO_NODE);
+	rc = gen_pool_add(cxl_mock_pool, iomem_resource.end + 1 - SZ_64G,
+			  SZ_64G, NUMA_NO_NODE);
 	if (rc)
 		goto err_gen_pool_add;
 
@@ -708,7 +729,7 @@ static __init int cxl_test_init(void)
 		struct platform_device *dport = cxl_switch_dport[i];
 		struct platform_device *pdev;
 
-		pdev = alloc_memdev(i);
+		pdev = platform_device_alloc("cxl_mem", i);
 		if (!pdev)
 			goto err_mem;
 		pdev->dev.parent = &dport->dev;
diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c
index 6b9239b2afd4..aa2df3a15051 100644
--- a/tools/testing/cxl/test/mem.c
+++ b/tools/testing/cxl/test/mem.c
@@ -10,6 +10,7 @@
 #include <cxlmem.h>
 
 #define LSA_SIZE SZ_128K
+#define DEV_SIZE SZ_2G
 #define EFFECT(x) (1U << x)
 
 static struct cxl_cel_entry mock_cel[] = {
@@ -26,6 +27,10 @@ static struct cxl_cel_entry mock_cel[] = {
 		.effect = cpu_to_le16(0),
 	},
 	{
+		.opcode = cpu_to_le16(CXL_MBOX_OP_GET_PARTITION_INFO),
+		.effect = cpu_to_le16(0),
+	},
+	{
 		.opcode = cpu_to_le16(CXL_MBOX_OP_SET_LSA),
 		.effect = cpu_to_le16(EFFECT(1) | EFFECT(2)),
 	},
@@ -97,42 +102,37 @@ static int mock_get_log(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd *cmd)
 
 static int mock_id(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd *cmd)
 {
-	struct platform_device *pdev = to_platform_device(cxlds->dev);
 	struct cxl_mbox_identify id = {
 		.fw_revision = { "mock fw v1 " },
 		.lsa_size = cpu_to_le32(LSA_SIZE),
-		/* FIXME: Add partition support */
-		.partition_align = cpu_to_le64(0),
+		.partition_align =
+			cpu_to_le64(SZ_256M / CXL_CAPACITY_MULTIPLIER),
+		.total_capacity =
+			cpu_to_le64(DEV_SIZE / CXL_CAPACITY_MULTIPLIER),
 	};
-	u64 capacity = 0;
-	int i;
 
 	if (cmd->size_out < sizeof(id))
 		return -EINVAL;
 
-	for (i = 0; i < 2; i++) {
-		struct resource *res;
-
-		res = platform_get_resource(pdev, IORESOURCE_MEM, i);
-		if (!res)
-			break;
-
-		capacity += resource_size(res) / CXL_CAPACITY_MULTIPLIER;
+	memcpy(cmd->payload_out, &id, sizeof(id));
 
-		if (le64_to_cpu(id.partition_align))
-			continue;
+	return 0;
+}
 
-		if (res->desc == IORES_DESC_PERSISTENT_MEMORY)
-			id.persistent_capacity = cpu_to_le64(
-				resource_size(res) / CXL_CAPACITY_MULTIPLIER);
-		else
-			id.volatile_capacity = cpu_to_le64(
-				resource_size(res) / CXL_CAPACITY_MULTIPLIER);
-	}
+static int mock_partition_info(struct cxl_dev_state *cxlds,
+			       struct cxl_mbox_cmd *cmd)
+{
+	struct cxl_mbox_get_partition_info pi = {
+		.active_volatile_cap =
+			cpu_to_le64(DEV_SIZE / 2 / CXL_CAPACITY_MULTIPLIER),
+		.active_persistent_cap =
+			cpu_to_le64(DEV_SIZE / 2 / CXL_CAPACITY_MULTIPLIER),
+	};
 
-	id.total_capacity = cpu_to_le64(capacity);
+	if (cmd->size_out < sizeof(pi))
+		return -EINVAL;
 
-	memcpy(cmd->payload_out, &id, sizeof(id));
+	memcpy(cmd->payload_out, &pi, sizeof(pi));
 
 	return 0;
 }
@@ -221,6 +221,9 @@ static int cxl_mock_mbox_send(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd *
 	case CXL_MBOX_OP_GET_LSA:
 		rc = mock_get_lsa(cxlds, cmd);
 		break;
+	case CXL_MBOX_OP_GET_PARTITION_INFO:
+		rc = mock_partition_info(cxlds, cmd);
+		break;
 	case CXL_MBOX_OP_SET_LSA:
 		rc = mock_set_lsa(cxlds, cmd);
 		break;
@@ -282,7 +285,7 @@ static int cxl_mock_mem_probe(struct platform_device *pdev)
 	if (IS_ERR(cxlmd))
 		return PTR_ERR(cxlmd);
 
-	if (range_len(&cxlds->pmem_range) && IS_ENABLED(CONFIG_CXL_PMEM))
+	if (resource_size(&cxlds->pmem_res) && IS_ENABLED(CONFIG_CXL_PMEM))
 		rc = devm_cxl_add_nvdimm(dev, cxlmd);
 
 	return 0;
diff --git a/tools/testing/cxl/test/mock.c b/tools/testing/cxl/test/mock.c
index f1f8c40948c5..bce6a21df0d5 100644
--- a/tools/testing/cxl/test/mock.c
+++ b/tools/testing/cxl/test/mock.c
@@ -208,13 +208,15 @@ int __wrap_cxl_await_media_ready(struct cxl_dev_state *cxlds)
 }
 EXPORT_SYMBOL_NS_GPL(__wrap_cxl_await_media_ready, CXL);
 
-bool __wrap_cxl_hdm_decode_init(struct cxl_dev_state *cxlds,
-				struct cxl_hdm *cxlhdm)
+int __wrap_cxl_hdm_decode_init(struct cxl_dev_state *cxlds,
+			       struct cxl_hdm *cxlhdm)
 {
 	int rc = 0, index;
 	struct cxl_mock_ops *ops = get_cxl_mock_ops(&index);
 
-	if (!ops || !ops->is_mock_dev(cxlds->dev))
+	if (ops && ops->is_mock_dev(cxlds->dev))
+		rc = 0;
+	else
 		rc = cxl_hdm_decode_init(cxlds, cxlhdm);
 	put_cxl_mock_ops(index);
author	Linus Torvalds <torvalds@linux-foundation.org>	2022-08-10 11:07:26 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2022-08-10 11:07:26 -0700
commit	c235698355fa94df7073b51befda7d4be00a0e23 (patch)
tree	3cb9d2fa40862f2484eda9c148b4326505a8534d
parent	5e2e7383b57fa03ec2b00c82bb7f49a4a707c1f7 (diff)
parent	1cd8a2537eb07751d405ab7e2223f20338a90506 (diff)