diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-08-10 11:07:26 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-08-10 11:07:26 -0700 |
commit | c235698355fa94df7073b51befda7d4be00a0e23 (patch) | |
tree | 3cb9d2fa40862f2484eda9c148b4326505a8534d | |
parent | 5e2e7383b57fa03ec2b00c82bb7f49a4a707c1f7 (diff) | |
parent | 1cd8a2537eb07751d405ab7e2223f20338a90506 (diff) |
Merge tag 'cxl-for-6.0' of git://git.kernel.org/pub/scm/linux/kernel/git/cxl/cxl
Pull cxl updates from Dan Williams:
"Compute Express Link (CXL) updates for 6.0:
- Introduce a 'struct cxl_region' object with support for
provisioning and assembling persistent memory regions.
- Introduce alloc_free_mem_region() to accompany the existing
request_free_mem_region() as a method to allocate physical memory
capacity out of an existing resource.
- Export insert_resource_expand_to_fit() for the CXL subsystem to
late-publish CXL platform windows in iomem_resource.
- Add a polled mode PCI DOE (Data Object Exchange) driver service and
use it in cxl_pci to retrieve the CDAT (Coherent Device Attribute
Table)"
* tag 'cxl-for-6.0' of git://git.kernel.org/pub/scm/linux/kernel/git/cxl/cxl: (74 commits)
cxl/hdm: Fix skip allocations vs multiple pmem allocations
cxl/region: Disallow region granularity != window granularity
cxl/region: Fix x1 interleave to greater than x1 interleave routing
cxl/region: Move HPA setup to cxl_region_attach()
cxl/region: Fix decoder interleave programming
Documentation: cxl: remove dangling kernel-doc reference
cxl/region: describe targets and nr_targets members of cxl_region_params
cxl/regions: add padding for cxl_rr_ep_add nested lists
cxl/region: Fix IS_ERR() vs NULL check
cxl/region: Fix region reference target accounting
cxl/region: Fix region commit uninitialized variable warning
cxl/region: Fix port setup uninitialized variable warnings
cxl/region: Stop initializing interleave granularity
cxl/hdm: Fix DPA reservation vs cxl_endpoint_decoder lifetime
cxl/acpi: Minimize granularity for x1 interleaves
cxl/region: Delete 'region' attribute from root decoders
cxl/acpi: Autoload driver for 'cxl_acpi' test devices
cxl/region: decrement ->nr_targets on error in cxl_region_attach()
cxl/region: prevent underflow in ways_to_cxl()
cxl/region: uninitialized variable in alloc_hpa()
...
39 files changed, 5498 insertions, 576 deletions
diff --git a/.clang-format b/.clang-format index 9b87ea1fc16e..1247d54f9e49 100644 --- a/.clang-format +++ b/.clang-format @@ -516,6 +516,7 @@ ForEachMacros: - 'of_property_for_each_string' - 'of_property_for_each_u32' - 'pci_bus_for_each_resource' + - 'pci_doe_for_each_off' - 'pcl_for_each_chunk' - 'pcl_for_each_segment' - 'pcm_for_each_format' diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl index 7c2b846521f3..8494ef27e8d2 100644 --- a/Documentation/ABI/testing/sysfs-bus-cxl +++ b/Documentation/ABI/testing/sysfs-bus-cxl @@ -7,6 +7,7 @@ Description: all descendant memdevs for unbind. Writing '1' to this attribute flushes that work. + What: /sys/bus/cxl/devices/memX/firmware_version Date: December, 2020 KernelVersion: v5.12 @@ -16,6 +17,7 @@ Description: Memory Device Output Payload in the CXL-2.0 specification. + What: /sys/bus/cxl/devices/memX/ram/size Date: December, 2020 KernelVersion: v5.12 @@ -25,6 +27,7 @@ Description: identically named field in the Identify Memory Device Output Payload in the CXL-2.0 specification. + What: /sys/bus/cxl/devices/memX/pmem/size Date: December, 2020 KernelVersion: v5.12 @@ -34,6 +37,7 @@ Description: identically named field in the Identify Memory Device Output Payload in the CXL-2.0 specification. + What: /sys/bus/cxl/devices/memX/serial Date: January, 2022 KernelVersion: v5.18 @@ -43,6 +47,7 @@ Description: capability. Mandatory for CXL devices, see CXL 2.0 8.1.12.2 Memory Device PCIe Capabilities and Extended Capabilities. + What: /sys/bus/cxl/devices/memX/numa_node Date: January, 2022 KernelVersion: v5.18 @@ -52,114 +57,334 @@ Description: host PCI device for this memory device, emit the CPU node affinity for this device. + What: /sys/bus/cxl/devices/*/devtype Date: June, 2021 KernelVersion: v5.14 Contact: linux-cxl@vger.kernel.org Description: - CXL device objects export the devtype attribute which mirrors - the same value communicated in the DEVTYPE environment variable - for uevents for devices on the "cxl" bus. + (RO) CXL device objects export the devtype attribute which + mirrors the same value communicated in the DEVTYPE environment + variable for uevents for devices on the "cxl" bus. + What: /sys/bus/cxl/devices/*/modalias Date: December, 2021 KernelVersion: v5.18 Contact: linux-cxl@vger.kernel.org Description: - CXL device objects export the modalias attribute which mirrors - the same value communicated in the MODALIAS environment variable - for uevents for devices on the "cxl" bus. + (RO) CXL device objects export the modalias attribute which + mirrors the same value communicated in the MODALIAS environment + variable for uevents for devices on the "cxl" bus. + What: /sys/bus/cxl/devices/portX/uport Date: June, 2021 KernelVersion: v5.14 Contact: linux-cxl@vger.kernel.org Description: - CXL port objects are enumerated from either a platform firmware - device (ACPI0017 and ACPI0016) or PCIe switch upstream port with - CXL component registers. The 'uport' symlink connects the CXL - portX object to the device that published the CXL port + (RO) CXL port objects are enumerated from either a platform + firmware device (ACPI0017 and ACPI0016) or PCIe switch upstream + port with CXL component registers. The 'uport' symlink connects + the CXL portX object to the device that published the CXL port capability. + What: /sys/bus/cxl/devices/portX/dportY Date: June, 2021 KernelVersion: v5.14 Contact: linux-cxl@vger.kernel.org Description: - CXL port objects are enumerated from either a platform firmware - device (ACPI0017 and ACPI0016) or PCIe switch upstream port with - CXL component registers. The 'dportY' symlink identifies one or - more downstream ports that the upstream port may target in its - decode of CXL memory resources. The 'Y' integer reflects the - hardware port unique-id used in the hardware decoder target - list. + (RO) CXL port objects are enumerated from either a platform + firmware device (ACPI0017 and ACPI0016) or PCIe switch upstream + port with CXL component registers. The 'dportY' symlink + identifies one or more downstream ports that the upstream port + may target in its decode of CXL memory resources. The 'Y' + integer reflects the hardware port unique-id used in the + hardware decoder target list. + What: /sys/bus/cxl/devices/decoderX.Y Date: June, 2021 KernelVersion: v5.14 Contact: linux-cxl@vger.kernel.org Description: - CXL decoder objects are enumerated from either a platform + (RO) CXL decoder objects are enumerated from either a platform firmware description, or a CXL HDM decoder register set in a PCIe device (see CXL 2.0 section 8.2.5.12 CXL HDM Decoder Capability Structure). The 'X' in decoderX.Y represents the cxl_port container of this decoder, and 'Y' represents the instance id of a given decoder resource. + What: /sys/bus/cxl/devices/decoderX.Y/{start,size} Date: June, 2021 KernelVersion: v5.14 Contact: linux-cxl@vger.kernel.org Description: - The 'start' and 'size' attributes together convey the physical - address base and number of bytes mapped in the decoder's decode - window. For decoders of devtype "cxl_decoder_root" the address - range is fixed. For decoders of devtype "cxl_decoder_switch" the - address is bounded by the decode range of the cxl_port ancestor - of the decoder's cxl_port, and dynamically updates based on the - active memory regions in that address space. + (RO) The 'start' and 'size' attributes together convey the + physical address base and number of bytes mapped in the + decoder's decode window. For decoders of devtype + "cxl_decoder_root" the address range is fixed. For decoders of + devtype "cxl_decoder_switch" the address is bounded by the + decode range of the cxl_port ancestor of the decoder's cxl_port, + and dynamically updates based on the active memory regions in + that address space. + What: /sys/bus/cxl/devices/decoderX.Y/locked Date: June, 2021 KernelVersion: v5.14 Contact: linux-cxl@vger.kernel.org Description: - CXL HDM decoders have the capability to lock the configuration - until the next device reset. For decoders of devtype - "cxl_decoder_root" there is no standard facility to unlock them. - For decoders of devtype "cxl_decoder_switch" a secondary bus - reset, of the PCIe bridge that provides the bus for this - decoders uport, unlocks / resets the decoder. + (RO) CXL HDM decoders have the capability to lock the + configuration until the next device reset. For decoders of + devtype "cxl_decoder_root" there is no standard facility to + unlock them. For decoders of devtype "cxl_decoder_switch" a + secondary bus reset, of the PCIe bridge that provides the bus + for this decoders uport, unlocks / resets the decoder. + What: /sys/bus/cxl/devices/decoderX.Y/target_list Date: June, 2021 KernelVersion: v5.14 Contact: linux-cxl@vger.kernel.org Description: - Display a comma separated list of the current decoder target - configuration. The list is ordered by the current configured - interleave order of the decoder's dport instances. Each entry in - the list is a dport id. + (RO) Display a comma separated list of the current decoder + target configuration. The list is ordered by the current + configured interleave order of the decoder's dport instances. + Each entry in the list is a dport id. + What: /sys/bus/cxl/devices/decoderX.Y/cap_{pmem,ram,type2,type3} Date: June, 2021 KernelVersion: v5.14 Contact: linux-cxl@vger.kernel.org Description: - When a CXL decoder is of devtype "cxl_decoder_root", it + (RO) When a CXL decoder is of devtype "cxl_decoder_root", it represents a fixed memory window identified by platform firmware. A fixed window may only support a subset of memory types. The 'cap_*' attributes indicate whether persistent memory, volatile memory, accelerator memory, and / or expander memory may be mapped behind this decoder's memory window. + What: /sys/bus/cxl/devices/decoderX.Y/target_type Date: June, 2021 KernelVersion: v5.14 Contact: linux-cxl@vger.kernel.org Description: - When a CXL decoder is of devtype "cxl_decoder_switch", it can - optionally decode either accelerator memory (type-2) or expander - memory (type-3). The 'target_type' attribute indicates the - current setting which may dynamically change based on what + (RO) When a CXL decoder is of devtype "cxl_decoder_switch", it + can optionally decode either accelerator memory (type-2) or + expander memory (type-3). The 'target_type' attribute indicates + the current setting which may dynamically change based on what memory regions are activated in this decode hierarchy. + + +What: /sys/bus/cxl/devices/endpointX/CDAT +Date: July, 2022 +KernelVersion: v5.20 +Contact: linux-cxl@vger.kernel.org +Description: + (RO) If this sysfs entry is not present no DOE mailbox was + found to support CDAT data. If it is present and the length of + the data is 0 reading the CDAT data failed. Otherwise the CDAT + data is reported. + + +What: /sys/bus/cxl/devices/decoderX.Y/mode +Date: May, 2022 +KernelVersion: v5.20 +Contact: linux-cxl@vger.kernel.org +Description: + (RW) When a CXL decoder is of devtype "cxl_decoder_endpoint" it + translates from a host physical address range, to a device local + address range. Device-local address ranges are further split + into a 'ram' (volatile memory) range and 'pmem' (persistent + memory) range. The 'mode' attribute emits one of 'ram', 'pmem', + 'mixed', or 'none'. The 'mixed' indication is for error cases + when a decoder straddles the volatile/persistent partition + boundary, and 'none' indicates the decoder is not actively + decoding, or no DPA allocation policy has been set. + + 'mode' can be written, when the decoder is in the 'disabled' + state, with either 'ram' or 'pmem' to set the boundaries for the + next allocation. + + +What: /sys/bus/cxl/devices/decoderX.Y/dpa_resource +Date: May, 2022 +KernelVersion: v5.20 +Contact: linux-cxl@vger.kernel.org +Description: + (RO) When a CXL decoder is of devtype "cxl_decoder_endpoint", + and its 'dpa_size' attribute is non-zero, this attribute + indicates the device physical address (DPA) base address of the + allocation. + + +What: /sys/bus/cxl/devices/decoderX.Y/dpa_size +Date: May, 2022 +KernelVersion: v5.20 +Contact: linux-cxl@vger.kernel.org +Description: + (RW) When a CXL decoder is of devtype "cxl_decoder_endpoint" it + translates from a host physical address range, to a device local + address range. The range, base address plus length in bytes, of + DPA allocated to this decoder is conveyed in these 2 attributes. + Allocations can be mutated as long as the decoder is in the + disabled state. A write to 'dpa_size' releases the previous DPA + allocation and then attempts to allocate from the free capacity + in the device partition referred to by 'decoderX.Y/mode'. + Allocate and free requests can only be performed on the highest + instance number disabled decoder with non-zero size. I.e. + allocations are enforced to occur in increasing 'decoderX.Y/id' + order and frees are enforced to occur in decreasing + 'decoderX.Y/id' order. + + +What: /sys/bus/cxl/devices/decoderX.Y/interleave_ways +Date: May, 2022 +KernelVersion: v5.20 +Contact: linux-cxl@vger.kernel.org +Description: + (RO) The number of targets across which this decoder's host + physical address (HPA) memory range is interleaved. The device + maps every Nth block of HPA (of size == + 'interleave_granularity') to consecutive DPA addresses. The + decoder's position in the interleave is determined by the + device's (endpoint or switch) switch ancestry. For root + decoders their interleave is specified by platform firmware and + they only specify a downstream target order for host bridges. + + +What: /sys/bus/cxl/devices/decoderX.Y/interleave_granularity +Date: May, 2022 +KernelVersion: v5.20 +Contact: linux-cxl@vger.kernel.org +Description: + (RO) The number of consecutive bytes of host physical address + space this decoder claims at address N before the decode rotates + to the next target in the interleave at address N + + interleave_granularity (assuming N is aligned to + interleave_granularity). + + +What: /sys/bus/cxl/devices/decoderX.Y/create_pmem_region +Date: May, 2022 +KernelVersion: v5.20 +Contact: linux-cxl@vger.kernel.org +Description: + (RW) Write a string in the form 'regionZ' to start the process + of defining a new persistent memory region (interleave-set) + within the decode range bounded by root decoder 'decoderX.Y'. + The value written must match the current value returned from + reading this attribute. An atomic compare exchange operation is + done on write to assign the requested id to a region and + allocate the region-id for the next creation attempt. EBUSY is + returned if the region name written does not match the current + cached value. + + +What: /sys/bus/cxl/devices/decoderX.Y/delete_region +Date: May, 2022 +KernelVersion: v5.20 +Contact: linux-cxl@vger.kernel.org +Description: + (WO) Write a string in the form 'regionZ' to delete that region, + provided it is currently idle / not bound to a driver. + + +What: /sys/bus/cxl/devices/regionZ/uuid +Date: May, 2022 +KernelVersion: v5.20 +Contact: linux-cxl@vger.kernel.org +Description: + (RW) Write a unique identifier for the region. This field must + be set for persistent regions and it must not conflict with the + UUID of another region. + + +What: /sys/bus/cxl/devices/regionZ/interleave_granularity +Date: May, 2022 +KernelVersion: v5.20 +Contact: linux-cxl@vger.kernel.org +Description: + (RW) Set the number of consecutive bytes each device in the + interleave set will claim. The possible interleave granularity + values are determined by the CXL spec and the participating + devices. + + +What: /sys/bus/cxl/devices/regionZ/interleave_ways +Date: May, 2022 +KernelVersion: v5.20 +Contact: linux-cxl@vger.kernel.org +Description: + (RW) Configures the number of devices participating in the + region is set by writing this value. Each device will provide + 1/interleave_ways of storage for the region. + + +What: /sys/bus/cxl/devices/regionZ/size +Date: May, 2022 +KernelVersion: v5.20 +Contact: linux-cxl@vger.kernel.org +Description: + (RW) System physical address space to be consumed by the region. + When written trigger the driver to allocate space out of the + parent root decoder's address space. When read the size of the + address space is reported and should match the span of the + region's resource attribute. Size shall be set after the + interleave configuration parameters. Once set it cannot be + changed, only freed by writing 0. The kernel makes no guarantees + that data is maintained over an address space freeing event, and + there is no guarantee that a free followed by an allocate + results in the same address being allocated. + + +What: /sys/bus/cxl/devices/regionZ/resource +Date: May, 2022 +KernelVersion: v5.20 +Contact: linux-cxl@vger.kernel.org +Description: + (RO) A region is a contiguous partition of a CXL root decoder + address space. Region capacity is allocated by writing to the + size attribute, the resulting physical address space determined + by the driver is reflected here. It is therefore not useful to + read this before writing a value to the size attribute. + + +What: /sys/bus/cxl/devices/regionZ/target[0..N] +Date: May, 2022 +KernelVersion: v5.20 +Contact: linux-cxl@vger.kernel.org +Description: + (RW) Write an endpoint decoder object name to 'targetX' where X + is the intended position of the endpoint device in the region + interleave and N is the 'interleave_ways' setting for the + region. ENXIO is returned if the write results in an impossible + to map decode scenario, like the endpoint is unreachable at that + position relative to the root decoder interleave. EBUSY is + returned if the position in the region is already occupied, or + if the region is not in a state to accept interleave + configuration changes. EINVAL is returned if the object name is + not an endpoint decoder. Once all positions have been + successfully written a final validation for decode conflicts is + performed before activating the region. + + +What: /sys/bus/cxl/devices/regionZ/commit +Date: May, 2022 +KernelVersion: v5.20 +Contact: linux-cxl@vger.kernel.org +Description: + (RW) Write a boolean 'true' string value to this attribute to + trigger the region to transition from the software programmed + state to the actively decoding in hardware state. The commit + operation in addition to validating that the region is in proper + configured state, validates that the decoders are being + committed in spec mandated order (last committed decoder id + + 1), and checks that the hardware accepts the commit request. + Reading this value indicates whether the region is committed or + not. diff --git a/Documentation/driver-api/cxl/memory-devices.rst b/Documentation/driver-api/cxl/memory-devices.rst index db476bb170b6..5149ecdc53c7 100644 --- a/Documentation/driver-api/cxl/memory-devices.rst +++ b/Documentation/driver-api/cxl/memory-devices.rst @@ -362,6 +362,14 @@ CXL Core .. kernel-doc:: drivers/cxl/core/mbox.c :doc: cxl mbox +CXL Regions +----------- +.. kernel-doc:: drivers/cxl/core/region.c + :doc: cxl core region + +.. kernel-doc:: drivers/cxl/core/region.c + :identifiers: + External Interfaces =================== diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 7b0d286bf9ba..01772e79fd93 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -55,6 +55,7 @@ int memory_add_physaddr_to_nid(u64 start) { return hot_add_scn_to_nid(start); } +EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif int __weak create_section_mapping(unsigned long start, unsigned long end, diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig index f64e3984689f..768ced3d6fe8 100644 --- a/drivers/cxl/Kconfig +++ b/drivers/cxl/Kconfig @@ -2,6 +2,7 @@ menuconfig CXL_BUS tristate "CXL (Compute Express Link) Devices Support" depends on PCI + select PCI_DOE help CXL is a bus that is electrically compatible with PCI Express, but layers three protocols on that signalling (CXL.io, CXL.cache, and @@ -102,4 +103,12 @@ config CXL_SUSPEND def_bool y depends on SUSPEND && CXL_MEM +config CXL_REGION + bool + default CXL_BUS + # For MAX_PHYSMEM_BITS + depends on SPARSEMEM + select MEMREGION + select GET_FREE_REGION + endif diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index 40286f5df812..fb649683dd3a 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -9,10 +9,6 @@ #include "cxlpci.h" #include "cxl.h" -/* Encode defined in CXL 2.0 8.2.5.12.7 HDM Decoder Control Register */ -#define CFMWS_INTERLEAVE_WAYS(x) (1 << (x)->interleave_ways) -#define CFMWS_INTERLEAVE_GRANULARITY(x) ((x)->granularity + 8) - static unsigned long cfmws_to_decoder_flags(int restrictions) { unsigned long flags = CXL_DECODER_F_ENABLE; @@ -34,7 +30,8 @@ static unsigned long cfmws_to_decoder_flags(int restrictions) static int cxl_acpi_cfmws_verify(struct device *dev, struct acpi_cedt_cfmws *cfmws) { - int expected_len; + int rc, expected_len; + unsigned int ways; if (cfmws->interleave_arithmetic != ACPI_CEDT_CFMWS_ARITHMETIC_MODULO) { dev_err(dev, "CFMWS Unsupported Interleave Arithmetic\n"); @@ -51,14 +48,14 @@ static int cxl_acpi_cfmws_verify(struct device *dev, return -EINVAL; } - if (CFMWS_INTERLEAVE_WAYS(cfmws) > CXL_DECODER_MAX_INTERLEAVE) { - dev_err(dev, "CFMWS Interleave Ways (%d) too large\n", - CFMWS_INTERLEAVE_WAYS(cfmws)); + rc = cxl_to_ways(cfmws->interleave_ways, &ways); + if (rc) { + dev_err(dev, "CFMWS Interleave Ways (%d) invalid\n", + cfmws->interleave_ways); return -EINVAL; } - expected_len = struct_size((cfmws), interleave_targets, - CFMWS_INTERLEAVE_WAYS(cfmws)); + expected_len = struct_size(cfmws, interleave_targets, ways); if (cfmws->header.length < expected_len) { dev_err(dev, "CFMWS length %d less than expected %d\n", @@ -76,6 +73,8 @@ static int cxl_acpi_cfmws_verify(struct device *dev, struct cxl_cfmws_context { struct device *dev; struct cxl_port *root_port; + struct resource *cxl_res; + int id; }; static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg, @@ -84,10 +83,14 @@ static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg, int target_map[CXL_DECODER_MAX_INTERLEAVE]; struct cxl_cfmws_context *ctx = arg; struct cxl_port *root_port = ctx->root_port; + struct resource *cxl_res = ctx->cxl_res; + struct cxl_root_decoder *cxlrd; struct device *dev = ctx->dev; struct acpi_cedt_cfmws *cfmws; struct cxl_decoder *cxld; - int rc, i; + unsigned int ways, i, ig; + struct resource *res; + int rc; cfmws = (struct acpi_cedt_cfmws *) header; @@ -99,19 +102,51 @@ static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg, return 0; } - for (i = 0; i < CFMWS_INTERLEAVE_WAYS(cfmws); i++) + rc = cxl_to_ways(cfmws->interleave_ways, &ways); + if (rc) + return rc; + rc = cxl_to_granularity(cfmws->granularity, &ig); + if (rc) + return rc; + for (i = 0; i < ways; i++) target_map[i] = cfmws->interleave_targets[i]; - cxld = cxl_root_decoder_alloc(root_port, CFMWS_INTERLEAVE_WAYS(cfmws)); - if (IS_ERR(cxld)) + res = kzalloc(sizeof(*res), GFP_KERNEL); + if (!res) + return -ENOMEM; + + res->name = kasprintf(GFP_KERNEL, "CXL Window %d", ctx->id++); + if (!res->name) + goto err_name; + + res->start = cfmws->base_hpa; + res->end = cfmws->base_hpa + cfmws->window_size - 1; + res->flags = IORESOURCE_MEM; + + /* add to the local resource tracking to establish a sort order */ + rc = insert_resource(cxl_res, res); + if (rc) + goto err_insert; + + cxlrd = cxl_root_decoder_alloc(root_port, ways); + if (IS_ERR(cxlrd)) return 0; + cxld = &cxlrd->cxlsd.cxld; cxld->flags = cfmws_to_decoder_flags(cfmws->restrictions); cxld->target_type = CXL_DECODER_EXPANDER; - cxld->platform_res = (struct resource)DEFINE_RES_MEM(cfmws->base_hpa, - cfmws->window_size); - cxld->interleave_ways = CFMWS_INTERLEAVE_WAYS(cfmws); - cxld->interleave_granularity = CFMWS_INTERLEAVE_GRANULARITY(cfmws); + cxld->hpa_range = (struct range) { + .start = res->start, + .end = res->end, + }; + cxld->interleave_ways = ways; + /* + * Minimize the x1 granularity to advertise support for any + * valid region granularity + */ + if (ways == 1) + ig = CXL_DECODER_MIN_GRANULARITY; + cxld->interleave_granularity = ig; rc = cxl_decoder_add(cxld, target_map); if (rc) @@ -119,15 +154,22 @@ static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg, else rc = cxl_decoder_autoremove(dev, cxld); if (rc) { - dev_err(dev, "Failed to add decoder for %pr\n", - &cxld->platform_res); + dev_err(dev, "Failed to add decode range [%#llx - %#llx]\n", + cxld->hpa_range.start, cxld->hpa_range.end); return 0; } - dev_dbg(dev, "add: %s node: %d range %pr\n", dev_name(&cxld->dev), - phys_to_target_node(cxld->platform_res.start), - &cxld->platform_res); + dev_dbg(dev, "add: %s node: %d range [%#llx - %#llx]\n", + dev_name(&cxld->dev), + phys_to_target_node(cxld->hpa_range.start), + cxld->hpa_range.start, cxld->hpa_range.end); return 0; + +err_insert: + kfree(res->name); +err_name: + kfree(res); + return -ENOMEM; } __mock struct acpi_device *to_cxl_host_bridge(struct device *host, @@ -175,8 +217,7 @@ static int add_host_bridge_uport(struct device *match, void *arg) if (rc) return rc; - port = devm_cxl_add_port(host, match, dport->component_reg_phys, - root_port); + port = devm_cxl_add_port(host, match, dport->component_reg_phys, dport); if (IS_ERR(port)) return PTR_ERR(port); dev_dbg(host, "%s: add: %s\n", dev_name(match), dev_name(&port->dev)); @@ -282,9 +323,127 @@ static void cxl_acpi_lock_reset_class(void *dev) device_lock_reset_class(dev); } +static void del_cxl_resource(struct resource *res) +{ + kfree(res->name); + kfree(res); +} + +static void cxl_set_public_resource(struct resource *priv, struct resource *pub) +{ + priv->desc = (unsigned long) pub; +} + +static struct resource *cxl_get_public_resource(struct resource *priv) +{ + return (struct resource *) priv->desc; +} + +static void remove_cxl_resources(void *data) +{ + struct resource *res, *next, *cxl = data; + + for (res = cxl->child; res; res = next) { + struct resource *victim = cxl_get_public_resource(res); + + next = res->sibling; + remove_resource(res); + + if (victim) { + remove_resource(victim); + kfree(victim); + } + + del_cxl_resource(res); + } +} + +/** + * add_cxl_resources() - reflect CXL fixed memory windows in iomem_resource + * @cxl_res: A standalone resource tree where each CXL window is a sibling + * + * Walk each CXL window in @cxl_res and add it to iomem_resource potentially + * expanding its boundaries to ensure that any conflicting resources become + * children. If a window is expanded it may then conflict with a another window + * entry and require the window to be truncated or trimmed. Consider this + * situation: + * + * |-- "CXL Window 0" --||----- "CXL Window 1" -----| + * |--------------- "System RAM" -------------| + * + * ...where platform firmware has established as System RAM resource across 2 + * windows, but has left some portion of window 1 for dynamic CXL region + * provisioning. In this case "Window 0" will span the entirety of the "System + * RAM" span, and "CXL Window 1" is truncated to the remaining tail past the end + * of that "System RAM" resource. + */ +static int add_cxl_resources(struct resource *cxl_res) +{ + struct resource *res, *new, *next; + + for (res = cxl_res->child; res; res = next) { + new = kzalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return -ENOMEM; + new->name = res->name; + new->start = res->start; + new->end = res->end; + new->flags = IORESOURCE_MEM; + new->desc = IORES_DESC_CXL; + + /* + * Record the public resource in the private cxl_res tree for + * later removal. + */ + cxl_set_public_resource(res, new); + + insert_resource_expand_to_fit(&iomem_resource, new); + + next = res->sibling; + while (next && resource_overlaps(new, next)) { + if (resource_contains(new, next)) { + struct resource *_next = next->sibling; + + remove_resource(next); + del_cxl_resource(next); + next = _next; + } else + next->start = new->end + 1; + } + } + return 0; +} + +static int pair_cxl_resource(struct device *dev, void *data) +{ + struct resource *cxl_res = data; + struct resource *p; + + if (!is_root_decoder(dev)) + return 0; + + for (p = cxl_res->child; p; p = p->sibling) { + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev); + struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld; + struct resource res = { + .start = cxld->hpa_range.start, + .end = cxld->hpa_range.end, + .flags = IORESOURCE_MEM, + }; + + if (resource_contains(p, &res)) { + cxlrd->res = cxl_get_public_resource(p); + break; + } + } + + return 0; +} + static int cxl_acpi_probe(struct platform_device *pdev) { int rc; + struct resource *cxl_res; struct cxl_port *root_port; struct device *host = &pdev->dev; struct acpi_device *adev = ACPI_COMPANION(host); @@ -296,6 +455,14 @@ static int cxl_acpi_probe(struct platform_device *pdev) if (rc) return rc; + cxl_res = devm_kzalloc(host, sizeof(*cxl_res), GFP_KERNEL); + if (!cxl_res) + return -ENOMEM; + cxl_res->name = "CXL mem"; + cxl_res->start = 0; + cxl_res->end = -1; + cxl_res->flags = IORESOURCE_MEM; + root_port = devm_cxl_add_port(host, host, CXL_RESOURCE_NONE, NULL); if (IS_ERR(root_port)) return PTR_ERR(root_port); @@ -306,11 +473,28 @@ static int cxl_acpi_probe(struct platform_device *pdev) if (rc < 0) return rc; + rc = devm_add_action_or_reset(host, remove_cxl_resources, cxl_res); + if (rc) + return rc; + ctx = (struct cxl_cfmws_context) { .dev = host, .root_port = root_port, + .cxl_res = cxl_res, }; - acpi_table_parse_cedt(ACPI_CEDT_TYPE_CFMWS, cxl_parse_cfmws, &ctx); + rc = acpi_table_parse_cedt(ACPI_CEDT_TYPE_CFMWS, cxl_parse_cfmws, &ctx); + if (rc < 0) + return -ENXIO; + + rc = add_cxl_resources(cxl_res); + if (rc) + return rc; + + /* + * Populate the root decoders with their related iomem resource, + * if present + */ + device_for_each_child(&root_port->dev, cxl_res, pair_cxl_resource); /* * Root level scanned with host-bridge as dports, now scan host-bridges @@ -337,12 +521,19 @@ static const struct acpi_device_id cxl_acpi_ids[] = { }; MODULE_DEVICE_TABLE(acpi, cxl_acpi_ids); +static const struct platform_device_id cxl_test_ids[] = { + { "cxl_acpi" }, + { }, +}; +MODULE_DEVICE_TABLE(platform, cxl_test_ids); + static struct platform_driver cxl_acpi_driver = { .probe = cxl_acpi_probe, .driver = { .name = KBUILD_MODNAME, .acpi_match_table = cxl_acpi_ids, }, + .id_table = cxl_test_ids, }; module_platform_driver(cxl_acpi_driver); diff --git a/drivers/cxl/core/Makefile b/drivers/cxl/core/Makefile index 9d35085d25af..79c7257f4107 100644 --- a/drivers/cxl/core/Makefile +++ b/drivers/cxl/core/Makefile @@ -10,3 +10,4 @@ cxl_core-y += memdev.o cxl_core-y += mbox.o cxl_core-y += pci.o cxl_core-y += hdm.o +cxl_core-$(CONFIG_CXL_REGION) += region.o diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index 1a50c0fc399c..1d8f87be283f 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -9,6 +9,36 @@ extern const struct device_type cxl_nvdimm_type; extern struct attribute_group cxl_base_attribute_group; +#ifdef CONFIG_CXL_REGION +extern struct device_attribute dev_attr_create_pmem_region; +extern struct device_attribute dev_attr_delete_region; +extern struct device_attribute dev_attr_region; +extern const struct device_type cxl_pmem_region_type; +extern const struct device_type cxl_region_type; +void cxl_decoder_kill_region(struct cxl_endpoint_decoder *cxled); +#define CXL_REGION_ATTR(x) (&dev_attr_##x.attr) +#define CXL_REGION_TYPE(x) (&cxl_region_type) +#define SET_CXL_REGION_ATTR(x) (&dev_attr_##x.attr), +#define CXL_PMEM_REGION_TYPE(x) (&cxl_pmem_region_type) +int cxl_region_init(void); +void cxl_region_exit(void); +#else +static inline void cxl_decoder_kill_region(struct cxl_endpoint_decoder *cxled) +{ +} +static inline int cxl_region_init(void) +{ + return 0; +} +static inline void cxl_region_exit(void) +{ +} +#define CXL_REGION_ATTR(x) NULL +#define CXL_REGION_TYPE(x) NULL +#define SET_CXL_REGION_ATTR(x) +#define CXL_PMEM_REGION_TYPE(x) NULL +#endif + struct cxl_send_command; struct cxl_mem_query_commands; int cxl_query_cmd(struct cxl_memdev *cxlmd, @@ -17,9 +47,28 @@ int cxl_send_cmd(struct cxl_memdev *cxlmd, struct cxl_send_command __user *s); void __iomem *devm_cxl_iomap_block(struct device *dev, resource_size_t addr, resource_size_t length); +struct dentry *cxl_debugfs_create_dir(const char *dir); +int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled, + enum cxl_decoder_mode mode); +int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size); +int cxl_dpa_free(struct cxl_endpoint_decoder *cxled); +resource_size_t cxl_dpa_size(struct cxl_endpoint_decoder *cxled); +resource_size_t cxl_dpa_resource_start(struct cxl_endpoint_decoder *cxled); +extern struct rw_semaphore cxl_dpa_rwsem; + +bool is_switch_decoder(struct device *dev); +struct cxl_switch_decoder *to_cxl_switch_decoder(struct device *dev); +static inline struct cxl_ep *cxl_ep_load(struct cxl_port *port, + struct cxl_memdev *cxlmd) +{ + if (!port) + return NULL; + + return xa_load(&port->endpoints, (unsigned long)&cxlmd->dev); +} + int cxl_memdev_init(void); void cxl_memdev_exit(void); void cxl_mbox_init(void); -void cxl_mbox_exit(void); #endif /* __CXL_CORE_H__ */ diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c index bfc8ee876278..d1d2caea5c62 100644 --- a/drivers/cxl/core/hdm.c +++ b/drivers/cxl/core/hdm.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright(c) 2022 Intel Corporation. All rights reserved. */ #include <linux/io-64-nonatomic-hi-lo.h> +#include <linux/seq_file.h> #include <linux/device.h> #include <linux/delay.h> @@ -16,6 +17,8 @@ * for enumerating these registers and capabilities. */ +DECLARE_RWSEM(cxl_dpa_rwsem); + static int add_hdm_decoder(struct cxl_port *port, struct cxl_decoder *cxld, int *target_map) { @@ -46,20 +49,22 @@ static int add_hdm_decoder(struct cxl_port *port, struct cxl_decoder *cxld, */ int devm_cxl_add_passthrough_decoder(struct cxl_port *port) { - struct cxl_decoder *cxld; - struct cxl_dport *dport; + struct cxl_switch_decoder *cxlsd; + struct cxl_dport *dport = NULL; int single_port_map[1]; + unsigned long index; - cxld = cxl_switch_decoder_alloc(port, 1); - if (IS_ERR(cxld)) - return PTR_ERR(cxld); + cxlsd = cxl_switch_decoder_alloc(port, 1); + if (IS_ERR(cxlsd)) + return PTR_ERR(cxlsd); device_lock_assert(&port->dev); - dport = list_first_entry(&port->dports, typeof(*dport), list); + xa_for_each(&port->dports, index, dport) + break; single_port_map[0] = dport->port_id; - return add_hdm_decoder(port, cxld, single_port_map); + return add_hdm_decoder(port, &cxlsd->cxld, single_port_map); } EXPORT_SYMBOL_NS_GPL(devm_cxl_add_passthrough_decoder, CXL); @@ -124,47 +129,577 @@ struct cxl_hdm *devm_cxl_setup_hdm(struct cxl_port *port) return ERR_PTR(-ENXIO); } + dev_set_drvdata(dev, cxlhdm); + return cxlhdm; } EXPORT_SYMBOL_NS_GPL(devm_cxl_setup_hdm, CXL); -static int to_interleave_granularity(u32 ctrl) +static void __cxl_dpa_debug(struct seq_file *file, struct resource *r, int depth) +{ + unsigned long long start = r->start, end = r->end; + + seq_printf(file, "%*s%08llx-%08llx : %s\n", depth * 2, "", start, end, + r->name); +} + +void cxl_dpa_debug(struct seq_file *file, struct cxl_dev_state *cxlds) +{ + struct resource *p1, *p2; + + down_read(&cxl_dpa_rwsem); + for (p1 = cxlds->dpa_res.child; p1; p1 = p1->sibling) { + __cxl_dpa_debug(file, p1, 0); + for (p2 = p1->child; p2; p2 = p2->sibling) + __cxl_dpa_debug(file, p2, 1); + } + up_read(&cxl_dpa_rwsem); +} +EXPORT_SYMBOL_NS_GPL(cxl_dpa_debug, CXL); + +/* + * Must be called in a context that synchronizes against this decoder's + * port ->remove() callback (like an endpoint decoder sysfs attribute) + */ +static void __cxl_dpa_release(struct cxl_endpoint_decoder *cxled) +{ + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_port *port = cxled_to_port(cxled); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct resource *res = cxled->dpa_res; + resource_size_t skip_start; + + lockdep_assert_held_write(&cxl_dpa_rwsem); + + /* save @skip_start, before @res is released */ + skip_start = res->start - cxled->skip; + __release_region(&cxlds->dpa_res, res->start, resource_size(res)); + if (cxled->skip) + __release_region(&cxlds->dpa_res, skip_start, cxled->skip); + cxled->skip = 0; + cxled->dpa_res = NULL; + put_device(&cxled->cxld.dev); + port->hdm_end--; +} + +static void cxl_dpa_release(void *cxled) +{ + down_write(&cxl_dpa_rwsem); + __cxl_dpa_release(cxled); + up_write(&cxl_dpa_rwsem); +} + +/* + * Must be called from context that will not race port device + * unregistration, like decoder sysfs attribute methods + */ +static void devm_cxl_dpa_release(struct cxl_endpoint_decoder *cxled) +{ + struct cxl_port *port = cxled_to_port(cxled); + + lockdep_assert_held_write(&cxl_dpa_rwsem); + devm_remove_action(&port->dev, cxl_dpa_release, cxled); + __cxl_dpa_release(cxled); +} + +static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled, + resource_size_t base, resource_size_t len, + resource_size_t skipped) +{ + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_port *port = cxled_to_port(cxled); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct device *dev = &port->dev; + struct resource *res; + + lockdep_assert_held_write(&cxl_dpa_rwsem); + + if (!len) + goto success; + + if (cxled->dpa_res) { + dev_dbg(dev, "decoder%d.%d: existing allocation %pr assigned\n", + port->id, cxled->cxld.id, cxled->dpa_res); + return -EBUSY; + } + + if (port->hdm_end + 1 != cxled->cxld.id) { + /* + * Assumes alloc and commit order is always in hardware instance + * order per expectations from 8.2.5.12.20 Committing Decoder + * Programming that enforce decoder[m] committed before + * decoder[m+1] commit start. + */ + dev_dbg(dev, "decoder%d.%d: expected decoder%d.%d\n", port->id, + cxled->cxld.id, port->id, port->hdm_end + 1); + return -EBUSY; + } + + if (skipped) { + res = __request_region(&cxlds->dpa_res, base - skipped, skipped, + dev_name(&cxled->cxld.dev), 0); + if (!res) { + dev_dbg(dev, + "decoder%d.%d: failed to reserve skipped space\n", + port->id, cxled->cxld.id); + return -EBUSY; + } + } + res = __request_region(&cxlds->dpa_res, base, len, + dev_name(&cxled->cxld.dev), 0); + if (!res) { + dev_dbg(dev, "decoder%d.%d: failed to reserve allocation\n", + port->id, cxled->cxld.id); + if (skipped) + __release_region(&cxlds->dpa_res, base - skipped, + skipped); + return -EBUSY; + } + cxled->dpa_res = res; + cxled->skip = skipped; + + if (resource_contains(&cxlds->pmem_res, res)) + cxled->mode = CXL_DECODER_PMEM; + else if (resource_contains(&cxlds->ram_res, res)) + cxled->mode = CXL_DECODER_RAM; + else { + dev_dbg(dev, "decoder%d.%d: %pr mixed\n", port->id, + cxled->cxld.id, cxled->dpa_res); + cxled->mode = CXL_DECODER_MIXED; + } + +success: + port->hdm_end++; + get_device(&cxled->cxld.dev); + return 0; +} + +static int devm_cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled, + resource_size_t base, resource_size_t len, + resource_size_t skipped) { - int val = FIELD_GET(CXL_HDM_DECODER0_CTRL_IG_MASK, ctrl); + struct cxl_port *port = cxled_to_port(cxled); + int rc; + + down_write(&cxl_dpa_rwsem); + rc = __cxl_dpa_reserve(cxled, base, len, skipped); + up_write(&cxl_dpa_rwsem); + + if (rc) + return rc; - return 256 << val; + return devm_add_action_or_reset(&port->dev, cxl_dpa_release, cxled); } -static int to_interleave_ways(u32 ctrl) +resource_size_t cxl_dpa_size(struct cxl_endpoint_decoder *cxled) { - int val = FIELD_GET(CXL_HDM_DECODER0_CTRL_IW_MASK, ctrl); + resource_size_t size = 0; + + down_read(&cxl_dpa_rwsem); + if (cxled->dpa_res) + size = resource_size(cxled->dpa_res); + up_read(&cxl_dpa_rwsem); - switch (val) { - case 0 ... 4: - return 1 << val; - case 8 ... 10: - return 3 << (val - 8); + return size; +} + +resource_size_t cxl_dpa_resource_start(struct cxl_endpoint_decoder *cxled) +{ + resource_size_t base = -1; + + down_read(&cxl_dpa_rwsem); + if (cxled->dpa_res) + base = cxled->dpa_res->start; + up_read(&cxl_dpa_rwsem); + + return base; +} + +int cxl_dpa_free(struct cxl_endpoint_decoder *cxled) +{ + struct cxl_port *port = cxled_to_port(cxled); + struct device *dev = &cxled->cxld.dev; + int rc; + + down_write(&cxl_dpa_rwsem); + if (!cxled->dpa_res) { + rc = 0; + goto out; + } + if (cxled->cxld.region) { + dev_dbg(dev, "decoder assigned to: %s\n", + dev_name(&cxled->cxld.region->dev)); + rc = -EBUSY; + goto out; + } + if (cxled->cxld.flags & CXL_DECODER_F_ENABLE) { + dev_dbg(dev, "decoder enabled\n"); + rc = -EBUSY; + goto out; + } + if (cxled->cxld.id != port->hdm_end) { + dev_dbg(dev, "expected decoder%d.%d\n", port->id, + port->hdm_end); + rc = -EBUSY; + goto out; + } + devm_cxl_dpa_release(cxled); + rc = 0; +out: + up_write(&cxl_dpa_rwsem); + return rc; +} + +int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled, + enum cxl_decoder_mode mode) +{ + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct device *dev = &cxled->cxld.dev; + int rc; + + switch (mode) { + case CXL_DECODER_RAM: + case CXL_DECODER_PMEM: + break; default: + dev_dbg(dev, "unsupported mode: %d\n", mode); + return -EINVAL; + } + + down_write(&cxl_dpa_rwsem); + if (cxled->cxld.flags & CXL_DECODER_F_ENABLE) { + rc = -EBUSY; + goto out; + } + + /* + * Only allow modes that are supported by the current partition + * configuration + */ + if (mode == CXL_DECODER_PMEM && !resource_size(&cxlds->pmem_res)) { + dev_dbg(dev, "no available pmem capacity\n"); + rc = -ENXIO; + goto out; + } + if (mode == CXL_DECODER_RAM && !resource_size(&cxlds->ram_res)) { + dev_dbg(dev, "no available ram capacity\n"); + rc = -ENXIO; + goto out; + } + + cxled->mode = mode; + rc = 0; +out: + up_write(&cxl_dpa_rwsem); + + return rc; +} + +int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size) +{ + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + resource_size_t free_ram_start, free_pmem_start; + struct cxl_port *port = cxled_to_port(cxled); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct device *dev = &cxled->cxld.dev; + resource_size_t start, avail, skip; + struct resource *p, *last; + int rc; + + down_write(&cxl_dpa_rwsem); + if (cxled->cxld.region) { + dev_dbg(dev, "decoder attached to %s\n", + dev_name(&cxled->cxld.region->dev)); + rc = -EBUSY; + goto out; + } + + if (cxled->cxld.flags & CXL_DECODER_F_ENABLE) { + dev_dbg(dev, "decoder enabled\n"); + rc = -EBUSY; + goto out; + } + + for (p = cxlds->ram_res.child, last = NULL; p; p = p->sibling) + last = p; + if (last) + free_ram_start = last->end + 1; + else + free_ram_start = cxlds->ram_res.start; + + for (p = cxlds->pmem_res.child, last = NULL; p; p = p->sibling) + last = p; + if (last) + free_pmem_start = last->end + 1; + else + free_pmem_start = cxlds->pmem_res.start; + + if (cxled->mode == CXL_DECODER_RAM) { + start = free_ram_start; + avail = cxlds->ram_res.end - start + 1; + skip = 0; + } else if (cxled->mode == CXL_DECODER_PMEM) { + resource_size_t skip_start, skip_end; + + start = free_pmem_start; + avail = cxlds->pmem_res.end - start + 1; + skip_start = free_ram_start; + + /* + * If some pmem is already allocated, then that allocation + * already handled the skip. + */ + if (cxlds->pmem_res.child && + skip_start == cxlds->pmem_res.child->start) + skip_end = skip_start - 1; + else + skip_end = start - 1; + skip = skip_end - skip_start + 1; + } else { + dev_dbg(dev, "mode not set\n"); + rc = -EINVAL; + goto out; + } + + if (size > avail) { + dev_dbg(dev, "%pa exceeds available %s capacity: %pa\n", &size, + cxled->mode == CXL_DECODER_RAM ? "ram" : "pmem", + &avail); + rc = -ENOSPC; + goto out; + } + + rc = __cxl_dpa_reserve(cxled, start, size, skip); +out: + up_write(&cxl_dpa_rwsem); + + if (rc) + return rc; + + return devm_add_action_or_reset(&port->dev, cxl_dpa_release, cxled); +} + +static void cxld_set_interleave(struct cxl_decoder *cxld, u32 *ctrl) +{ + u16 eig; + u8 eiw; + + /* + * Input validation ensures these warns never fire, but otherwise + * suppress unititalized variable usage warnings. + */ + if (WARN_ONCE(ways_to_cxl(cxld->interleave_ways, &eiw), + "invalid interleave_ways: %d\n", cxld->interleave_ways)) + return; + if (WARN_ONCE(granularity_to_cxl(cxld->interleave_granularity, &eig), + "invalid interleave_granularity: %d\n", + cxld->interleave_granularity)) + return; + + u32p_replace_bits(ctrl, eig, CXL_HDM_DECODER0_CTRL_IG_MASK); + u32p_replace_bits(ctrl, eiw, CXL_HDM_DECODER0_CTRL_IW_MASK); + *ctrl |= CXL_HDM_DECODER0_CTRL_COMMIT; +} + +static void cxld_set_type(struct cxl_decoder *cxld, u32 *ctrl) +{ + u32p_replace_bits(ctrl, !!(cxld->target_type == 3), + CXL_HDM_DECODER0_CTRL_TYPE); +} + +static int cxlsd_set_targets(struct cxl_switch_decoder *cxlsd, u64 *tgt) +{ + struct cxl_dport **t = &cxlsd->target[0]; + int ways = cxlsd->cxld.interleave_ways; + + if (dev_WARN_ONCE(&cxlsd->cxld.dev, + ways > 8 || ways > cxlsd->nr_targets, + "ways: %d overflows targets: %d\n", ways, + cxlsd->nr_targets)) + return -ENXIO; + + *tgt = FIELD_PREP(GENMASK(7, 0), t[0]->port_id); + if (ways > 1) + *tgt |= FIELD_PREP(GENMASK(15, 8), t[1]->port_id); + if (ways > 2) + *tgt |= FIELD_PREP(GENMASK(23, 16), t[2]->port_id); + if (ways > 3) + *tgt |= FIELD_PREP(GENMASK(31, 24), t[3]->port_id); + if (ways > 4) + *tgt |= FIELD_PREP(GENMASK_ULL(39, 32), t[4]->port_id); + if (ways > 5) + *tgt |= FIELD_PREP(GENMASK_ULL(47, 40), t[5]->port_id); + if (ways > 6) + *tgt |= FIELD_PREP(GENMASK_ULL(55, 48), t[6]->port_id); + if (ways > 7) + *tgt |= FIELD_PREP(GENMASK_ULL(63, 56), t[7]->port_id); + + return 0; +} + +/* + * Per CXL 2.0 8.2.5.12.20 Committing Decoder Programming, hardware must set + * committed or error within 10ms, but just be generous with 20ms to account for + * clock skew and other marginal behavior + */ +#define COMMIT_TIMEOUT_MS 20 +static int cxld_await_commit(void __iomem *hdm, int id) +{ + u32 ctrl; + int i; + + for (i = 0; i < COMMIT_TIMEOUT_MS; i++) { + ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(id)); + if (FIELD_GET(CXL_HDM_DECODER0_CTRL_COMMIT_ERROR, ctrl)) { + ctrl &= ~CXL_HDM_DECODER0_CTRL_COMMIT; + writel(ctrl, hdm + CXL_HDM_DECODER0_CTRL_OFFSET(id)); + return -EIO; + } + if (FIELD_GET(CXL_HDM_DECODER0_CTRL_COMMITTED, ctrl)) + return 0; + fsleep(1000); + } + + return -ETIMEDOUT; +} + +static int cxl_decoder_commit(struct cxl_decoder *cxld) +{ + struct cxl_port *port = to_cxl_port(cxld->dev.parent); + struct cxl_hdm *cxlhdm = dev_get_drvdata(&port->dev); + void __iomem *hdm = cxlhdm->regs.hdm_decoder; + int id = cxld->id, rc; + u64 base, size; + u32 ctrl; + + if (cxld->flags & CXL_DECODER_F_ENABLE) + return 0; + + if (port->commit_end + 1 != id) { + dev_dbg(&port->dev, + "%s: out of order commit, expected decoder%d.%d\n", + dev_name(&cxld->dev), port->id, port->commit_end + 1); + return -EBUSY; + } + + down_read(&cxl_dpa_rwsem); + /* common decoder settings */ + ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(cxld->id)); + cxld_set_interleave(cxld, &ctrl); + cxld_set_type(cxld, &ctrl); + base = cxld->hpa_range.start; + size = range_len(&cxld->hpa_range); + + writel(upper_32_bits(base), hdm + CXL_HDM_DECODER0_BASE_HIGH_OFFSET(id)); + writel(lower_32_bits(base), hdm + CXL_HDM_DECODER0_BASE_LOW_OFFSET(id)); + writel(upper_32_bits(size), hdm + CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(id)); + writel(lower_32_bits(size), hdm + CXL_HDM_DECODER0_SIZE_LOW_OFFSET(id)); + + if (is_switch_decoder(&cxld->dev)) { + struct cxl_switch_decoder *cxlsd = + to_cxl_switch_decoder(&cxld->dev); + void __iomem *tl_hi = hdm + CXL_HDM_DECODER0_TL_HIGH(id); + void __iomem *tl_lo = hdm + CXL_HDM_DECODER0_TL_LOW(id); + u64 targets; + + rc = cxlsd_set_targets(cxlsd, &targets); + if (rc) { + dev_dbg(&port->dev, "%s: target configuration error\n", + dev_name(&cxld->dev)); + goto err; + } + + writel(upper_32_bits(targets), tl_hi); + writel(lower_32_bits(targets), tl_lo); + } else { + struct cxl_endpoint_decoder *cxled = + to_cxl_endpoint_decoder(&cxld->dev); + void __iomem *sk_hi = hdm + CXL_HDM_DECODER0_SKIP_HIGH(id); + void __iomem *sk_lo = hdm + CXL_HDM_DECODER0_SKIP_LOW(id); + + writel(upper_32_bits(cxled->skip), sk_hi); + writel(lower_32_bits(cxled->skip), sk_lo); + } + + writel(ctrl, hdm + CXL_HDM_DECODER0_CTRL_OFFSET(id)); + up_read(&cxl_dpa_rwsem); + + port->commit_end++; + rc = cxld_await_commit(hdm, cxld->id); +err: + if (rc) { + dev_dbg(&port->dev, "%s: error %d committing decoder\n", + dev_name(&cxld->dev), rc); + cxld->reset(cxld); + return rc; + } + cxld->flags |= CXL_DECODER_F_ENABLE; + + return 0; +} + +static int cxl_decoder_reset(struct cxl_decoder *cxld) +{ + struct cxl_port *port = to_cxl_port(cxld->dev.parent); + struct cxl_hdm *cxlhdm = dev_get_drvdata(&port->dev); + void __iomem *hdm = cxlhdm->regs.hdm_decoder; + int id = cxld->id; + u32 ctrl; + + if ((cxld->flags & CXL_DECODER_F_ENABLE) == 0) return 0; + + if (port->commit_end != id) { + dev_dbg(&port->dev, + "%s: out of order reset, expected decoder%d.%d\n", + dev_name(&cxld->dev), port->id, port->commit_end); + return -EBUSY; } + + down_read(&cxl_dpa_rwsem); + ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(id)); + ctrl &= ~CXL_HDM_DECODER0_CTRL_COMMIT; + writel(ctrl, hdm + CXL_HDM_DECODER0_CTRL_OFFSET(id)); + + writel(0, hdm + CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(id)); + writel(0, hdm + CXL_HDM_DECODER0_SIZE_LOW_OFFSET(id)); + writel(0, hdm + CXL_HDM_DECODER0_BASE_HIGH_OFFSET(id)); + writel(0, hdm + CXL_HDM_DECODER0_BASE_LOW_OFFSET(id)); + up_read(&cxl_dpa_rwsem); + + port->commit_end--; + cxld->flags &= ~CXL_DECODER_F_ENABLE; + + return 0; } static int init_hdm_decoder(struct cxl_port *port, struct cxl_decoder *cxld, - int *target_map, void __iomem *hdm, int which) + int *target_map, void __iomem *hdm, int which, + u64 *dpa_base) { - u64 size, base; + struct cxl_endpoint_decoder *cxled = NULL; + u64 size, base, skip, dpa_size; + bool committed; + u32 remainder; + int i, rc; u32 ctrl; - int i; union { u64 value; unsigned char target_id[8]; } target_list; + if (is_endpoint_decoder(&cxld->dev)) + cxled = to_cxl_endpoint_decoder(&cxld->dev); + ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(which)); base = ioread64_hi_lo(hdm + CXL_HDM_DECODER0_BASE_LOW_OFFSET(which)); size = ioread64_hi_lo(hdm + CXL_HDM_DECODER0_SIZE_LOW_OFFSET(which)); + committed = !!(ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED); + cxld->commit = cxl_decoder_commit; + cxld->reset = cxl_decoder_reset; - if (!(ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED)) + if (!committed) size = 0; if (base == U64_MAX || size == U64_MAX) { dev_warn(&port->dev, "decoder%d.%d: Invalid resource range\n", @@ -172,39 +707,77 @@ static int init_hdm_decoder(struct cxl_port *port, struct cxl_decoder *cxld, return -ENXIO; } - cxld->decoder_range = (struct range) { + cxld->hpa_range = (struct range) { .start = base, .end = base + size - 1, }; - /* switch decoders are always enabled if committed */ - if (ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED) { + /* decoders are enabled if committed */ + if (committed) { cxld->flags |= CXL_DECODER_F_ENABLE; if (ctrl & CXL_HDM_DECODER0_CTRL_LOCK) cxld->flags |= CXL_DECODER_F_LOCK; + if (FIELD_GET(CXL_HDM_DECODER0_CTRL_TYPE, ctrl)) + cxld->target_type = CXL_DECODER_EXPANDER; + else + cxld->target_type = CXL_DECODER_ACCELERATOR; + if (cxld->id != port->commit_end + 1) { + dev_warn(&port->dev, + "decoder%d.%d: Committed out of order\n", + port->id, cxld->id); + return -ENXIO; + } + port->commit_end = cxld->id; + } else { + /* unless / until type-2 drivers arrive, assume type-3 */ + if (FIELD_GET(CXL_HDM_DECODER0_CTRL_TYPE, ctrl) == 0) { + ctrl |= CXL_HDM_DECODER0_CTRL_TYPE; + writel(ctrl, hdm + CXL_HDM_DECODER0_CTRL_OFFSET(which)); + } + cxld->target_type = CXL_DECODER_EXPANDER; } - cxld->interleave_ways = to_interleave_ways(ctrl); - if (!cxld->interleave_ways) { + rc = cxl_to_ways(FIELD_GET(CXL_HDM_DECODER0_CTRL_IW_MASK, ctrl), + &cxld->interleave_ways); + if (rc) { dev_warn(&port->dev, "decoder%d.%d: Invalid interleave ways (ctrl: %#x)\n", port->id, cxld->id, ctrl); - return -ENXIO; + return rc; } - cxld->interleave_granularity = to_interleave_granularity(ctrl); + rc = cxl_to_granularity(FIELD_GET(CXL_HDM_DECODER0_CTRL_IG_MASK, ctrl), + &cxld->interleave_granularity); + if (rc) + return rc; - if (FIELD_GET(CXL_HDM_DECODER0_CTRL_TYPE, ctrl)) - cxld->target_type = CXL_DECODER_EXPANDER; - else - cxld->target_type = CXL_DECODER_ACCELERATOR; + if (!cxled) { + target_list.value = + ioread64_hi_lo(hdm + CXL_HDM_DECODER0_TL_LOW(which)); + for (i = 0; i < cxld->interleave_ways; i++) + target_map[i] = target_list.target_id[i]; - if (is_endpoint_decoder(&cxld->dev)) return 0; + } - target_list.value = - ioread64_hi_lo(hdm + CXL_HDM_DECODER0_TL_LOW(which)); - for (i = 0; i < cxld->interleave_ways; i++) - target_map[i] = target_list.target_id[i]; + if (!committed) + return 0; + dpa_size = div_u64_rem(size, cxld->interleave_ways, &remainder); + if (remainder) { + dev_err(&port->dev, + "decoder%d.%d: invalid committed configuration size: %#llx ways: %d\n", + port->id, cxld->id, size, cxld->interleave_ways); + return -ENXIO; + } + skip = ioread64_hi_lo(hdm + CXL_HDM_DECODER0_SKIP_LOW(which)); + rc = devm_cxl_dpa_reserve(cxled, *dpa_base + skip, dpa_size, skip); + if (rc) { + dev_err(&port->dev, + "decoder%d.%d: Failed to reserve DPA range %#llx - %#llx\n (%d)", + port->id, cxld->id, *dpa_base, + *dpa_base + dpa_size + skip - 1, rc); + return rc; + } + *dpa_base += dpa_size + skip; return 0; } @@ -216,7 +789,8 @@ int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm) { void __iomem *hdm = cxlhdm->regs.hdm_decoder; struct cxl_port *port = cxlhdm->port; - int i, committed, failed; + int i, committed; + u64 dpa_base = 0; u32 ctrl; /* @@ -236,27 +810,37 @@ int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm) if (committed != cxlhdm->decoder_count) msleep(20); - for (i = 0, failed = 0; i < cxlhdm->decoder_count; i++) { + for (i = 0; i < cxlhdm->decoder_count; i++) { int target_map[CXL_DECODER_MAX_INTERLEAVE] = { 0 }; int rc, target_count = cxlhdm->target_count; struct cxl_decoder *cxld; - if (is_cxl_endpoint(port)) - cxld = cxl_endpoint_decoder_alloc(port); - else - cxld = cxl_switch_decoder_alloc(port, target_count); - if (IS_ERR(cxld)) { - dev_warn(&port->dev, - "Failed to allocate the decoder\n"); - return PTR_ERR(cxld); + if (is_cxl_endpoint(port)) { + struct cxl_endpoint_decoder *cxled; + + cxled = cxl_endpoint_decoder_alloc(port); + if (IS_ERR(cxled)) { + dev_warn(&port->dev, + "Failed to allocate the decoder\n"); + return PTR_ERR(cxled); + } + cxld = &cxled->cxld; + } else { + struct cxl_switch_decoder *cxlsd; + + cxlsd = cxl_switch_decoder_alloc(port, target_count); + if (IS_ERR(cxlsd)) { + dev_warn(&port->dev, + "Failed to allocate the decoder\n"); + return PTR_ERR(cxlsd); + } + cxld = &cxlsd->cxld; } - rc = init_hdm_decoder(port, cxld, target_map, - cxlhdm->regs.hdm_decoder, i); + rc = init_hdm_decoder(port, cxld, target_map, hdm, i, &dpa_base); if (rc) { put_device(&cxld->dev); - failed++; - continue; + return rc; } rc = add_hdm_decoder(port, cxld, target_map); if (rc) { @@ -266,11 +850,6 @@ int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm) } } - if (failed == cxlhdm->decoder_count) { - dev_err(&port->dev, "No valid decoders found\n"); - return -ENXIO; - } - return 0; } EXPORT_SYMBOL_NS_GPL(devm_cxl_enumerate_decoders, CXL); diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c index cbf23beebebe..16176b9278b4 100644 --- a/drivers/cxl/core/mbox.c +++ b/drivers/cxl/core/mbox.c @@ -718,12 +718,7 @@ EXPORT_SYMBOL_NS_GPL(cxl_enumerate_cmds, CXL); */ static int cxl_mem_get_partition_info(struct cxl_dev_state *cxlds) { - struct cxl_mbox_get_partition_info { - __le64 active_volatile_cap; - __le64 active_persistent_cap; - __le64 next_volatile_cap; - __le64 next_persistent_cap; - } __packed pi; + struct cxl_mbox_get_partition_info pi; int rc; rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_GET_PARTITION_INFO, NULL, 0, @@ -773,15 +768,6 @@ int cxl_dev_state_identify(struct cxl_dev_state *cxlds) cxlds->partition_align_bytes = le64_to_cpu(id.partition_align) * CXL_CAPACITY_MULTIPLIER; - dev_dbg(cxlds->dev, - "Identify Memory Device\n" - " total_bytes = %#llx\n" - " volatile_only_bytes = %#llx\n" - " persistent_only_bytes = %#llx\n" - " partition_align_bytes = %#llx\n", - cxlds->total_bytes, cxlds->volatile_only_bytes, - cxlds->persistent_only_bytes, cxlds->partition_align_bytes); - cxlds->lsa_size = le32_to_cpu(id.lsa_size); memcpy(cxlds->firmware_version, id.fw_revision, sizeof(id.fw_revision)); @@ -789,42 +775,63 @@ int cxl_dev_state_identify(struct cxl_dev_state *cxlds) } EXPORT_SYMBOL_NS_GPL(cxl_dev_state_identify, CXL); -int cxl_mem_create_range_info(struct cxl_dev_state *cxlds) +static int add_dpa_res(struct device *dev, struct resource *parent, + struct resource *res, resource_size_t start, + resource_size_t size, const char *type) { int rc; - if (cxlds->partition_align_bytes == 0) { - cxlds->ram_range.start = 0; - cxlds->ram_range.end = cxlds->volatile_only_bytes - 1; - cxlds->pmem_range.start = cxlds->volatile_only_bytes; - cxlds->pmem_range.end = cxlds->volatile_only_bytes + - cxlds->persistent_only_bytes - 1; + res->name = type; + res->start = start; + res->end = start + size - 1; + res->flags = IORESOURCE_MEM; + if (resource_size(res) == 0) { + dev_dbg(dev, "DPA(%s): no capacity\n", res->name); return 0; } - - rc = cxl_mem_get_partition_info(cxlds); + rc = request_resource(parent, res); if (rc) { - dev_err(cxlds->dev, "Failed to query partition information\n"); + dev_err(dev, "DPA(%s): failed to track %pr (%d)\n", res->name, + res, rc); return rc; } - dev_dbg(cxlds->dev, - "Get Partition Info\n" - " active_volatile_bytes = %#llx\n" - " active_persistent_bytes = %#llx\n" - " next_volatile_bytes = %#llx\n" - " next_persistent_bytes = %#llx\n", - cxlds->active_volatile_bytes, cxlds->active_persistent_bytes, - cxlds->next_volatile_bytes, cxlds->next_persistent_bytes); + dev_dbg(dev, "DPA(%s): %pr\n", res->name, res); + + return 0; +} - cxlds->ram_range.start = 0; - cxlds->ram_range.end = cxlds->active_volatile_bytes - 1; +int cxl_mem_create_range_info(struct cxl_dev_state *cxlds) +{ + struct device *dev = cxlds->dev; + int rc; - cxlds->pmem_range.start = cxlds->active_volatile_bytes; - cxlds->pmem_range.end = - cxlds->active_volatile_bytes + cxlds->active_persistent_bytes - 1; + cxlds->dpa_res = + (struct resource)DEFINE_RES_MEM(0, cxlds->total_bytes); - return 0; + if (cxlds->partition_align_bytes == 0) { + rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->ram_res, 0, + cxlds->volatile_only_bytes, "ram"); + if (rc) + return rc; + return add_dpa_res(dev, &cxlds->dpa_res, &cxlds->pmem_res, + cxlds->volatile_only_bytes, + cxlds->persistent_only_bytes, "pmem"); + } + + rc = cxl_mem_get_partition_info(cxlds); + if (rc) { + dev_err(dev, "Failed to query partition information\n"); + return rc; + } + + rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->ram_res, 0, + cxlds->active_volatile_bytes, "ram"); + if (rc) + return rc; + return add_dpa_res(dev, &cxlds->dpa_res, &cxlds->pmem_res, + cxlds->active_volatile_bytes, + cxlds->active_persistent_bytes, "pmem"); } EXPORT_SYMBOL_NS_GPL(cxl_mem_create_range_info, CXL); @@ -845,19 +852,11 @@ struct cxl_dev_state *cxl_dev_state_create(struct device *dev) } EXPORT_SYMBOL_NS_GPL(cxl_dev_state_create, CXL); -static struct dentry *cxl_debugfs; - void __init cxl_mbox_init(void) { struct dentry *mbox_debugfs; - cxl_debugfs = debugfs_create_dir("cxl", NULL); - mbox_debugfs = debugfs_create_dir("mbox", cxl_debugfs); + mbox_debugfs = cxl_debugfs_create_dir("mbox"); debugfs_create_bool("raw_allow_all", 0600, mbox_debugfs, &cxl_raw_allow_all); } - -void cxl_mbox_exit(void) -{ - debugfs_remove_recursive(cxl_debugfs); -} diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c index f7cdcd33504a..20ce488a7754 100644 --- a/drivers/cxl/core/memdev.c +++ b/drivers/cxl/core/memdev.c @@ -68,7 +68,7 @@ static ssize_t ram_size_show(struct device *dev, struct device_attribute *attr, { struct cxl_memdev *cxlmd = to_cxl_memdev(dev); struct cxl_dev_state *cxlds = cxlmd->cxlds; - unsigned long long len = range_len(&cxlds->ram_range); + unsigned long long len = resource_size(&cxlds->ram_res); return sysfs_emit(buf, "%#llx\n", len); } @@ -81,7 +81,7 @@ static ssize_t pmem_size_show(struct device *dev, struct device_attribute *attr, { struct cxl_memdev *cxlmd = to_cxl_memdev(dev); struct cxl_dev_state *cxlds = cxlmd->cxlds; - unsigned long long len = range_len(&cxlds->pmem_range); + unsigned long long len = resource_size(&cxlds->pmem_res); return sysfs_emit(buf, "%#llx\n", len); } diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index c4c99ff7b55e..9240df53ed87 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -4,6 +4,7 @@ #include <linux/device.h> #include <linux/delay.h> #include <linux/pci.h> +#include <linux/pci-doe.h> #include <cxlpci.h> #include <cxlmem.h> #include <cxl.h> @@ -225,7 +226,6 @@ static int dvsec_range_allowed(struct device *dev, void *arg) { struct range *dev_range = arg; struct cxl_decoder *cxld; - struct range root_range; if (!is_root_decoder(dev)) return 0; @@ -237,12 +237,7 @@ static int dvsec_range_allowed(struct device *dev, void *arg) if (!(cxld->flags & CXL_DECODER_F_RAM)) return 0; - root_range = (struct range) { - .start = cxld->platform_res.start, - .end = cxld->platform_res.end, - }; - - return range_contains(&root_range, dev_range); + return range_contains(&cxld->hpa_range, dev_range); } static void disable_hdm(void *_cxlhdm) @@ -458,3 +453,175 @@ hdm_init: return 0; } EXPORT_SYMBOL_NS_GPL(cxl_hdm_decode_init, CXL); + +#define CXL_DOE_TABLE_ACCESS_REQ_CODE 0x000000ff +#define CXL_DOE_TABLE_ACCESS_REQ_CODE_READ 0 +#define CXL_DOE_TABLE_ACCESS_TABLE_TYPE 0x0000ff00 +#define CXL_DOE_TABLE_ACCESS_TABLE_TYPE_CDATA 0 +#define CXL_DOE_TABLE_ACCESS_ENTRY_HANDLE 0xffff0000 +#define CXL_DOE_TABLE_ACCESS_LAST_ENTRY 0xffff +#define CXL_DOE_PROTOCOL_TABLE_ACCESS 2 + +static struct pci_doe_mb *find_cdat_doe(struct device *uport) +{ + struct cxl_memdev *cxlmd; + struct cxl_dev_state *cxlds; + unsigned long index; + void *entry; + + cxlmd = to_cxl_memdev(uport); + cxlds = cxlmd->cxlds; + + xa_for_each(&cxlds->doe_mbs, index, entry) { + struct pci_doe_mb *cur = entry; + + if (pci_doe_supports_prot(cur, PCI_DVSEC_VENDOR_ID_CXL, + CXL_DOE_PROTOCOL_TABLE_ACCESS)) + return cur; + } + + return NULL; +} + +#define CDAT_DOE_REQ(entry_handle) \ + (FIELD_PREP(CXL_DOE_TABLE_ACCESS_REQ_CODE, \ + CXL_DOE_TABLE_ACCESS_REQ_CODE_READ) | \ + FIELD_PREP(CXL_DOE_TABLE_ACCESS_TABLE_TYPE, \ + CXL_DOE_TABLE_ACCESS_TABLE_TYPE_CDATA) | \ + FIELD_PREP(CXL_DOE_TABLE_ACCESS_ENTRY_HANDLE, (entry_handle))) + +static void cxl_doe_task_complete(struct pci_doe_task *task) +{ + complete(task->private); +} + +struct cdat_doe_task { + u32 request_pl; + u32 response_pl[32]; + struct completion c; + struct pci_doe_task task; +}; + +#define DECLARE_CDAT_DOE_TASK(req, cdt) \ +struct cdat_doe_task cdt = { \ + .c = COMPLETION_INITIALIZER_ONSTACK(cdt.c), \ + .request_pl = req, \ + .task = { \ + .prot.vid = PCI_DVSEC_VENDOR_ID_CXL, \ + .prot.type = CXL_DOE_PROTOCOL_TABLE_ACCESS, \ + .request_pl = &cdt.request_pl, \ + .request_pl_sz = sizeof(cdt.request_pl), \ + .response_pl = cdt.response_pl, \ + .response_pl_sz = sizeof(cdt.response_pl), \ + .complete = cxl_doe_task_complete, \ + .private = &cdt.c, \ + } \ +} + +static int cxl_cdat_get_length(struct device *dev, + struct pci_doe_mb *cdat_doe, + size_t *length) +{ + DECLARE_CDAT_DOE_TASK(CDAT_DOE_REQ(0), t); + int rc; + + rc = pci_doe_submit_task(cdat_doe, &t.task); + if (rc < 0) { + dev_err(dev, "DOE submit failed: %d", rc); + return rc; + } + wait_for_completion(&t.c); + if (t.task.rv < sizeof(u32)) + return -EIO; + + *length = t.response_pl[1]; + dev_dbg(dev, "CDAT length %zu\n", *length); + + return 0; +} + +static int cxl_cdat_read_table(struct device *dev, + struct pci_doe_mb *cdat_doe, + struct cxl_cdat *cdat) +{ + size_t length = cdat->length; + u32 *data = cdat->table; + int entry_handle = 0; + + do { + DECLARE_CDAT_DOE_TASK(CDAT_DOE_REQ(entry_handle), t); + size_t entry_dw; + u32 *entry; + int rc; + + rc = pci_doe_submit_task(cdat_doe, &t.task); + if (rc < 0) { + dev_err(dev, "DOE submit failed: %d", rc); + return rc; + } + wait_for_completion(&t.c); + /* 1 DW header + 1 DW data min */ + if (t.task.rv < (2 * sizeof(u32))) + return -EIO; + + /* Get the CXL table access header entry handle */ + entry_handle = FIELD_GET(CXL_DOE_TABLE_ACCESS_ENTRY_HANDLE, + t.response_pl[0]); + entry = t.response_pl + 1; + entry_dw = t.task.rv / sizeof(u32); + /* Skip Header */ + entry_dw -= 1; + entry_dw = min(length / sizeof(u32), entry_dw); + /* Prevent length < 1 DW from causing a buffer overflow */ + if (entry_dw) { + memcpy(data, entry, entry_dw * sizeof(u32)); + length -= entry_dw * sizeof(u32); + data += entry_dw; + } + } while (entry_handle != CXL_DOE_TABLE_ACCESS_LAST_ENTRY); + + return 0; +} + +/** + * read_cdat_data - Read the CDAT data on this port + * @port: Port to read data from + * + * This call will sleep waiting for responses from the DOE mailbox. + */ +void read_cdat_data(struct cxl_port *port) +{ + struct pci_doe_mb *cdat_doe; + struct device *dev = &port->dev; + struct device *uport = port->uport; + size_t cdat_length; + int rc; + + cdat_doe = find_cdat_doe(uport); + if (!cdat_doe) { + dev_dbg(dev, "No CDAT mailbox\n"); + return; + } + + port->cdat_available = true; + + if (cxl_cdat_get_length(dev, cdat_doe, &cdat_length)) { + dev_dbg(dev, "No CDAT length\n"); + return; + } + + port->cdat.table = devm_kzalloc(dev, cdat_length, GFP_KERNEL); + if (!port->cdat.table) + return; + + port->cdat.length = cdat_length; + rc = cxl_cdat_read_table(dev, cdat_doe, &port->cdat); + if (rc) { + /* Don't leave table data allocated on error */ + devm_kfree(dev, port->cdat.table); + port->cdat.table = NULL; + port->cdat.length = 0; + dev_err(dev, "CDAT data read error\n"); + } +} +EXPORT_SYMBOL_NS_GPL(read_cdat_data, CXL); diff --git a/drivers/cxl/core/pmem.c b/drivers/cxl/core/pmem.c index bec7cfb54ebf..1d12a8206444 100644 --- a/drivers/cxl/core/pmem.c +++ b/drivers/cxl/core/pmem.c @@ -62,9 +62,9 @@ static int match_nvdimm_bridge(struct device *dev, void *data) return is_cxl_nvdimm_bridge(dev); } -struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct cxl_nvdimm *cxl_nvd) +struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct device *start) { - struct cxl_port *port = find_cxl_root(&cxl_nvd->dev); + struct cxl_port *port = find_cxl_root(start); struct device *dev; if (!port) diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index dbce99bdffab..bffde862de0b 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -1,7 +1,9 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright(c) 2020 Intel Corporation. All rights reserved. */ #include <linux/io-64-nonatomic-lo-hi.h> +#include <linux/memregion.h> #include <linux/workqueue.h> +#include <linux/debugfs.h> #include <linux/device.h> #include <linux/module.h> #include <linux/pci.h> @@ -42,6 +44,8 @@ static int cxl_device_id(struct device *dev) return CXL_DEVICE_NVDIMM_BRIDGE; if (dev->type == &cxl_nvdimm_type) return CXL_DEVICE_NVDIMM; + if (dev->type == CXL_PMEM_REGION_TYPE()) + return CXL_DEVICE_PMEM_REGION; if (is_cxl_port(dev)) { if (is_cxl_root(to_cxl_port(dev))) return CXL_DEVICE_ROOT; @@ -49,6 +53,8 @@ static int cxl_device_id(struct device *dev) } if (is_cxl_memdev(dev)) return CXL_DEVICE_MEMORY_EXPANDER; + if (dev->type == CXL_REGION_TYPE()) + return CXL_DEVICE_REGION; return 0; } @@ -73,14 +79,8 @@ static ssize_t start_show(struct device *dev, struct device_attribute *attr, char *buf) { struct cxl_decoder *cxld = to_cxl_decoder(dev); - u64 start; - if (is_root_decoder(dev)) - start = cxld->platform_res.start; - else - start = cxld->decoder_range.start; - - return sysfs_emit(buf, "%#llx\n", start); + return sysfs_emit(buf, "%#llx\n", cxld->hpa_range.start); } static DEVICE_ATTR_ADMIN_RO(start); @@ -88,14 +88,8 @@ static ssize_t size_show(struct device *dev, struct device_attribute *attr, char *buf) { struct cxl_decoder *cxld = to_cxl_decoder(dev); - u64 size; - - if (is_root_decoder(dev)) - size = resource_size(&cxld->platform_res); - else - size = range_len(&cxld->decoder_range); - return sysfs_emit(buf, "%#llx\n", size); + return sysfs_emit(buf, "%#llx\n", range_len(&cxld->hpa_range)); } static DEVICE_ATTR_RO(size); @@ -131,20 +125,21 @@ static ssize_t target_type_show(struct device *dev, } static DEVICE_ATTR_RO(target_type); -static ssize_t emit_target_list(struct cxl_decoder *cxld, char *buf) +static ssize_t emit_target_list(struct cxl_switch_decoder *cxlsd, char *buf) { + struct cxl_decoder *cxld = &cxlsd->cxld; ssize_t offset = 0; int i, rc = 0; for (i = 0; i < cxld->interleave_ways; i++) { - struct cxl_dport *dport = cxld->target[i]; + struct cxl_dport *dport = cxlsd->target[i]; struct cxl_dport *next = NULL; if (!dport) break; if (i + 1 < cxld->interleave_ways) - next = cxld->target[i + 1]; + next = cxlsd->target[i + 1]; rc = sysfs_emit_at(buf, offset, "%d%s", dport->port_id, next ? "," : ""); if (rc < 0) @@ -158,15 +153,15 @@ static ssize_t emit_target_list(struct cxl_decoder *cxld, char *buf) static ssize_t target_list_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct cxl_decoder *cxld = to_cxl_decoder(dev); + struct cxl_switch_decoder *cxlsd = to_cxl_switch_decoder(dev); ssize_t offset; unsigned int seq; int rc; do { - seq = read_seqbegin(&cxld->target_lock); - rc = emit_target_list(cxld, buf); - } while (read_seqretry(&cxld->target_lock, seq)); + seq = read_seqbegin(&cxlsd->target_lock); + rc = emit_target_list(cxlsd, buf); + } while (read_seqretry(&cxlsd->target_lock, seq)); if (rc < 0) return rc; @@ -180,10 +175,121 @@ static ssize_t target_list_show(struct device *dev, } static DEVICE_ATTR_RO(target_list); +static ssize_t mode_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev); + + switch (cxled->mode) { + case CXL_DECODER_RAM: + return sysfs_emit(buf, "ram\n"); + case CXL_DECODER_PMEM: + return sysfs_emit(buf, "pmem\n"); + case CXL_DECODER_NONE: + return sysfs_emit(buf, "none\n"); + case CXL_DECODER_MIXED: + default: + return sysfs_emit(buf, "mixed\n"); + } +} + +static ssize_t mode_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev); + enum cxl_decoder_mode mode; + ssize_t rc; + + if (sysfs_streq(buf, "pmem")) + mode = CXL_DECODER_PMEM; + else if (sysfs_streq(buf, "ram")) + mode = CXL_DECODER_RAM; + else + return -EINVAL; + + rc = cxl_dpa_set_mode(cxled, mode); + if (rc) + return rc; + + return len; +} +static DEVICE_ATTR_RW(mode); + +static ssize_t dpa_resource_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev); + u64 base = cxl_dpa_resource_start(cxled); + + return sysfs_emit(buf, "%#llx\n", base); +} +static DEVICE_ATTR_RO(dpa_resource); + +static ssize_t dpa_size_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev); + resource_size_t size = cxl_dpa_size(cxled); + + return sysfs_emit(buf, "%pa\n", &size); +} + +static ssize_t dpa_size_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev); + unsigned long long size; + ssize_t rc; + + rc = kstrtoull(buf, 0, &size); + if (rc) + return rc; + + if (!IS_ALIGNED(size, SZ_256M)) + return -EINVAL; + + rc = cxl_dpa_free(cxled); + if (rc) + return rc; + + if (size == 0) + return len; + + rc = cxl_dpa_alloc(cxled, size); + if (rc) + return rc; + + return len; +} +static DEVICE_ATTR_RW(dpa_size); + +static ssize_t interleave_granularity_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct cxl_decoder *cxld = to_cxl_decoder(dev); + + return sysfs_emit(buf, "%d\n", cxld->interleave_granularity); +} + +static DEVICE_ATTR_RO(interleave_granularity); + +static ssize_t interleave_ways_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cxl_decoder *cxld = to_cxl_decoder(dev); + + return sysfs_emit(buf, "%d\n", cxld->interleave_ways); +} + +static DEVICE_ATTR_RO(interleave_ways); + static struct attribute *cxl_decoder_base_attrs[] = { &dev_attr_start.attr, &dev_attr_size.attr, &dev_attr_locked.attr, + &dev_attr_interleave_granularity.attr, + &dev_attr_interleave_ways.attr, NULL, }; @@ -197,11 +303,35 @@ static struct attribute *cxl_decoder_root_attrs[] = { &dev_attr_cap_type2.attr, &dev_attr_cap_type3.attr, &dev_attr_target_list.attr, + SET_CXL_REGION_ATTR(create_pmem_region) + SET_CXL_REGION_ATTR(delete_region) NULL, }; +static bool can_create_pmem(struct cxl_root_decoder *cxlrd) +{ + unsigned long flags = CXL_DECODER_F_TYPE3 | CXL_DECODER_F_PMEM; + + return (cxlrd->cxlsd.cxld.flags & flags) == flags; +} + +static umode_t cxl_root_decoder_visible(struct kobject *kobj, struct attribute *a, int n) +{ + struct device *dev = kobj_to_dev(kobj); + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev); + + if (a == CXL_REGION_ATTR(create_pmem_region) && !can_create_pmem(cxlrd)) + return 0; + + if (a == CXL_REGION_ATTR(delete_region) && !can_create_pmem(cxlrd)) + return 0; + + return a->mode; +} + static struct attribute_group cxl_decoder_root_attribute_group = { .attrs = cxl_decoder_root_attrs, + .is_visible = cxl_root_decoder_visible, }; static const struct attribute_group *cxl_decoder_root_attribute_groups[] = { @@ -214,6 +344,7 @@ static const struct attribute_group *cxl_decoder_root_attribute_groups[] = { static struct attribute *cxl_decoder_switch_attrs[] = { &dev_attr_target_type.attr, &dev_attr_target_list.attr, + SET_CXL_REGION_ATTR(region) NULL, }; @@ -230,6 +361,10 @@ static const struct attribute_group *cxl_decoder_switch_attribute_groups[] = { static struct attribute *cxl_decoder_endpoint_attrs[] = { &dev_attr_target_type.attr, + &dev_attr_mode.attr, + &dev_attr_dpa_size.attr, + &dev_attr_dpa_resource.attr, + SET_CXL_REGION_ATTR(region) NULL, }; @@ -244,31 +379,64 @@ static const struct attribute_group *cxl_decoder_endpoint_attribute_groups[] = { NULL, }; -static void cxl_decoder_release(struct device *dev) +static void __cxl_decoder_release(struct cxl_decoder *cxld) { - struct cxl_decoder *cxld = to_cxl_decoder(dev); - struct cxl_port *port = to_cxl_port(dev->parent); + struct cxl_port *port = to_cxl_port(cxld->dev.parent); ida_free(&port->decoder_ida, cxld->id); - kfree(cxld); put_device(&port->dev); } +static void cxl_endpoint_decoder_release(struct device *dev) +{ + struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev); + + __cxl_decoder_release(&cxled->cxld); + kfree(cxled); +} + +static void cxl_switch_decoder_release(struct device *dev) +{ + struct cxl_switch_decoder *cxlsd = to_cxl_switch_decoder(dev); + + __cxl_decoder_release(&cxlsd->cxld); + kfree(cxlsd); +} + +struct cxl_root_decoder *to_cxl_root_decoder(struct device *dev) +{ + if (dev_WARN_ONCE(dev, !is_root_decoder(dev), + "not a cxl_root_decoder device\n")) + return NULL; + return container_of(dev, struct cxl_root_decoder, cxlsd.cxld.dev); +} +EXPORT_SYMBOL_NS_GPL(to_cxl_root_decoder, CXL); + +static void cxl_root_decoder_release(struct device *dev) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev); + + if (atomic_read(&cxlrd->region_id) >= 0) + memregion_free(atomic_read(&cxlrd->region_id)); + __cxl_decoder_release(&cxlrd->cxlsd.cxld); + kfree(cxlrd); +} + static const struct device_type cxl_decoder_endpoint_type = { .name = "cxl_decoder_endpoint", - .release = cxl_decoder_release, + .release = cxl_endpoint_decoder_release, .groups = cxl_decoder_endpoint_attribute_groups, }; static const struct device_type cxl_decoder_switch_type = { .name = "cxl_decoder_switch", - .release = cxl_decoder_release, + .release = cxl_switch_decoder_release, .groups = cxl_decoder_switch_attribute_groups, }; static const struct device_type cxl_decoder_root_type = { .name = "cxl_decoder_root", - .release = cxl_decoder_release, + .release = cxl_root_decoder_release, .groups = cxl_decoder_root_attribute_groups, }; @@ -283,39 +451,63 @@ bool is_root_decoder(struct device *dev) } EXPORT_SYMBOL_NS_GPL(is_root_decoder, CXL); -bool is_cxl_decoder(struct device *dev) +bool is_switch_decoder(struct device *dev) { - return dev->type && dev->type->release == cxl_decoder_release; + return is_root_decoder(dev) || dev->type == &cxl_decoder_switch_type; } -EXPORT_SYMBOL_NS_GPL(is_cxl_decoder, CXL); struct cxl_decoder *to_cxl_decoder(struct device *dev) { - if (dev_WARN_ONCE(dev, dev->type->release != cxl_decoder_release, + if (dev_WARN_ONCE(dev, + !is_switch_decoder(dev) && !is_endpoint_decoder(dev), "not a cxl_decoder device\n")) return NULL; return container_of(dev, struct cxl_decoder, dev); } EXPORT_SYMBOL_NS_GPL(to_cxl_decoder, CXL); +struct cxl_endpoint_decoder *to_cxl_endpoint_decoder(struct device *dev) +{ + if (dev_WARN_ONCE(dev, !is_endpoint_decoder(dev), + "not a cxl_endpoint_decoder device\n")) + return NULL; + return container_of(dev, struct cxl_endpoint_decoder, cxld.dev); +} +EXPORT_SYMBOL_NS_GPL(to_cxl_endpoint_decoder, CXL); + +struct cxl_switch_decoder *to_cxl_switch_decoder(struct device *dev) +{ + if (dev_WARN_ONCE(dev, !is_switch_decoder(dev), + "not a cxl_switch_decoder device\n")) + return NULL; + return container_of(dev, struct cxl_switch_decoder, cxld.dev); +} + static void cxl_ep_release(struct cxl_ep *ep) { - if (!ep) - return; - list_del(&ep->list); put_device(ep->ep); kfree(ep); } +static void cxl_ep_remove(struct cxl_port *port, struct cxl_ep *ep) +{ + if (!ep) + return; + xa_erase(&port->endpoints, (unsigned long) ep->ep); + cxl_ep_release(ep); +} + static void cxl_port_release(struct device *dev) { struct cxl_port *port = to_cxl_port(dev); - struct cxl_ep *ep, *_e; + unsigned long index; + struct cxl_ep *ep; - device_lock(dev); - list_for_each_entry_safe(ep, _e, &port->endpoints, list) - cxl_ep_release(ep); - device_unlock(dev); + xa_for_each(&port->endpoints, index, ep) + cxl_ep_remove(port, ep); + xa_destroy(&port->endpoints); + xa_destroy(&port->dports); + xa_destroy(&port->regions); ida_free(&cxl_port_ida, port->id); kfree(port); } @@ -370,7 +562,7 @@ static void unregister_port(void *_port) lock_dev = &parent->dev; device_lock_assert(lock_dev); - port->uport = NULL; + port->dead = true; device_unregister(&port->dev); } @@ -395,7 +587,7 @@ static struct lock_class_key cxl_port_key; static struct cxl_port *cxl_port_alloc(struct device *uport, resource_size_t component_reg_phys, - struct cxl_port *parent_port) + struct cxl_dport *parent_dport) { struct cxl_port *port; struct device *dev; @@ -409,6 +601,7 @@ static struct cxl_port *cxl_port_alloc(struct device *uport, if (rc < 0) goto err; port->id = rc; + port->uport = uport; /* * The top-level cxl_port "cxl_root" does not have a cxl_port as @@ -417,17 +610,37 @@ static struct cxl_port *cxl_port_alloc(struct device *uport, * description. */ dev = &port->dev; - if (parent_port) { + if (parent_dport) { + struct cxl_port *parent_port = parent_dport->port; + struct cxl_port *iter; + dev->parent = &parent_port->dev; port->depth = parent_port->depth + 1; + port->parent_dport = parent_dport; + + /* + * walk to the host bridge, or the first ancestor that knows + * the host bridge + */ + iter = port; + while (!iter->host_bridge && + !is_cxl_root(to_cxl_port(iter->dev.parent))) + iter = to_cxl_port(iter->dev.parent); + if (iter->host_bridge) + port->host_bridge = iter->host_bridge; + else + port->host_bridge = iter->uport; + dev_dbg(uport, "host-bridge: %s\n", dev_name(port->host_bridge)); } else dev->parent = uport; - port->uport = uport; port->component_reg_phys = component_reg_phys; ida_init(&port->decoder_ida); - INIT_LIST_HEAD(&port->dports); - INIT_LIST_HEAD(&port->endpoints); + port->hdm_end = -1; + port->commit_end = -1; + xa_init(&port->dports); + xa_init(&port->endpoints); + xa_init(&port->regions); device_initialize(dev); lockdep_set_class_and_subclass(&dev->mutex, &cxl_port_key, port->depth); @@ -447,24 +660,24 @@ err: * @host: host device for devm operations * @uport: "physical" device implementing this upstream port * @component_reg_phys: (optional) for configurable cxl_port instances - * @parent_port: next hop up in the CXL memory decode hierarchy + * @parent_dport: next hop up in the CXL memory decode hierarchy */ struct cxl_port *devm_cxl_add_port(struct device *host, struct device *uport, resource_size_t component_reg_phys, - struct cxl_port *parent_port) + struct cxl_dport *parent_dport) { struct cxl_port *port; struct device *dev; int rc; - port = cxl_port_alloc(uport, component_reg_phys, parent_port); + port = cxl_port_alloc(uport, component_reg_phys, parent_dport); if (IS_ERR(port)) return port; dev = &port->dev; if (is_cxl_memdev(uport)) rc = dev_set_name(dev, "endpoint%d", port->id); - else if (parent_port) + else if (parent_dport) rc = dev_set_name(dev, "port%d", port->id); else rc = dev_set_name(dev, "root%d", port->id); @@ -556,17 +769,13 @@ static int match_root_child(struct device *dev, const void *match) return 0; port = to_cxl_port(dev); - device_lock(dev); - list_for_each_entry(dport, &port->dports, list) { - iter = match; - while (iter) { - if (iter == dport->dport) - goto out; - iter = iter->parent; - } + iter = match; + while (iter) { + dport = cxl_find_dport_by_dev(port, iter); + if (dport) + break; + iter = iter->parent; } -out: - device_unlock(dev); return !!iter; } @@ -590,9 +799,10 @@ EXPORT_SYMBOL_NS_GPL(find_cxl_root, CXL); static struct cxl_dport *find_dport(struct cxl_port *port, int id) { struct cxl_dport *dport; + unsigned long index; device_lock_assert(&port->dev); - list_for_each_entry (dport, &port->dports, list) + xa_for_each(&port->dports, index, dport) if (dport->port_id == id) return dport; return NULL; @@ -604,15 +814,15 @@ static int add_dport(struct cxl_port *port, struct cxl_dport *new) device_lock_assert(&port->dev); dup = find_dport(port, new->port_id); - if (dup) + if (dup) { dev_err(&port->dev, "unable to add dport%d-%s non-unique port id (%s)\n", new->port_id, dev_name(new->dport), dev_name(dup->dport)); - else - list_add_tail(&new->list, &port->dports); - - return dup ? -EEXIST : 0; + return -EBUSY; + } + return xa_insert(&port->dports, (unsigned long)new->dport, new, + GFP_KERNEL); } /* @@ -639,10 +849,8 @@ static void cxl_dport_remove(void *data) struct cxl_dport *dport = data; struct cxl_port *port = dport->port; + xa_erase(&port->dports, (unsigned long) dport->dport); put_device(dport->dport); - cond_cxl_root_lock(port); - list_del(&dport->list); - cond_cxl_root_unlock(port); } static void cxl_dport_unlink(void *data) @@ -694,7 +902,6 @@ struct cxl_dport *devm_cxl_add_dport(struct cxl_port *port, if (!dport) return ERR_PTR(-ENOMEM); - INIT_LIST_HEAD(&dport->list); dport->dport = dport_dev; dport->port_id = port_id; dport->component_reg_phys = component_reg_phys; @@ -723,44 +930,33 @@ struct cxl_dport *devm_cxl_add_dport(struct cxl_port *port, } EXPORT_SYMBOL_NS_GPL(devm_cxl_add_dport, CXL); -static struct cxl_ep *find_ep(struct cxl_port *port, struct device *ep_dev) -{ - struct cxl_ep *ep; - - device_lock_assert(&port->dev); - list_for_each_entry(ep, &port->endpoints, list) - if (ep->ep == ep_dev) - return ep; - return NULL; -} - -static int add_ep(struct cxl_port *port, struct cxl_ep *new) +static int add_ep(struct cxl_ep *new) { - struct cxl_ep *dup; + struct cxl_port *port = new->dport->port; + int rc; device_lock(&port->dev); if (port->dead) { device_unlock(&port->dev); return -ENXIO; } - dup = find_ep(port, new->ep); - if (!dup) - list_add_tail(&new->list, &port->endpoints); + rc = xa_insert(&port->endpoints, (unsigned long)new->ep, new, + GFP_KERNEL); device_unlock(&port->dev); - return dup ? -EEXIST : 0; + return rc; } /** * cxl_add_ep - register an endpoint's interest in a port - * @port: a port in the endpoint's topology ancestry + * @dport: the dport that routes to @ep_dev * @ep_dev: device representing the endpoint * * Intermediate CXL ports are scanned based on the arrival of endpoints. * When those endpoints depart the port can be destroyed once all * endpoints that care about that port have been removed. */ -static int cxl_add_ep(struct cxl_port *port, struct device *ep_dev) +static int cxl_add_ep(struct cxl_dport *dport, struct device *ep_dev) { struct cxl_ep *ep; int rc; @@ -769,10 +965,10 @@ static int cxl_add_ep(struct cxl_port *port, struct device *ep_dev) if (!ep) return -ENOMEM; - INIT_LIST_HEAD(&ep->list); ep->ep = get_device(ep_dev); + ep->dport = dport; - rc = add_ep(port, ep); + rc = add_ep(ep); if (rc) cxl_ep_release(ep); return rc; @@ -781,11 +977,13 @@ static int cxl_add_ep(struct cxl_port *port, struct device *ep_dev) struct cxl_find_port_ctx { const struct device *dport_dev; const struct cxl_port *parent_port; + struct cxl_dport **dport; }; static int match_port_by_dport(struct device *dev, const void *data) { const struct cxl_find_port_ctx *ctx = data; + struct cxl_dport *dport; struct cxl_port *port; if (!is_cxl_port(dev)) @@ -794,7 +992,10 @@ static int match_port_by_dport(struct device *dev, const void *data) return 0; port = to_cxl_port(dev); - return cxl_find_dport_by_dev(port, ctx->dport_dev) != NULL; + dport = cxl_find_dport_by_dev(port, ctx->dport_dev); + if (ctx->dport) + *ctx->dport = dport; + return dport != NULL; } static struct cxl_port *__find_cxl_port(struct cxl_find_port_ctx *ctx) @@ -810,24 +1011,32 @@ static struct cxl_port *__find_cxl_port(struct cxl_find_port_ctx *ctx) return NULL; } -static struct cxl_port *find_cxl_port(struct device *dport_dev) +static struct cxl_port *find_cxl_port(struct device *dport_dev, + struct cxl_dport **dport) { struct cxl_find_port_ctx ctx = { .dport_dev = dport_dev, + .dport = dport, }; + struct cxl_port *port; - return __find_cxl_port(&ctx); + port = __find_cxl_port(&ctx); + return port; } static struct cxl_port *find_cxl_port_at(struct cxl_port *parent_port, - struct device *dport_dev) + struct device *dport_dev, + struct cxl_dport **dport) { struct cxl_find_port_ctx ctx = { .dport_dev = dport_dev, .parent_port = parent_port, + .dport = dport, }; + struct cxl_port *port; - return __find_cxl_port(&ctx); + port = __find_cxl_port(&ctx); + return port; } /* @@ -851,13 +1060,13 @@ static void delete_endpoint(void *data) struct cxl_port *parent_port; struct device *parent; - parent_port = cxl_mem_find_port(cxlmd); + parent_port = cxl_mem_find_port(cxlmd, NULL); if (!parent_port) goto out; parent = &parent_port->dev; device_lock(parent); - if (parent->driver && endpoint->uport) { + if (parent->driver && !endpoint->dead) { devm_release_action(parent, cxl_unlink_uport, endpoint); devm_release_action(parent, unregister_port, endpoint); } @@ -883,21 +1092,70 @@ EXPORT_SYMBOL_NS_GPL(cxl_endpoint_autoremove, CXL); * for a port to be unregistered is when all memdevs beneath that port have gone * through ->remove(). This "bottom-up" removal selectively removes individual * child ports manually. This depends on devm_cxl_add_port() to not change is - * devm action registration order. + * devm action registration order, and for dports to have already been + * destroyed by reap_dports(). */ -static void delete_switch_port(struct cxl_port *port, struct list_head *dports) +static void delete_switch_port(struct cxl_port *port) +{ + devm_release_action(port->dev.parent, cxl_unlink_uport, port); + devm_release_action(port->dev.parent, unregister_port, port); +} + +static void reap_dports(struct cxl_port *port) { - struct cxl_dport *dport, *_d; + struct cxl_dport *dport; + unsigned long index; + + device_lock_assert(&port->dev); - list_for_each_entry_safe(dport, _d, dports, list) { + xa_for_each(&port->dports, index, dport) { devm_release_action(&port->dev, cxl_dport_unlink, dport); devm_release_action(&port->dev, cxl_dport_remove, dport); devm_kfree(&port->dev, dport); } - devm_release_action(port->dev.parent, cxl_unlink_uport, port); - devm_release_action(port->dev.parent, unregister_port, port); } +int devm_cxl_add_endpoint(struct cxl_memdev *cxlmd, + struct cxl_dport *parent_dport) +{ + struct cxl_port *parent_port = parent_dport->port; + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct cxl_port *endpoint, *iter, *down; + int rc; + + /* + * Now that the path to the root is established record all the + * intervening ports in the chain. + */ + for (iter = parent_port, down = NULL; !is_cxl_root(iter); + down = iter, iter = to_cxl_port(iter->dev.parent)) { + struct cxl_ep *ep; + + ep = cxl_ep_load(iter, cxlmd); + ep->next = down; + } + + endpoint = devm_cxl_add_port(&parent_port->dev, &cxlmd->dev, + cxlds->component_reg_phys, parent_dport); + if (IS_ERR(endpoint)) + return PTR_ERR(endpoint); + + dev_dbg(&cxlmd->dev, "add: %s\n", dev_name(&endpoint->dev)); + + rc = cxl_endpoint_autoremove(cxlmd, endpoint); + if (rc) + return rc; + + if (!endpoint->dev.driver) { + dev_err(&cxlmd->dev, "%s failed probe\n", + dev_name(&endpoint->dev)); + return -ENXIO; + } + + return 0; +} +EXPORT_SYMBOL_NS_GPL(devm_cxl_add_endpoint, CXL); + static void cxl_detach_ep(void *data) { struct cxl_memdev *cxlmd = data; @@ -906,13 +1164,13 @@ static void cxl_detach_ep(void *data) for (iter = &cxlmd->dev; iter; iter = grandparent(iter)) { struct device *dport_dev = grandparent(iter); struct cxl_port *port, *parent_port; - LIST_HEAD(reap_dports); struct cxl_ep *ep; + bool died = false; if (!dport_dev) break; - port = find_cxl_port(dport_dev); + port = find_cxl_port(dport_dev, NULL); if (!port) continue; @@ -936,26 +1194,27 @@ static void cxl_detach_ep(void *data) } device_lock(&port->dev); - ep = find_ep(port, &cxlmd->dev); + ep = cxl_ep_load(port, cxlmd); dev_dbg(&cxlmd->dev, "disconnect %s from %s\n", ep ? dev_name(ep->ep) : "", dev_name(&port->dev)); - cxl_ep_release(ep); - if (ep && !port->dead && list_empty(&port->endpoints) && + cxl_ep_remove(port, ep); + if (ep && !port->dead && xa_empty(&port->endpoints) && !is_cxl_root(parent_port)) { /* * This was the last ep attached to a dynamically * enumerated port. Block new cxl_add_ep() and garbage * collect the port. */ + died = true; port->dead = true; - list_splice_init(&port->dports, &reap_dports); + reap_dports(port); } device_unlock(&port->dev); - if (!list_empty(&reap_dports)) { + if (died) { dev_dbg(&cxlmd->dev, "delete %s\n", dev_name(&port->dev)); - delete_switch_port(port, &reap_dports); + delete_switch_port(port); } put_device(&port->dev); device_unlock(&parent_port->dev); @@ -986,6 +1245,7 @@ static int add_port_attach_ep(struct cxl_memdev *cxlmd, { struct device *dparent = grandparent(dport_dev); struct cxl_port *port, *parent_port = NULL; + struct cxl_dport *dport, *parent_dport; resource_size_t component_reg_phys; int rc; @@ -1000,7 +1260,7 @@ static int add_port_attach_ep(struct cxl_memdev *cxlmd, return -ENXIO; } - parent_port = find_cxl_port(dparent); + parent_port = find_cxl_port(dparent, &parent_dport); if (!parent_port) { /* iterate to create this parent_port */ return -EAGAIN; @@ -1015,13 +1275,14 @@ static int add_port_attach_ep(struct cxl_memdev *cxlmd, goto out; } - port = find_cxl_port_at(parent_port, dport_dev); + port = find_cxl_port_at(parent_port, dport_dev, &dport); if (!port) { component_reg_phys = find_component_registers(uport_dev); port = devm_cxl_add_port(&parent_port->dev, uport_dev, - component_reg_phys, parent_port); + component_reg_phys, parent_dport); + /* retry find to pick up the new dport information */ if (!IS_ERR(port)) - get_device(&port->dev); + port = find_cxl_port_at(parent_port, dport_dev, &dport); } out: device_unlock(&parent_port->dev); @@ -1031,8 +1292,8 @@ out: else { dev_dbg(&cxlmd->dev, "add to new port %s:%s\n", dev_name(&port->dev), dev_name(port->uport)); - rc = cxl_add_ep(port, &cxlmd->dev); - if (rc == -EEXIST) { + rc = cxl_add_ep(dport, &cxlmd->dev); + if (rc == -EBUSY) { /* * "can't" happen, but this error code means * something to the caller, so translate it. @@ -1065,6 +1326,7 @@ retry: for (iter = dev; iter; iter = grandparent(iter)) { struct device *dport_dev = grandparent(iter); struct device *uport_dev; + struct cxl_dport *dport; struct cxl_port *port; if (!dport_dev) @@ -1080,12 +1342,12 @@ retry: dev_dbg(dev, "scan: iter: %s dport_dev: %s parent: %s\n", dev_name(iter), dev_name(dport_dev), dev_name(uport_dev)); - port = find_cxl_port(dport_dev); + port = find_cxl_port(dport_dev, &dport); if (port) { dev_dbg(&cxlmd->dev, "found already registered port %s:%s\n", dev_name(&port->dev), dev_name(port->uport)); - rc = cxl_add_ep(port, &cxlmd->dev); + rc = cxl_add_ep(dport, &cxlmd->dev); /* * If the endpoint already exists in the port's list, @@ -1094,7 +1356,7 @@ retry: * the parent_port lock as the current port may be being * reaped. */ - if (rc && rc != -EEXIST) { + if (rc && rc != -EBUSY) { put_device(&port->dev); return rc; } @@ -1124,30 +1386,14 @@ retry: } EXPORT_SYMBOL_NS_GPL(devm_cxl_enumerate_ports, CXL); -struct cxl_port *cxl_mem_find_port(struct cxl_memdev *cxlmd) +struct cxl_port *cxl_mem_find_port(struct cxl_memdev *cxlmd, + struct cxl_dport **dport) { - return find_cxl_port(grandparent(&cxlmd->dev)); + return find_cxl_port(grandparent(&cxlmd->dev), dport); } EXPORT_SYMBOL_NS_GPL(cxl_mem_find_port, CXL); -struct cxl_dport *cxl_find_dport_by_dev(struct cxl_port *port, - const struct device *dev) -{ - struct cxl_dport *dport; - - device_lock(&port->dev); - list_for_each_entry(dport, &port->dports, list) - if (dport->dport == dev) { - device_unlock(&port->dev); - return dport; - } - - device_unlock(&port->dev); - return NULL; -} -EXPORT_SYMBOL_NS_GPL(cxl_find_dport_by_dev, CXL); - -static int decoder_populate_targets(struct cxl_decoder *cxld, +static int decoder_populate_targets(struct cxl_switch_decoder *cxlsd, struct cxl_port *port, int *target_map) { int i, rc = 0; @@ -1157,88 +1403,92 @@ static int decoder_populate_targets(struct cxl_decoder *cxld, device_lock_assert(&port->dev); - if (list_empty(&port->dports)) + if (xa_empty(&port->dports)) return -EINVAL; - write_seqlock(&cxld->target_lock); - for (i = 0; i < cxld->nr_targets; i++) { + write_seqlock(&cxlsd->target_lock); + for (i = 0; i < cxlsd->nr_targets; i++) { struct cxl_dport *dport = find_dport(port, target_map[i]); if (!dport) { rc = -ENXIO; break; } - cxld->target[i] = dport; + cxlsd->target[i] = dport; } - write_sequnlock(&cxld->target_lock); + write_sequnlock(&cxlsd->target_lock); return rc; } +static struct cxl_dport *cxl_hb_modulo(struct cxl_root_decoder *cxlrd, int pos) +{ + struct cxl_switch_decoder *cxlsd = &cxlrd->cxlsd; + struct cxl_decoder *cxld = &cxlsd->cxld; + int iw; + + iw = cxld->interleave_ways; + if (dev_WARN_ONCE(&cxld->dev, iw != cxlsd->nr_targets, + "misconfigured root decoder\n")) + return NULL; + + return cxlrd->cxlsd.target[pos % iw]; +} + static struct lock_class_key cxl_decoder_key; /** - * cxl_decoder_alloc - Allocate a new CXL decoder + * cxl_decoder_init - Common decoder setup / initialization * @port: owning port of this decoder - * @nr_targets: downstream targets accessible by this decoder. All upstream - * ports and root ports must have at least 1 target. Endpoint - * devices will have 0 targets. Callers wishing to register an - * endpoint device should specify 0. - * - * A port should contain one or more decoders. Each of those decoders enable - * some address space for CXL.mem utilization. A decoder is expected to be - * configured by the caller before registering. + * @cxld: common decoder properties to initialize * - * Return: A new cxl decoder to be registered by cxl_decoder_add(). The decoder - * is initialized to be a "passthrough" decoder. + * A port may contain one or more decoders. Each of those decoders + * enable some address space for CXL.mem utilization. A decoder is + * expected to be configured by the caller before registering via + * cxl_decoder_add() */ -static struct cxl_decoder *cxl_decoder_alloc(struct cxl_port *port, - unsigned int nr_targets) +static int cxl_decoder_init(struct cxl_port *port, struct cxl_decoder *cxld) { - struct cxl_decoder *cxld; struct device *dev; - int rc = 0; - - if (nr_targets > CXL_DECODER_MAX_INTERLEAVE) - return ERR_PTR(-EINVAL); - - cxld = kzalloc(struct_size(cxld, target, nr_targets), GFP_KERNEL); - if (!cxld) - return ERR_PTR(-ENOMEM); + int rc; rc = ida_alloc(&port->decoder_ida, GFP_KERNEL); if (rc < 0) - goto err; + return rc; /* need parent to stick around to release the id */ get_device(&port->dev); cxld->id = rc; - cxld->nr_targets = nr_targets; - seqlock_init(&cxld->target_lock); dev = &cxld->dev; device_initialize(dev); lockdep_set_class(&dev->mutex, &cxl_decoder_key); device_set_pm_not_required(dev); dev->parent = &port->dev; dev->bus = &cxl_bus_type; - if (is_cxl_root(port)) - cxld->dev.type = &cxl_decoder_root_type; - else if (is_cxl_endpoint(port)) - cxld->dev.type = &cxl_decoder_endpoint_type; - else - cxld->dev.type = &cxl_decoder_switch_type; /* Pre initialize an "empty" decoder */ cxld->interleave_ways = 1; cxld->interleave_granularity = PAGE_SIZE; cxld->target_type = CXL_DECODER_EXPANDER; - cxld->platform_res = (struct resource)DEFINE_RES_MEM(0, 0); + cxld->hpa_range = (struct range) { + .start = 0, + .end = -1, + }; - return cxld; -err: - kfree(cxld); - return ERR_PTR(rc); + return 0; +} + +static int cxl_switch_decoder_init(struct cxl_port *port, + struct cxl_switch_decoder *cxlsd, + int nr_targets) +{ + if (nr_targets > CXL_DECODER_MAX_INTERLEAVE) + return -EINVAL; + + cxlsd->nr_targets = nr_targets; + seqlock_init(&cxlsd->target_lock); + return cxl_decoder_init(port, &cxlsd->cxld); } /** @@ -1251,13 +1501,46 @@ err: * firmware description of CXL resources into a CXL standard decode * topology. */ -struct cxl_decoder *cxl_root_decoder_alloc(struct cxl_port *port, - unsigned int nr_targets) +struct cxl_root_decoder *cxl_root_decoder_alloc(struct cxl_port *port, + unsigned int nr_targets) { + struct cxl_root_decoder *cxlrd; + struct cxl_switch_decoder *cxlsd; + struct cxl_decoder *cxld; + int rc; + if (!is_cxl_root(port)) return ERR_PTR(-EINVAL); - return cxl_decoder_alloc(port, nr_targets); + cxlrd = kzalloc(struct_size(cxlrd, cxlsd.target, nr_targets), + GFP_KERNEL); + if (!cxlrd) + return ERR_PTR(-ENOMEM); + + cxlsd = &cxlrd->cxlsd; + rc = cxl_switch_decoder_init(port, cxlsd, nr_targets); + if (rc) { + kfree(cxlrd); + return ERR_PTR(rc); + } + + cxlrd->calc_hb = cxl_hb_modulo; + + cxld = &cxlsd->cxld; + cxld->dev.type = &cxl_decoder_root_type; + /* + * cxl_root_decoder_release() special cases negative ids to + * detect memregion_alloc() failures. + */ + atomic_set(&cxlrd->region_id, -1); + rc = memregion_alloc(GFP_KERNEL); + if (rc < 0) { + put_device(&cxld->dev); + return ERR_PTR(rc); + } + + atomic_set(&cxlrd->region_id, rc); + return cxlrd; } EXPORT_SYMBOL_NS_GPL(cxl_root_decoder_alloc, CXL); @@ -1272,13 +1555,29 @@ EXPORT_SYMBOL_NS_GPL(cxl_root_decoder_alloc, CXL); * that sit between Switch Upstream Ports / Switch Downstream Ports and * Host Bridges / Root Ports. */ -struct cxl_decoder *cxl_switch_decoder_alloc(struct cxl_port *port, - unsigned int nr_targets) +struct cxl_switch_decoder *cxl_switch_decoder_alloc(struct cxl_port *port, + unsigned int nr_targets) { + struct cxl_switch_decoder *cxlsd; + struct cxl_decoder *cxld; + int rc; + if (is_cxl_root(port) || is_cxl_endpoint(port)) return ERR_PTR(-EINVAL); - return cxl_decoder_alloc(port, nr_targets); + cxlsd = kzalloc(struct_size(cxlsd, target, nr_targets), GFP_KERNEL); + if (!cxlsd) + return ERR_PTR(-ENOMEM); + + rc = cxl_switch_decoder_init(port, cxlsd, nr_targets); + if (rc) { + kfree(cxlsd); + return ERR_PTR(rc); + } + + cxld = &cxlsd->cxld; + cxld->dev.type = &cxl_decoder_switch_type; + return cxlsd; } EXPORT_SYMBOL_NS_GPL(cxl_switch_decoder_alloc, CXL); @@ -1288,18 +1587,35 @@ EXPORT_SYMBOL_NS_GPL(cxl_switch_decoder_alloc, CXL); * * Return: A new cxl decoder to be registered by cxl_decoder_add() */ -struct cxl_decoder *cxl_endpoint_decoder_alloc(struct cxl_port *port) +struct cxl_endpoint_decoder *cxl_endpoint_decoder_alloc(struct cxl_port *port) { + struct cxl_endpoint_decoder *cxled; + struct cxl_decoder *cxld; + int rc; + if (!is_cxl_endpoint(port)) return ERR_PTR(-EINVAL); - return cxl_decoder_alloc(port, 0); + cxled = kzalloc(sizeof(*cxled), GFP_KERNEL); + if (!cxled) + return ERR_PTR(-ENOMEM); + + cxled->pos = -1; + cxld = &cxled->cxld; + rc = cxl_decoder_init(port, cxld); + if (rc) { + kfree(cxled); + return ERR_PTR(rc); + } + + cxld->dev.type = &cxl_decoder_endpoint_type; + return cxled; } EXPORT_SYMBOL_NS_GPL(cxl_endpoint_decoder_alloc, CXL); /** * cxl_decoder_add_locked - Add a decoder with targets - * @cxld: The cxl decoder allocated by cxl_decoder_alloc() + * @cxld: The cxl decoder allocated by cxl_<type>_decoder_alloc() * @target_map: A list of downstream ports that this decoder can direct memory * traffic to. These numbers should correspond with the port number * in the PCIe Link Capabilities structure. @@ -1335,7 +1651,9 @@ int cxl_decoder_add_locked(struct cxl_decoder *cxld, int *target_map) port = to_cxl_port(cxld->dev.parent); if (!is_endpoint_decoder(dev)) { - rc = decoder_populate_targets(cxld, port, target_map); + struct cxl_switch_decoder *cxlsd = to_cxl_switch_decoder(dev); + + rc = decoder_populate_targets(cxlsd, port, target_map); if (rc && (cxld->flags & CXL_DECODER_F_ENABLE)) { dev_err(&port->dev, "Failed to populate active decoder targets\n"); @@ -1347,20 +1665,13 @@ int cxl_decoder_add_locked(struct cxl_decoder *cxld, int *target_map) if (rc) return rc; - /* - * Platform decoder resources should show up with a reasonable name. All - * other resources are just sub ranges within the main decoder resource. - */ - if (is_root_decoder(dev)) - cxld->platform_res.name = dev_name(dev); - return device_add(dev); } EXPORT_SYMBOL_NS_GPL(cxl_decoder_add_locked, CXL); /** * cxl_decoder_add - Add a decoder with targets - * @cxld: The cxl decoder allocated by cxl_decoder_alloc() + * @cxld: The cxl decoder allocated by cxl_<type>_decoder_alloc() * @target_map: A list of downstream ports that this decoder can direct memory * traffic to. These numbers should correspond with the port number * in the PCIe Link Capabilities structure. @@ -1394,6 +1705,13 @@ EXPORT_SYMBOL_NS_GPL(cxl_decoder_add, CXL); static void cxld_unregister(void *dev) { + struct cxl_endpoint_decoder *cxled; + + if (is_endpoint_decoder(dev)) { + cxled = to_cxl_endpoint_decoder(dev); + cxl_decoder_kill_region(cxled); + } + device_unregister(dev); } @@ -1521,10 +1839,20 @@ struct bus_type cxl_bus_type = { }; EXPORT_SYMBOL_NS_GPL(cxl_bus_type, CXL); +static struct dentry *cxl_debugfs; + +struct dentry *cxl_debugfs_create_dir(const char *dir) +{ + return debugfs_create_dir(dir, cxl_debugfs); +} +EXPORT_SYMBOL_NS_GPL(cxl_debugfs_create_dir, CXL); + static __init int cxl_core_init(void) { int rc; + cxl_debugfs = debugfs_create_dir("cxl", NULL); + cxl_mbox_init(); rc = cxl_memdev_init(); @@ -1541,22 +1869,28 @@ static __init int cxl_core_init(void) if (rc) goto err_bus; + rc = cxl_region_init(); + if (rc) + goto err_region; + return 0; +err_region: + bus_unregister(&cxl_bus_type); err_bus: destroy_workqueue(cxl_bus_wq); err_wq: cxl_memdev_exit(); - cxl_mbox_exit(); return rc; } static void cxl_core_exit(void) { + cxl_region_exit(); bus_unregister(&cxl_bus_type); destroy_workqueue(cxl_bus_wq); cxl_memdev_exit(); - cxl_mbox_exit(); + debugfs_remove_recursive(cxl_debugfs); } module_init(cxl_core_init); diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c new file mode 100644 index 000000000000..401148016978 --- /dev/null +++ b/drivers/cxl/core/region.c @@ -0,0 +1,1896 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2022 Intel Corporation. All rights reserved. */ +#include <linux/memregion.h> +#include <linux/genalloc.h> +#include <linux/device.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/uuid.h> +#include <linux/idr.h> +#include <cxlmem.h> +#include <cxl.h> +#include "core.h" + +/** + * DOC: cxl core region + * + * CXL Regions represent mapped memory capacity in system physical address + * space. Whereas the CXL Root Decoders identify the bounds of potential CXL + * Memory ranges, Regions represent the active mapped capacity by the HDM + * Decoder Capability structures throughout the Host Bridges, Switches, and + * Endpoints in the topology. + * + * Region configuration has ordering constraints. UUID may be set at any time + * but is only visible for persistent regions. + * 1. Interleave granularity + * 2. Interleave size + * 3. Decoder targets + */ + +/* + * All changes to the interleave configuration occur with this lock held + * for write. + */ +static DECLARE_RWSEM(cxl_region_rwsem); + +static struct cxl_region *to_cxl_region(struct device *dev); + +static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + ssize_t rc; + + rc = down_read_interruptible(&cxl_region_rwsem); + if (rc) + return rc; + rc = sysfs_emit(buf, "%pUb\n", &p->uuid); + up_read(&cxl_region_rwsem); + + return rc; +} + +static int is_dup(struct device *match, void *data) +{ + struct cxl_region_params *p; + struct cxl_region *cxlr; + uuid_t *uuid = data; + + if (!is_cxl_region(match)) + return 0; + + lockdep_assert_held(&cxl_region_rwsem); + cxlr = to_cxl_region(match); + p = &cxlr->params; + + if (uuid_equal(&p->uuid, uuid)) { + dev_dbg(match, "already has uuid: %pUb\n", uuid); + return -EBUSY; + } + + return 0; +} + +static ssize_t uuid_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + uuid_t temp; + ssize_t rc; + + if (len != UUID_STRING_LEN + 1) + return -EINVAL; + + rc = uuid_parse(buf, &temp); + if (rc) + return rc; + + if (uuid_is_null(&temp)) + return -EINVAL; + + rc = down_write_killable(&cxl_region_rwsem); + if (rc) + return rc; + + if (uuid_equal(&p->uuid, &temp)) + goto out; + + rc = -EBUSY; + if (p->state >= CXL_CONFIG_ACTIVE) + goto out; + + rc = bus_for_each_dev(&cxl_bus_type, NULL, &temp, is_dup); + if (rc < 0) + goto out; + + uuid_copy(&p->uuid, &temp); +out: + up_write(&cxl_region_rwsem); + + if (rc) + return rc; + return len; +} +static DEVICE_ATTR_RW(uuid); + +static struct cxl_region_ref *cxl_rr_load(struct cxl_port *port, + struct cxl_region *cxlr) +{ + return xa_load(&port->regions, (unsigned long)cxlr); +} + +static int cxl_region_decode_reset(struct cxl_region *cxlr, int count) +{ + struct cxl_region_params *p = &cxlr->params; + int i; + + for (i = count - 1; i >= 0; i--) { + struct cxl_endpoint_decoder *cxled = p->targets[i]; + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_port *iter = cxled_to_port(cxled); + struct cxl_ep *ep; + int rc; + + while (!is_cxl_root(to_cxl_port(iter->dev.parent))) + iter = to_cxl_port(iter->dev.parent); + + for (ep = cxl_ep_load(iter, cxlmd); iter; + iter = ep->next, ep = cxl_ep_load(iter, cxlmd)) { + struct cxl_region_ref *cxl_rr; + struct cxl_decoder *cxld; + + cxl_rr = cxl_rr_load(iter, cxlr); + cxld = cxl_rr->decoder; + rc = cxld->reset(cxld); + if (rc) + return rc; + } + + rc = cxled->cxld.reset(&cxled->cxld); + if (rc) + return rc; + } + + return 0; +} + +static int cxl_region_decode_commit(struct cxl_region *cxlr) +{ + struct cxl_region_params *p = &cxlr->params; + int i, rc = 0; + + for (i = 0; i < p->nr_targets; i++) { + struct cxl_endpoint_decoder *cxled = p->targets[i]; + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_region_ref *cxl_rr; + struct cxl_decoder *cxld; + struct cxl_port *iter; + struct cxl_ep *ep; + + /* commit bottom up */ + for (iter = cxled_to_port(cxled); !is_cxl_root(iter); + iter = to_cxl_port(iter->dev.parent)) { + cxl_rr = cxl_rr_load(iter, cxlr); + cxld = cxl_rr->decoder; + rc = cxld->commit(cxld); + if (rc) + break; + } + + if (rc) { + /* programming @iter failed, teardown */ + for (ep = cxl_ep_load(iter, cxlmd); ep && iter; + iter = ep->next, ep = cxl_ep_load(iter, cxlmd)) { + cxl_rr = cxl_rr_load(iter, cxlr); + cxld = cxl_rr->decoder; + cxld->reset(cxld); + } + + cxled->cxld.reset(&cxled->cxld); + goto err; + } + } + + return 0; + +err: + /* undo the targets that were successfully committed */ + cxl_region_decode_reset(cxlr, i); + return rc; +} + +static ssize_t commit_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + bool commit; + ssize_t rc; + + rc = kstrtobool(buf, &commit); + if (rc) + return rc; + + rc = down_write_killable(&cxl_region_rwsem); + if (rc) + return rc; + + /* Already in the requested state? */ + if (commit && p->state >= CXL_CONFIG_COMMIT) + goto out; + if (!commit && p->state < CXL_CONFIG_COMMIT) + goto out; + + /* Not ready to commit? */ + if (commit && p->state < CXL_CONFIG_ACTIVE) { + rc = -ENXIO; + goto out; + } + + if (commit) + rc = cxl_region_decode_commit(cxlr); + else { + p->state = CXL_CONFIG_RESET_PENDING; + up_write(&cxl_region_rwsem); + device_release_driver(&cxlr->dev); + down_write(&cxl_region_rwsem); + + /* + * The lock was dropped, so need to revalidate that the reset is + * still pending. + */ + if (p->state == CXL_CONFIG_RESET_PENDING) + rc = cxl_region_decode_reset(cxlr, p->interleave_ways); + } + + if (rc) + goto out; + + if (commit) + p->state = CXL_CONFIG_COMMIT; + else if (p->state == CXL_CONFIG_RESET_PENDING) + p->state = CXL_CONFIG_ACTIVE; + +out: + up_write(&cxl_region_rwsem); + + if (rc) + return rc; + return len; +} + +static ssize_t commit_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + ssize_t rc; + + rc = down_read_interruptible(&cxl_region_rwsem); + if (rc) + return rc; + rc = sysfs_emit(buf, "%d\n", p->state >= CXL_CONFIG_COMMIT); + up_read(&cxl_region_rwsem); + + return rc; +} +static DEVICE_ATTR_RW(commit); + +static umode_t cxl_region_visible(struct kobject *kobj, struct attribute *a, + int n) +{ + struct device *dev = kobj_to_dev(kobj); + struct cxl_region *cxlr = to_cxl_region(dev); + + if (a == &dev_attr_uuid.attr && cxlr->mode != CXL_DECODER_PMEM) + return 0; + return a->mode; +} + +static ssize_t interleave_ways_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + ssize_t rc; + + rc = down_read_interruptible(&cxl_region_rwsem); + if (rc) + return rc; + rc = sysfs_emit(buf, "%d\n", p->interleave_ways); + up_read(&cxl_region_rwsem); + + return rc; +} + +static const struct attribute_group *get_cxl_region_target_group(void); + +static ssize_t interleave_ways_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev->parent); + struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld; + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + unsigned int val, save; + int rc; + u8 iw; + + rc = kstrtouint(buf, 0, &val); + if (rc) + return rc; + + rc = ways_to_cxl(val, &iw); + if (rc) + return rc; + + /* + * Even for x3, x9, and x12 interleaves the region interleave must be a + * power of 2 multiple of the host bridge interleave. + */ + if (!is_power_of_2(val / cxld->interleave_ways) || + (val % cxld->interleave_ways)) { + dev_dbg(&cxlr->dev, "invalid interleave: %d\n", val); + return -EINVAL; + } + + rc = down_write_killable(&cxl_region_rwsem); + if (rc) + return rc; + if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE) { + rc = -EBUSY; + goto out; + } + + save = p->interleave_ways; + p->interleave_ways = val; + rc = sysfs_update_group(&cxlr->dev.kobj, get_cxl_region_target_group()); + if (rc) + p->interleave_ways = save; +out: + up_write(&cxl_region_rwsem); + if (rc) + return rc; + return len; +} +static DEVICE_ATTR_RW(interleave_ways); + +static ssize_t interleave_granularity_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + ssize_t rc; + + rc = down_read_interruptible(&cxl_region_rwsem); + if (rc) + return rc; + rc = sysfs_emit(buf, "%d\n", p->interleave_granularity); + up_read(&cxl_region_rwsem); + + return rc; +} + +static ssize_t interleave_granularity_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev->parent); + struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld; + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + int rc, val; + u16 ig; + + rc = kstrtoint(buf, 0, &val); + if (rc) + return rc; + + rc = granularity_to_cxl(val, &ig); + if (rc) + return rc; + + /* + * When the host-bridge is interleaved, disallow region granularity != + * root granularity. Regions with a granularity less than the root + * interleave result in needing multiple endpoints to support a single + * slot in the interleave (possible to suport in the future). Regions + * with a granularity greater than the root interleave result in invalid + * DPA translations (invalid to support). + */ + if (cxld->interleave_ways > 1 && val != cxld->interleave_granularity) + return -EINVAL; + + rc = down_write_killable(&cxl_region_rwsem); + if (rc) + return rc; + if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE) { + rc = -EBUSY; + goto out; + } + + p->interleave_granularity = val; +out: + up_write(&cxl_region_rwsem); + if (rc) + return rc; + return len; +} +static DEVICE_ATTR_RW(interleave_granularity); + +static ssize_t resource_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + u64 resource = -1ULL; + ssize_t rc; + + rc = down_read_interruptible(&cxl_region_rwsem); + if (rc) + return rc; + if (p->res) + resource = p->res->start; + rc = sysfs_emit(buf, "%#llx\n", resource); + up_read(&cxl_region_rwsem); + + return rc; +} +static DEVICE_ATTR_RO(resource); + +static int alloc_hpa(struct cxl_region *cxlr, resource_size_t size) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); + struct cxl_region_params *p = &cxlr->params; + struct resource *res; + u32 remainder = 0; + + lockdep_assert_held_write(&cxl_region_rwsem); + + /* Nothing to do... */ + if (p->res && resource_size(p->res) == size) + return 0; + + /* To change size the old size must be freed first */ + if (p->res) + return -EBUSY; + + if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE) + return -EBUSY; + + /* ways, granularity and uuid (if PMEM) need to be set before HPA */ + if (!p->interleave_ways || !p->interleave_granularity || + (cxlr->mode == CXL_DECODER_PMEM && uuid_is_null(&p->uuid))) + return -ENXIO; + + div_u64_rem(size, SZ_256M * p->interleave_ways, &remainder); + if (remainder) + return -EINVAL; + + res = alloc_free_mem_region(cxlrd->res, size, SZ_256M, + dev_name(&cxlr->dev)); + if (IS_ERR(res)) { + dev_dbg(&cxlr->dev, "failed to allocate HPA: %ld\n", + PTR_ERR(res)); + return PTR_ERR(res); + } + + p->res = res; + p->state = CXL_CONFIG_INTERLEAVE_ACTIVE; + + return 0; +} + +static void cxl_region_iomem_release(struct cxl_region *cxlr) +{ + struct cxl_region_params *p = &cxlr->params; + + if (device_is_registered(&cxlr->dev)) + lockdep_assert_held_write(&cxl_region_rwsem); + if (p->res) { + remove_resource(p->res); + kfree(p->res); + p->res = NULL; + } +} + +static int free_hpa(struct cxl_region *cxlr) +{ + struct cxl_region_params *p = &cxlr->params; + + lockdep_assert_held_write(&cxl_region_rwsem); + + if (!p->res) + return 0; + + if (p->state >= CXL_CONFIG_ACTIVE) + return -EBUSY; + + cxl_region_iomem_release(cxlr); + p->state = CXL_CONFIG_IDLE; + return 0; +} + +static ssize_t size_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + u64 val; + int rc; + + rc = kstrtou64(buf, 0, &val); + if (rc) + return rc; + + rc = down_write_killable(&cxl_region_rwsem); + if (rc) + return rc; + + if (val) + rc = alloc_hpa(cxlr, val); + else + rc = free_hpa(cxlr); + up_write(&cxl_region_rwsem); + + if (rc) + return rc; + + return len; +} + +static ssize_t size_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + u64 size = 0; + ssize_t rc; + + rc = down_read_interruptible(&cxl_region_rwsem); + if (rc) + return rc; + if (p->res) + size = resource_size(p->res); + rc = sysfs_emit(buf, "%#llx\n", size); + up_read(&cxl_region_rwsem); + + return rc; +} +static DEVICE_ATTR_RW(size); + +static struct attribute *cxl_region_attrs[] = { + &dev_attr_uuid.attr, + &dev_attr_commit.attr, + &dev_attr_interleave_ways.attr, + &dev_attr_interleave_granularity.attr, + &dev_attr_resource.attr, + &dev_attr_size.attr, + NULL, +}; + +static const struct attribute_group cxl_region_group = { + .attrs = cxl_region_attrs, + .is_visible = cxl_region_visible, +}; + +static size_t show_targetN(struct cxl_region *cxlr, char *buf, int pos) +{ + struct cxl_region_params *p = &cxlr->params; + struct cxl_endpoint_decoder *cxled; + int rc; + + rc = down_read_interruptible(&cxl_region_rwsem); + if (rc) + return rc; + + if (pos >= p->interleave_ways) { + dev_dbg(&cxlr->dev, "position %d out of range %d\n", pos, + p->interleave_ways); + rc = -ENXIO; + goto out; + } + + cxled = p->targets[pos]; + if (!cxled) + rc = sysfs_emit(buf, "\n"); + else + rc = sysfs_emit(buf, "%s\n", dev_name(&cxled->cxld.dev)); +out: + up_read(&cxl_region_rwsem); + + return rc; +} + +static int match_free_decoder(struct device *dev, void *data) +{ + struct cxl_decoder *cxld; + int *id = data; + + if (!is_switch_decoder(dev)) + return 0; + + cxld = to_cxl_decoder(dev); + + /* enforce ordered allocation */ + if (cxld->id != *id) + return 0; + + if (!cxld->region) + return 1; + + (*id)++; + + return 0; +} + +static struct cxl_decoder *cxl_region_find_decoder(struct cxl_port *port, + struct cxl_region *cxlr) +{ + struct device *dev; + int id = 0; + + dev = device_find_child(&port->dev, &id, match_free_decoder); + if (!dev) + return NULL; + /* + * This decoder is pinned registered as long as the endpoint decoder is + * registered, and endpoint decoder unregistration holds the + * cxl_region_rwsem over unregister events, so no need to hold on to + * this extra reference. + */ + put_device(dev); + return to_cxl_decoder(dev); +} + +static struct cxl_region_ref *alloc_region_ref(struct cxl_port *port, + struct cxl_region *cxlr) +{ + struct cxl_region_params *p = &cxlr->params; + struct cxl_region_ref *cxl_rr, *iter; + unsigned long index; + int rc; + + xa_for_each(&port->regions, index, iter) { + struct cxl_region_params *ip = &iter->region->params; + + if (ip->res->start > p->res->start) { + dev_dbg(&cxlr->dev, + "%s: HPA order violation %s:%pr vs %pr\n", + dev_name(&port->dev), + dev_name(&iter->region->dev), ip->res, p->res); + return ERR_PTR(-EBUSY); + } + } + + cxl_rr = kzalloc(sizeof(*cxl_rr), GFP_KERNEL); + if (!cxl_rr) + return ERR_PTR(-ENOMEM); + cxl_rr->port = port; + cxl_rr->region = cxlr; + cxl_rr->nr_targets = 1; + xa_init(&cxl_rr->endpoints); + + rc = xa_insert(&port->regions, (unsigned long)cxlr, cxl_rr, GFP_KERNEL); + if (rc) { + dev_dbg(&cxlr->dev, + "%s: failed to track region reference: %d\n", + dev_name(&port->dev), rc); + kfree(cxl_rr); + return ERR_PTR(rc); + } + + return cxl_rr; +} + +static void free_region_ref(struct cxl_region_ref *cxl_rr) +{ + struct cxl_port *port = cxl_rr->port; + struct cxl_region *cxlr = cxl_rr->region; + struct cxl_decoder *cxld = cxl_rr->decoder; + + dev_WARN_ONCE(&cxlr->dev, cxld->region != cxlr, "region mismatch\n"); + if (cxld->region == cxlr) { + cxld->region = NULL; + put_device(&cxlr->dev); + } + + xa_erase(&port->regions, (unsigned long)cxlr); + xa_destroy(&cxl_rr->endpoints); + kfree(cxl_rr); +} + +static int cxl_rr_ep_add(struct cxl_region_ref *cxl_rr, + struct cxl_endpoint_decoder *cxled) +{ + int rc; + struct cxl_port *port = cxl_rr->port; + struct cxl_region *cxlr = cxl_rr->region; + struct cxl_decoder *cxld = cxl_rr->decoder; + struct cxl_ep *ep = cxl_ep_load(port, cxled_to_memdev(cxled)); + + if (ep) { + rc = xa_insert(&cxl_rr->endpoints, (unsigned long)cxled, ep, + GFP_KERNEL); + if (rc) + return rc; + } + cxl_rr->nr_eps++; + + if (!cxld->region) { + cxld->region = cxlr; + get_device(&cxlr->dev); + } + + return 0; +} + +/** + * cxl_port_attach_region() - track a region's interest in a port by endpoint + * @port: port to add a new region reference 'struct cxl_region_ref' + * @cxlr: region to attach to @port + * @cxled: endpoint decoder used to create or further pin a region reference + * @pos: interleave position of @cxled in @cxlr + * + * The attach event is an opportunity to validate CXL decode setup + * constraints and record metadata needed for programming HDM decoders, + * in particular decoder target lists. + * + * The steps are: + * + * - validate that there are no other regions with a higher HPA already + * associated with @port + * - establish a region reference if one is not already present + * + * - additionally allocate a decoder instance that will host @cxlr on + * @port + * + * - pin the region reference by the endpoint + * - account for how many entries in @port's target list are needed to + * cover all of the added endpoints. + */ +static int cxl_port_attach_region(struct cxl_port *port, + struct cxl_region *cxlr, + struct cxl_endpoint_decoder *cxled, int pos) +{ + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_ep *ep = cxl_ep_load(port, cxlmd); + struct cxl_region_ref *cxl_rr; + bool nr_targets_inc = false; + struct cxl_decoder *cxld; + unsigned long index; + int rc = -EBUSY; + + lockdep_assert_held_write(&cxl_region_rwsem); + + cxl_rr = cxl_rr_load(port, cxlr); + if (cxl_rr) { + struct cxl_ep *ep_iter; + int found = 0; + + /* + * Walk the existing endpoints that have been attached to + * @cxlr at @port and see if they share the same 'next' port + * in the downstream direction. I.e. endpoints that share common + * upstream switch. + */ + xa_for_each(&cxl_rr->endpoints, index, ep_iter) { + if (ep_iter == ep) + continue; + if (ep_iter->next == ep->next) { + found++; + break; + } + } + + /* + * New target port, or @port is an endpoint port that always + * accounts its own local decode as a target. + */ + if (!found || !ep->next) { + cxl_rr->nr_targets++; + nr_targets_inc = true; + } + + /* + * The decoder for @cxlr was allocated when the region was first + * attached to @port. + */ + cxld = cxl_rr->decoder; + } else { + cxl_rr = alloc_region_ref(port, cxlr); + if (IS_ERR(cxl_rr)) { + dev_dbg(&cxlr->dev, + "%s: failed to allocate region reference\n", + dev_name(&port->dev)); + return PTR_ERR(cxl_rr); + } + nr_targets_inc = true; + + if (port == cxled_to_port(cxled)) + cxld = &cxled->cxld; + else + cxld = cxl_region_find_decoder(port, cxlr); + if (!cxld) { + dev_dbg(&cxlr->dev, "%s: no decoder available\n", + dev_name(&port->dev)); + goto out_erase; + } + + if (cxld->region) { + dev_dbg(&cxlr->dev, "%s: %s already attached to %s\n", + dev_name(&port->dev), dev_name(&cxld->dev), + dev_name(&cxld->region->dev)); + rc = -EBUSY; + goto out_erase; + } + + cxl_rr->decoder = cxld; + } + + rc = cxl_rr_ep_add(cxl_rr, cxled); + if (rc) { + dev_dbg(&cxlr->dev, + "%s: failed to track endpoint %s:%s reference\n", + dev_name(&port->dev), dev_name(&cxlmd->dev), + dev_name(&cxld->dev)); + goto out_erase; + } + + dev_dbg(&cxlr->dev, + "%s:%s %s add: %s:%s @ %d next: %s nr_eps: %d nr_targets: %d\n", + dev_name(port->uport), dev_name(&port->dev), + dev_name(&cxld->dev), dev_name(&cxlmd->dev), + dev_name(&cxled->cxld.dev), pos, + ep ? ep->next ? dev_name(ep->next->uport) : + dev_name(&cxlmd->dev) : + "none", + cxl_rr->nr_eps, cxl_rr->nr_targets); + + return 0; +out_erase: + if (nr_targets_inc) + cxl_rr->nr_targets--; + if (cxl_rr->nr_eps == 0) + free_region_ref(cxl_rr); + return rc; +} + +static void cxl_port_detach_region(struct cxl_port *port, + struct cxl_region *cxlr, + struct cxl_endpoint_decoder *cxled) +{ + struct cxl_region_ref *cxl_rr; + struct cxl_ep *ep = NULL; + + lockdep_assert_held_write(&cxl_region_rwsem); + + cxl_rr = cxl_rr_load(port, cxlr); + if (!cxl_rr) + return; + + /* + * Endpoint ports do not carry cxl_ep references, and they + * never target more than one endpoint by definition + */ + if (cxl_rr->decoder == &cxled->cxld) + cxl_rr->nr_eps--; + else + ep = xa_erase(&cxl_rr->endpoints, (unsigned long)cxled); + if (ep) { + struct cxl_ep *ep_iter; + unsigned long index; + int found = 0; + + cxl_rr->nr_eps--; + xa_for_each(&cxl_rr->endpoints, index, ep_iter) { + if (ep_iter->next == ep->next) { + found++; + break; + } + } + if (!found) + cxl_rr->nr_targets--; + } + + if (cxl_rr->nr_eps == 0) + free_region_ref(cxl_rr); +} + +static int check_last_peer(struct cxl_endpoint_decoder *cxled, + struct cxl_ep *ep, struct cxl_region_ref *cxl_rr, + int distance) +{ + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_region *cxlr = cxl_rr->region; + struct cxl_region_params *p = &cxlr->params; + struct cxl_endpoint_decoder *cxled_peer; + struct cxl_port *port = cxl_rr->port; + struct cxl_memdev *cxlmd_peer; + struct cxl_ep *ep_peer; + int pos = cxled->pos; + + /* + * If this position wants to share a dport with the last endpoint mapped + * then that endpoint, at index 'position - distance', must also be + * mapped by this dport. + */ + if (pos < distance) { + dev_dbg(&cxlr->dev, "%s:%s: cannot host %s:%s at %d\n", + dev_name(port->uport), dev_name(&port->dev), + dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), pos); + return -ENXIO; + } + cxled_peer = p->targets[pos - distance]; + cxlmd_peer = cxled_to_memdev(cxled_peer); + ep_peer = cxl_ep_load(port, cxlmd_peer); + if (ep->dport != ep_peer->dport) { + dev_dbg(&cxlr->dev, + "%s:%s: %s:%s pos %d mismatched peer %s:%s\n", + dev_name(port->uport), dev_name(&port->dev), + dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), pos, + dev_name(&cxlmd_peer->dev), + dev_name(&cxled_peer->cxld.dev)); + return -ENXIO; + } + + return 0; +} + +static int cxl_port_setup_targets(struct cxl_port *port, + struct cxl_region *cxlr, + struct cxl_endpoint_decoder *cxled) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); + int parent_iw, parent_ig, ig, iw, rc, inc = 0, pos = cxled->pos; + struct cxl_port *parent_port = to_cxl_port(port->dev.parent); + struct cxl_region_ref *cxl_rr = cxl_rr_load(port, cxlr); + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_ep *ep = cxl_ep_load(port, cxlmd); + struct cxl_region_params *p = &cxlr->params; + struct cxl_decoder *cxld = cxl_rr->decoder; + struct cxl_switch_decoder *cxlsd; + u16 eig, peig; + u8 eiw, peiw; + + /* + * While root level decoders support x3, x6, x12, switch level + * decoders only support powers of 2 up to x16. + */ + if (!is_power_of_2(cxl_rr->nr_targets)) { + dev_dbg(&cxlr->dev, "%s:%s: invalid target count %d\n", + dev_name(port->uport), dev_name(&port->dev), + cxl_rr->nr_targets); + return -EINVAL; + } + + cxlsd = to_cxl_switch_decoder(&cxld->dev); + if (cxl_rr->nr_targets_set) { + int i, distance; + + distance = p->nr_targets / cxl_rr->nr_targets; + for (i = 0; i < cxl_rr->nr_targets_set; i++) + if (ep->dport == cxlsd->target[i]) { + rc = check_last_peer(cxled, ep, cxl_rr, + distance); + if (rc) + return rc; + goto out_target_set; + } + goto add_target; + } + + if (is_cxl_root(parent_port)) { + parent_ig = cxlrd->cxlsd.cxld.interleave_granularity; + parent_iw = cxlrd->cxlsd.cxld.interleave_ways; + /* + * For purposes of address bit routing, use power-of-2 math for + * switch ports. + */ + if (!is_power_of_2(parent_iw)) + parent_iw /= 3; + } else { + struct cxl_region_ref *parent_rr; + struct cxl_decoder *parent_cxld; + + parent_rr = cxl_rr_load(parent_port, cxlr); + parent_cxld = parent_rr->decoder; + parent_ig = parent_cxld->interleave_granularity; + parent_iw = parent_cxld->interleave_ways; + } + + rc = granularity_to_cxl(parent_ig, &peig); + if (rc) { + dev_dbg(&cxlr->dev, "%s:%s: invalid parent granularity: %d\n", + dev_name(parent_port->uport), + dev_name(&parent_port->dev), parent_ig); + return rc; + } + + rc = ways_to_cxl(parent_iw, &peiw); + if (rc) { + dev_dbg(&cxlr->dev, "%s:%s: invalid parent interleave: %d\n", + dev_name(parent_port->uport), + dev_name(&parent_port->dev), parent_iw); + return rc; + } + + iw = cxl_rr->nr_targets; + rc = ways_to_cxl(iw, &eiw); + if (rc) { + dev_dbg(&cxlr->dev, "%s:%s: invalid port interleave: %d\n", + dev_name(port->uport), dev_name(&port->dev), iw); + return rc; + } + + /* + * If @parent_port is masking address bits, pick the next unused address + * bit to route @port's targets. + */ + if (parent_iw > 1 && cxl_rr->nr_targets > 1) { + u32 address_bit = max(peig + peiw, eiw + peig); + + eig = address_bit - eiw + 1; + } else { + eiw = peiw; + eig = peig; + } + + rc = cxl_to_granularity(eig, &ig); + if (rc) { + dev_dbg(&cxlr->dev, "%s:%s: invalid interleave: %d\n", + dev_name(port->uport), dev_name(&port->dev), + 256 << eig); + return rc; + } + + cxld->interleave_ways = iw; + cxld->interleave_granularity = ig; + cxld->hpa_range = (struct range) { + .start = p->res->start, + .end = p->res->end, + }; + dev_dbg(&cxlr->dev, "%s:%s iw: %d ig: %d\n", dev_name(port->uport), + dev_name(&port->dev), iw, ig); +add_target: + if (cxl_rr->nr_targets_set == cxl_rr->nr_targets) { + dev_dbg(&cxlr->dev, + "%s:%s: targets full trying to add %s:%s at %d\n", + dev_name(port->uport), dev_name(&port->dev), + dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), pos); + return -ENXIO; + } + cxlsd->target[cxl_rr->nr_targets_set] = ep->dport; + inc = 1; +out_target_set: + cxl_rr->nr_targets_set += inc; + dev_dbg(&cxlr->dev, "%s:%s target[%d] = %s for %s:%s @ %d\n", + dev_name(port->uport), dev_name(&port->dev), + cxl_rr->nr_targets_set - 1, dev_name(ep->dport->dport), + dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), pos); + + return 0; +} + +static void cxl_port_reset_targets(struct cxl_port *port, + struct cxl_region *cxlr) +{ + struct cxl_region_ref *cxl_rr = cxl_rr_load(port, cxlr); + struct cxl_decoder *cxld; + + /* + * After the last endpoint has been detached the entire cxl_rr may now + * be gone. + */ + if (!cxl_rr) + return; + cxl_rr->nr_targets_set = 0; + + cxld = cxl_rr->decoder; + cxld->hpa_range = (struct range) { + .start = 0, + .end = -1, + }; +} + +static void cxl_region_teardown_targets(struct cxl_region *cxlr) +{ + struct cxl_region_params *p = &cxlr->params; + struct cxl_endpoint_decoder *cxled; + struct cxl_memdev *cxlmd; + struct cxl_port *iter; + struct cxl_ep *ep; + int i; + + for (i = 0; i < p->nr_targets; i++) { + cxled = p->targets[i]; + cxlmd = cxled_to_memdev(cxled); + + iter = cxled_to_port(cxled); + while (!is_cxl_root(to_cxl_port(iter->dev.parent))) + iter = to_cxl_port(iter->dev.parent); + + for (ep = cxl_ep_load(iter, cxlmd); iter; + iter = ep->next, ep = cxl_ep_load(iter, cxlmd)) + cxl_port_reset_targets(iter, cxlr); + } +} + +static int cxl_region_setup_targets(struct cxl_region *cxlr) +{ + struct cxl_region_params *p = &cxlr->params; + struct cxl_endpoint_decoder *cxled; + struct cxl_memdev *cxlmd; + struct cxl_port *iter; + struct cxl_ep *ep; + int i, rc; + + for (i = 0; i < p->nr_targets; i++) { + cxled = p->targets[i]; + cxlmd = cxled_to_memdev(cxled); + + iter = cxled_to_port(cxled); + while (!is_cxl_root(to_cxl_port(iter->dev.parent))) + iter = to_cxl_port(iter->dev.parent); + + /* + * Descend the topology tree programming targets while + * looking for conflicts. + */ + for (ep = cxl_ep_load(iter, cxlmd); iter; + iter = ep->next, ep = cxl_ep_load(iter, cxlmd)) { + rc = cxl_port_setup_targets(iter, cxlr, cxled); + if (rc) { + cxl_region_teardown_targets(cxlr); + return rc; + } + } + } + + return 0; +} + +static int cxl_region_attach(struct cxl_region *cxlr, + struct cxl_endpoint_decoder *cxled, int pos) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_port *ep_port, *root_port, *iter; + struct cxl_region_params *p = &cxlr->params; + struct cxl_dport *dport; + int i, rc = -ENXIO; + + if (cxled->mode == CXL_DECODER_DEAD) { + dev_dbg(&cxlr->dev, "%s dead\n", dev_name(&cxled->cxld.dev)); + return -ENODEV; + } + + /* all full of members, or interleave config not established? */ + if (p->state > CXL_CONFIG_INTERLEAVE_ACTIVE) { + dev_dbg(&cxlr->dev, "region already active\n"); + return -EBUSY; + } else if (p->state < CXL_CONFIG_INTERLEAVE_ACTIVE) { + dev_dbg(&cxlr->dev, "interleave config missing\n"); + return -ENXIO; + } + + if (pos < 0 || pos >= p->interleave_ways) { + dev_dbg(&cxlr->dev, "position %d out of range %d\n", pos, + p->interleave_ways); + return -ENXIO; + } + + if (p->targets[pos] == cxled) + return 0; + + if (p->targets[pos]) { + struct cxl_endpoint_decoder *cxled_target = p->targets[pos]; + struct cxl_memdev *cxlmd_target = cxled_to_memdev(cxled_target); + + dev_dbg(&cxlr->dev, "position %d already assigned to %s:%s\n", + pos, dev_name(&cxlmd_target->dev), + dev_name(&cxled_target->cxld.dev)); + return -EBUSY; + } + + for (i = 0; i < p->interleave_ways; i++) { + struct cxl_endpoint_decoder *cxled_target; + struct cxl_memdev *cxlmd_target; + + cxled_target = p->targets[pos]; + if (!cxled_target) + continue; + + cxlmd_target = cxled_to_memdev(cxled_target); + if (cxlmd_target == cxlmd) { + dev_dbg(&cxlr->dev, + "%s already specified at position %d via: %s\n", + dev_name(&cxlmd->dev), pos, + dev_name(&cxled_target->cxld.dev)); + return -EBUSY; + } + } + + ep_port = cxled_to_port(cxled); + root_port = cxlrd_to_port(cxlrd); + dport = cxl_find_dport_by_dev(root_port, ep_port->host_bridge); + if (!dport) { + dev_dbg(&cxlr->dev, "%s:%s invalid target for %s\n", + dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), + dev_name(cxlr->dev.parent)); + return -ENXIO; + } + + if (cxlrd->calc_hb(cxlrd, pos) != dport) { + dev_dbg(&cxlr->dev, "%s:%s invalid target position for %s\n", + dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), + dev_name(&cxlrd->cxlsd.cxld.dev)); + return -ENXIO; + } + + if (cxled->cxld.target_type != cxlr->type) { + dev_dbg(&cxlr->dev, "%s:%s type mismatch: %d vs %d\n", + dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), + cxled->cxld.target_type, cxlr->type); + return -ENXIO; + } + + if (!cxled->dpa_res) { + dev_dbg(&cxlr->dev, "%s:%s: missing DPA allocation.\n", + dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev)); + return -ENXIO; + } + + if (resource_size(cxled->dpa_res) * p->interleave_ways != + resource_size(p->res)) { + dev_dbg(&cxlr->dev, + "%s:%s: decoder-size-%#llx * ways-%d != region-size-%#llx\n", + dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), + (u64)resource_size(cxled->dpa_res), p->interleave_ways, + (u64)resource_size(p->res)); + return -EINVAL; + } + + for (iter = ep_port; !is_cxl_root(iter); + iter = to_cxl_port(iter->dev.parent)) { + rc = cxl_port_attach_region(iter, cxlr, cxled, pos); + if (rc) + goto err; + } + + p->targets[pos] = cxled; + cxled->pos = pos; + p->nr_targets++; + + if (p->nr_targets == p->interleave_ways) { + rc = cxl_region_setup_targets(cxlr); + if (rc) + goto err_decrement; + p->state = CXL_CONFIG_ACTIVE; + } + + cxled->cxld.interleave_ways = p->interleave_ways; + cxled->cxld.interleave_granularity = p->interleave_granularity; + cxled->cxld.hpa_range = (struct range) { + .start = p->res->start, + .end = p->res->end, + }; + + return 0; + +err_decrement: + p->nr_targets--; +err: + for (iter = ep_port; !is_cxl_root(iter); + iter = to_cxl_port(iter->dev.parent)) + cxl_port_detach_region(iter, cxlr, cxled); + return rc; +} + +static int cxl_region_detach(struct cxl_endpoint_decoder *cxled) +{ + struct cxl_port *iter, *ep_port = cxled_to_port(cxled); + struct cxl_region *cxlr = cxled->cxld.region; + struct cxl_region_params *p; + int rc = 0; + + lockdep_assert_held_write(&cxl_region_rwsem); + + if (!cxlr) + return 0; + + p = &cxlr->params; + get_device(&cxlr->dev); + + if (p->state > CXL_CONFIG_ACTIVE) { + /* + * TODO: tear down all impacted regions if a device is + * removed out of order + */ + rc = cxl_region_decode_reset(cxlr, p->interleave_ways); + if (rc) + goto out; + p->state = CXL_CONFIG_ACTIVE; + } + + for (iter = ep_port; !is_cxl_root(iter); + iter = to_cxl_port(iter->dev.parent)) + cxl_port_detach_region(iter, cxlr, cxled); + + if (cxled->pos < 0 || cxled->pos >= p->interleave_ways || + p->targets[cxled->pos] != cxled) { + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + + dev_WARN_ONCE(&cxlr->dev, 1, "expected %s:%s at position %d\n", + dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), + cxled->pos); + goto out; + } + + if (p->state == CXL_CONFIG_ACTIVE) { + p->state = CXL_CONFIG_INTERLEAVE_ACTIVE; + cxl_region_teardown_targets(cxlr); + } + p->targets[cxled->pos] = NULL; + p->nr_targets--; + cxled->cxld.hpa_range = (struct range) { + .start = 0, + .end = -1, + }; + + /* notify the region driver that one of its targets has departed */ + up_write(&cxl_region_rwsem); + device_release_driver(&cxlr->dev); + down_write(&cxl_region_rwsem); +out: + put_device(&cxlr->dev); + return rc; +} + +void cxl_decoder_kill_region(struct cxl_endpoint_decoder *cxled) +{ + down_write(&cxl_region_rwsem); + cxled->mode = CXL_DECODER_DEAD; + cxl_region_detach(cxled); + up_write(&cxl_region_rwsem); +} + +static int attach_target(struct cxl_region *cxlr, const char *decoder, int pos) +{ + struct device *dev; + int rc; + + dev = bus_find_device_by_name(&cxl_bus_type, NULL, decoder); + if (!dev) + return -ENODEV; + + if (!is_endpoint_decoder(dev)) { + put_device(dev); + return -EINVAL; + } + + rc = down_write_killable(&cxl_region_rwsem); + if (rc) + goto out; + down_read(&cxl_dpa_rwsem); + rc = cxl_region_attach(cxlr, to_cxl_endpoint_decoder(dev), pos); + up_read(&cxl_dpa_rwsem); + up_write(&cxl_region_rwsem); +out: + put_device(dev); + return rc; +} + +static int detach_target(struct cxl_region *cxlr, int pos) +{ + struct cxl_region_params *p = &cxlr->params; + int rc; + + rc = down_write_killable(&cxl_region_rwsem); + if (rc) + return rc; + + if (pos >= p->interleave_ways) { + dev_dbg(&cxlr->dev, "position %d out of range %d\n", pos, + p->interleave_ways); + rc = -ENXIO; + goto out; + } + + if (!p->targets[pos]) { + rc = 0; + goto out; + } + + rc = cxl_region_detach(p->targets[pos]); +out: + up_write(&cxl_region_rwsem); + return rc; +} + +static size_t store_targetN(struct cxl_region *cxlr, const char *buf, int pos, + size_t len) +{ + int rc; + + if (sysfs_streq(buf, "\n")) + rc = detach_target(cxlr, pos); + else + rc = attach_target(cxlr, buf, pos); + + if (rc < 0) + return rc; + return len; +} + +#define TARGET_ATTR_RW(n) \ +static ssize_t target##n##_show( \ + struct device *dev, struct device_attribute *attr, char *buf) \ +{ \ + return show_targetN(to_cxl_region(dev), buf, (n)); \ +} \ +static ssize_t target##n##_store(struct device *dev, \ + struct device_attribute *attr, \ + const char *buf, size_t len) \ +{ \ + return store_targetN(to_cxl_region(dev), buf, (n), len); \ +} \ +static DEVICE_ATTR_RW(target##n) + +TARGET_ATTR_RW(0); +TARGET_ATTR_RW(1); +TARGET_ATTR_RW(2); +TARGET_ATTR_RW(3); +TARGET_ATTR_RW(4); +TARGET_ATTR_RW(5); +TARGET_ATTR_RW(6); +TARGET_ATTR_RW(7); +TARGET_ATTR_RW(8); +TARGET_ATTR_RW(9); +TARGET_ATTR_RW(10); +TARGET_ATTR_RW(11); +TARGET_ATTR_RW(12); +TARGET_ATTR_RW(13); +TARGET_ATTR_RW(14); +TARGET_ATTR_RW(15); + +static struct attribute *target_attrs[] = { + &dev_attr_target0.attr, + &dev_attr_target1.attr, + &dev_attr_target2.attr, + &dev_attr_target3.attr, + &dev_attr_target4.attr, + &dev_attr_target5.attr, + &dev_attr_target6.attr, + &dev_attr_target7.attr, + &dev_attr_target8.attr, + &dev_attr_target9.attr, + &dev_attr_target10.attr, + &dev_attr_target11.attr, + &dev_attr_target12.attr, + &dev_attr_target13.attr, + &dev_attr_target14.attr, + &dev_attr_target15.attr, + NULL, +}; + +static umode_t cxl_region_target_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct device *dev = kobj_to_dev(kobj); + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + + if (n < p->interleave_ways) + return a->mode; + return 0; +} + +static const struct attribute_group cxl_region_target_group = { + .attrs = target_attrs, + .is_visible = cxl_region_target_visible, +}; + +static const struct attribute_group *get_cxl_region_target_group(void) +{ + return &cxl_region_target_group; +} + +static const struct attribute_group *region_groups[] = { + &cxl_base_attribute_group, + &cxl_region_group, + &cxl_region_target_group, + NULL, +}; + +static void cxl_region_release(struct device *dev) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + + memregion_free(cxlr->id); + kfree(cxlr); +} + +const struct device_type cxl_region_type = { + .name = "cxl_region", + .release = cxl_region_release, + .groups = region_groups +}; + +bool is_cxl_region(struct device *dev) +{ + return dev->type == &cxl_region_type; +} +EXPORT_SYMBOL_NS_GPL(is_cxl_region, CXL); + +static struct cxl_region *to_cxl_region(struct device *dev) +{ + if (dev_WARN_ONCE(dev, dev->type != &cxl_region_type, + "not a cxl_region device\n")) + return NULL; + + return container_of(dev, struct cxl_region, dev); +} + +static void unregister_region(void *dev) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + + device_del(dev); + cxl_region_iomem_release(cxlr); + put_device(dev); +} + +static struct lock_class_key cxl_region_key; + +static struct cxl_region *cxl_region_alloc(struct cxl_root_decoder *cxlrd, int id) +{ + struct cxl_region *cxlr; + struct device *dev; + + cxlr = kzalloc(sizeof(*cxlr), GFP_KERNEL); + if (!cxlr) { + memregion_free(id); + return ERR_PTR(-ENOMEM); + } + + dev = &cxlr->dev; + device_initialize(dev); + lockdep_set_class(&dev->mutex, &cxl_region_key); + dev->parent = &cxlrd->cxlsd.cxld.dev; + device_set_pm_not_required(dev); + dev->bus = &cxl_bus_type; + dev->type = &cxl_region_type; + cxlr->id = id; + + return cxlr; +} + +/** + * devm_cxl_add_region - Adds a region to a decoder + * @cxlrd: root decoder + * @id: memregion id to create, or memregion_free() on failure + * @mode: mode for the endpoint decoders of this region + * @type: select whether this is an expander or accelerator (type-2 or type-3) + * + * This is the second step of region initialization. Regions exist within an + * address space which is mapped by a @cxlrd. + * + * Return: 0 if the region was added to the @cxlrd, else returns negative error + * code. The region will be named "regionZ" where Z is the unique region number. + */ +static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd, + int id, + enum cxl_decoder_mode mode, + enum cxl_decoder_type type) +{ + struct cxl_port *port = to_cxl_port(cxlrd->cxlsd.cxld.dev.parent); + struct cxl_region *cxlr; + struct device *dev; + int rc; + + cxlr = cxl_region_alloc(cxlrd, id); + if (IS_ERR(cxlr)) + return cxlr; + cxlr->mode = mode; + cxlr->type = type; + + dev = &cxlr->dev; + rc = dev_set_name(dev, "region%d", id); + if (rc) + goto err; + + rc = device_add(dev); + if (rc) + goto err; + + rc = devm_add_action_or_reset(port->uport, unregister_region, cxlr); + if (rc) + return ERR_PTR(rc); + + dev_dbg(port->uport, "%s: created %s\n", + dev_name(&cxlrd->cxlsd.cxld.dev), dev_name(dev)); + return cxlr; + +err: + put_device(dev); + return ERR_PTR(rc); +} + +static ssize_t create_pmem_region_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev); + + return sysfs_emit(buf, "region%u\n", atomic_read(&cxlrd->region_id)); +} + +static ssize_t create_pmem_region_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev); + struct cxl_region *cxlr; + int id, rc; + + rc = sscanf(buf, "region%d\n", &id); + if (rc != 1) + return -EINVAL; + + rc = memregion_alloc(GFP_KERNEL); + if (rc < 0) + return rc; + + if (atomic_cmpxchg(&cxlrd->region_id, id, rc) != id) { + memregion_free(rc); + return -EBUSY; + } + + cxlr = devm_cxl_add_region(cxlrd, id, CXL_DECODER_PMEM, + CXL_DECODER_EXPANDER); + if (IS_ERR(cxlr)) + return PTR_ERR(cxlr); + + return len; +} +DEVICE_ATTR_RW(create_pmem_region); + +static ssize_t region_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cxl_decoder *cxld = to_cxl_decoder(dev); + ssize_t rc; + + rc = down_read_interruptible(&cxl_region_rwsem); + if (rc) + return rc; + + if (cxld->region) + rc = sysfs_emit(buf, "%s\n", dev_name(&cxld->region->dev)); + else + rc = sysfs_emit(buf, "\n"); + up_read(&cxl_region_rwsem); + + return rc; +} +DEVICE_ATTR_RO(region); + +static struct cxl_region * +cxl_find_region_by_name(struct cxl_root_decoder *cxlrd, const char *name) +{ + struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld; + struct device *region_dev; + + region_dev = device_find_child_by_name(&cxld->dev, name); + if (!region_dev) + return ERR_PTR(-ENODEV); + + return to_cxl_region(region_dev); +} + +static ssize_t delete_region_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev); + struct cxl_port *port = to_cxl_port(dev->parent); + struct cxl_region *cxlr; + + cxlr = cxl_find_region_by_name(cxlrd, buf); + if (IS_ERR(cxlr)) + return PTR_ERR(cxlr); + + devm_release_action(port->uport, unregister_region, cxlr); + put_device(&cxlr->dev); + + return len; +} +DEVICE_ATTR_WO(delete_region); + +static void cxl_pmem_region_release(struct device *dev) +{ + struct cxl_pmem_region *cxlr_pmem = to_cxl_pmem_region(dev); + int i; + + for (i = 0; i < cxlr_pmem->nr_mappings; i++) { + struct cxl_memdev *cxlmd = cxlr_pmem->mapping[i].cxlmd; + + put_device(&cxlmd->dev); + } + + kfree(cxlr_pmem); +} + +static const struct attribute_group *cxl_pmem_region_attribute_groups[] = { + &cxl_base_attribute_group, + NULL, +}; + +const struct device_type cxl_pmem_region_type = { + .name = "cxl_pmem_region", + .release = cxl_pmem_region_release, + .groups = cxl_pmem_region_attribute_groups, +}; + +bool is_cxl_pmem_region(struct device *dev) +{ + return dev->type == &cxl_pmem_region_type; +} +EXPORT_SYMBOL_NS_GPL(is_cxl_pmem_region, CXL); + +struct cxl_pmem_region *to_cxl_pmem_region(struct device *dev) +{ + if (dev_WARN_ONCE(dev, !is_cxl_pmem_region(dev), + "not a cxl_pmem_region device\n")) + return NULL; + return container_of(dev, struct cxl_pmem_region, dev); +} +EXPORT_SYMBOL_NS_GPL(to_cxl_pmem_region, CXL); + +static struct lock_class_key cxl_pmem_region_key; + +static struct cxl_pmem_region *cxl_pmem_region_alloc(struct cxl_region *cxlr) +{ + struct cxl_region_params *p = &cxlr->params; + struct cxl_pmem_region *cxlr_pmem; + struct device *dev; + int i; + + down_read(&cxl_region_rwsem); + if (p->state != CXL_CONFIG_COMMIT) { + cxlr_pmem = ERR_PTR(-ENXIO); + goto out; + } + + cxlr_pmem = kzalloc(struct_size(cxlr_pmem, mapping, p->nr_targets), + GFP_KERNEL); + if (!cxlr_pmem) { + cxlr_pmem = ERR_PTR(-ENOMEM); + goto out; + } + + cxlr_pmem->hpa_range.start = p->res->start; + cxlr_pmem->hpa_range.end = p->res->end; + + /* Snapshot the region configuration underneath the cxl_region_rwsem */ + cxlr_pmem->nr_mappings = p->nr_targets; + for (i = 0; i < p->nr_targets; i++) { + struct cxl_endpoint_decoder *cxled = p->targets[i]; + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_pmem_region_mapping *m = &cxlr_pmem->mapping[i]; + + m->cxlmd = cxlmd; + get_device(&cxlmd->dev); + m->start = cxled->dpa_res->start; + m->size = resource_size(cxled->dpa_res); + m->position = i; + } + + dev = &cxlr_pmem->dev; + cxlr_pmem->cxlr = cxlr; + device_initialize(dev); + lockdep_set_class(&dev->mutex, &cxl_pmem_region_key); + device_set_pm_not_required(dev); + dev->parent = &cxlr->dev; + dev->bus = &cxl_bus_type; + dev->type = &cxl_pmem_region_type; +out: + up_read(&cxl_region_rwsem); + + return cxlr_pmem; +} + +static void cxlr_pmem_unregister(void *dev) +{ + device_unregister(dev); +} + +/** + * devm_cxl_add_pmem_region() - add a cxl_region-to-nd_region bridge + * @cxlr: parent CXL region for this pmem region bridge device + * + * Return: 0 on success negative error code on failure. + */ +static int devm_cxl_add_pmem_region(struct cxl_region *cxlr) +{ + struct cxl_pmem_region *cxlr_pmem; + struct device *dev; + int rc; + + cxlr_pmem = cxl_pmem_region_alloc(cxlr); + if (IS_ERR(cxlr_pmem)) + return PTR_ERR(cxlr_pmem); + + dev = &cxlr_pmem->dev; + rc = dev_set_name(dev, "pmem_region%d", cxlr->id); + if (rc) + goto err; + + rc = device_add(dev); + if (rc) + goto err; + + dev_dbg(&cxlr->dev, "%s: register %s\n", dev_name(dev->parent), + dev_name(dev)); + + return devm_add_action_or_reset(&cxlr->dev, cxlr_pmem_unregister, dev); + +err: + put_device(dev); + return rc; +} + +static int cxl_region_probe(struct device *dev) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + int rc; + + rc = down_read_interruptible(&cxl_region_rwsem); + if (rc) { + dev_dbg(&cxlr->dev, "probe interrupted\n"); + return rc; + } + + if (p->state < CXL_CONFIG_COMMIT) { + dev_dbg(&cxlr->dev, "config state: %d\n", p->state); + rc = -ENXIO; + } + + /* + * From this point on any path that changes the region's state away from + * CXL_CONFIG_COMMIT is also responsible for releasing the driver. + */ + up_read(&cxl_region_rwsem); + + switch (cxlr->mode) { + case CXL_DECODER_PMEM: + return devm_cxl_add_pmem_region(cxlr); + default: + dev_dbg(&cxlr->dev, "unsupported region mode: %d\n", + cxlr->mode); + return -ENXIO; + } +} + +static struct cxl_driver cxl_region_driver = { + .name = "cxl_region", + .probe = cxl_region_probe, + .id = CXL_DEVICE_REGION, +}; + +int cxl_region_init(void) +{ + return cxl_driver_register(&cxl_region_driver); +} + +void cxl_region_exit(void) +{ + cxl_driver_unregister(&cxl_region_driver); +} + +MODULE_IMPORT_NS(CXL); +MODULE_ALIAS_CXL(CXL_DEVICE_REGION); diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 6799b27c7db2..f680450f0b16 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -7,6 +7,7 @@ #include <linux/libnvdimm.h> #include <linux/bitfield.h> #include <linux/bitops.h> +#include <linux/log2.h> #include <linux/io.h> /** @@ -53,9 +54,12 @@ #define CXL_HDM_DECODER0_CTRL_LOCK BIT(8) #define CXL_HDM_DECODER0_CTRL_COMMIT BIT(9) #define CXL_HDM_DECODER0_CTRL_COMMITTED BIT(10) +#define CXL_HDM_DECODER0_CTRL_COMMIT_ERROR BIT(11) #define CXL_HDM_DECODER0_CTRL_TYPE BIT(12) #define CXL_HDM_DECODER0_TL_LOW(i) (0x20 * (i) + 0x24) #define CXL_HDM_DECODER0_TL_HIGH(i) (0x20 * (i) + 0x28) +#define CXL_HDM_DECODER0_SKIP_LOW(i) CXL_HDM_DECODER0_TL_LOW(i) +#define CXL_HDM_DECODER0_SKIP_HIGH(i) CXL_HDM_DECODER0_TL_HIGH(i) static inline int cxl_hdm_decoder_count(u32 cap_hdr) { @@ -64,6 +68,57 @@ static inline int cxl_hdm_decoder_count(u32 cap_hdr) return val ? val * 2 : 1; } +/* Encode defined in CXL 2.0 8.2.5.12.7 HDM Decoder Control Register */ +static inline int cxl_to_granularity(u16 ig, unsigned int *val) +{ + if (ig > 6) + return -EINVAL; + *val = 256 << ig; + return 0; +} + +/* Encode defined in CXL ECN "3, 6, 12 and 16-way memory Interleaving" */ +static inline int cxl_to_ways(u8 eniw, unsigned int *val) +{ + switch (eniw) { + case 0 ... 4: + *val = 1 << eniw; + break; + case 8 ... 10: + *val = 3 << (eniw - 8); + break; + default: + return -EINVAL; + } + + return 0; +} + +static inline int granularity_to_cxl(int g, u16 *ig) +{ + if (g > SZ_16K || g < 256 || !is_power_of_2(g)) + return -EINVAL; + *ig = ilog2(g) - 8; + return 0; +} + +static inline int ways_to_cxl(unsigned int ways, u8 *iw) +{ + if (ways > 16) + return -EINVAL; + if (is_power_of_2(ways)) { + *iw = ilog2(ways); + return 0; + } + if (ways % 3) + return -EINVAL; + ways /= 3; + if (!is_power_of_2(ways)) + return -EINVAL; + *iw = ilog2(ways) + 8; + return 0; +} + /* CXL 2.0 8.2.8.1 Device Capabilities Array Register */ #define CXLDEV_CAP_ARRAY_OFFSET 0x0 #define CXLDEV_CAP_ARRAY_CAP_ID 0 @@ -193,31 +248,77 @@ enum cxl_decoder_type { */ #define CXL_DECODER_MAX_INTERLEAVE 16 +#define CXL_DECODER_MIN_GRANULARITY 256 + /** - * struct cxl_decoder - CXL address range decode configuration + * struct cxl_decoder - Common CXL HDM Decoder Attributes * @dev: this decoder's device * @id: kernel device name id - * @platform_res: address space resources considered by root decoder - * @decoder_range: address space resources considered by midlevel decoder + * @hpa_range: Host physical address range mapped by this decoder * @interleave_ways: number of cxl_dports in this decode * @interleave_granularity: data stride per dport * @target_type: accelerator vs expander (type2 vs type3) selector + * @region: currently assigned region for this decoder * @flags: memory type capabilities and locking - * @target_lock: coordinate coherent reads of the target list - * @nr_targets: number of elements in @target - * @target: active ordered target list in current decoder configuration - */ + * @commit: device/decoder-type specific callback to commit settings to hw + * @reset: device/decoder-type specific callback to reset hw settings +*/ struct cxl_decoder { struct device dev; int id; - union { - struct resource platform_res; - struct range decoder_range; - }; + struct range hpa_range; int interleave_ways; int interleave_granularity; enum cxl_decoder_type target_type; + struct cxl_region *region; unsigned long flags; + int (*commit)(struct cxl_decoder *cxld); + int (*reset)(struct cxl_decoder *cxld); +}; + +/* + * CXL_DECODER_DEAD prevents endpoints from being reattached to regions + * while cxld_unregister() is running + */ +enum cxl_decoder_mode { + CXL_DECODER_NONE, + CXL_DECODER_RAM, + CXL_DECODER_PMEM, + CXL_DECODER_MIXED, + CXL_DECODER_DEAD, +}; + +/** + * struct cxl_endpoint_decoder - Endpoint / SPA to DPA decoder + * @cxld: base cxl_decoder_object + * @dpa_res: actively claimed DPA span of this decoder + * @skip: offset into @dpa_res where @cxld.hpa_range maps + * @mode: which memory type / access-mode-partition this decoder targets + * @pos: interleave position in @cxld.region + */ +struct cxl_endpoint_decoder { + struct cxl_decoder cxld; + struct resource *dpa_res; + resource_size_t skip; + enum cxl_decoder_mode mode; + int pos; +}; + +/** + * struct cxl_switch_decoder - Switch specific CXL HDM Decoder + * @cxld: base cxl_decoder object + * @target_lock: coordinate coherent reads of the target list + * @nr_targets: number of elements in @target + * @target: active ordered target list in current decoder configuration + * + * The 'switch' decoder type represents the decoder instances of cxl_port's that + * route from the root of a CXL memory decode topology to the endpoints. They + * come in two flavors, root-level decoders, statically defined by platform + * firmware, and mid-level decoders, where interleave-granularity, + * interleave-width, and the target list are mutable. + */ +struct cxl_switch_decoder { + struct cxl_decoder cxld; seqlock_t target_lock; int nr_targets; struct cxl_dport *target[]; @@ -225,6 +326,76 @@ struct cxl_decoder { /** + * struct cxl_root_decoder - Static platform CXL address decoder + * @res: host / parent resource for region allocations + * @region_id: region id for next region provisioning event + * @calc_hb: which host bridge covers the n'th position by granularity + * @cxlsd: base cxl switch decoder + */ +struct cxl_root_decoder { + struct resource *res; + atomic_t region_id; + struct cxl_dport *(*calc_hb)(struct cxl_root_decoder *cxlrd, int pos); + struct cxl_switch_decoder cxlsd; +}; + +/* + * enum cxl_config_state - State machine for region configuration + * @CXL_CONFIG_IDLE: Any sysfs attribute can be written freely + * @CXL_CONFIG_INTERLEAVE_ACTIVE: region size has been set, no more + * changes to interleave_ways or interleave_granularity + * @CXL_CONFIG_ACTIVE: All targets have been added the region is now + * active + * @CXL_CONFIG_RESET_PENDING: see commit_store() + * @CXL_CONFIG_COMMIT: Soft-config has been committed to hardware + */ +enum cxl_config_state { + CXL_CONFIG_IDLE, + CXL_CONFIG_INTERLEAVE_ACTIVE, + CXL_CONFIG_ACTIVE, + CXL_CONFIG_RESET_PENDING, + CXL_CONFIG_COMMIT, +}; + +/** + * struct cxl_region_params - region settings + * @state: allow the driver to lockdown further parameter changes + * @uuid: unique id for persistent regions + * @interleave_ways: number of endpoints in the region + * @interleave_granularity: capacity each endpoint contributes to a stripe + * @res: allocated iomem capacity for this region + * @targets: active ordered targets in current decoder configuration + * @nr_targets: number of targets + * + * State transitions are protected by the cxl_region_rwsem + */ +struct cxl_region_params { + enum cxl_config_state state; + uuid_t uuid; + int interleave_ways; + int interleave_granularity; + struct resource *res; + struct cxl_endpoint_decoder *targets[CXL_DECODER_MAX_INTERLEAVE]; + int nr_targets; +}; + +/** + * struct cxl_region - CXL region + * @dev: This region's device + * @id: This region's id. Id is globally unique across all regions + * @mode: Endpoint decoder allocation / access mode + * @type: Endpoint decoder target type + * @params: active + config params for the region + */ +struct cxl_region { + struct device dev; + int id; + enum cxl_decoder_mode mode; + enum cxl_decoder_type type; + struct cxl_region_params params; +}; + +/** * enum cxl_nvdimm_brige_state - state machine for managing bus rescans * @CXL_NVB_NEW: Set at bridge create and after cxl_pmem_wq is destroyed * @CXL_NVB_DEAD: Set at brige unregistration to preclude async probing @@ -251,7 +422,26 @@ struct cxl_nvdimm_bridge { struct cxl_nvdimm { struct device dev; struct cxl_memdev *cxlmd; - struct nvdimm *nvdimm; + struct cxl_nvdimm_bridge *bridge; + struct cxl_pmem_region *region; +}; + +struct cxl_pmem_region_mapping { + struct cxl_memdev *cxlmd; + struct cxl_nvdimm *cxl_nvd; + u64 start; + u64 size; + int position; +}; + +struct cxl_pmem_region { + struct device dev; + struct cxl_region *cxlr; + struct nd_region *nd_region; + struct cxl_nvdimm_bridge *bridge; + struct range hpa_range; + int nr_mappings; + struct cxl_pmem_region_mapping mapping[]; }; /** @@ -260,50 +450,94 @@ struct cxl_nvdimm { * decode hierarchy. * @dev: this port's device * @uport: PCI or platform device implementing the upstream port capability + * @host_bridge: Shortcut to the platform attach point for this port * @id: id for port device-name * @dports: cxl_dport instances referenced by decoders * @endpoints: cxl_ep instances, endpoints that are a descendant of this port + * @regions: cxl_region_ref instances, regions mapped by this port + * @parent_dport: dport that points to this port in the parent * @decoder_ida: allocator for decoder ids + * @hdm_end: track last allocated HDM decoder instance for allocation ordering + * @commit_end: cursor to track highest committed decoder for commit ordering * @component_reg_phys: component register capability base address (optional) * @dead: last ep has been removed, force port re-creation * @depth: How deep this port is relative to the root. depth 0 is the root. + * @cdat: Cached CDAT data + * @cdat_available: Should a CDAT attribute be available in sysfs */ struct cxl_port { struct device dev; struct device *uport; + struct device *host_bridge; int id; - struct list_head dports; - struct list_head endpoints; + struct xarray dports; + struct xarray endpoints; + struct xarray regions; + struct cxl_dport *parent_dport; struct ida decoder_ida; + int hdm_end; + int commit_end; resource_size_t component_reg_phys; bool dead; unsigned int depth; + struct cxl_cdat { + void *table; + size_t length; + } cdat; + bool cdat_available; }; +static inline struct cxl_dport * +cxl_find_dport_by_dev(struct cxl_port *port, const struct device *dport_dev) +{ + return xa_load(&port->dports, (unsigned long)dport_dev); +} + /** * struct cxl_dport - CXL downstream port * @dport: PCI bridge or firmware device representing the downstream link * @port_id: unique hardware identifier for dport in decoder target list * @component_reg_phys: downstream port component registers * @port: reference to cxl_port that contains this downstream port - * @list: node for a cxl_port's list of cxl_dport instances */ struct cxl_dport { struct device *dport; int port_id; resource_size_t component_reg_phys; struct cxl_port *port; - struct list_head list; }; /** * struct cxl_ep - track an endpoint's interest in a port * @ep: device that hosts a generic CXL endpoint (expander or accelerator) - * @list: node on port->endpoints list + * @dport: which dport routes to this endpoint on @port + * @next: cxl switch port across the link attached to @dport NULL if + * attached to an endpoint */ struct cxl_ep { struct device *ep; - struct list_head list; + struct cxl_dport *dport; + struct cxl_port *next; +}; + +/** + * struct cxl_region_ref - track a region's interest in a port + * @port: point in topology to install this reference + * @decoder: decoder assigned for @region in @port + * @region: region for this reference + * @endpoints: cxl_ep references for region members beneath @port + * @nr_targets_set: track how many targets have been programmed during setup + * @nr_eps: number of endpoints beneath @port + * @nr_targets: number of distinct targets needed to reach @nr_eps + */ +struct cxl_region_ref { + struct cxl_port *port; + struct cxl_decoder *decoder; + struct cxl_region *region; + struct xarray endpoints; + int nr_targets_set; + int nr_eps; + int nr_targets; }; /* @@ -325,29 +559,31 @@ int devm_cxl_register_pci_bus(struct device *host, struct device *uport, struct pci_bus *cxl_port_to_pci_bus(struct cxl_port *port); struct cxl_port *devm_cxl_add_port(struct device *host, struct device *uport, resource_size_t component_reg_phys, - struct cxl_port *parent_port); + struct cxl_dport *parent_dport); +int devm_cxl_add_endpoint(struct cxl_memdev *cxlmd, + struct cxl_dport *parent_dport); struct cxl_port *find_cxl_root(struct device *dev); int devm_cxl_enumerate_ports(struct cxl_memdev *cxlmd); int cxl_bus_rescan(void); -struct cxl_port *cxl_mem_find_port(struct cxl_memdev *cxlmd); +struct cxl_port *cxl_mem_find_port(struct cxl_memdev *cxlmd, + struct cxl_dport **dport); bool schedule_cxl_memdev_detach(struct cxl_memdev *cxlmd); struct cxl_dport *devm_cxl_add_dport(struct cxl_port *port, struct device *dport, int port_id, resource_size_t component_reg_phys); -struct cxl_dport *cxl_find_dport_by_dev(struct cxl_port *port, - const struct device *dev); struct cxl_decoder *to_cxl_decoder(struct device *dev); +struct cxl_root_decoder *to_cxl_root_decoder(struct device *dev); +struct cxl_endpoint_decoder *to_cxl_endpoint_decoder(struct device *dev); bool is_root_decoder(struct device *dev); bool is_endpoint_decoder(struct device *dev); -bool is_cxl_decoder(struct device *dev); -struct cxl_decoder *cxl_root_decoder_alloc(struct cxl_port *port, - unsigned int nr_targets); -struct cxl_decoder *cxl_switch_decoder_alloc(struct cxl_port *port, - unsigned int nr_targets); +struct cxl_root_decoder *cxl_root_decoder_alloc(struct cxl_port *port, + unsigned int nr_targets); +struct cxl_switch_decoder *cxl_switch_decoder_alloc(struct cxl_port *port, + unsigned int nr_targets); int cxl_decoder_add(struct cxl_decoder *cxld, int *target_map); -struct cxl_decoder *cxl_endpoint_decoder_alloc(struct cxl_port *port); +struct cxl_endpoint_decoder *cxl_endpoint_decoder_alloc(struct cxl_port *port); int cxl_decoder_add_locked(struct cxl_decoder *cxld, int *target_map); int cxl_decoder_autoremove(struct device *host, struct cxl_decoder *cxld); int cxl_endpoint_autoremove(struct cxl_memdev *cxlmd, struct cxl_port *endpoint); @@ -357,6 +593,8 @@ struct cxl_hdm *devm_cxl_setup_hdm(struct cxl_port *port); int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm); int devm_cxl_add_passthrough_decoder(struct cxl_port *port); +bool is_cxl_region(struct device *dev); + extern struct bus_type cxl_bus_type; struct cxl_driver { @@ -385,6 +623,8 @@ void cxl_driver_unregister(struct cxl_driver *cxl_drv); #define CXL_DEVICE_PORT 3 #define CXL_DEVICE_ROOT 4 #define CXL_DEVICE_MEMORY_EXPANDER 5 +#define CXL_DEVICE_REGION 6 +#define CXL_DEVICE_PMEM_REGION 7 #define MODULE_ALIAS_CXL(type) MODULE_ALIAS("cxl:t" __stringify(type) "*") #define CXL_MODALIAS_FMT "cxl:t%d" @@ -396,7 +636,21 @@ struct cxl_nvdimm *to_cxl_nvdimm(struct device *dev); bool is_cxl_nvdimm(struct device *dev); bool is_cxl_nvdimm_bridge(struct device *dev); int devm_cxl_add_nvdimm(struct device *host, struct cxl_memdev *cxlmd); -struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct cxl_nvdimm *cxl_nvd); +struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct device *dev); + +#ifdef CONFIG_CXL_REGION +bool is_cxl_pmem_region(struct device *dev); +struct cxl_pmem_region *to_cxl_pmem_region(struct device *dev); +#else +static inline bool is_cxl_pmem_region(struct device *dev) +{ + return false; +} +static inline struct cxl_pmem_region *to_cxl_pmem_region(struct device *dev) +{ + return NULL; +} +#endif /* * Unit test builds overrides this to __weak, find the 'strong' version diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 7df0b053373a..88e3a8e54b6a 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -50,6 +50,24 @@ static inline struct cxl_memdev *to_cxl_memdev(struct device *dev) return container_of(dev, struct cxl_memdev, dev); } +static inline struct cxl_port *cxled_to_port(struct cxl_endpoint_decoder *cxled) +{ + return to_cxl_port(cxled->cxld.dev.parent); +} + +static inline struct cxl_port *cxlrd_to_port(struct cxl_root_decoder *cxlrd) +{ + return to_cxl_port(cxlrd->cxlsd.cxld.dev.parent); +} + +static inline struct cxl_memdev * +cxled_to_memdev(struct cxl_endpoint_decoder *cxled) +{ + struct cxl_port *port = to_cxl_port(cxled->cxld.dev.parent); + + return to_cxl_memdev(port->uport); +} + bool is_cxl_memdev(struct device *dev); static inline bool is_cxl_endpoint(struct cxl_port *port) { @@ -178,8 +196,9 @@ struct cxl_endpoint_dvsec_info { * @firmware_version: Firmware version for the memory device. * @enabled_cmds: Hardware commands found enabled in CEL. * @exclusive_cmds: Commands that are kernel-internal only - * @pmem_range: Active Persistent memory capacity configuration - * @ram_range: Active Volatile memory capacity configuration + * @dpa_res: Overall DPA resource tree for the device + * @pmem_res: Active Persistent memory capacity configuration + * @ram_res: Active Volatile memory capacity configuration * @total_bytes: sum of all possible capacities * @volatile_only_bytes: hard volatile capacity * @persistent_only_bytes: hard persistent capacity @@ -191,6 +210,7 @@ struct cxl_endpoint_dvsec_info { * @component_reg_phys: register base of component registers * @info: Cached DVSEC information about the device. * @serial: PCIe Device Serial Number + * @doe_mbs: PCI DOE mailbox array * @mbox_send: @dev specific transport for transmitting mailbox commands * * See section 8.2.9.5.2 Capacity Configuration and Label Storage for @@ -209,8 +229,9 @@ struct cxl_dev_state { DECLARE_BITMAP(enabled_cmds, CXL_MEM_COMMAND_ID_MAX); DECLARE_BITMAP(exclusive_cmds, CXL_MEM_COMMAND_ID_MAX); - struct range pmem_range; - struct range ram_range; + struct resource dpa_res; + struct resource pmem_res; + struct resource ram_res; u64 total_bytes; u64 volatile_only_bytes; u64 persistent_only_bytes; @@ -224,6 +245,8 @@ struct cxl_dev_state { resource_size_t component_reg_phys; u64 serial; + struct xarray doe_mbs; + int (*mbox_send)(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd *cmd); }; @@ -299,6 +322,13 @@ struct cxl_mbox_identify { u8 qos_telemetry_caps; } __packed; +struct cxl_mbox_get_partition_info { + __le64 active_volatile_cap; + __le64 active_persistent_cap; + __le64 next_volatile_cap; + __le64 next_persistent_cap; +} __packed; + struct cxl_mbox_get_lsa { __le32 offset; __le32 length; @@ -370,4 +400,8 @@ struct cxl_hdm { unsigned int interleave_mask; struct cxl_port *port; }; + +struct seq_file; +struct dentry *cxl_debugfs_create_dir(const char *dir); +void cxl_dpa_debug(struct seq_file *file, struct cxl_dev_state *cxlds); #endif /* __CXL_MEM_H__ */ diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h index fce1c11729c2..eec597dbe763 100644 --- a/drivers/cxl/cxlpci.h +++ b/drivers/cxl/cxlpci.h @@ -74,4 +74,5 @@ static inline resource_size_t cxl_regmap_to_base(struct pci_dev *pdev, int devm_cxl_port_enumerate_dports(struct cxl_port *port); struct cxl_dev_state; int cxl_hdm_decode_init(struct cxl_dev_state *cxlds, struct cxl_hdm *cxlhdm); +void read_cdat_data(struct cxl_port *port); #endif /* __CXL_PCI_H__ */ diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c index a979d0b484d5..64ccf053d32c 100644 --- a/drivers/cxl/mem.c +++ b/drivers/cxl/mem.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright(c) 2022 Intel Corporation. All rights reserved. */ +#include <linux/debugfs.h> #include <linux/device.h> #include <linux/module.h> #include <linux/pci.h> @@ -24,42 +25,32 @@ * in higher level operations. */ -static int create_endpoint(struct cxl_memdev *cxlmd, - struct cxl_port *parent_port) +static void enable_suspend(void *data) { - struct cxl_dev_state *cxlds = cxlmd->cxlds; - struct cxl_port *endpoint; - int rc; + cxl_mem_active_dec(); +} - endpoint = devm_cxl_add_port(&parent_port->dev, &cxlmd->dev, - cxlds->component_reg_phys, parent_port); - if (IS_ERR(endpoint)) - return PTR_ERR(endpoint); +static void remove_debugfs(void *dentry) +{ + debugfs_remove_recursive(dentry); +} - dev_dbg(&cxlmd->dev, "add: %s\n", dev_name(&endpoint->dev)); +static int cxl_mem_dpa_show(struct seq_file *file, void *data) +{ + struct device *dev = file->private; + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); - rc = cxl_endpoint_autoremove(cxlmd, endpoint); - if (rc) - return rc; - - if (!endpoint->dev.driver) { - dev_err(&cxlmd->dev, "%s failed probe\n", - dev_name(&endpoint->dev)); - return -ENXIO; - } + cxl_dpa_debug(file, cxlmd->cxlds); return 0; } -static void enable_suspend(void *data) -{ - cxl_mem_active_dec(); -} - static int cxl_mem_probe(struct device *dev) { struct cxl_memdev *cxlmd = to_cxl_memdev(dev); struct cxl_port *parent_port; + struct cxl_dport *dport; + struct dentry *dentry; int rc; /* @@ -73,11 +64,17 @@ static int cxl_mem_probe(struct device *dev) if (work_pending(&cxlmd->detach_work)) return -EBUSY; + dentry = cxl_debugfs_create_dir(dev_name(dev)); + debugfs_create_devm_seqfile(dev, "dpamem", dentry, cxl_mem_dpa_show); + rc = devm_add_action_or_reset(dev, remove_debugfs, dentry); + if (rc) + return rc; + rc = devm_cxl_enumerate_ports(cxlmd); if (rc) return rc; - parent_port = cxl_mem_find_port(cxlmd); + parent_port = cxl_mem_find_port(cxlmd, &dport); if (!parent_port) { dev_err(dev, "CXL port topology not found\n"); return -ENXIO; @@ -91,7 +88,7 @@ static int cxl_mem_probe(struct device *dev) goto unlock; } - rc = create_endpoint(cxlmd, parent_port); + rc = devm_cxl_add_endpoint(cxlmd, dport); unlock: device_unlock(&parent_port->dev); put_device(&parent_port->dev); diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index 5a0ae46d4989..faeb5d9d7a7a 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -8,6 +8,7 @@ #include <linux/mutex.h> #include <linux/list.h> #include <linux/pci.h> +#include <linux/pci-doe.h> #include <linux/io.h> #include "cxlmem.h" #include "cxlpci.h" @@ -386,6 +387,47 @@ static int cxl_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type, return rc; } +static void cxl_pci_destroy_doe(void *mbs) +{ + xa_destroy(mbs); +} + +static void devm_cxl_pci_create_doe(struct cxl_dev_state *cxlds) +{ + struct device *dev = cxlds->dev; + struct pci_dev *pdev = to_pci_dev(dev); + u16 off = 0; + + xa_init(&cxlds->doe_mbs); + if (devm_add_action(&pdev->dev, cxl_pci_destroy_doe, &cxlds->doe_mbs)) { + dev_err(dev, "Failed to create XArray for DOE's\n"); + return; + } + + /* + * Mailbox creation is best effort. Higher layers must determine if + * the lack of a mailbox for their protocol is a device failure or not. + */ + pci_doe_for_each_off(pdev, off) { + struct pci_doe_mb *doe_mb; + + doe_mb = pcim_doe_create_mb(pdev, off); + if (IS_ERR(doe_mb)) { + dev_err(dev, "Failed to create MB object for MB @ %x\n", + off); + continue; + } + + if (xa_insert(&cxlds->doe_mbs, off, doe_mb, GFP_KERNEL)) { + dev_err(dev, "xa_insert failed to insert MB @ %x\n", + off); + continue; + } + + dev_dbg(dev, "Created DOE mailbox @%x\n", off); + } +} + static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct cxl_register_map map; @@ -434,6 +476,8 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) cxlds->component_reg_phys = cxl_regmap_to_base(pdev, &map); + devm_cxl_pci_create_doe(cxlds); + rc = cxl_pci_setup_mailbox(cxlds); if (rc) return rc; @@ -454,7 +498,7 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (IS_ERR(cxlmd)) return PTR_ERR(cxlmd); - if (range_len(&cxlds->pmem_range) && IS_ENABLED(CONFIG_CXL_PMEM)) + if (resource_size(&cxlds->pmem_res) && IS_ENABLED(CONFIG_CXL_PMEM)) rc = devm_cxl_add_nvdimm(&pdev->dev, cxlmd); return rc; diff --git a/drivers/cxl/pmem.c b/drivers/cxl/pmem.c index 0aaa70b4e0f7..7dc0a2fa1a6b 100644 --- a/drivers/cxl/pmem.c +++ b/drivers/cxl/pmem.c @@ -7,6 +7,7 @@ #include <linux/ndctl.h> #include <linux/async.h> #include <linux/slab.h> +#include <linux/nd.h> #include "cxlmem.h" #include "cxl.h" @@ -26,7 +27,23 @@ static void clear_exclusive(void *cxlds) static void unregister_nvdimm(void *nvdimm) { + struct cxl_nvdimm *cxl_nvd = nvdimm_provider_data(nvdimm); + struct cxl_nvdimm_bridge *cxl_nvb = cxl_nvd->bridge; + struct cxl_pmem_region *cxlr_pmem; + + device_lock(&cxl_nvb->dev); + cxlr_pmem = cxl_nvd->region; + dev_set_drvdata(&cxl_nvd->dev, NULL); + cxl_nvd->region = NULL; + device_unlock(&cxl_nvb->dev); + + if (cxlr_pmem) { + device_release_driver(&cxlr_pmem->dev); + put_device(&cxlr_pmem->dev); + } + nvdimm_delete(nvdimm); + cxl_nvd->bridge = NULL; } static int cxl_nvdimm_probe(struct device *dev) @@ -39,7 +56,7 @@ static int cxl_nvdimm_probe(struct device *dev) struct nvdimm *nvdimm; int rc; - cxl_nvb = cxl_find_nvdimm_bridge(cxl_nvd); + cxl_nvb = cxl_find_nvdimm_bridge(dev); if (!cxl_nvb) return -ENXIO; @@ -66,6 +83,7 @@ static int cxl_nvdimm_probe(struct device *dev) } dev_set_drvdata(dev, nvdimm); + cxl_nvd->bridge = cxl_nvb; rc = devm_add_action_or_reset(dev, unregister_nvdimm, nvdimm); out: device_unlock(&cxl_nvb->dev); @@ -204,15 +222,38 @@ static bool online_nvdimm_bus(struct cxl_nvdimm_bridge *cxl_nvb) return cxl_nvb->nvdimm_bus != NULL; } -static int cxl_nvdimm_release_driver(struct device *dev, void *data) +static int cxl_nvdimm_release_driver(struct device *dev, void *cxl_nvb) { + struct cxl_nvdimm *cxl_nvd; + if (!is_cxl_nvdimm(dev)) return 0; + + cxl_nvd = to_cxl_nvdimm(dev); + if (cxl_nvd->bridge != cxl_nvb) + return 0; + device_release_driver(dev); return 0; } -static void offline_nvdimm_bus(struct nvdimm_bus *nvdimm_bus) +static int cxl_pmem_region_release_driver(struct device *dev, void *cxl_nvb) +{ + struct cxl_pmem_region *cxlr_pmem; + + if (!is_cxl_pmem_region(dev)) + return 0; + + cxlr_pmem = to_cxl_pmem_region(dev); + if (cxlr_pmem->bridge != cxl_nvb) + return 0; + + device_release_driver(dev); + return 0; +} + +static void offline_nvdimm_bus(struct cxl_nvdimm_bridge *cxl_nvb, + struct nvdimm_bus *nvdimm_bus) { if (!nvdimm_bus) return; @@ -222,7 +263,10 @@ static void offline_nvdimm_bus(struct nvdimm_bus *nvdimm_bus) * nvdimm_bus_unregister() rips the nvdimm objects out from * underneath them. */ - bus_for_each_dev(&cxl_bus_type, NULL, NULL, cxl_nvdimm_release_driver); + bus_for_each_dev(&cxl_bus_type, NULL, cxl_nvb, + cxl_pmem_region_release_driver); + bus_for_each_dev(&cxl_bus_type, NULL, cxl_nvb, + cxl_nvdimm_release_driver); nvdimm_bus_unregister(nvdimm_bus); } @@ -260,7 +304,7 @@ static void cxl_nvb_update_state(struct work_struct *work) dev_dbg(&cxl_nvb->dev, "rescan: %d\n", rc); } - offline_nvdimm_bus(victim_bus); + offline_nvdimm_bus(cxl_nvb, victim_bus); put_device(&cxl_nvb->dev); } @@ -315,6 +359,203 @@ static struct cxl_driver cxl_nvdimm_bridge_driver = { .id = CXL_DEVICE_NVDIMM_BRIDGE, }; +static int match_cxl_nvdimm(struct device *dev, void *data) +{ + return is_cxl_nvdimm(dev); +} + +static void unregister_nvdimm_region(void *nd_region) +{ + struct cxl_nvdimm_bridge *cxl_nvb; + struct cxl_pmem_region *cxlr_pmem; + int i; + + cxlr_pmem = nd_region_provider_data(nd_region); + cxl_nvb = cxlr_pmem->bridge; + device_lock(&cxl_nvb->dev); + for (i = 0; i < cxlr_pmem->nr_mappings; i++) { + struct cxl_pmem_region_mapping *m = &cxlr_pmem->mapping[i]; + struct cxl_nvdimm *cxl_nvd = m->cxl_nvd; + + if (cxl_nvd->region) { + put_device(&cxlr_pmem->dev); + cxl_nvd->region = NULL; + } + } + device_unlock(&cxl_nvb->dev); + + nvdimm_region_delete(nd_region); +} + +static void cxlr_pmem_remove_resource(void *res) +{ + remove_resource(res); +} + +struct cxl_pmem_region_info { + u64 offset; + u64 serial; +}; + +static int cxl_pmem_region_probe(struct device *dev) +{ + struct nd_mapping_desc mappings[CXL_DECODER_MAX_INTERLEAVE]; + struct cxl_pmem_region *cxlr_pmem = to_cxl_pmem_region(dev); + struct cxl_region *cxlr = cxlr_pmem->cxlr; + struct cxl_pmem_region_info *info = NULL; + struct cxl_nvdimm_bridge *cxl_nvb; + struct nd_interleave_set *nd_set; + struct nd_region_desc ndr_desc; + struct cxl_nvdimm *cxl_nvd; + struct nvdimm *nvdimm; + struct resource *res; + int rc, i = 0; + + cxl_nvb = cxl_find_nvdimm_bridge(&cxlr_pmem->mapping[0].cxlmd->dev); + if (!cxl_nvb) { + dev_dbg(dev, "bridge not found\n"); + return -ENXIO; + } + cxlr_pmem->bridge = cxl_nvb; + + device_lock(&cxl_nvb->dev); + if (!cxl_nvb->nvdimm_bus) { + dev_dbg(dev, "nvdimm bus not found\n"); + rc = -ENXIO; + goto err; + } + + memset(&mappings, 0, sizeof(mappings)); + memset(&ndr_desc, 0, sizeof(ndr_desc)); + + res = devm_kzalloc(dev, sizeof(*res), GFP_KERNEL); + if (!res) { + rc = -ENOMEM; + goto err; + } + + res->name = "Persistent Memory"; + res->start = cxlr_pmem->hpa_range.start; + res->end = cxlr_pmem->hpa_range.end; + res->flags = IORESOURCE_MEM; + res->desc = IORES_DESC_PERSISTENT_MEMORY; + + rc = insert_resource(&iomem_resource, res); + if (rc) + goto err; + + rc = devm_add_action_or_reset(dev, cxlr_pmem_remove_resource, res); + if (rc) + goto err; + + ndr_desc.res = res; + ndr_desc.provider_data = cxlr_pmem; + + ndr_desc.numa_node = memory_add_physaddr_to_nid(res->start); + ndr_desc.target_node = phys_to_target_node(res->start); + if (ndr_desc.target_node == NUMA_NO_NODE) { + ndr_desc.target_node = ndr_desc.numa_node; + dev_dbg(&cxlr->dev, "changing target node from %d to %d", + NUMA_NO_NODE, ndr_desc.target_node); + } + + nd_set = devm_kzalloc(dev, sizeof(*nd_set), GFP_KERNEL); + if (!nd_set) { + rc = -ENOMEM; + goto err; + } + + ndr_desc.memregion = cxlr->id; + set_bit(ND_REGION_CXL, &ndr_desc.flags); + set_bit(ND_REGION_PERSIST_MEMCTRL, &ndr_desc.flags); + + info = kmalloc_array(cxlr_pmem->nr_mappings, sizeof(*info), GFP_KERNEL); + if (!info) { + rc = -ENOMEM; + goto err; + } + + for (i = 0; i < cxlr_pmem->nr_mappings; i++) { + struct cxl_pmem_region_mapping *m = &cxlr_pmem->mapping[i]; + struct cxl_memdev *cxlmd = m->cxlmd; + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct device *d; + + d = device_find_child(&cxlmd->dev, NULL, match_cxl_nvdimm); + if (!d) { + dev_dbg(dev, "[%d]: %s: no cxl_nvdimm found\n", i, + dev_name(&cxlmd->dev)); + rc = -ENODEV; + goto err; + } + + /* safe to drop ref now with bridge lock held */ + put_device(d); + + cxl_nvd = to_cxl_nvdimm(d); + nvdimm = dev_get_drvdata(&cxl_nvd->dev); + if (!nvdimm) { + dev_dbg(dev, "[%d]: %s: no nvdimm found\n", i, + dev_name(&cxlmd->dev)); + rc = -ENODEV; + goto err; + } + cxl_nvd->region = cxlr_pmem; + get_device(&cxlr_pmem->dev); + m->cxl_nvd = cxl_nvd; + mappings[i] = (struct nd_mapping_desc) { + .nvdimm = nvdimm, + .start = m->start, + .size = m->size, + .position = i, + }; + info[i].offset = m->start; + info[i].serial = cxlds->serial; + } + ndr_desc.num_mappings = cxlr_pmem->nr_mappings; + ndr_desc.mapping = mappings; + + /* + * TODO enable CXL labels which skip the need for 'interleave-set cookie' + */ + nd_set->cookie1 = + nd_fletcher64(info, sizeof(*info) * cxlr_pmem->nr_mappings, 0); + nd_set->cookie2 = nd_set->cookie1; + ndr_desc.nd_set = nd_set; + + cxlr_pmem->nd_region = + nvdimm_pmem_region_create(cxl_nvb->nvdimm_bus, &ndr_desc); + if (!cxlr_pmem->nd_region) { + rc = -ENOMEM; + goto err; + } + + rc = devm_add_action_or_reset(dev, unregister_nvdimm_region, + cxlr_pmem->nd_region); +out: + kfree(info); + device_unlock(&cxl_nvb->dev); + put_device(&cxl_nvb->dev); + + return rc; + +err: + dev_dbg(dev, "failed to create nvdimm region\n"); + for (i--; i >= 0; i--) { + nvdimm = mappings[i].nvdimm; + cxl_nvd = nvdimm_provider_data(nvdimm); + put_device(&cxl_nvd->region->dev); + cxl_nvd->region = NULL; + } + goto out; +} + +static struct cxl_driver cxl_pmem_region_driver = { + .name = "cxl_pmem_region", + .probe = cxl_pmem_region_probe, + .id = CXL_DEVICE_PMEM_REGION, +}; + /* * Return all bridges to the CXL_NVB_NEW state to invalidate any * ->state_work referring to the now destroyed cxl_pmem_wq. @@ -359,8 +600,14 @@ static __init int cxl_pmem_init(void) if (rc) goto err_nvdimm; + rc = cxl_driver_register(&cxl_pmem_region_driver); + if (rc) + goto err_region; + return 0; +err_region: + cxl_driver_unregister(&cxl_nvdimm_driver); err_nvdimm: cxl_driver_unregister(&cxl_nvdimm_bridge_driver); err_bridge: @@ -370,6 +617,7 @@ err_bridge: static __exit void cxl_pmem_exit(void) { + cxl_driver_unregister(&cxl_pmem_region_driver); cxl_driver_unregister(&cxl_nvdimm_driver); cxl_driver_unregister(&cxl_nvdimm_bridge_driver); destroy_cxl_pmem_wq(); @@ -381,3 +629,4 @@ module_exit(cxl_pmem_exit); MODULE_IMPORT_NS(CXL); MODULE_ALIAS_CXL(CXL_DEVICE_NVDIMM_BRIDGE); MODULE_ALIAS_CXL(CXL_DEVICE_NVDIMM); +MODULE_ALIAS_CXL(CXL_DEVICE_PMEM_REGION); diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c index 3cf308f114c4..5453771bf330 100644 --- a/drivers/cxl/port.c +++ b/drivers/cxl/port.c @@ -53,6 +53,9 @@ static int cxl_port_probe(struct device *dev) struct cxl_memdev *cxlmd = to_cxl_memdev(port->uport); struct cxl_dev_state *cxlds = cxlmd->cxlds; + /* Cache the data early to ensure is_visible() works */ + read_cdat_data(port); + get_device(&cxlmd->dev); rc = devm_add_action_or_reset(dev, schedule_detach, cxlmd); if (rc) @@ -78,10 +81,60 @@ static int cxl_port_probe(struct device *dev) return 0; } +static ssize_t CDAT_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, + loff_t offset, size_t count) +{ + struct device *dev = kobj_to_dev(kobj); + struct cxl_port *port = to_cxl_port(dev); + + if (!port->cdat_available) + return -ENXIO; + + if (!port->cdat.table) + return 0; + + return memory_read_from_buffer(buf, count, &offset, + port->cdat.table, + port->cdat.length); +} + +static BIN_ATTR_ADMIN_RO(CDAT, 0); + +static umode_t cxl_port_bin_attr_is_visible(struct kobject *kobj, + struct bin_attribute *attr, int i) +{ + struct device *dev = kobj_to_dev(kobj); + struct cxl_port *port = to_cxl_port(dev); + + if ((attr == &bin_attr_CDAT) && port->cdat_available) + return attr->attr.mode; + + return 0; +} + +static struct bin_attribute *cxl_cdat_bin_attributes[] = { + &bin_attr_CDAT, + NULL, +}; + +static struct attribute_group cxl_cdat_attribute_group = { + .bin_attrs = cxl_cdat_bin_attributes, + .is_bin_visible = cxl_port_bin_attr_is_visible, +}; + +static const struct attribute_group *cxl_port_attribute_groups[] = { + &cxl_cdat_attribute_group, + NULL, +}; + static struct cxl_driver cxl_port_driver = { .name = "cxl_port", .probe = cxl_port_probe, .id = CXL_DEVICE_PORT, + .drv = { + .dev_groups = cxl_port_attribute_groups, + }, }; module_cxl_driver(cxl_port_driver); diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index d976260eca7a..473a71bbd9c9 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -133,7 +133,8 @@ static void nd_region_release(struct device *dev) put_device(&nvdimm->dev); } free_percpu(nd_region->lane); - memregion_free(nd_region->id); + if (!test_bit(ND_REGION_CXL, &nd_region->flags)) + memregion_free(nd_region->id); kfree(nd_region); } @@ -982,9 +983,14 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, if (!nd_region) return NULL; - nd_region->id = memregion_alloc(GFP_KERNEL); - if (nd_region->id < 0) - goto err_id; + /* CXL pre-assigns memregion ids before creating nvdimm regions */ + if (test_bit(ND_REGION_CXL, &ndr_desc->flags)) { + nd_region->id = ndr_desc->memregion; + } else { + nd_region->id = memregion_alloc(GFP_KERNEL); + if (nd_region->id < 0) + goto err_id; + } nd_region->lane = alloc_percpu(struct nd_percpu_lane); if (!nd_region->lane) @@ -1043,9 +1049,10 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, return nd_region; - err_percpu: - memregion_free(nd_region->id); - err_id: +err_percpu: + if (!test_bit(ND_REGION_CXL, &ndr_desc->flags)) + memregion_free(nd_region->id); +err_id: kfree(nd_region); return NULL; } @@ -1068,6 +1075,13 @@ struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus, } EXPORT_SYMBOL_GPL(nvdimm_volatile_region_create); +void nvdimm_region_delete(struct nd_region *nd_region) +{ + if (nd_region) + nd_device_unregister(&nd_region->dev, ND_SYNC); +} +EXPORT_SYMBOL_GPL(nvdimm_region_delete); + int nvdimm_flush(struct nd_region *nd_region, struct bio *bio) { int rc = 0; diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig index 5cc7cba1941f..55c028af4bd9 100644 --- a/drivers/pci/Kconfig +++ b/drivers/pci/Kconfig @@ -121,6 +121,9 @@ config XEN_PCIDEV_FRONTEND config PCI_ATS bool +config PCI_DOE + bool + config PCI_ECAM bool diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile index 0da6b1ebc694..2680e4c92f0a 100644 --- a/drivers/pci/Makefile +++ b/drivers/pci/Makefile @@ -31,6 +31,7 @@ obj-$(CONFIG_PCI_ECAM) += ecam.o obj-$(CONFIG_PCI_P2PDMA) += p2pdma.o obj-$(CONFIG_XEN_PCIDEV_FRONTEND) += xen-pcifront.o obj-$(CONFIG_VGA_ARB) += vgaarb.o +obj-$(CONFIG_PCI_DOE) += doe.o # Endpoint library must be initialized before its users obj-$(CONFIG_PCI_ENDPOINT) += endpoint/ diff --git a/drivers/pci/doe.c b/drivers/pci/doe.c new file mode 100644 index 000000000000..e402f05068a5 --- /dev/null +++ b/drivers/pci/doe.c @@ -0,0 +1,536 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Data Object Exchange + * PCIe r6.0, sec 6.30 DOE + * + * Copyright (C) 2021 Huawei + * Jonathan Cameron <Jonathan.Cameron@huawei.com> + * + * Copyright (C) 2022 Intel Corporation + * Ira Weiny <ira.weiny@intel.com> + */ + +#define dev_fmt(fmt) "DOE: " fmt + +#include <linux/bitfield.h> +#include <linux/delay.h> +#include <linux/jiffies.h> +#include <linux/mutex.h> +#include <linux/pci.h> +#include <linux/pci-doe.h> +#include <linux/workqueue.h> + +#define PCI_DOE_PROTOCOL_DISCOVERY 0 + +/* Timeout of 1 second from 6.30.2 Operation, PCI Spec r6.0 */ +#define PCI_DOE_TIMEOUT HZ +#define PCI_DOE_POLL_INTERVAL (PCI_DOE_TIMEOUT / 128) + +#define PCI_DOE_FLAG_CANCEL 0 +#define PCI_DOE_FLAG_DEAD 1 + +/** + * struct pci_doe_mb - State for a single DOE mailbox + * + * This state is used to manage a single DOE mailbox capability. All fields + * should be considered opaque to the consumers and the structure passed into + * the helpers below after being created by devm_pci_doe_create() + * + * @pdev: PCI device this mailbox belongs to + * @cap_offset: Capability offset + * @prots: Array of protocols supported (encoded as long values) + * @wq: Wait queue for work item + * @work_queue: Queue of pci_doe_work items + * @flags: Bit array of PCI_DOE_FLAG_* flags + */ +struct pci_doe_mb { + struct pci_dev *pdev; + u16 cap_offset; + struct xarray prots; + + wait_queue_head_t wq; + struct workqueue_struct *work_queue; + unsigned long flags; +}; + +static int pci_doe_wait(struct pci_doe_mb *doe_mb, unsigned long timeout) +{ + if (wait_event_timeout(doe_mb->wq, + test_bit(PCI_DOE_FLAG_CANCEL, &doe_mb->flags), + timeout)) + return -EIO; + return 0; +} + +static void pci_doe_write_ctrl(struct pci_doe_mb *doe_mb, u32 val) +{ + struct pci_dev *pdev = doe_mb->pdev; + int offset = doe_mb->cap_offset; + + pci_write_config_dword(pdev, offset + PCI_DOE_CTRL, val); +} + +static int pci_doe_abort(struct pci_doe_mb *doe_mb) +{ + struct pci_dev *pdev = doe_mb->pdev; + int offset = doe_mb->cap_offset; + unsigned long timeout_jiffies; + + pci_dbg(pdev, "[%x] Issuing Abort\n", offset); + + timeout_jiffies = jiffies + PCI_DOE_TIMEOUT; + pci_doe_write_ctrl(doe_mb, PCI_DOE_CTRL_ABORT); + + do { + int rc; + u32 val; + + rc = pci_doe_wait(doe_mb, PCI_DOE_POLL_INTERVAL); + if (rc) + return rc; + pci_read_config_dword(pdev, offset + PCI_DOE_STATUS, &val); + + /* Abort success! */ + if (!FIELD_GET(PCI_DOE_STATUS_ERROR, val) && + !FIELD_GET(PCI_DOE_STATUS_BUSY, val)) + return 0; + + } while (!time_after(jiffies, timeout_jiffies)); + + /* Abort has timed out and the MB is dead */ + pci_err(pdev, "[%x] ABORT timed out\n", offset); + return -EIO; +} + +static int pci_doe_send_req(struct pci_doe_mb *doe_mb, + struct pci_doe_task *task) +{ + struct pci_dev *pdev = doe_mb->pdev; + int offset = doe_mb->cap_offset; + u32 val; + int i; + + /* + * Check the DOE busy bit is not set. If it is set, this could indicate + * someone other than Linux (e.g. firmware) is using the mailbox. Note + * it is expected that firmware and OS will negotiate access rights via + * an, as yet to be defined, method. + */ + pci_read_config_dword(pdev, offset + PCI_DOE_STATUS, &val); + if (FIELD_GET(PCI_DOE_STATUS_BUSY, val)) + return -EBUSY; + + if (FIELD_GET(PCI_DOE_STATUS_ERROR, val)) + return -EIO; + + /* Write DOE Header */ + val = FIELD_PREP(PCI_DOE_DATA_OBJECT_HEADER_1_VID, task->prot.vid) | + FIELD_PREP(PCI_DOE_DATA_OBJECT_HEADER_1_TYPE, task->prot.type); + pci_write_config_dword(pdev, offset + PCI_DOE_WRITE, val); + /* Length is 2 DW of header + length of payload in DW */ + pci_write_config_dword(pdev, offset + PCI_DOE_WRITE, + FIELD_PREP(PCI_DOE_DATA_OBJECT_HEADER_2_LENGTH, + 2 + task->request_pl_sz / + sizeof(u32))); + for (i = 0; i < task->request_pl_sz / sizeof(u32); i++) + pci_write_config_dword(pdev, offset + PCI_DOE_WRITE, + task->request_pl[i]); + + pci_doe_write_ctrl(doe_mb, PCI_DOE_CTRL_GO); + + return 0; +} + +static bool pci_doe_data_obj_ready(struct pci_doe_mb *doe_mb) +{ + struct pci_dev *pdev = doe_mb->pdev; + int offset = doe_mb->cap_offset; + u32 val; + + pci_read_config_dword(pdev, offset + PCI_DOE_STATUS, &val); + if (FIELD_GET(PCI_DOE_STATUS_DATA_OBJECT_READY, val)) + return true; + return false; +} + +static int pci_doe_recv_resp(struct pci_doe_mb *doe_mb, struct pci_doe_task *task) +{ + struct pci_dev *pdev = doe_mb->pdev; + int offset = doe_mb->cap_offset; + size_t length, payload_length; + u32 val; + int i; + + /* Read the first dword to get the protocol */ + pci_read_config_dword(pdev, offset + PCI_DOE_READ, &val); + if ((FIELD_GET(PCI_DOE_DATA_OBJECT_HEADER_1_VID, val) != task->prot.vid) || + (FIELD_GET(PCI_DOE_DATA_OBJECT_HEADER_1_TYPE, val) != task->prot.type)) { + dev_err_ratelimited(&pdev->dev, "[%x] expected [VID, Protocol] = [%04x, %02x], got [%04x, %02x]\n", + doe_mb->cap_offset, task->prot.vid, task->prot.type, + FIELD_GET(PCI_DOE_DATA_OBJECT_HEADER_1_VID, val), + FIELD_GET(PCI_DOE_DATA_OBJECT_HEADER_1_TYPE, val)); + return -EIO; + } + + pci_write_config_dword(pdev, offset + PCI_DOE_READ, 0); + /* Read the second dword to get the length */ + pci_read_config_dword(pdev, offset + PCI_DOE_READ, &val); + pci_write_config_dword(pdev, offset + PCI_DOE_READ, 0); + + length = FIELD_GET(PCI_DOE_DATA_OBJECT_HEADER_2_LENGTH, val); + if (length > SZ_1M || length < 2) + return -EIO; + + /* First 2 dwords have already been read */ + length -= 2; + payload_length = min(length, task->response_pl_sz / sizeof(u32)); + /* Read the rest of the response payload */ + for (i = 0; i < payload_length; i++) { + pci_read_config_dword(pdev, offset + PCI_DOE_READ, + &task->response_pl[i]); + /* Prior to the last ack, ensure Data Object Ready */ + if (i == (payload_length - 1) && !pci_doe_data_obj_ready(doe_mb)) + return -EIO; + pci_write_config_dword(pdev, offset + PCI_DOE_READ, 0); + } + + /* Flush excess length */ + for (; i < length; i++) { + pci_read_config_dword(pdev, offset + PCI_DOE_READ, &val); + pci_write_config_dword(pdev, offset + PCI_DOE_READ, 0); + } + + /* Final error check to pick up on any since Data Object Ready */ + pci_read_config_dword(pdev, offset + PCI_DOE_STATUS, &val); + if (FIELD_GET(PCI_DOE_STATUS_ERROR, val)) + return -EIO; + + return min(length, task->response_pl_sz / sizeof(u32)) * sizeof(u32); +} + +static void signal_task_complete(struct pci_doe_task *task, int rv) +{ + task->rv = rv; + task->complete(task); +} + +static void signal_task_abort(struct pci_doe_task *task, int rv) +{ + struct pci_doe_mb *doe_mb = task->doe_mb; + struct pci_dev *pdev = doe_mb->pdev; + + if (pci_doe_abort(doe_mb)) { + /* + * If the device can't process an abort; set the mailbox dead + * - no more submissions + */ + pci_err(pdev, "[%x] Abort failed marking mailbox dead\n", + doe_mb->cap_offset); + set_bit(PCI_DOE_FLAG_DEAD, &doe_mb->flags); + } + signal_task_complete(task, rv); +} + +static void doe_statemachine_work(struct work_struct *work) +{ + struct pci_doe_task *task = container_of(work, struct pci_doe_task, + work); + struct pci_doe_mb *doe_mb = task->doe_mb; + struct pci_dev *pdev = doe_mb->pdev; + int offset = doe_mb->cap_offset; + unsigned long timeout_jiffies; + u32 val; + int rc; + + if (test_bit(PCI_DOE_FLAG_DEAD, &doe_mb->flags)) { + signal_task_complete(task, -EIO); + return; + } + + /* Send request */ + rc = pci_doe_send_req(doe_mb, task); + if (rc) { + /* + * The specification does not provide any guidance on how to + * resolve conflicting requests from other entities. + * Furthermore, it is likely that busy will not be detected + * most of the time. Flag any detection of status busy with an + * error. + */ + if (rc == -EBUSY) + dev_err_ratelimited(&pdev->dev, "[%x] busy detected; another entity is sending conflicting requests\n", + offset); + signal_task_abort(task, rc); + return; + } + + timeout_jiffies = jiffies + PCI_DOE_TIMEOUT; + /* Poll for response */ +retry_resp: + pci_read_config_dword(pdev, offset + PCI_DOE_STATUS, &val); + if (FIELD_GET(PCI_DOE_STATUS_ERROR, val)) { + signal_task_abort(task, -EIO); + return; + } + + if (!FIELD_GET(PCI_DOE_STATUS_DATA_OBJECT_READY, val)) { + if (time_after(jiffies, timeout_jiffies)) { + signal_task_abort(task, -EIO); + return; + } + rc = pci_doe_wait(doe_mb, PCI_DOE_POLL_INTERVAL); + if (rc) { + signal_task_abort(task, rc); + return; + } + goto retry_resp; + } + + rc = pci_doe_recv_resp(doe_mb, task); + if (rc < 0) { + signal_task_abort(task, rc); + return; + } + + signal_task_complete(task, rc); +} + +static void pci_doe_task_complete(struct pci_doe_task *task) +{ + complete(task->private); +} + +static int pci_doe_discovery(struct pci_doe_mb *doe_mb, u8 *index, u16 *vid, + u8 *protocol) +{ + u32 request_pl = FIELD_PREP(PCI_DOE_DATA_OBJECT_DISC_REQ_3_INDEX, + *index); + u32 response_pl; + DECLARE_COMPLETION_ONSTACK(c); + struct pci_doe_task task = { + .prot.vid = PCI_VENDOR_ID_PCI_SIG, + .prot.type = PCI_DOE_PROTOCOL_DISCOVERY, + .request_pl = &request_pl, + .request_pl_sz = sizeof(request_pl), + .response_pl = &response_pl, + .response_pl_sz = sizeof(response_pl), + .complete = pci_doe_task_complete, + .private = &c, + }; + int rc; + + rc = pci_doe_submit_task(doe_mb, &task); + if (rc < 0) + return rc; + + wait_for_completion(&c); + + if (task.rv != sizeof(response_pl)) + return -EIO; + + *vid = FIELD_GET(PCI_DOE_DATA_OBJECT_DISC_RSP_3_VID, response_pl); + *protocol = FIELD_GET(PCI_DOE_DATA_OBJECT_DISC_RSP_3_PROTOCOL, + response_pl); + *index = FIELD_GET(PCI_DOE_DATA_OBJECT_DISC_RSP_3_NEXT_INDEX, + response_pl); + + return 0; +} + +static void *pci_doe_xa_prot_entry(u16 vid, u8 prot) +{ + return xa_mk_value((vid << 8) | prot); +} + +static int pci_doe_cache_protocols(struct pci_doe_mb *doe_mb) +{ + u8 index = 0; + u8 xa_idx = 0; + + do { + int rc; + u16 vid; + u8 prot; + + rc = pci_doe_discovery(doe_mb, &index, &vid, &prot); + if (rc) + return rc; + + pci_dbg(doe_mb->pdev, + "[%x] Found protocol %d vid: %x prot: %x\n", + doe_mb->cap_offset, xa_idx, vid, prot); + + rc = xa_insert(&doe_mb->prots, xa_idx++, + pci_doe_xa_prot_entry(vid, prot), GFP_KERNEL); + if (rc) + return rc; + } while (index); + + return 0; +} + +static void pci_doe_xa_destroy(void *mb) +{ + struct pci_doe_mb *doe_mb = mb; + + xa_destroy(&doe_mb->prots); +} + +static void pci_doe_destroy_workqueue(void *mb) +{ + struct pci_doe_mb *doe_mb = mb; + + destroy_workqueue(doe_mb->work_queue); +} + +static void pci_doe_flush_mb(void *mb) +{ + struct pci_doe_mb *doe_mb = mb; + + /* Stop all pending work items from starting */ + set_bit(PCI_DOE_FLAG_DEAD, &doe_mb->flags); + + /* Cancel an in progress work item, if necessary */ + set_bit(PCI_DOE_FLAG_CANCEL, &doe_mb->flags); + wake_up(&doe_mb->wq); + + /* Flush all work items */ + flush_workqueue(doe_mb->work_queue); +} + +/** + * pcim_doe_create_mb() - Create a DOE mailbox object + * + * @pdev: PCI device to create the DOE mailbox for + * @cap_offset: Offset of the DOE mailbox + * + * Create a single mailbox object to manage the mailbox protocol at the + * cap_offset specified. + * + * RETURNS: created mailbox object on success + * ERR_PTR(-errno) on failure + */ +struct pci_doe_mb *pcim_doe_create_mb(struct pci_dev *pdev, u16 cap_offset) +{ + struct pci_doe_mb *doe_mb; + struct device *dev = &pdev->dev; + int rc; + + doe_mb = devm_kzalloc(dev, sizeof(*doe_mb), GFP_KERNEL); + if (!doe_mb) + return ERR_PTR(-ENOMEM); + + doe_mb->pdev = pdev; + doe_mb->cap_offset = cap_offset; + init_waitqueue_head(&doe_mb->wq); + + xa_init(&doe_mb->prots); + rc = devm_add_action(dev, pci_doe_xa_destroy, doe_mb); + if (rc) + return ERR_PTR(rc); + + doe_mb->work_queue = alloc_ordered_workqueue("%s %s DOE [%x]", 0, + dev_driver_string(&pdev->dev), + pci_name(pdev), + doe_mb->cap_offset); + if (!doe_mb->work_queue) { + pci_err(pdev, "[%x] failed to allocate work queue\n", + doe_mb->cap_offset); + return ERR_PTR(-ENOMEM); + } + rc = devm_add_action_or_reset(dev, pci_doe_destroy_workqueue, doe_mb); + if (rc) + return ERR_PTR(rc); + + /* Reset the mailbox by issuing an abort */ + rc = pci_doe_abort(doe_mb); + if (rc) { + pci_err(pdev, "[%x] failed to reset mailbox with abort command : %d\n", + doe_mb->cap_offset, rc); + return ERR_PTR(rc); + } + + /* + * The state machine and the mailbox should be in sync now; + * Set up mailbox flush prior to using the mailbox to query protocols. + */ + rc = devm_add_action_or_reset(dev, pci_doe_flush_mb, doe_mb); + if (rc) + return ERR_PTR(rc); + + rc = pci_doe_cache_protocols(doe_mb); + if (rc) { + pci_err(pdev, "[%x] failed to cache protocols : %d\n", + doe_mb->cap_offset, rc); + return ERR_PTR(rc); + } + + return doe_mb; +} +EXPORT_SYMBOL_GPL(pcim_doe_create_mb); + +/** + * pci_doe_supports_prot() - Return if the DOE instance supports the given + * protocol + * @doe_mb: DOE mailbox capability to query + * @vid: Protocol Vendor ID + * @type: Protocol type + * + * RETURNS: True if the DOE mailbox supports the protocol specified + */ +bool pci_doe_supports_prot(struct pci_doe_mb *doe_mb, u16 vid, u8 type) +{ + unsigned long index; + void *entry; + + /* The discovery protocol must always be supported */ + if (vid == PCI_VENDOR_ID_PCI_SIG && type == PCI_DOE_PROTOCOL_DISCOVERY) + return true; + + xa_for_each(&doe_mb->prots, index, entry) + if (entry == pci_doe_xa_prot_entry(vid, type)) + return true; + + return false; +} +EXPORT_SYMBOL_GPL(pci_doe_supports_prot); + +/** + * pci_doe_submit_task() - Submit a task to be processed by the state machine + * + * @doe_mb: DOE mailbox capability to submit to + * @task: task to be queued + * + * Submit a DOE task (request/response) to the DOE mailbox to be processed. + * Returns upon queueing the task object. If the queue is full this function + * will sleep until there is room in the queue. + * + * task->complete will be called when the state machine is done processing this + * task. + * + * Excess data will be discarded. + * + * RETURNS: 0 when task has been successfully queued, -ERRNO on error + */ +int pci_doe_submit_task(struct pci_doe_mb *doe_mb, struct pci_doe_task *task) +{ + if (!pci_doe_supports_prot(doe_mb, task->prot.vid, task->prot.type)) + return -EINVAL; + + /* + * DOE requests must be a whole number of DW and the response needs to + * be big enough for at least 1 DW + */ + if (task->request_pl_sz % sizeof(u32) || + task->response_pl_sz < sizeof(u32)) + return -EINVAL; + + if (test_bit(PCI_DOE_FLAG_DEAD, &doe_mb->flags)) + return -EIO; + + task->doe_mb = doe_mb; + INIT_WORK(&task->work, doe_statemachine_work); + queue_work(doe_mb->work_queue, &task->work); + return 0; +} +EXPORT_SYMBOL_GPL(pci_doe_submit_task); diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 9884d8b29d3b..c5286b027f00 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2315,7 +2315,7 @@ EXPORT_SYMBOL(pci_alloc_dev); static bool pci_bus_crs_vendor_id(u32 l) { - return (l & 0xffff) == 0x0001; + return (l & 0xffff) == PCI_VENDOR_ID_PCI_SIG; } static bool pci_bus_wait_crs(struct pci_bus *bus, int devfn, u32 *l, diff --git a/include/linux/ioport.h b/include/linux/ioport.h index ec5f71f7135b..616b683563a9 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -141,6 +141,7 @@ enum { IORES_DESC_DEVICE_PRIVATE_MEMORY = 6, IORES_DESC_RESERVED = 7, IORES_DESC_SOFT_RESERVED = 8, + IORES_DESC_CXL = 9, }; /* @@ -329,6 +330,8 @@ struct resource *devm_request_free_mem_region(struct device *dev, struct resource *base, unsigned long size); struct resource *request_free_mem_region(struct resource *base, unsigned long size, const char *name); +struct resource *alloc_free_mem_region(struct resource *base, + unsigned long size, unsigned long align, const char *name); static inline void irqresource_disabled(struct resource *res, u32 irq) { diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 0d61e07b6827..c74acfa1a3fe 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -59,6 +59,9 @@ enum { /* Platform provides asynchronous flush mechanism */ ND_REGION_ASYNC = 3, + /* Region was created by CXL subsystem */ + ND_REGION_CXL = 4, + /* mark newly adjusted resources as requiring a label update */ DPA_RESOURCE_ADJUSTED = 1 << 0, }; @@ -122,6 +125,7 @@ struct nd_region_desc { int numa_node; int target_node; unsigned long flags; + int memregion; struct device_node *of_node; int (*flush)(struct nd_region *nd_region, struct bio *bio); }; @@ -259,6 +263,7 @@ static inline struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, cmd_mask, num_flush, flush_wpq, NULL, NULL, NULL); } void nvdimm_delete(struct nvdimm *nvdimm); +void nvdimm_region_delete(struct nd_region *nd_region); const struct nd_cmd_desc *nd_cmd_dimm_desc(int cmd); const struct nd_cmd_desc *nd_cmd_bus_desc(int cmd); diff --git a/include/linux/pci-doe.h b/include/linux/pci-doe.h new file mode 100644 index 000000000000..ed9b4df792b8 --- /dev/null +++ b/include/linux/pci-doe.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Data Object Exchange + * PCIe r6.0, sec 6.30 DOE + * + * Copyright (C) 2021 Huawei + * Jonathan Cameron <Jonathan.Cameron@huawei.com> + * + * Copyright (C) 2022 Intel Corporation + * Ira Weiny <ira.weiny@intel.com> + */ + +#ifndef LINUX_PCI_DOE_H +#define LINUX_PCI_DOE_H + +struct pci_doe_protocol { + u16 vid; + u8 type; +}; + +struct pci_doe_mb; + +/** + * struct pci_doe_task - represents a single query/response + * + * @prot: DOE Protocol + * @request_pl: The request payload + * @request_pl_sz: Size of the request payload (bytes) + * @response_pl: The response payload + * @response_pl_sz: Size of the response payload (bytes) + * @rv: Return value. Length of received response or error (bytes) + * @complete: Called when task is complete + * @private: Private data for the consumer + * @work: Used internally by the mailbox + * @doe_mb: Used internally by the mailbox + * + * The payload sizes and rv are specified in bytes with the following + * restrictions concerning the protocol. + * + * 1) The request_pl_sz must be a multiple of double words (4 bytes) + * 2) The response_pl_sz must be >= a single double word (4 bytes) + * 3) rv is returned as bytes but it will be a multiple of double words + * + * NOTE there is no need for the caller to initialize work or doe_mb. + */ +struct pci_doe_task { + struct pci_doe_protocol prot; + u32 *request_pl; + size_t request_pl_sz; + u32 *response_pl; + size_t response_pl_sz; + int rv; + void (*complete)(struct pci_doe_task *task); + void *private; + + /* No need for the user to initialize these fields */ + struct work_struct work; + struct pci_doe_mb *doe_mb; +}; + +/** + * pci_doe_for_each_off - Iterate each DOE capability + * @pdev: struct pci_dev to iterate + * @off: u16 of config space offset of each mailbox capability found + */ +#define pci_doe_for_each_off(pdev, off) \ + for (off = pci_find_next_ext_capability(pdev, off, \ + PCI_EXT_CAP_ID_DOE); \ + off > 0; \ + off = pci_find_next_ext_capability(pdev, off, \ + PCI_EXT_CAP_ID_DOE)) + +struct pci_doe_mb *pcim_doe_create_mb(struct pci_dev *pdev, u16 cap_offset); +bool pci_doe_supports_prot(struct pci_doe_mb *doe_mb, u16 vid, u8 type); +int pci_doe_submit_task(struct pci_doe_mb *doe_mb, struct pci_doe_task *task); + +#endif diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 7fa460ccf7fa..6feade66efdb 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -151,6 +151,7 @@ #define PCI_CLASS_OTHERS 0xff /* Vendors and devices. Sort key: vendor first, device next. */ +#define PCI_VENDOR_ID_PCI_SIG 0x0001 #define PCI_VENDOR_ID_LOONGSON 0x0014 diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index e3f1e8ac1f85..fd3fe5c8c17f 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -235,6 +235,22 @@ struct bin_attribute bin_attr_##_name = __BIN_ATTR_WO(_name, _size) #define BIN_ATTR_RW(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_RW(_name, _size) + +#define __BIN_ATTR_ADMIN_RO(_name, _size) { \ + .attr = { .name = __stringify(_name), .mode = 0400 }, \ + .read = _name##_read, \ + .size = _size, \ +} + +#define __BIN_ATTR_ADMIN_RW(_name, _size) \ + __BIN_ATTR(_name, 0600, _name##_read, _name##_write, _size) + +#define BIN_ATTR_ADMIN_RO(_name, _size) \ +struct bin_attribute bin_attr_##_name = __BIN_ATTR_ADMIN_RO(_name, _size) + +#define BIN_ATTR_ADMIN_RW(_name, _size) \ +struct bin_attribute bin_attr_##_name = __BIN_ATTR_ADMIN_RW(_name, _size) + struct sysfs_ops { ssize_t (*show)(struct kobject *, struct attribute *, char *); ssize_t (*store)(struct kobject *, struct attribute *, const char *, size_t); diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 108f8523fa04..57b8e2ffb1dd 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -737,7 +737,8 @@ #define PCI_EXT_CAP_ID_DVSEC 0x23 /* Designated Vendor-Specific */ #define PCI_EXT_CAP_ID_DLF 0x25 /* Data Link Feature */ #define PCI_EXT_CAP_ID_PL_16GT 0x26 /* Physical Layer 16.0 GT/s */ -#define PCI_EXT_CAP_ID_MAX PCI_EXT_CAP_ID_PL_16GT +#define PCI_EXT_CAP_ID_DOE 0x2E /* Data Object Exchange */ +#define PCI_EXT_CAP_ID_MAX PCI_EXT_CAP_ID_DOE #define PCI_EXT_CAP_DSN_SIZEOF 12 #define PCI_EXT_CAP_MCAST_ENDPOINT_SIZEOF 40 @@ -1103,4 +1104,30 @@ #define PCI_PL_16GT_LE_CTRL_USP_TX_PRESET_MASK 0x000000F0 #define PCI_PL_16GT_LE_CTRL_USP_TX_PRESET_SHIFT 4 +/* Data Object Exchange */ +#define PCI_DOE_CAP 0x04 /* DOE Capabilities Register */ +#define PCI_DOE_CAP_INT_SUP 0x00000001 /* Interrupt Support */ +#define PCI_DOE_CAP_INT_MSG_NUM 0x00000ffe /* Interrupt Message Number */ +#define PCI_DOE_CTRL 0x08 /* DOE Control Register */ +#define PCI_DOE_CTRL_ABORT 0x00000001 /* DOE Abort */ +#define PCI_DOE_CTRL_INT_EN 0x00000002 /* DOE Interrupt Enable */ +#define PCI_DOE_CTRL_GO 0x80000000 /* DOE Go */ +#define PCI_DOE_STATUS 0x0c /* DOE Status Register */ +#define PCI_DOE_STATUS_BUSY 0x00000001 /* DOE Busy */ +#define PCI_DOE_STATUS_INT_STATUS 0x00000002 /* DOE Interrupt Status */ +#define PCI_DOE_STATUS_ERROR 0x00000004 /* DOE Error */ +#define PCI_DOE_STATUS_DATA_OBJECT_READY 0x80000000 /* Data Object Ready */ +#define PCI_DOE_WRITE 0x10 /* DOE Write Data Mailbox Register */ +#define PCI_DOE_READ 0x14 /* DOE Read Data Mailbox Register */ + +/* DOE Data Object - note not actually registers */ +#define PCI_DOE_DATA_OBJECT_HEADER_1_VID 0x0000ffff +#define PCI_DOE_DATA_OBJECT_HEADER_1_TYPE 0x00ff0000 +#define PCI_DOE_DATA_OBJECT_HEADER_2_LENGTH 0x0003ffff + +#define PCI_DOE_DATA_OBJECT_DISC_REQ_3_INDEX 0x000000ff +#define PCI_DOE_DATA_OBJECT_DISC_RSP_3_VID 0x0000ffff +#define PCI_DOE_DATA_OBJECT_DISC_RSP_3_PROTOCOL 0x00ff0000 +#define PCI_DOE_DATA_OBJECT_DISC_RSP_3_NEXT_INDEX 0xff000000 + #endif /* LINUX_PCI_REGS_H */ diff --git a/kernel/resource.c b/kernel/resource.c index 34eaee179689..4c5e80b92f2f 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -489,8 +489,9 @@ int __weak page_is_ram(unsigned long pfn) } EXPORT_SYMBOL_GPL(page_is_ram); -static int __region_intersects(resource_size_t start, size_t size, - unsigned long flags, unsigned long desc) +static int __region_intersects(struct resource *parent, resource_size_t start, + size_t size, unsigned long flags, + unsigned long desc) { struct resource res; int type = 0; int other = 0; @@ -499,7 +500,7 @@ static int __region_intersects(resource_size_t start, size_t size, res.start = start; res.end = start + size - 1; - for (p = iomem_resource.child; p ; p = p->sibling) { + for (p = parent->child; p ; p = p->sibling) { bool is_type = (((p->flags & flags) == flags) && ((desc == IORES_DESC_NONE) || (desc == p->desc))); @@ -543,7 +544,7 @@ int region_intersects(resource_size_t start, size_t size, unsigned long flags, int ret; read_lock(&resource_lock); - ret = __region_intersects(start, size, flags, desc); + ret = __region_intersects(&iomem_resource, start, size, flags, desc); read_unlock(&resource_lock); return ret; @@ -891,6 +892,13 @@ void insert_resource_expand_to_fit(struct resource *root, struct resource *new) } write_unlock(&resource_lock); } +/* + * Not for general consumption, only early boot memory map parsing, PCI + * resource discovery, and late discovery of CXL resources are expected + * to use this interface. The former are built-in and only the latter, + * CXL, is a module. + */ +EXPORT_SYMBOL_NS_GPL(insert_resource_expand_to_fit, CXL); /** * remove_resource - Remove a resource in the resource tree @@ -1773,62 +1781,139 @@ void resource_list_free(struct list_head *head) } EXPORT_SYMBOL(resource_list_free); -#ifdef CONFIG_DEVICE_PRIVATE -static struct resource *__request_free_mem_region(struct device *dev, - struct resource *base, unsigned long size, const char *name) +#ifdef CONFIG_GET_FREE_REGION +#define GFR_DESCENDING (1UL << 0) +#define GFR_REQUEST_REGION (1UL << 1) +#define GFR_DEFAULT_ALIGN (1UL << PA_SECTION_SHIFT) + +static resource_size_t gfr_start(struct resource *base, resource_size_t size, + resource_size_t align, unsigned long flags) +{ + if (flags & GFR_DESCENDING) { + resource_size_t end; + + end = min_t(resource_size_t, base->end, + (1ULL << MAX_PHYSMEM_BITS) - 1); + return end - size + 1; + } + + return ALIGN(base->start, align); +} + +static bool gfr_continue(struct resource *base, resource_size_t addr, + resource_size_t size, unsigned long flags) +{ + if (flags & GFR_DESCENDING) + return addr > size && addr >= base->start; + /* + * In the ascend case be careful that the last increment by + * @size did not wrap 0. + */ + return addr > addr - size && + addr <= min_t(resource_size_t, base->end, + (1ULL << MAX_PHYSMEM_BITS) - 1); +} + +static resource_size_t gfr_next(resource_size_t addr, resource_size_t size, + unsigned long flags) { - resource_size_t end, addr; + if (flags & GFR_DESCENDING) + return addr - size; + return addr + size; +} + +static void remove_free_mem_region(void *_res) +{ + struct resource *res = _res; + + if (res->parent) + remove_resource(res); + free_resource(res); +} + +static struct resource * +get_free_mem_region(struct device *dev, struct resource *base, + resource_size_t size, const unsigned long align, + const char *name, const unsigned long desc, + const unsigned long flags) +{ + resource_size_t addr; struct resource *res; struct region_devres *dr = NULL; - size = ALIGN(size, 1UL << PA_SECTION_SHIFT); - end = min_t(unsigned long, base->end, (1UL << MAX_PHYSMEM_BITS) - 1); - addr = end - size + 1UL; + size = ALIGN(size, align); res = alloc_resource(GFP_KERNEL); if (!res) return ERR_PTR(-ENOMEM); - if (dev) { + if (dev && (flags & GFR_REQUEST_REGION)) { dr = devres_alloc(devm_region_release, sizeof(struct region_devres), GFP_KERNEL); if (!dr) { free_resource(res); return ERR_PTR(-ENOMEM); } + } else if (dev) { + if (devm_add_action_or_reset(dev, remove_free_mem_region, res)) + return ERR_PTR(-ENOMEM); } write_lock(&resource_lock); - for (; addr > size && addr >= base->start; addr -= size) { - if (__region_intersects(addr, size, 0, IORES_DESC_NONE) != - REGION_DISJOINT) + for (addr = gfr_start(base, size, align, flags); + gfr_continue(base, addr, size, flags); + addr = gfr_next(addr, size, flags)) { + if (__region_intersects(base, addr, size, 0, IORES_DESC_NONE) != + REGION_DISJOINT) continue; - if (__request_region_locked(res, &iomem_resource, addr, size, - name, 0)) - break; + if (flags & GFR_REQUEST_REGION) { + if (__request_region_locked(res, &iomem_resource, addr, + size, name, 0)) + break; - if (dev) { - dr->parent = &iomem_resource; - dr->start = addr; - dr->n = size; - devres_add(dev, dr); - } + if (dev) { + dr->parent = &iomem_resource; + dr->start = addr; + dr->n = size; + devres_add(dev, dr); + } - res->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY; - write_unlock(&resource_lock); + res->desc = desc; + write_unlock(&resource_lock); + + + /* + * A driver is claiming this region so revoke any + * mappings. + */ + revoke_iomem(res); + } else { + res->start = addr; + res->end = addr + size - 1; + res->name = name; + res->desc = desc; + res->flags = IORESOURCE_MEM; + + /* + * Only succeed if the resource hosts an exclusive + * range after the insert + */ + if (__insert_resource(base, res) || res->child) + break; + + write_unlock(&resource_lock); + } - /* - * A driver is claiming this region so revoke any mappings. - */ - revoke_iomem(res); return res; } write_unlock(&resource_lock); - free_resource(res); - if (dr) + if (flags & GFR_REQUEST_REGION) { + free_resource(res); devres_free(dr); + } else if (dev) + devm_release_action(dev, remove_free_mem_region, res); return ERR_PTR(-ERANGE); } @@ -1847,18 +1932,48 @@ static struct resource *__request_free_mem_region(struct device *dev, struct resource *devm_request_free_mem_region(struct device *dev, struct resource *base, unsigned long size) { - return __request_free_mem_region(dev, base, size, dev_name(dev)); + unsigned long flags = GFR_DESCENDING | GFR_REQUEST_REGION; + + return get_free_mem_region(dev, base, size, GFR_DEFAULT_ALIGN, + dev_name(dev), + IORES_DESC_DEVICE_PRIVATE_MEMORY, flags); } EXPORT_SYMBOL_GPL(devm_request_free_mem_region); struct resource *request_free_mem_region(struct resource *base, unsigned long size, const char *name) { - return __request_free_mem_region(NULL, base, size, name); + unsigned long flags = GFR_DESCENDING | GFR_REQUEST_REGION; + + return get_free_mem_region(NULL, base, size, GFR_DEFAULT_ALIGN, name, + IORES_DESC_DEVICE_PRIVATE_MEMORY, flags); } EXPORT_SYMBOL_GPL(request_free_mem_region); -#endif /* CONFIG_DEVICE_PRIVATE */ +/** + * alloc_free_mem_region - find a free region relative to @base + * @base: resource that will parent the new resource + * @size: size in bytes of memory to allocate from @base + * @align: alignment requirements for the allocation + * @name: resource name + * + * Buses like CXL, that can dynamically instantiate new memory regions, + * need a method to allocate physical address space for those regions. + * Allocate and insert a new resource to cover a free, unclaimed by a + * descendant of @base, range in the span of @base. + */ +struct resource *alloc_free_mem_region(struct resource *base, + unsigned long size, unsigned long align, + const char *name) +{ + /* Default of ascending direction and insert resource */ + unsigned long flags = 0; + + return get_free_mem_region(NULL, base, size, align, name, + IORES_DESC_NONE, flags); +} +EXPORT_SYMBOL_NS_GPL(alloc_free_mem_region, CXL); +#endif /* CONFIG_GET_FREE_REGION */ static int __init strict_iomem(char *str) { diff --git a/mm/Kconfig b/mm/Kconfig index e59cf5fe5ce9..0331f1461f81 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -983,9 +983,14 @@ config HMM_MIRROR bool depends on MMU +config GET_FREE_REGION + depends on SPARSEMEM + bool + config DEVICE_PRIVATE bool "Unaddressable device memory (GPU memory, ...)" depends on ZONE_DEVICE + select GET_FREE_REGION help Allows creation of struct pages to represent unaddressable device diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild index 33543231d453..500be85729cc 100644 --- a/tools/testing/cxl/Kbuild +++ b/tools/testing/cxl/Kbuild @@ -47,6 +47,7 @@ cxl_core-y += $(CXL_CORE_SRC)/memdev.o cxl_core-y += $(CXL_CORE_SRC)/mbox.o cxl_core-y += $(CXL_CORE_SRC)/pci.o cxl_core-y += $(CXL_CORE_SRC)/hdm.o +cxl_core-$(CONFIG_CXL_REGION) += $(CXL_CORE_SRC)/region.o cxl_core-y += config_check.o obj-m += test/ diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index 431f2bddf6c8..a072b2d3e726 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -14,7 +14,7 @@ #define NR_CXL_HOST_BRIDGES 2 #define NR_CXL_ROOT_PORTS 2 #define NR_CXL_SWITCH_PORTS 2 -#define NR_CXL_PORT_DECODERS 2 +#define NR_CXL_PORT_DECODERS 8 static struct platform_device *cxl_acpi; static struct platform_device *cxl_host_bridge[NR_CXL_HOST_BRIDGES]; @@ -118,7 +118,7 @@ static struct { .restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 | ACPI_CEDT_CFMWS_RESTRICT_VOLATILE, .qtg_id = 0, - .window_size = SZ_256M, + .window_size = SZ_256M * 4UL, }, .target = { 0 }, }, @@ -133,7 +133,7 @@ static struct { .restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 | ACPI_CEDT_CFMWS_RESTRICT_VOLATILE, .qtg_id = 1, - .window_size = SZ_256M * 2, + .window_size = SZ_256M * 8UL, }, .target = { 0, 1, }, }, @@ -148,7 +148,7 @@ static struct { .restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 | ACPI_CEDT_CFMWS_RESTRICT_PMEM, .qtg_id = 2, - .window_size = SZ_256M, + .window_size = SZ_256M * 4UL, }, .target = { 0 }, }, @@ -163,7 +163,7 @@ static struct { .restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 | ACPI_CEDT_CFMWS_RESTRICT_PMEM, .qtg_id = 3, - .window_size = SZ_256M * 2, + .window_size = SZ_256M * 8UL, }, .target = { 0, 1, }, }, @@ -429,6 +429,50 @@ static int map_targets(struct device *dev, void *data) return 0; } +static int mock_decoder_commit(struct cxl_decoder *cxld) +{ + struct cxl_port *port = to_cxl_port(cxld->dev.parent); + int id = cxld->id; + + if (cxld->flags & CXL_DECODER_F_ENABLE) + return 0; + + dev_dbg(&port->dev, "%s commit\n", dev_name(&cxld->dev)); + if (port->commit_end + 1 != id) { + dev_dbg(&port->dev, + "%s: out of order commit, expected decoder%d.%d\n", + dev_name(&cxld->dev), port->id, port->commit_end + 1); + return -EBUSY; + } + + port->commit_end++; + cxld->flags |= CXL_DECODER_F_ENABLE; + + return 0; +} + +static int mock_decoder_reset(struct cxl_decoder *cxld) +{ + struct cxl_port *port = to_cxl_port(cxld->dev.parent); + int id = cxld->id; + + if ((cxld->flags & CXL_DECODER_F_ENABLE) == 0) + return 0; + + dev_dbg(&port->dev, "%s reset\n", dev_name(&cxld->dev)); + if (port->commit_end != id) { + dev_dbg(&port->dev, + "%s: out of order reset, expected decoder%d.%d\n", + dev_name(&cxld->dev), port->id, port->commit_end); + return -EBUSY; + } + + port->commit_end--; + cxld->flags &= ~CXL_DECODER_F_ENABLE; + + return 0; +} + static int mock_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm) { struct cxl_port *port = cxlhdm->port; @@ -451,25 +495,39 @@ static int mock_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm) struct cxl_decoder *cxld; int rc; - if (target_count) - cxld = cxl_switch_decoder_alloc(port, target_count); - else - cxld = cxl_endpoint_decoder_alloc(port); - if (IS_ERR(cxld)) { - dev_warn(&port->dev, - "Failed to allocate the decoder\n"); - return PTR_ERR(cxld); + if (target_count) { + struct cxl_switch_decoder *cxlsd; + + cxlsd = cxl_switch_decoder_alloc(port, target_count); + if (IS_ERR(cxlsd)) { + dev_warn(&port->dev, + "Failed to allocate the decoder\n"); + return PTR_ERR(cxlsd); + } + cxld = &cxlsd->cxld; + } else { + struct cxl_endpoint_decoder *cxled; + + cxled = cxl_endpoint_decoder_alloc(port); + + if (IS_ERR(cxled)) { + dev_warn(&port->dev, + "Failed to allocate the decoder\n"); + return PTR_ERR(cxled); + } + cxld = &cxled->cxld; } - cxld->decoder_range = (struct range) { + cxld->hpa_range = (struct range) { .start = 0, .end = -1, }; - cxld->flags = CXL_DECODER_F_ENABLE; cxld->interleave_ways = min_not_zero(target_count, 1); cxld->interleave_granularity = SZ_4K; cxld->target_type = CXL_DECODER_EXPANDER; + cxld->commit = mock_decoder_commit; + cxld->reset = mock_decoder_reset; if (target_count) { rc = device_for_each_child(port->uport, &ctx, @@ -569,44 +627,6 @@ static void mock_companion(struct acpi_device *adev, struct device *dev) #define SZ_512G (SZ_64G * 8) #endif -static struct platform_device *alloc_memdev(int id) -{ - struct resource res[] = { - [0] = { - .flags = IORESOURCE_MEM, - }, - [1] = { - .flags = IORESOURCE_MEM, - .desc = IORES_DESC_PERSISTENT_MEMORY, - }, - }; - struct platform_device *pdev; - int i, rc; - - for (i = 0; i < ARRAY_SIZE(res); i++) { - struct cxl_mock_res *r = alloc_mock_res(SZ_256M); - - if (!r) - return NULL; - res[i].start = r->range.start; - res[i].end = r->range.end; - } - - pdev = platform_device_alloc("cxl_mem", id); - if (!pdev) - return NULL; - - rc = platform_device_add_resources(pdev, res, ARRAY_SIZE(res)); - if (rc) - goto err; - - return pdev; - -err: - platform_device_put(pdev); - return NULL; -} - static __init int cxl_test_init(void) { int rc, i; @@ -619,7 +639,8 @@ static __init int cxl_test_init(void) goto err_gen_pool_create; } - rc = gen_pool_add(cxl_mock_pool, SZ_512G, SZ_64G, NUMA_NO_NODE); + rc = gen_pool_add(cxl_mock_pool, iomem_resource.end + 1 - SZ_64G, + SZ_64G, NUMA_NO_NODE); if (rc) goto err_gen_pool_add; @@ -708,7 +729,7 @@ static __init int cxl_test_init(void) struct platform_device *dport = cxl_switch_dport[i]; struct platform_device *pdev; - pdev = alloc_memdev(i); + pdev = platform_device_alloc("cxl_mem", i); if (!pdev) goto err_mem; pdev->dev.parent = &dport->dev; diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c index 6b9239b2afd4..aa2df3a15051 100644 --- a/tools/testing/cxl/test/mem.c +++ b/tools/testing/cxl/test/mem.c @@ -10,6 +10,7 @@ #include <cxlmem.h> #define LSA_SIZE SZ_128K +#define DEV_SIZE SZ_2G #define EFFECT(x) (1U << x) static struct cxl_cel_entry mock_cel[] = { @@ -26,6 +27,10 @@ static struct cxl_cel_entry mock_cel[] = { .effect = cpu_to_le16(0), }, { + .opcode = cpu_to_le16(CXL_MBOX_OP_GET_PARTITION_INFO), + .effect = cpu_to_le16(0), + }, + { .opcode = cpu_to_le16(CXL_MBOX_OP_SET_LSA), .effect = cpu_to_le16(EFFECT(1) | EFFECT(2)), }, @@ -97,42 +102,37 @@ static int mock_get_log(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd *cmd) static int mock_id(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd *cmd) { - struct platform_device *pdev = to_platform_device(cxlds->dev); struct cxl_mbox_identify id = { .fw_revision = { "mock fw v1 " }, .lsa_size = cpu_to_le32(LSA_SIZE), - /* FIXME: Add partition support */ - .partition_align = cpu_to_le64(0), + .partition_align = + cpu_to_le64(SZ_256M / CXL_CAPACITY_MULTIPLIER), + .total_capacity = + cpu_to_le64(DEV_SIZE / CXL_CAPACITY_MULTIPLIER), }; - u64 capacity = 0; - int i; if (cmd->size_out < sizeof(id)) return -EINVAL; - for (i = 0; i < 2; i++) { - struct resource *res; - - res = platform_get_resource(pdev, IORESOURCE_MEM, i); - if (!res) - break; - - capacity += resource_size(res) / CXL_CAPACITY_MULTIPLIER; + memcpy(cmd->payload_out, &id, sizeof(id)); - if (le64_to_cpu(id.partition_align)) - continue; + return 0; +} - if (res->desc == IORES_DESC_PERSISTENT_MEMORY) - id.persistent_capacity = cpu_to_le64( - resource_size(res) / CXL_CAPACITY_MULTIPLIER); - else - id.volatile_capacity = cpu_to_le64( - resource_size(res) / CXL_CAPACITY_MULTIPLIER); - } +static int mock_partition_info(struct cxl_dev_state *cxlds, + struct cxl_mbox_cmd *cmd) +{ + struct cxl_mbox_get_partition_info pi = { + .active_volatile_cap = + cpu_to_le64(DEV_SIZE / 2 / CXL_CAPACITY_MULTIPLIER), + .active_persistent_cap = + cpu_to_le64(DEV_SIZE / 2 / CXL_CAPACITY_MULTIPLIER), + }; - id.total_capacity = cpu_to_le64(capacity); + if (cmd->size_out < sizeof(pi)) + return -EINVAL; - memcpy(cmd->payload_out, &id, sizeof(id)); + memcpy(cmd->payload_out, &pi, sizeof(pi)); return 0; } @@ -221,6 +221,9 @@ static int cxl_mock_mbox_send(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd * case CXL_MBOX_OP_GET_LSA: rc = mock_get_lsa(cxlds, cmd); break; + case CXL_MBOX_OP_GET_PARTITION_INFO: + rc = mock_partition_info(cxlds, cmd); + break; case CXL_MBOX_OP_SET_LSA: rc = mock_set_lsa(cxlds, cmd); break; @@ -282,7 +285,7 @@ static int cxl_mock_mem_probe(struct platform_device *pdev) if (IS_ERR(cxlmd)) return PTR_ERR(cxlmd); - if (range_len(&cxlds->pmem_range) && IS_ENABLED(CONFIG_CXL_PMEM)) + if (resource_size(&cxlds->pmem_res) && IS_ENABLED(CONFIG_CXL_PMEM)) rc = devm_cxl_add_nvdimm(dev, cxlmd); return 0; diff --git a/tools/testing/cxl/test/mock.c b/tools/testing/cxl/test/mock.c index f1f8c40948c5..bce6a21df0d5 100644 --- a/tools/testing/cxl/test/mock.c +++ b/tools/testing/cxl/test/mock.c @@ -208,13 +208,15 @@ int __wrap_cxl_await_media_ready(struct cxl_dev_state *cxlds) } EXPORT_SYMBOL_NS_GPL(__wrap_cxl_await_media_ready, CXL); -bool __wrap_cxl_hdm_decode_init(struct cxl_dev_state *cxlds, - struct cxl_hdm *cxlhdm) +int __wrap_cxl_hdm_decode_init(struct cxl_dev_state *cxlds, + struct cxl_hdm *cxlhdm) { int rc = 0, index; struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); - if (!ops || !ops->is_mock_dev(cxlds->dev)) + if (ops && ops->is_mock_dev(cxlds->dev)) + rc = 0; + else rc = cxl_hdm_decode_init(cxlds, cxlhdm); put_cxl_mock_ops(index); |