Diffstat (limited to 'drivers/cxl')
-rw-r--r--  drivers/cxl/Kconfig          180
-rw-r--r--  drivers/cxl/Makefile          22
-rw-r--r--  drivers/cxl/acpi.c          1038
-rw-r--r--  drivers/cxl/core.c          1067
-rw-r--r--  drivers/cxl/core/Makefile     22
-rw-r--r--  drivers/cxl/core/cdat.c     1074
-rw-r--r--  drivers/cxl/core/core.h      169
-rw-r--r--  drivers/cxl/core/edac.c     2109
-rw-r--r--  drivers/cxl/core/features.c  706
-rw-r--r--  drivers/cxl/core/hdm.c      1287
-rw-r--r--  drivers/cxl/core/mbox.c     1552
-rw-r--r--  drivers/cxl/core/mce.c        65
-rw-r--r--  drivers/cxl/core/mce.h        20
-rw-r--r--  drivers/cxl/core/memdev.c   1161
-rw-r--r--  drivers/cxl/core/pci.c      1189
-rw-r--r--  drivers/cxl/core/pmem.c      287
-rw-r--r--  drivers/cxl/core/pmu.c        68
-rw-r--r--  drivers/cxl/core/port.c     2532
-rw-r--r--  drivers/cxl/core/ras.c       126
-rw-r--r--  drivers/cxl/core/region.c   4003
-rw-r--r--  drivers/cxl/core/regs.c      644
-rw-r--r--  drivers/cxl/core/suspend.c    24
-rw-r--r--  drivers/cxl/core/trace.c       8
-rw-r--r--  drivers/cxl/core/trace.h    1105
-rw-r--r--  drivers/cxl/cxl.h            873
-rw-r--r--  drivers/cxl/cxlmem.h         931
-rw-r--r--  drivers/cxl/cxlpci.h         135
-rw-r--r--  drivers/cxl/mem.c            265
-rw-r--r--  drivers/cxl/mem.h             81
-rw-r--r--  drivers/cxl/pci.c           2121
-rw-r--r--  drivers/cxl/pci.h             31
-rw-r--r--  drivers/cxl/pmem.c           554
-rw-r--r--  drivers/cxl/pmu.h             28
-rw-r--r--  drivers/cxl/port.c           182
-rw-r--r--  drivers/cxl/security.c       206
35 files changed, 22915 insertions, 2950 deletions
diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig
index e6de221cc568..48b7314afdb8 100644
--- a/drivers/cxl/Kconfig
+++ b/drivers/cxl/Kconfig
@@ -2,6 +2,12 @@
menuconfig CXL_BUS
tristate "CXL (Compute Express Link) Devices Support"
depends on PCI
+ select FW_LOADER
+ select FW_UPLOAD
+ select PCI_DOE
+ select FIRMWARE_TABLE
+ select NUMA_KEEP_MEMINFO if NUMA_MEMBLKS
+ select FWCTL if CXL_FEATURES
help
CXL is a bus that is electrically compatible with PCI Express, but
layers three protocols on that signalling (CXL.io, CXL.cache, and
@@ -13,25 +19,26 @@ menuconfig CXL_BUS
if CXL_BUS
-config CXL_MEM
- tristate "CXL.mem: Memory Devices"
+config CXL_PCI
+ tristate "PCI manageability"
default CXL_BUS
help
- The CXL.mem protocol allows a device to act as a provider of
- "System RAM" and/or "Persistent Memory" that is fully coherent
- as if the memory was attached to the typical CPU memory
- controller.
+ The CXL specification defines a "CXL memory device" sub-class in the
+ PCI "memory controller" base class of devices. Device's identified by
+ this class code provide support for volatile and / or persistent
+ memory to be mapped into the system address map (Host-managed Device
+ Memory (HDM)).
- Say 'y/m' to enable a driver that will attach to CXL.mem devices for
- configuration and management primarily via the mailbox interface. See
- Chapter 2.3 Type 3 CXL Device in the CXL 2.0 specification for more
- details.
+ Say 'y/m' to enable a driver that will attach to CXL memory expander
+ devices enumerated by the memory device class code for configuration
+ and management primarily via the mailbox interface. See Chapter 2.3
+ Type 3 CXL Device in the CXL 2.0 specification for more details.
If unsure say 'm'.
config CXL_MEM_RAW_COMMANDS
bool "RAW Command Interface for Memory Devices"
- depends on CXL_MEM
+ depends on CXL_PCI
help
Enable CXL RAW command interface.
@@ -50,7 +57,11 @@ config CXL_MEM_RAW_COMMANDS
config CXL_ACPI
tristate "CXL ACPI: Platform Support"
depends on ACPI
+ depends on ACPI_NUMA
default CXL_BUS
+ select ACPI_TABLE_LIB
+ select ACPI_HMAT
+ select CXL_PORT
help
Enable support for host managed device memory (HDM) resources
published by a platform's ACPI CXL memory layout description. See
@@ -75,4 +86,151 @@ config CXL_PMEM
provisioning the persistent memory capacity of CXL memory expanders.
If unsure say 'm'.
+
+config CXL_MEM
+ tristate "CXL: Memory Expansion"
+ depends on CXL_PCI
+ default CXL_BUS
+ help
+ The CXL.mem protocol allows a device to act as a provider of "System
+ RAM" and/or "Persistent Memory" that is fully coherent as if the
+ memory were attached to the typical CPU memory controller. This is
+ known as HDM "Host-managed Device Memory".
+
+ Say 'y/m' to enable a driver that will attach to CXL.mem devices for
+ memory expansion and control of HDM. See Chapter 9.13 in the CXL 2.0
+ specification for a detailed description of HDM.
+
+ If unsure say 'm'.
+
+config CXL_FEATURES
+ bool "CXL: Features"
+ depends on CXL_PCI
+ help
+ Enable support for CXL Features. A CXL device that includes a mailbox
+ supports commands that allow listing, getting, and setting of
+ optionally defined features such as memory sparing or post package
+ sparing. Vendors may define custom features for the device.
+
+ If unsure say 'n'
+
+config CXL_EDAC_MEM_FEATURES
+ bool "CXL: EDAC Memory Features"
+ depends on EXPERT
+ depends on CXL_MEM
+ depends on CXL_FEATURES
+ depends on EDAC >= CXL_BUS
+ help
+ The CXL EDAC memory feature is optional and allows the host to
+ control the EDAC memory feature configurations of CXL memory
+ expander devices.
+
+ Say 'y' if you have an expert need to change default settings
+ of a memory RAS feature established by the platform/device.
+ Otherwise say 'n'.
+
+config CXL_EDAC_SCRUB
+ bool "Enable CXL Patrol Scrub Control (Patrol Read)"
+ depends on CXL_EDAC_MEM_FEATURES
+ depends on EDAC_SCRUB
+ help
+ The CXL EDAC scrub control is optional and allows the host to
+ control the scrub feature configurations of CXL memory expander
+ devices.
+
+ When enabled 'cxl_mem' and 'cxl_region' EDAC devices are
+ published with memory scrub control attributes as described by
+ Documentation/ABI/testing/sysfs-edac-scrub.
+
+ Say 'y' if you have an expert need to change default settings
+ of a memory scrub feature established by the platform/device
+ (e.g. scrub rates for the patrol scrub feature).
+ Otherwise say 'n'.
+
+config CXL_EDAC_ECS
+ bool "Enable CXL Error Check Scrub (Repair)"
+ depends on CXL_EDAC_MEM_FEATURES
+ depends on EDAC_ECS
+ help
+ The CXL EDAC ECS control is optional and allows the host to
+ control the ECS feature configurations of CXL memory expander
+ devices.
+
+ When enabled 'cxl_mem' EDAC devices are published with memory
+ ECS control attributes as described by
+ Documentation/ABI/testing/sysfs-edac-ecs.
+
+ Say 'y' if you have an expert need to change default settings
+ of a memory ECS feature established by the platform/device.
+ Otherwise say 'n'.
+
+config CXL_EDAC_MEM_REPAIR
+ bool "Enable CXL Memory Repair"
+ depends on CXL_EDAC_MEM_FEATURES
+ depends on EDAC_MEM_REPAIR
+ help
+ The CXL EDAC memory repair control is optional and allows the host
+ to control the memory repair feature configurations (e.g. sparing,
+ PPR) of CXL memory expander devices.
+
+ When enabled, the memory repair feature requires approximately 43KB
+ of additional memory to store CXL DRAM and CXL general
+ media event records.
+
+ When enabled 'cxl_mem' EDAC devices are published with memory
+ repair control attributes as described by
+ Documentation/ABI/testing/sysfs-edac-memory-repair.
+
+ Say 'y' if you have an expert need to change default settings
+ of a memory repair feature established by the platform/device.
+ Otherwise say 'n'.
+
+config CXL_PORT
+ default CXL_BUS
+ tristate
+
+config CXL_SUSPEND
+ def_bool y
+ depends on SUSPEND && CXL_MEM
+
+config CXL_REGION
+ bool "CXL: Region Support"
+ default CXL_BUS
+ # For MAX_PHYSMEM_BITS
+ depends on SPARSEMEM
+ select MEMREGION
+ select GET_FREE_REGION
+ help
+ Enable the CXL core to enumerate and provision CXL regions. A CXL
+ region is defined by one or more CXL expanders that decode a given
+ system-physical address range. For CXL regions established by
+ platform-firmware this option enables memory error handling to
+ identify the devices participating in a given interleaved memory
+ range. Otherwise, platform-firmware managed CXL is enabled by being
+ placed in the system address map and does not need a driver.
+
+ If unsure say 'y'
+
+config CXL_REGION_INVALIDATION_TEST
+ bool "CXL: Region Cache Management Bypass (TEST)"
+ depends on CXL_REGION
+ help
+ CXL Region management and security operations potentially invalidate
+ the content of CPU caches without notifying those caches to
+ invalidate the affected cachelines. The CXL Region driver attempts
+ to invalidate caches when those events occur. If that invalidation
+ fails the region will fail to enable. Cache invalidation can fail
+ when the CPU does not provide a suitable invalidation mechanism;
+ for example, use of wbinvd is restricted to bare metal x86.
+ However, for testing purposes toggling this option
+ can disable that data integrity safety and proceed with enabling
+ regions when there might be conflicting contents in the CPU cache.
+
+ If unsure, or if this kernel is meant for production environments,
+ say N.
+
+config CXL_MCE
+ def_bool y
+ depends on X86_MCE && MEMORY_FAILURE
+
endif
diff --git a/drivers/cxl/Makefile b/drivers/cxl/Makefile
index 32954059b37b..2caa90fa4bf2 100644
--- a/drivers/cxl/Makefile
+++ b/drivers/cxl/Makefile
@@ -1,11 +1,21 @@
# SPDX-License-Identifier: GPL-2.0
-obj-$(CONFIG_CXL_BUS) += cxl_core.o
-obj-$(CONFIG_CXL_MEM) += cxl_pci.o
+
+# Order is important here for the built-in case:
+# - 'core' first for fundamental init
+# - 'port' before platform root drivers like 'acpi' so that CXL-root ports
+# are immediately enabled
+# - 'mem' and 'pmem' before endpoint drivers so that memdevs are
+# immediately enabled
+# - 'pci' last, also mirrors the hardware enumeration hierarchy
+obj-y += core/
+obj-$(CONFIG_CXL_PORT) += cxl_port.o
obj-$(CONFIG_CXL_ACPI) += cxl_acpi.o
obj-$(CONFIG_CXL_PMEM) += cxl_pmem.o
+obj-$(CONFIG_CXL_MEM) += cxl_mem.o
+obj-$(CONFIG_CXL_PCI) += cxl_pci.o
-ccflags-y += -DDEFAULT_SYMBOL_NAMESPACE=CXL
-cxl_core-y := core.o
-cxl_pci-y := pci.o
+cxl_port-y := port.o
cxl_acpi-y := acpi.o
-cxl_pmem-y := pmem.o
+cxl_pmem-y := pmem.o security.o
+cxl_mem-y := mem.o
+cxl_pci-y := pci.o
diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c
index 8ae89273f58e..77ac940e3013 100644
--- a/drivers/cxl/acpi.c
+++ b/drivers/cxl/acpi.c
@@ -6,21 +6,138 @@
#include <linux/kernel.h>
#include <linux/acpi.h>
#include <linux/pci.h>
+#include <linux/node.h>
+#include <asm/div64.h>
+#include "cxlpci.h"
#include "cxl.h"
-static struct acpi_table_header *acpi_cedt;
+static const guid_t acpi_cxl_qtg_id_guid =
+ GUID_INIT(0xF365F9A6, 0xA7DE, 0x4071,
+ 0xA6, 0x6A, 0xB4, 0x0C, 0x0B, 0x4F, 0x8E, 0x52);
-/* Encode defined in CXL 2.0 8.2.5.12.7 HDM Decoder Control Register */
-#define CFMWS_INTERLEAVE_WAYS(x) (1 << (x)->interleave_ways)
-#define CFMWS_INTERLEAVE_GRANULARITY(x) ((x)->granularity + 8)
+#define HBIW_TO_NR_MAPS_SIZE (CXL_DECODER_MAX_INTERLEAVE + 1)
+static const int hbiw_to_nr_maps[HBIW_TO_NR_MAPS_SIZE] = {
+ [1] = 0, [2] = 1, [3] = 0, [4] = 2, [6] = 1, [8] = 3, [12] = 2, [16] = 4
+};
+
+static const int valid_hbiw[] = { 1, 2, 3, 4, 6, 8, 12, 16 };
+
+u64 cxl_do_xormap_calc(struct cxl_cxims_data *cximsd, u64 addr, int hbiw)
+{
+ int nr_maps_to_apply = -1;
+ u64 val;
+ int pos;
+
+ /*
+ * Strictly validate hbiw since this function is used for testing and
+ * that nullifies any expectation of trusted parameters from the CXL
+ * Region Driver.
+ */
+ for (int i = 0; i < ARRAY_SIZE(valid_hbiw); i++) {
+ if (valid_hbiw[i] == hbiw) {
+ nr_maps_to_apply = hbiw_to_nr_maps[hbiw];
+ break;
+ }
+ }
+ if (nr_maps_to_apply == -1 || nr_maps_to_apply > cximsd->nr_maps)
+ return ULLONG_MAX;
+
+ /*
+ * In regions using XOR interleave arithmetic the CXL HPA may not
+ * be the same as the SPA. This helper performs the SPA->CXL HPA
+ * or the CXL HPA->SPA translation. Since XOR is self-inverting,
+ * so is this function.
+ *
+ * For root decoders using xormaps (hbiw: 2,4,6,8,12,16) applying the
+ * xormaps will toggle a position bit.
+ *
+ * pos is the lowest set bit in an XORMAP
+ * val is the XORALLBITS(addr & XORMAP)
+ *
+ * XORALLBITS: The CXL spec (3.1 Table 9-22) defines XORALLBITS
+ * as an operation that outputs a single bit by XORing all the
+ * bits in the input (addr & xormap). Implement XORALLBITS using
+ * hweight64(). If the hamming weight is even the XOR of those
+ * bits results in val==0, if odd the XOR result is val==1.
+ */
+
+ for (int i = 0; i < cximsd->nr_maps; i++) {
+ if (!cximsd->xormaps[i])
+ continue;
+ pos = __ffs(cximsd->xormaps[i]);
+ val = (hweight64(addr & cximsd->xormaps[i]) & 1);
+ addr = (addr & ~(1ULL << pos)) | (val << pos);
+ }
+
+ return addr;
+}
+EXPORT_SYMBOL_FOR_MODULES(cxl_do_xormap_calc, "cxl_translate");
+
+static u64 cxl_apply_xor_maps(struct cxl_root_decoder *cxlrd, u64 addr)
+{
+ struct cxl_cxims_data *cximsd = cxlrd->platform_data;
+
+ return cxl_do_xormap_calc(cximsd, addr, cxlrd->cxlsd.nr_targets);
+}
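/*
 * Illustrative sketch, not part of this patch: a standalone userspace
 * model of the XORALLBITS translation implemented above in
 * cxl_do_xormap_calc(). The xormap values below are hypothetical; real
 * maps come from the CEDT CXIMS entries parsed by cxl_parse_cxims().
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t xormap_translate(uint64_t addr, const uint64_t *maps, int nr)
{
	for (int i = 0; i < nr; i++) {
		if (!maps[i])
			continue;
		int pos = __builtin_ctzll(maps[i]);	/* lowest set bit */
		uint64_t val = __builtin_popcountll(addr & maps[i]) & 1;

		addr = (addr & ~(1ULL << pos)) | (val << pos);
	}
	return addr;
}

int main(void)
{
	const uint64_t maps[] = { 0x10100, 0x20200 };	/* hypothetical maps */
	uint64_t spa = 0x12345000;
	uint64_t hpa = xormap_translate(spa, maps, 2);

	/* XOR is self-inverting: translating twice returns the input */
	printf("spa=%#llx hpa=%#llx back=%#llx\n",
	       (unsigned long long)spa, (unsigned long long)hpa,
	       (unsigned long long)xormap_translate(hpa, maps, 2));
	return 0;
}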
+
+struct cxl_cxims_context {
+ struct device *dev;
+ struct cxl_root_decoder *cxlrd;
+};
+
+static int cxl_parse_cxims(union acpi_subtable_headers *header, void *arg,
+ const unsigned long end)
+{
+ struct acpi_cedt_cxims *cxims = (struct acpi_cedt_cxims *)header;
+ struct cxl_cxims_context *ctx = arg;
+ struct cxl_root_decoder *cxlrd = ctx->cxlrd;
+ struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld;
+ struct device *dev = ctx->dev;
+ struct cxl_cxims_data *cximsd;
+ unsigned int hbig, nr_maps;
+ int rc;
+
+ rc = eig_to_granularity(cxims->hbig, &hbig);
+ if (rc)
+ return rc;
+
+ /* Does this CXIMS entry apply to the given CXL Window? */
+ if (hbig != cxld->interleave_granularity)
+ return 0;
+
+ /* IW 1,3 do not use xormaps and skip this parsing entirely */
+ if (is_power_of_2(cxld->interleave_ways))
+ /* 2, 4, 8, 16 way */
+ nr_maps = ilog2(cxld->interleave_ways);
+ else
+ /* 6, 12 way */
+ nr_maps = ilog2(cxld->interleave_ways / 3);
+
+ if (cxims->nr_xormaps < nr_maps) {
+ dev_dbg(dev, "CXIMS nr_xormaps[%d] expected[%d]\n",
+ cxims->nr_xormaps, nr_maps);
+ return -ENXIO;
+ }
+
+ cximsd = devm_kzalloc(dev, struct_size(cximsd, xormaps, nr_maps),
+ GFP_KERNEL);
+ if (!cximsd)
+ return -ENOMEM;
+ cximsd->nr_maps = nr_maps;
+ memcpy(cximsd->xormaps, cxims->xormap_list,
+ nr_maps * sizeof(*cximsd->xormaps));
+ cxlrd->platform_data = cximsd;
+
+ return 0;
+}
static unsigned long cfmws_to_decoder_flags(int restrictions)
{
- unsigned long flags = 0;
+ unsigned long flags = CXL_DECODER_F_ENABLE;
- if (restrictions & ACPI_CEDT_CFMWS_RESTRICT_TYPE2)
+ if (restrictions & ACPI_CEDT_CFMWS_RESTRICT_DEVMEM)
flags |= CXL_DECODER_F_TYPE2;
- if (restrictions & ACPI_CEDT_CFMWS_RESTRICT_TYPE3)
+ if (restrictions & ACPI_CEDT_CFMWS_RESTRICT_HOSTONLYMEM)
flags |= CXL_DECODER_F_TYPE3;
if (restrictions & ACPI_CEDT_CFMWS_RESTRICT_VOLATILE)
flags |= CXL_DECODER_F_RAM;
@@ -35,10 +152,13 @@ static unsigned long cfmws_to_decoder_flags(int restrictions)
static int cxl_acpi_cfmws_verify(struct device *dev,
struct acpi_cedt_cfmws *cfmws)
{
- int expected_len;
+ int rc, expected_len;
+ unsigned int ways;
- if (cfmws->interleave_arithmetic != ACPI_CEDT_CFMWS_ARITHMETIC_MODULO) {
- dev_err(dev, "CFMWS Unsupported Interleave Arithmetic\n");
+ if (cfmws->interleave_arithmetic != ACPI_CEDT_CFMWS_ARITHMETIC_MODULO &&
+ cfmws->interleave_arithmetic != ACPI_CEDT_CFMWS_ARITHMETIC_XOR) {
+ dev_err(dev, "CFMWS Unknown Interleave Arithmetic: %d\n",
+ cfmws->interleave_arithmetic);
return -EINVAL;
}
@@ -52,8 +172,14 @@ static int cxl_acpi_cfmws_verify(struct device *dev,
return -EINVAL;
}
- expected_len = struct_size((cfmws), interleave_targets,
- CFMWS_INTERLEAVE_WAYS(cfmws));
+ rc = eiw_to_ways(cfmws->interleave_ways, &ways);
+ if (rc) {
+ dev_err(dev, "CFMWS Interleave Ways (%d) invalid\n",
+ cfmws->interleave_ways);
+ return -EINVAL;
+ }
+
+ expected_len = struct_size(cfmws, interleave_targets, ways);
if (cfmws->header.length < expected_len) {
dev_err(dev, "CFMWS length %d less than expected %d\n",
@@ -68,282 +194,566 @@ static int cxl_acpi_cfmws_verify(struct device *dev,
return 0;
}
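/*
 * Illustrative sketch, not part of this patch: approximate semantics of
 * the eiw_to_ways()/eig_to_granularity() helpers used above. The old
 * CFMWS_INTERLEAVE_* macros removed by this patch only handled the
 * power-of-2 encodings; the CXL spec also defines codes for 3, 6 and
 * 12-way interleave. Treat the ranges below as an assumption, not the
 * core helpers' actual implementation.
 */
#include <errno.h>
#include <stdint.h>

static int sketch_eiw_to_ways(uint8_t eiw, unsigned int *ways)
{
	if (eiw <= 4) {			/* 1, 2, 4, 8, 16 way */
		*ways = 1 << eiw;
		return 0;
	}
	if (eiw >= 8 && eiw <= 10) {	/* 3, 6, 12 way */
		*ways = 3 << (eiw - 8);
		return 0;
	}
	return -EINVAL;
}

static int sketch_eig_to_granularity(uint8_t eig, unsigned int *granularity)
{
	if (eig > 6)			/* 256B up to 16KB */
		return -EINVAL;
	*granularity = 256 << eig;
	return 0;
}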
-static void cxl_add_cfmws_decoders(struct device *dev,
- struct cxl_port *root_port)
+/*
+ * Note, @dev must be the first member, see 'struct cxl_chbs_context'
+ * and mock_acpi_table_parse_cedt()
+ */
+struct cxl_cfmws_context {
+ struct device *dev;
+ struct cxl_port *root_port;
+ struct resource *cxl_res;
+ int id;
+};
+
+/**
+ * cxl_acpi_evaluate_qtg_dsm - Retrieve QTG ids via ACPI _DSM
+ * @handle: ACPI handle
+ * @coord: performance access coordinates
+ * @entries: number of QTG IDs to return
+ * @qos_class: int array provided by caller to return QTG IDs
+ *
+ * Return: number of QTG IDs returned, or -errno for errors
+ *
+ * Issue QTG _DSM with accompanied bandwidth and latency data in order to get
+ * the QTG IDs that are suitable for the performance point in order of most
+ * suitable to least suitable. Write back array of QTG IDs and return the
+ * actual number of QTG IDs written back.
+ */
+static int
+cxl_acpi_evaluate_qtg_dsm(acpi_handle handle, struct access_coordinate *coord,
+ int entries, int *qos_class)
{
- struct acpi_cedt_cfmws *cfmws;
- struct cxl_decoder *cxld;
- acpi_size len, cur = 0;
- void *cedt_subtable;
- unsigned long flags;
+ union acpi_object *out_obj, *out_buf, *obj;
+ union acpi_object in_array[4] = {
+ [0].integer = { ACPI_TYPE_INTEGER, coord->read_latency },
+ [1].integer = { ACPI_TYPE_INTEGER, coord->write_latency },
+ [2].integer = { ACPI_TYPE_INTEGER, coord->read_bandwidth },
+ [3].integer = { ACPI_TYPE_INTEGER, coord->write_bandwidth },
+ };
+ union acpi_object in_obj = {
+ .package = {
+ .type = ACPI_TYPE_PACKAGE,
+ .count = 4,
+ .elements = in_array,
+ },
+ };
+ int count, pkg_entries, i;
+ u16 max_qtg;
int rc;
- len = acpi_cedt->length - sizeof(*acpi_cedt);
- cedt_subtable = acpi_cedt + 1;
+ if (!entries)
+ return -EINVAL;
- while (cur < len) {
- struct acpi_cedt_header *c = cedt_subtable + cur;
+ out_obj = acpi_evaluate_dsm(handle, &acpi_cxl_qtg_id_guid, 1, 1, &in_obj);
+ if (!out_obj)
+ return -ENXIO;
- if (c->type != ACPI_CEDT_TYPE_CFMWS) {
- cur += c->length;
- continue;
- }
+ if (out_obj->type != ACPI_TYPE_PACKAGE) {
+ rc = -ENXIO;
+ goto out;
+ }
- cfmws = cedt_subtable + cur;
+ /* Check Max QTG ID */
+ obj = &out_obj->package.elements[0];
+ if (obj->type != ACPI_TYPE_INTEGER) {
+ rc = -ENXIO;
+ goto out;
+ }
- if (cfmws->header.length < sizeof(*cfmws)) {
- dev_warn_once(dev,
- "CFMWS entry skipped:invalid length:%u\n",
- cfmws->header.length);
- cur += c->length;
- continue;
- }
+ max_qtg = obj->integer.value;
- rc = cxl_acpi_cfmws_verify(dev, cfmws);
- if (rc) {
- dev_err(dev, "CFMWS range %#llx-%#llx not registered\n",
- cfmws->base_hpa, cfmws->base_hpa +
- cfmws->window_size - 1);
- cur += c->length;
- continue;
- }
+ /* It's legal to have 0 QTG entries */
+ pkg_entries = out_obj->package.count;
+ if (pkg_entries <= 1) {
+ rc = 0;
+ goto out;
+ }
- flags = cfmws_to_decoder_flags(cfmws->restrictions);
- cxld = devm_cxl_add_decoder(dev, root_port,
- CFMWS_INTERLEAVE_WAYS(cfmws),
- cfmws->base_hpa, cfmws->window_size,
- CFMWS_INTERLEAVE_WAYS(cfmws),
- CFMWS_INTERLEAVE_GRANULARITY(cfmws),
- CXL_DECODER_EXPANDER,
- flags);
-
- if (IS_ERR(cxld)) {
- dev_err(dev, "Failed to add decoder for %#llx-%#llx\n",
- cfmws->base_hpa, cfmws->base_hpa +
- cfmws->window_size - 1);
- } else {
- dev_dbg(dev, "add: %s range %#llx-%#llx\n",
- dev_name(&cxld->dev), cfmws->base_hpa,
- cfmws->base_hpa + cfmws->window_size - 1);
+ /* Retrieve QTG IDs package */
+ obj = &out_obj->package.elements[1];
+ if (obj->type != ACPI_TYPE_PACKAGE) {
+ rc = -ENXIO;
+ goto out;
+ }
+
+ pkg_entries = obj->package.count;
+ count = min(entries, pkg_entries);
+ for (i = 0; i < count; i++) {
+ u16 qtg_id;
+
+ out_buf = &obj->package.elements[i];
+ if (out_buf->type != ACPI_TYPE_INTEGER) {
+ rc = -ENXIO;
+ goto out;
}
- cur += c->length;
+
+ qtg_id = out_buf->integer.value;
+ if (qtg_id > max_qtg)
+ pr_warn("QTG ID %u greater than MAX %u\n",
+ qtg_id, max_qtg);
+
+ qos_class[i] = qtg_id;
}
+ rc = count;
+
+out:
+ ACPI_FREE(out_obj);
+ return rc;
}
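/*
 * Illustrative sketch, not part of this patch: how a caller might use
 * cxl_acpi_evaluate_qtg_dsm(). The handle and coordinate values are
 * hypothetical; in this patch the only caller is cxl_acpi_qos_class()
 * below, which passes coordinates computed by the CXL core.
 */
static int sketch_lookup_qtg(acpi_handle handle)
{
	struct access_coordinate coord = {
		.read_latency = 100,		/* hypothetical values */
		.write_latency = 120,
		.read_bandwidth = 16000,
		.write_bandwidth = 16000,
	};
	int qos_class[4];
	int nr;

	nr = cxl_acpi_evaluate_qtg_dsm(handle, &coord, ARRAY_SIZE(qos_class),
				       qos_class);
	if (nr <= 0)
		return nr ? nr : -ENXIO;

	/* qos_class[0..nr-1] holds QTG IDs, most suitable first */
	return qos_class[0];
}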
-static struct acpi_cedt_chbs *cxl_acpi_match_chbs(struct device *dev, u32 uid)
+static int cxl_acpi_qos_class(struct cxl_root *cxl_root,
+ struct access_coordinate *coord, int entries,
+ int *qos_class)
{
- struct acpi_cedt_chbs *chbs, *chbs_match = NULL;
- acpi_size len, cur = 0;
- void *cedt_subtable;
+ struct device *dev = cxl_root->port.uport_dev;
+ acpi_handle handle;
- len = acpi_cedt->length - sizeof(*acpi_cedt);
- cedt_subtable = acpi_cedt + 1;
+ if (!dev_is_platform(dev))
+ return -ENODEV;
- while (cur < len) {
- struct acpi_cedt_header *c = cedt_subtable + cur;
+ handle = ACPI_HANDLE(dev);
+ if (!handle)
+ return -ENODEV;
- if (c->type != ACPI_CEDT_TYPE_CHBS) {
- cur += c->length;
- continue;
- }
+ return cxl_acpi_evaluate_qtg_dsm(handle, coord, entries, qos_class);
+}
- chbs = cedt_subtable + cur;
+static const struct cxl_root_ops acpi_root_ops = {
+ .qos_class = cxl_acpi_qos_class,
+};
- if (chbs->header.length < sizeof(*chbs)) {
- dev_warn_once(dev,
- "CHBS entry skipped: invalid length:%u\n",
- chbs->header.length);
- cur += c->length;
- continue;
- }
+static void del_cxl_resource(struct resource *res)
+{
+ if (!res)
+ return;
+ kfree(res->name);
+ kfree(res);
+}
- if (chbs->uid != uid) {
- cur += c->length;
- continue;
- }
+static struct resource *alloc_cxl_resource(resource_size_t base,
+ resource_size_t n, int id)
+{
+ struct resource *res __free(kfree) = kzalloc(sizeof(*res), GFP_KERNEL);
- if (chbs_match) {
- dev_warn_once(dev,
- "CHBS entry skipped: duplicate UID:%u\n",
- uid);
- cur += c->length;
- continue;
- }
+ if (!res)
+ return NULL;
- chbs_match = chbs;
- cur += c->length;
- }
+ res->start = base;
+ res->end = base + n - 1;
+ res->flags = IORESOURCE_MEM;
+ res->name = kasprintf(GFP_KERNEL, "CXL Window %d", id);
+ if (!res->name)
+ return NULL;
- return chbs_match ? chbs_match : ERR_PTR(-ENODEV);
+ return no_free_ptr(res);
}
-static resource_size_t get_chbcr(struct acpi_cedt_chbs *chbs)
+static int add_or_reset_cxl_resource(struct resource *parent, struct resource *res)
{
- return IS_ERR(chbs) ? CXL_RESOURCE_NONE : chbs->base;
+ int rc = insert_resource(parent, res);
+
+ if (rc)
+ del_cxl_resource(res);
+ return rc;
}
-struct cxl_walk_context {
- struct device *dev;
- struct pci_bus *root;
- struct cxl_port *port;
- int error;
- int count;
-};
+static int cxl_acpi_set_cache_size(struct cxl_root_decoder *cxlrd)
+{
+ struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld;
+ struct range *hpa = &cxld->hpa_range;
+ resource_size_t size = range_len(hpa);
+ resource_size_t start = hpa->start;
+ resource_size_t cache_size;
+ struct resource res;
+ int nid, rc;
+
+ res = DEFINE_RES_MEM(start, size);
+ nid = phys_to_target_node(start);
+
+ rc = hmat_get_extended_linear_cache_size(&res, nid, &cache_size);
+ if (rc)
+ return 0;
+
+ /*
+ * The cache range is expected to be within the CFMWS.
+ * Currently only cache_size == cxl_size is supported, where the
+ * CXL size is half of the total CFMWS window size.
+ */
+ size = size >> 1;
+ if (cache_size && size != cache_size) {
+ dev_warn(&cxld->dev,
+ "Extended Linear Cache size %pa != CXL size %pa. No Support!",
+ &cache_size, &size);
+ return -ENXIO;
+ }
-static int match_add_root_ports(struct pci_dev *pdev, void *data)
+ cxlrd->cache_size = cache_size;
+
+ return 0;
+}
+
+static void cxl_setup_extended_linear_cache(struct cxl_root_decoder *cxlrd)
+{
+ int rc;
+
+ rc = cxl_acpi_set_cache_size(cxlrd);
+ if (rc) {
+ /*
+ * Failing to retrieve the extended linear cache size does not
+ * prevent the region from functioning. It only causes 'cxl list'
+ * to show an incorrect region size.
+ */
+ dev_warn(cxlrd->cxlsd.cxld.dev.parent,
+ "Extended linear cache retrieval failed rc:%d\n", rc);
+
+ /* Ignoring return code */
+ cxlrd->cache_size = 0;
+ }
+}
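/*
 * Worked example, not part of this patch: with a hypothetical 512GB
 * CFMWS window split evenly between an extended linear cache and CXL
 * memory, cxl_acpi_set_cache_size() expects HMAT to report
 * cache_size == 256GB, i.e. exactly half of range_len(hpa); any other
 * non-zero cache_size is rejected with -ENXIO above.
 */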
+
+DEFINE_FREE(put_cxlrd, struct cxl_root_decoder *,
+ if (!IS_ERR_OR_NULL(_T)) put_device(&_T->cxlsd.cxld.dev))
+DEFINE_FREE(del_cxl_resource, struct resource *, if (_T) del_cxl_resource(_T))
+static int __cxl_parse_cfmws(struct acpi_cedt_cfmws *cfmws,
+ struct cxl_cfmws_context *ctx)
{
- struct cxl_walk_context *ctx = data;
- struct pci_bus *root_bus = ctx->root;
- struct cxl_port *port = ctx->port;
- int type = pci_pcie_type(pdev);
+ struct cxl_port *root_port = ctx->root_port;
+ struct cxl_cxims_context cxims_ctx;
struct device *dev = ctx->dev;
- u32 lnkcap, port_num;
+ struct cxl_decoder *cxld;
+ unsigned int ways, i, ig;
int rc;
- if (pdev->bus != root_bus)
- return 0;
- if (!pci_is_pcie(pdev))
- return 0;
- if (type != PCI_EXP_TYPE_ROOT_PORT)
- return 0;
- if (pci_read_config_dword(pdev, pci_pcie_cap(pdev) + PCI_EXP_LNKCAP,
- &lnkcap) != PCIBIOS_SUCCESSFUL)
- return 0;
+ rc = cxl_acpi_cfmws_verify(dev, cfmws);
+ if (rc)
+ return rc;
- /* TODO walk DVSEC to find component register base */
- port_num = FIELD_GET(PCI_EXP_LNKCAP_PN, lnkcap);
- rc = cxl_add_dport(port, &pdev->dev, port_num, CXL_RESOURCE_NONE);
- if (rc) {
- ctx->error = rc;
+ rc = eiw_to_ways(cfmws->interleave_ways, &ways);
+ if (rc)
return rc;
+ rc = eig_to_granularity(cfmws->granularity, &ig);
+ if (rc)
+ return rc;
+
+ struct resource *res __free(del_cxl_resource) = alloc_cxl_resource(
+ cfmws->base_hpa, cfmws->window_size, ctx->id++);
+ if (!res)
+ return -ENOMEM;
+
+ /* add to the local resource tracking to establish a sort order */
+ rc = add_or_reset_cxl_resource(ctx->cxl_res, no_free_ptr(res));
+ if (rc)
+ return rc;
+
+ struct cxl_root_decoder *cxlrd __free(put_cxlrd) =
+ cxl_root_decoder_alloc(root_port, ways);
+
+ if (IS_ERR(cxlrd))
+ return PTR_ERR(cxlrd);
+
+ cxld = &cxlrd->cxlsd.cxld;
+ cxld->flags = cfmws_to_decoder_flags(cfmws->restrictions);
+ cxld->target_type = CXL_DECODER_HOSTONLYMEM;
+ cxld->hpa_range = (struct range) {
+ .start = cfmws->base_hpa,
+ .end = cfmws->base_hpa + cfmws->window_size - 1,
+ };
+ cxld->interleave_ways = ways;
+ for (i = 0; i < ways; i++)
+ cxld->target_map[i] = cfmws->interleave_targets[i];
+ /*
+ * Minimize the x1 granularity to advertise support for any
+ * valid region granularity
+ */
+ if (ways == 1)
+ ig = CXL_DECODER_MIN_GRANULARITY;
+ cxld->interleave_granularity = ig;
+
+ if (cfmws->interleave_arithmetic == ACPI_CEDT_CFMWS_ARITHMETIC_XOR) {
+ if (ways != 1 && ways != 3) {
+ cxims_ctx = (struct cxl_cxims_context) {
+ .dev = dev,
+ .cxlrd = cxlrd,
+ };
+ rc = acpi_table_parse_cedt(ACPI_CEDT_TYPE_CXIMS,
+ cxl_parse_cxims, &cxims_ctx);
+ if (rc < 0)
+ return rc;
+ if (!cxlrd->platform_data) {
+ dev_err(dev, "No CXIMS for HBIG %u\n", ig);
+ return -EINVAL;
+ }
+ }
+ cxlrd->ops.hpa_to_spa = cxl_apply_xor_maps;
+ cxlrd->ops.spa_to_hpa = cxl_apply_xor_maps;
}
- ctx->count++;
- dev_dbg(dev, "add dport%d: %s\n", port_num, dev_name(&pdev->dev));
+ cxl_setup_extended_linear_cache(cxlrd);
+
+ cxlrd->qos_class = cfmws->qtg_id;
+
+ rc = cxl_decoder_add(cxld);
+ if (rc)
+ return rc;
+
+ rc = cxl_root_decoder_autoremove(dev, no_free_ptr(cxlrd));
+ if (rc)
+ return rc;
+
+ dev_dbg(root_port->dev.parent, "%s added to %s\n",
+ dev_name(&cxld->dev), dev_name(&root_port->dev));
return 0;
}
-static struct cxl_dport *find_dport_by_dev(struct cxl_port *port, struct device *dev)
+static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg,
+ const unsigned long end)
{
- struct cxl_dport *dport;
-
- device_lock(&port->dev);
- list_for_each_entry(dport, &port->dports, list)
- if (dport->dport == dev) {
- device_unlock(&port->dev);
- return dport;
- }
+ struct acpi_cedt_cfmws *cfmws = (struct acpi_cedt_cfmws *)header;
+ struct cxl_cfmws_context *ctx = arg;
+ struct device *dev = ctx->dev;
+ int rc;
- device_unlock(&port->dev);
- return NULL;
+ rc = __cxl_parse_cfmws(cfmws, ctx);
+ if (rc)
+ dev_err(dev,
+ "Failed to add decode range: [%#llx - %#llx] (%d)\n",
+ cfmws->base_hpa,
+ cfmws->base_hpa + cfmws->window_size - 1, rc);
+ else
+ dev_dbg(dev, "decode range: node: %d range [%#llx - %#llx]\n",
+ phys_to_target_node(cfmws->base_hpa), cfmws->base_hpa,
+ cfmws->base_hpa + cfmws->window_size - 1);
+
+ /* never fail cxl_acpi load for a single window failure */
+ return 0;
}
-static struct acpi_device *to_cxl_host_bridge(struct device *dev)
+__mock struct acpi_device *to_cxl_host_bridge(struct device *host,
+ struct device *dev)
{
struct acpi_device *adev = to_acpi_device(dev);
+ if (!acpi_pci_find_root(adev->handle))
+ return NULL;
+
if (strcmp(acpi_device_hid(adev), "ACPI0016") == 0)
return adev;
return NULL;
}
-/*
- * A host bridge is a dport to a CFMWS decode and it is a uport to the
- * dport (PCIe Root Ports) in the host bridge.
- */
-static int add_host_bridge_uport(struct device *match, void *arg)
+/* Note, @dev is used by mock_acpi_table_parse_cedt() */
+struct cxl_chbs_context {
+ struct device *dev;
+ unsigned long long uid;
+ resource_size_t base;
+ u32 cxl_version;
+ int nr_versions;
+ u32 saved_version;
+};
+
+static int cxl_get_chbs_iter(union acpi_subtable_headers *header, void *arg,
+ const unsigned long end)
{
- struct acpi_device *bridge = to_cxl_host_bridge(match);
- struct cxl_port *root_port = arg;
- struct device *host = root_port->dev.parent;
- struct acpi_pci_root *pci_root;
- struct cxl_walk_context ctx;
- struct cxl_decoder *cxld;
- struct cxl_dport *dport;
- struct cxl_port *port;
+ struct cxl_chbs_context *ctx = arg;
+ struct acpi_cedt_chbs *chbs;
+
+ chbs = (struct acpi_cedt_chbs *) header;
- if (!bridge)
+ if (chbs->cxl_version == ACPI_CEDT_CHBS_VERSION_CXL11 &&
+ chbs->length != ACPI_CEDT_CHBS_LENGTH_CXL11)
return 0;
- pci_root = acpi_pci_find_root(bridge->handle);
- if (!pci_root)
- return -ENXIO;
+ if (chbs->cxl_version == ACPI_CEDT_CHBS_VERSION_CXL20 &&
+ chbs->length != ACPI_CEDT_CHBS_LENGTH_CXL20)
+ return 0;
- dport = find_dport_by_dev(root_port, match);
- if (!dport) {
- dev_dbg(host, "host bridge expected and not found\n");
- return -ENODEV;
+ if (!chbs->base)
+ return 0;
+
+ if (ctx->saved_version != chbs->cxl_version) {
+ /*
+ * cxl_version cannot be overwritten before the next two
+ * checks, then use saved_version
+ */
+ ctx->saved_version = chbs->cxl_version;
+ ctx->nr_versions++;
}
- port = devm_cxl_add_port(host, match, dport->component_reg_phys,
- root_port);
- if (IS_ERR(port))
- return PTR_ERR(port);
- dev_dbg(host, "%s: add: %s\n", dev_name(match), dev_name(&port->dev));
+ if (ctx->base != CXL_RESOURCE_NONE)
+ return 0;
- ctx = (struct cxl_walk_context){
- .dev = host,
- .root = pci_root->bus,
- .port = port,
+ if (ctx->uid != chbs->uid)
+ return 0;
+
+ ctx->cxl_version = chbs->cxl_version;
+ ctx->base = chbs->base;
+
+ return 0;
+}
+
+static int cxl_get_chbs(struct device *dev, struct acpi_device *hb,
+ struct cxl_chbs_context *ctx)
+{
+ unsigned long long uid;
+ int rc;
+
+ rc = acpi_evaluate_integer(hb->handle, METHOD_NAME__UID, NULL, &uid);
+ if (rc != AE_OK) {
+ dev_err(dev, "unable to retrieve _UID\n");
+ return -ENOENT;
+ }
+
+ dev_dbg(dev, "UID found: %lld\n", uid);
+ *ctx = (struct cxl_chbs_context) {
+ .dev = dev,
+ .uid = uid,
+ .base = CXL_RESOURCE_NONE,
+ .cxl_version = UINT_MAX,
+ .saved_version = UINT_MAX,
};
- pci_walk_bus(pci_root->bus, match_add_root_ports, &ctx);
- if (ctx.count == 0)
- return -ENODEV;
- if (ctx.error)
- return ctx.error;
+ acpi_table_parse_cedt(ACPI_CEDT_TYPE_CHBS, cxl_get_chbs_iter, ctx);
+
+ if (ctx->nr_versions > 1) {
+ /*
+ * Disclaim eRCD support given some component register may
+ * only be found via CHBCR
+ */
+ dev_info(dev, "Unsupported platform config, mixed Virtual Host and Restricted CXL Host hierarchy.");
+ }
+
+ return 0;
+}
+
+static int get_genport_coordinates(struct device *dev, struct cxl_dport *dport)
+{
+ struct acpi_device *hb = to_cxl_host_bridge(NULL, dev);
+ u32 uid;
+
+ if (kstrtou32(acpi_device_uid(hb), 0, &uid))
+ return -EINVAL;
+
+ return acpi_get_genport_coordinates(uid, dport->coord);
+}
+
+static int add_host_bridge_dport(struct device *match, void *arg)
+{
+ int ret;
+ acpi_status rc;
+ struct device *bridge;
+ struct cxl_dport *dport;
+ struct cxl_chbs_context ctx;
+ struct acpi_pci_root *pci_root;
+ struct cxl_port *root_port = arg;
+ struct device *host = root_port->dev.parent;
+ struct acpi_device *hb = to_cxl_host_bridge(host, match);
- /* TODO: Scan CHBCR for HDM Decoder resources */
+ if (!hb)
+ return 0;
+
+ rc = cxl_get_chbs(match, hb, &ctx);
+ if (rc)
+ return rc;
+
+ if (ctx.cxl_version == UINT_MAX) {
+ dev_warn(match, "No CHBS found for Host Bridge (UID %lld)\n",
+ ctx.uid);
+ return 0;
+ }
+
+ if (ctx.base == CXL_RESOURCE_NONE) {
+ dev_warn(match, "CHBS invalid for Host Bridge (UID %lld)\n",
+ ctx.uid);
+ return 0;
+ }
+
+ pci_root = acpi_pci_find_root(hb->handle);
+ bridge = pci_root->bus->bridge;
/*
- * In the single-port host-bridge case there are no HDM decoders
- * in the CHBCR and a 1:1 passthrough decode is implied.
+ * In RCH mode, bind the component regs base to the dport. In
+ * VH mode it will be bound to the CXL host bridge's port
+ * object later in add_host_bridge_uport().
*/
- if (ctx.count == 1) {
- cxld = devm_cxl_add_passthrough_decoder(host, port);
- if (IS_ERR(cxld))
- return PTR_ERR(cxld);
-
- dev_dbg(host, "add: %s\n", dev_name(&cxld->dev));
+ if (ctx.cxl_version == ACPI_CEDT_CHBS_VERSION_CXL11) {
+ dev_dbg(match, "RCRB found for UID %lld: %pa\n", ctx.uid,
+ &ctx.base);
+ dport = devm_cxl_add_rch_dport(root_port, bridge, ctx.uid,
+ ctx.base);
+ } else {
+ dport = devm_cxl_add_dport(root_port, bridge, ctx.uid,
+ CXL_RESOURCE_NONE);
}
+ if (IS_ERR(dport))
+ return PTR_ERR(dport);
+
+ ret = get_genport_coordinates(match, dport);
+ if (ret)
+ dev_dbg(match, "Failed to get generic port perf coordinates.\n");
+
return 0;
}
-static int add_host_bridge_dport(struct device *match, void *arg)
+/*
+ * A host bridge is a dport to a CFMWS decode and it is a uport to the
+ * dport (PCIe Root Ports) in the host bridge.
+ */
+static int add_host_bridge_uport(struct device *match, void *arg)
{
- int rc;
- acpi_status status;
- unsigned long long uid;
- struct acpi_cedt_chbs *chbs;
struct cxl_port *root_port = arg;
struct device *host = root_port->dev.parent;
- struct acpi_device *bridge = to_cxl_host_bridge(match);
+ struct acpi_device *hb = to_cxl_host_bridge(host, match);
+ struct acpi_pci_root *pci_root;
+ struct cxl_dport *dport;
+ struct cxl_port *port;
+ struct device *bridge;
+ struct cxl_chbs_context ctx;
+ resource_size_t component_reg_phys;
+ int rc;
- if (!bridge)
+ if (!hb)
return 0;
- status = acpi_evaluate_integer(bridge->handle, METHOD_NAME__UID, NULL,
- &uid);
- if (status != AE_OK) {
- dev_err(host, "unable to retrieve _UID of %s\n",
- dev_name(match));
- return -ENODEV;
+ pci_root = acpi_pci_find_root(hb->handle);
+ bridge = pci_root->bus->bridge;
+ dport = cxl_find_dport_by_dev(root_port, bridge);
+ if (!dport) {
+ dev_dbg(host, "host bridge expected and not found\n");
+ return 0;
}
- chbs = cxl_acpi_match_chbs(host, uid);
- if (IS_ERR(chbs))
- dev_dbg(host, "No CHBS found for Host Bridge: %s\n",
- dev_name(match));
+ if (dport->rch) {
+ dev_info(bridge, "host supports CXL (restricted)\n");
+ return 0;
+ }
- rc = cxl_add_dport(root_port, match, uid, get_chbcr(chbs));
- if (rc) {
- dev_err(host, "failed to add downstream port: %s\n",
- dev_name(match));
+ rc = cxl_get_chbs(match, hb, &ctx);
+ if (rc)
return rc;
+
+ if (ctx.cxl_version == ACPI_CEDT_CHBS_VERSION_CXL11) {
+ dev_warn(bridge,
+ "CXL CHBS version mismatch, skip port registration\n");
+ return 0;
}
- dev_dbg(host, "add dport%llu: %s\n", uid, dev_name(match));
+
+ component_reg_phys = ctx.base;
+ if (component_reg_phys != CXL_RESOURCE_NONE)
+ dev_dbg(match, "CHBCR found for UID %lld: %pa\n",
+ ctx.uid, &component_reg_phys);
+
+ rc = devm_cxl_register_pci_bus(host, bridge, pci_root->bus);
+ if (rc)
+ return rc;
+
+ port = devm_cxl_add_port(host, bridge, component_reg_phys, dport);
+ if (IS_ERR(port))
+ return PTR_ERR(port);
+
+ dev_info(bridge, "host supports CXL\n");
+
return 0;
}
@@ -371,29 +781,180 @@ static int add_root_nvdimm_bridge(struct device *match, void *data)
return 1;
}
+static struct lock_class_key cxl_root_key;
+
+static void cxl_acpi_lock_reset_class(void *dev)
+{
+ device_lock_reset_class(dev);
+}
+
+static void cxl_set_public_resource(struct resource *priv, struct resource *pub)
+{
+ priv->desc = (unsigned long) pub;
+}
+
+static struct resource *cxl_get_public_resource(struct resource *priv)
+{
+ return (struct resource *) priv->desc;
+}
+
+static void remove_cxl_resources(void *data)
+{
+ struct resource *res, *next, *cxl = data;
+
+ for (res = cxl->child; res; res = next) {
+ struct resource *victim = cxl_get_public_resource(res);
+
+ next = res->sibling;
+ remove_resource(res);
+
+ if (victim) {
+ remove_resource(victim);
+ kfree(victim);
+ }
+
+ del_cxl_resource(res);
+ }
+}
+
+/**
+ * add_cxl_resources() - reflect CXL fixed memory windows in iomem_resource
+ * @cxl_res: A standalone resource tree where each CXL window is a sibling
+ *
+ * Walk each CXL window in @cxl_res and add it to iomem_resource potentially
+ * expanding its boundaries to ensure that any conflicting resources become
+ * children. If a window is expanded it may then conflict with another window
+ * entry and require the window to be truncated or trimmed. Consider this
+ * situation::
+ *
+ * |-- "CXL Window 0" --||----- "CXL Window 1" -----|
+ * |--------------- "System RAM" -------------|
+ *
+ * ...where platform firmware has established a "System RAM" resource across 2
+ * windows, but has left some portion of window 1 for dynamic CXL region
+ * provisioning. In this case "Window 0" will span the entirety of the "System
+ * RAM" span, and "CXL Window 1" is truncated to the remaining tail past the end
+ * of that "System RAM" resource.
+ */
+static int add_cxl_resources(struct resource *cxl_res)
+{
+ struct resource *res, *new, *next;
+
+ for (res = cxl_res->child; res; res = next) {
+ new = kzalloc(sizeof(*new), GFP_KERNEL);
+ if (!new)
+ return -ENOMEM;
+ new->name = res->name;
+ new->start = res->start;
+ new->end = res->end;
+ new->flags = IORESOURCE_MEM;
+ new->desc = IORES_DESC_CXL;
+
+ /*
+ * Record the public resource in the private cxl_res tree for
+ * later removal.
+ */
+ cxl_set_public_resource(res, new);
+
+ insert_resource_expand_to_fit(&iomem_resource, new);
+
+ next = res->sibling;
+ while (next && resource_overlaps(new, next)) {
+ if (resource_contains(new, next)) {
+ struct resource *_next = next->sibling;
+
+ remove_resource(next);
+ del_cxl_resource(next);
+ next = _next;
+ } else
+ next->start = new->end + 1;
+ }
+ }
+ return 0;
+}
+
+static int pair_cxl_resource(struct device *dev, void *data)
+{
+ struct resource *cxl_res = data;
+ struct resource *p;
+
+ if (!is_root_decoder(dev))
+ return 0;
+
+ for (p = cxl_res->child; p; p = p->sibling) {
+ struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev);
+ struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld;
+ struct resource res = {
+ .start = cxld->hpa_range.start,
+ .end = cxld->hpa_range.end,
+ .flags = IORESOURCE_MEM,
+ };
+
+ if (resource_contains(p, &res)) {
+ cxlrd->res = cxl_get_public_resource(p);
+ break;
+ }
+ }
+
+ return 0;
+}
+
static int cxl_acpi_probe(struct platform_device *pdev)
{
int rc;
- acpi_status status;
+ struct resource *cxl_res;
+ struct cxl_root *cxl_root;
struct cxl_port *root_port;
struct device *host = &pdev->dev;
struct acpi_device *adev = ACPI_COMPANION(host);
+ struct cxl_cfmws_context ctx;
- root_port = devm_cxl_add_port(host, host, CXL_RESOURCE_NONE, NULL);
- if (IS_ERR(root_port))
- return PTR_ERR(root_port);
- dev_dbg(host, "add: %s\n", dev_name(&root_port->dev));
+ device_lock_set_class(&pdev->dev, &cxl_root_key);
+ rc = devm_add_action_or_reset(&pdev->dev, cxl_acpi_lock_reset_class,
+ &pdev->dev);
+ if (rc)
+ return rc;
- status = acpi_get_table(ACPI_SIG_CEDT, 0, &acpi_cedt);
- if (ACPI_FAILURE(status))
- return -ENXIO;
+ cxl_res = devm_kzalloc(host, sizeof(*cxl_res), GFP_KERNEL);
+ if (!cxl_res)
+ return -ENOMEM;
+ cxl_res->name = "CXL mem";
+ cxl_res->start = 0;
+ cxl_res->end = -1;
+ cxl_res->flags = IORESOURCE_MEM;
+
+ cxl_root = devm_cxl_add_root(host, &acpi_root_ops);
+ if (IS_ERR(cxl_root))
+ return PTR_ERR(cxl_root);
+ root_port = &cxl_root->port;
rc = bus_for_each_dev(adev->dev.bus, NULL, root_port,
add_host_bridge_dport);
+ if (rc < 0)
+ return rc;
+
+ rc = devm_add_action_or_reset(host, remove_cxl_resources, cxl_res);
if (rc)
- goto out;
+ return rc;
+
+ ctx = (struct cxl_cfmws_context) {
+ .dev = host,
+ .root_port = root_port,
+ .cxl_res = cxl_res,
+ };
+ rc = acpi_table_parse_cedt(ACPI_CEDT_TYPE_CFMWS, cxl_parse_cfmws, &ctx);
+ if (rc < 0)
+ return -ENXIO;
- cxl_add_cfmws_decoders(host, root_port);
+ rc = add_cxl_resources(cxl_res);
+ if (rc)
+ return rc;
+
+ /*
+ * Populate the root decoders with their related iomem resource,
+ * if present
+ */
+ device_for_each_child(&root_port->dev, cxl_res, pair_cxl_resource);
/*
* Root level scanned with host-bridge as dports, now scan host-bridges
@@ -401,34 +962,63 @@ static int cxl_acpi_probe(struct platform_device *pdev)
*/
rc = bus_for_each_dev(adev->dev.bus, NULL, root_port,
add_host_bridge_uport);
- if (rc)
- goto out;
+ if (rc < 0)
+ return rc;
if (IS_ENABLED(CONFIG_CXL_PMEM))
rc = device_for_each_child(&root_port->dev, root_port,
add_root_nvdimm_bridge);
-
-out:
- acpi_put_table(acpi_cedt);
if (rc < 0)
return rc;
+
+ /* In case PCI is scanned before ACPI re-trigger memdev attach */
+ cxl_bus_rescan();
return 0;
}
static const struct acpi_device_id cxl_acpi_ids[] = {
- { "ACPI0017", 0 },
- { "", 0 },
+ { "ACPI0017" },
+ { },
};
MODULE_DEVICE_TABLE(acpi, cxl_acpi_ids);
+static const struct platform_device_id cxl_test_ids[] = {
+ { "cxl_acpi" },
+ { },
+};
+MODULE_DEVICE_TABLE(platform, cxl_test_ids);
+
static struct platform_driver cxl_acpi_driver = {
.probe = cxl_acpi_probe,
.driver = {
.name = KBUILD_MODNAME,
.acpi_match_table = cxl_acpi_ids,
},
+ .id_table = cxl_test_ids,
};
-module_platform_driver(cxl_acpi_driver);
+static int __init cxl_acpi_init(void)
+{
+ return platform_driver_register(&cxl_acpi_driver);
+}
+
+static void __exit cxl_acpi_exit(void)
+{
+ platform_driver_unregister(&cxl_acpi_driver);
+ cxl_bus_drain();
+}
+
+/* load before dax_hmem sees 'Soft Reserved' CXL ranges */
+subsys_initcall(cxl_acpi_init);
+
+/*
+ * Arrange for host-bridge ports to be active synchronous with
+ * cxl_acpi_probe() exit.
+ */
+MODULE_SOFTDEP("pre: cxl_port");
+
+module_exit(cxl_acpi_exit);
+MODULE_DESCRIPTION("CXL ACPI: Platform Support");
MODULE_LICENSE("GPL v2");
-MODULE_IMPORT_NS(CXL);
+MODULE_IMPORT_NS("CXL");
+MODULE_IMPORT_NS("ACPI");
diff --git a/drivers/cxl/core.c b/drivers/cxl/core.c
deleted file mode 100644
index a2e4d54fc7bc..000000000000
--- a/drivers/cxl/core.c
+++ /dev/null
@@ -1,1067 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/* Copyright(c) 2020 Intel Corporation. All rights reserved. */
-#include <linux/io-64-nonatomic-lo-hi.h>
-#include <linux/device.h>
-#include <linux/module.h>
-#include <linux/pci.h>
-#include <linux/slab.h>
-#include <linux/idr.h>
-#include "cxl.h"
-#include "mem.h"
-
-/**
- * DOC: cxl core
- *
- * The CXL core provides a sysfs hierarchy for control devices and a rendezvous
- * point for cross-device interleave coordination through cxl ports.
- */
-
-static DEFINE_IDA(cxl_port_ida);
-
-static ssize_t devtype_show(struct device *dev, struct device_attribute *attr,
- char *buf)
-{
- return sysfs_emit(buf, "%s\n", dev->type->name);
-}
-static DEVICE_ATTR_RO(devtype);
-
-static struct attribute *cxl_base_attributes[] = {
- &dev_attr_devtype.attr,
- NULL,
-};
-
-static struct attribute_group cxl_base_attribute_group = {
- .attrs = cxl_base_attributes,
-};
-
-static ssize_t start_show(struct device *dev, struct device_attribute *attr,
- char *buf)
-{
- struct cxl_decoder *cxld = to_cxl_decoder(dev);
-
- return sysfs_emit(buf, "%#llx\n", cxld->range.start);
-}
-static DEVICE_ATTR_RO(start);
-
-static ssize_t size_show(struct device *dev, struct device_attribute *attr,
- char *buf)
-{
- struct cxl_decoder *cxld = to_cxl_decoder(dev);
-
- return sysfs_emit(buf, "%#llx\n", range_len(&cxld->range));
-}
-static DEVICE_ATTR_RO(size);
-
-#define CXL_DECODER_FLAG_ATTR(name, flag) \
-static ssize_t name##_show(struct device *dev, \
- struct device_attribute *attr, char *buf) \
-{ \
- struct cxl_decoder *cxld = to_cxl_decoder(dev); \
- \
- return sysfs_emit(buf, "%s\n", \
- (cxld->flags & (flag)) ? "1" : "0"); \
-} \
-static DEVICE_ATTR_RO(name)
-
-CXL_DECODER_FLAG_ATTR(cap_pmem, CXL_DECODER_F_PMEM);
-CXL_DECODER_FLAG_ATTR(cap_ram, CXL_DECODER_F_RAM);
-CXL_DECODER_FLAG_ATTR(cap_type2, CXL_DECODER_F_TYPE2);
-CXL_DECODER_FLAG_ATTR(cap_type3, CXL_DECODER_F_TYPE3);
-CXL_DECODER_FLAG_ATTR(locked, CXL_DECODER_F_LOCK);
-
-static ssize_t target_type_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct cxl_decoder *cxld = to_cxl_decoder(dev);
-
- switch (cxld->target_type) {
- case CXL_DECODER_ACCELERATOR:
- return sysfs_emit(buf, "accelerator\n");
- case CXL_DECODER_EXPANDER:
- return sysfs_emit(buf, "expander\n");
- }
- return -ENXIO;
-}
-static DEVICE_ATTR_RO(target_type);
-
-static ssize_t target_list_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct cxl_decoder *cxld = to_cxl_decoder(dev);
- ssize_t offset = 0;
- int i, rc = 0;
-
- device_lock(dev);
- for (i = 0; i < cxld->interleave_ways; i++) {
- struct cxl_dport *dport = cxld->target[i];
- struct cxl_dport *next = NULL;
-
- if (!dport)
- break;
-
- if (i + 1 < cxld->interleave_ways)
- next = cxld->target[i + 1];
- rc = sysfs_emit_at(buf, offset, "%d%s", dport->port_id,
- next ? "," : "");
- if (rc < 0)
- break;
- offset += rc;
- }
- device_unlock(dev);
-
- if (rc < 0)
- return rc;
-
- rc = sysfs_emit_at(buf, offset, "\n");
- if (rc < 0)
- return rc;
-
- return offset + rc;
-}
-static DEVICE_ATTR_RO(target_list);
-
-static struct attribute *cxl_decoder_base_attrs[] = {
- &dev_attr_start.attr,
- &dev_attr_size.attr,
- &dev_attr_locked.attr,
- &dev_attr_target_list.attr,
- NULL,
-};
-
-static struct attribute_group cxl_decoder_base_attribute_group = {
- .attrs = cxl_decoder_base_attrs,
-};
-
-static struct attribute *cxl_decoder_root_attrs[] = {
- &dev_attr_cap_pmem.attr,
- &dev_attr_cap_ram.attr,
- &dev_attr_cap_type2.attr,
- &dev_attr_cap_type3.attr,
- NULL,
-};
-
-static struct attribute_group cxl_decoder_root_attribute_group = {
- .attrs = cxl_decoder_root_attrs,
-};
-
-static const struct attribute_group *cxl_decoder_root_attribute_groups[] = {
- &cxl_decoder_root_attribute_group,
- &cxl_decoder_base_attribute_group,
- &cxl_base_attribute_group,
- NULL,
-};
-
-static struct attribute *cxl_decoder_switch_attrs[] = {
- &dev_attr_target_type.attr,
- NULL,
-};
-
-static struct attribute_group cxl_decoder_switch_attribute_group = {
- .attrs = cxl_decoder_switch_attrs,
-};
-
-static const struct attribute_group *cxl_decoder_switch_attribute_groups[] = {
- &cxl_decoder_switch_attribute_group,
- &cxl_decoder_base_attribute_group,
- &cxl_base_attribute_group,
- NULL,
-};
-
-static void cxl_decoder_release(struct device *dev)
-{
- struct cxl_decoder *cxld = to_cxl_decoder(dev);
- struct cxl_port *port = to_cxl_port(dev->parent);
-
- ida_free(&port->decoder_ida, cxld->id);
- kfree(cxld);
-}
-
-static const struct device_type cxl_decoder_switch_type = {
- .name = "cxl_decoder_switch",
- .release = cxl_decoder_release,
- .groups = cxl_decoder_switch_attribute_groups,
-};
-
-static const struct device_type cxl_decoder_root_type = {
- .name = "cxl_decoder_root",
- .release = cxl_decoder_release,
- .groups = cxl_decoder_root_attribute_groups,
-};
-
-bool is_root_decoder(struct device *dev)
-{
- return dev->type == &cxl_decoder_root_type;
-}
-EXPORT_SYMBOL_GPL(is_root_decoder);
-
-struct cxl_decoder *to_cxl_decoder(struct device *dev)
-{
- if (dev_WARN_ONCE(dev, dev->type->release != cxl_decoder_release,
- "not a cxl_decoder device\n"))
- return NULL;
- return container_of(dev, struct cxl_decoder, dev);
-}
-EXPORT_SYMBOL_GPL(to_cxl_decoder);
-
-static void cxl_dport_release(struct cxl_dport *dport)
-{
- list_del(&dport->list);
- put_device(dport->dport);
- kfree(dport);
-}
-
-static void cxl_port_release(struct device *dev)
-{
- struct cxl_port *port = to_cxl_port(dev);
- struct cxl_dport *dport, *_d;
-
- device_lock(dev);
- list_for_each_entry_safe(dport, _d, &port->dports, list)
- cxl_dport_release(dport);
- device_unlock(dev);
- ida_free(&cxl_port_ida, port->id);
- kfree(port);
-}
-
-static const struct attribute_group *cxl_port_attribute_groups[] = {
- &cxl_base_attribute_group,
- NULL,
-};
-
-static const struct device_type cxl_port_type = {
- .name = "cxl_port",
- .release = cxl_port_release,
- .groups = cxl_port_attribute_groups,
-};
-
-struct cxl_port *to_cxl_port(struct device *dev)
-{
- if (dev_WARN_ONCE(dev, dev->type != &cxl_port_type,
- "not a cxl_port device\n"))
- return NULL;
- return container_of(dev, struct cxl_port, dev);
-}
-
-static void unregister_port(void *_port)
-{
- struct cxl_port *port = _port;
- struct cxl_dport *dport;
-
- device_lock(&port->dev);
- list_for_each_entry(dport, &port->dports, list) {
- char link_name[CXL_TARGET_STRLEN];
-
- if (snprintf(link_name, CXL_TARGET_STRLEN, "dport%d",
- dport->port_id) >= CXL_TARGET_STRLEN)
- continue;
- sysfs_remove_link(&port->dev.kobj, link_name);
- }
- device_unlock(&port->dev);
- device_unregister(&port->dev);
-}
-
-static void cxl_unlink_uport(void *_port)
-{
- struct cxl_port *port = _port;
-
- sysfs_remove_link(&port->dev.kobj, "uport");
-}
-
-static int devm_cxl_link_uport(struct device *host, struct cxl_port *port)
-{
- int rc;
-
- rc = sysfs_create_link(&port->dev.kobj, &port->uport->kobj, "uport");
- if (rc)
- return rc;
- return devm_add_action_or_reset(host, cxl_unlink_uport, port);
-}
-
-static struct cxl_port *cxl_port_alloc(struct device *uport,
- resource_size_t component_reg_phys,
- struct cxl_port *parent_port)
-{
- struct cxl_port *port;
- struct device *dev;
- int rc;
-
- port = kzalloc(sizeof(*port), GFP_KERNEL);
- if (!port)
- return ERR_PTR(-ENOMEM);
-
- rc = ida_alloc(&cxl_port_ida, GFP_KERNEL);
- if (rc < 0)
- goto err;
- port->id = rc;
-
- /*
- * The top-level cxl_port "cxl_root" does not have a cxl_port as
- * its parent and it does not have any corresponding component
- * registers as its decode is described by a fixed platform
- * description.
- */
- dev = &port->dev;
- if (parent_port)
- dev->parent = &parent_port->dev;
- else
- dev->parent = uport;
-
- port->uport = uport;
- port->component_reg_phys = component_reg_phys;
- ida_init(&port->decoder_ida);
- INIT_LIST_HEAD(&port->dports);
-
- device_initialize(dev);
- device_set_pm_not_required(dev);
- dev->bus = &cxl_bus_type;
- dev->type = &cxl_port_type;
-
- return port;
-
-err:
- kfree(port);
- return ERR_PTR(rc);
-}
-
-/**
- * devm_cxl_add_port - register a cxl_port in CXL memory decode hierarchy
- * @host: host device for devm operations
- * @uport: "physical" device implementing this upstream port
- * @component_reg_phys: (optional) for configurable cxl_port instances
- * @parent_port: next hop up in the CXL memory decode hierarchy
- */
-struct cxl_port *devm_cxl_add_port(struct device *host, struct device *uport,
- resource_size_t component_reg_phys,
- struct cxl_port *parent_port)
-{
- struct cxl_port *port;
- struct device *dev;
- int rc;
-
- port = cxl_port_alloc(uport, component_reg_phys, parent_port);
- if (IS_ERR(port))
- return port;
-
- dev = &port->dev;
- if (parent_port)
- rc = dev_set_name(dev, "port%d", port->id);
- else
- rc = dev_set_name(dev, "root%d", port->id);
- if (rc)
- goto err;
-
- rc = device_add(dev);
- if (rc)
- goto err;
-
- rc = devm_add_action_or_reset(host, unregister_port, port);
- if (rc)
- return ERR_PTR(rc);
-
- rc = devm_cxl_link_uport(host, port);
- if (rc)
- return ERR_PTR(rc);
-
- return port;
-
-err:
- put_device(dev);
- return ERR_PTR(rc);
-}
-EXPORT_SYMBOL_GPL(devm_cxl_add_port);
-
-static struct cxl_dport *find_dport(struct cxl_port *port, int id)
-{
- struct cxl_dport *dport;
-
- device_lock_assert(&port->dev);
- list_for_each_entry (dport, &port->dports, list)
- if (dport->port_id == id)
- return dport;
- return NULL;
-}
-
-static int add_dport(struct cxl_port *port, struct cxl_dport *new)
-{
- struct cxl_dport *dup;
-
- device_lock(&port->dev);
- dup = find_dport(port, new->port_id);
- if (dup)
- dev_err(&port->dev,
- "unable to add dport%d-%s non-unique port id (%s)\n",
- new->port_id, dev_name(new->dport),
- dev_name(dup->dport));
- else
- list_add_tail(&new->list, &port->dports);
- device_unlock(&port->dev);
-
- return dup ? -EEXIST : 0;
-}
-
-/**
- * cxl_add_dport - append downstream port data to a cxl_port
- * @port: the cxl_port that references this dport
- * @dport_dev: firmware or PCI device representing the dport
- * @port_id: identifier for this dport in a decoder's target list
- * @component_reg_phys: optional location of CXL component registers
- *
- * Note that all allocations and links are undone by cxl_port deletion
- * and release.
- */
-int cxl_add_dport(struct cxl_port *port, struct device *dport_dev, int port_id,
- resource_size_t component_reg_phys)
-{
- char link_name[CXL_TARGET_STRLEN];
- struct cxl_dport *dport;
- int rc;
-
- if (snprintf(link_name, CXL_TARGET_STRLEN, "dport%d", port_id) >=
- CXL_TARGET_STRLEN)
- return -EINVAL;
-
- dport = kzalloc(sizeof(*dport), GFP_KERNEL);
- if (!dport)
- return -ENOMEM;
-
- INIT_LIST_HEAD(&dport->list);
- dport->dport = get_device(dport_dev);
- dport->port_id = port_id;
- dport->component_reg_phys = component_reg_phys;
- dport->port = port;
-
- rc = add_dport(port, dport);
- if (rc)
- goto err;
-
- rc = sysfs_create_link(&port->dev.kobj, &dport_dev->kobj, link_name);
- if (rc)
- goto err;
-
- return 0;
-err:
- cxl_dport_release(dport);
- return rc;
-}
-EXPORT_SYMBOL_GPL(cxl_add_dport);
-
-static struct cxl_decoder *
-cxl_decoder_alloc(struct cxl_port *port, int nr_targets, resource_size_t base,
- resource_size_t len, int interleave_ways,
- int interleave_granularity, enum cxl_decoder_type type,
- unsigned long flags)
-{
- struct cxl_decoder *cxld;
- struct device *dev;
- int rc = 0;
-
- if (interleave_ways < 1)
- return ERR_PTR(-EINVAL);
-
- device_lock(&port->dev);
- if (list_empty(&port->dports))
- rc = -EINVAL;
- device_unlock(&port->dev);
- if (rc)
- return ERR_PTR(rc);
-
- cxld = kzalloc(struct_size(cxld, target, nr_targets), GFP_KERNEL);
- if (!cxld)
- return ERR_PTR(-ENOMEM);
-
- rc = ida_alloc(&port->decoder_ida, GFP_KERNEL);
- if (rc < 0)
- goto err;
-
- *cxld = (struct cxl_decoder) {
- .id = rc,
- .range = {
- .start = base,
- .end = base + len - 1,
- },
- .flags = flags,
- .interleave_ways = interleave_ways,
- .interleave_granularity = interleave_granularity,
- .target_type = type,
- };
-
- /* handle implied target_list */
- if (interleave_ways == 1)
- cxld->target[0] =
- list_first_entry(&port->dports, struct cxl_dport, list);
- dev = &cxld->dev;
- device_initialize(dev);
- device_set_pm_not_required(dev);
- dev->parent = &port->dev;
- dev->bus = &cxl_bus_type;
-
- /* root ports do not have a cxl_port_type parent */
- if (port->dev.parent->type == &cxl_port_type)
- dev->type = &cxl_decoder_switch_type;
- else
- dev->type = &cxl_decoder_root_type;
-
- return cxld;
-err:
- kfree(cxld);
- return ERR_PTR(rc);
-}
-
-static void unregister_dev(void *dev)
-{
- device_unregister(dev);
-}
-
-struct cxl_decoder *
-devm_cxl_add_decoder(struct device *host, struct cxl_port *port, int nr_targets,
- resource_size_t base, resource_size_t len,
- int interleave_ways, int interleave_granularity,
- enum cxl_decoder_type type, unsigned long flags)
-{
- struct cxl_decoder *cxld;
- struct device *dev;
- int rc;
-
- cxld = cxl_decoder_alloc(port, nr_targets, base, len, interleave_ways,
- interleave_granularity, type, flags);
- if (IS_ERR(cxld))
- return cxld;
-
- dev = &cxld->dev;
- rc = dev_set_name(dev, "decoder%d.%d", port->id, cxld->id);
- if (rc)
- goto err;
-
- rc = device_add(dev);
- if (rc)
- goto err;
-
- rc = devm_add_action_or_reset(host, unregister_dev, dev);
- if (rc)
- return ERR_PTR(rc);
- return cxld;
-
-err:
- put_device(dev);
- return ERR_PTR(rc);
-}
-EXPORT_SYMBOL_GPL(devm_cxl_add_decoder);
-
-/**
- * cxl_probe_component_regs() - Detect CXL Component register blocks
- * @dev: Host device of the @base mapping
- * @base: Mapping containing the HDM Decoder Capability Header
- * @map: Map object describing the register block information found
- *
- * See CXL 2.0 8.2.4 Component Register Layout and Definition
- * See CXL 2.0 8.2.5.5 CXL Device Register Interface
- *
- * Probe for component register information and return it in map object.
- */
-void cxl_probe_component_regs(struct device *dev, void __iomem *base,
- struct cxl_component_reg_map *map)
-{
- int cap, cap_count;
- u64 cap_array;
-
- *map = (struct cxl_component_reg_map) { 0 };
-
- /*
- * CXL.cache and CXL.mem registers are at offset 0x1000 as defined in
- * CXL 2.0 8.2.4 Table 141.
- */
- base += CXL_CM_OFFSET;
-
- cap_array = readq(base + CXL_CM_CAP_HDR_OFFSET);
-
- if (FIELD_GET(CXL_CM_CAP_HDR_ID_MASK, cap_array) != CM_CAP_HDR_CAP_ID) {
- dev_err(dev,
- "Couldn't locate the CXL.cache and CXL.mem capability array header./n");
- return;
- }
-
- /* It's assumed that future versions will be backward compatible */
- cap_count = FIELD_GET(CXL_CM_CAP_HDR_ARRAY_SIZE_MASK, cap_array);
-
- for (cap = 1; cap <= cap_count; cap++) {
- void __iomem *register_block;
- u32 hdr;
- int decoder_cnt;
- u16 cap_id, offset;
- u32 length;
-
- hdr = readl(base + cap * 0x4);
-
- cap_id = FIELD_GET(CXL_CM_CAP_HDR_ID_MASK, hdr);
- offset = FIELD_GET(CXL_CM_CAP_PTR_MASK, hdr);
- register_block = base + offset;
-
- switch (cap_id) {
- case CXL_CM_CAP_CAP_ID_HDM:
- dev_dbg(dev, "found HDM decoder capability (0x%x)\n",
- offset);
-
- hdr = readl(register_block);
-
- decoder_cnt = cxl_hdm_decoder_count(hdr);
- length = 0x20 * decoder_cnt + 0x10;
-
- map->hdm_decoder.valid = true;
- map->hdm_decoder.offset = CXL_CM_OFFSET + offset;
- map->hdm_decoder.size = length;
- break;
- default:
- dev_dbg(dev, "Unknown CM cap ID: %d (0x%x)\n", cap_id,
- offset);
- break;
- }
- }
-}
-EXPORT_SYMBOL_GPL(cxl_probe_component_regs);
-
-static void cxl_nvdimm_bridge_release(struct device *dev)
-{
- struct cxl_nvdimm_bridge *cxl_nvb = to_cxl_nvdimm_bridge(dev);
-
- kfree(cxl_nvb);
-}
-
-static const struct attribute_group *cxl_nvdimm_bridge_attribute_groups[] = {
- &cxl_base_attribute_group,
- NULL,
-};
-
-static const struct device_type cxl_nvdimm_bridge_type = {
- .name = "cxl_nvdimm_bridge",
- .release = cxl_nvdimm_bridge_release,
- .groups = cxl_nvdimm_bridge_attribute_groups,
-};
-
-struct cxl_nvdimm_bridge *to_cxl_nvdimm_bridge(struct device *dev)
-{
- if (dev_WARN_ONCE(dev, dev->type != &cxl_nvdimm_bridge_type,
- "not a cxl_nvdimm_bridge device\n"))
- return NULL;
- return container_of(dev, struct cxl_nvdimm_bridge, dev);
-}
-EXPORT_SYMBOL_GPL(to_cxl_nvdimm_bridge);
-
-static struct cxl_nvdimm_bridge *
-cxl_nvdimm_bridge_alloc(struct cxl_port *port)
-{
- struct cxl_nvdimm_bridge *cxl_nvb;
- struct device *dev;
-
- cxl_nvb = kzalloc(sizeof(*cxl_nvb), GFP_KERNEL);
- if (!cxl_nvb)
- return ERR_PTR(-ENOMEM);
-
- dev = &cxl_nvb->dev;
- cxl_nvb->port = port;
- cxl_nvb->state = CXL_NVB_NEW;
- device_initialize(dev);
- device_set_pm_not_required(dev);
- dev->parent = &port->dev;
- dev->bus = &cxl_bus_type;
- dev->type = &cxl_nvdimm_bridge_type;
-
- return cxl_nvb;
-}
-
-static void unregister_nvb(void *_cxl_nvb)
-{
- struct cxl_nvdimm_bridge *cxl_nvb = _cxl_nvb;
- bool flush;
-
- /*
- * If the bridge was ever activated then there might be in-flight state
- * work to flush. Once the state has been changed to 'dead' then no new
- * work can be queued by user-triggered bind.
- */
- device_lock(&cxl_nvb->dev);
- flush = cxl_nvb->state != CXL_NVB_NEW;
- cxl_nvb->state = CXL_NVB_DEAD;
- device_unlock(&cxl_nvb->dev);
-
- /*
- * Even though the device core will trigger device_release_driver()
- * before the unregister, it does not know about the fact that
- * cxl_nvdimm_bridge_driver defers ->remove() work. So, do the driver
-	 * release now and flush it before tearing down the nvdimm device
- * hierarchy.
- */
- device_release_driver(&cxl_nvb->dev);
- if (flush)
- flush_work(&cxl_nvb->state_work);
- device_unregister(&cxl_nvb->dev);
-}
-
-struct cxl_nvdimm_bridge *devm_cxl_add_nvdimm_bridge(struct device *host,
- struct cxl_port *port)
-{
- struct cxl_nvdimm_bridge *cxl_nvb;
- struct device *dev;
- int rc;
-
- if (!IS_ENABLED(CONFIG_CXL_PMEM))
- return ERR_PTR(-ENXIO);
-
- cxl_nvb = cxl_nvdimm_bridge_alloc(port);
- if (IS_ERR(cxl_nvb))
- return cxl_nvb;
-
- dev = &cxl_nvb->dev;
- rc = dev_set_name(dev, "nvdimm-bridge");
- if (rc)
- goto err;
-
- rc = device_add(dev);
- if (rc)
- goto err;
-
- rc = devm_add_action_or_reset(host, unregister_nvb, cxl_nvb);
- if (rc)
- return ERR_PTR(rc);
-
- return cxl_nvb;
-
-err:
- put_device(dev);
- return ERR_PTR(rc);
-}
-EXPORT_SYMBOL_GPL(devm_cxl_add_nvdimm_bridge);
-
-static void cxl_nvdimm_release(struct device *dev)
-{
- struct cxl_nvdimm *cxl_nvd = to_cxl_nvdimm(dev);
-
- kfree(cxl_nvd);
-}
-
-static const struct attribute_group *cxl_nvdimm_attribute_groups[] = {
- &cxl_base_attribute_group,
- NULL,
-};
-
-static const struct device_type cxl_nvdimm_type = {
- .name = "cxl_nvdimm",
- .release = cxl_nvdimm_release,
- .groups = cxl_nvdimm_attribute_groups,
-};
-
-bool is_cxl_nvdimm(struct device *dev)
-{
- return dev->type == &cxl_nvdimm_type;
-}
-EXPORT_SYMBOL_GPL(is_cxl_nvdimm);
-
-struct cxl_nvdimm *to_cxl_nvdimm(struct device *dev)
-{
- if (dev_WARN_ONCE(dev, !is_cxl_nvdimm(dev),
- "not a cxl_nvdimm device\n"))
- return NULL;
- return container_of(dev, struct cxl_nvdimm, dev);
-}
-EXPORT_SYMBOL_GPL(to_cxl_nvdimm);
-
-static struct cxl_nvdimm *cxl_nvdimm_alloc(struct cxl_memdev *cxlmd)
-{
- struct cxl_nvdimm *cxl_nvd;
- struct device *dev;
-
- cxl_nvd = kzalloc(sizeof(*cxl_nvd), GFP_KERNEL);
- if (!cxl_nvd)
- return ERR_PTR(-ENOMEM);
-
- dev = &cxl_nvd->dev;
- cxl_nvd->cxlmd = cxlmd;
- device_initialize(dev);
- device_set_pm_not_required(dev);
- dev->parent = &cxlmd->dev;
- dev->bus = &cxl_bus_type;
- dev->type = &cxl_nvdimm_type;
-
- return cxl_nvd;
-}
-
-int devm_cxl_add_nvdimm(struct device *host, struct cxl_memdev *cxlmd)
-{
- struct cxl_nvdimm *cxl_nvd;
- struct device *dev;
- int rc;
-
- cxl_nvd = cxl_nvdimm_alloc(cxlmd);
- if (IS_ERR(cxl_nvd))
- return PTR_ERR(cxl_nvd);
-
- dev = &cxl_nvd->dev;
- rc = dev_set_name(dev, "pmem%d", cxlmd->id);
- if (rc)
- goto err;
-
- rc = device_add(dev);
- if (rc)
- goto err;
-
- dev_dbg(host, "%s: register %s\n", dev_name(dev->parent),
- dev_name(dev));
-
- return devm_add_action_or_reset(host, unregister_dev, dev);
-
-err:
- put_device(dev);
- return rc;
-}
-EXPORT_SYMBOL_GPL(devm_cxl_add_nvdimm);
-
-/**
- * cxl_probe_device_regs() - Detect CXL Device register blocks
- * @dev: Host device of the @base mapping
- * @base: Mapping of CXL 2.0 8.2.8 CXL Device Register Interface
- * @map: Map object describing the register block information found
- *
- * Probe for device register information and return it in map object.
- */
-void cxl_probe_device_regs(struct device *dev, void __iomem *base,
- struct cxl_device_reg_map *map)
-{
- int cap, cap_count;
- u64 cap_array;
-
- *map = (struct cxl_device_reg_map){ 0 };
-
- cap_array = readq(base + CXLDEV_CAP_ARRAY_OFFSET);
- if (FIELD_GET(CXLDEV_CAP_ARRAY_ID_MASK, cap_array) !=
- CXLDEV_CAP_ARRAY_CAP_ID)
- return;
-
- cap_count = FIELD_GET(CXLDEV_CAP_ARRAY_COUNT_MASK, cap_array);
-
- for (cap = 1; cap <= cap_count; cap++) {
- u32 offset, length;
- u16 cap_id;
-
- cap_id = FIELD_GET(CXLDEV_CAP_HDR_CAP_ID_MASK,
- readl(base + cap * 0x10));
- offset = readl(base + cap * 0x10 + 0x4);
- length = readl(base + cap * 0x10 + 0x8);
-
- switch (cap_id) {
- case CXLDEV_CAP_CAP_ID_DEVICE_STATUS:
- dev_dbg(dev, "found Status capability (0x%x)\n", offset);
-
- map->status.valid = true;
- map->status.offset = offset;
- map->status.size = length;
- break;
- case CXLDEV_CAP_CAP_ID_PRIMARY_MAILBOX:
- dev_dbg(dev, "found Mailbox capability (0x%x)\n", offset);
- map->mbox.valid = true;
- map->mbox.offset = offset;
- map->mbox.size = length;
- break;
- case CXLDEV_CAP_CAP_ID_SECONDARY_MAILBOX:
- dev_dbg(dev, "found Secondary Mailbox capability (0x%x)\n", offset);
- break;
- case CXLDEV_CAP_CAP_ID_MEMDEV:
- dev_dbg(dev, "found Memory Device capability (0x%x)\n", offset);
- map->memdev.valid = true;
- map->memdev.offset = offset;
- map->memdev.size = length;
- break;
- default:
- if (cap_id >= 0x8000)
- dev_dbg(dev, "Vendor cap ID: %#x offset: %#x\n", cap_id, offset);
- else
- dev_dbg(dev, "Unknown cap ID: %#x offset: %#x\n", cap_id, offset);
- break;
- }
- }
-}
-EXPORT_SYMBOL_GPL(cxl_probe_device_regs);
-
-static void __iomem *devm_cxl_iomap_block(struct device *dev,
- resource_size_t addr,
- resource_size_t length)
-{
- void __iomem *ret_val;
- struct resource *res;
-
- res = devm_request_mem_region(dev, addr, length, dev_name(dev));
- if (!res) {
- resource_size_t end = addr + length - 1;
-
- dev_err(dev, "Failed to request region %pa-%pa\n", &addr, &end);
- return NULL;
- }
-
- ret_val = devm_ioremap(dev, addr, length);
- if (!ret_val)
- dev_err(dev, "Failed to map region %pr\n", res);
-
- return ret_val;
-}
-
-int cxl_map_component_regs(struct pci_dev *pdev,
- struct cxl_component_regs *regs,
- struct cxl_register_map *map)
-{
- struct device *dev = &pdev->dev;
- resource_size_t phys_addr;
- resource_size_t length;
-
- phys_addr = pci_resource_start(pdev, map->barno);
- phys_addr += map->block_offset;
-
- phys_addr += map->component_map.hdm_decoder.offset;
- length = map->component_map.hdm_decoder.size;
- regs->hdm_decoder = devm_cxl_iomap_block(dev, phys_addr, length);
- if (!regs->hdm_decoder)
- return -ENOMEM;
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(cxl_map_component_regs);
-
-int cxl_map_device_regs(struct pci_dev *pdev,
- struct cxl_device_regs *regs,
- struct cxl_register_map *map)
-{
- struct device *dev = &pdev->dev;
- resource_size_t phys_addr;
-
- phys_addr = pci_resource_start(pdev, map->barno);
- phys_addr += map->block_offset;
-
- if (map->device_map.status.valid) {
- resource_size_t addr;
- resource_size_t length;
-
- addr = phys_addr + map->device_map.status.offset;
- length = map->device_map.status.size;
- regs->status = devm_cxl_iomap_block(dev, addr, length);
- if (!regs->status)
- return -ENOMEM;
- }
-
- if (map->device_map.mbox.valid) {
- resource_size_t addr;
- resource_size_t length;
-
- addr = phys_addr + map->device_map.mbox.offset;
- length = map->device_map.mbox.size;
- regs->mbox = devm_cxl_iomap_block(dev, addr, length);
- if (!regs->mbox)
- return -ENOMEM;
- }
-
- if (map->device_map.memdev.valid) {
- resource_size_t addr;
- resource_size_t length;
-
- addr = phys_addr + map->device_map.memdev.offset;
- length = map->device_map.memdev.size;
- regs->memdev = devm_cxl_iomap_block(dev, addr, length);
- if (!regs->memdev)
- return -ENOMEM;
- }
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(cxl_map_device_regs);
-
-/**
- * __cxl_driver_register - register a driver for the cxl bus
- * @cxl_drv: cxl driver structure to attach
- * @owner: owning module/driver
- * @modname: KBUILD_MODNAME for parent driver
- */
-int __cxl_driver_register(struct cxl_driver *cxl_drv, struct module *owner,
- const char *modname)
-{
- if (!cxl_drv->probe) {
- pr_debug("%s ->probe() must be specified\n", modname);
- return -EINVAL;
- }
-
- if (!cxl_drv->name) {
- pr_debug("%s ->name must be specified\n", modname);
- return -EINVAL;
- }
-
- if (!cxl_drv->id) {
- pr_debug("%s ->id must be specified\n", modname);
- return -EINVAL;
- }
-
- cxl_drv->drv.bus = &cxl_bus_type;
- cxl_drv->drv.owner = owner;
- cxl_drv->drv.mod_name = modname;
- cxl_drv->drv.name = cxl_drv->name;
-
- return driver_register(&cxl_drv->drv);
-}
-EXPORT_SYMBOL_GPL(__cxl_driver_register);
-
-void cxl_driver_unregister(struct cxl_driver *cxl_drv)
-{
- driver_unregister(&cxl_drv->drv);
-}
-EXPORT_SYMBOL_GPL(cxl_driver_unregister);
-
-static int cxl_device_id(struct device *dev)
-{
- if (dev->type == &cxl_nvdimm_bridge_type)
- return CXL_DEVICE_NVDIMM_BRIDGE;
- if (dev->type == &cxl_nvdimm_type)
- return CXL_DEVICE_NVDIMM;
- return 0;
-}
-
-static int cxl_bus_uevent(struct device *dev, struct kobj_uevent_env *env)
-{
- return add_uevent_var(env, "MODALIAS=" CXL_MODALIAS_FMT,
- cxl_device_id(dev));
-}
-
-static int cxl_bus_match(struct device *dev, struct device_driver *drv)
-{
- return cxl_device_id(dev) == to_cxl_drv(drv)->id;
-}
-
-static int cxl_bus_probe(struct device *dev)
-{
- return to_cxl_drv(dev->driver)->probe(dev);
-}
-
-static int cxl_bus_remove(struct device *dev)
-{
- struct cxl_driver *cxl_drv = to_cxl_drv(dev->driver);
-
- if (cxl_drv->remove)
- cxl_drv->remove(dev);
- return 0;
-}
-
-struct bus_type cxl_bus_type = {
- .name = "cxl",
- .uevent = cxl_bus_uevent,
- .match = cxl_bus_match,
- .probe = cxl_bus_probe,
- .remove = cxl_bus_remove,
-};
-EXPORT_SYMBOL_GPL(cxl_bus_type);
-
-static __init int cxl_core_init(void)
-{
- return bus_register(&cxl_bus_type);
-}
-
-static void cxl_core_exit(void)
-{
- bus_unregister(&cxl_bus_type);
-}
-
-module_init(cxl_core_init);
-module_exit(cxl_core_exit);
-MODULE_LICENSE("GPL v2");
diff --git a/drivers/cxl/core/Makefile b/drivers/cxl/core/Makefile
new file mode 100644
index 000000000000..5ad8fef210b5
--- /dev/null
+++ b/drivers/cxl/core/Makefile
@@ -0,0 +1,22 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_CXL_BUS) += cxl_core.o
+obj-$(CONFIG_CXL_SUSPEND) += suspend.o
+
+ccflags-y += -I$(srctree)/drivers/cxl
+CFLAGS_trace.o = -DTRACE_INCLUDE_PATH=. -I$(src)
+
+cxl_core-y := port.o
+cxl_core-y += pmem.o
+cxl_core-y += regs.o
+cxl_core-y += memdev.o
+cxl_core-y += mbox.o
+cxl_core-y += pci.o
+cxl_core-y += hdm.o
+cxl_core-y += pmu.o
+cxl_core-y += cdat.o
+cxl_core-y += ras.o
+cxl_core-$(CONFIG_TRACING) += trace.o
+cxl_core-$(CONFIG_CXL_REGION) += region.o
+cxl_core-$(CONFIG_CXL_MCE) += mce.o
+cxl_core-$(CONFIG_CXL_FEATURES) += features.o
+cxl_core-$(CONFIG_CXL_EDAC_MEM_FEATURES) += edac.o
diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c
new file mode 100644
index 000000000000..7120b5f2e31f
--- /dev/null
+++ b/drivers/cxl/core/cdat.c
@@ -0,0 +1,1074 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2023 Intel Corporation. All rights reserved. */
+#include <linux/acpi.h>
+#include <linux/xarray.h>
+#include <linux/fw_table.h>
+#include <linux/node.h>
+#include <linux/overflow.h>
+#include "cxlpci.h"
+#include "cxlmem.h"
+#include "core.h"
+#include "cxl.h"
+
+struct dsmas_entry {
+ struct range dpa_range;
+ u8 handle;
+ struct access_coordinate coord[ACCESS_COORDINATE_MAX];
+ struct access_coordinate cdat_coord[ACCESS_COORDINATE_MAX];
+ int entries;
+ int qos_class;
+};
+
+static u32 cdat_normalize(u16 entry, u64 base, u8 type)
+{
+ u32 value;
+
+ /*
+ * Check for invalid and overflow values
+ */
+ if (entry == 0xffff || !entry)
+ return 0;
+ if (base > (UINT_MAX / (entry)))
+ return 0;
+
+ /*
+ * CDAT fields follow the format of HMAT fields. See table 5 Device
+ * Scoped Latency and Bandwidth Information Structure in Coherent Device
+ * Attribute Table (CDAT) Specification v1.01.
+ */
+ value = entry * base;
+ switch (type) {
+ case ACPI_HMAT_ACCESS_LATENCY:
+ case ACPI_HMAT_READ_LATENCY:
+ case ACPI_HMAT_WRITE_LATENCY:
+ value = DIV_ROUND_UP(value, 1000);
+ break;
+ default:
+ break;
+ }
+ return value;
+}
+
+static int cdat_dsmas_handler(union acpi_subtable_headers *header, void *arg,
+ const unsigned long end)
+{
+ struct acpi_cdat_header *hdr = &header->cdat;
+ struct acpi_cdat_dsmas *dsmas;
+ int size = sizeof(*hdr) + sizeof(*dsmas);
+ struct xarray *dsmas_xa = arg;
+ struct dsmas_entry *dent;
+ u16 len;
+ int rc;
+
+ len = le16_to_cpu((__force __le16)hdr->length);
+ if (len != size || (unsigned long)hdr + len > end) {
+ pr_warn("Malformed DSMAS table length: (%u:%u)\n", size, len);
+ return -EINVAL;
+ }
+
+ /* Skip common header */
+ dsmas = (struct acpi_cdat_dsmas *)(hdr + 1);
+
+ dent = kzalloc(sizeof(*dent), GFP_KERNEL);
+ if (!dent)
+ return -ENOMEM;
+
+ dent->handle = dsmas->dsmad_handle;
+ dent->dpa_range.start = le64_to_cpu((__force __le64)dsmas->dpa_base_address);
+ dent->dpa_range.end = le64_to_cpu((__force __le64)dsmas->dpa_base_address) +
+ le64_to_cpu((__force __le64)dsmas->dpa_length) - 1;
+
+ rc = xa_insert(dsmas_xa, dent->handle, dent, GFP_KERNEL);
+ if (rc) {
+ kfree(dent);
+ return rc;
+ }
+
+ return 0;
+}
+
+static void __cxl_access_coordinate_set(struct access_coordinate *coord,
+ int access, unsigned int val)
+{
+ switch (access) {
+ case ACPI_HMAT_ACCESS_LATENCY:
+ coord->read_latency = val;
+ coord->write_latency = val;
+ break;
+ case ACPI_HMAT_READ_LATENCY:
+ coord->read_latency = val;
+ break;
+ case ACPI_HMAT_WRITE_LATENCY:
+ coord->write_latency = val;
+ break;
+ case ACPI_HMAT_ACCESS_BANDWIDTH:
+ coord->read_bandwidth = val;
+ coord->write_bandwidth = val;
+ break;
+ case ACPI_HMAT_READ_BANDWIDTH:
+ coord->read_bandwidth = val;
+ break;
+ case ACPI_HMAT_WRITE_BANDWIDTH:
+ coord->write_bandwidth = val;
+ break;
+ }
+}
+
+static void cxl_access_coordinate_set(struct access_coordinate *coord,
+ int access, unsigned int val)
+{
+ for (int i = 0; i < ACCESS_COORDINATE_MAX; i++)
+ __cxl_access_coordinate_set(&coord[i], access, val);
+}
+
+static int cdat_dslbis_handler(union acpi_subtable_headers *header, void *arg,
+ const unsigned long end)
+{
+ struct acpi_cdat_header *hdr = &header->cdat;
+ struct acpi_cdat_dslbis *dslbis;
+ int size = sizeof(*hdr) + sizeof(*dslbis);
+ struct xarray *dsmas_xa = arg;
+ struct dsmas_entry *dent;
+ __le64 le_base;
+ __le16 le_val;
+ u64 val;
+ u16 len;
+
+ len = le16_to_cpu((__force __le16)hdr->length);
+ if (len != size || (unsigned long)hdr + len > end) {
+ pr_warn("Malformed DSLBIS table length: (%u:%u)\n", size, len);
+ return -EINVAL;
+ }
+
+ /* Skip common header */
+ dslbis = (struct acpi_cdat_dslbis *)(hdr + 1);
+
+ /* Skip unrecognized data type */
+ if (dslbis->data_type > ACPI_HMAT_WRITE_BANDWIDTH)
+ return 0;
+
+ /* Not a memory type, skip */
+ if ((dslbis->flags & ACPI_HMAT_MEMORY_HIERARCHY) != ACPI_HMAT_MEMORY)
+ return 0;
+
+ dent = xa_load(dsmas_xa, dslbis->handle);
+ if (!dent) {
+ pr_warn("No matching DSMAS entry for DSLBIS entry.\n");
+ return 0;
+ }
+
+ le_base = (__force __le64)dslbis->entry_base_unit;
+ le_val = (__force __le16)dslbis->entry[0];
+ val = cdat_normalize(le16_to_cpu(le_val), le64_to_cpu(le_base),
+ dslbis->data_type);
+
+ cxl_access_coordinate_set(dent->cdat_coord, dslbis->data_type, val);
+
+ return 0;
+}
+
+static int cdat_table_parse_output(int rc)
+{
+ if (rc < 0)
+ return rc;
+ if (rc == 0)
+ return -ENOENT;
+
+ return 0;
+}
+
+static int cxl_cdat_endpoint_process(struct cxl_port *port,
+ struct xarray *dsmas_xa)
+{
+ int rc;
+
+ rc = cdat_table_parse(ACPI_CDAT_TYPE_DSMAS, cdat_dsmas_handler,
+ dsmas_xa, port->cdat.table, port->cdat.length);
+ rc = cdat_table_parse_output(rc);
+ if (rc)
+ return rc;
+
+ rc = cdat_table_parse(ACPI_CDAT_TYPE_DSLBIS, cdat_dslbis_handler,
+ dsmas_xa, port->cdat.table, port->cdat.length);
+ return cdat_table_parse_output(rc);
+}
+
+static int cxl_port_perf_data_calculate(struct cxl_port *port,
+ struct xarray *dsmas_xa)
+{
+ struct access_coordinate ep_c[ACCESS_COORDINATE_MAX];
+ struct dsmas_entry *dent;
+ int valid_entries = 0;
+ unsigned long index;
+ int rc;
+
+ rc = cxl_endpoint_get_perf_coordinates(port, ep_c);
+ if (rc) {
+ dev_dbg(&port->dev, "Failed to retrieve ep perf coordinates.\n");
+ return rc;
+ }
+
+ struct cxl_root *cxl_root __free(put_cxl_root) = find_cxl_root(port);
+
+ if (!cxl_root)
+ return -ENODEV;
+
+ if (!cxl_root->ops || !cxl_root->ops->qos_class)
+ return -EOPNOTSUPP;
+
+ xa_for_each(dsmas_xa, index, dent) {
+ int qos_class;
+
+ cxl_coordinates_combine(dent->coord, dent->cdat_coord, ep_c);
+ dent->entries = 1;
+ rc = cxl_root->ops->qos_class(cxl_root,
+ &dent->coord[ACCESS_COORDINATE_CPU],
+ 1, &qos_class);
+ if (rc != 1)
+ continue;
+
+ valid_entries++;
+ dent->qos_class = qos_class;
+ }
+
+ if (!valid_entries)
+ return -ENOENT;
+
+ return 0;
+}
+
+static void update_perf_entry(struct device *dev, struct dsmas_entry *dent,
+ struct cxl_dpa_perf *dpa_perf)
+{
+ for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) {
+ dpa_perf->coord[i] = dent->coord[i];
+ dpa_perf->cdat_coord[i] = dent->cdat_coord[i];
+ }
+ dpa_perf->dpa_range = dent->dpa_range;
+ dpa_perf->qos_class = dent->qos_class;
+ dev_dbg(dev,
+ "DSMAS: dpa: %pra qos: %d read_bw: %d write_bw %d read_lat: %d write_lat: %d\n",
+ &dent->dpa_range, dpa_perf->qos_class,
+ dent->coord[ACCESS_COORDINATE_CPU].read_bandwidth,
+ dent->coord[ACCESS_COORDINATE_CPU].write_bandwidth,
+ dent->coord[ACCESS_COORDINATE_CPU].read_latency,
+ dent->coord[ACCESS_COORDINATE_CPU].write_latency);
+}
+
+static void cxl_memdev_set_qos_class(struct cxl_dev_state *cxlds,
+ struct xarray *dsmas_xa)
+{
+ struct device *dev = cxlds->dev;
+ struct dsmas_entry *dent;
+ unsigned long index;
+
+ xa_for_each(dsmas_xa, index, dent) {
+ bool found = false;
+
+ for (int i = 0; i < cxlds->nr_partitions; i++) {
+ struct resource *res = &cxlds->part[i].res;
+ struct range range = {
+ .start = res->start,
+ .end = res->end,
+ };
+
+ if (range_contains(&range, &dent->dpa_range)) {
+ update_perf_entry(dev, dent,
+ &cxlds->part[i].perf);
+ found = true;
+ break;
+ }
+ }
+
+ if (!found)
+ dev_dbg(dev, "no partition for dsmas dpa: %pra\n",
+ &dent->dpa_range);
+ }
+}
+
+static int match_cxlrd_qos_class(struct device *dev, void *data)
+{
+ int dev_qos_class = *(int *)data;
+ struct cxl_root_decoder *cxlrd;
+
+ if (!is_root_decoder(dev))
+ return 0;
+
+ cxlrd = to_cxl_root_decoder(dev);
+ if (cxlrd->qos_class == CXL_QOS_CLASS_INVALID)
+ return 0;
+
+ if (cxlrd->qos_class == dev_qos_class)
+ return 1;
+
+ return 0;
+}
+
+static void reset_dpa_perf(struct cxl_dpa_perf *dpa_perf)
+{
+ *dpa_perf = (struct cxl_dpa_perf) {
+ .qos_class = CXL_QOS_CLASS_INVALID,
+ };
+}
+
+static bool cxl_qos_match(struct cxl_port *root_port,
+ struct cxl_dpa_perf *dpa_perf)
+{
+ if (dpa_perf->qos_class == CXL_QOS_CLASS_INVALID)
+ return false;
+
+ if (!device_for_each_child(&root_port->dev, &dpa_perf->qos_class,
+ match_cxlrd_qos_class))
+ return false;
+
+ return true;
+}
+
+static int match_cxlrd_hb(struct device *dev, void *data)
+{
+ struct device *host_bridge = data;
+ struct cxl_switch_decoder *cxlsd;
+ struct cxl_root_decoder *cxlrd;
+
+ if (!is_root_decoder(dev))
+ return 0;
+
+ cxlrd = to_cxl_root_decoder(dev);
+ cxlsd = &cxlrd->cxlsd;
+
+ guard(rwsem_read)(&cxl_rwsem.region);
+ for (int i = 0; i < cxlsd->nr_targets; i++) {
+ if (cxlsd->target[i] && host_bridge == cxlsd->target[i]->dport_dev)
+ return 1;
+ }
+
+ return 0;
+}
+
+static void cxl_qos_class_verify(struct cxl_memdev *cxlmd)
+{
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+ struct cxl_port *root_port;
+
+ struct cxl_root *cxl_root __free(put_cxl_root) =
+ find_cxl_root(cxlmd->endpoint);
+
+ /*
+ * No need to reset_dpa_perf() here as find_cxl_root() is guaranteed to
+ * succeed when called in the cxl_endpoint_port_probe() path.
+ */
+ if (!cxl_root)
+ return;
+
+ root_port = &cxl_root->port;
+
+ /*
+ * Save userspace from needing to check if a qos class has any matches
+ * by hiding qos class info if the memdev is not mapped by a root
+ * decoder, or the partition class does not match any root decoder
+ * class.
+ */
+ if (!device_for_each_child(&root_port->dev,
+ cxlmd->endpoint->host_bridge,
+ match_cxlrd_hb)) {
+ for (int i = 0; i < cxlds->nr_partitions; i++) {
+ struct cxl_dpa_perf *perf = &cxlds->part[i].perf;
+
+ reset_dpa_perf(perf);
+ }
+ return;
+ }
+
+ for (int i = 0; i < cxlds->nr_partitions; i++) {
+ struct cxl_dpa_perf *perf = &cxlds->part[i].perf;
+
+ if (!cxl_qos_match(root_port, perf))
+ reset_dpa_perf(perf);
+ }
+}
+
+static void discard_dsmas(struct xarray *xa)
+{
+ unsigned long index;
+ void *ent;
+
+ xa_for_each(xa, index, ent) {
+ xa_erase(xa, index);
+ kfree(ent);
+ }
+ xa_destroy(xa);
+}
+DEFINE_FREE(dsmas, struct xarray *, if (_T) discard_dsmas(_T))
+
+void cxl_endpoint_parse_cdat(struct cxl_port *port)
+{
+ struct cxl_memdev *cxlmd = to_cxl_memdev(port->uport_dev);
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+ struct xarray __dsmas_xa;
+ struct xarray *dsmas_xa __free(dsmas) = &__dsmas_xa;
+ int rc;
+
+ xa_init(&__dsmas_xa);
+ if (!port->cdat.table)
+ return;
+
+ rc = cxl_cdat_endpoint_process(port, dsmas_xa);
+ if (rc < 0) {
+ dev_dbg(&port->dev, "Failed to parse CDAT: %d\n", rc);
+ return;
+ }
+
+ rc = cxl_port_perf_data_calculate(port, dsmas_xa);
+ if (rc) {
+ dev_dbg(&port->dev, "Failed to do perf coord calculations.\n");
+ return;
+ }
+
+ cxl_memdev_set_qos_class(cxlds, dsmas_xa);
+ cxl_qos_class_verify(cxlmd);
+ cxl_memdev_update_perf(cxlmd);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_endpoint_parse_cdat, "CXL");
+
+static int cdat_sslbis_handler(union acpi_subtable_headers *header, void *arg,
+ const unsigned long end)
+{
+ struct acpi_cdat_sslbis_table {
+ struct acpi_cdat_header header;
+ struct acpi_cdat_sslbis sslbis_header;
+ struct acpi_cdat_sslbe entries[];
+ } *tbl = (struct acpi_cdat_sslbis_table *)header;
+ int size = sizeof(header->cdat) + sizeof(tbl->sslbis_header);
+ struct acpi_cdat_sslbis *sslbis;
+ struct cxl_dport *dport = arg;
+ struct device *dev = &dport->port->dev;
+ int remain, entries, i;
+ u16 len;
+
+ len = le16_to_cpu((__force __le16)header->cdat.length);
+ remain = len - size;
+ if (!remain || remain % sizeof(tbl->entries[0]) ||
+ (unsigned long)header + len > end) {
+ dev_warn(dev, "Malformed SSLBIS table length: (%u)\n", len);
+ return -EINVAL;
+ }
+
+ sslbis = &tbl->sslbis_header;
+ /* Unrecognized data type, we can skip */
+ if (sslbis->data_type > ACPI_HMAT_WRITE_BANDWIDTH)
+ return 0;
+
+ entries = remain / sizeof(tbl->entries[0]);
+ if (struct_size(tbl, entries, entries) != len)
+ return -EINVAL;
+
+ for (i = 0; i < entries; i++) {
+ u16 x = le16_to_cpu((__force __le16)tbl->entries[i].portx_id);
+ u16 y = le16_to_cpu((__force __le16)tbl->entries[i].porty_id);
+ __le64 le_base;
+ __le16 le_val;
+ u16 dsp_id;
+ u64 val;
+
+ switch (x) {
+ case ACPI_CDAT_SSLBIS_US_PORT:
+ dsp_id = y;
+ break;
+ case ACPI_CDAT_SSLBIS_ANY_PORT:
+ switch (y) {
+ case ACPI_CDAT_SSLBIS_US_PORT:
+ dsp_id = x;
+ break;
+ case ACPI_CDAT_SSLBIS_ANY_PORT:
+ dsp_id = ACPI_CDAT_SSLBIS_ANY_PORT;
+ break;
+ default:
+ dsp_id = y;
+ break;
+ }
+ break;
+ default:
+ dsp_id = x;
+ break;
+ }
+
+ le_base = (__force __le64)tbl->sslbis_header.entry_base_unit;
+ le_val = (__force __le16)tbl->entries[i].latency_or_bandwidth;
+ val = cdat_normalize(le16_to_cpu(le_val), le64_to_cpu(le_base),
+ sslbis->data_type);
+
+ if (dsp_id == ACPI_CDAT_SSLBIS_ANY_PORT ||
+ dsp_id == dport->port_id) {
+ cxl_access_coordinate_set(dport->coord,
+ sslbis->data_type, val);
+ return 0;
+ }
+ }
+
+ return 0;
+}
+
+void cxl_switch_parse_cdat(struct cxl_dport *dport)
+{
+ struct cxl_port *port = dport->port;
+ int rc;
+
+ if (!port->cdat.table)
+ return;
+
+ rc = cdat_table_parse(ACPI_CDAT_TYPE_SSLBIS, cdat_sslbis_handler,
+ dport, port->cdat.table, port->cdat.length);
+ rc = cdat_table_parse_output(rc);
+ if (rc)
+ dev_dbg(&port->dev, "Failed to parse SSLBIS: %d\n", rc);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_switch_parse_cdat, "CXL");
+
+static void __cxl_coordinates_combine(struct access_coordinate *out,
+ struct access_coordinate *c1,
+ struct access_coordinate *c2)
+{
+ if (c1->write_bandwidth && c2->write_bandwidth)
+ out->write_bandwidth = min(c1->write_bandwidth,
+ c2->write_bandwidth);
+ out->write_latency = c1->write_latency + c2->write_latency;
+
+ if (c1->read_bandwidth && c2->read_bandwidth)
+ out->read_bandwidth = min(c1->read_bandwidth,
+ c2->read_bandwidth);
+ out->read_latency = c1->read_latency + c2->read_latency;
+}
+
+/**
+ * cxl_coordinates_combine - Combine the two input coordinates
+ *
+ * @out: Output coordinate of c1 and c2 combined
+ * @c1: input coordinates
+ * @c2: input coordinates
+ */
+void cxl_coordinates_combine(struct access_coordinate *out,
+ struct access_coordinate *c1,
+ struct access_coordinate *c2)
+{
+ for (int i = 0; i < ACCESS_COORDINATE_MAX; i++)
+ __cxl_coordinates_combine(&out[i], &c1[i], &c2[i]);
+}
+
+MODULE_IMPORT_NS("CXL");
+
+static void cxl_bandwidth_add(struct access_coordinate *coord,
+ struct access_coordinate *c1,
+ struct access_coordinate *c2)
+{
+ for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) {
+ coord[i].read_bandwidth = c1[i].read_bandwidth +
+ c2[i].read_bandwidth;
+ coord[i].write_bandwidth = c1[i].write_bandwidth +
+ c2[i].write_bandwidth;
+ }
+}
+
+static bool dpa_perf_contains(struct cxl_dpa_perf *perf,
+ struct resource *dpa_res)
+{
+ struct range dpa = {
+ .start = dpa_res->start,
+ .end = dpa_res->end,
+ };
+
+ return range_contains(&perf->dpa_range, &dpa);
+}
+
+static struct cxl_dpa_perf *cxled_get_dpa_perf(struct cxl_endpoint_decoder *cxled)
+{
+ struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+ struct cxl_dpa_perf *perf;
+
+ if (cxled->part < 0)
+ return ERR_PTR(-EINVAL);
+ perf = &cxlds->part[cxled->part].perf;
+
+ if (!perf)
+ return ERR_PTR(-EINVAL);
+
+ if (!dpa_perf_contains(perf, cxled->dpa_res))
+ return ERR_PTR(-EINVAL);
+
+ return perf;
+}
+
+/*
+ * Transient context that holds the running bandwidth calculation while
+ * walking the port hierarchy to account for a shared upstream link.
+ */
+struct cxl_perf_ctx {
+ struct access_coordinate coord[ACCESS_COORDINATE_MAX];
+ struct cxl_port *port;
+};
+
+/**
+ * cxl_endpoint_gather_bandwidth - collect all the endpoint bandwidth in an xarray
+ * @cxlr: CXL region for the bandwidth calculation
+ * @cxled: endpoint decoder to start on
+ * @usp_xa: (output) the xarray that collects all the bandwidth coordinates
+ * indexed by the upstream device with data of 'struct cxl_perf_ctx'.
+ * @gp_is_root: (output) bool of whether the grandparent is cxl root.
+ *
+ * Return: 0 for success or -errno
+ *
+ * Collects aggregated endpoint bandwidth and stores the bandwidth in
+ * an xarray indexed by the upstream device of the switch or the RP
+ * device. Each endpoint contributes the minimum of the bandwidth from the
+ * DSLBIS of the endpoint CDAT, the endpoint upstream link bandwidth, and
+ * the bandwidth from the SSLBIS of the switch CDAT for the switch upstream
+ * port to the downstream port associated with the endpoint. If the
+ * device is directly connected to an RP, then no SSLBIS is involved.
+ */
+static int cxl_endpoint_gather_bandwidth(struct cxl_region *cxlr,
+ struct cxl_endpoint_decoder *cxled,
+ struct xarray *usp_xa,
+ bool *gp_is_root)
+{
+ struct cxl_port *endpoint = to_cxl_port(cxled->cxld.dev.parent);
+ struct cxl_port *parent_port = to_cxl_port(endpoint->dev.parent);
+ struct cxl_port *gp_port = to_cxl_port(parent_port->dev.parent);
+ struct access_coordinate pci_coord[ACCESS_COORDINATE_MAX];
+ struct access_coordinate sw_coord[ACCESS_COORDINATE_MAX];
+ struct access_coordinate ep_coord[ACCESS_COORDINATE_MAX];
+ struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+ struct pci_dev *pdev = to_pci_dev(cxlds->dev);
+ struct cxl_perf_ctx *perf_ctx;
+ struct cxl_dpa_perf *perf;
+ unsigned long index;
+ void *ptr;
+ int rc;
+
+ if (!dev_is_pci(cxlds->dev))
+ return -ENODEV;
+
+ if (cxlds->rcd)
+ return -ENODEV;
+
+ perf = cxled_get_dpa_perf(cxled);
+ if (IS_ERR(perf))
+ return PTR_ERR(perf);
+
+ *gp_is_root = is_cxl_root(gp_port);
+
+ /*
+ * If the grandparent is cxl root, then index is the root port,
+ * otherwise it's the parent switch upstream device.
+ */
+ if (*gp_is_root)
+ index = (unsigned long)endpoint->parent_dport->dport_dev;
+ else
+ index = (unsigned long)parent_port->uport_dev;
+
+ perf_ctx = xa_load(usp_xa, index);
+ if (!perf_ctx) {
+ struct cxl_perf_ctx *c __free(kfree) =
+ kzalloc(sizeof(*perf_ctx), GFP_KERNEL);
+
+ if (!c)
+ return -ENOMEM;
+ ptr = xa_store(usp_xa, index, c, GFP_KERNEL);
+ if (xa_is_err(ptr))
+ return xa_err(ptr);
+ perf_ctx = no_free_ptr(c);
+ perf_ctx->port = parent_port;
+ }
+
+ /* Direct upstream link from EP bandwidth */
+ rc = cxl_pci_get_bandwidth(pdev, pci_coord);
+ if (rc < 0)
+ return rc;
+
+ /*
+ * Min of upstream link bandwidth and Endpoint CDAT bandwidth from
+ * DSLBIS.
+ */
+ cxl_coordinates_combine(ep_coord, pci_coord, perf->cdat_coord);
+
+ /*
+ * If grandparent port is root, then there's no switch involved and
+ * the endpoint is connected to a root port.
+ */
+ if (!*gp_is_root) {
+ /*
+ * Retrieve the switch SSLBIS for switch downstream port
+ * associated with the endpoint bandwidth.
+ */
+ rc = cxl_port_get_switch_dport_bandwidth(endpoint, sw_coord);
+ if (rc)
+ return rc;
+
+ /*
+ * Min of the earlier coordinates with the switch SSLBIS
+ * bandwidth
+ */
+ cxl_coordinates_combine(ep_coord, ep_coord, sw_coord);
+ }
+
+ /*
+ * Aggregate the computed bandwidth with the current aggregated bandwidth
+ * of the endpoints with the same switch upstream device or RP.
+ */
+ cxl_bandwidth_add(perf_ctx->coord, perf_ctx->coord, ep_coord);
+
+ return 0;
+}
+
+static void free_perf_xa(struct xarray *xa)
+{
+ struct cxl_perf_ctx *ctx;
+ unsigned long index;
+
+ if (!xa)
+ return;
+
+ xa_for_each(xa, index, ctx)
+ kfree(ctx);
+ xa_destroy(xa);
+ kfree(xa);
+}
+DEFINE_FREE(free_perf_xa, struct xarray *, if (_T) free_perf_xa(_T))
+
+/**
+ * cxl_switch_gather_bandwidth - collect all the bandwidth at switch level in an xarray
+ * @cxlr: The region being operated on
+ * @input_xa: xarray indexed by upstream device of a switch with data of 'struct
+ * cxl_perf_ctx'
+ * @gp_is_root: (output) bool of whether the grandparent is cxl root.
+ *
+ * Return: an xarray of resulting cxl_perf_ctx per parent switch or root port,
+ * or ERR_PTR(-errno)
+ *
+ * Iterate through the xarray. Take the minimum of the downstream calculated
+ * bandwidth, the upstream link bandwidth, and the SSLBIS of the upstream
+ * switch if one exists. Sum the resulting bandwidth under the switch upstream
+ * device or an RP device. The function can be called once per switch level
+ * when multiple levels of switches are present.
+ */
+static struct xarray *cxl_switch_gather_bandwidth(struct cxl_region *cxlr,
+ struct xarray *input_xa,
+ bool *gp_is_root)
+{
+ struct xarray *res_xa __free(free_perf_xa) =
+ kzalloc(sizeof(*res_xa), GFP_KERNEL);
+ struct access_coordinate coords[ACCESS_COORDINATE_MAX];
+ struct cxl_perf_ctx *ctx, *us_ctx;
+ unsigned long index, us_index;
+ int dev_count = 0;
+ int gp_count = 0;
+ void *ptr;
+ int rc;
+
+ if (!res_xa)
+ return ERR_PTR(-ENOMEM);
+ xa_init(res_xa);
+
+ xa_for_each(input_xa, index, ctx) {
+ struct device *dev = (struct device *)index;
+ struct cxl_port *port = ctx->port;
+ struct cxl_port *parent_port = to_cxl_port(port->dev.parent);
+ struct cxl_port *gp_port = to_cxl_port(parent_port->dev.parent);
+ struct cxl_dport *dport = port->parent_dport;
+ bool is_root = false;
+
+ dev_count++;
+ if (is_cxl_root(gp_port)) {
+ is_root = true;
+ gp_count++;
+ }
+
+ /*
+ * If the grandparent is cxl root, then index is the root port,
+ * otherwise it's the parent switch upstream device.
+ */
+ if (is_root)
+ us_index = (unsigned long)port->parent_dport->dport_dev;
+ else
+ us_index = (unsigned long)parent_port->uport_dev;
+
+ us_ctx = xa_load(res_xa, us_index);
+ if (!us_ctx) {
+ struct cxl_perf_ctx *n __free(kfree) =
+ kzalloc(sizeof(*n), GFP_KERNEL);
+
+ if (!n)
+ return ERR_PTR(-ENOMEM);
+
+ ptr = xa_store(res_xa, us_index, n, GFP_KERNEL);
+ if (xa_is_err(ptr))
+ return ERR_PTR(xa_err(ptr));
+ us_ctx = no_free_ptr(n);
+ us_ctx->port = parent_port;
+ }
+
+ /*
+ * If the device isn't an upstream PCIe port, there's something
+ * wrong with the topology.
+ */
+ if (!dev_is_pci(dev))
+ return ERR_PTR(-EINVAL);
+
+ /* Retrieve the upstream link bandwidth */
+ rc = cxl_pci_get_bandwidth(to_pci_dev(dev), coords);
+ if (rc)
+ return ERR_PTR(-ENXIO);
+
+ /*
+ * Take the min of downstream bandwidth and the upstream link
+ * bandwidth.
+ */
+ cxl_coordinates_combine(coords, coords, ctx->coord);
+
+ /*
+ * Take the min of the calculated bandwidth and the upstream
+ * switch SSLBIS bandwidth if there's a parent switch
+ */
+ if (!is_root)
+ cxl_coordinates_combine(coords, coords, dport->coord);
+
+ /*
+ * Aggregate the calculated bandwidth common to an upstream
+ * switch.
+ */
+ cxl_bandwidth_add(us_ctx->coord, us_ctx->coord, coords);
+ }
+
+ /* Asymmetric topology detected. */
+ if (gp_count) {
+ if (gp_count != dev_count) {
+ dev_dbg(&cxlr->dev,
+ "Asymmetric hierarchy detected, bandwidth not updated\n");
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+ *gp_is_root = true;
+ }
+
+ return no_free_ptr(res_xa);
+}
+
+/**
+ * cxl_rp_gather_bandwidth - handle the root port level bandwidth collection
+ * @xa: the xarray that holds the cxl_perf_ctx that has the bandwidth calculated
+ * below each root port device.
+ *
+ * Return: xarray that holds cxl_perf_ctx per host bridge or ERR_PTR(-errno)
+ */
+static struct xarray *cxl_rp_gather_bandwidth(struct xarray *xa)
+{
+ struct xarray *hb_xa __free(free_perf_xa) =
+ kzalloc(sizeof(*hb_xa), GFP_KERNEL);
+ struct cxl_perf_ctx *ctx;
+ unsigned long index;
+
+ if (!hb_xa)
+ return ERR_PTR(-ENOMEM);
+ xa_init(hb_xa);
+
+ xa_for_each(xa, index, ctx) {
+ struct cxl_port *port = ctx->port;
+ unsigned long hb_index = (unsigned long)port->uport_dev;
+ struct cxl_perf_ctx *hb_ctx;
+ void *ptr;
+
+ hb_ctx = xa_load(hb_xa, hb_index);
+ if (!hb_ctx) {
+ struct cxl_perf_ctx *n __free(kfree) =
+ kzalloc(sizeof(*n), GFP_KERNEL);
+
+ if (!n)
+ return ERR_PTR(-ENOMEM);
+ ptr = xa_store(hb_xa, hb_index, n, GFP_KERNEL);
+ if (xa_is_err(ptr))
+ return ERR_PTR(xa_err(ptr));
+ hb_ctx = no_free_ptr(n);
+ hb_ctx->port = port;
+ }
+
+ cxl_bandwidth_add(hb_ctx->coord, hb_ctx->coord, ctx->coord);
+ }
+
+ return no_free_ptr(hb_xa);
+}
+
+/**
+ * cxl_hb_gather_bandwidth - handle the host bridge level bandwidth collection
+ * @xa: the xarray that holds the cxl_perf_ctx that has the bandwidth calculated
+ * below each host bridge.
+ *
+ * Return: xarray that holds cxl_perf_ctx per ACPI0017 device or ERR_PTR(-errno)
+ */
+static struct xarray *cxl_hb_gather_bandwidth(struct xarray *xa)
+{
+ struct xarray *mw_xa __free(free_perf_xa) =
+ kzalloc(sizeof(*mw_xa), GFP_KERNEL);
+ struct cxl_perf_ctx *ctx;
+ unsigned long index;
+
+ if (!mw_xa)
+ return ERR_PTR(-ENOMEM);
+ xa_init(mw_xa);
+
+ xa_for_each(xa, index, ctx) {
+ struct cxl_port *port = ctx->port;
+ struct cxl_port *parent_port;
+ struct cxl_perf_ctx *mw_ctx;
+ struct cxl_dport *dport;
+ unsigned long mw_index;
+ void *ptr;
+
+ parent_port = to_cxl_port(port->dev.parent);
+ mw_index = (unsigned long)parent_port->uport_dev;
+
+ mw_ctx = xa_load(mw_xa, mw_index);
+ if (!mw_ctx) {
+ struct cxl_perf_ctx *n __free(kfree) =
+ kzalloc(sizeof(*n), GFP_KERNEL);
+
+ if (!n)
+ return ERR_PTR(-ENOMEM);
+ ptr = xa_store(mw_xa, mw_index, n, GFP_KERNEL);
+ if (xa_is_err(ptr))
+ return ERR_PTR(xa_err(ptr));
+ mw_ctx = no_free_ptr(n);
+ }
+
+ dport = port->parent_dport;
+ cxl_coordinates_combine(ctx->coord, ctx->coord, dport->coord);
+ cxl_bandwidth_add(mw_ctx->coord, mw_ctx->coord, ctx->coord);
+ }
+
+ return no_free_ptr(mw_xa);
+}
+
+/**
+ * cxl_region_update_bandwidth - Update the bandwidth access coordinates of a region
+ * @cxlr: The region being operated on
+ * @input_xa: xarray that holds cxl_perf_ctx with the calculated bandwidth per ACPI0017 instance
+ */
+static void cxl_region_update_bandwidth(struct cxl_region *cxlr,
+ struct xarray *input_xa)
+{
+ struct access_coordinate coord[ACCESS_COORDINATE_MAX];
+ struct cxl_perf_ctx *ctx;
+ unsigned long index;
+
+ memset(coord, 0, sizeof(coord));
+ xa_for_each(input_xa, index, ctx)
+ cxl_bandwidth_add(coord, coord, ctx->coord);
+
+ for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) {
+ cxlr->coord[i].read_bandwidth = coord[i].read_bandwidth;
+ cxlr->coord[i].write_bandwidth = coord[i].write_bandwidth;
+ }
+}
+
+/**
+ * cxl_region_shared_upstream_bandwidth_update - Recalculate the bandwidth for
+ * the region
+ * @cxlr: the cxl region to recalculate
+ *
+ * The function walks the topology from the bottom up and calculates the bandwidth. It
+ * starts at the endpoints, processes the switch level if any switches are present, then
+ * the root port level and the host bridge level, and finally aggregates at the region.
+ */
+void cxl_region_shared_upstream_bandwidth_update(struct cxl_region *cxlr)
+{
+ struct xarray *working_xa;
+ int root_count = 0;
+ bool is_root;
+ int rc;
+
+ lockdep_assert_held(&cxl_rwsem.dpa);
+
+ struct xarray *usp_xa __free(free_perf_xa) =
+ kzalloc(sizeof(*usp_xa), GFP_KERNEL);
+
+ if (!usp_xa)
+ return;
+
+ xa_init(usp_xa);
+
+ /* Collect bandwidth data from all the endpoints. */
+ for (int i = 0; i < cxlr->params.nr_targets; i++) {
+ struct cxl_endpoint_decoder *cxled = cxlr->params.targets[i];
+
+ is_root = false;
+ rc = cxl_endpoint_gather_bandwidth(cxlr, cxled, usp_xa, &is_root);
+ if (rc)
+ return;
+ root_count += is_root;
+ }
+
+	/* Detect an asymmetric hierarchy with some directly attached endpoints. */
+ if (root_count && root_count != cxlr->params.nr_targets) {
+ dev_dbg(&cxlr->dev,
+ "Asymmetric hierarchy detected, bandwidth not updated\n");
+ return;
+ }
+
+ /*
+ * Walk up one or more switches to deal with the bandwidth of the
+ * switches if they exist. Endpoints directly attached to RPs skip
+ * over this part.
+ */
+ if (!root_count) {
+ do {
+ working_xa = cxl_switch_gather_bandwidth(cxlr, usp_xa,
+ &is_root);
+ if (IS_ERR(working_xa))
+ return;
+ free_perf_xa(usp_xa);
+ usp_xa = working_xa;
+ } while (!is_root);
+ }
+
+ /* Handle the bandwidth at the root port of the hierarchy */
+ working_xa = cxl_rp_gather_bandwidth(usp_xa);
+ if (IS_ERR(working_xa))
+ return;
+ free_perf_xa(usp_xa);
+ usp_xa = working_xa;
+
+ /* Handle the bandwidth at the host bridge of the hierarchy */
+ working_xa = cxl_hb_gather_bandwidth(usp_xa);
+ if (IS_ERR(working_xa))
+ return;
+ free_perf_xa(usp_xa);
+ usp_xa = working_xa;
+
+ /*
+ * Aggregate all the bandwidth collected per CFMWS (ACPI0017) and
+ * update the region bandwidth with the final calculated values.
+ */
+ cxl_region_update_bandwidth(cxlr, usp_xa);
+}
+
+void cxl_region_perf_data_calculate(struct cxl_region *cxlr,
+ struct cxl_endpoint_decoder *cxled)
+{
+ struct cxl_dpa_perf *perf;
+
+ lockdep_assert_held(&cxl_rwsem.dpa);
+
+ perf = cxled_get_dpa_perf(cxled);
+ if (IS_ERR(perf))
+ return;
+
+ for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) {
+ /* Get total bandwidth and the worst latency for the cxl region */
+ cxlr->coord[i].read_latency = max_t(unsigned int,
+ cxlr->coord[i].read_latency,
+ perf->coord[i].read_latency);
+ cxlr->coord[i].write_latency = max_t(unsigned int,
+ cxlr->coord[i].write_latency,
+ perf->coord[i].write_latency);
+ cxlr->coord[i].read_bandwidth += perf->coord[i].read_bandwidth;
+ cxlr->coord[i].write_bandwidth += perf->coord[i].write_bandwidth;
+ }
+}
diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
new file mode 100644
index 000000000000..1fb66132b777
--- /dev/null
+++ b/drivers/cxl/core/core.h
@@ -0,0 +1,169 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright(c) 2020 Intel Corporation. */
+
+#ifndef __CXL_CORE_H__
+#define __CXL_CORE_H__
+
+#include <cxl/mailbox.h>
+#include <linux/rwsem.h>
+
+extern const struct device_type cxl_nvdimm_bridge_type;
+extern const struct device_type cxl_nvdimm_type;
+extern const struct device_type cxl_pmu_type;
+
+extern struct attribute_group cxl_base_attribute_group;
+
+enum cxl_detach_mode {
+ DETACH_ONLY,
+ DETACH_INVALIDATE,
+};
+
+#ifdef CONFIG_CXL_REGION
+extern struct device_attribute dev_attr_create_pmem_region;
+extern struct device_attribute dev_attr_create_ram_region;
+extern struct device_attribute dev_attr_delete_region;
+extern struct device_attribute dev_attr_region;
+extern const struct device_type cxl_pmem_region_type;
+extern const struct device_type cxl_dax_region_type;
+extern const struct device_type cxl_region_type;
+
+int cxl_decoder_detach(struct cxl_region *cxlr,
+ struct cxl_endpoint_decoder *cxled, int pos,
+ enum cxl_detach_mode mode);
+
+#define CXL_REGION_ATTR(x) (&dev_attr_##x.attr)
+#define CXL_REGION_TYPE(x) (&cxl_region_type)
+#define SET_CXL_REGION_ATTR(x) (&dev_attr_##x.attr),
+#define CXL_PMEM_REGION_TYPE(x) (&cxl_pmem_region_type)
+#define CXL_DAX_REGION_TYPE(x) (&cxl_dax_region_type)
+int cxl_region_init(void);
+void cxl_region_exit(void);
+int cxl_get_poison_by_endpoint(struct cxl_port *port);
+struct cxl_region *cxl_dpa_to_region(const struct cxl_memdev *cxlmd, u64 dpa);
+u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd,
+ u64 dpa);
+
+#else
+static inline u64 cxl_dpa_to_hpa(struct cxl_region *cxlr,
+ const struct cxl_memdev *cxlmd, u64 dpa)
+{
+ return ULLONG_MAX;
+}
+static inline
+struct cxl_region *cxl_dpa_to_region(const struct cxl_memdev *cxlmd, u64 dpa)
+{
+ return NULL;
+}
+static inline int cxl_get_poison_by_endpoint(struct cxl_port *port)
+{
+ return 0;
+}
+static inline int cxl_decoder_detach(struct cxl_region *cxlr,
+ struct cxl_endpoint_decoder *cxled,
+ int pos, enum cxl_detach_mode mode)
+{
+ return 0;
+}
+static inline int cxl_region_init(void)
+{
+ return 0;
+}
+static inline void cxl_region_exit(void)
+{
+}
+#define CXL_REGION_ATTR(x) NULL
+#define CXL_REGION_TYPE(x) NULL
+#define SET_CXL_REGION_ATTR(x)
+#define CXL_PMEM_REGION_TYPE(x) NULL
+#define CXL_DAX_REGION_TYPE(x) NULL
+#endif
+
+struct cxl_send_command;
+struct cxl_mem_query_commands;
+int cxl_query_cmd(struct cxl_mailbox *cxl_mbox,
+ struct cxl_mem_query_commands __user *q);
+int cxl_send_cmd(struct cxl_mailbox *cxl_mbox, struct cxl_send_command __user *s);
+void __iomem *devm_cxl_iomap_block(struct device *dev, resource_size_t addr,
+ resource_size_t length);
+
+struct dentry *cxl_debugfs_create_dir(const char *dir);
+int cxl_dpa_set_part(struct cxl_endpoint_decoder *cxled,
+ enum cxl_partition_mode mode);
+int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, u64 size);
+int cxl_dpa_free(struct cxl_endpoint_decoder *cxled);
+resource_size_t cxl_dpa_size(struct cxl_endpoint_decoder *cxled);
+resource_size_t cxl_dpa_resource_start(struct cxl_endpoint_decoder *cxled);
+bool cxl_resource_contains_addr(const struct resource *res, const resource_size_t addr);
+
+enum cxl_rcrb {
+ CXL_RCRB_DOWNSTREAM,
+ CXL_RCRB_UPSTREAM,
+};
+struct cxl_rcrb_info;
+resource_size_t __rcrb_to_component(struct device *dev,
+ struct cxl_rcrb_info *ri,
+ enum cxl_rcrb which);
+u16 cxl_rcrb_to_aer(struct device *dev, resource_size_t rcrb);
+
+#define PCI_RCRB_CAP_LIST_ID_MASK GENMASK(7, 0)
+#define PCI_RCRB_CAP_HDR_ID_MASK GENMASK(7, 0)
+#define PCI_RCRB_CAP_HDR_NEXT_MASK GENMASK(15, 8)
+#define PCI_CAP_EXP_SIZEOF 0x3c
+
+struct cxl_rwsem {
+ /*
+ * All changes to HPA (interleave configuration) occur with this
+ * lock held for write.
+ */
+ struct rw_semaphore region;
+ /*
+ * All changes to a device DPA space occur with this lock held
+ * for write.
+ */
+ struct rw_semaphore dpa;
+};
+
+extern struct cxl_rwsem cxl_rwsem;
+
+int cxl_memdev_init(void);
+void cxl_memdev_exit(void);
+void cxl_mbox_init(void);
+
+enum cxl_poison_trace_type {
+ CXL_POISON_TRACE_LIST,
+ CXL_POISON_TRACE_INJECT,
+ CXL_POISON_TRACE_CLEAR,
+};
+
+enum poison_cmd_enabled_bits;
+bool cxl_memdev_has_poison_cmd(struct cxl_memdev *cxlmd,
+ enum poison_cmd_enabled_bits cmd);
+
+long cxl_pci_get_latency(struct pci_dev *pdev);
+int cxl_pci_get_bandwidth(struct pci_dev *pdev, struct access_coordinate *c);
+int cxl_port_get_switch_dport_bandwidth(struct cxl_port *port,
+ struct access_coordinate *c);
+
+int cxl_ras_init(void);
+void cxl_ras_exit(void);
+int cxl_gpf_port_setup(struct cxl_dport *dport);
+
+struct cxl_hdm;
+int cxl_hdm_decode_init(struct cxl_dev_state *cxlds, struct cxl_hdm *cxlhdm,
+ struct cxl_endpoint_dvsec_info *info);
+int cxl_port_get_possible_dports(struct cxl_port *port);
+
+#ifdef CONFIG_CXL_FEATURES
+struct cxl_feat_entry *
+cxl_feature_info(struct cxl_features_state *cxlfs, const uuid_t *uuid);
+size_t cxl_get_feature(struct cxl_mailbox *cxl_mbox, const uuid_t *feat_uuid,
+ enum cxl_get_feat_selection selection,
+ void *feat_out, size_t feat_out_size, u16 offset,
+ u16 *return_code);
+int cxl_set_feature(struct cxl_mailbox *cxl_mbox, const uuid_t *feat_uuid,
+ u8 feat_version, const void *feat_data,
+ size_t feat_data_size, u32 feat_flag, u16 offset,
+ u16 *return_code);
+#endif
+
+#endif /* __CXL_CORE_H__ */
diff --git a/drivers/cxl/core/edac.c b/drivers/cxl/core/edac.c
new file mode 100644
index 000000000000..79994ca9bc9f
--- /dev/null
+++ b/drivers/cxl/core/edac.c
@@ -0,0 +1,2109 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * CXL EDAC memory feature driver.
+ *
+ * Copyright (c) 2024-2025 HiSilicon Limited.
+ *
+ * - Supports functions to configure EDAC features of the
+ * CXL memory devices.
+ * - Registers with the EDAC device subsystem driver to expose
+ * the features sysfs attributes to the user for configuring
+ *   CXL memory RAS features.
+ */
+
+#include <linux/cleanup.h>
+#include <linux/edac.h>
+#include <linux/limits.h>
+#include <linux/unaligned.h>
+#include <linux/xarray.h>
+#include <cxl/features.h>
+#include <cxl.h>
+#include <cxlmem.h>
+#include "core.h"
+#include "trace.h"
+
+#define CXL_NR_EDAC_DEV_FEATURES 7
+
+#define CXL_SCRUB_NO_REGION -1
+
+struct cxl_patrol_scrub_context {
+ u8 instance;
+ u16 get_feat_size;
+ u16 set_feat_size;
+ u8 get_version;
+ u8 set_version;
+ u16 effects;
+ struct cxl_memdev *cxlmd;
+ struct cxl_region *cxlr;
+};
+
+/*
+ * See CXL spec rev 3.2 @8.2.10.9.11.1 Table 8-222 Device Patrol Scrub Control
+ * Feature Readable Attributes.
+ */
+struct cxl_scrub_rd_attrbs {
+ u8 scrub_cycle_cap;
+ __le16 scrub_cycle_hours;
+ u8 scrub_flags;
+} __packed;
+
+/*
+ * See CXL spec rev 3.2 @8.2.10.9.11.1 Table 8-223 Device Patrol Scrub Control
+ * Feature Writable Attributes.
+ */
+struct cxl_scrub_wr_attrbs {
+ u8 scrub_cycle_hours;
+ u8 scrub_flags;
+} __packed;
+
+#define CXL_SCRUB_CONTROL_CHANGEABLE BIT(0)
+#define CXL_SCRUB_CONTROL_REALTIME BIT(1)
+#define CXL_SCRUB_CONTROL_CYCLE_MASK GENMASK(7, 0)
+#define CXL_SCRUB_CONTROL_MIN_CYCLE_MASK GENMASK(15, 8)
+#define CXL_SCRUB_CONTROL_ENABLE BIT(0)
+
+#define CXL_GET_SCRUB_CYCLE_CHANGEABLE(cap) \
+ FIELD_GET(CXL_SCRUB_CONTROL_CHANGEABLE, cap)
+#define CXL_GET_SCRUB_CYCLE(cycle) \
+ FIELD_GET(CXL_SCRUB_CONTROL_CYCLE_MASK, cycle)
+#define CXL_GET_SCRUB_MIN_CYCLE(cycle) \
+ FIELD_GET(CXL_SCRUB_CONTROL_MIN_CYCLE_MASK, cycle)
+#define CXL_GET_SCRUB_EN_STS(flags) FIELD_GET(CXL_SCRUB_CONTROL_ENABLE, flags)
+
+#define CXL_SET_SCRUB_CYCLE(cycle) \
+ FIELD_PREP(CXL_SCRUB_CONTROL_CYCLE_MASK, cycle)
+#define CXL_SET_SCRUB_EN(en) FIELD_PREP(CXL_SCRUB_CONTROL_ENABLE, en)
+
+static int cxl_mem_scrub_get_attrbs(struct cxl_mailbox *cxl_mbox, u8 *cap,
+ u16 *cycle, u8 *flags, u8 *min_cycle)
+{
+ size_t rd_data_size = sizeof(struct cxl_scrub_rd_attrbs);
+ size_t data_size;
+ struct cxl_scrub_rd_attrbs *rd_attrbs __free(kfree) =
+ kzalloc(rd_data_size, GFP_KERNEL);
+ if (!rd_attrbs)
+ return -ENOMEM;
+
+ data_size = cxl_get_feature(cxl_mbox, &CXL_FEAT_PATROL_SCRUB_UUID,
+ CXL_GET_FEAT_SEL_CURRENT_VALUE, rd_attrbs,
+ rd_data_size, 0, NULL);
+ if (!data_size)
+ return -EIO;
+
+ *cap = rd_attrbs->scrub_cycle_cap;
+ *cycle = le16_to_cpu(rd_attrbs->scrub_cycle_hours);
+ *flags = rd_attrbs->scrub_flags;
+ if (min_cycle)
+ *min_cycle = CXL_GET_SCRUB_MIN_CYCLE(*cycle);
+
+ return 0;
+}
+
+static int cxl_scrub_get_attrbs(struct cxl_patrol_scrub_context *cxl_ps_ctx,
+ u8 *cap, u16 *cycle, u8 *flags, u8 *min_cycle)
+{
+ struct cxl_mailbox *cxl_mbox;
+ struct cxl_region_params *p;
+ struct cxl_memdev *cxlmd;
+ struct cxl_region *cxlr;
+ u8 min_scrub_cycle = 0;
+ int i, ret;
+
+ if (!cxl_ps_ctx->cxlr) {
+ cxl_mbox = &cxl_ps_ctx->cxlmd->cxlds->cxl_mbox;
+ return cxl_mem_scrub_get_attrbs(cxl_mbox, cap, cycle,
+ flags, min_cycle);
+ }
+
+ ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
+ if ((ret = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
+ return ret;
+
+ cxlr = cxl_ps_ctx->cxlr;
+ p = &cxlr->params;
+
+ for (i = 0; i < p->nr_targets; i++) {
+ struct cxl_endpoint_decoder *cxled = p->targets[i];
+
+ cxlmd = cxled_to_memdev(cxled);
+ cxl_mbox = &cxlmd->cxlds->cxl_mbox;
+ ret = cxl_mem_scrub_get_attrbs(cxl_mbox, cap, cycle, flags,
+ min_cycle);
+ if (ret)
+ return ret;
+
+ /*
+ * The min_scrub_cycle of a region is the max of minimum scrub
+ * cycles supported by memdevs that back the region.
+ */
+ if (min_cycle)
+ min_scrub_cycle = max(*min_cycle, min_scrub_cycle);
+ }
+
+ if (min_cycle)
+ *min_cycle = min_scrub_cycle;
+
+ return 0;
+}
+
+static int cxl_scrub_set_attrbs_region(struct device *dev,
+ struct cxl_patrol_scrub_context *cxl_ps_ctx,
+ u8 cycle, u8 flags)
+{
+ struct cxl_scrub_wr_attrbs wr_attrbs;
+ struct cxl_mailbox *cxl_mbox;
+ struct cxl_region_params *p;
+ struct cxl_memdev *cxlmd;
+ struct cxl_region *cxlr;
+ int ret, i;
+
+ ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
+ if ((ret = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
+ return ret;
+
+ cxlr = cxl_ps_ctx->cxlr;
+ p = &cxlr->params;
+ wr_attrbs.scrub_cycle_hours = cycle;
+ wr_attrbs.scrub_flags = flags;
+
+ for (i = 0; i < p->nr_targets; i++) {
+ struct cxl_endpoint_decoder *cxled = p->targets[i];
+
+ cxlmd = cxled_to_memdev(cxled);
+ cxl_mbox = &cxlmd->cxlds->cxl_mbox;
+ ret = cxl_set_feature(cxl_mbox, &CXL_FEAT_PATROL_SCRUB_UUID,
+ cxl_ps_ctx->set_version, &wr_attrbs,
+ sizeof(wr_attrbs),
+ CXL_SET_FEAT_FLAG_DATA_SAVED_ACROSS_RESET,
+ 0, NULL);
+ if (ret)
+ return ret;
+
+ if (cycle != cxlmd->scrub_cycle) {
+ if (cxlmd->scrub_region_id != CXL_SCRUB_NO_REGION)
+ dev_info(dev,
+ "Device scrub rate(%d hours) set by region%d rate overwritten by region%d scrub rate(%d hours)\n",
+ cxlmd->scrub_cycle,
+ cxlmd->scrub_region_id, cxlr->id,
+ cycle);
+
+ cxlmd->scrub_cycle = cycle;
+ cxlmd->scrub_region_id = cxlr->id;
+ }
+ }
+
+ return 0;
+}
+
+static int cxl_scrub_set_attrbs_device(struct device *dev,
+ struct cxl_patrol_scrub_context *cxl_ps_ctx,
+ u8 cycle, u8 flags)
+{
+ struct cxl_scrub_wr_attrbs wr_attrbs;
+ struct cxl_mailbox *cxl_mbox;
+ struct cxl_memdev *cxlmd;
+ int ret;
+
+ wr_attrbs.scrub_cycle_hours = cycle;
+ wr_attrbs.scrub_flags = flags;
+
+ cxlmd = cxl_ps_ctx->cxlmd;
+ cxl_mbox = &cxlmd->cxlds->cxl_mbox;
+ ret = cxl_set_feature(cxl_mbox, &CXL_FEAT_PATROL_SCRUB_UUID,
+ cxl_ps_ctx->set_version, &wr_attrbs,
+ sizeof(wr_attrbs),
+ CXL_SET_FEAT_FLAG_DATA_SAVED_ACROSS_RESET, 0,
+ NULL);
+ if (ret)
+ return ret;
+
+ if (cycle != cxlmd->scrub_cycle) {
+ if (cxlmd->scrub_region_id != CXL_SCRUB_NO_REGION)
+ dev_info(dev,
+ "Device scrub rate(%d hours) set by region%d rate overwritten with device local scrub rate(%d hours)\n",
+ cxlmd->scrub_cycle, cxlmd->scrub_region_id,
+ cycle);
+
+ cxlmd->scrub_cycle = cycle;
+ cxlmd->scrub_region_id = CXL_SCRUB_NO_REGION;
+ }
+
+ return 0;
+}
+
+static int cxl_scrub_set_attrbs(struct device *dev,
+ struct cxl_patrol_scrub_context *cxl_ps_ctx,
+ u8 cycle, u8 flags)
+{
+ if (cxl_ps_ctx->cxlr)
+ return cxl_scrub_set_attrbs_region(dev, cxl_ps_ctx, cycle, flags);
+
+ return cxl_scrub_set_attrbs_device(dev, cxl_ps_ctx, cycle, flags);
+}
+
+static int cxl_patrol_scrub_get_enabled_bg(struct device *dev, void *drv_data,
+ bool *enabled)
+{
+ struct cxl_patrol_scrub_context *ctx = drv_data;
+ u8 cap, flags;
+ u16 cycle;
+ int ret;
+
+ ret = cxl_scrub_get_attrbs(ctx, &cap, &cycle, &flags, NULL);
+ if (ret)
+ return ret;
+
+ *enabled = CXL_GET_SCRUB_EN_STS(flags);
+
+ return 0;
+}
+
+static int cxl_patrol_scrub_set_enabled_bg(struct device *dev, void *drv_data,
+ bool enable)
+{
+ struct cxl_patrol_scrub_context *ctx = drv_data;
+ u8 cap, flags, wr_cycle;
+ u16 rd_cycle;
+ int ret;
+
+ if (!capable(CAP_SYS_RAWIO))
+ return -EPERM;
+
+ ret = cxl_scrub_get_attrbs(ctx, &cap, &rd_cycle, &flags, NULL);
+ if (ret)
+ return ret;
+
+ wr_cycle = CXL_GET_SCRUB_CYCLE(rd_cycle);
+ flags = CXL_SET_SCRUB_EN(enable);
+
+ return cxl_scrub_set_attrbs(dev, ctx, wr_cycle, flags);
+}
+
+static int cxl_patrol_scrub_get_min_scrub_cycle(struct device *dev,
+ void *drv_data, u32 *min)
+{
+ struct cxl_patrol_scrub_context *ctx = drv_data;
+ u8 cap, flags, min_cycle;
+ u16 cycle;
+ int ret;
+
+ ret = cxl_scrub_get_attrbs(ctx, &cap, &cycle, &flags, &min_cycle);
+ if (ret)
+ return ret;
+
+ *min = min_cycle * 3600;
+
+ return 0;
+}
+
+static int cxl_patrol_scrub_get_max_scrub_cycle(struct device *dev,
+ void *drv_data, u32 *max)
+{
+ *max = U8_MAX * 3600; /* Max set by register size */
+
+ return 0;
+}
+
+static int cxl_patrol_scrub_get_scrub_cycle(struct device *dev, void *drv_data,
+ u32 *scrub_cycle_secs)
+{
+ struct cxl_patrol_scrub_context *ctx = drv_data;
+ u8 cap, flags;
+ u16 cycle;
+ int ret;
+
+ ret = cxl_scrub_get_attrbs(ctx, &cap, &cycle, &flags, NULL);
+ if (ret)
+ return ret;
+
+ *scrub_cycle_secs = CXL_GET_SCRUB_CYCLE(cycle) * 3600;
+
+ return 0;
+}
+
+static int cxl_patrol_scrub_set_scrub_cycle(struct device *dev, void *drv_data,
+ u32 scrub_cycle_secs)
+{
+ struct cxl_patrol_scrub_context *ctx = drv_data;
+ u8 scrub_cycle_hours = scrub_cycle_secs / 3600;
+ u8 cap, wr_cycle, flags, min_cycle;
+ u16 rd_cycle;
+ int ret;
+
+ if (!capable(CAP_SYS_RAWIO))
+ return -EPERM;
+
+ ret = cxl_scrub_get_attrbs(ctx, &cap, &rd_cycle, &flags, &min_cycle);
+ if (ret)
+ return ret;
+
+ if (!CXL_GET_SCRUB_CYCLE_CHANGEABLE(cap))
+ return -EOPNOTSUPP;
+
+ if (scrub_cycle_hours < min_cycle) {
+ dev_dbg(dev, "Invalid CXL patrol scrub cycle(%d) to set\n",
+ scrub_cycle_hours);
+ dev_dbg(dev,
+ "Minimum supported CXL patrol scrub cycle in hour %d\n",
+ min_cycle);
+ return -EINVAL;
+ }
+ wr_cycle = CXL_SET_SCRUB_CYCLE(scrub_cycle_hours);
+
+ return cxl_scrub_set_attrbs(dev, ctx, wr_cycle, flags);
+}
+
+static const struct edac_scrub_ops cxl_ps_scrub_ops = {
+ .get_enabled_bg = cxl_patrol_scrub_get_enabled_bg,
+ .set_enabled_bg = cxl_patrol_scrub_set_enabled_bg,
+ .get_min_cycle = cxl_patrol_scrub_get_min_scrub_cycle,
+ .get_max_cycle = cxl_patrol_scrub_get_max_scrub_cycle,
+ .get_cycle_duration = cxl_patrol_scrub_get_scrub_cycle,
+ .set_cycle_duration = cxl_patrol_scrub_set_scrub_cycle,
+};
+
+static int cxl_memdev_scrub_init(struct cxl_memdev *cxlmd,
+ struct edac_dev_feature *ras_feature,
+ u8 scrub_inst)
+{
+ struct cxl_patrol_scrub_context *cxl_ps_ctx;
+ struct cxl_feat_entry *feat_entry;
+ u8 cap, flags;
+ u16 cycle;
+ int rc;
+
+ feat_entry = cxl_feature_info(to_cxlfs(cxlmd->cxlds),
+ &CXL_FEAT_PATROL_SCRUB_UUID);
+ if (IS_ERR(feat_entry))
+ return -EOPNOTSUPP;
+
+ if (!(le32_to_cpu(feat_entry->flags) & CXL_FEATURE_F_CHANGEABLE))
+ return -EOPNOTSUPP;
+
+ cxl_ps_ctx = devm_kzalloc(&cxlmd->dev, sizeof(*cxl_ps_ctx), GFP_KERNEL);
+ if (!cxl_ps_ctx)
+ return -ENOMEM;
+
+ *cxl_ps_ctx = (struct cxl_patrol_scrub_context){
+ .get_feat_size = le16_to_cpu(feat_entry->get_feat_size),
+ .set_feat_size = le16_to_cpu(feat_entry->set_feat_size),
+ .get_version = feat_entry->get_feat_ver,
+ .set_version = feat_entry->set_feat_ver,
+ .effects = le16_to_cpu(feat_entry->effects),
+ .instance = scrub_inst,
+ .cxlmd = cxlmd,
+ };
+
+ rc = cxl_mem_scrub_get_attrbs(&cxlmd->cxlds->cxl_mbox, &cap, &cycle,
+ &flags, NULL);
+ if (rc)
+ return rc;
+
+ cxlmd->scrub_cycle = CXL_GET_SCRUB_CYCLE(cycle);
+ cxlmd->scrub_region_id = CXL_SCRUB_NO_REGION;
+
+ ras_feature->ft_type = RAS_FEAT_SCRUB;
+ ras_feature->instance = cxl_ps_ctx->instance;
+ ras_feature->scrub_ops = &cxl_ps_scrub_ops;
+ ras_feature->ctx = cxl_ps_ctx;
+
+ return 0;
+}
+
+static int cxl_region_scrub_init(struct cxl_region *cxlr,
+ struct edac_dev_feature *ras_feature,
+ u8 scrub_inst)
+{
+ struct cxl_patrol_scrub_context *cxl_ps_ctx;
+ struct cxl_region_params *p = &cxlr->params;
+ struct cxl_feat_entry *feat_entry = NULL;
+ struct cxl_memdev *cxlmd;
+ u8 cap, flags;
+ u16 cycle;
+ int i, rc;
+
+	/*
+	 * The cxl_region_rwsem must be held if the code below runs in any
+	 * context other than region probe; here the region is still probing,
+	 * so no locking is required.
+	 */
+ for (i = 0; i < p->nr_targets; i++) {
+ struct cxl_endpoint_decoder *cxled = p->targets[i];
+
+ cxlmd = cxled_to_memdev(cxled);
+ feat_entry = cxl_feature_info(to_cxlfs(cxlmd->cxlds),
+ &CXL_FEAT_PATROL_SCRUB_UUID);
+ if (IS_ERR(feat_entry))
+ return -EOPNOTSUPP;
+
+ if (!(le32_to_cpu(feat_entry->flags) &
+ CXL_FEATURE_F_CHANGEABLE))
+ return -EOPNOTSUPP;
+
+ rc = cxl_mem_scrub_get_attrbs(&cxlmd->cxlds->cxl_mbox, &cap,
+ &cycle, &flags, NULL);
+ if (rc)
+ return rc;
+
+ cxlmd->scrub_cycle = CXL_GET_SCRUB_CYCLE(cycle);
+ cxlmd->scrub_region_id = CXL_SCRUB_NO_REGION;
+ }
+
+ cxl_ps_ctx = devm_kzalloc(&cxlr->dev, sizeof(*cxl_ps_ctx), GFP_KERNEL);
+ if (!cxl_ps_ctx)
+ return -ENOMEM;
+
+ *cxl_ps_ctx = (struct cxl_patrol_scrub_context){
+ .get_feat_size = le16_to_cpu(feat_entry->get_feat_size),
+ .set_feat_size = le16_to_cpu(feat_entry->set_feat_size),
+ .get_version = feat_entry->get_feat_ver,
+ .set_version = feat_entry->set_feat_ver,
+ .effects = le16_to_cpu(feat_entry->effects),
+ .instance = scrub_inst,
+ .cxlr = cxlr,
+ };
+
+ ras_feature->ft_type = RAS_FEAT_SCRUB;
+ ras_feature->instance = cxl_ps_ctx->instance;
+ ras_feature->scrub_ops = &cxl_ps_scrub_ops;
+ ras_feature->ctx = cxl_ps_ctx;
+
+ return 0;
+}
+
+struct cxl_ecs_context {
+ u16 num_media_frus;
+ u16 get_feat_size;
+ u16 set_feat_size;
+ u8 get_version;
+ u8 set_version;
+ u16 effects;
+ struct cxl_memdev *cxlmd;
+};
+
+/*
+ * See CXL spec rev 3.2 @8.2.10.9.11.2 Table 8-225 DDR5 ECS Control Feature
+ * Readable Attributes.
+ */
+struct cxl_ecs_fru_rd_attrbs {
+ u8 ecs_cap;
+ __le16 ecs_config;
+ u8 ecs_flags;
+} __packed;
+
+struct cxl_ecs_rd_attrbs {
+ u8 ecs_log_cap;
+ struct cxl_ecs_fru_rd_attrbs fru_attrbs[];
+} __packed;
+
+/*
+ * See CXL spec rev 3.2 @8.2.10.9.11.2 Table 8-226 DDR5 ECS Control Feature
+ * Writable Attributes.
+ */
+struct cxl_ecs_fru_wr_attrbs {
+ __le16 ecs_config;
+} __packed;
+
+struct cxl_ecs_wr_attrbs {
+ u8 ecs_log_cap;
+ struct cxl_ecs_fru_wr_attrbs fru_attrbs[];
+} __packed;
+
+#define CXL_ECS_LOG_ENTRY_TYPE_MASK GENMASK(1, 0)
+#define CXL_ECS_REALTIME_REPORT_CAP_MASK BIT(0)
+#define CXL_ECS_THRESHOLD_COUNT_MASK GENMASK(2, 0)
+#define CXL_ECS_COUNT_MODE_MASK BIT(3)
+#define CXL_ECS_RESET_COUNTER_MASK BIT(4)
+#define CXL_ECS_RESET_COUNTER 1
+
+enum {
+ ECS_THRESHOLD_256 = 256,
+ ECS_THRESHOLD_1024 = 1024,
+ ECS_THRESHOLD_4096 = 4096,
+};
+
+enum {
+ ECS_THRESHOLD_IDX_256 = 3,
+ ECS_THRESHOLD_IDX_1024 = 4,
+ ECS_THRESHOLD_IDX_4096 = 5,
+};
+
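+/* Map the ECS threshold-count encoding (an index) to a threshold value. */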
+static const u16 ecs_supp_threshold[] = {
+ [ECS_THRESHOLD_IDX_256] = 256,
+ [ECS_THRESHOLD_IDX_1024] = 1024,
+ [ECS_THRESHOLD_IDX_4096] = 4096,
+};
+
+enum {
+ ECS_LOG_ENTRY_TYPE_DRAM = 0x0,
+ ECS_LOG_ENTRY_TYPE_MEM_MEDIA_FRU = 0x1,
+};
+
+enum cxl_ecs_count_mode {
+ ECS_MODE_COUNTS_ROWS = 0,
+ ECS_MODE_COUNTS_CODEWORDS = 1,
+};
+
+static int cxl_mem_ecs_get_attrbs(struct device *dev,
+ struct cxl_ecs_context *cxl_ecs_ctx,
+ int fru_id, u8 *log_cap, u16 *config)
+{
+ struct cxl_memdev *cxlmd = cxl_ecs_ctx->cxlmd;
+ struct cxl_mailbox *cxl_mbox = &cxlmd->cxlds->cxl_mbox;
+ struct cxl_ecs_fru_rd_attrbs *fru_rd_attrbs;
+ size_t rd_data_size;
+ size_t data_size;
+
+ rd_data_size = cxl_ecs_ctx->get_feat_size;
+
+ struct cxl_ecs_rd_attrbs *rd_attrbs __free(kvfree) =
+ kvzalloc(rd_data_size, GFP_KERNEL);
+ if (!rd_attrbs)
+ return -ENOMEM;
+
+ data_size = cxl_get_feature(cxl_mbox, &CXL_FEAT_ECS_UUID,
+ CXL_GET_FEAT_SEL_CURRENT_VALUE, rd_attrbs,
+ rd_data_size, 0, NULL);
+ if (!data_size)
+ return -EIO;
+
+ fru_rd_attrbs = rd_attrbs->fru_attrbs;
+ *log_cap = rd_attrbs->ecs_log_cap;
+ *config = le16_to_cpu(fru_rd_attrbs[fru_id].ecs_config);
+
+ return 0;
+}
+
+static int cxl_mem_ecs_set_attrbs(struct device *dev,
+ struct cxl_ecs_context *cxl_ecs_ctx,
+ int fru_id, u8 log_cap, u16 config)
+{
+ struct cxl_memdev *cxlmd = cxl_ecs_ctx->cxlmd;
+ struct cxl_mailbox *cxl_mbox = &cxlmd->cxlds->cxl_mbox;
+ struct cxl_ecs_fru_rd_attrbs *fru_rd_attrbs;
+ struct cxl_ecs_fru_wr_attrbs *fru_wr_attrbs;
+ size_t rd_data_size, wr_data_size;
+ u16 num_media_frus, count;
+ size_t data_size;
+
+ num_media_frus = cxl_ecs_ctx->num_media_frus;
+ rd_data_size = cxl_ecs_ctx->get_feat_size;
+ wr_data_size = cxl_ecs_ctx->set_feat_size;
+ struct cxl_ecs_rd_attrbs *rd_attrbs __free(kvfree) =
+ kvzalloc(rd_data_size, GFP_KERNEL);
+ if (!rd_attrbs)
+ return -ENOMEM;
+
+ data_size = cxl_get_feature(cxl_mbox, &CXL_FEAT_ECS_UUID,
+ CXL_GET_FEAT_SEL_CURRENT_VALUE, rd_attrbs,
+ rd_data_size, 0, NULL);
+ if (!data_size)
+ return -EIO;
+
+ struct cxl_ecs_wr_attrbs *wr_attrbs __free(kvfree) =
+ kvzalloc(wr_data_size, GFP_KERNEL);
+ if (!wr_attrbs)
+ return -ENOMEM;
+
+ /*
+ * Fill writable attributes from the current attributes read
+ * for all the media FRUs.
+ */
+ fru_rd_attrbs = rd_attrbs->fru_attrbs;
+ fru_wr_attrbs = wr_attrbs->fru_attrbs;
+ wr_attrbs->ecs_log_cap = log_cap;
+ for (count = 0; count < num_media_frus; count++)
+ fru_wr_attrbs[count].ecs_config =
+ fru_rd_attrbs[count].ecs_config;
+
+ fru_wr_attrbs[fru_id].ecs_config = cpu_to_le16(config);
+
+ return cxl_set_feature(cxl_mbox, &CXL_FEAT_ECS_UUID,
+ cxl_ecs_ctx->set_version, wr_attrbs,
+ wr_data_size,
+ CXL_SET_FEAT_FLAG_DATA_SAVED_ACROSS_RESET,
+ 0, NULL);
+}
+
+static u8 cxl_get_ecs_log_entry_type(u8 log_cap, u16 config)
+{
+ return FIELD_GET(CXL_ECS_LOG_ENTRY_TYPE_MASK, log_cap);
+}
+
+static u16 cxl_get_ecs_threshold(u8 log_cap, u16 config)
+{
+ u8 index = FIELD_GET(CXL_ECS_THRESHOLD_COUNT_MASK, config);
+
+ return ecs_supp_threshold[index];
+}
+
+static u8 cxl_get_ecs_count_mode(u8 log_cap, u16 config)
+{
+ return FIELD_GET(CXL_ECS_COUNT_MODE_MASK, config);
+}
+
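+/* Generate getters that read the current ECS config and decode one field. */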
+#define CXL_ECS_GET_ATTR(attrb) \
+ static int cxl_ecs_get_##attrb(struct device *dev, void *drv_data, \
+ int fru_id, u32 *val) \
+ { \
+ struct cxl_ecs_context *ctx = drv_data; \
+ u8 log_cap; \
+ u16 config; \
+ int ret; \
+ \
+ ret = cxl_mem_ecs_get_attrbs(dev, ctx, fru_id, &log_cap, \
+ &config); \
+ if (ret) \
+ return ret; \
+ \
+ *val = cxl_get_ecs_##attrb(log_cap, config); \
+ \
+ return 0; \
+ }
+
+CXL_ECS_GET_ATTR(log_entry_type)
+CXL_ECS_GET_ATTR(count_mode)
+CXL_ECS_GET_ATTR(threshold)
+
+static int cxl_set_ecs_log_entry_type(struct device *dev, u8 *log_cap,
+ u16 *config, u32 val)
+{
+ if (val != ECS_LOG_ENTRY_TYPE_DRAM &&
+ val != ECS_LOG_ENTRY_TYPE_MEM_MEDIA_FRU)
+ return -EINVAL;
+
+ *log_cap = FIELD_PREP(CXL_ECS_LOG_ENTRY_TYPE_MASK, val);
+
+ return 0;
+}
+
+static int cxl_set_ecs_threshold(struct device *dev, u8 *log_cap, u16 *config,
+ u32 val)
+{
+ *config &= ~CXL_ECS_THRESHOLD_COUNT_MASK;
+
+ switch (val) {
+ case ECS_THRESHOLD_256:
+ *config |= FIELD_PREP(CXL_ECS_THRESHOLD_COUNT_MASK,
+ ECS_THRESHOLD_IDX_256);
+ break;
+ case ECS_THRESHOLD_1024:
+ *config |= FIELD_PREP(CXL_ECS_THRESHOLD_COUNT_MASK,
+ ECS_THRESHOLD_IDX_1024);
+ break;
+ case ECS_THRESHOLD_4096:
+ *config |= FIELD_PREP(CXL_ECS_THRESHOLD_COUNT_MASK,
+ ECS_THRESHOLD_IDX_4096);
+ break;
+ default:
+ dev_dbg(dev, "Invalid CXL ECS threshold count(%u) to set\n",
+ val);
+ dev_dbg(dev, "Supported ECS threshold counts: %u, %u, %u\n",
+ ECS_THRESHOLD_256, ECS_THRESHOLD_1024,
+ ECS_THRESHOLD_4096);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int cxl_set_ecs_count_mode(struct device *dev, u8 *log_cap, u16 *config,
+ u32 val)
+{
+ if (val != ECS_MODE_COUNTS_ROWS && val != ECS_MODE_COUNTS_CODEWORDS) {
+ dev_dbg(dev, "Invalid CXL ECS scrub mode(%d) to set\n", val);
+ dev_dbg(dev,
+ "Supported ECS Modes: 0: ECS counts rows with errors,"
+ " 1: ECS counts codewords with errors\n");
+ return -EINVAL;
+ }
+
+ *config &= ~CXL_ECS_COUNT_MODE_MASK;
+ *config |= FIELD_PREP(CXL_ECS_COUNT_MODE_MASK, val);
+
+ return 0;
+}
+
+static int cxl_set_ecs_reset_counter(struct device *dev, u8 *log_cap,
+ u16 *config, u32 val)
+{
+ if (val != CXL_ECS_RESET_COUNTER)
+ return -EINVAL;
+
+ *config &= ~CXL_ECS_RESET_COUNTER_MASK;
+ *config |= FIELD_PREP(CXL_ECS_RESET_COUNTER_MASK, val);
+
+ return 0;
+}
+
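+/* Generate setters that read-modify-write the ECS config for one media FRU. */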
+#define CXL_ECS_SET_ATTR(attrb) \
+ static int cxl_ecs_set_##attrb(struct device *dev, void *drv_data, \
+ int fru_id, u32 val) \
+ { \
+ struct cxl_ecs_context *ctx = drv_data; \
+ u8 log_cap; \
+ u16 config; \
+ int ret; \
+ \
+ if (!capable(CAP_SYS_RAWIO)) \
+ return -EPERM; \
+ \
+ ret = cxl_mem_ecs_get_attrbs(dev, ctx, fru_id, &log_cap, \
+ &config); \
+ if (ret) \
+ return ret; \
+ \
+ ret = cxl_set_ecs_##attrb(dev, &log_cap, &config, val); \
+ if (ret) \
+ return ret; \
+ \
+ return cxl_mem_ecs_set_attrbs(dev, ctx, fru_id, log_cap, \
+ config); \
+ }
+CXL_ECS_SET_ATTR(log_entry_type)
+CXL_ECS_SET_ATTR(count_mode)
+CXL_ECS_SET_ATTR(reset_counter)
+CXL_ECS_SET_ATTR(threshold)
+
+static const struct edac_ecs_ops cxl_ecs_ops = {
+ .get_log_entry_type = cxl_ecs_get_log_entry_type,
+ .set_log_entry_type = cxl_ecs_set_log_entry_type,
+ .get_mode = cxl_ecs_get_count_mode,
+ .set_mode = cxl_ecs_set_count_mode,
+ .reset = cxl_ecs_set_reset_counter,
+ .get_threshold = cxl_ecs_get_threshold,
+ .set_threshold = cxl_ecs_set_threshold,
+};
+
+static int cxl_memdev_ecs_init(struct cxl_memdev *cxlmd,
+ struct edac_dev_feature *ras_feature)
+{
+ struct cxl_ecs_context *cxl_ecs_ctx;
+ struct cxl_feat_entry *feat_entry;
+ int num_media_frus;
+
+ feat_entry =
+ cxl_feature_info(to_cxlfs(cxlmd->cxlds), &CXL_FEAT_ECS_UUID);
+ if (IS_ERR(feat_entry))
+ return -EOPNOTSUPP;
+
+ if (!(le32_to_cpu(feat_entry->flags) & CXL_FEATURE_F_CHANGEABLE))
+ return -EOPNOTSUPP;
+
+ num_media_frus = (le16_to_cpu(feat_entry->get_feat_size) -
+ sizeof(struct cxl_ecs_rd_attrbs)) /
+ sizeof(struct cxl_ecs_fru_rd_attrbs);
+ if (!num_media_frus)
+ return -EOPNOTSUPP;
+
+ cxl_ecs_ctx =
+ devm_kzalloc(&cxlmd->dev, sizeof(*cxl_ecs_ctx), GFP_KERNEL);
+ if (!cxl_ecs_ctx)
+ return -ENOMEM;
+
+ *cxl_ecs_ctx = (struct cxl_ecs_context){
+ .get_feat_size = le16_to_cpu(feat_entry->get_feat_size),
+ .set_feat_size = le16_to_cpu(feat_entry->set_feat_size),
+ .get_version = feat_entry->get_feat_ver,
+ .set_version = feat_entry->set_feat_ver,
+ .effects = le16_to_cpu(feat_entry->effects),
+ .num_media_frus = num_media_frus,
+ .cxlmd = cxlmd,
+ };
+
+ ras_feature->ft_type = RAS_FEAT_ECS;
+ ras_feature->ecs_ops = &cxl_ecs_ops;
+ ras_feature->ctx = cxl_ecs_ctx;
+ ras_feature->ecs_info.num_media_frus = num_media_frus;
+
+ return 0;
+}
+
+/*
+ * Perform Maintenance CXL 3.2 Spec 8.2.10.7.1
+ */
+
+/*
+ * Perform Maintenance input payload
+ * CXL rev 3.2 section 8.2.10.7.1 Table 8-117
+ */
+struct cxl_mbox_maintenance_hdr {
+ u8 op_class;
+ u8 op_subclass;
+} __packed;
+
+static int cxl_perform_maintenance(struct cxl_mailbox *cxl_mbox, u8 class,
+ u8 subclass, void *data_in,
+ size_t data_in_size)
+{
+ struct cxl_memdev_maintenance_pi {
+ struct cxl_mbox_maintenance_hdr hdr;
+ u8 data[];
+ } __packed;
+ struct cxl_mbox_cmd mbox_cmd;
+ size_t hdr_size;
+
+ struct cxl_memdev_maintenance_pi *pi __free(kvfree) =
+ kvzalloc(cxl_mbox->payload_size, GFP_KERNEL);
+ if (!pi)
+ return -ENOMEM;
+
+ pi->hdr.op_class = class;
+ pi->hdr.op_subclass = subclass;
+ hdr_size = sizeof(pi->hdr);
+	/*
+	 * Check that the mbox payload is large enough for the
+	 * maintenance data transfer.
+	 */
+ if (hdr_size + data_in_size > cxl_mbox->payload_size)
+ return -ENOMEM;
+
+ memcpy(pi->data, data_in, data_in_size);
+ mbox_cmd = (struct cxl_mbox_cmd){
+ .opcode = CXL_MBOX_OP_DO_MAINTENANCE,
+ .size_in = hdr_size + data_in_size,
+ .payload_in = pi,
+ };
+
+ return cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
+}
+
+/*
+ * Support for determining whether a memory operation's attributes
+ * originate from the current boot.
+ */
+
+struct cxl_mem_err_rec {
+ struct xarray rec_gen_media;
+ struct xarray rec_dram;
+};
+
+enum cxl_mem_repair_type {
+ CXL_PPR,
+ CXL_CACHELINE_SPARING,
+ CXL_ROW_SPARING,
+ CXL_BANK_SPARING,
+ CXL_RANK_SPARING,
+ CXL_REPAIR_MAX,
+};
+
+/**
+ * struct cxl_mem_repair_attrbs - CXL memory repair attributes
+ * @dpa: DPA of memory to repair
+ * @nibble_mask: nibble mask, identifies one or more nibbles on the memory bus
+ * @row: row of memory to repair
+ * @column: column of memory to repair
+ * @channel: channel of memory to repair
+ * @sub_channel: sub channel of memory to repair
+ * @rank: rank of memory to repair
+ * @bank_group: bank group of memory to repair
+ * @bank: bank of memory to repair
+ * @repair_type: repair type, e.g. PPR or memory sparing.
+ */
+struct cxl_mem_repair_attrbs {
+ u64 dpa;
+ u32 nibble_mask;
+ u32 row;
+ u16 column;
+ u8 channel;
+ u8 sub_channel;
+ u8 rank;
+ u8 bank_group;
+ u8 bank;
+ enum cxl_mem_repair_type repair_type;
+};
+
+static struct cxl_event_gen_media *
+cxl_find_rec_gen_media(struct cxl_memdev *cxlmd,
+ struct cxl_mem_repair_attrbs *attrbs)
+{
+ struct cxl_mem_err_rec *array_rec = cxlmd->err_rec_array;
+ struct cxl_event_gen_media *rec;
+
+ if (!array_rec)
+ return NULL;
+
+ rec = xa_load(&array_rec->rec_gen_media, attrbs->dpa);
+ if (!rec)
+ return NULL;
+
+ if (attrbs->repair_type == CXL_PPR)
+ return rec;
+
+ return NULL;
+}
+
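+/*
+ * Match a cached DRAM event record at the requested DPA against the repair
+ * attributes; each repair type requires a different set of valid fields.
+ */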
+static struct cxl_event_dram *
+cxl_find_rec_dram(struct cxl_memdev *cxlmd,
+ struct cxl_mem_repair_attrbs *attrbs)
+{
+ struct cxl_mem_err_rec *array_rec = cxlmd->err_rec_array;
+ struct cxl_event_dram *rec;
+ u16 validity_flags;
+
+ if (!array_rec)
+ return NULL;
+
+ rec = xa_load(&array_rec->rec_dram, attrbs->dpa);
+ if (!rec)
+ return NULL;
+
+ validity_flags = get_unaligned_le16(rec->media_hdr.validity_flags);
+ if (!(validity_flags & CXL_DER_VALID_CHANNEL) ||
+ !(validity_flags & CXL_DER_VALID_RANK))
+ return NULL;
+
+ switch (attrbs->repair_type) {
+ case CXL_PPR:
+ if (!(validity_flags & CXL_DER_VALID_NIBBLE) ||
+ get_unaligned_le24(rec->nibble_mask) == attrbs->nibble_mask)
+ return rec;
+ break;
+ case CXL_CACHELINE_SPARING:
+ if (!(validity_flags & CXL_DER_VALID_BANK_GROUP) ||
+ !(validity_flags & CXL_DER_VALID_BANK) ||
+ !(validity_flags & CXL_DER_VALID_ROW) ||
+ !(validity_flags & CXL_DER_VALID_COLUMN))
+ return NULL;
+
+ if (rec->media_hdr.channel == attrbs->channel &&
+ rec->media_hdr.rank == attrbs->rank &&
+ rec->bank_group == attrbs->bank_group &&
+ rec->bank == attrbs->bank &&
+ get_unaligned_le24(rec->row) == attrbs->row &&
+ get_unaligned_le16(rec->column) == attrbs->column &&
+ (!(validity_flags & CXL_DER_VALID_NIBBLE) ||
+ get_unaligned_le24(rec->nibble_mask) ==
+ attrbs->nibble_mask) &&
+ (!(validity_flags & CXL_DER_VALID_SUB_CHANNEL) ||
+ rec->sub_channel == attrbs->sub_channel))
+ return rec;
+ break;
+ case CXL_ROW_SPARING:
+ if (!(validity_flags & CXL_DER_VALID_BANK_GROUP) ||
+ !(validity_flags & CXL_DER_VALID_BANK) ||
+ !(validity_flags & CXL_DER_VALID_ROW))
+ return NULL;
+
+ if (rec->media_hdr.channel == attrbs->channel &&
+ rec->media_hdr.rank == attrbs->rank &&
+ rec->bank_group == attrbs->bank_group &&
+ rec->bank == attrbs->bank &&
+ get_unaligned_le24(rec->row) == attrbs->row &&
+ (!(validity_flags & CXL_DER_VALID_NIBBLE) ||
+ get_unaligned_le24(rec->nibble_mask) ==
+ attrbs->nibble_mask))
+ return rec;
+ break;
+ case CXL_BANK_SPARING:
+ if (!(validity_flags & CXL_DER_VALID_BANK_GROUP) ||
+ !(validity_flags & CXL_DER_VALID_BANK))
+ return NULL;
+
+ if (rec->media_hdr.channel == attrbs->channel &&
+ rec->media_hdr.rank == attrbs->rank &&
+ rec->bank_group == attrbs->bank_group &&
+ rec->bank == attrbs->bank &&
+ (!(validity_flags & CXL_DER_VALID_NIBBLE) ||
+ get_unaligned_le24(rec->nibble_mask) ==
+ attrbs->nibble_mask))
+ return rec;
+ break;
+ case CXL_RANK_SPARING:
+ if (rec->media_hdr.channel == attrbs->channel &&
+ rec->media_hdr.rank == attrbs->rank &&
+ (!(validity_flags & CXL_DER_VALID_NIBBLE) ||
+ get_unaligned_le24(rec->nibble_mask) ==
+ attrbs->nibble_mask))
+ return rec;
+ break;
+ default:
+ return NULL;
+ }
+
+ return NULL;
+}
+
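+/*
+ * Cached event records older than this, relative to the newest record,
+ * are dropped.
+ */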
+#define CXL_MAX_STORAGE_DAYS 10
+#define CXL_MAX_STORAGE_TIME_SECS (CXL_MAX_STORAGE_DAYS * 24 * 60 * 60)
+
+static void cxl_del_expired_gmedia_recs(struct xarray *rec_xarray,
+ struct cxl_event_gen_media *cur_rec)
+{
+ u64 cur_ts = le64_to_cpu(cur_rec->media_hdr.hdr.timestamp);
+ struct cxl_event_gen_media *rec;
+ unsigned long index;
+ u64 delta_ts_secs;
+
+ xa_for_each(rec_xarray, index, rec) {
+ delta_ts_secs = (cur_ts -
+ le64_to_cpu(rec->media_hdr.hdr.timestamp)) / 1000000000ULL;
+ if (delta_ts_secs >= CXL_MAX_STORAGE_TIME_SECS) {
+ xa_erase(rec_xarray, index);
+ kfree(rec);
+ }
+ }
+}
+
+static void cxl_del_expired_dram_recs(struct xarray *rec_xarray,
+ struct cxl_event_dram *cur_rec)
+{
+ u64 cur_ts = le64_to_cpu(cur_rec->media_hdr.hdr.timestamp);
+ struct cxl_event_dram *rec;
+ unsigned long index;
+ u64 delta_secs;
+
+ xa_for_each(rec_xarray, index, rec) {
+ delta_secs = (cur_ts -
+ le64_to_cpu(rec->media_hdr.hdr.timestamp)) / 1000000000ULL;
+ if (delta_secs >= CXL_MAX_STORAGE_TIME_SECS) {
+ xa_erase(rec_xarray, index);
+ kfree(rec);
+ }
+ }
+}
+
+#define CXL_MAX_REC_STORAGE_COUNT 200
+
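+/*
+ * Once the cache exceeds CXL_MAX_REC_STORAGE_COUNT, drop entries in index
+ * order until it is back under the limit.
+ */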
+static void cxl_del_overflow_old_recs(struct xarray *rec_xarray)
+{
+ void *err_rec;
+ unsigned long index, count = 0;
+
+ xa_for_each(rec_xarray, index, err_rec)
+ count++;
+
+ if (count <= CXL_MAX_REC_STORAGE_COUNT)
+ return;
+
+ count -= CXL_MAX_REC_STORAGE_COUNT;
+ xa_for_each(rec_xarray, index, err_rec) {
+ xa_erase(rec_xarray, index);
+ kfree(err_rec);
+ count--;
+ if (!count)
+ break;
+ }
+}
+
+int cxl_store_rec_gen_media(struct cxl_memdev *cxlmd, union cxl_event *evt)
+{
+ struct cxl_mem_err_rec *array_rec = cxlmd->err_rec_array;
+ struct cxl_event_gen_media *rec;
+ void *old_rec;
+
+ if (!IS_ENABLED(CONFIG_CXL_EDAC_MEM_REPAIR) || !array_rec)
+ return 0;
+
+ rec = kmemdup(&evt->gen_media, sizeof(*rec), GFP_KERNEL);
+ if (!rec)
+ return -ENOMEM;
+
+ old_rec = xa_store(&array_rec->rec_gen_media,
+ le64_to_cpu(rec->media_hdr.phys_addr), rec,
+ GFP_KERNEL);
+ if (xa_is_err(old_rec)) {
+ kfree(rec);
+ return xa_err(old_rec);
+ }
+
+ kfree(old_rec);
+
+ cxl_del_expired_gmedia_recs(&array_rec->rec_gen_media, rec);
+ cxl_del_overflow_old_recs(&array_rec->rec_gen_media);
+
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_store_rec_gen_media, "CXL");
+
+int cxl_store_rec_dram(struct cxl_memdev *cxlmd, union cxl_event *evt)
+{
+ struct cxl_mem_err_rec *array_rec = cxlmd->err_rec_array;
+ struct cxl_event_dram *rec;
+ void *old_rec;
+
+ if (!IS_ENABLED(CONFIG_CXL_EDAC_MEM_REPAIR) || !array_rec)
+ return 0;
+
+ rec = kmemdup(&evt->dram, sizeof(*rec), GFP_KERNEL);
+ if (!rec)
+ return -ENOMEM;
+
+ old_rec = xa_store(&array_rec->rec_dram,
+ le64_to_cpu(rec->media_hdr.phys_addr), rec,
+ GFP_KERNEL);
+ if (xa_is_err(old_rec)) {
+ kfree(rec);
+ return xa_err(old_rec);
+ }
+
+ kfree(old_rec);
+
+ cxl_del_expired_dram_recs(&array_rec->rec_dram, rec);
+ cxl_del_overflow_old_recs(&array_rec->rec_dram);
+
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_store_rec_dram, "CXL");
+
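+/* A memdev is treated as online if its endpoint port has committed decoders. */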
+static bool cxl_is_memdev_memory_online(const struct cxl_memdev *cxlmd)
+{
+ struct cxl_port *port = cxlmd->endpoint;
+
+ if (port && cxl_num_decoders_committed(port))
+ return true;
+
+ return false;
+}
+
+/*
+ * CXL memory sparing control
+ */
+enum cxl_mem_sparing_granularity {
+ CXL_MEM_SPARING_CACHELINE,
+ CXL_MEM_SPARING_ROW,
+ CXL_MEM_SPARING_BANK,
+ CXL_MEM_SPARING_RANK,
+ CXL_MEM_SPARING_MAX
+};
+
+struct cxl_mem_sparing_context {
+ struct cxl_memdev *cxlmd;
+ uuid_t repair_uuid;
+ u16 get_feat_size;
+ u16 set_feat_size;
+ u16 effects;
+ u8 instance;
+ u8 get_version;
+ u8 set_version;
+ u8 op_class;
+ u8 op_subclass;
+ bool cap_safe_when_in_use;
+ bool cap_hard_sparing;
+ bool cap_soft_sparing;
+ u8 channel;
+ u8 rank;
+ u8 bank_group;
+ u32 nibble_mask;
+ u64 dpa;
+ u32 row;
+ u16 column;
+ u8 bank;
+ u8 sub_channel;
+ enum edac_mem_repair_type repair_type;
+ bool persist_mode;
+};
+
+#define CXL_SPARING_RD_CAP_SAFE_IN_USE_MASK BIT(0)
+#define CXL_SPARING_RD_CAP_HARD_SPARING_MASK BIT(1)
+#define CXL_SPARING_RD_CAP_SOFT_SPARING_MASK BIT(2)
+
+#define CXL_SPARING_WR_DEVICE_INITIATED_MASK BIT(0)
+
+#define CXL_SPARING_QUERY_RESOURCE_FLAG BIT(0)
+#define CXL_SET_HARD_SPARING_FLAG BIT(1)
+#define CXL_SPARING_SUB_CHNL_VALID_FLAG BIT(2)
+#define CXL_SPARING_NIB_MASK_VALID_FLAG BIT(3)
+
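+/* Restriction flag bit 0 is inverted when deriving cap_safe_when_in_use. */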
+#define CXL_GET_SPARING_SAFE_IN_USE(flags) \
+ (FIELD_GET(CXL_SPARING_RD_CAP_SAFE_IN_USE_MASK, \
+ flags) ^ 1)
+#define CXL_GET_CAP_HARD_SPARING(flags) \
+ FIELD_GET(CXL_SPARING_RD_CAP_HARD_SPARING_MASK, \
+ flags)
+#define CXL_GET_CAP_SOFT_SPARING(flags) \
+ FIELD_GET(CXL_SPARING_RD_CAP_SOFT_SPARING_MASK, \
+ flags)
+
+#define CXL_SET_SPARING_QUERY_RESOURCE(val) \
+ FIELD_PREP(CXL_SPARING_QUERY_RESOURCE_FLAG, val)
+#define CXL_SET_HARD_SPARING(val) \
+ FIELD_PREP(CXL_SET_HARD_SPARING_FLAG, val)
+#define CXL_SET_SPARING_SUB_CHNL_VALID(val) \
+ FIELD_PREP(CXL_SPARING_SUB_CHNL_VALID_FLAG, val)
+#define CXL_SET_SPARING_NIB_MASK_VALID(val) \
+ FIELD_PREP(CXL_SPARING_NIB_MASK_VALID_FLAG, val)
+
+/*
+ * See CXL spec rev 3.2 @8.2.10.7.2.3 Table 8-134 Memory Sparing Feature
+ * Readable Attributes.
+ */
+struct cxl_memdev_repair_rd_attrbs_hdr {
+ u8 max_op_latency;
+ __le16 op_cap;
+ __le16 op_mode;
+ u8 op_class;
+ u8 op_subclass;
+ u8 rsvd[9];
+} __packed;
+
+struct cxl_memdev_sparing_rd_attrbs {
+ struct cxl_memdev_repair_rd_attrbs_hdr hdr;
+ u8 rsvd;
+ __le16 restriction_flags;
+} __packed;
+
+/*
+ * See CXL spec rev 3.2 @8.2.10.7.1.4 Table 8-120 Memory Sparing Input Payload.
+ */
+struct cxl_memdev_sparing_in_payload {
+ u8 flags;
+ u8 channel;
+ u8 rank;
+ u8 nibble_mask[3];
+ u8 bank_group;
+ u8 bank;
+ u8 row[3];
+ __le16 column;
+ u8 sub_channel;
+} __packed;
+
+static int
+cxl_mem_sparing_get_attrbs(struct cxl_mem_sparing_context *cxl_sparing_ctx)
+{
+ size_t rd_data_size = sizeof(struct cxl_memdev_sparing_rd_attrbs);
+ struct cxl_memdev *cxlmd = cxl_sparing_ctx->cxlmd;
+ struct cxl_mailbox *cxl_mbox = &cxlmd->cxlds->cxl_mbox;
+ u16 restriction_flags;
+ size_t data_size;
+ u16 return_code;
+ struct cxl_memdev_sparing_rd_attrbs *rd_attrbs __free(kfree) =
+ kzalloc(rd_data_size, GFP_KERNEL);
+ if (!rd_attrbs)
+ return -ENOMEM;
+
+ data_size = cxl_get_feature(cxl_mbox, &cxl_sparing_ctx->repair_uuid,
+ CXL_GET_FEAT_SEL_CURRENT_VALUE, rd_attrbs,
+ rd_data_size, 0, &return_code);
+ if (!data_size)
+ return -EIO;
+
+ cxl_sparing_ctx->op_class = rd_attrbs->hdr.op_class;
+ cxl_sparing_ctx->op_subclass = rd_attrbs->hdr.op_subclass;
+ restriction_flags = le16_to_cpu(rd_attrbs->restriction_flags);
+ cxl_sparing_ctx->cap_safe_when_in_use =
+ CXL_GET_SPARING_SAFE_IN_USE(restriction_flags);
+ cxl_sparing_ctx->cap_hard_sparing =
+ CXL_GET_CAP_HARD_SPARING(restriction_flags);
+ cxl_sparing_ctx->cap_soft_sparing =
+ CXL_GET_CAP_SOFT_SPARING(restriction_flags);
+
+ return 0;
+}
+
+static struct cxl_event_dram *
+cxl_mem_get_rec_dram(struct cxl_memdev *cxlmd,
+ struct cxl_mem_sparing_context *ctx)
+{
+ struct cxl_mem_repair_attrbs attrbs = { 0 };
+
+ attrbs.dpa = ctx->dpa;
+ attrbs.channel = ctx->channel;
+ attrbs.rank = ctx->rank;
+ attrbs.nibble_mask = ctx->nibble_mask;
+ switch (ctx->repair_type) {
+ case EDAC_REPAIR_CACHELINE_SPARING:
+ attrbs.repair_type = CXL_CACHELINE_SPARING;
+ attrbs.bank_group = ctx->bank_group;
+ attrbs.bank = ctx->bank;
+ attrbs.row = ctx->row;
+ attrbs.column = ctx->column;
+ attrbs.sub_channel = ctx->sub_channel;
+ break;
+ case EDAC_REPAIR_ROW_SPARING:
+ attrbs.repair_type = CXL_ROW_SPARING;
+ attrbs.bank_group = ctx->bank_group;
+ attrbs.bank = ctx->bank;
+ attrbs.row = ctx->row;
+ break;
+ case EDAC_REPAIR_BANK_SPARING:
+ attrbs.repair_type = CXL_BANK_SPARING;
+ attrbs.bank_group = ctx->bank_group;
+ attrbs.bank = ctx->bank;
+ break;
+ case EDAC_REPAIR_RANK_SPARING:
+ attrbs.repair_type = CXL_RANK_SPARING;
+ break;
+ default:
+ return NULL;
+ }
+
+ return cxl_find_rec_dram(cxlmd, &attrbs);
+}
+
+static int
+cxl_mem_perform_sparing(struct device *dev,
+ struct cxl_mem_sparing_context *cxl_sparing_ctx)
+{
+ struct cxl_memdev *cxlmd = cxl_sparing_ctx->cxlmd;
+ struct cxl_memdev_sparing_in_payload sparing_pi;
+ struct cxl_event_dram *rec = NULL;
+ u16 validity_flags = 0;
+ int ret;
+
+ ACQUIRE(rwsem_read_intr, region_rwsem)(&cxl_rwsem.region);
+ if ((ret = ACQUIRE_ERR(rwsem_read_intr, &region_rwsem)))
+ return ret;
+
+ ACQUIRE(rwsem_read_intr, dpa_rwsem)(&cxl_rwsem.dpa);
+ if ((ret = ACQUIRE_ERR(rwsem_read_intr, &dpa_rwsem)))
+ return ret;
+
+ if (!cxl_sparing_ctx->cap_safe_when_in_use) {
+ /* Memory to repair must be offline */
+ if (cxl_is_memdev_memory_online(cxlmd))
+ return -EBUSY;
+ } else {
+ if (cxl_is_memdev_memory_online(cxlmd)) {
+ rec = cxl_mem_get_rec_dram(cxlmd, cxl_sparing_ctx);
+ if (!rec)
+ return -EINVAL;
+
+ if (!get_unaligned_le16(rec->media_hdr.validity_flags))
+ return -EINVAL;
+ }
+ }
+
+ memset(&sparing_pi, 0, sizeof(sparing_pi));
+ sparing_pi.flags = CXL_SET_SPARING_QUERY_RESOURCE(0);
+ if (cxl_sparing_ctx->persist_mode)
+ sparing_pi.flags |= CXL_SET_HARD_SPARING(1);
+
+ if (rec)
+ validity_flags = get_unaligned_le16(rec->media_hdr.validity_flags);
+
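+	/*
+	 * Finer repair granularities are supersets of coarser ones; rely on
+	 * fallthrough to fill the fields they share.
+	 */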
+ switch (cxl_sparing_ctx->repair_type) {
+ case EDAC_REPAIR_CACHELINE_SPARING:
+ sparing_pi.column = cpu_to_le16(cxl_sparing_ctx->column);
+ if (!rec || (validity_flags & CXL_DER_VALID_SUB_CHANNEL)) {
+ sparing_pi.flags |= CXL_SET_SPARING_SUB_CHNL_VALID(1);
+ sparing_pi.sub_channel = cxl_sparing_ctx->sub_channel;
+ }
+ fallthrough;
+ case EDAC_REPAIR_ROW_SPARING:
+ put_unaligned_le24(cxl_sparing_ctx->row, sparing_pi.row);
+ fallthrough;
+ case EDAC_REPAIR_BANK_SPARING:
+ sparing_pi.bank_group = cxl_sparing_ctx->bank_group;
+ sparing_pi.bank = cxl_sparing_ctx->bank;
+ fallthrough;
+ case EDAC_REPAIR_RANK_SPARING:
+ sparing_pi.rank = cxl_sparing_ctx->rank;
+ fallthrough;
+ default:
+ sparing_pi.channel = cxl_sparing_ctx->channel;
+ if ((rec && (validity_flags & CXL_DER_VALID_NIBBLE)) ||
+ (!rec && (!cxl_sparing_ctx->nibble_mask ||
+ (cxl_sparing_ctx->nibble_mask & 0xFFFFFF)))) {
+ sparing_pi.flags |= CXL_SET_SPARING_NIB_MASK_VALID(1);
+ put_unaligned_le24(cxl_sparing_ctx->nibble_mask,
+ sparing_pi.nibble_mask);
+ }
+ break;
+ }
+
+ return cxl_perform_maintenance(&cxlmd->cxlds->cxl_mbox,
+ cxl_sparing_ctx->op_class,
+ cxl_sparing_ctx->op_subclass,
+ &sparing_pi, sizeof(sparing_pi));
+}
+
+static int cxl_mem_sparing_get_repair_type(struct device *dev, void *drv_data,
+ const char **repair_type)
+{
+ struct cxl_mem_sparing_context *ctx = drv_data;
+
+ switch (ctx->repair_type) {
+ case EDAC_REPAIR_CACHELINE_SPARING:
+ case EDAC_REPAIR_ROW_SPARING:
+ case EDAC_REPAIR_BANK_SPARING:
+ case EDAC_REPAIR_RANK_SPARING:
+ *repair_type = edac_repair_type[ctx->repair_type];
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+#define CXL_SPARING_GET_ATTR(attrb, data_type) \
+ static int cxl_mem_sparing_get_##attrb( \
+ struct device *dev, void *drv_data, data_type *val) \
+ { \
+ struct cxl_mem_sparing_context *ctx = drv_data; \
+ \
+ *val = ctx->attrb; \
+ \
+ return 0; \
+ }
+CXL_SPARING_GET_ATTR(persist_mode, bool)
+CXL_SPARING_GET_ATTR(dpa, u64)
+CXL_SPARING_GET_ATTR(nibble_mask, u32)
+CXL_SPARING_GET_ATTR(bank_group, u32)
+CXL_SPARING_GET_ATTR(bank, u32)
+CXL_SPARING_GET_ATTR(rank, u32)
+CXL_SPARING_GET_ATTR(row, u32)
+CXL_SPARING_GET_ATTR(column, u32)
+CXL_SPARING_GET_ATTR(channel, u32)
+CXL_SPARING_GET_ATTR(sub_channel, u32)
+
+#define CXL_SPARING_SET_ATTR(attrb, data_type) \
+ static int cxl_mem_sparing_set_##attrb(struct device *dev, \
+ void *drv_data, data_type val) \
+ { \
+ struct cxl_mem_sparing_context *ctx = drv_data; \
+ \
+ ctx->attrb = val; \
+ \
+ return 0; \
+ }
+CXL_SPARING_SET_ATTR(nibble_mask, u32)
+CXL_SPARING_SET_ATTR(bank_group, u32)
+CXL_SPARING_SET_ATTR(bank, u32)
+CXL_SPARING_SET_ATTR(rank, u32)
+CXL_SPARING_SET_ATTR(row, u32)
+CXL_SPARING_SET_ATTR(column, u32)
+CXL_SPARING_SET_ATTR(channel, u32)
+CXL_SPARING_SET_ATTR(sub_channel, u32)
+
+static int cxl_mem_sparing_set_persist_mode(struct device *dev, void *drv_data,
+ bool persist_mode)
+{
+ struct cxl_mem_sparing_context *ctx = drv_data;
+
+ if ((persist_mode && ctx->cap_hard_sparing) ||
+ (!persist_mode && ctx->cap_soft_sparing))
+ ctx->persist_mode = persist_mode;
+ else
+ return -EOPNOTSUPP;
+
+ return 0;
+}
+
+static int cxl_get_mem_sparing_safe_when_in_use(struct device *dev,
+ void *drv_data, bool *safe)
+{
+ struct cxl_mem_sparing_context *ctx = drv_data;
+
+ *safe = ctx->cap_safe_when_in_use;
+
+ return 0;
+}
+
+static int cxl_mem_sparing_get_min_dpa(struct device *dev, void *drv_data,
+ u64 *min_dpa)
+{
+ struct cxl_mem_sparing_context *ctx = drv_data;
+ struct cxl_memdev *cxlmd = ctx->cxlmd;
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+
+ *min_dpa = cxlds->dpa_res.start;
+
+ return 0;
+}
+
+static int cxl_mem_sparing_get_max_dpa(struct device *dev, void *drv_data,
+ u64 *max_dpa)
+{
+ struct cxl_mem_sparing_context *ctx = drv_data;
+ struct cxl_memdev *cxlmd = ctx->cxlmd;
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+
+ *max_dpa = cxlds->dpa_res.end;
+
+ return 0;
+}
+
+static int cxl_mem_sparing_set_dpa(struct device *dev, void *drv_data, u64 dpa)
+{
+ struct cxl_mem_sparing_context *ctx = drv_data;
+ struct cxl_memdev *cxlmd = ctx->cxlmd;
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+
+ if (!cxl_resource_contains_addr(&cxlds->dpa_res, dpa))
+ return -EINVAL;
+
+ ctx->dpa = dpa;
+
+ return 0;
+}
+
+static int cxl_do_mem_sparing(struct device *dev, void *drv_data, u32 val)
+{
+ struct cxl_mem_sparing_context *ctx = drv_data;
+
+ if (val != EDAC_DO_MEM_REPAIR)
+ return -EINVAL;
+
+ return cxl_mem_perform_sparing(dev, ctx);
+}
+
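+/*
+ * Build the sparing ops tables incrementally: each finer granularity adds
+ * its attributes on top of the coarser one.
+ */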
+#define RANK_OPS \
+ .get_repair_type = cxl_mem_sparing_get_repair_type, \
+ .get_persist_mode = cxl_mem_sparing_get_persist_mode, \
+ .set_persist_mode = cxl_mem_sparing_set_persist_mode, \
+ .get_repair_safe_when_in_use = cxl_get_mem_sparing_safe_when_in_use, \
+ .get_min_dpa = cxl_mem_sparing_get_min_dpa, \
+ .get_max_dpa = cxl_mem_sparing_get_max_dpa, \
+ .get_dpa = cxl_mem_sparing_get_dpa, \
+ .set_dpa = cxl_mem_sparing_set_dpa, \
+ .get_nibble_mask = cxl_mem_sparing_get_nibble_mask, \
+ .set_nibble_mask = cxl_mem_sparing_set_nibble_mask, \
+ .get_rank = cxl_mem_sparing_get_rank, \
+ .set_rank = cxl_mem_sparing_set_rank, \
+ .get_channel = cxl_mem_sparing_get_channel, \
+ .set_channel = cxl_mem_sparing_set_channel, \
+ .do_repair = cxl_do_mem_sparing
+
+#define BANK_OPS \
+ RANK_OPS, .get_bank_group = cxl_mem_sparing_get_bank_group, \
+ .set_bank_group = cxl_mem_sparing_set_bank_group, \
+ .get_bank = cxl_mem_sparing_get_bank, \
+ .set_bank = cxl_mem_sparing_set_bank
+
+#define ROW_OPS \
+ BANK_OPS, .get_row = cxl_mem_sparing_get_row, \
+ .set_row = cxl_mem_sparing_set_row
+
+#define CACHELINE_OPS \
+ ROW_OPS, .get_column = cxl_mem_sparing_get_column, \
+ .set_column = cxl_mem_sparing_set_column, \
+ .get_sub_channel = cxl_mem_sparing_get_sub_channel, \
+ .set_sub_channel = cxl_mem_sparing_set_sub_channel
+
+static const struct edac_mem_repair_ops cxl_rank_sparing_ops = {
+ RANK_OPS,
+};
+
+static const struct edac_mem_repair_ops cxl_bank_sparing_ops = {
+ BANK_OPS,
+};
+
+static const struct edac_mem_repair_ops cxl_row_sparing_ops = {
+ ROW_OPS,
+};
+
+static const struct edac_mem_repair_ops cxl_cacheline_sparing_ops = {
+ CACHELINE_OPS,
+};
+
+struct cxl_mem_sparing_desc {
+ const uuid_t repair_uuid;
+ enum edac_mem_repair_type repair_type;
+ const struct edac_mem_repair_ops *repair_ops;
+};
+
+static const struct cxl_mem_sparing_desc mem_sparing_desc[] = {
+ {
+ .repair_uuid = CXL_FEAT_CACHELINE_SPARING_UUID,
+ .repair_type = EDAC_REPAIR_CACHELINE_SPARING,
+ .repair_ops = &cxl_cacheline_sparing_ops,
+ },
+ {
+ .repair_uuid = CXL_FEAT_ROW_SPARING_UUID,
+ .repair_type = EDAC_REPAIR_ROW_SPARING,
+ .repair_ops = &cxl_row_sparing_ops,
+ },
+ {
+ .repair_uuid = CXL_FEAT_BANK_SPARING_UUID,
+ .repair_type = EDAC_REPAIR_BANK_SPARING,
+ .repair_ops = &cxl_bank_sparing_ops,
+ },
+ {
+ .repair_uuid = CXL_FEAT_RANK_SPARING_UUID,
+ .repair_type = EDAC_REPAIR_RANK_SPARING,
+ .repair_ops = &cxl_rank_sparing_ops,
+ },
+};
+
+static int cxl_memdev_sparing_init(struct cxl_memdev *cxlmd,
+ struct edac_dev_feature *ras_feature,
+ const struct cxl_mem_sparing_desc *desc,
+ u8 repair_inst)
+{
+ struct cxl_mem_sparing_context *cxl_sparing_ctx;
+ struct cxl_feat_entry *feat_entry;
+ int ret;
+
+ feat_entry = cxl_feature_info(to_cxlfs(cxlmd->cxlds),
+ &desc->repair_uuid);
+ if (IS_ERR(feat_entry))
+ return -EOPNOTSUPP;
+
+ if (!(le32_to_cpu(feat_entry->flags) & CXL_FEATURE_F_CHANGEABLE))
+ return -EOPNOTSUPP;
+
+ cxl_sparing_ctx = devm_kzalloc(&cxlmd->dev, sizeof(*cxl_sparing_ctx),
+ GFP_KERNEL);
+ if (!cxl_sparing_ctx)
+ return -ENOMEM;
+
+ *cxl_sparing_ctx = (struct cxl_mem_sparing_context){
+ .get_feat_size = le16_to_cpu(feat_entry->get_feat_size),
+ .set_feat_size = le16_to_cpu(feat_entry->set_feat_size),
+ .get_version = feat_entry->get_feat_ver,
+ .set_version = feat_entry->set_feat_ver,
+ .effects = le16_to_cpu(feat_entry->effects),
+ .cxlmd = cxlmd,
+ .repair_type = desc->repair_type,
+ .instance = repair_inst++,
+ };
+ uuid_copy(&cxl_sparing_ctx->repair_uuid, &desc->repair_uuid);
+
+ ret = cxl_mem_sparing_get_attrbs(cxl_sparing_ctx);
+ if (ret)
+ return ret;
+
+ if ((cxl_sparing_ctx->cap_soft_sparing &&
+ cxl_sparing_ctx->cap_hard_sparing) ||
+ cxl_sparing_ctx->cap_soft_sparing)
+ cxl_sparing_ctx->persist_mode = 0;
+ else if (cxl_sparing_ctx->cap_hard_sparing)
+ cxl_sparing_ctx->persist_mode = 1;
+ else
+ return -EOPNOTSUPP;
+
+ ras_feature->ft_type = RAS_FEAT_MEM_REPAIR;
+ ras_feature->instance = cxl_sparing_ctx->instance;
+ ras_feature->mem_repair_ops = desc->repair_ops;
+ ras_feature->ctx = cxl_sparing_ctx;
+
+ return 0;
+}
+
+/*
+ * CXL memory soft PPR & hard PPR control
+ */
+struct cxl_ppr_context {
+ uuid_t repair_uuid;
+ u8 instance;
+ u16 get_feat_size;
+ u16 set_feat_size;
+ u8 get_version;
+ u8 set_version;
+ u16 effects;
+ u8 op_class;
+ u8 op_subclass;
+ bool cap_dpa;
+ bool cap_nib_mask;
+ bool media_accessible;
+ bool data_retained;
+ struct cxl_memdev *cxlmd;
+ enum edac_mem_repair_type repair_type;
+ bool persist_mode;
+ u64 dpa;
+ u32 nibble_mask;
+};
+
+/*
+ * See CXL rev 3.2 @8.2.10.7.2.1 Table 8-128 sPPR Feature Readable Attributes
+ *
+ * See CXL rev 3.2 @8.2.10.7.2.2 Table 8-131 hPPR Feature Readable Attributes
+ */
+
+#define CXL_PPR_OP_CAP_DEVICE_INITIATED BIT(0)
+#define CXL_PPR_OP_MODE_DEV_INITIATED BIT(0)
+
+#define CXL_PPR_FLAG_DPA_SUPPORT_MASK BIT(0)
+#define CXL_PPR_FLAG_NIB_SUPPORT_MASK BIT(1)
+#define CXL_PPR_FLAG_MEM_SPARING_EV_REC_SUPPORT_MASK BIT(2)
+#define CXL_PPR_FLAG_DEV_INITED_PPR_AT_BOOT_CAP_MASK BIT(3)
+
+#define CXL_PPR_RESTRICTION_FLAG_MEDIA_ACCESSIBLE_MASK BIT(0)
+#define CXL_PPR_RESTRICTION_FLAG_DATA_RETAINED_MASK BIT(2)
+
+#define CXL_PPR_SPARING_EV_REC_EN_MASK BIT(0)
+#define CXL_PPR_DEV_INITED_PPR_AT_BOOT_EN_MASK BIT(1)
+
+#define CXL_PPR_GET_CAP_DPA(flags) \
+ FIELD_GET(CXL_PPR_FLAG_DPA_SUPPORT_MASK, flags)
+#define CXL_PPR_GET_CAP_NIB_MASK(flags) \
+ FIELD_GET(CXL_PPR_FLAG_NIB_SUPPORT_MASK, flags)
+#define CXL_PPR_GET_MEDIA_ACCESSIBLE(restriction_flags) \
+ (FIELD_GET(CXL_PPR_RESTRICTION_FLAG_MEDIA_ACCESSIBLE_MASK, \
+ restriction_flags) ^ 1)
+#define CXL_PPR_GET_DATA_RETAINED(restriction_flags) \
+ (FIELD_GET(CXL_PPR_RESTRICTION_FLAG_DATA_RETAINED_MASK, \
+ restriction_flags) ^ 1)
+
+struct cxl_memdev_ppr_rd_attrbs {
+ struct cxl_memdev_repair_rd_attrbs_hdr hdr;
+ u8 ppr_flags;
+ __le16 restriction_flags;
+ u8 ppr_op_mode;
+} __packed;
+
+/*
+ * See CXL rev 3.2 @8.2.10.7.1.2 Table 8-118 sPPR Maintenance Input Payload
+ *
+ * See CXL rev 3.2 @8.2.10.7.1.3 Table 8-119 hPPR Maintenance Input Payload
+ */
+struct cxl_memdev_ppr_maintenance_attrbs {
+ u8 flags;
+ __le64 dpa;
+ u8 nibble_mask[3];
+} __packed;
+
+static int cxl_mem_ppr_get_attrbs(struct cxl_ppr_context *cxl_ppr_ctx)
+{
+ size_t rd_data_size = sizeof(struct cxl_memdev_ppr_rd_attrbs);
+ struct cxl_memdev *cxlmd = cxl_ppr_ctx->cxlmd;
+ struct cxl_mailbox *cxl_mbox = &cxlmd->cxlds->cxl_mbox;
+ u16 restriction_flags;
+ size_t data_size;
+ u16 return_code;
+
+ struct cxl_memdev_ppr_rd_attrbs *rd_attrbs __free(kfree) =
+ kmalloc(rd_data_size, GFP_KERNEL);
+ if (!rd_attrbs)
+ return -ENOMEM;
+
+ data_size = cxl_get_feature(cxl_mbox, &cxl_ppr_ctx->repair_uuid,
+ CXL_GET_FEAT_SEL_CURRENT_VALUE, rd_attrbs,
+ rd_data_size, 0, &return_code);
+ if (!data_size)
+ return -EIO;
+
+ cxl_ppr_ctx->op_class = rd_attrbs->hdr.op_class;
+ cxl_ppr_ctx->op_subclass = rd_attrbs->hdr.op_subclass;
+ cxl_ppr_ctx->cap_dpa = CXL_PPR_GET_CAP_DPA(rd_attrbs->ppr_flags);
+ cxl_ppr_ctx->cap_nib_mask =
+ CXL_PPR_GET_CAP_NIB_MASK(rd_attrbs->ppr_flags);
+
+ restriction_flags = le16_to_cpu(rd_attrbs->restriction_flags);
+ cxl_ppr_ctx->media_accessible =
+ CXL_PPR_GET_MEDIA_ACCESSIBLE(restriction_flags);
+ cxl_ppr_ctx->data_retained =
+ CXL_PPR_GET_DATA_RETAINED(restriction_flags);
+
+ return 0;
+}
+
+static int cxl_mem_perform_ppr(struct cxl_ppr_context *cxl_ppr_ctx)
+{
+ struct cxl_memdev_ppr_maintenance_attrbs maintenance_attrbs;
+ struct cxl_memdev *cxlmd = cxl_ppr_ctx->cxlmd;
+ struct cxl_mem_repair_attrbs attrbs = { 0 };
+ int ret;
+
+ ACQUIRE(rwsem_read_intr, region_rwsem)(&cxl_rwsem.region);
+ if ((ret = ACQUIRE_ERR(rwsem_read_intr, &region_rwsem)))
+ return ret;
+
+ ACQUIRE(rwsem_read_intr, dpa_rwsem)(&cxl_rwsem.dpa);
+ if ((ret = ACQUIRE_ERR(rwsem_read_intr, &dpa_rwsem)))
+ return ret;
+
+ if (!cxl_ppr_ctx->media_accessible || !cxl_ppr_ctx->data_retained) {
+ /* Memory to repair must be offline */
+ if (cxl_is_memdev_memory_online(cxlmd))
+ return -EBUSY;
+ } else {
+ if (cxl_is_memdev_memory_online(cxlmd)) {
+ /* Check memory to repair is from the current boot */
+ attrbs.repair_type = CXL_PPR;
+ attrbs.dpa = cxl_ppr_ctx->dpa;
+ attrbs.nibble_mask = cxl_ppr_ctx->nibble_mask;
+ if (!cxl_find_rec_dram(cxlmd, &attrbs) &&
+ !cxl_find_rec_gen_media(cxlmd, &attrbs))
+ return -EINVAL;
+ }
+ }
+
+ memset(&maintenance_attrbs, 0, sizeof(maintenance_attrbs));
+ maintenance_attrbs.flags = 0;
+ maintenance_attrbs.dpa = cpu_to_le64(cxl_ppr_ctx->dpa);
+ put_unaligned_le24(cxl_ppr_ctx->nibble_mask,
+ maintenance_attrbs.nibble_mask);
+
+ return cxl_perform_maintenance(&cxlmd->cxlds->cxl_mbox,
+ cxl_ppr_ctx->op_class,
+ cxl_ppr_ctx->op_subclass,
+ &maintenance_attrbs,
+ sizeof(maintenance_attrbs));
+}
+
+static int cxl_ppr_get_repair_type(struct device *dev, void *drv_data,
+ const char **repair_type)
+{
+ *repair_type = edac_repair_type[EDAC_REPAIR_PPR];
+
+ return 0;
+}
+
+static int cxl_ppr_get_persist_mode(struct device *dev, void *drv_data,
+ bool *persist_mode)
+{
+ struct cxl_ppr_context *cxl_ppr_ctx = drv_data;
+
+ *persist_mode = cxl_ppr_ctx->persist_mode;
+
+ return 0;
+}
+
+static int cxl_get_ppr_safe_when_in_use(struct device *dev, void *drv_data,
+ bool *safe)
+{
+ struct cxl_ppr_context *cxl_ppr_ctx = drv_data;
+
+ *safe = cxl_ppr_ctx->media_accessible & cxl_ppr_ctx->data_retained;
+
+ return 0;
+}
+
+static int cxl_ppr_get_min_dpa(struct device *dev, void *drv_data, u64 *min_dpa)
+{
+ struct cxl_ppr_context *cxl_ppr_ctx = drv_data;
+ struct cxl_memdev *cxlmd = cxl_ppr_ctx->cxlmd;
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+
+ *min_dpa = cxlds->dpa_res.start;
+
+ return 0;
+}
+
+static int cxl_ppr_get_max_dpa(struct device *dev, void *drv_data, u64 *max_dpa)
+{
+ struct cxl_ppr_context *cxl_ppr_ctx = drv_data;
+ struct cxl_memdev *cxlmd = cxl_ppr_ctx->cxlmd;
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+
+ *max_dpa = cxlds->dpa_res.end;
+
+ return 0;
+}
+
+static int cxl_ppr_get_dpa(struct device *dev, void *drv_data, u64 *dpa)
+{
+ struct cxl_ppr_context *cxl_ppr_ctx = drv_data;
+
+ *dpa = cxl_ppr_ctx->dpa;
+
+ return 0;
+}
+
+static int cxl_ppr_set_dpa(struct device *dev, void *drv_data, u64 dpa)
+{
+ struct cxl_ppr_context *cxl_ppr_ctx = drv_data;
+ struct cxl_memdev *cxlmd = cxl_ppr_ctx->cxlmd;
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+
+ if (!cxl_resource_contains_addr(&cxlds->dpa_res, dpa))
+ return -EINVAL;
+
+ cxl_ppr_ctx->dpa = dpa;
+
+ return 0;
+}
+
+static int cxl_ppr_get_nibble_mask(struct device *dev, void *drv_data,
+ u32 *nibble_mask)
+{
+ struct cxl_ppr_context *cxl_ppr_ctx = drv_data;
+
+ *nibble_mask = cxl_ppr_ctx->nibble_mask;
+
+ return 0;
+}
+
+static int cxl_ppr_set_nibble_mask(struct device *dev, void *drv_data,
+ u32 nibble_mask)
+{
+ struct cxl_ppr_context *cxl_ppr_ctx = drv_data;
+
+ cxl_ppr_ctx->nibble_mask = nibble_mask;
+
+ return 0;
+}
+
+static int cxl_do_ppr(struct device *dev, void *drv_data, u32 val)
+{
+ struct cxl_ppr_context *cxl_ppr_ctx = drv_data;
+ struct cxl_memdev *cxlmd = cxl_ppr_ctx->cxlmd;
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+
+ if (val != EDAC_DO_MEM_REPAIR ||
+ !cxl_resource_contains_addr(&cxlds->dpa_res, cxl_ppr_ctx->dpa))
+ return -EINVAL;
+
+ return cxl_mem_perform_ppr(cxl_ppr_ctx);
+}
+
+static const struct edac_mem_repair_ops cxl_sppr_ops = {
+ .get_repair_type = cxl_ppr_get_repair_type,
+ .get_persist_mode = cxl_ppr_get_persist_mode,
+ .get_repair_safe_when_in_use = cxl_get_ppr_safe_when_in_use,
+ .get_min_dpa = cxl_ppr_get_min_dpa,
+ .get_max_dpa = cxl_ppr_get_max_dpa,
+ .get_dpa = cxl_ppr_get_dpa,
+ .set_dpa = cxl_ppr_set_dpa,
+ .get_nibble_mask = cxl_ppr_get_nibble_mask,
+ .set_nibble_mask = cxl_ppr_set_nibble_mask,
+ .do_repair = cxl_do_ppr,
+};
+
+static int cxl_memdev_soft_ppr_init(struct cxl_memdev *cxlmd,
+ struct edac_dev_feature *ras_feature,
+ u8 repair_inst)
+{
+ struct cxl_ppr_context *cxl_sppr_ctx;
+ struct cxl_feat_entry *feat_entry;
+ int ret;
+
+ feat_entry = cxl_feature_info(to_cxlfs(cxlmd->cxlds),
+ &CXL_FEAT_SPPR_UUID);
+ if (IS_ERR(feat_entry))
+ return -EOPNOTSUPP;
+
+ if (!(le32_to_cpu(feat_entry->flags) & CXL_FEATURE_F_CHANGEABLE))
+ return -EOPNOTSUPP;
+
+ cxl_sppr_ctx =
+ devm_kzalloc(&cxlmd->dev, sizeof(*cxl_sppr_ctx), GFP_KERNEL);
+ if (!cxl_sppr_ctx)
+ return -ENOMEM;
+
+ *cxl_sppr_ctx = (struct cxl_ppr_context){
+ .get_feat_size = le16_to_cpu(feat_entry->get_feat_size),
+ .set_feat_size = le16_to_cpu(feat_entry->set_feat_size),
+ .get_version = feat_entry->get_feat_ver,
+ .set_version = feat_entry->set_feat_ver,
+ .effects = le16_to_cpu(feat_entry->effects),
+ .cxlmd = cxlmd,
+ .repair_type = EDAC_REPAIR_PPR,
+ .persist_mode = 0,
+ .instance = repair_inst,
+ };
+ uuid_copy(&cxl_sppr_ctx->repair_uuid, &CXL_FEAT_SPPR_UUID);
+
+ ret = cxl_mem_ppr_get_attrbs(cxl_sppr_ctx);
+ if (ret)
+ return ret;
+
+ ras_feature->ft_type = RAS_FEAT_MEM_REPAIR;
+ ras_feature->instance = cxl_sppr_ctx->instance;
+ ras_feature->mem_repair_ops = &cxl_sppr_ops;
+ ras_feature->ctx = cxl_sppr_ctx;
+
+ return 0;
+}
+
+int devm_cxl_memdev_edac_register(struct cxl_memdev *cxlmd)
+{
+ struct edac_dev_feature ras_features[CXL_NR_EDAC_DEV_FEATURES];
+ int num_ras_features = 0;
+ u8 repair_inst = 0;
+ int rc;
+
+ if (IS_ENABLED(CONFIG_CXL_EDAC_SCRUB)) {
+ rc = cxl_memdev_scrub_init(cxlmd, &ras_features[num_ras_features], 0);
+ if (rc < 0 && rc != -EOPNOTSUPP)
+ return rc;
+
+ if (rc != -EOPNOTSUPP)
+ num_ras_features++;
+ }
+
+ if (IS_ENABLED(CONFIG_CXL_EDAC_ECS)) {
+ rc = cxl_memdev_ecs_init(cxlmd, &ras_features[num_ras_features]);
+ if (rc < 0 && rc != -EOPNOTSUPP)
+ return rc;
+
+ if (rc != -EOPNOTSUPP)
+ num_ras_features++;
+ }
+
+ if (IS_ENABLED(CONFIG_CXL_EDAC_MEM_REPAIR)) {
+ for (int i = 0; i < CXL_MEM_SPARING_MAX; i++) {
+ rc = cxl_memdev_sparing_init(cxlmd,
+ &ras_features[num_ras_features],
+ &mem_sparing_desc[i], repair_inst);
+ if (rc == -EOPNOTSUPP)
+ continue;
+ if (rc < 0)
+ return rc;
+
+ repair_inst++;
+ num_ras_features++;
+ }
+
+ rc = cxl_memdev_soft_ppr_init(cxlmd, &ras_features[num_ras_features],
+ repair_inst);
+ if (rc < 0 && rc != -EOPNOTSUPP)
+ return rc;
+
+ if (rc != -EOPNOTSUPP) {
+ repair_inst++;
+ num_ras_features++;
+ }
+
+ if (repair_inst) {
+ struct cxl_mem_err_rec *array_rec =
+ devm_kzalloc(&cxlmd->dev, sizeof(*array_rec),
+ GFP_KERNEL);
+ if (!array_rec)
+ return -ENOMEM;
+
+ xa_init(&array_rec->rec_gen_media);
+ xa_init(&array_rec->rec_dram);
+ cxlmd->err_rec_array = array_rec;
+ }
+ }
+
+ if (!num_ras_features)
+ return -EINVAL;
+
+ char *cxl_dev_name __free(kfree) =
+ kasprintf(GFP_KERNEL, "cxl_%s", dev_name(&cxlmd->dev));
+ if (!cxl_dev_name)
+ return -ENOMEM;
+
+ return edac_dev_register(&cxlmd->dev, cxl_dev_name, NULL,
+ num_ras_features, ras_features);
+}
+EXPORT_SYMBOL_NS_GPL(devm_cxl_memdev_edac_register, "CXL");
+
+int devm_cxl_region_edac_register(struct cxl_region *cxlr)
+{
+ struct edac_dev_feature ras_features[CXL_NR_EDAC_DEV_FEATURES];
+ int num_ras_features = 0;
+ int rc;
+
+ if (!IS_ENABLED(CONFIG_CXL_EDAC_SCRUB))
+ return 0;
+
+ rc = cxl_region_scrub_init(cxlr, &ras_features[num_ras_features], 0);
+ if (rc < 0)
+ return rc;
+
+ num_ras_features++;
+
+ char *cxl_dev_name __free(kfree) =
+ kasprintf(GFP_KERNEL, "cxl_%s", dev_name(&cxlr->dev));
+ if (!cxl_dev_name)
+ return -ENOMEM;
+
+ return edac_dev_register(&cxlr->dev, cxl_dev_name, NULL,
+ num_ras_features, ras_features);
+}
+EXPORT_SYMBOL_NS_GPL(devm_cxl_region_edac_register, "CXL");
+
+void devm_cxl_memdev_edac_release(struct cxl_memdev *cxlmd)
+{
+ struct cxl_mem_err_rec *array_rec = cxlmd->err_rec_array;
+ struct cxl_event_gen_media *rec_gen_media;
+ struct cxl_event_dram *rec_dram;
+ unsigned long index;
+
+ if (!IS_ENABLED(CONFIG_CXL_EDAC_MEM_REPAIR) || !array_rec)
+ return;
+
+ xa_for_each(&array_rec->rec_dram, index, rec_dram)
+ kfree(rec_dram);
+ xa_destroy(&array_rec->rec_dram);
+
+ xa_for_each(&array_rec->rec_gen_media, index, rec_gen_media)
+ kfree(rec_gen_media);
+ xa_destroy(&array_rec->rec_gen_media);
+}
+EXPORT_SYMBOL_NS_GPL(devm_cxl_memdev_edac_release, "CXL");
diff --git a/drivers/cxl/core/features.c b/drivers/cxl/core/features.c
new file mode 100644
index 000000000000..4bc484b46f43
--- /dev/null
+++ b/drivers/cxl/core/features.c
@@ -0,0 +1,706 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2024-2025 Intel Corporation. All rights reserved. */
+#include <linux/fwctl.h>
+#include <linux/device.h>
+#include <cxl/mailbox.h>
+#include <cxl/features.h>
+#include <uapi/fwctl/cxl.h>
+#include "cxl.h"
+#include "core.h"
+#include "cxlmem.h"
+
+/**
+ * DOC: cxl features
+ *
+ * CXL Features:
+ * A CXL device that includes a mailbox supports commands that allow
+ * listing, getting, and setting of optionally defined features such
+ * as memory sparing or post package sparing. Vendors may define custom
+ * features for the device.
+ */
+
+/* All the features below are exclusive to the kernel */
+static const uuid_t cxl_exclusive_feats[] = {
+ CXL_FEAT_PATROL_SCRUB_UUID,
+ CXL_FEAT_ECS_UUID,
+ CXL_FEAT_SPPR_UUID,
+ CXL_FEAT_HPPR_UUID,
+ CXL_FEAT_CACHELINE_SPARING_UUID,
+ CXL_FEAT_ROW_SPARING_UUID,
+ CXL_FEAT_BANK_SPARING_UUID,
+ CXL_FEAT_RANK_SPARING_UUID,
+};
+
+static bool is_cxl_feature_exclusive_by_uuid(const uuid_t *uuid)
+{
+ for (int i = 0; i < ARRAY_SIZE(cxl_exclusive_feats); i++) {
+ if (uuid_equal(uuid, &cxl_exclusive_feats[i]))
+ return true;
+ }
+
+ return false;
+}
+
+static bool is_cxl_feature_exclusive(struct cxl_feat_entry *entry)
+{
+ return is_cxl_feature_exclusive_by_uuid(&entry->uuid);
+}
+
+struct cxl_features_state *to_cxlfs(struct cxl_dev_state *cxlds)
+{
+ return cxlds->cxlfs;
+}
+EXPORT_SYMBOL_NS_GPL(to_cxlfs, "CXL");
+
+static int cxl_get_supported_features_count(struct cxl_mailbox *cxl_mbox)
+{
+ struct cxl_mbox_get_sup_feats_out mbox_out;
+ struct cxl_mbox_get_sup_feats_in mbox_in;
+ struct cxl_mbox_cmd mbox_cmd;
+ int rc;
+
+ memset(&mbox_in, 0, sizeof(mbox_in));
+ mbox_in.count = cpu_to_le32(sizeof(mbox_out));
+ memset(&mbox_out, 0, sizeof(mbox_out));
+ mbox_cmd = (struct cxl_mbox_cmd) {
+ .opcode = CXL_MBOX_OP_GET_SUPPORTED_FEATURES,
+ .size_in = sizeof(mbox_in),
+ .payload_in = &mbox_in,
+ .size_out = sizeof(mbox_out),
+ .payload_out = &mbox_out,
+ .min_out = sizeof(mbox_out),
+ };
+ rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
+ if (rc < 0)
+ return rc;
+
+ return le16_to_cpu(mbox_out.supported_feats);
+}
+
+static struct cxl_feat_entries *
+get_supported_features(struct cxl_features_state *cxlfs)
+{
+ int remain_feats, max_size, max_feats, start, rc, hdr_size;
+ struct cxl_mailbox *cxl_mbox = &cxlfs->cxlds->cxl_mbox;
+ int feat_size = sizeof(struct cxl_feat_entry);
+ struct cxl_mbox_get_sup_feats_in mbox_in;
+ struct cxl_feat_entry *entry;
+ struct cxl_mbox_cmd mbox_cmd;
+ int user_feats = 0;
+ int count;
+
+ count = cxl_get_supported_features_count(cxl_mbox);
+ if (count <= 0)
+ return NULL;
+
+ struct cxl_feat_entries *entries __free(kvfree) =
+ kvmalloc(struct_size(entries, ent, count), GFP_KERNEL);
+ if (!entries)
+ return NULL;
+
+ struct cxl_mbox_get_sup_feats_out *mbox_out __free(kvfree) =
+ kvmalloc(cxl_mbox->payload_size, GFP_KERNEL);
+ if (!mbox_out)
+ return NULL;
+
+ hdr_size = struct_size(mbox_out, ents, 0);
+ max_size = cxl_mbox->payload_size - hdr_size;
+ /* max feat entries that can fit in mailbox max payload size */
+ max_feats = max_size / feat_size;
+ entry = entries->ent;
+
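+	/* Fetch the feature list in mailbox-payload-sized batches. */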
+ start = 0;
+ remain_feats = count;
+ do {
+ int retrieved, alloc_size, copy_feats;
+ int num_entries;
+
+ if (remain_feats > max_feats) {
+ alloc_size = struct_size(mbox_out, ents, max_feats);
+ remain_feats = remain_feats - max_feats;
+ copy_feats = max_feats;
+ } else {
+ alloc_size = struct_size(mbox_out, ents, remain_feats);
+ copy_feats = remain_feats;
+ remain_feats = 0;
+ }
+
+ memset(&mbox_in, 0, sizeof(mbox_in));
+ mbox_in.count = cpu_to_le32(alloc_size);
+ mbox_in.start_idx = cpu_to_le16(start);
+ memset(mbox_out, 0, alloc_size);
+ mbox_cmd = (struct cxl_mbox_cmd) {
+ .opcode = CXL_MBOX_OP_GET_SUPPORTED_FEATURES,
+ .size_in = sizeof(mbox_in),
+ .payload_in = &mbox_in,
+ .size_out = alloc_size,
+ .payload_out = mbox_out,
+ .min_out = hdr_size,
+ };
+ rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
+ if (rc < 0)
+ return NULL;
+
+ if (mbox_cmd.size_out <= hdr_size)
+ return NULL;
+
+		/*
+		 * Make sure the retrieved output buffer is a multiple of the
+		 * feature entry size.
+		 */
+ retrieved = mbox_cmd.size_out - hdr_size;
+ if (retrieved % feat_size)
+ return NULL;
+
+ num_entries = le16_to_cpu(mbox_out->num_entries);
+		/*
+		 * If the reported number of entries times the entry size does
+		 * not match the number of retrieved bytes, the output payload
+		 * is malformed.
+		 */
+ if (num_entries * feat_size != retrieved)
+ return NULL;
+
+ memcpy(entry, mbox_out->ents, retrieved);
+ for (int i = 0; i < num_entries; i++) {
+ if (!is_cxl_feature_exclusive(entry + i))
+ user_feats++;
+ }
+ entry += num_entries;
+ /*
+ * If the number of output entries is less than expected, add the
+ * remaining entries to the next batch.
+ */
+ remain_feats += copy_feats - num_entries;
+ start += num_entries;
+ } while (remain_feats);
+
+ entries->num_features = count;
+ entries->num_user_features = user_feats;
+
+ return no_free_ptr(entries);
+}
+
+static void free_cxlfs(void *_cxlfs)
+{
+ struct cxl_features_state *cxlfs = _cxlfs;
+ struct cxl_dev_state *cxlds = cxlfs->cxlds;
+
+ cxlds->cxlfs = NULL;
+ kvfree(cxlfs->entries);
+ kfree(cxlfs);
+}
+
+/**
+ * devm_cxl_setup_features() - Allocate and initialize features context
+ * @cxlds: CXL device context
+ *
+ * Return 0 on success or -errno on failure.
+ */
+int devm_cxl_setup_features(struct cxl_dev_state *cxlds)
+{
+ struct cxl_mailbox *cxl_mbox = &cxlds->cxl_mbox;
+
+ if (cxl_mbox->feat_cap < CXL_FEATURES_RO)
+ return -ENODEV;
+
+ struct cxl_features_state *cxlfs __free(kfree) =
+ kzalloc(sizeof(*cxlfs), GFP_KERNEL);
+ if (!cxlfs)
+ return -ENOMEM;
+
+ cxlfs->cxlds = cxlds;
+
+ cxlfs->entries = get_supported_features(cxlfs);
+ if (!cxlfs->entries)
+ return -ENOMEM;
+
+ cxlds->cxlfs = cxlfs;
+
+ return devm_add_action_or_reset(cxlds->dev, free_cxlfs, no_free_ptr(cxlfs));
+}
+EXPORT_SYMBOL_NS_GPL(devm_cxl_setup_features, "CXL");
+
+size_t cxl_get_feature(struct cxl_mailbox *cxl_mbox, const uuid_t *feat_uuid,
+ enum cxl_get_feat_selection selection,
+ void *feat_out, size_t feat_out_size, u16 offset,
+ u16 *return_code)
+{
+ size_t data_to_rd_size, size_out;
+ struct cxl_mbox_get_feat_in pi;
+ struct cxl_mbox_cmd mbox_cmd;
+ size_t data_rcvd_size = 0;
+ int rc;
+
+ if (return_code)
+ *return_code = CXL_MBOX_CMD_RC_INPUT;
+
+ if (!feat_out || !feat_out_size)
+ return 0;
+
+ size_out = min(feat_out_size, cxl_mbox->payload_size);
+ uuid_copy(&pi.uuid, feat_uuid);
+ pi.selection = selection;
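+	/* Read the feature data in payload-sized chunks, advancing the offset. */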
+ do {
+ data_to_rd_size = min(feat_out_size - data_rcvd_size,
+ cxl_mbox->payload_size);
+ pi.offset = cpu_to_le16(offset + data_rcvd_size);
+ pi.count = cpu_to_le16(data_to_rd_size);
+
+ mbox_cmd = (struct cxl_mbox_cmd) {
+ .opcode = CXL_MBOX_OP_GET_FEATURE,
+ .size_in = sizeof(pi),
+ .payload_in = &pi,
+ .size_out = size_out,
+ .payload_out = feat_out + data_rcvd_size,
+ .min_out = data_to_rd_size,
+ };
+ rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
+ if (rc < 0 || !mbox_cmd.size_out) {
+ if (return_code)
+ *return_code = mbox_cmd.return_code;
+ return 0;
+ }
+ data_rcvd_size += mbox_cmd.size_out;
+ } while (data_rcvd_size < feat_out_size);
+
+ if (return_code)
+ *return_code = CXL_MBOX_CMD_RC_SUCCESS;
+
+ return data_rcvd_size;
+}
+
+/*
+ * FEAT_DATA_MIN_PAYLOAD_SIZE - minimum number of extra bytes that must be
+ * available in the mailbox payload for the actual feature data so that
+ * the feature data transfer works as expected.
+ */
+#define FEAT_DATA_MIN_PAYLOAD_SIZE 10
+int cxl_set_feature(struct cxl_mailbox *cxl_mbox,
+ const uuid_t *feat_uuid, u8 feat_version,
+ const void *feat_data, size_t feat_data_size,
+ u32 feat_flag, u16 offset, u16 *return_code)
+{
+ size_t data_in_size, data_sent_size = 0;
+ struct cxl_mbox_cmd mbox_cmd;
+ size_t hdr_size;
+
+ if (return_code)
+ *return_code = CXL_MBOX_CMD_RC_INPUT;
+
+ struct cxl_mbox_set_feat_in *pi __free(kfree) =
+ kzalloc(cxl_mbox->payload_size, GFP_KERNEL);
+ if (!pi)
+ return -ENOMEM;
+
+ uuid_copy(&pi->uuid, feat_uuid);
+ pi->version = feat_version;
+ feat_flag &= ~CXL_SET_FEAT_FLAG_DATA_TRANSFER_MASK;
+ feat_flag |= CXL_SET_FEAT_FLAG_DATA_SAVED_ACROSS_RESET;
+ hdr_size = sizeof(pi->hdr);
+	/*
+	 * Check that the minimum mbox payload size is available for
+	 * the feature data transfer.
+	 */
+ if (hdr_size + FEAT_DATA_MIN_PAYLOAD_SIZE > cxl_mbox->payload_size)
+ return -ENOMEM;
+
+ if (hdr_size + feat_data_size <= cxl_mbox->payload_size) {
+ pi->flags = cpu_to_le32(feat_flag |
+ CXL_SET_FEAT_FLAG_FULL_DATA_TRANSFER);
+ data_in_size = feat_data_size;
+ } else {
+ pi->flags = cpu_to_le32(feat_flag |
+ CXL_SET_FEAT_FLAG_INITIATE_DATA_TRANSFER);
+ data_in_size = cxl_mbox->payload_size - hdr_size;
+ }
+
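+	/*
+	 * Send the feature data in payload-sized chunks, marking each chunk
+	 * as full, initiate, continue, or finish.
+	 */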
+ do {
+ int rc;
+
+ pi->offset = cpu_to_le16(offset + data_sent_size);
+ memcpy(pi->feat_data, feat_data + data_sent_size, data_in_size);
+ mbox_cmd = (struct cxl_mbox_cmd) {
+ .opcode = CXL_MBOX_OP_SET_FEATURE,
+ .size_in = hdr_size + data_in_size,
+ .payload_in = pi,
+ };
+ rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
+ if (rc < 0) {
+ if (return_code)
+ *return_code = mbox_cmd.return_code;
+ return rc;
+ }
+
+ data_sent_size += data_in_size;
+ if (data_sent_size >= feat_data_size) {
+ if (return_code)
+ *return_code = CXL_MBOX_CMD_RC_SUCCESS;
+ return 0;
+ }
+
+ if ((feat_data_size - data_sent_size) <= (cxl_mbox->payload_size - hdr_size)) {
+ data_in_size = feat_data_size - data_sent_size;
+ pi->flags = cpu_to_le32(feat_flag |
+ CXL_SET_FEAT_FLAG_FINISH_DATA_TRANSFER);
+ } else {
+ pi->flags = cpu_to_le32(feat_flag |
+ CXL_SET_FEAT_FLAG_CONTINUE_DATA_TRANSFER);
+ }
+ } while (true);
+}
+
+/* FWCTL support */
+
+static inline struct cxl_memdev *fwctl_to_memdev(struct fwctl_device *fwctl_dev)
+{
+ return to_cxl_memdev(fwctl_dev->dev.parent);
+}
+
+static int cxlctl_open_uctx(struct fwctl_uctx *uctx)
+{
+ return 0;
+}
+
+static void cxlctl_close_uctx(struct fwctl_uctx *uctx)
+{
+}
+
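+/*
+ * Look up a cached feature entry by UUID. Returns an ERR_PTR if the
+ * device has no cached features or the UUID is not found.
+ */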
+struct cxl_feat_entry *
+cxl_feature_info(struct cxl_features_state *cxlfs,
+ const uuid_t *uuid)
+{
+ struct cxl_feat_entry *feat;
+
+ if (!cxlfs || !cxlfs->entries)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ for (int i = 0; i < cxlfs->entries->num_features; i++) {
+ feat = &cxlfs->entries->ent[i];
+ if (uuid_equal(uuid, &feat->uuid))
+ return feat;
+ }
+
+ return ERR_PTR(-EINVAL);
+}
+
+static void *cxlctl_get_supported_features(struct cxl_features_state *cxlfs,
+ const struct fwctl_rpc_cxl *rpc_in,
+ size_t *out_len)
+{
+ const struct cxl_mbox_get_sup_feats_in *feat_in;
+ struct cxl_mbox_get_sup_feats_out *feat_out;
+ struct cxl_feat_entry *pos;
+ size_t out_size;
+ int requested;
+ u32 count;
+ u16 start;
+ int i;
+
+ if (rpc_in->op_size != sizeof(*feat_in))
+ return ERR_PTR(-EINVAL);
+
+ feat_in = &rpc_in->get_sup_feats_in;
+ count = le32_to_cpu(feat_in->count);
+ start = le16_to_cpu(feat_in->start_idx);
+ requested = count / sizeof(*pos);
+
+ /*
+	 * Reject a start index beyond the number of supported features, and
+	 * clamp the requested number of entries to what remains from @start.
+ */
+ if (start >= cxlfs->entries->num_features)
+ return ERR_PTR(-EINVAL);
+
+ requested = min_t(int, requested, cxlfs->entries->num_features - start);
+
+ out_size = sizeof(struct fwctl_rpc_cxl_out) +
+ struct_size(feat_out, ents, requested);
+
+ struct fwctl_rpc_cxl_out *rpc_out __free(kvfree) =
+ kvzalloc(out_size, GFP_KERNEL);
+ if (!rpc_out)
+ return ERR_PTR(-ENOMEM);
+
+ rpc_out->size = struct_size(feat_out, ents, requested);
+ feat_out = &rpc_out->get_sup_feats_out;
+
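+	/* Copy up to 'requested' entries starting at 'start' into the output payload */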
+ for (i = start, pos = &feat_out->ents[0];
+ i < cxlfs->entries->num_features; i++, pos++) {
+ if (i - start == requested)
+ break;
+
+ memcpy(pos, &cxlfs->entries->ent[i], sizeof(*pos));
+ /*
+ * If the feature is exclusive, set the set_feat_size to 0 to
+ * indicate that the feature is not changeable.
+ */
+ if (is_cxl_feature_exclusive(pos)) {
+ u32 flags;
+
+ pos->set_feat_size = 0;
+ flags = le32_to_cpu(pos->flags);
+ flags &= ~CXL_FEATURE_F_CHANGEABLE;
+ pos->flags = cpu_to_le32(flags);
+ }
+ }
+
+ feat_out->num_entries = cpu_to_le16(requested);
+ feat_out->supported_feats = cpu_to_le16(cxlfs->entries->num_features);
+ rpc_out->retval = CXL_MBOX_CMD_RC_SUCCESS;
+ *out_len = out_size;
+
+ return no_free_ptr(rpc_out);
+}
+
+static void *cxlctl_get_feature(struct cxl_features_state *cxlfs,
+ const struct fwctl_rpc_cxl *rpc_in,
+ size_t *out_len)
+{
+ struct cxl_mailbox *cxl_mbox = &cxlfs->cxlds->cxl_mbox;
+ const struct cxl_mbox_get_feat_in *feat_in;
+ u16 offset, count, return_code;
+ size_t out_size = *out_len;
+
+ if (rpc_in->op_size != sizeof(*feat_in))
+ return ERR_PTR(-EINVAL);
+
+ feat_in = &rpc_in->get_feat_in;
+ offset = le16_to_cpu(feat_in->offset);
+ count = le16_to_cpu(feat_in->count);
+
+ if (!count)
+ return ERR_PTR(-EINVAL);
+
+ struct fwctl_rpc_cxl_out *rpc_out __free(kvfree) =
+ kvzalloc(out_size, GFP_KERNEL);
+ if (!rpc_out)
+ return ERR_PTR(-ENOMEM);
+
+ out_size = cxl_get_feature(cxl_mbox, &feat_in->uuid,
+ feat_in->selection, rpc_out->payload,
+ count, offset, &return_code);
+ *out_len = sizeof(struct fwctl_rpc_cxl_out);
+ if (!out_size) {
+ rpc_out->size = 0;
+ rpc_out->retval = return_code;
+ return no_free_ptr(rpc_out);
+ }
+
+ rpc_out->size = out_size;
+ rpc_out->retval = CXL_MBOX_CMD_RC_SUCCESS;
+ *out_len += out_size;
+
+ return no_free_ptr(rpc_out);
+}
+
+static void *cxlctl_set_feature(struct cxl_features_state *cxlfs,
+ const struct fwctl_rpc_cxl *rpc_in,
+ size_t *out_len)
+{
+ struct cxl_mailbox *cxl_mbox = &cxlfs->cxlds->cxl_mbox;
+ const struct cxl_mbox_set_feat_in *feat_in;
+ size_t out_size, data_size;
+ u16 offset, return_code;
+ u32 flags;
+ int rc;
+
+ if (rpc_in->op_size <= sizeof(feat_in->hdr))
+ return ERR_PTR(-EINVAL);
+
+ feat_in = &rpc_in->set_feat_in;
+
+ if (is_cxl_feature_exclusive_by_uuid(&feat_in->uuid))
+ return ERR_PTR(-EPERM);
+
+ offset = le16_to_cpu(feat_in->offset);
+ flags = le32_to_cpu(feat_in->flags);
+ out_size = *out_len;
+
+ struct fwctl_rpc_cxl_out *rpc_out __free(kvfree) =
+ kvzalloc(out_size, GFP_KERNEL);
+ if (!rpc_out)
+ return ERR_PTR(-ENOMEM);
+
+ rpc_out->size = 0;
+
+ data_size = rpc_in->op_size - sizeof(feat_in->hdr);
+ rc = cxl_set_feature(cxl_mbox, &feat_in->uuid,
+ feat_in->version, feat_in->feat_data,
+ data_size, flags, offset, &return_code);
+ *out_len = sizeof(*rpc_out);
+ if (rc) {
+ rpc_out->retval = return_code;
+ return no_free_ptr(rpc_out);
+ }
+
+ rpc_out->retval = CXL_MBOX_CMD_RC_SUCCESS;
+
+ return no_free_ptr(rpc_out);
+}
+
+static bool cxlctl_validate_set_features(struct cxl_features_state *cxlfs,
+ const struct fwctl_rpc_cxl *rpc_in,
+ enum fwctl_rpc_scope scope)
+{
+ u16 effects, imm_mask, reset_mask;
+ struct cxl_feat_entry *feat;
+ u32 flags;
+
+ if (rpc_in->op_size < sizeof(uuid_t))
+ return false;
+
+ feat = cxl_feature_info(cxlfs, &rpc_in->set_feat_in.uuid);
+ if (IS_ERR(feat))
+ return false;
+
+ /* Ensure that the attribute is changeable */
+ flags = le32_to_cpu(feat->flags);
+ if (!(flags & CXL_FEATURE_F_CHANGEABLE))
+ return false;
+
+ effects = le16_to_cpu(feat->effects);
+
+ /*
+	 * Reject if any reserved bits are set, since the driver does not
+	 * comprehend those effects.
+ */
+ if (effects & CXL_CMD_EFFECTS_RESERVED) {
+ dev_warn_once(cxlfs->cxlds->dev,
+ "Reserved bits set in the Feature effects field!\n");
+ return false;
+ }
+
+ /* Currently no user background command support */
+ if (effects & CXL_CMD_BACKGROUND)
+ return false;
+
+	/* Effects that cause an immediate change require the highest security scope */
+ imm_mask = CXL_CMD_CONFIG_CHANGE_IMMEDIATE |
+ CXL_CMD_DATA_CHANGE_IMMEDIATE |
+ CXL_CMD_POLICY_CHANGE_IMMEDIATE |
+ CXL_CMD_LOG_CHANGE_IMMEDIATE;
+
+ reset_mask = CXL_CMD_CONFIG_CHANGE_COLD_RESET |
+ CXL_CMD_CONFIG_CHANGE_CONV_RESET |
+ CXL_CMD_CONFIG_CHANGE_CXL_RESET;
+
+	/* If neither an immediate nor a reset effect is set, the hardware has a bug */
+ if (!(effects & imm_mask) && !(effects & reset_mask))
+ return false;
+
+ /*
+ * If the Feature setting causes immediate configuration change
+ * then we need the full write permission policy.
+ */
+ if (effects & imm_mask && scope >= FWCTL_RPC_DEBUG_WRITE_FULL)
+ return true;
+
+ /*
+ * If the Feature setting only causes configuration change
+ * after a reset, then the lesser level of write permission
+ * policy is ok.
+ */
+ if (!(effects & imm_mask) && scope >= FWCTL_RPC_DEBUG_WRITE)
+ return true;
+
+ return false;
+}
+
+static bool cxlctl_validate_hw_command(struct cxl_features_state *cxlfs,
+ const struct fwctl_rpc_cxl *rpc_in,
+ enum fwctl_rpc_scope scope,
+ u16 opcode)
+{
+ struct cxl_mailbox *cxl_mbox = &cxlfs->cxlds->cxl_mbox;
+
+ switch (opcode) {
+ case CXL_MBOX_OP_GET_SUPPORTED_FEATURES:
+ case CXL_MBOX_OP_GET_FEATURE:
+ return cxl_mbox->feat_cap >= CXL_FEATURES_RO;
+ case CXL_MBOX_OP_SET_FEATURE:
+ if (cxl_mbox->feat_cap < CXL_FEATURES_RW)
+ return false;
+ return cxlctl_validate_set_features(cxlfs, rpc_in, scope);
+ default:
+ return false;
+ }
+}
+
+static void *cxlctl_handle_commands(struct cxl_features_state *cxlfs,
+ const struct fwctl_rpc_cxl *rpc_in,
+ size_t *out_len, u16 opcode)
+{
+ switch (opcode) {
+ case CXL_MBOX_OP_GET_SUPPORTED_FEATURES:
+ return cxlctl_get_supported_features(cxlfs, rpc_in, out_len);
+ case CXL_MBOX_OP_GET_FEATURE:
+ return cxlctl_get_feature(cxlfs, rpc_in, out_len);
+ case CXL_MBOX_OP_SET_FEATURE:
+ return cxlctl_set_feature(cxlfs, rpc_in, out_len);
+ default:
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+}
+
+static void *cxlctl_fw_rpc(struct fwctl_uctx *uctx, enum fwctl_rpc_scope scope,
+ void *in, size_t in_len, size_t *out_len)
+{
+ struct fwctl_device *fwctl_dev = uctx->fwctl;
+ struct cxl_memdev *cxlmd = fwctl_to_memdev(fwctl_dev);
+ struct cxl_features_state *cxlfs = to_cxlfs(cxlmd->cxlds);
+ const struct fwctl_rpc_cxl *rpc_in = in;
+ u16 opcode = rpc_in->opcode;
+
+ if (!cxlctl_validate_hw_command(cxlfs, rpc_in, scope, opcode))
+ return ERR_PTR(-EINVAL);
+
+ return cxlctl_handle_commands(cxlfs, rpc_in, out_len, opcode);
+}
+
+static const struct fwctl_ops cxlctl_ops = {
+ .device_type = FWCTL_DEVICE_TYPE_CXL,
+ .uctx_size = sizeof(struct fwctl_uctx),
+ .open_uctx = cxlctl_open_uctx,
+ .close_uctx = cxlctl_close_uctx,
+ .fw_rpc = cxlctl_fw_rpc,
+};
+
+DEFINE_FREE(free_fwctl_dev, struct fwctl_device *, if (_T) fwctl_put(_T))
+
+static void free_memdev_fwctl(void *_fwctl_dev)
+{
+ struct fwctl_device *fwctl_dev = _fwctl_dev;
+
+ fwctl_unregister(fwctl_dev);
+ fwctl_put(fwctl_dev);
+}
+
+int devm_cxl_setup_fwctl(struct device *host, struct cxl_memdev *cxlmd)
+{
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+ struct cxl_features_state *cxlfs;
+ int rc;
+
+ cxlfs = to_cxlfs(cxlds);
+ if (!cxlfs)
+ return -ENODEV;
+
+	/* No need to set up FWCTL if no user-allowed features were found */
+ if (!cxlfs->entries->num_user_features)
+ return -ENODEV;
+
+ struct fwctl_device *fwctl_dev __free(free_fwctl_dev) =
+ _fwctl_alloc_device(&cxlmd->dev, &cxlctl_ops, sizeof(*fwctl_dev));
+ if (!fwctl_dev)
+ return -ENOMEM;
+
+ rc = fwctl_register(fwctl_dev);
+ if (rc)
+ return rc;
+
+ return devm_add_action_or_reset(host, free_memdev_fwctl,
+ no_free_ptr(fwctl_dev));
+}
+EXPORT_SYMBOL_NS_GPL(devm_cxl_setup_fwctl, "CXL");
+
+MODULE_IMPORT_NS("FWCTL");
diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c
new file mode 100644
index 000000000000..1c5d2022c87a
--- /dev/null
+++ b/drivers/cxl/core/hdm.c
@@ -0,0 +1,1287 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2022 Intel Corporation. All rights reserved. */
+#include <linux/seq_file.h>
+#include <linux/device.h>
+#include <linux/delay.h>
+
+#include "cxlmem.h"
+#include "core.h"
+
+/**
+ * DOC: cxl core hdm
+ *
+ * Compute Express Link Host Managed Device Memory, starting with the
+ * CXL 2.0 specification, is managed by an array of HDM Decoder register
+ * instances per CXL port and per CXL endpoint. Define common helpers
+ * for enumerating these registers and capabilities.
+ */
+
+struct cxl_rwsem cxl_rwsem = {
+ .region = __RWSEM_INITIALIZER(cxl_rwsem.region),
+ .dpa = __RWSEM_INITIALIZER(cxl_rwsem.dpa),
+};
+
+static int add_hdm_decoder(struct cxl_port *port, struct cxl_decoder *cxld)
+{
+ int rc;
+
+ rc = cxl_decoder_add_locked(cxld);
+ if (rc) {
+ put_device(&cxld->dev);
+ dev_err(&port->dev, "Failed to add decoder\n");
+ return rc;
+ }
+
+ rc = cxl_decoder_autoremove(&port->dev, cxld);
+ if (rc)
+ return rc;
+
+ dev_dbg(port->uport_dev, "%s added to %s\n",
+ dev_name(&cxld->dev), dev_name(&port->dev));
+
+ return 0;
+}
+
+/*
+ * Per the CXL specification (8.2.5.12 CXL HDM Decoder Capability Structure)
+ * single ported host-bridges need not publish a decoder capability when a
+ * passthrough decode can be assumed, i.e. all transactions that the uport sees
+ * are claimed and passed to the single dport. Disable the range until the first
+ * CXL region is enumerated / activated.
+ */
+static int devm_cxl_add_passthrough_decoder(struct cxl_port *port)
+{
+ struct cxl_switch_decoder *cxlsd;
+ struct cxl_hdm *cxlhdm = dev_get_drvdata(&port->dev);
+
+ /*
+ * Capability checks are moot for passthrough decoders, support
+ * any and all possibilities.
+ */
+ cxlhdm->interleave_mask = ~0U;
+ cxlhdm->iw_cap_mask = ~0UL;
+
+ cxlsd = cxl_switch_decoder_alloc(port, 1);
+ if (IS_ERR(cxlsd))
+ return PTR_ERR(cxlsd);
+
+ device_lock_assert(&port->dev);
+
+ return add_hdm_decoder(port, &cxlsd->cxld);
+}
+
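+/*
+ * Cache the decoder count, target count, and interleave capabilities
+ * advertised by the HDM Decoder Capability register.
+ */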
+static void parse_hdm_decoder_caps(struct cxl_hdm *cxlhdm)
+{
+ u32 hdm_cap;
+
+ hdm_cap = readl(cxlhdm->regs.hdm_decoder + CXL_HDM_DECODER_CAP_OFFSET);
+ cxlhdm->decoder_count = cxl_hdm_decoder_count(hdm_cap);
+ cxlhdm->target_count =
+ FIELD_GET(CXL_HDM_DECODER_TARGET_COUNT_MASK, hdm_cap);
+ if (FIELD_GET(CXL_HDM_DECODER_INTERLEAVE_11_8, hdm_cap))
+ cxlhdm->interleave_mask |= GENMASK(11, 8);
+ if (FIELD_GET(CXL_HDM_DECODER_INTERLEAVE_14_12, hdm_cap))
+ cxlhdm->interleave_mask |= GENMASK(14, 12);
+ cxlhdm->iw_cap_mask = BIT(1) | BIT(2) | BIT(4) | BIT(8);
+ if (FIELD_GET(CXL_HDM_DECODER_INTERLEAVE_3_6_12_WAY, hdm_cap))
+ cxlhdm->iw_cap_mask |= BIT(3) | BIT(6) | BIT(12);
+ if (FIELD_GET(CXL_HDM_DECODER_INTERLEAVE_16_WAY, hdm_cap))
+ cxlhdm->iw_cap_mask |= BIT(16);
+}
+
+static bool should_emulate_decoders(struct cxl_endpoint_dvsec_info *info)
+{
+ struct cxl_hdm *cxlhdm;
+ void __iomem *hdm;
+ u32 ctrl;
+ int i;
+
+ if (!info)
+ return false;
+
+ cxlhdm = dev_get_drvdata(&info->port->dev);
+ hdm = cxlhdm->regs.hdm_decoder;
+
+ if (!hdm)
+ return true;
+
+ /*
+ * If HDM decoders are present and the driver is in control of
+	 * Mem_Enable, skip DVSEC-based emulation.
+ */
+ if (!info->mem_enabled)
+ return false;
+
+ /*
+ * If any decoders are committed already, there should not be any
+ * emulated DVSEC decoders.
+ */
+ for (i = 0; i < cxlhdm->decoder_count; i++) {
+ ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(i));
+ dev_dbg(&info->port->dev,
+ "decoder%d.%d: committed: %ld base: %#x_%.8x size: %#x_%.8x\n",
+ info->port->id, i,
+ FIELD_GET(CXL_HDM_DECODER0_CTRL_COMMITTED, ctrl),
+ readl(hdm + CXL_HDM_DECODER0_BASE_HIGH_OFFSET(i)),
+ readl(hdm + CXL_HDM_DECODER0_BASE_LOW_OFFSET(i)),
+ readl(hdm + CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(i)),
+ readl(hdm + CXL_HDM_DECODER0_SIZE_LOW_OFFSET(i)));
+ if (FIELD_GET(CXL_HDM_DECODER0_CTRL_COMMITTED, ctrl))
+ return false;
+ }
+
+ return true;
+}
+
+/**
+ * devm_cxl_setup_hdm - map HDM decoder component registers
+ * @port: cxl_port to map
+ * @info: cached DVSEC range register info
+ */
+static struct cxl_hdm *devm_cxl_setup_hdm(struct cxl_port *port,
+ struct cxl_endpoint_dvsec_info *info)
+{
+ struct cxl_register_map *reg_map = &port->reg_map;
+ struct device *dev = &port->dev;
+ struct cxl_hdm *cxlhdm;
+ int rc;
+
+ cxlhdm = devm_kzalloc(dev, sizeof(*cxlhdm), GFP_KERNEL);
+ if (!cxlhdm)
+ return ERR_PTR(-ENOMEM);
+ cxlhdm->port = port;
+ dev_set_drvdata(dev, cxlhdm);
+
+ /* Memory devices can configure device HDM using DVSEC range regs. */
+ if (reg_map->resource == CXL_RESOURCE_NONE) {
+ if (!info || !info->mem_enabled) {
+ dev_err(dev, "No component registers mapped\n");
+ return ERR_PTR(-ENXIO);
+ }
+
+ cxlhdm->decoder_count = info->ranges;
+ return cxlhdm;
+ }
+
+ if (!reg_map->component_map.hdm_decoder.valid) {
+ dev_dbg(&port->dev, "HDM decoder registers not implemented\n");
+ /* unique error code to indicate no HDM decoder capability */
+ return ERR_PTR(-ENODEV);
+ }
+
+ rc = cxl_map_component_regs(reg_map, &cxlhdm->regs,
+ BIT(CXL_CM_CAP_CAP_ID_HDM));
+ if (rc) {
+ dev_err(dev, "Failed to map HDM capability.\n");
+ return ERR_PTR(rc);
+ }
+
+ parse_hdm_decoder_caps(cxlhdm);
+ if (cxlhdm->decoder_count == 0) {
+ dev_err(dev, "Spec violation. Caps invalid\n");
+ return ERR_PTR(-ENXIO);
+ }
+
+ /*
+ * Now that the hdm capability is parsed, decide if range
+ * register emulation is needed and fixup cxlhdm accordingly.
+ */
+ if (should_emulate_decoders(info)) {
+ dev_dbg(dev, "Fallback map %d range register%s\n", info->ranges,
+ str_plural(info->ranges));
+ cxlhdm->decoder_count = info->ranges;
+ }
+
+ return cxlhdm;
+}
+
+static void __cxl_dpa_debug(struct seq_file *file, struct resource *r, int depth)
+{
+ unsigned long long start = r->start, end = r->end;
+
+ seq_printf(file, "%*s%08llx-%08llx : %s\n", depth * 2, "", start, end,
+ r->name);
+}
+
+void cxl_dpa_debug(struct seq_file *file, struct cxl_dev_state *cxlds)
+{
+ struct resource *p1, *p2;
+
+ guard(rwsem_read)(&cxl_rwsem.dpa);
+ for (p1 = cxlds->dpa_res.child; p1; p1 = p1->sibling) {
+ __cxl_dpa_debug(file, p1, 0);
+ for (p2 = p1->child; p2; p2 = p2->sibling)
+ __cxl_dpa_debug(file, p2, 1);
+ }
+}
+EXPORT_SYMBOL_NS_GPL(cxl_dpa_debug, "CXL");
+
+/* See request_skip() kernel-doc */
+static resource_size_t __adjust_skip(struct cxl_dev_state *cxlds,
+ const resource_size_t skip_base,
+ const resource_size_t skip_len,
+ const char *requester)
+{
+ const resource_size_t skip_end = skip_base + skip_len - 1;
+
+ for (int i = 0; i < cxlds->nr_partitions; i++) {
+ const struct resource *part_res = &cxlds->part[i].res;
+ resource_size_t adjust_start, adjust_end, size;
+
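+		/* Clip the skip range to the portion that overlaps this partition */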
+ adjust_start = max(skip_base, part_res->start);
+ adjust_end = min(skip_end, part_res->end);
+
+ if (adjust_end < adjust_start)
+ continue;
+
+ size = adjust_end - adjust_start + 1;
+
+ if (!requester)
+ __release_region(&cxlds->dpa_res, adjust_start, size);
+ else if (!__request_region(&cxlds->dpa_res, adjust_start, size,
+ requester, 0))
+ return adjust_start - skip_base;
+ }
+
+ return skip_len;
+}
+#define release_skip(c, b, l) __adjust_skip((c), (b), (l), NULL)
+
+/*
+ * Must be called in a context that synchronizes against this decoder's
+ * port ->remove() callback (like an endpoint decoder sysfs attribute)
+ */
+static void __cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
+{
+ struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+ struct cxl_port *port = cxled_to_port(cxled);
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+ struct resource *res = cxled->dpa_res;
+ resource_size_t skip_start;
+
+ lockdep_assert_held_write(&cxl_rwsem.dpa);
+
+ /* save @skip_start, before @res is released */
+ skip_start = res->start - cxled->skip;
+ __release_region(&cxlds->dpa_res, res->start, resource_size(res));
+ if (cxled->skip)
+ release_skip(cxlds, skip_start, cxled->skip);
+ cxled->skip = 0;
+ cxled->dpa_res = NULL;
+ put_device(&cxled->cxld.dev);
+ port->hdm_end--;
+}
+
+static void cxl_dpa_release(void *cxled)
+{
+ guard(rwsem_write)(&cxl_rwsem.dpa);
+ __cxl_dpa_release(cxled);
+}
+
+/*
+ * Must be called from context that will not race port device
+ * unregistration, like decoder sysfs attribute methods
+ */
+static void devm_cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
+{
+ struct cxl_port *port = cxled_to_port(cxled);
+
+ lockdep_assert_held_write(&cxl_rwsem.dpa);
+ devm_remove_action(&port->dev, cxl_dpa_release, cxled);
+ __cxl_dpa_release(cxled);
+}
+
+/**
+ * request_skip() - Track DPA 'skip' in @cxlds->dpa_res resource tree
+ * @cxlds: CXL.mem device context that parents @cxled
+ * @cxled: Endpoint decoder establishing new allocation that skips lower DPA
+ * @skip_base: DPA < start of new DPA allocation (DPAnew)
+ * @skip_len: @skip_base + @skip_len == DPAnew
+ *
+ * DPA 'skip' arises from out-of-sequence DPA allocation events relative
+ * to free capacity across multiple partitions. It is a wasteful event
+ * as usable DPA gets thrown away, but if a deployment has, for example,
+ * a dual RAM+PMEM device, wants to use PMEM, and has unallocated RAM
+ * DPA, the free RAM DPA must be sacrificed to start allocating PMEM.
+ * See third "Implementation Note" in CXL 3.1 8.2.4.19.13 "Decoder
+ * Protection" for more details.
+ *
+ * A 'skip' always covers the last allocated DPA in a previous partition
+ * to the start of the current partition to allocate. Allocations never
+ * start in the middle of a partition, and allocations are always
+ * de-allocated in reverse order (see cxl_dpa_free(), or natural devm
+ * unwind order from forced in-order allocation).
+ *
+ * If @cxlds->nr_partitions was guaranteed to be <= 2 then the 'skip'
+ * would always be contained to a single partition. Given
+ * @cxlds->nr_partitions may be > 2 it results in cases where the 'skip'
+ * might span "tail capacity of partition[0], all of partition[1], ...,
+ * all of partition[N-1]" to support allocating from partition[N]. That
+ * in turn interacts with the partition 'struct resource' boundaries
+ * within @cxlds->dpa_res whereby 'skip' requests need to be divided by
+ * partition. I.e. this is a quirk of using a 'struct resource' tree to
+ * detect range conflicts while also tracking partition boundaries in
+ * @cxlds->dpa_res.
+ */
+static int request_skip(struct cxl_dev_state *cxlds,
+ struct cxl_endpoint_decoder *cxled,
+ const resource_size_t skip_base,
+ const resource_size_t skip_len)
+{
+ resource_size_t skipped = __adjust_skip(cxlds, skip_base, skip_len,
+ dev_name(&cxled->cxld.dev));
+
+ if (skipped == skip_len)
+ return 0;
+
+ dev_dbg(cxlds->dev,
+ "%s: failed to reserve skipped space (%pa %pa %pa)\n",
+ dev_name(&cxled->cxld.dev), &skip_base, &skip_len, &skipped);
+
+ release_skip(cxlds, skip_base, skipped);
+
+ return -EBUSY;
+}
+
+static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
+ resource_size_t base, resource_size_t len,
+ resource_size_t skipped)
+{
+ struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+ struct cxl_port *port = cxled_to_port(cxled);
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+ struct device *dev = &port->dev;
+ struct resource *res;
+ int rc;
+
+ lockdep_assert_held_write(&cxl_rwsem.dpa);
+
+ if (!len) {
+ dev_warn(dev, "decoder%d.%d: empty reservation attempted\n",
+ port->id, cxled->cxld.id);
+ return -EINVAL;
+ }
+
+ if (cxled->dpa_res) {
+ dev_dbg(dev, "decoder%d.%d: existing allocation %pr assigned\n",
+ port->id, cxled->cxld.id, cxled->dpa_res);
+ return -EBUSY;
+ }
+
+ if (port->hdm_end + 1 != cxled->cxld.id) {
+ /*
+ * Assumes alloc and commit order is always in hardware instance
+ * order per expectations from 8.2.5.12.20 Committing Decoder
+ * Programming that enforce decoder[m] committed before
+ * decoder[m+1] commit start.
+ */
+ dev_dbg(dev, "decoder%d.%d: expected decoder%d.%d\n", port->id,
+ cxled->cxld.id, port->id, port->hdm_end + 1);
+ return -EBUSY;
+ }
+
+ if (skipped) {
+ rc = request_skip(cxlds, cxled, base - skipped, skipped);
+ if (rc)
+ return rc;
+ }
+ res = __request_region(&cxlds->dpa_res, base, len,
+ dev_name(&cxled->cxld.dev), 0);
+ if (!res) {
+ dev_dbg(dev, "decoder%d.%d: failed to reserve allocation\n",
+ port->id, cxled->cxld.id);
+ if (skipped)
+ release_skip(cxlds, base - skipped, skipped);
+ return -EBUSY;
+ }
+ cxled->dpa_res = res;
+ cxled->skip = skipped;
+
+ /*
+	 * When allocating new capacity, ->part is already set; when
+	 * discovering decoder settings at initial enumeration, it is
+	 * not.
+ */
+ if (cxled->part < 0)
+		for (int i = 0; i < cxlds->nr_partitions; i++)
+ if (resource_contains(&cxlds->part[i].res, res)) {
+ cxled->part = i;
+ break;
+ }
+
+ if (cxled->part < 0)
+ dev_warn(dev, "decoder%d.%d: %pr does not map any partition\n",
+ port->id, cxled->cxld.id, res);
+
+ port->hdm_end++;
+ get_device(&cxled->cxld.dev);
+ return 0;
+}
+
+static int add_dpa_res(struct device *dev, struct resource *parent,
+ struct resource *res, resource_size_t start,
+ resource_size_t size, const char *type)
+{
+ int rc;
+
+ *res = (struct resource) {
+ .name = type,
+ .start = start,
+ .end = start + size - 1,
+ .flags = IORESOURCE_MEM,
+ };
+ if (resource_size(res) == 0) {
+ dev_dbg(dev, "DPA(%s): no capacity\n", res->name);
+ return 0;
+ }
+ rc = request_resource(parent, res);
+ if (rc) {
+ dev_err(dev, "DPA(%s): failed to track %pr (%d)\n", res->name,
+ res, rc);
+ return rc;
+ }
+
+ dev_dbg(dev, "DPA(%s): %pr\n", res->name, res);
+
+ return 0;
+}
+
+static const char *cxl_mode_name(enum cxl_partition_mode mode)
+{
+ switch (mode) {
+ case CXL_PARTMODE_RAM:
+ return "ram";
+ case CXL_PARTMODE_PMEM:
+ return "pmem";
+ default:
+ return "";
+ };
+}
+
+/* if this fails the caller must destroy @cxlds, there is no recovery */
+int cxl_dpa_setup(struct cxl_dev_state *cxlds, const struct cxl_dpa_info *info)
+{
+ struct device *dev = cxlds->dev;
+
+ guard(rwsem_write)(&cxl_rwsem.dpa);
+
+ if (cxlds->nr_partitions)
+ return -EBUSY;
+
+ if (!info->size || !info->nr_partitions) {
+ cxlds->dpa_res = DEFINE_RES_MEM(0, 0);
+ cxlds->nr_partitions = 0;
+ return 0;
+ }
+
+ cxlds->dpa_res = DEFINE_RES_MEM(0, info->size);
+
+ for (int i = 0; i < info->nr_partitions; i++) {
+ const struct cxl_dpa_part_info *part = &info->part[i];
+ int rc;
+
+ cxlds->part[i].perf.qos_class = CXL_QOS_CLASS_INVALID;
+ cxlds->part[i].mode = part->mode;
+
+ /* Require ordered + contiguous partitions */
+ if (i) {
+ const struct cxl_dpa_part_info *prev = &info->part[i - 1];
+
+ if (prev->range.end + 1 != part->range.start)
+ return -EINVAL;
+ }
+ rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->part[i].res,
+ part->range.start, range_len(&part->range),
+ cxl_mode_name(part->mode));
+ if (rc)
+ return rc;
+ cxlds->nr_partitions++;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(cxl_dpa_setup);
+
+int devm_cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
+ resource_size_t base, resource_size_t len,
+ resource_size_t skipped)
+{
+ struct cxl_port *port = cxled_to_port(cxled);
+ int rc;
+
+ scoped_guard(rwsem_write, &cxl_rwsem.dpa)
+ rc = __cxl_dpa_reserve(cxled, base, len, skipped);
+
+ if (rc)
+ return rc;
+
+ return devm_add_action_or_reset(&port->dev, cxl_dpa_release, cxled);
+}
+EXPORT_SYMBOL_NS_GPL(devm_cxl_dpa_reserve, "CXL");
+
+resource_size_t cxl_dpa_size(struct cxl_endpoint_decoder *cxled)
+{
+ guard(rwsem_read)(&cxl_rwsem.dpa);
+ if (cxled->dpa_res)
+ return resource_size(cxled->dpa_res);
+
+ return 0;
+}
+
+resource_size_t cxl_dpa_resource_start(struct cxl_endpoint_decoder *cxled)
+{
+ resource_size_t base = -1;
+
+ lockdep_assert_held(&cxl_rwsem.dpa);
+ if (cxled->dpa_res)
+ base = cxled->dpa_res->start;
+
+ return base;
+}
+
+bool cxl_resource_contains_addr(const struct resource *res, const resource_size_t addr)
+{
+ struct resource _addr = DEFINE_RES_MEM(addr, 1);
+
+ return resource_contains(res, &_addr);
+}
+
+int cxl_dpa_free(struct cxl_endpoint_decoder *cxled)
+{
+ struct cxl_port *port = cxled_to_port(cxled);
+ struct device *dev = &cxled->cxld.dev;
+
+ guard(rwsem_write)(&cxl_rwsem.dpa);
+ if (!cxled->dpa_res)
+ return 0;
+ if (cxled->cxld.region) {
+ dev_dbg(dev, "decoder assigned to: %s\n",
+ dev_name(&cxled->cxld.region->dev));
+ return -EBUSY;
+ }
+ if (cxled->cxld.flags & CXL_DECODER_F_ENABLE) {
+ dev_dbg(dev, "decoder enabled\n");
+ return -EBUSY;
+ }
+ if (cxled->cxld.id != port->hdm_end) {
+ dev_dbg(dev, "expected decoder%d.%d\n", port->id,
+ port->hdm_end);
+ return -EBUSY;
+ }
+
+ devm_cxl_dpa_release(cxled);
+ return 0;
+}
+
+int cxl_dpa_set_part(struct cxl_endpoint_decoder *cxled,
+ enum cxl_partition_mode mode)
+{
+ struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+ struct device *dev = &cxled->cxld.dev;
+ int part;
+
+ guard(rwsem_write)(&cxl_rwsem.dpa);
+ if (cxled->cxld.flags & CXL_DECODER_F_ENABLE)
+ return -EBUSY;
+
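+	/* Find the first partition that operates in the requested mode */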
+ for (part = 0; part < cxlds->nr_partitions; part++)
+ if (cxlds->part[part].mode == mode)
+ break;
+
+ if (part >= cxlds->nr_partitions) {
+ dev_dbg(dev, "unsupported mode: %d\n", mode);
+ return -EINVAL;
+ }
+
+ if (!resource_size(&cxlds->part[part].res)) {
+ dev_dbg(dev, "no available capacity for mode: %d\n", mode);
+ return -ENXIO;
+ }
+
+ cxled->part = part;
+ return 0;
+}
+
+static int __cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, u64 size)
+{
+ struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+ struct device *dev = &cxled->cxld.dev;
+ struct resource *res, *prev = NULL;
+ resource_size_t start, avail, skip, skip_start;
+ struct resource *p, *last;
+ int part;
+
+ guard(rwsem_write)(&cxl_rwsem.dpa);
+ if (cxled->cxld.region) {
+ dev_dbg(dev, "decoder attached to %s\n",
+ dev_name(&cxled->cxld.region->dev));
+ return -EBUSY;
+ }
+
+ if (cxled->cxld.flags & CXL_DECODER_F_ENABLE) {
+ dev_dbg(dev, "decoder enabled\n");
+ return -EBUSY;
+ }
+
+ part = cxled->part;
+ if (part < 0) {
+ dev_dbg(dev, "partition not set\n");
+ return -EBUSY;
+ }
+
+ res = &cxlds->part[part].res;
+ for (p = res->child, last = NULL; p; p = p->sibling)
+ last = p;
+ if (last)
+ start = last->end + 1;
+ else
+ start = res->start;
+
+ /*
+ * To allocate at partition N, a skip needs to be calculated for all
+	 * unallocated space at lower partition indices.
+ *
+ * If a partition has any allocations, the search can end because a
+ * previous cxl_dpa_alloc() invocation is assumed to have accounted for
+ * all previous partitions.
+ */
+ skip_start = CXL_RESOURCE_NONE;
+ for (int i = part; i; i--) {
+ prev = &cxlds->part[i - 1].res;
+ for (p = prev->child, last = NULL; p; p = p->sibling)
+ last = p;
+ if (last) {
+ skip_start = last->end + 1;
+ break;
+ }
+ skip_start = prev->start;
+ }
+
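+	/*
+	 * avail is the free tail capacity of the target partition; skip
+	 * covers any unallocated space between the end of the last lower
+	 * partition allocation and the start of the target partition.
+	 */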
+ avail = res->end - start + 1;
+ if (skip_start == CXL_RESOURCE_NONE)
+ skip = 0;
+ else
+ skip = res->start - skip_start;
+
+ if (size > avail) {
+ dev_dbg(dev, "%llu exceeds available %s capacity: %llu\n", size,
+ res->name, (u64)avail);
+ return -ENOSPC;
+ }
+
+ return __cxl_dpa_reserve(cxled, start, size, skip);
+}
+
+int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, u64 size)
+{
+ struct cxl_port *port = cxled_to_port(cxled);
+ int rc;
+
+ rc = __cxl_dpa_alloc(cxled, size);
+ if (rc)
+ return rc;
+
+ return devm_add_action_or_reset(&port->dev, cxl_dpa_release, cxled);
+}
+
+static void cxld_set_interleave(struct cxl_decoder *cxld, u32 *ctrl)
+{
+ u16 eig;
+ u8 eiw;
+
+ /*
+ * Input validation ensures these warns never fire, but otherwise
+	 * suppress uninitialized variable usage warnings.
+ */
+ if (WARN_ONCE(ways_to_eiw(cxld->interleave_ways, &eiw),
+ "invalid interleave_ways: %d\n", cxld->interleave_ways))
+ return;
+ if (WARN_ONCE(granularity_to_eig(cxld->interleave_granularity, &eig),
+ "invalid interleave_granularity: %d\n",
+ cxld->interleave_granularity))
+ return;
+
+ u32p_replace_bits(ctrl, eig, CXL_HDM_DECODER0_CTRL_IG_MASK);
+ u32p_replace_bits(ctrl, eiw, CXL_HDM_DECODER0_CTRL_IW_MASK);
+ *ctrl |= CXL_HDM_DECODER0_CTRL_COMMIT;
+}
+
+static void cxld_set_type(struct cxl_decoder *cxld, u32 *ctrl)
+{
+ u32p_replace_bits(ctrl,
+ !!(cxld->target_type == CXL_DECODER_HOSTONLYMEM),
+ CXL_HDM_DECODER0_CTRL_HOSTONLY);
+}
+
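+/* Pack up to 8 dport ids, one byte each, into the 64-bit Target List value */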
+static void cxlsd_set_targets(struct cxl_switch_decoder *cxlsd, u64 *tgt)
+{
+ struct cxl_dport **t = &cxlsd->target[0];
+ int ways = cxlsd->cxld.interleave_ways;
+
+ *tgt = FIELD_PREP(GENMASK(7, 0), t[0]->port_id);
+ if (ways > 1)
+ *tgt |= FIELD_PREP(GENMASK(15, 8), t[1]->port_id);
+ if (ways > 2)
+ *tgt |= FIELD_PREP(GENMASK(23, 16), t[2]->port_id);
+ if (ways > 3)
+ *tgt |= FIELD_PREP(GENMASK(31, 24), t[3]->port_id);
+ if (ways > 4)
+ *tgt |= FIELD_PREP(GENMASK_ULL(39, 32), t[4]->port_id);
+ if (ways > 5)
+ *tgt |= FIELD_PREP(GENMASK_ULL(47, 40), t[5]->port_id);
+ if (ways > 6)
+ *tgt |= FIELD_PREP(GENMASK_ULL(55, 48), t[6]->port_id);
+ if (ways > 7)
+ *tgt |= FIELD_PREP(GENMASK_ULL(63, 56), t[7]->port_id);
+}
+
+/*
+ * Per CXL 2.0 8.2.5.12.20 Committing Decoder Programming, hardware must set
+ * committed or error within 10ms, but just be generous with 20ms to account for
+ * clock skew and other marginal behavior
+ */
+#define COMMIT_TIMEOUT_MS 20
+static int cxld_await_commit(void __iomem *hdm, int id)
+{
+ u32 ctrl;
+ int i;
+
+ for (i = 0; i < COMMIT_TIMEOUT_MS; i++) {
+ ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(id));
+ if (FIELD_GET(CXL_HDM_DECODER0_CTRL_COMMIT_ERROR, ctrl)) {
+ ctrl &= ~CXL_HDM_DECODER0_CTRL_COMMIT;
+ writel(ctrl, hdm + CXL_HDM_DECODER0_CTRL_OFFSET(id));
+ return -EIO;
+ }
+ if (FIELD_GET(CXL_HDM_DECODER0_CTRL_COMMITTED, ctrl))
+ return 0;
+ fsleep(1000);
+ }
+
+ return -ETIMEDOUT;
+}
+
+static void setup_hw_decoder(struct cxl_decoder *cxld, void __iomem *hdm)
+{
+ int id = cxld->id;
+ u64 base, size;
+ u32 ctrl;
+
+ /* common decoder settings */
+ ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(cxld->id));
+ cxld_set_interleave(cxld, &ctrl);
+ cxld_set_type(cxld, &ctrl);
+ base = cxld->hpa_range.start;
+ size = range_len(&cxld->hpa_range);
+
+ writel(upper_32_bits(base), hdm + CXL_HDM_DECODER0_BASE_HIGH_OFFSET(id));
+ writel(lower_32_bits(base), hdm + CXL_HDM_DECODER0_BASE_LOW_OFFSET(id));
+ writel(upper_32_bits(size), hdm + CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(id));
+ writel(lower_32_bits(size), hdm + CXL_HDM_DECODER0_SIZE_LOW_OFFSET(id));
+
+ if (is_switch_decoder(&cxld->dev)) {
+ struct cxl_switch_decoder *cxlsd =
+ to_cxl_switch_decoder(&cxld->dev);
+ void __iomem *tl_hi = hdm + CXL_HDM_DECODER0_TL_HIGH(id);
+ void __iomem *tl_lo = hdm + CXL_HDM_DECODER0_TL_LOW(id);
+ u64 targets;
+
+ cxlsd_set_targets(cxlsd, &targets);
+ writel(upper_32_bits(targets), tl_hi);
+ writel(lower_32_bits(targets), tl_lo);
+ } else {
+ struct cxl_endpoint_decoder *cxled =
+ to_cxl_endpoint_decoder(&cxld->dev);
+ void __iomem *sk_hi = hdm + CXL_HDM_DECODER0_SKIP_HIGH(id);
+ void __iomem *sk_lo = hdm + CXL_HDM_DECODER0_SKIP_LOW(id);
+
+ writel(upper_32_bits(cxled->skip), sk_hi);
+ writel(lower_32_bits(cxled->skip), sk_lo);
+ }
+
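+	/* Writing ctrl with the COMMIT bit set arms the decoder commit */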
+ writel(ctrl, hdm + CXL_HDM_DECODER0_CTRL_OFFSET(id));
+}
+
+static int cxl_decoder_commit(struct cxl_decoder *cxld)
+{
+ struct cxl_port *port = to_cxl_port(cxld->dev.parent);
+ struct cxl_hdm *cxlhdm = dev_get_drvdata(&port->dev);
+ void __iomem *hdm = cxlhdm->regs.hdm_decoder;
+ int id = cxld->id, rc;
+
+ if (cxld->flags & CXL_DECODER_F_ENABLE)
+ return 0;
+
+ if (cxl_num_decoders_committed(port) != id) {
+ dev_dbg(&port->dev,
+ "%s: out of order commit, expected decoder%d.%d\n",
+ dev_name(&cxld->dev), port->id,
+ cxl_num_decoders_committed(port));
+ return -EBUSY;
+ }
+
+ /*
+ * For endpoint decoders hosted on CXL memory devices that
+ * support the sanitize operation, make sure sanitize is not in-flight.
+ */
+ if (is_endpoint_decoder(&cxld->dev)) {
+ struct cxl_endpoint_decoder *cxled =
+ to_cxl_endpoint_decoder(&cxld->dev);
+ struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+ struct cxl_memdev_state *mds =
+ to_cxl_memdev_state(cxlmd->cxlds);
+
+ if (mds && mds->security.sanitize_active) {
+ dev_dbg(&cxlmd->dev,
+ "attempted to commit %s during sanitize\n",
+ dev_name(&cxld->dev));
+ return -EBUSY;
+ }
+ }
+
+ scoped_guard(rwsem_read, &cxl_rwsem.dpa)
+ setup_hw_decoder(cxld, hdm);
+
+ port->commit_end++;
+ rc = cxld_await_commit(hdm, cxld->id);
+ if (rc) {
+ dev_dbg(&port->dev, "%s: error %d committing decoder\n",
+ dev_name(&cxld->dev), rc);
+ cxld->reset(cxld);
+ return rc;
+ }
+ cxld->flags |= CXL_DECODER_F_ENABLE;
+
+ return 0;
+}
+
+static int commit_reap(struct device *dev, void *data)
+{
+ struct cxl_port *port = to_cxl_port(dev->parent);
+ struct cxl_decoder *cxld;
+
+ if (!is_switch_decoder(dev) && !is_endpoint_decoder(dev))
+ return 0;
+
+ cxld = to_cxl_decoder(dev);
+ if (port->commit_end == cxld->id &&
+ ((cxld->flags & CXL_DECODER_F_ENABLE) == 0)) {
+ port->commit_end--;
+ dev_dbg(&port->dev, "reap: %s commit_end: %d\n",
+ dev_name(&cxld->dev), port->commit_end);
+ }
+
+ return 0;
+}
+
+void cxl_port_commit_reap(struct cxl_decoder *cxld)
+{
+ struct cxl_port *port = to_cxl_port(cxld->dev.parent);
+
+ lockdep_assert_held_write(&cxl_rwsem.region);
+
+ /*
+ * Once the highest committed decoder is disabled, free any other
+	 * decoders that were left pinned by an out-of-order release.
+ */
+ port->commit_end--;
+ dev_dbg(&port->dev, "reap: %s commit_end: %d\n", dev_name(&cxld->dev),
+ port->commit_end);
+ device_for_each_child_reverse_from(&port->dev, &cxld->dev, NULL,
+ commit_reap);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_port_commit_reap, "CXL");
+
+static void cxl_decoder_reset(struct cxl_decoder *cxld)
+{
+ struct cxl_port *port = to_cxl_port(cxld->dev.parent);
+ struct cxl_hdm *cxlhdm = dev_get_drvdata(&port->dev);
+ void __iomem *hdm = cxlhdm->regs.hdm_decoder;
+ int id = cxld->id;
+ u32 ctrl;
+
+ if ((cxld->flags & CXL_DECODER_F_ENABLE) == 0)
+ return;
+
+ if (test_bit(CXL_DECODER_F_LOCK, &cxld->flags))
+ return;
+
+ if (port->commit_end == id)
+ cxl_port_commit_reap(cxld);
+ else
+ dev_dbg(&port->dev,
+ "%s: out of order reset, expected decoder%d.%d\n",
+ dev_name(&cxld->dev), port->id, port->commit_end);
+
+ ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(id));
+ ctrl &= ~CXL_HDM_DECODER0_CTRL_COMMIT;
+ writel(ctrl, hdm + CXL_HDM_DECODER0_CTRL_OFFSET(id));
+
+ writel(0, hdm + CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(id));
+ writel(0, hdm + CXL_HDM_DECODER0_SIZE_LOW_OFFSET(id));
+ writel(0, hdm + CXL_HDM_DECODER0_BASE_HIGH_OFFSET(id));
+ writel(0, hdm + CXL_HDM_DECODER0_BASE_LOW_OFFSET(id));
+
+ cxld->flags &= ~CXL_DECODER_F_ENABLE;
+
+ /* Userspace is now responsible for reconfiguring this decoder */
+ if (is_endpoint_decoder(&cxld->dev)) {
+ struct cxl_endpoint_decoder *cxled;
+
+ cxled = to_cxl_endpoint_decoder(&cxld->dev);
+ cxled->state = CXL_DECODER_STATE_MANUAL;
+ }
+}
+
+static int cxl_setup_hdm_decoder_from_dvsec(
+ struct cxl_port *port, struct cxl_decoder *cxld, u64 *dpa_base,
+ int which, struct cxl_endpoint_dvsec_info *info)
+{
+ struct cxl_endpoint_decoder *cxled;
+ u64 len;
+ int rc;
+
+ if (!is_cxl_endpoint(port))
+ return -EOPNOTSUPP;
+
+ cxled = to_cxl_endpoint_decoder(&cxld->dev);
+ len = range_len(&info->dvsec_range[which]);
+ if (!len)
+ return -ENOENT;
+
+ cxld->target_type = CXL_DECODER_HOSTONLYMEM;
+ cxld->commit = NULL;
+ cxld->reset = NULL;
+ cxld->hpa_range = info->dvsec_range[which];
+
+ /*
+ * Set the emulated decoder as locked pending additional support to
+ * change the range registers at run time.
+ */
+ cxld->flags |= CXL_DECODER_F_ENABLE | CXL_DECODER_F_LOCK;
+ port->commit_end = cxld->id;
+
+ rc = devm_cxl_dpa_reserve(cxled, *dpa_base, len, 0);
+ if (rc) {
+ dev_err(&port->dev,
+ "decoder%d.%d: Failed to reserve DPA range %#llx - %#llx\n (%d)",
+ port->id, cxld->id, *dpa_base, *dpa_base + len - 1, rc);
+ return rc;
+ }
+ *dpa_base += len;
+ cxled->state = CXL_DECODER_STATE_AUTO;
+
+ return 0;
+}
+
+static int init_hdm_decoder(struct cxl_port *port, struct cxl_decoder *cxld,
+ void __iomem *hdm, int which,
+ u64 *dpa_base, struct cxl_endpoint_dvsec_info *info)
+{
+ struct cxl_endpoint_decoder *cxled = NULL;
+ u64 size, base, skip, dpa_size, lo, hi;
+ bool committed;
+ u32 remainder;
+ int i, rc;
+ u32 ctrl;
+ union {
+ u64 value;
+ unsigned char target_id[8];
+ } target_list;
+
+ if (should_emulate_decoders(info))
+ return cxl_setup_hdm_decoder_from_dvsec(port, cxld, dpa_base,
+ which, info);
+
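+	/* Read back the control, HPA base, and size programmed into this decoder */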
+ ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(which));
+ lo = readl(hdm + CXL_HDM_DECODER0_BASE_LOW_OFFSET(which));
+ hi = readl(hdm + CXL_HDM_DECODER0_BASE_HIGH_OFFSET(which));
+ base = (hi << 32) + lo;
+ lo = readl(hdm + CXL_HDM_DECODER0_SIZE_LOW_OFFSET(which));
+ hi = readl(hdm + CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(which));
+ size = (hi << 32) + lo;
+ committed = !!(ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED);
+ cxld->commit = cxl_decoder_commit;
+ cxld->reset = cxl_decoder_reset;
+
+ if (!committed)
+ size = 0;
+ if (base == U64_MAX || size == U64_MAX) {
+ dev_warn(&port->dev, "decoder%d.%d: Invalid resource range\n",
+ port->id, cxld->id);
+ return -ENXIO;
+ }
+
+ if (info)
+ cxled = to_cxl_endpoint_decoder(&cxld->dev);
+ cxld->hpa_range = (struct range) {
+ .start = base,
+ .end = base + size - 1,
+ };
+
+ /* decoders are enabled if committed */
+ if (committed) {
+ cxld->flags |= CXL_DECODER_F_ENABLE;
+ if (ctrl & CXL_HDM_DECODER0_CTRL_LOCK)
+ cxld->flags |= CXL_DECODER_F_LOCK;
+ if (FIELD_GET(CXL_HDM_DECODER0_CTRL_HOSTONLY, ctrl))
+ cxld->target_type = CXL_DECODER_HOSTONLYMEM;
+ else
+ cxld->target_type = CXL_DECODER_DEVMEM;
+
+ guard(rwsem_write)(&cxl_rwsem.region);
+ if (cxld->id != cxl_num_decoders_committed(port)) {
+ dev_warn(&port->dev,
+ "decoder%d.%d: Committed out of order\n",
+ port->id, cxld->id);
+ return -ENXIO;
+ }
+
+ if (size == 0) {
+ dev_warn(&port->dev,
+ "decoder%d.%d: Committed with zero size\n",
+ port->id, cxld->id);
+ return -ENXIO;
+ }
+ port->commit_end = cxld->id;
+ } else {
+ if (cxled) {
+ struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+
+ /*
+ * Default by devtype until a device arrives that needs
+ * more precision.
+ */
+ if (cxlds->type == CXL_DEVTYPE_CLASSMEM)
+ cxld->target_type = CXL_DECODER_HOSTONLYMEM;
+ else
+ cxld->target_type = CXL_DECODER_DEVMEM;
+ } else {
+ /* To be overridden by region type at commit time */
+ cxld->target_type = CXL_DECODER_HOSTONLYMEM;
+ }
+
+ if (!FIELD_GET(CXL_HDM_DECODER0_CTRL_HOSTONLY, ctrl) &&
+ cxld->target_type == CXL_DECODER_HOSTONLYMEM) {
+ ctrl |= CXL_HDM_DECODER0_CTRL_HOSTONLY;
+ writel(ctrl, hdm + CXL_HDM_DECODER0_CTRL_OFFSET(which));
+ }
+ }
+ rc = eiw_to_ways(FIELD_GET(CXL_HDM_DECODER0_CTRL_IW_MASK, ctrl),
+ &cxld->interleave_ways);
+ if (rc) {
+ dev_warn(&port->dev,
+ "decoder%d.%d: Invalid interleave ways (ctrl: %#x)\n",
+ port->id, cxld->id, ctrl);
+ return rc;
+ }
+ rc = eig_to_granularity(FIELD_GET(CXL_HDM_DECODER0_CTRL_IG_MASK, ctrl),
+ &cxld->interleave_granularity);
+ if (rc) {
+ dev_warn(&port->dev,
+ "decoder%d.%d: Invalid interleave granularity (ctrl: %#x)\n",
+ port->id, cxld->id, ctrl);
+ return rc;
+ }
+
+ dev_dbg(&port->dev, "decoder%d.%d: range: %#llx-%#llx iw: %d ig: %d\n",
+ port->id, cxld->id, cxld->hpa_range.start, cxld->hpa_range.end,
+ cxld->interleave_ways, cxld->interleave_granularity);
+
+ if (!cxled) {
+ lo = readl(hdm + CXL_HDM_DECODER0_TL_LOW(which));
+ hi = readl(hdm + CXL_HDM_DECODER0_TL_HIGH(which));
+ target_list.value = (hi << 32) + lo;
+ for (i = 0; i < cxld->interleave_ways; i++)
+ cxld->target_map[i] = target_list.target_id[i];
+
+ return 0;
+ }
+
+ if (!committed)
+ return 0;
+
+ dpa_size = div_u64_rem(size, cxld->interleave_ways, &remainder);
+ if (remainder) {
+ dev_err(&port->dev,
+ "decoder%d.%d: invalid committed configuration size: %#llx ways: %d\n",
+ port->id, cxld->id, size, cxld->interleave_ways);
+ return -ENXIO;
+ }
+ lo = readl(hdm + CXL_HDM_DECODER0_SKIP_LOW(which));
+ hi = readl(hdm + CXL_HDM_DECODER0_SKIP_HIGH(which));
+ skip = (hi << 32) + lo;
+ rc = devm_cxl_dpa_reserve(cxled, *dpa_base + skip, dpa_size, skip);
+ if (rc) {
+ dev_err(&port->dev,
+ "decoder%d.%d: Failed to reserve DPA range %#llx - %#llx\n (%d)",
+ port->id, cxld->id, *dpa_base,
+ *dpa_base + dpa_size + skip - 1, rc);
+ return rc;
+ }
+ *dpa_base += dpa_size + skip;
+
+ cxled->state = CXL_DECODER_STATE_AUTO;
+
+ return 0;
+}
+
+static void cxl_settle_decoders(struct cxl_hdm *cxlhdm)
+{
+ void __iomem *hdm = cxlhdm->regs.hdm_decoder;
+ int committed, i;
+ u32 ctrl;
+
+ if (!hdm)
+ return;
+
+ /*
+	 * Since the register resource was recently claimed via request_region(),
+ * be careful about trusting the "not-committed" status until the commit
+ * timeout has elapsed. The commit timeout is 10ms (CXL 2.0
+ * 8.2.5.12.20), but double it to be tolerant of any clock skew between
+ * host and target.
+ */
+ for (i = 0, committed = 0; i < cxlhdm->decoder_count; i++) {
+ ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(i));
+ if (ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED)
+ committed++;
+ }
+
+ /* ensure that future checks of committed can be trusted */
+ if (committed != cxlhdm->decoder_count)
+ msleep(20);
+}
+
+/**
+ * devm_cxl_enumerate_decoders - add decoder objects per HDM register set
+ * @cxlhdm: Structure to populate with HDM capabilities
+ * @info: cached DVSEC range register info
+ */
+static int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm,
+ struct cxl_endpoint_dvsec_info *info)
+{
+ void __iomem *hdm = cxlhdm->regs.hdm_decoder;
+ struct cxl_port *port = cxlhdm->port;
+ int i;
+ u64 dpa_base = 0;
+
+ cxl_settle_decoders(cxlhdm);
+
+ for (i = 0; i < cxlhdm->decoder_count; i++) {
+ int rc, target_count = cxlhdm->target_count;
+ struct cxl_decoder *cxld;
+
+ if (is_cxl_endpoint(port)) {
+ struct cxl_endpoint_decoder *cxled;
+
+ cxled = cxl_endpoint_decoder_alloc(port);
+ if (IS_ERR(cxled)) {
+ dev_warn(&port->dev,
+ "Failed to allocate decoder%d.%d\n",
+ port->id, i);
+ return PTR_ERR(cxled);
+ }
+ cxld = &cxled->cxld;
+ } else {
+ struct cxl_switch_decoder *cxlsd;
+
+ cxlsd = cxl_switch_decoder_alloc(port, target_count);
+ if (IS_ERR(cxlsd)) {
+ dev_warn(&port->dev,
+ "Failed to allocate decoder%d.%d\n",
+ port->id, i);
+ return PTR_ERR(cxlsd);
+ }
+ cxld = &cxlsd->cxld;
+ }
+
+ rc = init_hdm_decoder(port, cxld, hdm, i, &dpa_base, info);
+ if (rc) {
+ dev_warn(&port->dev,
+ "Failed to initialize decoder%d.%d\n",
+ port->id, i);
+ put_device(&cxld->dev);
+ return rc;
+ }
+ rc = add_hdm_decoder(port, cxld);
+ if (rc) {
+ dev_warn(&port->dev,
+ "Failed to add decoder%d.%d\n", port->id, i);
+ return rc;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * __devm_cxl_switch_port_decoders_setup - allocate and setup switch decoders
+ * @port: CXL port context
+ *
+ * Return: 0 on success or -errno on error
+ */
+int __devm_cxl_switch_port_decoders_setup(struct cxl_port *port)
+{
+ struct cxl_hdm *cxlhdm;
+
+ if (is_cxl_root(port) || is_cxl_endpoint(port))
+ return -EOPNOTSUPP;
+
+ cxlhdm = devm_cxl_setup_hdm(port, NULL);
+ if (!IS_ERR(cxlhdm))
+ return devm_cxl_enumerate_decoders(cxlhdm, NULL);
+
+ if (PTR_ERR(cxlhdm) != -ENODEV) {
+ dev_err(&port->dev, "Failed to map HDM decoder capability\n");
+ return PTR_ERR(cxlhdm);
+ }
+
+ if (cxl_port_get_possible_dports(port) == 1) {
+ dev_dbg(&port->dev, "Fallback to passthrough decoder\n");
+ return devm_cxl_add_passthrough_decoder(port);
+ }
+
+ dev_err(&port->dev, "HDM decoder capability not found\n");
+ return -ENXIO;
+}
+EXPORT_SYMBOL_NS_GPL(__devm_cxl_switch_port_decoders_setup, "CXL");
+
+/**
+ * devm_cxl_endpoint_decoders_setup - allocate and setup endpoint decoders
+ * @port: CXL port context
+ *
+ * Return: 0 on success or -errno on error
+ */
+int devm_cxl_endpoint_decoders_setup(struct cxl_port *port)
+{
+ struct cxl_memdev *cxlmd = to_cxl_memdev(port->uport_dev);
+ struct cxl_endpoint_dvsec_info info = { .port = port };
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+ struct cxl_hdm *cxlhdm;
+ int rc;
+
+ if (!is_cxl_endpoint(port))
+ return -EOPNOTSUPP;
+
+ rc = cxl_dvsec_rr_decode(cxlds, &info);
+ if (rc < 0)
+ return rc;
+
+ cxlhdm = devm_cxl_setup_hdm(port, &info);
+ if (IS_ERR(cxlhdm)) {
+ if (PTR_ERR(cxlhdm) == -ENODEV)
+ dev_err(&port->dev, "HDM decoder registers not found\n");
+ return PTR_ERR(cxlhdm);
+ }
+
+ rc = cxl_hdm_decode_init(cxlds, cxlhdm, &info);
+ if (rc)
+ return rc;
+
+ return devm_cxl_enumerate_decoders(cxlhdm, &info);
+}
+EXPORT_SYMBOL_NS_GPL(devm_cxl_endpoint_decoders_setup, "CXL");
diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
new file mode 100644
index 000000000000..fa6dd0c94656
--- /dev/null
+++ b/drivers/cxl/core/mbox.c
@@ -0,0 +1,1552 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2020 Intel Corporation. All rights reserved. */
+#include <linux/security.h>
+#include <linux/debugfs.h>
+#include <linux/ktime.h>
+#include <linux/mutex.h>
+#include <linux/unaligned.h>
+#include <cxlpci.h>
+#include <cxlmem.h>
+#include <cxl.h>
+
+#include "core.h"
+#include "trace.h"
+#include "mce.h"
+
+static bool cxl_raw_allow_all;
+
+/**
+ * DOC: cxl mbox
+ *
+ * Core implementation of the CXL 2.0 Type-3 Memory Device Mailbox. The
+ * implementation is used by the cxl_pci driver to initialize the device
+ * and implement the cxl_mem.h IOCTL UAPI. It also implements the
+ * backend of the cxl_pmem_ctl() transport for LIBNVDIMM.
+ */
+
+#define cxl_for_each_cmd(cmd) \
+ for ((cmd) = &cxl_mem_commands[0]; \
+ ((cmd) - cxl_mem_commands) < ARRAY_SIZE(cxl_mem_commands); (cmd)++)
+
+#define CXL_CMD(_id, sin, sout, _flags) \
+ [CXL_MEM_COMMAND_ID_##_id] = { \
+ .info = { \
+ .id = CXL_MEM_COMMAND_ID_##_id, \
+ .size_in = sin, \
+ .size_out = sout, \
+ }, \
+ .opcode = CXL_MBOX_OP_##_id, \
+ .flags = _flags, \
+ }
+
+#define CXL_VARIABLE_PAYLOAD ~0U
+/*
+ * This table defines the supported mailbox commands for the driver. This table
+ * is made up of a UAPI structure. Non-negative values as parameters in the
+ * table will be validated against the user's input. For example, if size_in is
+ * 0, and the user passed in 1, it is an error.
+ */
+static struct cxl_mem_command cxl_mem_commands[CXL_MEM_COMMAND_ID_MAX] = {
+ CXL_CMD(IDENTIFY, 0, 0x43, CXL_CMD_FLAG_FORCE_ENABLE),
+#ifdef CONFIG_CXL_MEM_RAW_COMMANDS
+ CXL_CMD(RAW, CXL_VARIABLE_PAYLOAD, CXL_VARIABLE_PAYLOAD, 0),
+#endif
+ CXL_CMD(GET_SUPPORTED_LOGS, 0, CXL_VARIABLE_PAYLOAD, CXL_CMD_FLAG_FORCE_ENABLE),
+ CXL_CMD(GET_FW_INFO, 0, 0x50, 0),
+ CXL_CMD(GET_PARTITION_INFO, 0, 0x20, 0),
+ CXL_CMD(GET_LSA, 0x8, CXL_VARIABLE_PAYLOAD, 0),
+ CXL_CMD(GET_HEALTH_INFO, 0, 0x12, 0),
+ CXL_CMD(GET_LOG, 0x18, CXL_VARIABLE_PAYLOAD, CXL_CMD_FLAG_FORCE_ENABLE),
+ CXL_CMD(GET_LOG_CAPS, 0x10, 0x4, 0),
+ CXL_CMD(CLEAR_LOG, 0x10, 0, 0),
+ CXL_CMD(GET_SUP_LOG_SUBLIST, 0x2, CXL_VARIABLE_PAYLOAD, 0),
+ CXL_CMD(SET_PARTITION_INFO, 0x0a, 0, 0),
+ CXL_CMD(SET_LSA, CXL_VARIABLE_PAYLOAD, 0, 0),
+ CXL_CMD(GET_ALERT_CONFIG, 0, 0x10, 0),
+ CXL_CMD(SET_ALERT_CONFIG, 0xc, 0, 0),
+ CXL_CMD(GET_SHUTDOWN_STATE, 0, 0x1, 0),
+ CXL_CMD(SET_SHUTDOWN_STATE, 0x1, 0, 0),
+ CXL_CMD(GET_SCAN_MEDIA_CAPS, 0x10, 0x4, 0),
+ CXL_CMD(GET_TIMESTAMP, 0, 0x8, 0),
+};
+
+/*
+ * Commands that RAW doesn't permit. The rationale for each:
+ *
+ * CXL_MBOX_OP_ACTIVATE_FW: Firmware activation requires adjustment /
+ * coordination of transaction timeout values at the root bridge level.
+ *
+ * CXL_MBOX_OP_SET_PARTITION_INFO: The device memory map may change live
+ * and needs to be coordinated with HDM updates.
+ *
+ * CXL_MBOX_OP_SET_LSA: The label storage area may be cached by the
+ * driver and any writes from userspace invalidates those contents.
+ *
+ * CXL_MBOX_OP_SET_SHUTDOWN_STATE: Set shutdown state assumes no writes
+ * to the device after it is marked clean; userspace cannot make that
+ * assertion.
+ *
+ * CXL_MBOX_OP_[GET_]SCAN_MEDIA: The kernel provides a native error list that
+ * is kept up to date with patrol notifications and error management.
+ *
+ * CXL_MBOX_OP_[GET_,INJECT_,CLEAR_]POISON: These commands require kernel
+ * driver orchestration for safety.
+ */
+static u16 cxl_disabled_raw_commands[] = {
+ CXL_MBOX_OP_ACTIVATE_FW,
+ CXL_MBOX_OP_SET_PARTITION_INFO,
+ CXL_MBOX_OP_SET_LSA,
+ CXL_MBOX_OP_SET_SHUTDOWN_STATE,
+ CXL_MBOX_OP_SCAN_MEDIA,
+ CXL_MBOX_OP_GET_SCAN_MEDIA,
+ CXL_MBOX_OP_GET_POISON,
+ CXL_MBOX_OP_INJECT_POISON,
+ CXL_MBOX_OP_CLEAR_POISON,
+};
+
+/*
+ * Command sets that RAW doesn't permit. All opcodes in this set are
+ * disabled because they pass plain text security payloads over the
+ * user/kernel boundary. This functionality is intended to be wrapped
+ * behind the keys ABI, which allows for encrypted payloads in the UAPI.
+ */
+static u8 security_command_sets[] = {
+ 0x44, /* Sanitize */
+ 0x45, /* Persistent Memory Data-at-rest Security */
+ 0x46, /* Security Passthrough */
+};
+
+static bool cxl_is_security_command(u16 opcode)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(security_command_sets); i++)
+ if (security_command_sets[i] == (opcode >> 8))
+ return true;
+ return false;
+}
+
+static void cxl_set_security_cmd_enabled(struct cxl_security_state *security,
+ u16 opcode)
+{
+ switch (opcode) {
+ case CXL_MBOX_OP_SANITIZE:
+ set_bit(CXL_SEC_ENABLED_SANITIZE, security->enabled_cmds);
+ break;
+ case CXL_MBOX_OP_SECURE_ERASE:
+ set_bit(CXL_SEC_ENABLED_SECURE_ERASE,
+ security->enabled_cmds);
+ break;
+ case CXL_MBOX_OP_GET_SECURITY_STATE:
+ set_bit(CXL_SEC_ENABLED_GET_SECURITY_STATE,
+ security->enabled_cmds);
+ break;
+ case CXL_MBOX_OP_SET_PASSPHRASE:
+ set_bit(CXL_SEC_ENABLED_SET_PASSPHRASE,
+ security->enabled_cmds);
+ break;
+ case CXL_MBOX_OP_DISABLE_PASSPHRASE:
+ set_bit(CXL_SEC_ENABLED_DISABLE_PASSPHRASE,
+ security->enabled_cmds);
+ break;
+ case CXL_MBOX_OP_UNLOCK:
+ set_bit(CXL_SEC_ENABLED_UNLOCK, security->enabled_cmds);
+ break;
+ case CXL_MBOX_OP_FREEZE_SECURITY:
+ set_bit(CXL_SEC_ENABLED_FREEZE_SECURITY,
+ security->enabled_cmds);
+ break;
+ case CXL_MBOX_OP_PASSPHRASE_SECURE_ERASE:
+ set_bit(CXL_SEC_ENABLED_PASSPHRASE_SECURE_ERASE,
+ security->enabled_cmds);
+ break;
+ default:
+ break;
+ }
+}
+
+static bool cxl_is_poison_command(u16 opcode)
+{
+#define CXL_MBOX_OP_POISON_CMDS 0x43
+
+ if ((opcode >> 8) == CXL_MBOX_OP_POISON_CMDS)
+ return true;
+
+ return false;
+}
+
+static void cxl_set_poison_cmd_enabled(struct cxl_poison_state *poison,
+ u16 opcode)
+{
+ switch (opcode) {
+ case CXL_MBOX_OP_GET_POISON:
+ set_bit(CXL_POISON_ENABLED_LIST, poison->enabled_cmds);
+ break;
+ case CXL_MBOX_OP_INJECT_POISON:
+ set_bit(CXL_POISON_ENABLED_INJECT, poison->enabled_cmds);
+ break;
+ case CXL_MBOX_OP_CLEAR_POISON:
+ set_bit(CXL_POISON_ENABLED_CLEAR, poison->enabled_cmds);
+ break;
+ case CXL_MBOX_OP_GET_SCAN_MEDIA_CAPS:
+ set_bit(CXL_POISON_ENABLED_SCAN_CAPS, poison->enabled_cmds);
+ break;
+ case CXL_MBOX_OP_SCAN_MEDIA:
+ set_bit(CXL_POISON_ENABLED_SCAN_MEDIA, poison->enabled_cmds);
+ break;
+ case CXL_MBOX_OP_GET_SCAN_MEDIA:
+ set_bit(CXL_POISON_ENABLED_SCAN_RESULTS, poison->enabled_cmds);
+ break;
+ default:
+ break;
+ }
+}
+
+static struct cxl_mem_command *cxl_mem_find_command(u16 opcode)
+{
+ struct cxl_mem_command *c;
+
+ cxl_for_each_cmd(c)
+ if (c->opcode == opcode)
+ return c;
+
+ return NULL;
+}
+
+static const char *cxl_mem_opcode_to_name(u16 opcode)
+{
+ struct cxl_mem_command *c;
+
+ c = cxl_mem_find_command(opcode);
+ if (!c)
+ return NULL;
+
+ return cxl_command_names[c->info.id].name;
+}
+
+/**
+ * cxl_internal_send_cmd() - Kernel internal interface to send a mailbox command
+ * @cxl_mbox: CXL mailbox context
+ * @mbox_cmd: initialized command to execute
+ *
+ * Context: Any context.
+ * Return:
+ * * %>=0 - Number of bytes returned in @out.
+ * * %-E2BIG - Payload is too large for hardware.
+ * * %-EBUSY - Couldn't acquire exclusive mailbox access.
+ * * %-EFAULT - Hardware error occurred.
+ * * %-ENXIO - Command completed, but device reported an error.
+ * * %-EIO - Unexpected output size.
+ *
+ * Mailbox commands may execute successfully yet the device itself reported an
+ * error. While this distinction can be useful for commands from userspace, the
+ * kernel will only be able to use results when both are successful.
+ */
+int cxl_internal_send_cmd(struct cxl_mailbox *cxl_mbox,
+ struct cxl_mbox_cmd *mbox_cmd)
+{
+ size_t out_size, min_out;
+ int rc;
+
+ if (mbox_cmd->size_in > cxl_mbox->payload_size ||
+ mbox_cmd->size_out > cxl_mbox->payload_size)
+ return -E2BIG;
+
+ out_size = mbox_cmd->size_out;
+ min_out = mbox_cmd->min_out;
+ rc = cxl_mbox->mbox_send(cxl_mbox, mbox_cmd);
+ /*
+ * EIO is reserved for a payload size mismatch and mbox_send()
+ * may not return this error.
+ */
+ if (WARN_ONCE(rc == -EIO, "Bad return code: -EIO"))
+ return -ENXIO;
+ if (rc)
+ return rc;
+
+ if (mbox_cmd->return_code != CXL_MBOX_CMD_RC_SUCCESS &&
+ mbox_cmd->return_code != CXL_MBOX_CMD_RC_BACKGROUND)
+ return cxl_mbox_cmd_rc2errno(mbox_cmd);
+
+ if (!out_size)
+ return 0;
+
+ /*
+ * Variable sized output needs to at least satisfy the caller's
+ * minimum if not the fully requested size.
+ */
+ if (min_out == 0)
+ min_out = out_size;
+
+ if (mbox_cmd->size_out < min_out)
+ return -EIO;
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_internal_send_cmd, "CXL");
+
+static bool cxl_mem_raw_command_allowed(u16 opcode)
+{
+ int i;
+
+ if (!IS_ENABLED(CONFIG_CXL_MEM_RAW_COMMANDS))
+ return false;
+
+ if (security_locked_down(LOCKDOWN_PCI_ACCESS))
+ return false;
+
+ if (cxl_raw_allow_all)
+ return true;
+
+ if (cxl_is_security_command(opcode))
+ return false;
+
+ for (i = 0; i < ARRAY_SIZE(cxl_disabled_raw_commands); i++)
+ if (cxl_disabled_raw_commands[i] == opcode)
+ return false;
+
+ return true;
+}
+
+/**
+ * cxl_payload_from_user_allowed() - Check contents of in_payload.
+ * @opcode: The mailbox command opcode.
+ * @payload_in: Pointer to the input payload passed in from user space.
+ *
+ * Return:
+ * * true - payload_in passes check for @opcode.
+ * * false - payload_in contains invalid or unsupported values.
+ *
+ * The driver may inspect payload contents before sending a mailbox
+ * command from user space to the device. The intent is to reject
+ * commands with input payloads that are known to be unsafe. This
+ * check is not intended to replace the user's careful selection of
+ * mailbox command parameters and makes no guarantee that the user
+ * command will succeed, nor that it is appropriate.
+ *
+ * The specific checks are determined by the opcode.
+ */
+static bool cxl_payload_from_user_allowed(u16 opcode, void *payload_in)
+{
+ switch (opcode) {
+ case CXL_MBOX_OP_SET_PARTITION_INFO: {
+ struct cxl_mbox_set_partition_info *pi = payload_in;
+
+ if (pi->flags & CXL_SET_PARTITION_IMMEDIATE_FLAG)
+ return false;
+ break;
+ }
+ case CXL_MBOX_OP_CLEAR_LOG: {
+ const uuid_t *uuid = (uuid_t *)payload_in;
+
+ /*
+		 * Restrict the 'Clear log' action to only apply to
+ * Vendor debug logs.
+ */
+ return uuid_equal(uuid, &DEFINE_CXL_VENDOR_DEBUG_UUID);
+ }
+ default:
+ break;
+ }
+ return true;
+}
+
+static int cxl_mbox_cmd_ctor(struct cxl_mbox_cmd *mbox_cmd,
+ struct cxl_mailbox *cxl_mbox, u16 opcode,
+ size_t in_size, size_t out_size, u64 in_payload)
+{
+ *mbox_cmd = (struct cxl_mbox_cmd) {
+ .opcode = opcode,
+ .size_in = in_size,
+ };
+
+ if (in_size) {
+ mbox_cmd->payload_in = vmemdup_user(u64_to_user_ptr(in_payload),
+ in_size);
+ if (IS_ERR(mbox_cmd->payload_in))
+ return PTR_ERR(mbox_cmd->payload_in);
+
+ if (!cxl_payload_from_user_allowed(opcode, mbox_cmd->payload_in)) {
+ dev_dbg(cxl_mbox->host, "%s: input payload not allowed\n",
+ cxl_mem_opcode_to_name(opcode));
+ kvfree(mbox_cmd->payload_in);
+ return -EBUSY;
+ }
+ }
+
+ /* Prepare to handle a full payload for variable sized output */
+ if (out_size == CXL_VARIABLE_PAYLOAD)
+ mbox_cmd->size_out = cxl_mbox->payload_size;
+ else
+ mbox_cmd->size_out = out_size;
+
+ if (mbox_cmd->size_out) {
+ mbox_cmd->payload_out = kvzalloc(mbox_cmd->size_out, GFP_KERNEL);
+ if (!mbox_cmd->payload_out) {
+ kvfree(mbox_cmd->payload_in);
+ return -ENOMEM;
+ }
+ }
+ return 0;
+}
+
+static void cxl_mbox_cmd_dtor(struct cxl_mbox_cmd *mbox)
+{
+ kvfree(mbox->payload_in);
+ kvfree(mbox->payload_out);
+}
+
+static int cxl_to_mem_cmd_raw(struct cxl_mem_command *mem_cmd,
+ const struct cxl_send_command *send_cmd,
+ struct cxl_mailbox *cxl_mbox)
+{
+ if (send_cmd->raw.rsvd)
+ return -EINVAL;
+
+ /*
+ * Unlike supported commands, the output size of RAW commands
+ * gets passed along without further checking, so it must be
+ * validated here.
+ */
+ if (send_cmd->out.size > cxl_mbox->payload_size)
+ return -EINVAL;
+
+ if (!cxl_mem_raw_command_allowed(send_cmd->raw.opcode))
+ return -EPERM;
+
+ dev_WARN_ONCE(cxl_mbox->host, true, "raw command path used\n");
+
+ *mem_cmd = (struct cxl_mem_command) {
+ .info = {
+ .id = CXL_MEM_COMMAND_ID_RAW,
+ .size_in = send_cmd->in.size,
+ .size_out = send_cmd->out.size,
+ },
+ .opcode = send_cmd->raw.opcode
+ };
+
+ return 0;
+}
+
+static int cxl_to_mem_cmd(struct cxl_mem_command *mem_cmd,
+ const struct cxl_send_command *send_cmd,
+ struct cxl_mailbox *cxl_mbox)
+{
+ struct cxl_mem_command *c = &cxl_mem_commands[send_cmd->id];
+ const struct cxl_command_info *info = &c->info;
+
+ if (send_cmd->flags & ~CXL_MEM_COMMAND_FLAG_MASK)
+ return -EINVAL;
+
+ if (send_cmd->rsvd)
+ return -EINVAL;
+
+ if (send_cmd->in.rsvd || send_cmd->out.rsvd)
+ return -EINVAL;
+
+ /* Check that the command is enabled for hardware */
+ if (!test_bit(info->id, cxl_mbox->enabled_cmds))
+ return -ENOTTY;
+
+ /* Check that the command is not claimed for exclusive kernel use */
+ if (test_bit(info->id, cxl_mbox->exclusive_cmds))
+ return -EBUSY;
+
+ /* Check the input buffer is the expected size */
+ if ((info->size_in != CXL_VARIABLE_PAYLOAD) &&
+ (info->size_in != send_cmd->in.size))
+ return -ENOMEM;
+
+ /* Check the output buffer is at least large enough */
+ if ((info->size_out != CXL_VARIABLE_PAYLOAD) &&
+ (send_cmd->out.size < info->size_out))
+ return -ENOMEM;
+
+ *mem_cmd = (struct cxl_mem_command) {
+ .info = {
+ .id = info->id,
+ .flags = info->flags,
+ .size_in = send_cmd->in.size,
+ .size_out = send_cmd->out.size,
+ },
+ .opcode = c->opcode
+ };
+
+ return 0;
+}
+
+/**
+ * cxl_validate_cmd_from_user() - Check fields for CXL_MEM_SEND_COMMAND.
+ * @mbox_cmd: Sanitized and populated &struct cxl_mbox_cmd.
+ * @cxl_mbox: CXL mailbox context
+ * @send_cmd: &struct cxl_send_command copied in from userspace.
+ *
+ * Return:
+ * * %0 - @mbox_cmd is ready to send.
+ * * %-ENOTTY - Invalid command specified.
+ * * %-EINVAL - Reserved fields or invalid values were used.
+ * * %-ENOMEM - Input or output buffer wasn't sized properly.
+ * * %-EPERM - Attempted to use a protected command.
+ * * %-EBUSY - Kernel has claimed exclusive access to this opcode
+ *
+ * The result of this command is a fully validated command in @mbox_cmd that is
+ * safe to send to the hardware.
+ */
+static int cxl_validate_cmd_from_user(struct cxl_mbox_cmd *mbox_cmd,
+ struct cxl_mailbox *cxl_mbox,
+ const struct cxl_send_command *send_cmd)
+{
+ struct cxl_mem_command mem_cmd;
+ int rc;
+
+ if (send_cmd->id == 0 || send_cmd->id >= CXL_MEM_COMMAND_ID_MAX)
+ return -ENOTTY;
+
+ /*
+ * The user can never specify an input payload larger than what hardware
+ * supports, but output can be arbitrarily large (simply write out as
+ * much data as the hardware provides).
+ */
+ if (send_cmd->in.size > cxl_mbox->payload_size)
+ return -EINVAL;
+
+ /* Sanitize and construct a cxl_mem_command */
+ if (send_cmd->id == CXL_MEM_COMMAND_ID_RAW)
+ rc = cxl_to_mem_cmd_raw(&mem_cmd, send_cmd, cxl_mbox);
+ else
+ rc = cxl_to_mem_cmd(&mem_cmd, send_cmd, cxl_mbox);
+
+ if (rc)
+ return rc;
+
+ /* Sanitize and construct a cxl_mbox_cmd */
+ return cxl_mbox_cmd_ctor(mbox_cmd, cxl_mbox, mem_cmd.opcode,
+ mem_cmd.info.size_in, mem_cmd.info.size_out,
+ send_cmd->in.payload);
+}
+
+int cxl_query_cmd(struct cxl_mailbox *cxl_mbox,
+ struct cxl_mem_query_commands __user *q)
+{
+ struct device *dev = cxl_mbox->host;
+ struct cxl_mem_command *cmd;
+ u32 n_commands;
+ int j = 0;
+
+ dev_dbg(dev, "Query IOCTL\n");
+
+ if (get_user(n_commands, &q->n_commands))
+ return -EFAULT;
+
+ /* Return the total number of commands if 0 elements are requested. */
+ if (n_commands == 0)
+ return put_user(ARRAY_SIZE(cxl_mem_commands), &q->n_commands);
+
+ /*
+ * otherwise, return min(n_commands, total commands) cxl_command_info
+ * structures.
+ */
+ cxl_for_each_cmd(cmd) {
+ struct cxl_command_info info = cmd->info;
+
+ if (test_bit(info.id, cxl_mbox->enabled_cmds))
+ info.flags |= CXL_MEM_COMMAND_FLAG_ENABLED;
+ if (test_bit(info.id, cxl_mbox->exclusive_cmds))
+ info.flags |= CXL_MEM_COMMAND_FLAG_EXCLUSIVE;
+
+ if (copy_to_user(&q->commands[j++], &info, sizeof(info)))
+ return -EFAULT;
+
+ if (j == n_commands)
+ break;
+ }
+
+ return 0;
+}
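+
+/*
+ * Userspace-side sketch of the query protocol implemented above (assumes the
+ * uapi definitions from <linux/cxl_mem.h>; the /dev/cxl/mem0 node name is an
+ * example instance of the "cxl/%s" devnode created in memdev.c):
+ *
+ *	struct cxl_mem_query_commands *q;
+ *	__u32 n, i;
+ *	int fd = open("/dev/cxl/mem0", O_RDWR);
+ *
+ *	// Pass 1: n_commands == 0 asks only for the total command count
+ *	q = calloc(1, sizeof(*q));
+ *	ioctl(fd, CXL_MEM_QUERY_COMMANDS, q);
+ *	n = q->n_commands;
+ *	free(q);
+ *
+ *	// Pass 2: retrieve up to n cxl_command_info entries
+ *	q = calloc(1, sizeof(*q) + n * sizeof(q->commands[0]));
+ *	q->n_commands = n;
+ *	ioctl(fd, CXL_MEM_QUERY_COMMANDS, q);
+ *	for (i = 0; i < n; i++)
+ *		printf("id %u flags %#x\n", q->commands[i].id,
+ *		       q->commands[i].flags);
+ */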
+
+/**
+ * handle_mailbox_cmd_from_user() - Dispatch a mailbox command for userspace.
+ * @cxl_mbox: The mailbox context for the operation.
+ * @mbox_cmd: The validated mailbox command.
+ * @out_payload: Pointer to userspace's output payload.
+ * @size_out: (Input) Max payload size to copy out.
+ * (Output) Payload size hardware generated.
+ * @retval: Hardware generated return code from the operation.
+ *
+ * Return:
+ * * %0 - Mailbox transaction succeeded. This implies the mailbox
+ * protocol completed successfully, not that the operation itself
+ * was successful.
+ * * %-ENOMEM - Couldn't allocate a bounce buffer.
+ * * %-EFAULT - Something happened with copy_to/from_user.
+ * * %-EINTR - Mailbox acquisition interrupted.
+ * * %-EXXX - Transaction level failures.
+ *
+ * Dispatches a mailbox command on behalf of a userspace request.
+ * The output payload is copied to userspace.
+ *
+ * See cxl_send_cmd().
+ */
+static int handle_mailbox_cmd_from_user(struct cxl_mailbox *cxl_mbox,
+ struct cxl_mbox_cmd *mbox_cmd,
+ u64 out_payload, s32 *size_out,
+ u32 *retval)
+{
+ struct device *dev = cxl_mbox->host;
+ int rc;
+
+ dev_dbg(dev,
+ "Submitting %s command for user\n"
+ "\topcode: %x\n"
+ "\tsize: %zx\n",
+ cxl_mem_opcode_to_name(mbox_cmd->opcode),
+ mbox_cmd->opcode, mbox_cmd->size_in);
+
+ rc = cxl_mbox->mbox_send(cxl_mbox, mbox_cmd);
+ if (rc)
+ goto out;
+
+ /*
+ * @size_out contains the maximum size that is allowed to be written back out
+ * to userspace. While the command may have generated more output than
+ * this, the excess has to be ignored.
+ */
+ if (mbox_cmd->size_out) {
+ dev_WARN_ONCE(dev, mbox_cmd->size_out > *size_out,
+ "Invalid return size\n");
+ if (copy_to_user(u64_to_user_ptr(out_payload),
+ mbox_cmd->payload_out, mbox_cmd->size_out)) {
+ rc = -EFAULT;
+ goto out;
+ }
+ }
+
+ *size_out = mbox_cmd->size_out;
+ *retval = mbox_cmd->return_code;
+
+out:
+ cxl_mbox_cmd_dtor(mbox_cmd);
+ return rc;
+}
+
+int cxl_send_cmd(struct cxl_mailbox *cxl_mbox, struct cxl_send_command __user *s)
+{
+ struct device *dev = cxl_mbox->host;
+ struct cxl_send_command send;
+ struct cxl_mbox_cmd mbox_cmd;
+ int rc;
+
+ dev_dbg(dev, "Send IOCTL\n");
+
+ if (copy_from_user(&send, s, sizeof(send)))
+ return -EFAULT;
+
+ rc = cxl_validate_cmd_from_user(&mbox_cmd, cxl_mbox, &send);
+ if (rc)
+ return rc;
+
+ rc = handle_mailbox_cmd_from_user(cxl_mbox, &mbox_cmd, send.out.payload,
+ &send.out.size, &send.retval);
+ if (rc)
+ return rc;
+
+ if (copy_to_user(s, &send, sizeof(send)))
+ return -EFAULT;
+
+ return 0;
+}
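+
+/*
+ * Companion userspace sketch for CXL_MEM_SEND_COMMAND (again assuming the
+ * <linux/cxl_mem.h> uapi definitions; the Identify command id and the
+ * generously sized output buffer are illustrative choices):
+ *
+ *	char id_buf[4096];
+ *	struct cxl_send_command send = {
+ *		.id = CXL_MEM_COMMAND_ID_IDENTIFY,
+ *		.out.size = sizeof(id_buf),
+ *		.out.payload = (__u64)(uintptr_t)id_buf,
+ *	};
+ *
+ *	if (ioctl(fd, CXL_MEM_SEND_COMMAND, &send) == 0 && send.retval == 0)
+ *		// send.out.size bytes of id_buf now hold the Identify output
+ *		...;
+ */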
+
+static int cxl_xfer_log(struct cxl_memdev_state *mds, uuid_t *uuid,
+ u32 *size, u8 *out)
+{
+ struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
+ u32 remaining = *size;
+ u32 offset = 0;
+
+ while (remaining) {
+ u32 xfer_size = min_t(u32, remaining, cxl_mbox->payload_size);
+ struct cxl_mbox_cmd mbox_cmd;
+ struct cxl_mbox_get_log log;
+ int rc;
+
+ log = (struct cxl_mbox_get_log) {
+ .uuid = *uuid,
+ .offset = cpu_to_le32(offset),
+ .length = cpu_to_le32(xfer_size),
+ };
+
+ mbox_cmd = (struct cxl_mbox_cmd) {
+ .opcode = CXL_MBOX_OP_GET_LOG,
+ .size_in = sizeof(log),
+ .payload_in = &log,
+ .size_out = xfer_size,
+ .payload_out = out,
+ };
+
+ rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
+
+ /*
+ * The output payload length that indicates the number
+ * of valid bytes can be smaller than the Log buffer
+ * size.
+ */
+ if (rc == -EIO && mbox_cmd.size_out < xfer_size) {
+ offset += mbox_cmd.size_out;
+ break;
+ }
+
+ if (rc < 0)
+ return rc;
+
+ out += xfer_size;
+ remaining -= xfer_size;
+ offset += xfer_size;
+ }
+
+ *size = offset;
+
+ return 0;
+}
+
+static int check_features_opcodes(u16 opcode, int *ro_cmds, int *wr_cmds)
+{
+ switch (opcode) {
+ case CXL_MBOX_OP_GET_SUPPORTED_FEATURES:
+ case CXL_MBOX_OP_GET_FEATURE:
+ (*ro_cmds)++;
+ return 1;
+ case CXL_MBOX_OP_SET_FEATURE:
+ (*wr_cmds)++;
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+/* 'Get Supported Features' and 'Get Feature' */
+#define MAX_FEATURES_READ_CMDS 2
+static void set_features_cap(struct cxl_mailbox *cxl_mbox,
+ int ro_cmds, int wr_cmds)
+{
+ /* Setting up Features capability while walking the CEL */
+ if (ro_cmds == MAX_FEATURES_READ_CMDS) {
+ if (wr_cmds)
+ cxl_mbox->feat_cap = CXL_FEATURES_RW;
+ else
+ cxl_mbox->feat_cap = CXL_FEATURES_RO;
+ }
+}
+
+/**
+ * cxl_walk_cel() - Walk through the Command Effects Log.
+ * @mds: The driver data for the operation
+ * @size: Length of the Command Effects Log.
+ * @cel: Buffer holding the Command Effects Log
+ *
+ * Iterate over each entry in the CEL and determine if the driver supports the
+ * command. If so, the command is enabled for the device and can be used later.
+ */
+static void cxl_walk_cel(struct cxl_memdev_state *mds, size_t size, u8 *cel)
+{
+ struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
+ struct cxl_cel_entry *cel_entry;
+ const int cel_entries = size / sizeof(*cel_entry);
+ struct device *dev = mds->cxlds.dev;
+ int i, ro_cmds = 0, wr_cmds = 0;
+
+ cel_entry = (struct cxl_cel_entry *) cel;
+
+ for (i = 0; i < cel_entries; i++) {
+ u16 opcode = le16_to_cpu(cel_entry[i].opcode);
+ struct cxl_mem_command *cmd = cxl_mem_find_command(opcode);
+ int enabled = 0;
+
+ if (cmd) {
+ set_bit(cmd->info.id, cxl_mbox->enabled_cmds);
+ enabled++;
+ }
+
+ enabled += check_features_opcodes(opcode, &ro_cmds,
+ &wr_cmds);
+
+ if (cxl_is_poison_command(opcode)) {
+ cxl_set_poison_cmd_enabled(&mds->poison, opcode);
+ enabled++;
+ }
+
+ if (cxl_is_security_command(opcode)) {
+ cxl_set_security_cmd_enabled(&mds->security, opcode);
+ enabled++;
+ }
+
+ dev_dbg(dev, "Opcode 0x%04x %s\n", opcode,
+ enabled ? "enabled" : "unsupported by driver");
+ }
+
+ set_features_cap(cxl_mbox, ro_cmds, wr_cmds);
+}
+
+static struct cxl_mbox_get_supported_logs *cxl_get_gsl(struct cxl_memdev_state *mds)
+{
+ struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
+ struct cxl_mbox_get_supported_logs *ret;
+ struct cxl_mbox_cmd mbox_cmd;
+ int rc;
+
+ ret = kvmalloc(cxl_mbox->payload_size, GFP_KERNEL);
+ if (!ret)
+ return ERR_PTR(-ENOMEM);
+
+ mbox_cmd = (struct cxl_mbox_cmd) {
+ .opcode = CXL_MBOX_OP_GET_SUPPORTED_LOGS,
+ .size_out = cxl_mbox->payload_size,
+ .payload_out = ret,
+ /* At least the record number field must be valid */
+ .min_out = 2,
+ };
+ rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
+ if (rc < 0) {
+ kvfree(ret);
+ return ERR_PTR(rc);
+ }
+
+ return ret;
+}
+
+enum {
+ CEL_UUID,
+ VENDOR_DEBUG_UUID,
+};
+
+/* See CXL 2.0 Table 170. Get Log Input Payload */
+static const uuid_t log_uuid[] = {
+ [CEL_UUID] = DEFINE_CXL_CEL_UUID,
+ [VENDOR_DEBUG_UUID] = DEFINE_CXL_VENDOR_DEBUG_UUID,
+};
+
+/**
+ * cxl_enumerate_cmds() - Enumerate commands for a device.
+ * @mds: The driver data for the operation
+ *
+ * Returns 0 if enumerate completed successfully.
+ *
+ * CXL devices have optional support for certain commands. This function will
+ * determine the set of supported commands for the hardware and update the
+ * enabled_cmds bitmap in the @mds.
+ */
+int cxl_enumerate_cmds(struct cxl_memdev_state *mds)
+{
+ struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
+ struct cxl_mbox_get_supported_logs *gsl;
+ struct device *dev = mds->cxlds.dev;
+ struct cxl_mem_command *cmd;
+ int i, rc;
+
+ gsl = cxl_get_gsl(mds);
+ if (IS_ERR(gsl))
+ return PTR_ERR(gsl);
+
+ rc = -ENOENT;
+ for (i = 0; i < le16_to_cpu(gsl->entries); i++) {
+ u32 size = le32_to_cpu(gsl->entry[i].size);
+ uuid_t uuid = gsl->entry[i].uuid;
+ u8 *log;
+
+ dev_dbg(dev, "Found LOG type %pU of size %d", &uuid, size);
+
+ if (!uuid_equal(&uuid, &log_uuid[CEL_UUID]))
+ continue;
+
+ log = kvmalloc(size, GFP_KERNEL);
+ if (!log) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ rc = cxl_xfer_log(mds, &uuid, &size, log);
+ if (rc) {
+ kvfree(log);
+ goto out;
+ }
+
+ cxl_walk_cel(mds, size, log);
+ kvfree(log);
+
+ /* In case CEL was bogus, enable some default commands. */
+ cxl_for_each_cmd(cmd)
+ if (cmd->flags & CXL_CMD_FLAG_FORCE_ENABLE)
+ set_bit(cmd->info.id, cxl_mbox->enabled_cmds);
+
+ /* Found the required CEL */
+ rc = 0;
+ }
+out:
+ kvfree(gsl);
+ return rc;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_enumerate_cmds, "CXL");
+
+void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
+ enum cxl_event_log_type type,
+ enum cxl_event_type event_type,
+ const uuid_t *uuid, union cxl_event *evt)
+{
+ if (event_type == CXL_CPER_EVENT_MEM_MODULE) {
+ trace_cxl_memory_module(cxlmd, type, &evt->mem_module);
+ return;
+ }
+ if (event_type == CXL_CPER_EVENT_GENERIC) {
+ trace_cxl_generic_event(cxlmd, type, uuid, &evt->generic);
+ return;
+ }
+ if (event_type == CXL_CPER_EVENT_MEM_SPARING) {
+ trace_cxl_memory_sparing(cxlmd, type, &evt->mem_sparing);
+ return;
+ }
+
+ if (trace_cxl_general_media_enabled() || trace_cxl_dram_enabled()) {
+ u64 dpa, hpa = ULLONG_MAX, hpa_alias = ULLONG_MAX;
+ struct cxl_region *cxlr;
+
+ /*
+ * These trace points are annotated with HPA and region
+ * translations. Take topology mutation locks and lookup
+ * { HPA, REGION } from { DPA, MEMDEV } in the event record.
+ */
+ guard(rwsem_read)(&cxl_rwsem.region);
+ guard(rwsem_read)(&cxl_rwsem.dpa);
+
+ dpa = le64_to_cpu(evt->media_hdr.phys_addr) & CXL_DPA_MASK;
+ cxlr = cxl_dpa_to_region(cxlmd, dpa);
+ if (cxlr) {
+ u64 cache_size = cxlr->params.cache_size;
+
+ hpa = cxl_dpa_to_hpa(cxlr, cxlmd, dpa);
+ if (cache_size)
+ hpa_alias = hpa - cache_size;
+ }
+
+ if (event_type == CXL_CPER_EVENT_GEN_MEDIA) {
+ if (cxl_store_rec_gen_media((struct cxl_memdev *)cxlmd, evt))
+ dev_dbg(&cxlmd->dev, "CXL store rec_gen_media failed\n");
+
+ if (evt->gen_media.media_hdr.descriptor &
+ CXL_GMER_EVT_DESC_THRESHOLD_EVENT)
+ WARN_ON_ONCE((evt->gen_media.media_hdr.type &
+ CXL_GMER_MEM_EVT_TYPE_AP_CME_COUNTER_EXPIRE) &&
+ !get_unaligned_le24(evt->gen_media.cme_count));
+ else
+ WARN_ON_ONCE(evt->gen_media.media_hdr.type &
+ CXL_GMER_MEM_EVT_TYPE_AP_CME_COUNTER_EXPIRE);
+
+ trace_cxl_general_media(cxlmd, type, cxlr, hpa,
+ hpa_alias, &evt->gen_media);
+ } else if (event_type == CXL_CPER_EVENT_DRAM) {
+ if (cxl_store_rec_dram((struct cxl_memdev *)cxlmd, evt))
+ dev_dbg(&cxlmd->dev, "CXL store rec_dram failed\n");
+
+ if (evt->dram.media_hdr.descriptor &
+ CXL_GMER_EVT_DESC_THRESHOLD_EVENT)
+ WARN_ON_ONCE((evt->dram.media_hdr.type &
+ CXL_DER_MEM_EVT_TYPE_AP_CME_COUNTER_EXPIRE) &&
+ !get_unaligned_le24(evt->dram.cvme_count));
+ else
+ WARN_ON_ONCE(evt->dram.media_hdr.type &
+ CXL_DER_MEM_EVT_TYPE_AP_CME_COUNTER_EXPIRE);
+
+ trace_cxl_dram(cxlmd, type, cxlr, hpa, hpa_alias,
+ &evt->dram);
+ }
+ }
+}
+EXPORT_SYMBOL_NS_GPL(cxl_event_trace_record, "CXL");
+
+static void __cxl_event_trace_record(const struct cxl_memdev *cxlmd,
+ enum cxl_event_log_type type,
+ struct cxl_event_record_raw *record)
+{
+ enum cxl_event_type ev_type = CXL_CPER_EVENT_GENERIC;
+ const uuid_t *uuid = &record->id;
+
+ if (uuid_equal(uuid, &CXL_EVENT_GEN_MEDIA_UUID))
+ ev_type = CXL_CPER_EVENT_GEN_MEDIA;
+ else if (uuid_equal(uuid, &CXL_EVENT_DRAM_UUID))
+ ev_type = CXL_CPER_EVENT_DRAM;
+ else if (uuid_equal(uuid, &CXL_EVENT_MEM_MODULE_UUID))
+ ev_type = CXL_CPER_EVENT_MEM_MODULE;
+ else if (uuid_equal(uuid, &CXL_EVENT_MEM_SPARING_UUID))
+ ev_type = CXL_CPER_EVENT_MEM_SPARING;
+
+ cxl_event_trace_record(cxlmd, type, ev_type, uuid, &record->event);
+}
+
+static int cxl_clear_event_record(struct cxl_memdev_state *mds,
+ enum cxl_event_log_type log,
+ struct cxl_get_event_payload *get_pl)
+{
+ struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
+ struct cxl_mbox_clear_event_payload *payload;
+ u16 total = le16_to_cpu(get_pl->record_count);
+ u8 max_handles = CXL_CLEAR_EVENT_MAX_HANDLES;
+ size_t pl_size = struct_size(payload, handles, max_handles);
+ struct cxl_mbox_cmd mbox_cmd;
+ u16 cnt;
+ int rc = 0;
+ int i;
+
+ /* Payload size may limit the max handles */
+ if (pl_size > cxl_mbox->payload_size) {
+ max_handles = (cxl_mbox->payload_size - sizeof(*payload)) /
+ sizeof(__le16);
+ pl_size = struct_size(payload, handles, max_handles);
+ }
+
+ payload = kvzalloc(pl_size, GFP_KERNEL);
+ if (!payload)
+ return -ENOMEM;
+
+ *payload = (struct cxl_mbox_clear_event_payload) {
+ .event_log = log,
+ };
+
+ mbox_cmd = (struct cxl_mbox_cmd) {
+ .opcode = CXL_MBOX_OP_CLEAR_EVENT_RECORD,
+ .payload_in = payload,
+ .size_in = pl_size,
+ };
+
+ /*
+ * Clear Event Records uses a u8 for the handle count while Get Event
+ * Records can return up to 0xffff records.
+ */
+ i = 0;
+ for (cnt = 0; cnt < total; cnt++) {
+ struct cxl_event_record_raw *raw = &get_pl->records[cnt];
+ struct cxl_event_generic *gen = &raw->event.generic;
+
+ payload->handles[i++] = gen->hdr.handle;
+ dev_dbg(mds->cxlds.dev, "Event log '%d': Clearing %u\n", log,
+ le16_to_cpu(payload->handles[i - 1]));
+
+ if (i == max_handles) {
+ payload->nr_recs = i;
+ rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
+ if (rc)
+ goto free_pl;
+ i = 0;
+ }
+ }
+
+ /* Clear what is left if any */
+ if (i) {
+ payload->nr_recs = i;
+ mbox_cmd.size_in = struct_size(payload, handles, i);
+ rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
+ if (rc)
+ goto free_pl;
+ }
+
+free_pl:
+ kvfree(payload);
+ return rc;
+}
+
+static void cxl_mem_get_records_log(struct cxl_memdev_state *mds,
+ enum cxl_event_log_type type)
+{
+ struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
+ struct cxl_memdev *cxlmd = mds->cxlds.cxlmd;
+ struct device *dev = mds->cxlds.dev;
+ struct cxl_get_event_payload *payload;
+ u8 log_type = type;
+ u16 nr_rec;
+
+ mutex_lock(&mds->event.log_lock);
+ payload = mds->event.buf;
+
+ do {
+ int rc, i;
+ struct cxl_mbox_cmd mbox_cmd = (struct cxl_mbox_cmd) {
+ .opcode = CXL_MBOX_OP_GET_EVENT_RECORD,
+ .payload_in = &log_type,
+ .size_in = sizeof(log_type),
+ .payload_out = payload,
+ .size_out = cxl_mbox->payload_size,
+ .min_out = struct_size(payload, records, 0),
+ };
+
+ rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
+ if (rc) {
+ dev_err_ratelimited(dev,
+ "Event log '%d': Failed to query event records : %d",
+ type, rc);
+ break;
+ }
+
+ nr_rec = le16_to_cpu(payload->record_count);
+ if (!nr_rec)
+ break;
+
+ for (i = 0; i < nr_rec; i++)
+ __cxl_event_trace_record(cxlmd, type,
+ &payload->records[i]);
+
+ if (payload->flags & CXL_GET_EVENT_FLAG_OVERFLOW)
+ trace_cxl_overflow(cxlmd, type, payload);
+
+ rc = cxl_clear_event_record(mds, type, payload);
+ if (rc) {
+ dev_err_ratelimited(dev,
+ "Event log '%d': Failed to clear events : %d",
+ type, rc);
+ break;
+ }
+ } while (nr_rec);
+
+ mutex_unlock(&mds->event.log_lock);
+}
+
+/**
+ * cxl_mem_get_event_records - Get Event Records from the device
+ * @mds: The driver data for the operation
+ * @status: Event Status register value identifying which events are available.
+ *
+ * Retrieve all event records available on the device, report them as trace
+ * events, and clear them.
+ *
+ * See CXL rev 3.0 @8.2.9.2.2 Get Event Records
+ * See CXL rev 3.0 @8.2.9.2.3 Clear Event Records
+ */
+void cxl_mem_get_event_records(struct cxl_memdev_state *mds, u32 status)
+{
+ dev_dbg(mds->cxlds.dev, "Reading event logs: %x\n", status);
+
+ if (status & CXLDEV_EVENT_STATUS_FATAL)
+ cxl_mem_get_records_log(mds, CXL_EVENT_TYPE_FATAL);
+ if (status & CXLDEV_EVENT_STATUS_FAIL)
+ cxl_mem_get_records_log(mds, CXL_EVENT_TYPE_FAIL);
+ if (status & CXLDEV_EVENT_STATUS_WARN)
+ cxl_mem_get_records_log(mds, CXL_EVENT_TYPE_WARN);
+ if (status & CXLDEV_EVENT_STATUS_INFO)
+ cxl_mem_get_records_log(mds, CXL_EVENT_TYPE_INFO);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_mem_get_event_records, "CXL");
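+
+/*
+ * Illustrative caller: the event interrupt handler in the PCI driver is
+ * expected to read the device's event status register and pass the raw
+ * status bits straight through, e.g. (register read elided):
+ *
+ *	u32 status = ...;	// CXLDEV_EVENT_STATUS_* bits
+ *
+ *	cxl_mem_get_event_records(mds, status);
+ */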
+
+/**
+ * cxl_mem_get_partition_info - Get partition info
+ * @mds: The driver data for the operation
+ *
+ * Retrieve the current partition info for the device specified. The active
+ * values are the current capacity in bytes. If not 0, the 'next' values are
+ * the pending values, in bytes, which take effect on the next cold reset.
+ *
+ * Return: 0 if no error; otherwise the result of the mailbox command.
+ *
+ * See CXL @8.2.9.5.2.1 Get Partition Info
+ */
+static int cxl_mem_get_partition_info(struct cxl_memdev_state *mds)
+{
+ struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
+ struct cxl_mbox_get_partition_info pi;
+ struct cxl_mbox_cmd mbox_cmd;
+ int rc;
+
+ mbox_cmd = (struct cxl_mbox_cmd) {
+ .opcode = CXL_MBOX_OP_GET_PARTITION_INFO,
+ .size_out = sizeof(pi),
+ .payload_out = &pi,
+ };
+ rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
+ if (rc)
+ return rc;
+
+ mds->active_volatile_bytes =
+ le64_to_cpu(pi.active_volatile_cap) * CXL_CAPACITY_MULTIPLIER;
+ mds->active_persistent_bytes =
+ le64_to_cpu(pi.active_persistent_cap) * CXL_CAPACITY_MULTIPLIER;
+
+ return 0;
+}
+
+/**
+ * cxl_dev_state_identify() - Send the IDENTIFY command to the device.
+ * @mds: The driver data for the operation
+ *
+ * Return: 0 if identify was executed successfully or if the media is not ready.
+ *
+ * This will dispatch the identify command to the device and on success populate
+ * structures to be exported to sysfs.
+ */
+int cxl_dev_state_identify(struct cxl_memdev_state *mds)
+{
+ struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
+ /* See CXL 2.0 Table 175 Identify Memory Device Output Payload */
+ struct cxl_mbox_identify id;
+ struct cxl_mbox_cmd mbox_cmd;
+ u32 val;
+ int rc;
+
+ if (!mds->cxlds.media_ready)
+ return 0;
+
+ mbox_cmd = (struct cxl_mbox_cmd) {
+ .opcode = CXL_MBOX_OP_IDENTIFY,
+ .size_out = sizeof(id),
+ .payload_out = &id,
+ };
+ rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
+ if (rc < 0)
+ return rc;
+
+ mds->total_bytes =
+ le64_to_cpu(id.total_capacity) * CXL_CAPACITY_MULTIPLIER;
+ mds->volatile_only_bytes =
+ le64_to_cpu(id.volatile_capacity) * CXL_CAPACITY_MULTIPLIER;
+ mds->persistent_only_bytes =
+ le64_to_cpu(id.persistent_capacity) * CXL_CAPACITY_MULTIPLIER;
+ mds->partition_align_bytes =
+ le64_to_cpu(id.partition_align) * CXL_CAPACITY_MULTIPLIER;
+
+ mds->lsa_size = le32_to_cpu(id.lsa_size);
+ memcpy(mds->firmware_version, id.fw_revision,
+ sizeof(id.fw_revision));
+
+ if (test_bit(CXL_POISON_ENABLED_LIST, mds->poison.enabled_cmds)) {
+ val = get_unaligned_le24(id.poison_list_max_mer);
+ mds->poison.max_errors = min_t(u32, val, CXL_POISON_LIST_MAX);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_dev_state_identify, "CXL");
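+
+/*
+ * Worked example of the capacity conversion above, assuming the spec-defined
+ * 256MB multiplier behind CXL_CAPACITY_MULTIPLIER: an Identify payload that
+ * reports total_capacity == 0x40 yields
+ *
+ *	total_bytes = 0x40 * 256MiB = 16GiB
+ */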
+
+static int __cxl_mem_sanitize(struct cxl_memdev_state *mds, u16 cmd)
+{
+ struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
+ int rc;
+ u32 sec_out = 0;
+ struct cxl_get_security_output {
+ __le32 flags;
+ } out;
+ struct cxl_mbox_cmd sec_cmd = {
+ .opcode = CXL_MBOX_OP_GET_SECURITY_STATE,
+ .payload_out = &out,
+ .size_out = sizeof(out),
+ };
+ struct cxl_mbox_cmd mbox_cmd = { .opcode = cmd };
+
+ if (cmd != CXL_MBOX_OP_SANITIZE && cmd != CXL_MBOX_OP_SECURE_ERASE)
+ return -EINVAL;
+
+ rc = cxl_internal_send_cmd(cxl_mbox, &sec_cmd);
+ if (rc < 0) {
+ dev_err(cxl_mbox->host, "Failed to get security state : %d", rc);
+ return rc;
+ }
+
+ /*
+ * Prior to using these commands, any security applied to
+ * the user data areas of the device shall be DISABLED (or
+ * UNLOCKED in the secure erase case).
+ */
+ sec_out = le32_to_cpu(out.flags);
+ if (sec_out & CXL_PMEM_SEC_STATE_USER_PASS_SET)
+ return -EINVAL;
+
+ if (cmd == CXL_MBOX_OP_SECURE_ERASE &&
+ sec_out & CXL_PMEM_SEC_STATE_LOCKED)
+ return -EINVAL;
+
+ rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
+ if (rc < 0) {
+ dev_err(cxl_mbox->host, "Failed to sanitize device : %d", rc);
+ return rc;
+ }
+
+ return 0;
+}
+
+/**
+ * cxl_mem_sanitize() - Send a sanitization command to the device.
+ * @cxlmd: The device for the operation
+ * @cmd: The specific sanitization command opcode
+ *
+ * Return: 0 if the command was executed successfully, regardless of
+ * whether or not the actual security operation is done in the background,
+ * such as for the Sanitize case.
+ * Error return values can be the result of the mailbox command, -EINVAL
+ * when security requirements are not met or the context is invalid, or -EBUSY
+ * if the sanitize operation is already in flight.
+ *
+ * See CXL 3.0 @8.2.9.8.5.1 Sanitize and @8.2.9.8.5.2 Secure Erase.
+ */
+int cxl_mem_sanitize(struct cxl_memdev *cxlmd, u16 cmd)
+{
+ struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds);
+ struct cxl_port *endpoint;
+
+ /* synchronize with cxl_mem_probe() and decoder write operations */
+ guard(device)(&cxlmd->dev);
+ endpoint = cxlmd->endpoint;
+ guard(rwsem_read)(&cxl_rwsem.region);
+ /*
+ * Require an endpoint to be present; otherwise the driver can not
+ * be sure that the device is unmapped.
+ */
+ if (endpoint && cxl_num_decoders_committed(endpoint) == 0)
+ return __cxl_mem_sanitize(mds, cmd);
+
+ return -EBUSY;
+}
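+
+/*
+ * Usage sketch: userspace reaches this path through the memdev "security"
+ * sysfs group defined in memdev.c, e.g. (mem0 is an example instance):
+ *
+ *	# echo 1 > /sys/bus/cxl/devices/mem0/security/sanitize
+ *
+ * which resolves to cxl_mem_sanitize(cxlmd, CXL_MBOX_OP_SANITIZE) via
+ * security_sanitize_store().
+ */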
+
+static void add_part(struct cxl_dpa_info *info, u64 start, u64 size, enum cxl_partition_mode mode)
+{
+ int i = info->nr_partitions;
+
+ if (size == 0)
+ return;
+
+ info->part[i].range = (struct range) {
+ .start = start,
+ .end = start + size - 1,
+ };
+ info->part[i].mode = mode;
+ info->nr_partitions++;
+}
+
+int cxl_mem_dpa_fetch(struct cxl_memdev_state *mds, struct cxl_dpa_info *info)
+{
+ struct cxl_dev_state *cxlds = &mds->cxlds;
+ struct device *dev = cxlds->dev;
+ int rc;
+
+ if (!cxlds->media_ready) {
+ info->size = 0;
+ return 0;
+ }
+
+ info->size = mds->total_bytes;
+
+ if (mds->partition_align_bytes == 0) {
+ add_part(info, 0, mds->volatile_only_bytes, CXL_PARTMODE_RAM);
+ add_part(info, mds->volatile_only_bytes,
+ mds->persistent_only_bytes, CXL_PARTMODE_PMEM);
+ return 0;
+ }
+
+ rc = cxl_mem_get_partition_info(mds);
+ if (rc) {
+ dev_err(dev, "Failed to query partition information\n");
+ return rc;
+ }
+
+ add_part(info, 0, mds->active_volatile_bytes, CXL_PARTMODE_RAM);
+ add_part(info, mds->active_volatile_bytes, mds->active_persistent_bytes,
+ CXL_PARTMODE_PMEM);
+
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_mem_dpa_fetch, "CXL");
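+
+/*
+ * Worked example of the static-partition branch above (partition_align_bytes
+ * == 0), with illustrative sizes and total_bytes reporting their sum:
+ *
+ *	volatile_only_bytes   = 4GiB -> part[0]: RAM,  range [0GiB, 4GiB - 1]
+ *	persistent_only_bytes = 4GiB -> part[1]: PMEM, range [4GiB, 8GiB - 1]
+ *	info->size = total_bytes = 8GiB
+ */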
+
+int cxl_get_dirty_count(struct cxl_memdev_state *mds, u32 *count)
+{
+ struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
+ struct cxl_mbox_get_health_info_out hi;
+ struct cxl_mbox_cmd mbox_cmd;
+ int rc;
+
+ mbox_cmd = (struct cxl_mbox_cmd) {
+ .opcode = CXL_MBOX_OP_GET_HEALTH_INFO,
+ .size_out = sizeof(hi),
+ .payload_out = &hi,
+ };
+
+ rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
+ if (!rc)
+ *count = le32_to_cpu(hi.dirty_shutdown_cnt);
+
+ return rc;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_get_dirty_count, "CXL");
+
+int cxl_arm_dirty_shutdown(struct cxl_memdev_state *mds)
+{
+ struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
+ struct cxl_mbox_cmd mbox_cmd;
+ struct cxl_mbox_set_shutdown_state_in in = {
+ .state = 1
+ };
+
+ mbox_cmd = (struct cxl_mbox_cmd) {
+ .opcode = CXL_MBOX_OP_SET_SHUTDOWN_STATE,
+ .size_in = sizeof(in),
+ .payload_in = &in,
+ };
+
+ return cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_arm_dirty_shutdown, "CXL");
+
+int cxl_set_timestamp(struct cxl_memdev_state *mds)
+{
+ struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
+ struct cxl_mbox_cmd mbox_cmd;
+ struct cxl_mbox_set_timestamp_in pi;
+ int rc;
+
+ pi.timestamp = cpu_to_le64(ktime_get_real_ns());
+ mbox_cmd = (struct cxl_mbox_cmd) {
+ .opcode = CXL_MBOX_OP_SET_TIMESTAMP,
+ .size_in = sizeof(pi),
+ .payload_in = &pi,
+ };
+
+ rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
+ /*
+ * Command is optional. Devices may have another way of providing
+ * a timestamp, or may return all 0s in timestamp fields.
+ * Don't report an error if this command isn't supported.
+ */
+ if (rc && (mbox_cmd.return_code != CXL_MBOX_CMD_RC_UNSUPPORTED))
+ return rc;
+
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_set_timestamp, "CXL");
+
+int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len,
+ struct cxl_region *cxlr)
+{
+ struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds);
+ struct cxl_mailbox *cxl_mbox = &cxlmd->cxlds->cxl_mbox;
+ struct cxl_mbox_poison_out *po;
+ struct cxl_mbox_poison_in pi;
+ int nr_records = 0;
+ int rc;
+
+ ACQUIRE(mutex_intr, lock)(&mds->poison.mutex);
+ if ((rc = ACQUIRE_ERR(mutex_intr, &lock)))
+ return rc;
+
+ po = mds->poison.list_out;
+ pi.offset = cpu_to_le64(offset);
+ pi.length = cpu_to_le64(len / CXL_POISON_LEN_MULT);
+
+ do {
+ struct cxl_mbox_cmd mbox_cmd = (struct cxl_mbox_cmd){
+ .opcode = CXL_MBOX_OP_GET_POISON,
+ .size_in = sizeof(pi),
+ .payload_in = &pi,
+ .size_out = cxl_mbox->payload_size,
+ .payload_out = po,
+ .min_out = struct_size(po, record, 0),
+ };
+
+ rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
+ if (rc)
+ break;
+
+ for (int i = 0; i < le16_to_cpu(po->count); i++)
+ trace_cxl_poison(cxlmd, cxlr, &po->record[i],
+ po->flags, po->overflow_ts,
+ CXL_POISON_TRACE_LIST);
+
+ /* Protect against an uncleared _FLAG_MORE */
+ nr_records = nr_records + le16_to_cpu(po->count);
+ if (nr_records >= mds->poison.max_errors) {
+ dev_dbg(&cxlmd->dev, "Max Error Records reached: %d\n",
+ nr_records);
+ break;
+ }
+ } while (po->flags & CXL_POISON_FLAG_MORE);
+
+ return rc;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_mem_get_poison, "CXL");
+
+static void free_poison_buf(void *buf)
+{
+ kvfree(buf);
+}
+
+/* Get Poison List output buffer is protected by mds->poison.mutex */
+static int cxl_poison_alloc_buf(struct cxl_memdev_state *mds)
+{
+ struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
+
+ mds->poison.list_out = kvmalloc(cxl_mbox->payload_size, GFP_KERNEL);
+ if (!mds->poison.list_out)
+ return -ENOMEM;
+
+ return devm_add_action_or_reset(mds->cxlds.dev, free_poison_buf,
+ mds->poison.list_out);
+}
+
+int cxl_poison_state_init(struct cxl_memdev_state *mds)
+{
+ int rc;
+
+ if (!test_bit(CXL_POISON_ENABLED_LIST, mds->poison.enabled_cmds))
+ return 0;
+
+ rc = cxl_poison_alloc_buf(mds);
+ if (rc) {
+ clear_bit(CXL_POISON_ENABLED_LIST, mds->poison.enabled_cmds);
+ return rc;
+ }
+
+ mutex_init(&mds->poison.mutex);
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_poison_state_init, "CXL");
+
+int cxl_mailbox_init(struct cxl_mailbox *cxl_mbox, struct device *host)
+{
+ if (!cxl_mbox || !host)
+ return -EINVAL;
+
+ cxl_mbox->host = host;
+ mutex_init(&cxl_mbox->mbox_mutex);
+ rcuwait_init(&cxl_mbox->mbox_wait);
+
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_mailbox_init, "CXL");
+
+struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev)
+{
+ struct cxl_memdev_state *mds;
+ int rc;
+
+ mds = devm_kzalloc(dev, sizeof(*mds), GFP_KERNEL);
+ if (!mds) {
+ dev_err(dev, "No memory available\n");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ mutex_init(&mds->event.log_lock);
+ mds->cxlds.dev = dev;
+ mds->cxlds.reg_map.host = dev;
+ mds->cxlds.cxl_mbox.host = dev;
+ mds->cxlds.reg_map.resource = CXL_RESOURCE_NONE;
+ mds->cxlds.type = CXL_DEVTYPE_CLASSMEM;
+
+ rc = devm_cxl_register_mce_notifier(dev, &mds->mce_notifier);
+ if (rc == -EOPNOTSUPP)
+ dev_warn(dev, "CXL MCE unsupported\n");
+ else if (rc)
+ return ERR_PTR(rc);
+
+ return mds;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_memdev_state_create, "CXL");
+
+void __init cxl_mbox_init(void)
+{
+ struct dentry *mbox_debugfs;
+
+ mbox_debugfs = cxl_debugfs_create_dir("mbox");
+ debugfs_create_bool("raw_allow_all", 0600, mbox_debugfs,
+ &cxl_raw_allow_all);
+}
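+
+/*
+ * The knob registered above lands under the CXL debugfs root, so the raw
+ * command filtering can be relaxed at runtime, e.g. (path assumes debugfs is
+ * mounted at its conventional location):
+ *
+ *	# echo Y > /sys/kernel/debug/cxl/mbox/raw_allow_all
+ */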
diff --git a/drivers/cxl/core/mce.c b/drivers/cxl/core/mce.c
new file mode 100644
index 000000000000..ff8d078c6ca1
--- /dev/null
+++ b/drivers/cxl/core/mce.c
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2024 Intel Corporation. All rights reserved. */
+#include <linux/mm.h>
+#include <linux/notifier.h>
+#include <linux/set_memory.h>
+#include <asm/mce.h>
+#include <cxlmem.h>
+#include "mce.h"
+
+static int cxl_handle_mce(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct cxl_memdev_state *mds = container_of(nb, struct cxl_memdev_state,
+ mce_notifier);
+ struct cxl_memdev *cxlmd = mds->cxlds.cxlmd;
+ struct cxl_port *endpoint = cxlmd->endpoint;
+ struct mce *mce = data;
+ u64 spa, spa_alias;
+ unsigned long pfn;
+
+ if (!mce || !mce_usable_address(mce))
+ return NOTIFY_DONE;
+
+ if (!endpoint)
+ return NOTIFY_DONE;
+
+ spa = mce->addr & MCI_ADDR_PHYSADDR;
+
+ pfn = spa >> PAGE_SHIFT;
+ if (!pfn_valid(pfn))
+ return NOTIFY_DONE;
+
+ spa_alias = cxl_port_get_spa_cache_alias(endpoint, spa);
+ if (spa_alias == ~0ULL)
+ return NOTIFY_DONE;
+
+ pfn = spa_alias >> PAGE_SHIFT;
+
+ /*
+ * Take down the aliased memory page. The original memory page flagged
+ * by the MCE will be taken care of by the standard MCE handler.
+ */
+ dev_emerg(mds->cxlds.dev, "Offlining aliased SPA address: %#llx\n",
+ spa_alias);
+ if (!memory_failure(pfn, 0))
+ set_mce_nospec(pfn);
+
+ return NOTIFY_OK;
+}
+
+static void cxl_unregister_mce_notifier(void *mce_notifier)
+{
+ mce_unregister_decode_chain(mce_notifier);
+}
+
+int devm_cxl_register_mce_notifier(struct device *dev,
+ struct notifier_block *mce_notifier)
+{
+ mce_notifier->notifier_call = cxl_handle_mce;
+ mce_notifier->priority = MCE_PRIO_UC;
+ mce_register_decode_chain(mce_notifier);
+
+ return devm_add_action_or_reset(dev, cxl_unregister_mce_notifier,
+ mce_notifier);
+}
diff --git a/drivers/cxl/core/mce.h b/drivers/cxl/core/mce.h
new file mode 100644
index 000000000000..ca272e8db6c7
--- /dev/null
+++ b/drivers/cxl/core/mce.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright(c) 2024 Intel Corporation. All rights reserved. */
+#ifndef _CXL_CORE_MCE_H_
+#define _CXL_CORE_MCE_H_
+
+#include <linux/notifier.h>
+
+#ifdef CONFIG_CXL_MCE
+int devm_cxl_register_mce_notifier(struct device *dev,
+ struct notifier_block *mce_notifier);
+#else
+static inline int
+devm_cxl_register_mce_notifier(struct device *dev,
+ struct notifier_block *mce_notifier)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
+#endif
diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c
new file mode 100644
index 000000000000..e370d733e440
--- /dev/null
+++ b/drivers/cxl/core/memdev.c
@@ -0,0 +1,1161 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2020 Intel Corporation. */
+
+#include <linux/io-64-nonatomic-lo-hi.h>
+#include <linux/firmware.h>
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/idr.h>
+#include <linux/pci.h>
+#include <cxlmem.h>
+#include "trace.h"
+#include "core.h"
+
+static DECLARE_RWSEM(cxl_memdev_rwsem);
+
+/*
+ * An entire PCI topology full of devices should be enough for any
+ * config
+ */
+#define CXL_MEM_MAX_DEVS 65536
+
+static int cxl_mem_major;
+static DEFINE_IDA(cxl_memdev_ida);
+
+static void cxl_memdev_release(struct device *dev)
+{
+ struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
+
+ ida_free(&cxl_memdev_ida, cxlmd->id);
+ devm_cxl_memdev_edac_release(cxlmd);
+ kfree(cxlmd);
+}
+
+static char *cxl_memdev_devnode(const struct device *dev, umode_t *mode, kuid_t *uid,
+ kgid_t *gid)
+{
+ return kasprintf(GFP_KERNEL, "cxl/%s", dev_name(dev));
+}
+
+static ssize_t firmware_version_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+ struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
+
+ if (!mds)
+ return sysfs_emit(buf, "\n");
+ return sysfs_emit(buf, "%.16s\n", mds->firmware_version);
+}
+static DEVICE_ATTR_RO(firmware_version);
+
+static ssize_t payload_max_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+ struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
+
+ if (!mds)
+ return sysfs_emit(buf, "\n");
+ return sysfs_emit(buf, "%zu\n", cxlds->cxl_mbox.payload_size);
+}
+static DEVICE_ATTR_RO(payload_max);
+
+static ssize_t label_storage_size_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+ struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
+
+ if (!mds)
+ return sysfs_emit(buf, "\n");
+ return sysfs_emit(buf, "%zu\n", mds->lsa_size);
+}
+static DEVICE_ATTR_RO(label_storage_size);
+
+static resource_size_t cxl_ram_size(struct cxl_dev_state *cxlds)
+{
+ /* Static RAM is only expected at partition 0. */
+ if (cxlds->part[0].mode != CXL_PARTMODE_RAM)
+ return 0;
+ return resource_size(&cxlds->part[0].res);
+}
+
+static ssize_t ram_size_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+ unsigned long long len = cxl_ram_size(cxlds);
+
+ return sysfs_emit(buf, "%#llx\n", len);
+}
+
+static struct device_attribute dev_attr_ram_size =
+ __ATTR(size, 0444, ram_size_show, NULL);
+
+static ssize_t pmem_size_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+ unsigned long long len = cxl_pmem_size(cxlds);
+
+ return sysfs_emit(buf, "%#llx\n", len);
+}
+
+static struct device_attribute dev_attr_pmem_size =
+ __ATTR(size, 0444, pmem_size_show, NULL);
+
+static ssize_t serial_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+
+ return sysfs_emit(buf, "%#llx\n", cxlds->serial);
+}
+static DEVICE_ATTR_RO(serial);
+
+static ssize_t numa_node_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ return sysfs_emit(buf, "%d\n", dev_to_node(dev));
+}
+static DEVICE_ATTR_RO(numa_node);
+
+static ssize_t security_state_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+ struct cxl_mailbox *cxl_mbox = &cxlds->cxl_mbox;
+ struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
+ unsigned long state = mds->security.state;
+ int rc = 0;
+
+ /* sync with latest submission state */
+ mutex_lock(&cxl_mbox->mbox_mutex);
+ if (mds->security.sanitize_active)
+ rc = sysfs_emit(buf, "sanitize\n");
+ mutex_unlock(&cxl_mbox->mbox_mutex);
+ if (rc)
+ return rc;
+
+ if (!(state & CXL_PMEM_SEC_STATE_USER_PASS_SET))
+ return sysfs_emit(buf, "disabled\n");
+ if (state & CXL_PMEM_SEC_STATE_FROZEN ||
+ state & CXL_PMEM_SEC_STATE_MASTER_PLIMIT ||
+ state & CXL_PMEM_SEC_STATE_USER_PLIMIT)
+ return sysfs_emit(buf, "frozen\n");
+ if (state & CXL_PMEM_SEC_STATE_LOCKED)
+ return sysfs_emit(buf, "locked\n");
+
+ return sysfs_emit(buf, "unlocked\n");
+}
+static struct device_attribute dev_attr_security_state =
+ __ATTR(state, 0444, security_state_show, NULL);
+
+static ssize_t security_sanitize_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
+ bool sanitize;
+ ssize_t rc;
+
+ if (kstrtobool(buf, &sanitize) || !sanitize)
+ return -EINVAL;
+
+ rc = cxl_mem_sanitize(cxlmd, CXL_MBOX_OP_SANITIZE);
+ if (rc)
+ return rc;
+
+ return len;
+}
+static struct device_attribute dev_attr_security_sanitize =
+ __ATTR(sanitize, 0200, NULL, security_sanitize_store);
+
+static ssize_t security_erase_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
+ ssize_t rc;
+ bool erase;
+
+ if (kstrtobool(buf, &erase) || !erase)
+ return -EINVAL;
+
+ rc = cxl_mem_sanitize(cxlmd, CXL_MBOX_OP_SECURE_ERASE);
+ if (rc)
+ return rc;
+
+ return len;
+}
+static struct device_attribute dev_attr_security_erase =
+ __ATTR(erase, 0200, NULL, security_erase_store);
+
+bool cxl_memdev_has_poison_cmd(struct cxl_memdev *cxlmd,
+ enum poison_cmd_enabled_bits cmd)
+{
+ struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds);
+
+ return test_bit(cmd, mds->poison.enabled_cmds);
+}
+
+static int cxl_get_poison_by_memdev(struct cxl_memdev *cxlmd)
+{
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+ u64 offset, length;
+ int rc = 0;
+
+ /* CXL 3.0 Spec 8.2.9.8.4.1 Separate pmem and ram poison requests */
+ for (int i = 0; i < cxlds->nr_partitions; i++) {
+ const struct resource *res = &cxlds->part[i].res;
+
+ offset = res->start;
+ length = resource_size(res);
+ rc = cxl_mem_get_poison(cxlmd, offset, length, NULL);
+ /*
+ * Invalid Physical Address is not an error for
+ * volatile addresses. Device support is optional.
+ */
+ if (rc == -EFAULT && cxlds->part[i].mode == CXL_PARTMODE_RAM)
+ rc = 0;
+ }
+ return rc;
+}
+
+int cxl_trigger_poison_list(struct cxl_memdev *cxlmd)
+{
+ struct cxl_port *port;
+ int rc;
+
+ port = cxlmd->endpoint;
+ if (!port || !is_cxl_endpoint(port))
+ return -EINVAL;
+
+ ACQUIRE(rwsem_read_intr, region_rwsem)(&cxl_rwsem.region);
+ if ((rc = ACQUIRE_ERR(rwsem_read_intr, &region_rwsem)))
+ return rc;
+
+ ACQUIRE(rwsem_read_intr, dpa_rwsem)(&cxl_rwsem.dpa);
+ if ((rc = ACQUIRE_ERR(rwsem_read_intr, &dpa_rwsem)))
+ return rc;
+
+ if (cxl_num_decoders_committed(port) == 0) {
+ /* No regions mapped to this memdev */
+ rc = cxl_get_poison_by_memdev(cxlmd);
+ } else {
+ /* Regions mapped, collect poison by endpoint */
+ rc = cxl_get_poison_by_endpoint(port);
+ }
+
+ return rc;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_trigger_poison_list, "CXL");
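+
+/*
+ * Usage sketch: the expected trigger is the memdev's sysfs attribute of the
+ * same name (the attribute wiring lives outside this file; mem0 is an
+ * example instance):
+ *
+ *	# echo 1 > /sys/bus/cxl/devices/mem0/trigger_poison_list
+ *
+ * The resulting poison records surface as cxl_poison trace events.
+ */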
+
+static int cxl_validate_poison_dpa(struct cxl_memdev *cxlmd, u64 dpa)
+{
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+
+ if (!IS_ENABLED(CONFIG_DEBUG_FS))
+ return 0;
+
+ if (!resource_size(&cxlds->dpa_res)) {
+ dev_dbg(cxlds->dev, "device has no dpa resource\n");
+ return -EINVAL;
+ }
+ if (!cxl_resource_contains_addr(&cxlds->dpa_res, dpa)) {
+ dev_dbg(cxlds->dev, "dpa:0x%llx not in resource:%pR\n",
+ dpa, &cxlds->dpa_res);
+ return -EINVAL;
+ }
+ if (!IS_ALIGNED(dpa, 64)) {
+ dev_dbg(cxlds->dev, "dpa:0x%llx is not 64-byte aligned\n", dpa);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int cxl_inject_poison_locked(struct cxl_memdev *cxlmd, u64 dpa)
+{
+ struct cxl_mailbox *cxl_mbox = &cxlmd->cxlds->cxl_mbox;
+ struct cxl_mbox_inject_poison inject;
+ struct cxl_poison_record record;
+ struct cxl_mbox_cmd mbox_cmd;
+ struct cxl_region *cxlr;
+ int rc;
+
+ if (!IS_ENABLED(CONFIG_DEBUG_FS))
+ return 0;
+
+ lockdep_assert_held(&cxl_rwsem.dpa);
+ lockdep_assert_held(&cxl_rwsem.region);
+
+ rc = cxl_validate_poison_dpa(cxlmd, dpa);
+ if (rc)
+ return rc;
+
+ inject.address = cpu_to_le64(dpa);
+ mbox_cmd = (struct cxl_mbox_cmd) {
+ .opcode = CXL_MBOX_OP_INJECT_POISON,
+ .size_in = sizeof(inject),
+ .payload_in = &inject,
+ };
+ rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
+ if (rc)
+ return rc;
+
+ cxlr = cxl_dpa_to_region(cxlmd, dpa);
+ if (cxlr)
+ dev_warn_once(cxl_mbox->host,
+ "poison inject dpa:%#llx region: %s\n", dpa,
+ dev_name(&cxlr->dev));
+
+ record = (struct cxl_poison_record) {
+ .address = cpu_to_le64(dpa),
+ .length = cpu_to_le32(1),
+ };
+ trace_cxl_poison(cxlmd, cxlr, &record, 0, 0, CXL_POISON_TRACE_INJECT);
+
+ return 0;
+}
+
+int cxl_inject_poison(struct cxl_memdev *cxlmd, u64 dpa)
+{
+ int rc;
+
+ ACQUIRE(rwsem_read_intr, region_rwsem)(&cxl_rwsem.region);
+ if ((rc = ACQUIRE_ERR(rwsem_read_intr, &region_rwsem)))
+ return rc;
+
+ ACQUIRE(rwsem_read_intr, dpa_rwsem)(&cxl_rwsem.dpa);
+ if ((rc = ACQUIRE_ERR(rwsem_read_intr, &dpa_rwsem)))
+ return rc;
+
+ return cxl_inject_poison_locked(cxlmd, dpa);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_inject_poison, "CXL");
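+
+/*
+ * Usage sketch: with CONFIG_DEBUG_FS enabled this is expected to back a
+ * per-memdev debugfs attribute (path illustrative):
+ *
+ *	# echo 0x40000000 > /sys/kernel/debug/cxl/mem0/inject_poison
+ *
+ * where the value is a 64-byte aligned DPA within the device's dpa_res.
+ */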
+
+int cxl_clear_poison_locked(struct cxl_memdev *cxlmd, u64 dpa)
+{
+ struct cxl_mailbox *cxl_mbox = &cxlmd->cxlds->cxl_mbox;
+ struct cxl_mbox_clear_poison clear;
+ struct cxl_poison_record record;
+ struct cxl_mbox_cmd mbox_cmd;
+ struct cxl_region *cxlr;
+ int rc;
+
+ if (!IS_ENABLED(CONFIG_DEBUG_FS))
+ return 0;
+
+ lockdep_assert_held(&cxl_rwsem.dpa);
+ lockdep_assert_held(&cxl_rwsem.region);
+
+ rc = cxl_validate_poison_dpa(cxlmd, dpa);
+ if (rc)
+ return rc;
+
+ /*
+ * In CXL 3.0 Spec 8.2.9.8.4.3, the Clear Poison mailbox command
+ * is defined to accept 64 bytes of write-data, along with the
+ * address to clear. This driver uses zeroes as write-data.
+ */
+ clear = (struct cxl_mbox_clear_poison) {
+ .address = cpu_to_le64(dpa)
+ };
+
+ mbox_cmd = (struct cxl_mbox_cmd) {
+ .opcode = CXL_MBOX_OP_CLEAR_POISON,
+ .size_in = sizeof(clear),
+ .payload_in = &clear,
+ };
+
+ rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
+ if (rc)
+ return rc;
+
+ cxlr = cxl_dpa_to_region(cxlmd, dpa);
+ if (cxlr)
+ dev_warn_once(cxl_mbox->host,
+ "poison clear dpa:%#llx region: %s\n", dpa,
+ dev_name(&cxlr->dev));
+
+ record = (struct cxl_poison_record) {
+ .address = cpu_to_le64(dpa),
+ .length = cpu_to_le32(1),
+ };
+ trace_cxl_poison(cxlmd, cxlr, &record, 0, 0, CXL_POISON_TRACE_CLEAR);
+
+ return 0;
+}
+
+int cxl_clear_poison(struct cxl_memdev *cxlmd, u64 dpa)
+{
+ int rc;
+
+ ACQUIRE(rwsem_read_intr, region_rwsem)(&cxl_rwsem.region);
+ if ((rc = ACQUIRE_ERR(rwsem_read_intr, &region_rwsem)))
+ return rc;
+
+ ACQUIRE(rwsem_read_intr, dpa_rwsem)(&cxl_rwsem.dpa);
+ if ((rc = ACQUIRE_ERR(rwsem_read_intr, &dpa_rwsem)))
+ return rc;
+
+ return cxl_clear_poison_locked(cxlmd, dpa);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_clear_poison, "CXL");
+
+static struct attribute *cxl_memdev_attributes[] = {
+ &dev_attr_serial.attr,
+ &dev_attr_firmware_version.attr,
+ &dev_attr_payload_max.attr,
+ &dev_attr_label_storage_size.attr,
+ &dev_attr_numa_node.attr,
+ NULL,
+};
+
+static struct cxl_dpa_perf *to_pmem_perf(struct cxl_dev_state *cxlds)
+{
+ for (int i = 0; i < cxlds->nr_partitions; i++)
+ if (cxlds->part[i].mode == CXL_PARTMODE_PMEM)
+ return &cxlds->part[i].perf;
+ return NULL;
+}
+
+static ssize_t pmem_qos_class_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+
+ return sysfs_emit(buf, "%d\n", to_pmem_perf(cxlds)->qos_class);
+}
+
+static struct device_attribute dev_attr_pmem_qos_class =
+ __ATTR(qos_class, 0444, pmem_qos_class_show, NULL);
+
+static struct attribute *cxl_memdev_pmem_attributes[] = {
+ &dev_attr_pmem_size.attr,
+ &dev_attr_pmem_qos_class.attr,
+ NULL,
+};
+
+static struct cxl_dpa_perf *to_ram_perf(struct cxl_dev_state *cxlds)
+{
+ if (cxlds->part[0].mode != CXL_PARTMODE_RAM)
+ return NULL;
+ return &cxlds->part[0].perf;
+}
+
+static ssize_t ram_qos_class_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+
+ return sysfs_emit(buf, "%d\n", to_ram_perf(cxlds)->qos_class);
+}
+
+static struct device_attribute dev_attr_ram_qos_class =
+ __ATTR(qos_class, 0444, ram_qos_class_show, NULL);
+
+static struct attribute *cxl_memdev_ram_attributes[] = {
+ &dev_attr_ram_size.attr,
+ &dev_attr_ram_qos_class.attr,
+ NULL,
+};
+
+static struct attribute *cxl_memdev_security_attributes[] = {
+ &dev_attr_security_state.attr,
+ &dev_attr_security_sanitize.attr,
+ &dev_attr_security_erase.attr,
+ NULL,
+};
+
+static umode_t cxl_memdev_visible(struct kobject *kobj, struct attribute *a,
+ int n)
+{
+ if (!IS_ENABLED(CONFIG_NUMA) && a == &dev_attr_numa_node.attr)
+ return 0;
+ return a->mode;
+}
+
+static struct attribute_group cxl_memdev_attribute_group = {
+ .attrs = cxl_memdev_attributes,
+ .is_visible = cxl_memdev_visible,
+};
+
+static umode_t cxl_ram_visible(struct kobject *kobj, struct attribute *a, int n)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
+ struct cxl_dpa_perf *perf = to_ram_perf(cxlmd->cxlds);
+
+ if (a == &dev_attr_ram_qos_class.attr &&
+ (!perf || perf->qos_class == CXL_QOS_CLASS_INVALID))
+ return 0;
+
+ return a->mode;
+}
+
+static struct attribute_group cxl_memdev_ram_attribute_group = {
+ .name = "ram",
+ .attrs = cxl_memdev_ram_attributes,
+ .is_visible = cxl_ram_visible,
+};
+
+static umode_t cxl_pmem_visible(struct kobject *kobj, struct attribute *a, int n)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
+ struct cxl_dpa_perf *perf = to_pmem_perf(cxlmd->cxlds);
+
+ if (a == &dev_attr_pmem_qos_class.attr &&
+ (!perf || perf->qos_class == CXL_QOS_CLASS_INVALID))
+ return 0;
+
+ return a->mode;
+}
+
+static struct attribute_group cxl_memdev_pmem_attribute_group = {
+ .name = "pmem",
+ .attrs = cxl_memdev_pmem_attributes,
+ .is_visible = cxl_pmem_visible,
+};
+
+static umode_t cxl_memdev_security_visible(struct kobject *kobj,
+ struct attribute *a, int n)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
+ struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds);
+
+ if (a == &dev_attr_security_sanitize.attr &&
+ !test_bit(CXL_SEC_ENABLED_SANITIZE, mds->security.enabled_cmds))
+ return 0;
+
+ if (a == &dev_attr_security_erase.attr &&
+ !test_bit(CXL_SEC_ENABLED_SECURE_ERASE, mds->security.enabled_cmds))
+ return 0;
+
+ return a->mode;
+}
+
+static struct attribute_group cxl_memdev_security_attribute_group = {
+ .name = "security",
+ .attrs = cxl_memdev_security_attributes,
+ .is_visible = cxl_memdev_security_visible,
+};
+
+static const struct attribute_group *cxl_memdev_attribute_groups[] = {
+ &cxl_memdev_attribute_group,
+ &cxl_memdev_ram_attribute_group,
+ &cxl_memdev_pmem_attribute_group,
+ &cxl_memdev_security_attribute_group,
+ NULL,
+};
+
+void cxl_memdev_update_perf(struct cxl_memdev *cxlmd)
+{
+ sysfs_update_group(&cxlmd->dev.kobj, &cxl_memdev_ram_attribute_group);
+ sysfs_update_group(&cxlmd->dev.kobj, &cxl_memdev_pmem_attribute_group);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_memdev_update_perf, "CXL");
+
+static const struct device_type cxl_memdev_type = {
+ .name = "cxl_memdev",
+ .release = cxl_memdev_release,
+ .devnode = cxl_memdev_devnode,
+ .groups = cxl_memdev_attribute_groups,
+};
+
+bool is_cxl_memdev(const struct device *dev)
+{
+ return dev->type == &cxl_memdev_type;
+}
+EXPORT_SYMBOL_NS_GPL(is_cxl_memdev, "CXL");
+
+/**
+ * set_exclusive_cxl_commands() - atomically disable user cxl commands
+ * @mds: The device state to operate on
+ * @cmds: bitmap of commands to mark exclusive
+ *
+ * Grab the cxl_memdev_rwsem in write mode to flush in-flight
+ * invocations of the ioctl path and then disable future execution of
+ * commands with the command ids set in @cmds.
+ */
+void set_exclusive_cxl_commands(struct cxl_memdev_state *mds,
+ unsigned long *cmds)
+{
+ struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
+
+ guard(rwsem_write)(&cxl_memdev_rwsem);
+ bitmap_or(cxl_mbox->exclusive_cmds, cxl_mbox->exclusive_cmds,
+ cmds, CXL_MEM_COMMAND_ID_MAX);
+}
+EXPORT_SYMBOL_NS_GPL(set_exclusive_cxl_commands, "CXL");
+
+/**
+ * clear_exclusive_cxl_commands() - atomically enable user cxl commands
+ * @mds: The device state to modify
+ * @cmds: bitmap of commands to mark available for userspace
+ */
+void clear_exclusive_cxl_commands(struct cxl_memdev_state *mds,
+ unsigned long *cmds)
+{
+ struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
+
+ guard(rwsem_write)(&cxl_memdev_rwsem);
+ bitmap_andnot(cxl_mbox->exclusive_cmds, cxl_mbox->exclusive_cmds,
+ cmds, CXL_MEM_COMMAND_ID_MAX);
+}
+EXPORT_SYMBOL_NS_GPL(clear_exclusive_cxl_commands, "CXL");
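+
+/*
+ * Illustrative kernel-side pairing of the two helpers above, mirroring how an
+ * ancillary driver (e.g. the CXL pmem driver) can fence label commands away
+ * from the ioctl path while it owns them (command ids shown are examples):
+ *
+ *	static DECLARE_BITMAP(exclusive_cmds, CXL_MEM_COMMAND_ID_MAX);
+ *
+ *	set_bit(CXL_MEM_COMMAND_ID_SET_LSA, exclusive_cmds);
+ *	set_exclusive_cxl_commands(mds, exclusive_cmds);
+ *	...
+ *	clear_exclusive_cxl_commands(mds, exclusive_cmds);
+ */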
+
+static void cxl_memdev_shutdown(struct device *dev)
+{
+ struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
+
+ guard(rwsem_write)(&cxl_memdev_rwsem);
+ cxlmd->cxlds = NULL;
+}
+
+static void cxl_memdev_unregister(void *_cxlmd)
+{
+ struct cxl_memdev *cxlmd = _cxlmd;
+ struct device *dev = &cxlmd->dev;
+
+ cdev_device_del(&cxlmd->cdev, dev);
+ cxl_memdev_shutdown(dev);
+ put_device(dev);
+}
+
+static void detach_memdev(struct work_struct *work)
+{
+ struct cxl_memdev *cxlmd;
+
+ cxlmd = container_of(work, typeof(*cxlmd), detach_work);
+ device_release_driver(&cxlmd->dev);
+ put_device(&cxlmd->dev);
+}
+
+static struct lock_class_key cxl_memdev_key;
+
+static struct cxl_memdev *cxl_memdev_alloc(struct cxl_dev_state *cxlds,
+ const struct file_operations *fops)
+{
+ struct cxl_memdev *cxlmd;
+ struct device *dev;
+ struct cdev *cdev;
+ int rc;
+
+ cxlmd = kzalloc(sizeof(*cxlmd), GFP_KERNEL);
+ if (!cxlmd)
+ return ERR_PTR(-ENOMEM);
+
+ rc = ida_alloc_max(&cxl_memdev_ida, CXL_MEM_MAX_DEVS - 1, GFP_KERNEL);
+ if (rc < 0)
+ goto err;
+ cxlmd->id = rc;
+ cxlmd->depth = -1;
+
+ dev = &cxlmd->dev;
+ device_initialize(dev);
+ lockdep_set_class(&dev->mutex, &cxl_memdev_key);
+ dev->parent = cxlds->dev;
+ dev->bus = &cxl_bus_type;
+ dev->devt = MKDEV(cxl_mem_major, cxlmd->id);
+ dev->type = &cxl_memdev_type;
+ device_set_pm_not_required(dev);
+ INIT_WORK(&cxlmd->detach_work, detach_memdev);
+
+ cdev = &cxlmd->cdev;
+ cdev_init(cdev, fops);
+ return cxlmd;
+
+err:
+ kfree(cxlmd);
+ return ERR_PTR(rc);
+}
+
+static long __cxl_memdev_ioctl(struct cxl_memdev *cxlmd, unsigned int cmd,
+ unsigned long arg)
+{
+ struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds);
+ struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
+
+ switch (cmd) {
+ case CXL_MEM_QUERY_COMMANDS:
+ return cxl_query_cmd(cxl_mbox, (void __user *)arg);
+ case CXL_MEM_SEND_COMMAND:
+ return cxl_send_cmd(cxl_mbox, (void __user *)arg);
+ default:
+ return -ENOTTY;
+ }
+}
+
+static long cxl_memdev_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct cxl_memdev *cxlmd = file->private_data;
+ struct cxl_dev_state *cxlds;
+
+ guard(rwsem_read)(&cxl_memdev_rwsem);
+ cxlds = cxlmd->cxlds;
+ if (cxlds && cxlds->type == CXL_DEVTYPE_CLASSMEM)
+ return __cxl_memdev_ioctl(cxlmd, cmd, arg);
+
+ return -ENXIO;
+}
+
+static int cxl_memdev_open(struct inode *inode, struct file *file)
+{
+ struct cxl_memdev *cxlmd =
+ container_of(inode->i_cdev, typeof(*cxlmd), cdev);
+
+ get_device(&cxlmd->dev);
+ file->private_data = cxlmd;
+
+ return 0;
+}
+
+static int cxl_memdev_release_file(struct inode *inode, struct file *file)
+{
+ struct cxl_memdev *cxlmd =
+ container_of(inode->i_cdev, typeof(*cxlmd), cdev);
+
+ put_device(&cxlmd->dev);
+
+ return 0;
+}
+
+/**
+ * cxl_mem_get_fw_info - Get Firmware info
+ * @mds: The device data for the operation
+ *
+ * Retrieve firmware info for the device specified.
+ *
+ * Return: 0 if no error; otherwise the result of the mailbox command.
+ *
+ * See CXL-3.0 8.2.9.3.1 Get FW Info
+ */
+static int cxl_mem_get_fw_info(struct cxl_memdev_state *mds)
+{
+ struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
+ struct cxl_mbox_get_fw_info info;
+ struct cxl_mbox_cmd mbox_cmd;
+ int rc;
+
+ mbox_cmd = (struct cxl_mbox_cmd) {
+ .opcode = CXL_MBOX_OP_GET_FW_INFO,
+ .size_out = sizeof(info),
+ .payload_out = &info,
+ };
+
+ rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
+ if (rc < 0)
+ return rc;
+
+ mds->fw.num_slots = info.num_slots;
+ mds->fw.cur_slot = FIELD_GET(CXL_FW_INFO_SLOT_INFO_CUR_MASK,
+ info.slot_info);
+
+ return 0;
+}
+
+/**
+ * cxl_mem_activate_fw - Activate Firmware
+ * @mds: The device data for the operation
+ * @slot: slot number to activate
+ *
+ * Activate firmware in a given slot for the device specified.
+ *
+ * Return: 0 if no error; otherwise the result of the mailbox command.
+ *
+ * See CXL-3.0 8.2.9.3.3 Activate FW
+ */
+static int cxl_mem_activate_fw(struct cxl_memdev_state *mds, int slot)
+{
+ struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
+ struct cxl_mbox_activate_fw activate;
+ struct cxl_mbox_cmd mbox_cmd;
+
+ if (slot == 0 || slot > mds->fw.num_slots)
+ return -EINVAL;
+
+ mbox_cmd = (struct cxl_mbox_cmd) {
+ .opcode = CXL_MBOX_OP_ACTIVATE_FW,
+ .size_in = sizeof(activate),
+ .payload_in = &activate,
+ };
+
+ /* Only offline activation supported for now */
+ activate.action = CXL_FW_ACTIVATE_OFFLINE;
+ activate.slot = slot;
+
+ return cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
+}
+
+/**
+ * cxl_mem_abort_fw_xfer - Abort an in-progress FW transfer
+ * @mds: The device data for the operation
+ *
+ * Abort an in-progress firmware transfer for the device specified.
+ *
+ * Return: 0 if no error; otherwise the result of the mailbox command.
+ *
+ * See CXL-3.0 8.2.9.3.2 Transfer FW
+ */
+static int cxl_mem_abort_fw_xfer(struct cxl_memdev_state *mds)
+{
+ struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
+ struct cxl_mbox_transfer_fw *transfer;
+ struct cxl_mbox_cmd mbox_cmd;
+ int rc;
+
+ transfer = kzalloc(struct_size(transfer, data, 0), GFP_KERNEL);
+ if (!transfer)
+ return -ENOMEM;
+
+ /* Set a 1s poll interval and a total wait time of 30s */
+ mbox_cmd = (struct cxl_mbox_cmd) {
+ .opcode = CXL_MBOX_OP_TRANSFER_FW,
+ .size_in = sizeof(*transfer),
+ .payload_in = transfer,
+ .poll_interval_ms = 1000,
+ .poll_count = 30,
+ };
+
+ transfer->action = CXL_FW_TRANSFER_ACTION_ABORT;
+
+ rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
+ kfree(transfer);
+ return rc;
+}
+
+static void cxl_fw_cleanup(struct fw_upload *fwl)
+{
+ struct cxl_memdev_state *mds = fwl->dd_handle;
+
+ mds->fw.next_slot = 0;
+}
+
+static int cxl_fw_do_cancel(struct fw_upload *fwl)
+{
+ struct cxl_memdev_state *mds = fwl->dd_handle;
+ struct cxl_dev_state *cxlds = &mds->cxlds;
+ struct cxl_memdev *cxlmd = cxlds->cxlmd;
+ int rc;
+
+ rc = cxl_mem_abort_fw_xfer(mds);
+ if (rc < 0)
+ dev_err(&cxlmd->dev, "Error aborting FW transfer: %d\n", rc);
+
+ return FW_UPLOAD_ERR_CANCELED;
+}
+
+static enum fw_upload_err cxl_fw_prepare(struct fw_upload *fwl, const u8 *data,
+ u32 size)
+{
+ struct cxl_memdev_state *mds = fwl->dd_handle;
+ struct cxl_mbox_transfer_fw *transfer;
+ struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
+
+ if (!size)
+ return FW_UPLOAD_ERR_INVALID_SIZE;
+
+ mds->fw.oneshot = struct_size(transfer, data, size) <
+ cxl_mbox->payload_size;
+
+ if (cxl_mem_get_fw_info(mds))
+ return FW_UPLOAD_ERR_HW_ERROR;
+
+ /*
+ * So far no state has been changed, hence no other cleanup is
+ * necessary. Simply return the cancelled status.
+ */
+ if (test_and_clear_bit(CXL_FW_CANCEL, mds->fw.state))
+ return FW_UPLOAD_ERR_CANCELED;
+
+ return FW_UPLOAD_ERR_NONE;
+}
+
+static enum fw_upload_err cxl_fw_write(struct fw_upload *fwl, const u8 *data,
+ u32 offset, u32 size, u32 *written)
+{
+ struct cxl_memdev_state *mds = fwl->dd_handle;
+ struct cxl_dev_state *cxlds = &mds->cxlds;
+ struct cxl_mailbox *cxl_mbox = &cxlds->cxl_mbox;
+ struct cxl_memdev *cxlmd = cxlds->cxlmd;
+ struct cxl_mbox_transfer_fw *transfer;
+ struct cxl_mbox_cmd mbox_cmd;
+ u32 cur_size, remaining;
+ size_t size_in;
+ int rc;
+
+ *written = 0;
+
+ /* Offset has to be aligned to 128B (CXL-3.0 8.2.9.3.2 Table 8-57) */
+ if (!IS_ALIGNED(offset, CXL_FW_TRANSFER_ALIGNMENT)) {
+ dev_err(&cxlmd->dev,
+ "misaligned offset for FW transfer slice (%u)\n",
+ offset);
+ return FW_UPLOAD_ERR_RW_ERROR;
+ }
+
+ /*
+ * Pick the transfer size based on cxl_mbox->payload_size. @size must be
+ * 128-byte aligned, ->payload_size is a power of 2 starting at 256 bytes,
+ * and sizeof(*transfer) is 128. These constraints imply that @cur_size
+ * will always be 128-byte aligned.
+ */
+ cur_size = min_t(size_t, size, cxl_mbox->payload_size - sizeof(*transfer));
+
+ remaining = size - cur_size;
+ size_in = struct_size(transfer, data, cur_size);
+
+ if (test_and_clear_bit(CXL_FW_CANCEL, mds->fw.state))
+ return cxl_fw_do_cancel(fwl);
+
+ /*
+ * Slot numbers are 1-indexed
+ * cur_slot is the 0-indexed next_slot (i.e. 'cur_slot - 1 + 1')
+ * Check for rollover using modulo, and 1-index it by adding 1
+ */
+ mds->fw.next_slot = (mds->fw.cur_slot % mds->fw.num_slots) + 1;
+
+ /* Do the transfer via mailbox cmd */
+ transfer = kzalloc(size_in, GFP_KERNEL);
+ if (!transfer)
+ return FW_UPLOAD_ERR_RW_ERROR;
+
+ transfer->offset = cpu_to_le32(offset / CXL_FW_TRANSFER_ALIGNMENT);
+ memcpy(transfer->data, data + offset, cur_size);
+ if (mds->fw.oneshot) {
+ transfer->action = CXL_FW_TRANSFER_ACTION_FULL;
+ transfer->slot = mds->fw.next_slot;
+ } else {
+ if (offset == 0) {
+ transfer->action = CXL_FW_TRANSFER_ACTION_INITIATE;
+ } else if (remaining == 0) {
+ transfer->action = CXL_FW_TRANSFER_ACTION_END;
+ transfer->slot = mds->fw.next_slot;
+ } else {
+ transfer->action = CXL_FW_TRANSFER_ACTION_CONTINUE;
+ }
+ }
+
+ mbox_cmd = (struct cxl_mbox_cmd) {
+ .opcode = CXL_MBOX_OP_TRANSFER_FW,
+ .size_in = size_in,
+ .payload_in = transfer,
+ .poll_interval_ms = 1000,
+ .poll_count = 30,
+ };
+
+ rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
+ if (rc < 0) {
+ rc = FW_UPLOAD_ERR_RW_ERROR;
+ goto out_free;
+ }
+
+ *written = cur_size;
+
+ /* Activate FW if oneshot or if the last slice was written */
+ if (mds->fw.oneshot || remaining == 0) {
+ dev_dbg(&cxlmd->dev, "Activating firmware slot: %d\n",
+ mds->fw.next_slot);
+ rc = cxl_mem_activate_fw(mds, mds->fw.next_slot);
+ if (rc < 0) {
+ dev_err(&cxlmd->dev, "Error activating firmware: %d\n",
+ rc);
+ rc = FW_UPLOAD_ERR_HW_ERROR;
+ goto out_free;
+ }
+ }
+
+ rc = FW_UPLOAD_ERR_NONE;
+
+out_free:
+ kfree(transfer);
+ return rc;
+}
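
The 1-indexed slot rollover above is easy to get wrong by one; a tiny sketch with concrete numbers (the helper name is illustrative, not part of the patch):

#include <linux/types.h>

static inline u8 example_next_fw_slot(u8 cur_slot, u8 num_slots)
{
	/* 1-indexed rollover: with num_slots == 4, 1->2, 2->3, 3->4, 4->1 */
	return (cur_slot % num_slots) + 1;
}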
+
+static enum fw_upload_err cxl_fw_poll_complete(struct fw_upload *fwl)
+{
+ struct cxl_memdev_state *mds = fwl->dd_handle;
+
+ /*
+ * cxl_internal_send_cmd() handles background operations synchronously.
+ * No need to wait for completions here - any errors would've been
+ * reported and handled during the ->write() call(s).
+ * Just check if a cancel request was received, and return success.
+ */
+ if (test_and_clear_bit(CXL_FW_CANCEL, mds->fw.state))
+ return cxl_fw_do_cancel(fwl);
+
+ return FW_UPLOAD_ERR_NONE;
+}
+
+static void cxl_fw_cancel(struct fw_upload *fwl)
+{
+ struct cxl_memdev_state *mds = fwl->dd_handle;
+
+ set_bit(CXL_FW_CANCEL, mds->fw.state);
+}
+
+static const struct fw_upload_ops cxl_memdev_fw_ops = {
+ .prepare = cxl_fw_prepare,
+ .write = cxl_fw_write,
+ .poll_complete = cxl_fw_poll_complete,
+ .cancel = cxl_fw_cancel,
+ .cleanup = cxl_fw_cleanup,
+};
+
+static void cxl_remove_fw_upload(void *fwl)
+{
+ firmware_upload_unregister(fwl);
+}
+
+int devm_cxl_setup_fw_upload(struct device *host, struct cxl_memdev_state *mds)
+{
+ struct cxl_dev_state *cxlds = &mds->cxlds;
+ struct cxl_mailbox *cxl_mbox = &cxlds->cxl_mbox;
+ struct device *dev = &cxlds->cxlmd->dev;
+ struct fw_upload *fwl;
+
+ if (!test_bit(CXL_MEM_COMMAND_ID_GET_FW_INFO, cxl_mbox->enabled_cmds))
+ return 0;
+
+ fwl = firmware_upload_register(THIS_MODULE, dev, dev_name(dev),
+ &cxl_memdev_fw_ops, mds);
+ if (IS_ERR(fwl))
+ return PTR_ERR(fwl);
+ return devm_add_action_or_reset(host, cxl_remove_fw_upload, fwl);
+}
+EXPORT_SYMBOL_NS_GPL(devm_cxl_setup_fw_upload, "CXL");
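
Editor's note: once firmware_upload_register() succeeds, user space drives the transfer through the firmware-upload sysfs class interface, which in turn invokes the ->prepare()/->write() ops above. A hedged sketch, assuming a device registered as mem0 and the usual loading/data write sequence of that interface:

#include <stdio.h>

static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	const char *base = "/sys/class/firmware/mem0";	/* example name */
	char path[256], buf[4096];
	FILE *img, *data;
	size_t n;

	snprintf(path, sizeof(path), "%s/loading", base);
	if (write_str(path, "1"))	/* open the transfer window */
		return 1;

	img = fopen("fw.bin", "rb");
	snprintf(path, sizeof(path), "%s/data", base);
	data = fopen(path, "wb");
	if (!img || !data)
		return 1;
	while ((n = fread(buf, 1, sizeof(buf), img)) > 0)
		fwrite(buf, 1, n, data);	/* stream the image */
	fclose(img);
	fclose(data);

	snprintf(path, sizeof(path), "%s/loading", base);
	return write_str(path, "0") ? 1 : 0;	/* kick off the upload */
}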
+
+static const struct file_operations cxl_memdev_fops = {
+ .owner = THIS_MODULE,
+ .unlocked_ioctl = cxl_memdev_ioctl,
+ .open = cxl_memdev_open,
+ .release = cxl_memdev_release_file,
+ .compat_ioctl = compat_ptr_ioctl,
+ .llseek = noop_llseek,
+};
+
+struct cxl_memdev *devm_cxl_add_memdev(struct device *host,
+ struct cxl_dev_state *cxlds)
+{
+ struct cxl_memdev *cxlmd;
+ struct device *dev;
+ struct cdev *cdev;
+ int rc;
+
+ cxlmd = cxl_memdev_alloc(cxlds, &cxl_memdev_fops);
+ if (IS_ERR(cxlmd))
+ return cxlmd;
+
+ dev = &cxlmd->dev;
+ rc = dev_set_name(dev, "mem%d", cxlmd->id);
+ if (rc)
+ goto err;
+
+ /*
+ * Activate ioctl operations, no cxl_memdev_rwsem manipulation
+ * needed as this is ordered with cdev_add() publishing the device.
+ */
+ cxlmd->cxlds = cxlds;
+ cxlds->cxlmd = cxlmd;
+
+ cdev = &cxlmd->cdev;
+ rc = cdev_device_add(cdev, dev);
+ if (rc)
+ goto err;
+
+ rc = devm_add_action_or_reset(host, cxl_memdev_unregister, cxlmd);
+ if (rc)
+ return ERR_PTR(rc);
+ return cxlmd;
+
+err:
+ /*
+ * The cdev was briefly live, shut down any ioctl operations that
+ * saw that state.
+ */
+ cxl_memdev_shutdown(dev);
+ put_device(dev);
+ return ERR_PTR(rc);
+}
+EXPORT_SYMBOL_NS_GPL(devm_cxl_add_memdev, "CXL");
+
+static void sanitize_teardown_notifier(void *data)
+{
+ struct cxl_memdev_state *mds = data;
+ struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
+ struct kernfs_node *state;
+
+ /*
+ * Prevent new irq triggered invocations of the workqueue and
+ * flush inflight invocations.
+ */
+ mutex_lock(&cxl_mbox->mbox_mutex);
+ state = mds->security.sanitize_node;
+ mds->security.sanitize_node = NULL;
+ mutex_unlock(&cxl_mbox->mbox_mutex);
+
+ cancel_delayed_work_sync(&mds->security.poll_dwork);
+ sysfs_put(state);
+}
+
+int devm_cxl_sanitize_setup_notifier(struct device *host,
+ struct cxl_memdev *cxlmd)
+{
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+ struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
+ struct kernfs_node *sec;
+
+ if (!test_bit(CXL_SEC_ENABLED_SANITIZE, mds->security.enabled_cmds))
+ return 0;
+
+ /*
+ * Note, the expectation is that @cxlmd would have failed to be
+ * created if these sysfs_get_dirent calls fail.
+ */
+ sec = sysfs_get_dirent(cxlmd->dev.kobj.sd, "security");
+ if (!sec)
+ return -ENOENT;
+ mds->security.sanitize_node = sysfs_get_dirent(sec, "state");
+ sysfs_put(sec);
+ if (!mds->security.sanitize_node)
+ return -ENOENT;
+
+ return devm_add_action_or_reset(host, sanitize_teardown_notifier, mds);
+}
+EXPORT_SYMBOL_NS_GPL(devm_cxl_sanitize_setup_notifier, "CXL");
+
+__init int cxl_memdev_init(void)
+{
+ dev_t devt;
+ int rc;
+
+ rc = alloc_chrdev_region(&devt, 0, CXL_MEM_MAX_DEVS, "cxl");
+ if (rc)
+ return rc;
+
+ cxl_mem_major = MAJOR(devt);
+
+ return 0;
+}
+
+void cxl_memdev_exit(void)
+{
+ unregister_chrdev_region(MKDEV(cxl_mem_major, 0), CXL_MEM_MAX_DEVS);
+}
diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
new file mode 100644
index 000000000000..5b023a0178a4
--- /dev/null
+++ b/drivers/cxl/core/pci.c
@@ -0,0 +1,1189 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2021 Intel Corporation. All rights reserved. */
+#include <linux/units.h>
+#include <linux/io-64-nonatomic-lo-hi.h>
+#include <linux/device.h>
+#include <linux/delay.h>
+#include <linux/pci.h>
+#include <linux/pci-doe.h>
+#include <linux/aer.h>
+#include <cxlpci.h>
+#include <cxlmem.h>
+#include <cxl.h>
+#include "core.h"
+#include "trace.h"
+
+/**
+ * DOC: cxl core pci
+ *
+ * Compute Express Link protocols are layered on top of PCIe. CXL core provides
+ * a set of helpers for CXL interactions which occur via PCIe.
+ */
+
+static unsigned short media_ready_timeout = 60;
+module_param(media_ready_timeout, ushort, 0644);
+MODULE_PARM_DESC(media_ready_timeout, "seconds to wait for media ready");
+
+static int pci_get_port_num(struct pci_dev *pdev)
+{
+ u32 lnkcap;
+ int type;
+
+ type = pci_pcie_type(pdev);
+ if (type != PCI_EXP_TYPE_DOWNSTREAM && type != PCI_EXP_TYPE_ROOT_PORT)
+ return -EINVAL;
+
+ if (pci_read_config_dword(pdev, pci_pcie_cap(pdev) + PCI_EXP_LNKCAP,
+ &lnkcap))
+ return -ENXIO;
+
+ return FIELD_GET(PCI_EXP_LNKCAP_PN, lnkcap);
+}
+
+/**
+ * __devm_cxl_add_dport_by_dev - allocate a dport by dport device
+ * @port: cxl_port that hosts the dport
+ * @dport_dev: 'struct device' of the dport
+ *
+ * Returns the allocated dport on success or ERR_PTR() of -errno on error
+ */
+struct cxl_dport *__devm_cxl_add_dport_by_dev(struct cxl_port *port,
+ struct device *dport_dev)
+{
+ struct cxl_register_map map;
+ struct pci_dev *pdev;
+ int port_num, rc;
+
+ if (!dev_is_pci(dport_dev))
+ return ERR_PTR(-EINVAL);
+
+ pdev = to_pci_dev(dport_dev);
+ port_num = pci_get_port_num(pdev);
+ if (port_num < 0)
+ return ERR_PTR(port_num);
+
+ rc = cxl_find_regblock(pdev, CXL_REGLOC_RBI_COMPONENT, &map);
+ if (rc)
+ return ERR_PTR(rc);
+
+ device_lock_assert(&port->dev);
+ return devm_cxl_add_dport(port, dport_dev, port_num, map.resource);
+}
+EXPORT_SYMBOL_NS_GPL(__devm_cxl_add_dport_by_dev, "CXL");
+
+static int cxl_dvsec_mem_range_valid(struct cxl_dev_state *cxlds, int id)
+{
+ struct pci_dev *pdev = to_pci_dev(cxlds->dev);
+ int d = cxlds->cxl_dvsec;
+ bool valid = false;
+ int rc, i;
+ u32 temp;
+
+ if (id > CXL_DVSEC_RANGE_MAX)
+ return -EINVAL;
+
+ /* Check MEM INFO VALID bit first, give up after 1s */
+ i = 1;
+ do {
+ rc = pci_read_config_dword(pdev,
+ d + CXL_DVSEC_RANGE_SIZE_LOW(id),
+ &temp);
+ if (rc)
+ return rc;
+
+ valid = FIELD_GET(CXL_DVSEC_MEM_INFO_VALID, temp);
+ if (valid)
+ break;
+ msleep(1000);
+ } while (i--);
+
+ if (!valid) {
+ dev_err(&pdev->dev,
+ "Timeout awaiting memory range %d valid after 1s.\n",
+ id);
+ return -ETIMEDOUT;
+ }
+
+ return 0;
+}
+
+static int cxl_dvsec_mem_range_active(struct cxl_dev_state *cxlds, int id)
+{
+ struct pci_dev *pdev = to_pci_dev(cxlds->dev);
+ int d = cxlds->cxl_dvsec;
+ bool active = false;
+ int rc, i;
+ u32 temp;
+
+ if (id > CXL_DVSEC_RANGE_MAX)
+ return -EINVAL;
+
+ /* Check MEM ACTIVE bit, up to 60s timeout by default */
+ for (i = media_ready_timeout; i; i--) {
+ rc = pci_read_config_dword(
+ pdev, d + CXL_DVSEC_RANGE_SIZE_LOW(id), &temp);
+ if (rc)
+ return rc;
+
+ active = FIELD_GET(CXL_DVSEC_MEM_ACTIVE, temp);
+ if (active)
+ break;
+ msleep(1000);
+ }
+
+ if (!active) {
+ dev_err(&pdev->dev,
+ "timeout awaiting memory active after %d seconds\n",
+ media_ready_timeout);
+ return -ETIMEDOUT;
+ }
+
+ return 0;
+}
+
+/*
+ * Wait up to @media_ready_timeout for the device to report memory
+ * active.
+ */
+int cxl_await_media_ready(struct cxl_dev_state *cxlds)
+{
+ struct pci_dev *pdev = to_pci_dev(cxlds->dev);
+ int d = cxlds->cxl_dvsec;
+ int rc, i, hdm_count;
+ u64 md_status;
+ u16 cap;
+
+ rc = pci_read_config_word(pdev,
+ d + CXL_DVSEC_CAP_OFFSET, &cap);
+ if (rc)
+ return rc;
+
+ hdm_count = FIELD_GET(CXL_DVSEC_HDM_COUNT_MASK, cap);
+ for (i = 0; i < hdm_count; i++) {
+ rc = cxl_dvsec_mem_range_valid(cxlds, i);
+ if (rc)
+ return rc;
+ }
+
+ for (i = 0; i < hdm_count; i++) {
+ rc = cxl_dvsec_mem_range_active(cxlds, i);
+ if (rc)
+ return rc;
+ }
+
+ md_status = readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET);
+ if (!CXLMDEV_READY(md_status))
+ return -EIO;
+
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_await_media_ready, "CXL");
+
+static int cxl_set_mem_enable(struct cxl_dev_state *cxlds, u16 val)
+{
+ struct pci_dev *pdev = to_pci_dev(cxlds->dev);
+ int d = cxlds->cxl_dvsec;
+ u16 ctrl;
+ int rc;
+
+ rc = pci_read_config_word(pdev, d + CXL_DVSEC_CTRL_OFFSET, &ctrl);
+ if (rc < 0)
+ return rc;
+
+ if ((ctrl & CXL_DVSEC_MEM_ENABLE) == val)
+ return 1;
+ ctrl &= ~CXL_DVSEC_MEM_ENABLE;
+ ctrl |= val;
+
+ rc = pci_write_config_word(pdev, d + CXL_DVSEC_CTRL_OFFSET, ctrl);
+ if (rc < 0)
+ return rc;
+
+ return 0;
+}
+
+static void clear_mem_enable(void *cxlds)
+{
+ cxl_set_mem_enable(cxlds, 0);
+}
+
+static int devm_cxl_enable_mem(struct device *host, struct cxl_dev_state *cxlds)
+{
+ int rc;
+
+ rc = cxl_set_mem_enable(cxlds, CXL_DVSEC_MEM_ENABLE);
+ if (rc < 0)
+ return rc;
+ if (rc > 0)
+ return 0;
+ return devm_add_action_or_reset(host, clear_mem_enable, cxlds);
+}
+
+/* require dvsec ranges to be covered by a locked platform window */
+static int dvsec_range_allowed(struct device *dev, const void *arg)
+{
+ const struct range *dev_range = arg;
+ struct cxl_decoder *cxld;
+
+ if (!is_root_decoder(dev))
+ return 0;
+
+ cxld = to_cxl_decoder(dev);
+
+ if (!(cxld->flags & CXL_DECODER_F_RAM))
+ return 0;
+
+ return range_contains(&cxld->hpa_range, dev_range);
+}
+
+static void disable_hdm(void *_cxlhdm)
+{
+ u32 global_ctrl;
+ struct cxl_hdm *cxlhdm = _cxlhdm;
+ void __iomem *hdm = cxlhdm->regs.hdm_decoder;
+
+ global_ctrl = readl(hdm + CXL_HDM_DECODER_CTRL_OFFSET);
+ writel(global_ctrl & ~CXL_HDM_DECODER_ENABLE,
+ hdm + CXL_HDM_DECODER_CTRL_OFFSET);
+}
+
+static int devm_cxl_enable_hdm(struct device *host, struct cxl_hdm *cxlhdm)
+{
+ void __iomem *hdm = cxlhdm->regs.hdm_decoder;
+ u32 global_ctrl;
+
+ global_ctrl = readl(hdm + CXL_HDM_DECODER_CTRL_OFFSET);
+ writel(global_ctrl | CXL_HDM_DECODER_ENABLE,
+ hdm + CXL_HDM_DECODER_CTRL_OFFSET);
+
+ return devm_add_action_or_reset(host, disable_hdm, cxlhdm);
+}
+
+int cxl_dvsec_rr_decode(struct cxl_dev_state *cxlds,
+ struct cxl_endpoint_dvsec_info *info)
+{
+ struct pci_dev *pdev = to_pci_dev(cxlds->dev);
+ struct device *dev = cxlds->dev;
+ int hdm_count, rc, i, ranges = 0;
+ int d = cxlds->cxl_dvsec;
+ u16 cap, ctrl;
+
+ if (!d) {
+ dev_dbg(dev, "No DVSEC Capability\n");
+ return -ENXIO;
+ }
+
+ rc = pci_read_config_word(pdev, d + CXL_DVSEC_CAP_OFFSET, &cap);
+ if (rc)
+ return rc;
+
+ if (!(cap & CXL_DVSEC_MEM_CAPABLE)) {
+ dev_dbg(dev, "Not MEM Capable\n");
+ return -ENXIO;
+ }
+
+ /*
+ * It is not allowed by spec for MEM.capable to be set and have 0 legacy
+ * HDM decoders (values > 2 are also undefined as of CXL 2.0). As this
+ * driver is for a spec defined class code which must be CXL.mem
+ * capable, there is no point in continuing to enable CXL.mem.
+ */
+ hdm_count = FIELD_GET(CXL_DVSEC_HDM_COUNT_MASK, cap);
+ if (!hdm_count || hdm_count > 2)
+ return -EINVAL;
+
+ /*
+ * The current DVSEC values are moot if the memory capability is
+ * disabled, and they will remain moot after the HDM Decoder
+ * capability is enabled.
+ */
+ rc = pci_read_config_word(pdev, d + CXL_DVSEC_CTRL_OFFSET, &ctrl);
+ if (rc)
+ return rc;
+
+ info->mem_enabled = FIELD_GET(CXL_DVSEC_MEM_ENABLE, ctrl);
+ if (!info->mem_enabled)
+ return 0;
+
+ for (i = 0; i < hdm_count; i++) {
+ u64 base, size;
+ u32 temp;
+
+ rc = cxl_dvsec_mem_range_valid(cxlds, i);
+ if (rc)
+ return rc;
+
+ rc = pci_read_config_dword(
+ pdev, d + CXL_DVSEC_RANGE_SIZE_HIGH(i), &temp);
+ if (rc)
+ return rc;
+
+ size = (u64)temp << 32;
+
+ rc = pci_read_config_dword(
+ pdev, d + CXL_DVSEC_RANGE_SIZE_LOW(i), &temp);
+ if (rc)
+ return rc;
+
+ size |= temp & CXL_DVSEC_MEM_SIZE_LOW_MASK;
+ if (!size)
+ continue;
+
+ rc = pci_read_config_dword(
+ pdev, d + CXL_DVSEC_RANGE_BASE_HIGH(i), &temp);
+ if (rc)
+ return rc;
+
+ base = (u64)temp << 32;
+
+ rc = pci_read_config_dword(
+ pdev, d + CXL_DVSEC_RANGE_BASE_LOW(i), &temp);
+ if (rc)
+ return rc;
+
+ base |= temp & CXL_DVSEC_MEM_BASE_LOW_MASK;
+
+ info->dvsec_range[ranges++] = (struct range) {
+ .start = base,
+ .end = base + size - 1
+ };
+ }
+
+ info->ranges = ranges;
+
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_dvsec_rr_decode, "CXL");
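
The HIGH/LOW register stitching above reduces to a shift-and-or; a sketch with the arithmetic isolated (the function name and the assumption that the low dword has already been masked down to its address bits are illustrative, not part of the patch):

#include <linux/range.h>
#include <linux/types.h>

static struct range example_dvsec_range(u32 base_hi, u32 base_lo_bits,
					u32 size_hi, u32 size_lo_bits)
{
	u64 base = ((u64)base_hi << 32) | base_lo_bits;
	u64 size = ((u64)size_hi << 32) | size_lo_bits;

	/* A range register pair describes [base, base + size - 1] */
	return (struct range) {
		.start = base,
		.end = base + size - 1,
	};
}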
+
+/**
+ * cxl_hdm_decode_init() - Setup HDM decoding for the endpoint
+ * @cxlds: Device state
+ * @cxlhdm: Mapped HDM decoder Capability
+ * @info: Cached DVSEC range registers info
+ *
+ * Try to enable the endpoint's HDM Decoder Capability
+ */
+int cxl_hdm_decode_init(struct cxl_dev_state *cxlds, struct cxl_hdm *cxlhdm,
+ struct cxl_endpoint_dvsec_info *info)
+{
+ void __iomem *hdm = cxlhdm->regs.hdm_decoder;
+ struct cxl_port *port = cxlhdm->port;
+ struct device *dev = cxlds->dev;
+ struct cxl_port *root;
+ int i, rc, allowed;
+ u32 global_ctrl = 0;
+
+ if (hdm)
+ global_ctrl = readl(hdm + CXL_HDM_DECODER_CTRL_OFFSET);
+
+ /*
+ * If the HDM Decoder Capability is already enabled then assume
+ * that some other agent like platform firmware set it up.
+ */
+ if (global_ctrl & CXL_HDM_DECODER_ENABLE || (!hdm && info->mem_enabled))
+ return devm_cxl_enable_mem(&port->dev, cxlds);
+
+ /*
+ * If the HDM Decoder Capability does not exist and DVSEC was
+ * not set up, the DVSEC-based emulation cannot be used.
+ */
+ if (!hdm)
+ return -ENODEV;
+
+ /* The HDM Decoder Capability exists but is globally disabled. */
+
+ /*
+ * If the DVSEC CXL Range registers are not enabled, just
+ * enable and use the HDM Decoder Capability registers.
+ */
+ if (!info->mem_enabled) {
+ rc = devm_cxl_enable_hdm(&port->dev, cxlhdm);
+ if (rc)
+ return rc;
+
+ return devm_cxl_enable_mem(&port->dev, cxlds);
+ }
+
+ /*
+ * Per CXL 2.0 Sections 8.1.3.8.3 and 8.1.3.8.4 (DVSEC CXL Range 1 Base
+ * [High,Low]), the device ignores the range register values once HDM
+ * operation is enabled, but the spec also recommends matching the
+ * DVSEC Range 1,2 to HDM Decoder Range 0,1. So, non-zero info->ranges
+ * are expected even though Linux does not require or maintain that
+ * match. Check if at least one DVSEC range is enabled and allowed by
+ * the platform. That is, the DVSEC range must be covered by a locked
+ * platform window (CFMWS). Fail otherwise as the endpoint's decoders
+ * cannot be used.
+ */
+
+ root = to_cxl_port(port->dev.parent);
+ while (!is_cxl_root(root) && is_cxl_port(root->dev.parent))
+ root = to_cxl_port(root->dev.parent);
+ if (!is_cxl_root(root)) {
+ dev_err(dev, "Failed to acquire root port for HDM enable\n");
+ return -ENODEV;
+ }
+
+ for (i = 0, allowed = 0; i < info->ranges; i++) {
+ struct device *cxld_dev;
+
+ cxld_dev = device_find_child(&root->dev, &info->dvsec_range[i],
+ dvsec_range_allowed);
+ if (!cxld_dev) {
+ dev_dbg(dev, "DVSEC Range%d denied by platform\n", i);
+ continue;
+ }
+ dev_dbg(dev, "DVSEC Range%d allowed by platform\n", i);
+ put_device(cxld_dev);
+ allowed++;
+ }
+
+ if (!allowed) {
+ dev_err(dev, "Range register decodes outside platform defined CXL ranges.\n");
+ return -ENXIO;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_hdm_decode_init, "CXL");
+
+#define CXL_DOE_TABLE_ACCESS_REQ_CODE 0x000000ff
+#define CXL_DOE_TABLE_ACCESS_REQ_CODE_READ 0
+#define CXL_DOE_TABLE_ACCESS_TABLE_TYPE 0x0000ff00
+#define CXL_DOE_TABLE_ACCESS_TABLE_TYPE_CDATA 0
+#define CXL_DOE_TABLE_ACCESS_ENTRY_HANDLE 0xffff0000
+#define CXL_DOE_TABLE_ACCESS_LAST_ENTRY 0xffff
+#define CXL_DOE_PROTOCOL_TABLE_ACCESS 2
+
+#define CDAT_DOE_REQ(entry_handle) cpu_to_le32 \
+ (FIELD_PREP(CXL_DOE_TABLE_ACCESS_REQ_CODE, \
+ CXL_DOE_TABLE_ACCESS_REQ_CODE_READ) | \
+ FIELD_PREP(CXL_DOE_TABLE_ACCESS_TABLE_TYPE, \
+ CXL_DOE_TABLE_ACCESS_TABLE_TYPE_CDATA) | \
+ FIELD_PREP(CXL_DOE_TABLE_ACCESS_ENTRY_HANDLE, (entry_handle)))
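
For clarity, here is roughly what CDAT_DOE_REQ() evaluates to for a given handle, reusing the masks defined above (the helper name is illustrative):

#include <linux/bitfield.h>
#include <linux/types.h>

static __le32 example_cdat_req(u16 entry_handle)
{
	u32 req;

	/*
	 * Request code 0 (read) in bits 7:0, table type 0 (CDAT) in
	 * bits 15:8, and the entry handle in bits 31:16.
	 */
	req = FIELD_PREP(CXL_DOE_TABLE_ACCESS_REQ_CODE,
			 CXL_DOE_TABLE_ACCESS_REQ_CODE_READ) |
	      FIELD_PREP(CXL_DOE_TABLE_ACCESS_TABLE_TYPE,
			 CXL_DOE_TABLE_ACCESS_TABLE_TYPE_CDATA) |
	      FIELD_PREP(CXL_DOE_TABLE_ACCESS_ENTRY_HANDLE, entry_handle);

	/* e.g. entry_handle == 3 yields 0x00030000 */
	return cpu_to_le32(req);
}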
+
+static int cxl_cdat_get_length(struct device *dev,
+ struct pci_doe_mb *doe_mb,
+ size_t *length)
+{
+ __le32 request = CDAT_DOE_REQ(0);
+ __le32 response[2];
+ int rc;
+
+ rc = pci_doe(doe_mb, PCI_VENDOR_ID_CXL,
+ CXL_DOE_PROTOCOL_TABLE_ACCESS,
+ &request, sizeof(request),
+ &response, sizeof(response));
+ if (rc < 0) {
+ dev_err(dev, "DOE failed: %d", rc);
+ return rc;
+ }
+ if (rc < sizeof(response))
+ return -EIO;
+
+ *length = le32_to_cpu(response[1]);
+ dev_dbg(dev, "CDAT length %zu\n", *length);
+
+ return 0;
+}
+
+static int cxl_cdat_read_table(struct device *dev,
+ struct pci_doe_mb *doe_mb,
+ struct cdat_doe_rsp *rsp, size_t *length)
+{
+ size_t received, remaining = *length;
+ unsigned int entry_handle = 0;
+ union cdat_data *data;
+ __le32 saved_dw = 0;
+
+ do {
+ __le32 request = CDAT_DOE_REQ(entry_handle);
+ int rc;
+
+ rc = pci_doe(doe_mb, PCI_VENDOR_ID_CXL,
+ CXL_DOE_PROTOCOL_TABLE_ACCESS,
+ &request, sizeof(request),
+ rsp, sizeof(*rsp) + remaining);
+ if (rc < 0) {
+ dev_err(dev, "DOE failed: %d", rc);
+ return rc;
+ }
+
+ if (rc < sizeof(*rsp))
+ return -EIO;
+
+ data = (union cdat_data *)rsp->data;
+ received = rc - sizeof(*rsp);
+
+ if (entry_handle == 0) {
+ if (received != sizeof(data->header))
+ return -EIO;
+ } else {
+ if (received < sizeof(data->entry) ||
+ received != le16_to_cpu(data->entry.length))
+ return -EIO;
+ }
+
+ /* Get the CXL table access header entry handle */
+ entry_handle = FIELD_GET(CXL_DOE_TABLE_ACCESS_ENTRY_HANDLE,
+ le32_to_cpu(rsp->doe_header));
+
+ /*
+ * Table Access Response Header overwrote the last DW of
+ * previous entry, so restore that DW
+ */
+ rsp->doe_header = saved_dw;
+ remaining -= received;
+ rsp = (void *)rsp + received;
+ saved_dw = rsp->doe_header;
+ } while (entry_handle != CXL_DOE_TABLE_ACCESS_LAST_ENTRY);
+
+ /* Length in CDAT header may exceed concatenation of CDAT entries */
+ *length -= remaining;
+
+ return 0;
+}
+
+static unsigned char cdat_checksum(void *buf, size_t size)
+{
+ unsigned char sum, *data = buf;
+ size_t i;
+
+ for (sum = 0, i = 0; i < size; i++)
+ sum += data[i];
+ return sum;
+}
+
+/**
+ * read_cdat_data - Read the CDAT data on this port
+ * @port: Port to read data from
+ *
+ * This call will sleep waiting for responses from the DOE mailbox.
+ */
+void read_cdat_data(struct cxl_port *port)
+{
+ struct device *uport = port->uport_dev;
+ struct device *dev = &port->dev;
+ struct pci_doe_mb *doe_mb;
+ struct pci_dev *pdev = NULL;
+ struct cxl_memdev *cxlmd;
+ struct cdat_doe_rsp *buf;
+ size_t table_length, length;
+ int rc;
+
+ if (is_cxl_memdev(uport)) {
+ struct device *host;
+
+ cxlmd = to_cxl_memdev(uport);
+ host = cxlmd->dev.parent;
+ if (dev_is_pci(host))
+ pdev = to_pci_dev(host);
+ } else if (dev_is_pci(uport)) {
+ pdev = to_pci_dev(uport);
+ }
+
+ if (!pdev)
+ return;
+
+ doe_mb = pci_find_doe_mailbox(pdev, PCI_VENDOR_ID_CXL,
+ CXL_DOE_PROTOCOL_TABLE_ACCESS);
+ if (!doe_mb) {
+ dev_dbg(dev, "No CDAT mailbox\n");
+ return;
+ }
+
+ port->cdat_available = true;
+
+ if (cxl_cdat_get_length(dev, doe_mb, &length)) {
+ dev_dbg(dev, "No CDAT length\n");
+ return;
+ }
+
+ /*
+ * The beginning of the CDAT buffer needs space for an additional 4
+ * bytes for the DOE header. Table data starts afterwards.
+ */
+ buf = devm_kzalloc(dev, sizeof(*buf) + length, GFP_KERNEL);
+ if (!buf)
+ goto err;
+
+ table_length = length;
+
+ rc = cxl_cdat_read_table(dev, doe_mb, buf, &length);
+ if (rc)
+ goto err;
+
+ if (table_length != length)
+ dev_warn(dev, "Malformed CDAT table length (%zu:%zu), discarding trailing data\n",
+ table_length, length);
+
+ if (cdat_checksum(buf->data, length))
+ goto err;
+
+ port->cdat.table = buf->data;
+ port->cdat.length = length;
+
+ return;
+err:
+ /* Don't leave table data allocated on error */
+ devm_kfree(dev, buf);
+ dev_err(dev, "Failed to read/validate CDAT.\n");
+}
+EXPORT_SYMBOL_NS_GPL(read_cdat_data, "CXL");
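
Editor's note: the same byte-sum rule can be checked from user space against the CDAT blob that the port exposes in sysfs; a valid table sums to zero, which is what cdat_checksum() relies on. The path below is an example and depends on the port/endpoint name.

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/bus/cxl/devices/endpoint2/CDAT", "rb");
	unsigned char sum = 0;
	int c;

	if (!f)
		return 1;
	while ((c = fgetc(f)) != EOF)
		sum += (unsigned char)c;	/* same 8-bit wraparound sum */
	fclose(f);

	printf("CDAT checksum %s\n", sum == 0 ? "ok" : "bad");
	return sum != 0;
}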
+
+static void __cxl_handle_cor_ras(struct cxl_dev_state *cxlds,
+ void __iomem *ras_base)
+{
+ void __iomem *addr;
+ u32 status;
+
+ if (!ras_base)
+ return;
+
+ addr = ras_base + CXL_RAS_CORRECTABLE_STATUS_OFFSET;
+ status = readl(addr);
+ if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) {
+ writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr);
+ trace_cxl_aer_correctable_error(cxlds->cxlmd, status);
+ }
+}
+
+static void cxl_handle_endpoint_cor_ras(struct cxl_dev_state *cxlds)
+{
+ return __cxl_handle_cor_ras(cxlds, cxlds->regs.ras);
+}
+
+/* CXL spec rev3.0 8.2.4.16.1 */
+static void header_log_copy(void __iomem *ras_base, u32 *log)
+{
+ void __iomem *addr;
+ u32 *log_addr;
+ int i, log_u32_size = CXL_HEADERLOG_SIZE / sizeof(u32);
+
+ addr = ras_base + CXL_RAS_HEADER_LOG_OFFSET;
+ log_addr = log;
+
+ for (i = 0; i < log_u32_size; i++) {
+ *log_addr = readl(addr);
+ log_addr++;
+ addr += sizeof(u32);
+ }
+}
+
+/*
+ * Log the state of the RAS status registers and prepare them to log the
+ * next error status. Return true if a reset is needed.
+ */
+static bool __cxl_handle_ras(struct cxl_dev_state *cxlds,
+ void __iomem *ras_base)
+{
+ u32 hl[CXL_HEADERLOG_SIZE_U32];
+ void __iomem *addr;
+ u32 status;
+ u32 fe;
+
+ if (!ras_base)
+ return false;
+
+ addr = ras_base + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET;
+ status = readl(addr);
+ if (!(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK))
+ return false;
+
+ /* If multiple errors, log header points to first error from ctrl reg */
+ if (hweight32(status) > 1) {
+ void __iomem *rcc_addr =
+ ras_base + CXL_RAS_CAP_CONTROL_OFFSET;
+
+ fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
+ readl(rcc_addr)));
+ } else {
+ fe = status;
+ }
+
+ header_log_copy(ras_base, hl);
+ trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe, hl);
+ writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr);
+
+ return true;
+}
+
+static bool cxl_handle_endpoint_ras(struct cxl_dev_state *cxlds)
+{
+ return __cxl_handle_ras(cxlds, cxlds->regs.ras);
+}
+
+#ifdef CONFIG_PCIEAER_CXL
+
+static void cxl_dport_map_rch_aer(struct cxl_dport *dport)
+{
+ resource_size_t aer_phys;
+ struct device *host;
+ u16 aer_cap;
+
+ aer_cap = cxl_rcrb_to_aer(dport->dport_dev, dport->rcrb.base);
+ if (aer_cap) {
+ host = dport->reg_map.host;
+ aer_phys = aer_cap + dport->rcrb.base;
+ dport->regs.dport_aer = devm_cxl_iomap_block(host, aer_phys,
+ sizeof(struct aer_capability_regs));
+ }
+}
+
+static void cxl_dport_map_ras(struct cxl_dport *dport)
+{
+ struct cxl_register_map *map = &dport->reg_map;
+ struct device *dev = dport->dport_dev;
+
+ if (!map->component_map.ras.valid)
+ dev_dbg(dev, "RAS registers not found\n");
+ else if (cxl_map_component_regs(map, &dport->regs.component,
+ BIT(CXL_CM_CAP_CAP_ID_RAS)))
+ dev_dbg(dev, "Failed to map RAS capability.\n");
+}
+
+static void cxl_disable_rch_root_ints(struct cxl_dport *dport)
+{
+ void __iomem *aer_base = dport->regs.dport_aer;
+ u32 aer_cmd_mask, aer_cmd;
+
+ if (!aer_base)
+ return;
+
+ /*
+ * Disable RCH root port command interrupts.
+ * CXL 3.0 12.2.1.1 - RCH Downstream Port-detected Errors
+ *
+ * This sequence may not be necessary. The CXL spec states that
+ * disabling the root cmd register's interrupts is required, but the
+ * PCI spec shows these are disabled by default on reset.
+ */
+ aer_cmd_mask = (PCI_ERR_ROOT_CMD_COR_EN |
+ PCI_ERR_ROOT_CMD_NONFATAL_EN |
+ PCI_ERR_ROOT_CMD_FATAL_EN);
+ aer_cmd = readl(aer_base + PCI_ERR_ROOT_COMMAND);
+ aer_cmd &= ~aer_cmd_mask;
+ writel(aer_cmd, aer_base + PCI_ERR_ROOT_COMMAND);
+}
+
+/**
+ * cxl_dport_init_ras_reporting - Setup CXL RAS report on this dport
+ * @dport: the cxl_dport that needs to be initialized
+ * @host: host device for devm operations
+ */
+void cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host)
+{
+ dport->reg_map.host = host;
+ cxl_dport_map_ras(dport);
+
+ if (dport->rch) {
+ struct pci_host_bridge *host_bridge = to_pci_host_bridge(dport->dport_dev);
+
+ if (!host_bridge->native_aer)
+ return;
+
+ cxl_dport_map_rch_aer(dport);
+ cxl_disable_rch_root_ints(dport);
+ }
+}
+EXPORT_SYMBOL_NS_GPL(cxl_dport_init_ras_reporting, "CXL");
+
+static void cxl_handle_rdport_cor_ras(struct cxl_dev_state *cxlds,
+ struct cxl_dport *dport)
+{
+ return __cxl_handle_cor_ras(cxlds, dport->regs.ras);
+}
+
+static bool cxl_handle_rdport_ras(struct cxl_dev_state *cxlds,
+ struct cxl_dport *dport)
+{
+ return __cxl_handle_ras(cxlds, dport->regs.ras);
+}
+
+/*
+ * Copy the AER capability registers using 32 bit read accesses.
+ * This is necessary because RCRB AER capability is MMIO mapped. Clear the
+ * status after copying.
+ *
+ * @aer_base: base address of AER capability block in RCRB
+ * @aer_regs: destination for copying AER capability
+ */
+static bool cxl_rch_get_aer_info(void __iomem *aer_base,
+ struct aer_capability_regs *aer_regs)
+{
+ int read_cnt = sizeof(struct aer_capability_regs) / sizeof(u32);
+ u32 *aer_regs_buf = (u32 *)aer_regs;
+ int n;
+
+ if (!aer_base)
+ return false;
+
+ /* Use readl() to guarantee 32-bit accesses */
+ for (n = 0; n < read_cnt; n++)
+ aer_regs_buf[n] = readl(aer_base + n * sizeof(u32));
+
+ writel(aer_regs->uncor_status, aer_base + PCI_ERR_UNCOR_STATUS);
+ writel(aer_regs->cor_status, aer_base + PCI_ERR_COR_STATUS);
+
+ return true;
+}
+
+/* Get AER severity. Return false if there is no error. */
+static bool cxl_rch_get_aer_severity(struct aer_capability_regs *aer_regs,
+ int *severity)
+{
+ if (aer_regs->uncor_status & ~aer_regs->uncor_mask) {
+ if (aer_regs->uncor_status & PCI_ERR_ROOT_FATAL_RCV)
+ *severity = AER_FATAL;
+ else
+ *severity = AER_NONFATAL;
+ return true;
+ }
+
+ if (aer_regs->cor_status & ~aer_regs->cor_mask) {
+ *severity = AER_CORRECTABLE;
+ return true;
+ }
+
+ return false;
+}
+
+static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds)
+{
+ struct pci_dev *pdev = to_pci_dev(cxlds->dev);
+ struct aer_capability_regs aer_regs;
+ struct cxl_dport *dport;
+ int severity;
+
+ struct cxl_port *port __free(put_cxl_port) =
+ cxl_pci_find_port(pdev, &dport);
+ if (!port)
+ return;
+
+ if (!cxl_rch_get_aer_info(dport->regs.dport_aer, &aer_regs))
+ return;
+
+ if (!cxl_rch_get_aer_severity(&aer_regs, &severity))
+ return;
+
+ pci_print_aer(pdev, severity, &aer_regs);
+
+ if (severity == AER_CORRECTABLE)
+ cxl_handle_rdport_cor_ras(cxlds, dport);
+ else
+ cxl_handle_rdport_ras(cxlds, dport);
+}
+
+#else
+static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) { }
+#endif
+
+void cxl_cor_error_detected(struct pci_dev *pdev)
+{
+ struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
+ struct device *dev = &cxlds->cxlmd->dev;
+
+ scoped_guard(device, dev) {
+ if (!dev->driver) {
+ dev_warn(&pdev->dev,
+ "%s: memdev disabled, abort error handling\n",
+ dev_name(dev));
+ return;
+ }
+
+ if (cxlds->rcd)
+ cxl_handle_rdport_errors(cxlds);
+
+ cxl_handle_endpoint_cor_ras(cxlds);
+ }
+}
+EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, "CXL");
+
+pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
+ pci_channel_state_t state)
+{
+ struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
+ struct cxl_memdev *cxlmd = cxlds->cxlmd;
+ struct device *dev = &cxlmd->dev;
+ bool ue;
+
+ scoped_guard(device, dev) {
+ if (!dev->driver) {
+ dev_warn(&pdev->dev,
+ "%s: memdev disabled, abort error handling\n",
+ dev_name(dev));
+ return PCI_ERS_RESULT_DISCONNECT;
+ }
+
+ if (cxlds->rcd)
+ cxl_handle_rdport_errors(cxlds);
+ /*
+ * A frozen channel indicates an impending reset which is fatal to
+ * CXL.mem operation, and will likely crash the system. On the off
+ * chance the situation is recoverable dump the status of the RAS
+ * capability registers and bounce the active state of the memdev.
+ */
+ ue = cxl_handle_endpoint_ras(cxlds);
+ }
+
+
+ switch (state) {
+ case pci_channel_io_normal:
+ if (ue) {
+ device_release_driver(dev);
+ return PCI_ERS_RESULT_NEED_RESET;
+ }
+ return PCI_ERS_RESULT_CAN_RECOVER;
+ case pci_channel_io_frozen:
+ dev_warn(&pdev->dev,
+ "%s: frozen state error detected, disable CXL.mem\n",
+ dev_name(dev));
+ device_release_driver(dev);
+ return PCI_ERS_RESULT_NEED_RESET;
+ case pci_channel_io_perm_failure:
+ dev_warn(&pdev->dev,
+ "failure state error detected, request disconnect\n");
+ return PCI_ERS_RESULT_DISCONNECT;
+ }
+ return PCI_ERS_RESULT_NEED_RESET;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_error_detected, "CXL");
+
+static int cxl_flit_size(struct pci_dev *pdev)
+{
+ if (cxl_pci_flit_256(pdev))
+ return 256;
+
+ return 68;
+}
+
+/**
+ * cxl_pci_get_latency - calculate the link latency for the PCIe link
+ * @pdev: PCI device
+ *
+ * Return: calculated latency in picoseconds, or 0 for no latency
+ *
+ * CXL Memory Device SW Guide v1.0 2.11.4 Link latency calculation
+ * Link latency = LinkPropagationLatency + FlitLatency + RetimerLatency
+ * LinkPropagationLatency is negligible, so 0 will be used
+ * RetimerLatency is assumed to be negligible and 0 will be used
+ * FlitLatency = FlitSize / LinkBandwidth
+ * FlitSize is defined by spec. CXL rev3.0 4.2.1.
+ * A 68B flit is used up to 32 GT/s; above 32 GT/s, a 256B flit is used.
+ * The FlitLatency is converted to picoseconds.
+ */
+long cxl_pci_get_latency(struct pci_dev *pdev)
+{
+ long bw;
+
+ bw = pcie_link_speed_mbps(pdev);
+ if (bw < 0)
+ return 0;
+ bw /= BITS_PER_BYTE;
+
+ return cxl_flit_size(pdev) * MEGA / bw;
+}
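
A worked instance of the formula above, using an example link rate of 16000 Mb/s rather than any particular PCIe generation (the helper name is illustrative and assumes the same units.h/bits.h headers as this file):

static long example_flit_latency_ps(void)
{
	long bw = 16000 / BITS_PER_BYTE;	/* 2000 MB/s */
	int flit = 68;				/* bytes, sub-32GT/s flit */

	/* 68 * 1000000 / 2000 = 34000 ps, i.e. 34 ns per flit */
	return flit * MEGA / bw;
}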
+
+static int __cxl_endpoint_decoder_reset_detected(struct device *dev, void *data)
+{
+ struct cxl_port *port = data;
+ struct cxl_decoder *cxld;
+ struct cxl_hdm *cxlhdm;
+ void __iomem *hdm;
+ u32 ctrl;
+
+ if (!is_endpoint_decoder(dev))
+ return 0;
+
+ cxld = to_cxl_decoder(dev);
+ if ((cxld->flags & CXL_DECODER_F_ENABLE) == 0)
+ return 0;
+
+ cxlhdm = dev_get_drvdata(&port->dev);
+ hdm = cxlhdm->regs.hdm_decoder;
+ ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(cxld->id));
+
+ return !FIELD_GET(CXL_HDM_DECODER0_CTRL_COMMITTED, ctrl);
+}
+
+bool cxl_endpoint_decoder_reset_detected(struct cxl_port *port)
+{
+ return device_for_each_child(&port->dev, port,
+ __cxl_endpoint_decoder_reset_detected);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_endpoint_decoder_reset_detected, "CXL");
+
+int cxl_pci_get_bandwidth(struct pci_dev *pdev, struct access_coordinate *c)
+{
+ int speed, bw;
+ u16 lnksta;
+ u32 width;
+
+ speed = pcie_link_speed_mbps(pdev);
+ if (speed < 0)
+ return speed;
+ speed /= BITS_PER_BYTE;
+
+ pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &lnksta);
+ width = FIELD_GET(PCI_EXP_LNKSTA_NLW, lnksta);
+ bw = speed * width;
+
+ for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) {
+ c[i].read_bandwidth = bw;
+ c[i].write_bandwidth = bw;
+ }
+
+ return 0;
+}
+
+/*
+ * Set max timeout such that platforms will optimize GPF flow to avoid
+ * the implied worst-case scenario delays. On a sane platform, all
+ * devices should always complete GPF within the energy budget of
+ * the GPF flow. The kernel does not have enough information to pick
+ * anything better than "maximize timeouts and hope it works".
+ *
+ * A misbehaving device could block forward progress of GPF for all
+ * the other devices, exhausting the energy budget of the platform.
+ * However, the spec seems to assume that moving on from slow to respond
+ * devices is a virtue. It is not possible to know that, in actuality,
+ * the slow to respond device is *the* most critical device in the
+ * system to wait for.
+ */
+#define GPF_TIMEOUT_BASE_MAX 2
+#define GPF_TIMEOUT_SCALE_MAX 7 /* 10 seconds */
+
+u16 cxl_gpf_get_dvsec(struct device *dev)
+{
+ struct pci_dev *pdev;
+ bool is_port = true;
+ u16 dvsec;
+
+ if (!dev_is_pci(dev))
+ return 0;
+
+ pdev = to_pci_dev(dev);
+ if (pci_pcie_type(pdev) == PCI_EXP_TYPE_ENDPOINT)
+ is_port = false;
+
+ dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL,
+ is_port ? CXL_DVSEC_PORT_GPF : CXL_DVSEC_DEVICE_GPF);
+ if (!dvsec)
+ dev_warn(dev, "%s GPF DVSEC not present\n",
+ is_port ? "Port" : "Device");
+ return dvsec;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_gpf_get_dvsec, "CXL");
+
+static int update_gpf_port_dvsec(struct pci_dev *pdev, int dvsec, int phase)
+{
+ u64 base, scale;
+ int rc, offset;
+ u16 ctrl;
+
+ switch (phase) {
+ case 1:
+ offset = CXL_DVSEC_PORT_GPF_PHASE_1_CONTROL_OFFSET;
+ base = CXL_DVSEC_PORT_GPF_PHASE_1_TMO_BASE_MASK;
+ scale = CXL_DVSEC_PORT_GPF_PHASE_1_TMO_SCALE_MASK;
+ break;
+ case 2:
+ offset = CXL_DVSEC_PORT_GPF_PHASE_2_CONTROL_OFFSET;
+ base = CXL_DVSEC_PORT_GPF_PHASE_2_TMO_BASE_MASK;
+ scale = CXL_DVSEC_PORT_GPF_PHASE_2_TMO_SCALE_MASK;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ rc = pci_read_config_word(pdev, dvsec + offset, &ctrl);
+ if (rc)
+ return rc;
+
+ if (FIELD_GET(base, ctrl) == GPF_TIMEOUT_BASE_MAX &&
+ FIELD_GET(scale, ctrl) == GPF_TIMEOUT_SCALE_MAX)
+ return 0;
+
+ ctrl = FIELD_PREP(base, GPF_TIMEOUT_BASE_MAX);
+ ctrl |= FIELD_PREP(scale, GPF_TIMEOUT_SCALE_MAX);
+
+ rc = pci_write_config_word(pdev, dvsec + offset, ctrl);
+ if (!rc)
+ pci_dbg(pdev, "Port GPF phase %d timeout: %d0 secs\n",
+ phase, GPF_TIMEOUT_BASE_MAX);
+
+ return rc;
+}
+
+int cxl_gpf_port_setup(struct cxl_dport *dport)
+{
+ if (!dport)
+ return -EINVAL;
+
+ if (!dport->gpf_dvsec) {
+ struct pci_dev *pdev;
+ int dvsec;
+
+ dvsec = cxl_gpf_get_dvsec(dport->dport_dev);
+ if (!dvsec)
+ return -EINVAL;
+
+ dport->gpf_dvsec = dvsec;
+ pdev = to_pci_dev(dport->dport_dev);
+ update_gpf_port_dvsec(pdev, dport->gpf_dvsec, 1);
+ update_gpf_port_dvsec(pdev, dport->gpf_dvsec, 2);
+ }
+
+ return 0;
+}
+
+struct cxl_walk_context {
+ struct pci_bus *bus;
+ struct cxl_port *port;
+ int type;
+ int error;
+ int count;
+};
+
+static int count_dports(struct pci_dev *pdev, void *data)
+{
+ struct cxl_walk_context *ctx = data;
+ int type = pci_pcie_type(pdev);
+
+ if (pdev->bus != ctx->bus)
+ return 0;
+ if (!pci_is_pcie(pdev))
+ return 0;
+ if (type != ctx->type)
+ return 0;
+
+ ctx->count++;
+ return 0;
+}
+
+int cxl_port_get_possible_dports(struct cxl_port *port)
+{
+ struct pci_bus *bus = cxl_port_to_pci_bus(port);
+ struct cxl_walk_context ctx;
+ int type;
+
+ if (!bus) {
+ dev_err(&port->dev, "No PCI bus found for port %s\n",
+ dev_name(&port->dev));
+ return -ENXIO;
+ }
+
+ if (pci_is_root_bus(bus))
+ type = PCI_EXP_TYPE_ROOT_PORT;
+ else
+ type = PCI_EXP_TYPE_DOWNSTREAM;
+
+ ctx = (struct cxl_walk_context) {
+ .bus = bus,
+ .type = type,
+ };
+ pci_walk_bus(bus, count_dports, &ctx);
+
+ return ctx.count;
+}
diff --git a/drivers/cxl/core/pmem.c b/drivers/cxl/core/pmem.c
new file mode 100644
index 000000000000..8853415c106a
--- /dev/null
+++ b/drivers/cxl/core/pmem.c
@@ -0,0 +1,287 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2020 Intel Corporation. */
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/idr.h>
+#include <cxlmem.h>
+#include <cxl.h>
+#include "core.h"
+
+/**
+ * DOC: cxl pmem
+ *
+ * The core CXL PMEM infrastructure supports persistent memory
+ * provisioning and serves as a bridge to the LIBNVDIMM subsystem. A CXL
+ * 'bridge' device is added at the root of a CXL device topology if
+ * platform firmware advertises at least one persistent memory capable
+ * CXL window. That root-level bridge corresponds to a LIBNVDIMM 'bus'
+ * device. Then for each cxl_memdev in the CXL device topology a bridge
+ * device is added to host a LIBNVDIMM dimm object. When these bridges
+ * are registered native LIBNVDIMM uapis are translated to CXL
+ * operations, for example, namespace label access commands.
+ */
+
+static DEFINE_IDA(cxl_nvdimm_bridge_ida);
+
+static void cxl_nvdimm_bridge_release(struct device *dev)
+{
+ struct cxl_nvdimm_bridge *cxl_nvb = to_cxl_nvdimm_bridge(dev);
+
+ ida_free(&cxl_nvdimm_bridge_ida, cxl_nvb->id);
+ kfree(cxl_nvb);
+}
+
+static const struct attribute_group *cxl_nvdimm_bridge_attribute_groups[] = {
+ &cxl_base_attribute_group,
+ NULL,
+};
+
+const struct device_type cxl_nvdimm_bridge_type = {
+ .name = "cxl_nvdimm_bridge",
+ .release = cxl_nvdimm_bridge_release,
+ .groups = cxl_nvdimm_bridge_attribute_groups,
+};
+
+struct cxl_nvdimm_bridge *to_cxl_nvdimm_bridge(struct device *dev)
+{
+ if (dev_WARN_ONCE(dev, dev->type != &cxl_nvdimm_bridge_type,
+ "not a cxl_nvdimm_bridge device\n"))
+ return NULL;
+ return container_of(dev, struct cxl_nvdimm_bridge, dev);
+}
+EXPORT_SYMBOL_NS_GPL(to_cxl_nvdimm_bridge, "CXL");
+
+/**
+ * cxl_find_nvdimm_bridge() - find a bridge device relative to a port
+ * @port: any descendant port of an nvdimm-bridge associated
+ * root-cxl-port
+ */
+struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct cxl_port *port)
+{
+ struct cxl_root *cxl_root __free(put_cxl_root) = find_cxl_root(port);
+ struct device *dev;
+
+ if (!cxl_root)
+ return NULL;
+
+ dev = device_find_child(&cxl_root->port.dev,
+ &cxl_nvdimm_bridge_type,
+ device_match_type);
+
+ if (!dev)
+ return NULL;
+
+ return to_cxl_nvdimm_bridge(dev);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_find_nvdimm_bridge, "CXL");
+
+static struct lock_class_key cxl_nvdimm_bridge_key;
+
+static struct cxl_nvdimm_bridge *cxl_nvdimm_bridge_alloc(struct cxl_port *port)
+{
+ struct cxl_nvdimm_bridge *cxl_nvb;
+ struct device *dev;
+ int rc;
+
+ cxl_nvb = kzalloc(sizeof(*cxl_nvb), GFP_KERNEL);
+ if (!cxl_nvb)
+ return ERR_PTR(-ENOMEM);
+
+ rc = ida_alloc(&cxl_nvdimm_bridge_ida, GFP_KERNEL);
+ if (rc < 0)
+ goto err;
+ cxl_nvb->id = rc;
+
+ dev = &cxl_nvb->dev;
+ cxl_nvb->port = port;
+ device_initialize(dev);
+ lockdep_set_class(&dev->mutex, &cxl_nvdimm_bridge_key);
+ device_set_pm_not_required(dev);
+ dev->parent = &port->dev;
+ dev->bus = &cxl_bus_type;
+ dev->type = &cxl_nvdimm_bridge_type;
+
+ return cxl_nvb;
+
+err:
+ kfree(cxl_nvb);
+ return ERR_PTR(rc);
+}
+
+static void unregister_nvb(void *_cxl_nvb)
+{
+ struct cxl_nvdimm_bridge *cxl_nvb = _cxl_nvb;
+
+ device_unregister(&cxl_nvb->dev);
+}
+
+/**
+ * devm_cxl_add_nvdimm_bridge() - add the root of a LIBNVDIMM topology
+ * @host: platform firmware root device
+ * @port: CXL port at the root of a CXL topology
+ *
+ * Return: bridge device that can host cxl_nvdimm objects
+ */
+struct cxl_nvdimm_bridge *devm_cxl_add_nvdimm_bridge(struct device *host,
+ struct cxl_port *port)
+{
+ struct cxl_nvdimm_bridge *cxl_nvb;
+ struct device *dev;
+ int rc;
+
+ if (!IS_ENABLED(CONFIG_CXL_PMEM))
+ return ERR_PTR(-ENXIO);
+
+ cxl_nvb = cxl_nvdimm_bridge_alloc(port);
+ if (IS_ERR(cxl_nvb))
+ return cxl_nvb;
+
+ dev = &cxl_nvb->dev;
+ rc = dev_set_name(dev, "nvdimm-bridge%d", cxl_nvb->id);
+ if (rc)
+ goto err;
+
+ rc = device_add(dev);
+ if (rc)
+ goto err;
+
+ rc = devm_add_action_or_reset(host, unregister_nvb, cxl_nvb);
+ if (rc)
+ return ERR_PTR(rc);
+
+ return cxl_nvb;
+
+err:
+ put_device(dev);
+ return ERR_PTR(rc);
+}
+EXPORT_SYMBOL_NS_GPL(devm_cxl_add_nvdimm_bridge, "CXL");
+
+static void cxl_nvdimm_release(struct device *dev)
+{
+ struct cxl_nvdimm *cxl_nvd = to_cxl_nvdimm(dev);
+
+ kfree(cxl_nvd);
+}
+
+static const struct attribute_group *cxl_nvdimm_attribute_groups[] = {
+ &cxl_base_attribute_group,
+ NULL,
+};
+
+const struct device_type cxl_nvdimm_type = {
+ .name = "cxl_nvdimm",
+ .release = cxl_nvdimm_release,
+ .groups = cxl_nvdimm_attribute_groups,
+};
+
+bool is_cxl_nvdimm(struct device *dev)
+{
+ return dev->type == &cxl_nvdimm_type;
+}
+EXPORT_SYMBOL_NS_GPL(is_cxl_nvdimm, "CXL");
+
+struct cxl_nvdimm *to_cxl_nvdimm(struct device *dev)
+{
+ if (dev_WARN_ONCE(dev, !is_cxl_nvdimm(dev),
+ "not a cxl_nvdimm device\n"))
+ return NULL;
+ return container_of(dev, struct cxl_nvdimm, dev);
+}
+EXPORT_SYMBOL_NS_GPL(to_cxl_nvdimm, "CXL");
+
+static struct lock_class_key cxl_nvdimm_key;
+
+static struct cxl_nvdimm *cxl_nvdimm_alloc(struct cxl_nvdimm_bridge *cxl_nvb,
+ struct cxl_memdev *cxlmd)
+{
+ struct cxl_nvdimm *cxl_nvd;
+ struct device *dev;
+
+ cxl_nvd = kzalloc(sizeof(*cxl_nvd), GFP_KERNEL);
+ if (!cxl_nvd)
+ return ERR_PTR(-ENOMEM);
+
+ dev = &cxl_nvd->dev;
+ cxl_nvd->cxlmd = cxlmd;
+ cxlmd->cxl_nvd = cxl_nvd;
+ device_initialize(dev);
+ lockdep_set_class(&dev->mutex, &cxl_nvdimm_key);
+ device_set_pm_not_required(dev);
+ dev->parent = &cxlmd->dev;
+ dev->bus = &cxl_bus_type;
+ dev->type = &cxl_nvdimm_type;
+ /*
+ * A "%llx" string is 17-bytes vs dimm_id that is max
+ * NVDIMM_KEY_DESC_LEN
+ */
+ BUILD_BUG_ON(sizeof(cxl_nvd->dev_id) < 17 ||
+ sizeof(cxl_nvd->dev_id) > NVDIMM_KEY_DESC_LEN);
+ sprintf(cxl_nvd->dev_id, "%llx", cxlmd->cxlds->serial);
+
+ return cxl_nvd;
+}
+
+static void cxlmd_release_nvdimm(void *_cxlmd)
+{
+ struct cxl_memdev *cxlmd = _cxlmd;
+ struct cxl_nvdimm *cxl_nvd = cxlmd->cxl_nvd;
+ struct cxl_nvdimm_bridge *cxl_nvb = cxlmd->cxl_nvb;
+
+ cxl_nvd->cxlmd = NULL;
+ cxlmd->cxl_nvd = NULL;
+ cxlmd->cxl_nvb = NULL;
+ device_unregister(&cxl_nvd->dev);
+ put_device(&cxl_nvb->dev);
+}
+
+/**
+ * devm_cxl_add_nvdimm() - add a bridge between a cxl_memdev and an nvdimm
+ * @parent_port: parent port for the (to be added) @cxlmd endpoint port
+ * @cxlmd: cxl_memdev instance that will perform LIBNVDIMM operations
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int devm_cxl_add_nvdimm(struct cxl_port *parent_port,
+ struct cxl_memdev *cxlmd)
+{
+ struct cxl_nvdimm_bridge *cxl_nvb;
+ struct cxl_nvdimm *cxl_nvd;
+ struct device *dev;
+ int rc;
+
+ cxl_nvb = cxl_find_nvdimm_bridge(parent_port);
+ if (!cxl_nvb)
+ return -ENODEV;
+
+ cxl_nvd = cxl_nvdimm_alloc(cxl_nvb, cxlmd);
+ if (IS_ERR(cxl_nvd)) {
+ rc = PTR_ERR(cxl_nvd);
+ goto err_alloc;
+ }
+ cxlmd->cxl_nvb = cxl_nvb;
+
+ dev = &cxl_nvd->dev;
+ rc = dev_set_name(dev, "pmem%d", cxlmd->id);
+ if (rc)
+ goto err;
+
+ rc = device_add(dev);
+ if (rc)
+ goto err;
+
+ dev_dbg(&cxlmd->dev, "register %s\n", dev_name(dev));
+
+ /* @cxlmd carries a reference on @cxl_nvb until cxlmd_release_nvdimm */
+ return devm_add_action_or_reset(&cxlmd->dev, cxlmd_release_nvdimm, cxlmd);
+
+err:
+ put_device(dev);
+err_alloc:
+ cxlmd->cxl_nvb = NULL;
+ cxlmd->cxl_nvd = NULL;
+ put_device(&cxl_nvb->dev);
+
+ return rc;
+}
+EXPORT_SYMBOL_NS_GPL(devm_cxl_add_nvdimm, "CXL");
diff --git a/drivers/cxl/core/pmu.c b/drivers/cxl/core/pmu.c
new file mode 100644
index 000000000000..b3136d7664ab
--- /dev/null
+++ b/drivers/cxl/core/pmu.c
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2023 Huawei. All rights reserved. */
+
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/idr.h>
+#include <cxlmem.h>
+#include <pmu.h>
+#include <cxl.h>
+#include "core.h"
+
+static void cxl_pmu_release(struct device *dev)
+{
+ struct cxl_pmu *pmu = to_cxl_pmu(dev);
+
+ kfree(pmu);
+}
+
+const struct device_type cxl_pmu_type = {
+ .name = "cxl_pmu",
+ .release = cxl_pmu_release,
+};
+
+static void remove_dev(void *dev)
+{
+ device_unregister(dev);
+}
+
+int devm_cxl_pmu_add(struct device *parent, struct cxl_pmu_regs *regs,
+ int assoc_id, int index, enum cxl_pmu_type type)
+{
+ struct cxl_pmu *pmu;
+ struct device *dev;
+ int rc;
+
+ pmu = kzalloc(sizeof(*pmu), GFP_KERNEL);
+ if (!pmu)
+ return -ENOMEM;
+
+ pmu->assoc_id = assoc_id;
+ pmu->index = index;
+ pmu->type = type;
+ pmu->base = regs->pmu;
+ dev = &pmu->dev;
+ device_initialize(dev);
+ device_set_pm_not_required(dev);
+ dev->parent = parent;
+ dev->bus = &cxl_bus_type;
+ dev->type = &cxl_pmu_type;
+ switch (pmu->type) {
+ case CXL_PMU_MEMDEV:
+ rc = dev_set_name(dev, "pmu_mem%d.%d", assoc_id, index);
+ break;
+ }
+ if (rc)
+ goto err;
+
+ rc = device_add(dev);
+ if (rc)
+ goto err;
+
+ return devm_add_action_or_reset(parent, remove_dev, dev);
+
+err:
+ put_device(&pmu->dev);
+ return rc;
+}
+EXPORT_SYMBOL_NS_GPL(devm_cxl_pmu_add, "CXL");
diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c
new file mode 100644
index 000000000000..fef3aa0c6680
--- /dev/null
+++ b/drivers/cxl/core/port.c
@@ -0,0 +1,2532 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2020 Intel Corporation. All rights reserved. */
+#include <linux/platform_device.h>
+#include <linux/memregion.h>
+#include <linux/workqueue.h>
+#include <linux/debugfs.h>
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/idr.h>
+#include <linux/node.h>
+#include <cxl/einj.h>
+#include <cxlmem.h>
+#include <cxlpci.h>
+#include <cxl.h>
+#include "core.h"
+
+/**
+ * DOC: cxl core
+ *
+ * The CXL core provides a set of interfaces that can be consumed by CXL aware
+ * drivers. The interfaces allow for creation, modification, and destruction of
+ * regions, memory devices, ports, and decoders. CXL aware drivers must register
+ * with the CXL core via these interfaces in order to be able to participate in
+ * cross-device interleave coordination. The CXL core also establishes and
+ * maintains the bridge to the nvdimm subsystem.
+ *
+ * The CXL core also introduces a sysfs hierarchy to control the devices
+ * that are instantiated by the core.
+ */
+
+static DEFINE_IDA(cxl_port_ida);
+static DEFINE_XARRAY(cxl_root_buses);
+
+/*
+ * The terminal device is NULL for a PCI topology and @platform_bus
+ * for platform devices (i.e. cxl_test)
+ */
+static bool is_cxl_host_bridge(struct device *dev)
+{
+ return (!dev || dev == &platform_bus);
+}
+
+int cxl_num_decoders_committed(struct cxl_port *port)
+{
+ lockdep_assert_held(&cxl_rwsem.region);
+
+ return port->commit_end + 1;
+}
+
+static ssize_t devtype_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ return sysfs_emit(buf, "%s\n", dev->type->name);
+}
+static DEVICE_ATTR_RO(devtype);
+
+static int cxl_device_id(const struct device *dev)
+{
+ if (dev->type == &cxl_nvdimm_bridge_type)
+ return CXL_DEVICE_NVDIMM_BRIDGE;
+ if (dev->type == &cxl_nvdimm_type)
+ return CXL_DEVICE_NVDIMM;
+ if (dev->type == CXL_PMEM_REGION_TYPE())
+ return CXL_DEVICE_PMEM_REGION;
+ if (dev->type == CXL_DAX_REGION_TYPE())
+ return CXL_DEVICE_DAX_REGION;
+ if (is_cxl_port(dev)) {
+ if (is_cxl_root(to_cxl_port(dev)))
+ return CXL_DEVICE_ROOT;
+ return CXL_DEVICE_PORT;
+ }
+ if (is_cxl_memdev(dev))
+ return CXL_DEVICE_MEMORY_EXPANDER;
+ if (dev->type == CXL_REGION_TYPE())
+ return CXL_DEVICE_REGION;
+ if (dev->type == &cxl_pmu_type)
+ return CXL_DEVICE_PMU;
+ return 0;
+}
+
+static ssize_t modalias_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ return sysfs_emit(buf, CXL_MODALIAS_FMT "\n", cxl_device_id(dev));
+}
+static DEVICE_ATTR_RO(modalias);
+
+static struct attribute *cxl_base_attributes[] = {
+ &dev_attr_devtype.attr,
+ &dev_attr_modalias.attr,
+ NULL,
+};
+
+struct attribute_group cxl_base_attribute_group = {
+ .attrs = cxl_base_attributes,
+};
+
+static ssize_t start_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct cxl_decoder *cxld = to_cxl_decoder(dev);
+
+ return sysfs_emit(buf, "%#llx\n", cxld->hpa_range.start);
+}
+static DEVICE_ATTR_ADMIN_RO(start);
+
+static ssize_t size_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct cxl_decoder *cxld = to_cxl_decoder(dev);
+
+ return sysfs_emit(buf, "%#llx\n", range_len(&cxld->hpa_range));
+}
+static DEVICE_ATTR_RO(size);
+
+#define CXL_DECODER_FLAG_ATTR(name, flag) \
+static ssize_t name##_show(struct device *dev, \
+ struct device_attribute *attr, char *buf) \
+{ \
+ struct cxl_decoder *cxld = to_cxl_decoder(dev); \
+ \
+ return sysfs_emit(buf, "%s\n", \
+ (cxld->flags & (flag)) ? "1" : "0"); \
+} \
+static DEVICE_ATTR_RO(name)
+
+CXL_DECODER_FLAG_ATTR(cap_pmem, CXL_DECODER_F_PMEM);
+CXL_DECODER_FLAG_ATTR(cap_ram, CXL_DECODER_F_RAM);
+CXL_DECODER_FLAG_ATTR(cap_type2, CXL_DECODER_F_TYPE2);
+CXL_DECODER_FLAG_ATTR(cap_type3, CXL_DECODER_F_TYPE3);
+CXL_DECODER_FLAG_ATTR(locked, CXL_DECODER_F_LOCK);
+
+static ssize_t target_type_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct cxl_decoder *cxld = to_cxl_decoder(dev);
+
+ switch (cxld->target_type) {
+ case CXL_DECODER_DEVMEM:
+ return sysfs_emit(buf, "accelerator\n");
+ case CXL_DECODER_HOSTONLYMEM:
+ return sysfs_emit(buf, "expander\n");
+ }
+ return -ENXIO;
+}
+static DEVICE_ATTR_RO(target_type);
+
+static ssize_t emit_target_list(struct cxl_switch_decoder *cxlsd, char *buf)
+{
+ struct cxl_decoder *cxld = &cxlsd->cxld;
+ ssize_t offset = 0;
+ int i, rc = 0;
+
+ for (i = 0; i < cxld->interleave_ways; i++) {
+ struct cxl_dport *dport = cxlsd->target[i];
+ struct cxl_dport *next = NULL;
+
+ if (!dport)
+ break;
+
+ if (i + 1 < cxld->interleave_ways)
+ next = cxlsd->target[i + 1];
+ rc = sysfs_emit_at(buf, offset, "%d%s", dport->port_id,
+ next ? "," : "");
+ if (rc < 0)
+ return rc;
+ offset += rc;
+ }
+
+ return offset;
+}
+
+static ssize_t target_list_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct cxl_switch_decoder *cxlsd = to_cxl_switch_decoder(dev);
+ ssize_t offset;
+ int rc;
+
+ guard(rwsem_read)(&cxl_rwsem.region);
+ rc = emit_target_list(cxlsd, buf);
+ if (rc < 0)
+ return rc;
+ offset = rc;
+
+ rc = sysfs_emit_at(buf, offset, "\n");
+ if (rc < 0)
+ return rc;
+
+ return offset + rc;
+}
+static DEVICE_ATTR_RO(target_list);
+
+static ssize_t mode_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev);
+ struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+ /* without @cxl_rwsem.dpa, make sure @part is not reloaded */
+ int part = READ_ONCE(cxled->part);
+ const char *desc;
+
+ if (part < 0)
+ desc = "none";
+ else
+ desc = cxlds->part[part].res.name;
+
+ return sysfs_emit(buf, "%s\n", desc);
+}
+
+static ssize_t mode_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev);
+ enum cxl_partition_mode mode;
+ ssize_t rc;
+
+ if (sysfs_streq(buf, "pmem"))
+ mode = CXL_PARTMODE_PMEM;
+ else if (sysfs_streq(buf, "ram"))
+ mode = CXL_PARTMODE_RAM;
+ else
+ return -EINVAL;
+
+ rc = cxl_dpa_set_part(cxled, mode);
+ if (rc)
+ return rc;
+
+ return len;
+}
+static DEVICE_ATTR_RW(mode);
+
+static ssize_t dpa_resource_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev);
+
+ guard(rwsem_read)(&cxl_rwsem.dpa);
+ return sysfs_emit(buf, "%#llx\n", (u64)cxl_dpa_resource_start(cxled));
+}
+static DEVICE_ATTR_RO(dpa_resource);
+
+static ssize_t dpa_size_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev);
+ resource_size_t size = cxl_dpa_size(cxled);
+
+ return sysfs_emit(buf, "%pa\n", &size);
+}
+
+static ssize_t dpa_size_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev);
+ unsigned long long size;
+ ssize_t rc;
+
+ rc = kstrtoull(buf, 0, &size);
+ if (rc)
+ return rc;
+
+ if (!IS_ALIGNED(size, SZ_256M))
+ return -EINVAL;
+
+ rc = cxl_dpa_free(cxled);
+ if (rc)
+ return rc;
+
+ if (size == 0)
+ return len;
+
+ rc = cxl_dpa_alloc(cxled, size);
+ if (rc)
+ return rc;
+
+ return len;
+}
+static DEVICE_ATTR_RW(dpa_size);
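
Editor's note: a hedged user-space sketch of exercising the mode and dpa_size attributes defined above; the decoder path is an example, real names follow the decoder<port>.<id> convention, and the 0x10000000 value (256MB) respects the SZ_256M alignment enforced by dpa_size_store().

#include <stdio.h>

static int sysfs_write(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	const char *dec = "/sys/bus/cxl/devices/decoder2.0";	/* example */
	char path[128];

	/* Select the volatile partition for this endpoint decoder */
	snprintf(path, sizeof(path), "%s/mode", dec);
	if (sysfs_write(path, "ram"))
		return 1;

	/* Allocate 256MB of device physical address space */
	snprintf(path, sizeof(path), "%s/dpa_size", dec);
	return sysfs_write(path, "0x10000000") ? 1 : 0;
}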
+
+static ssize_t interleave_granularity_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct cxl_decoder *cxld = to_cxl_decoder(dev);
+
+ return sysfs_emit(buf, "%d\n", cxld->interleave_granularity);
+}
+
+static DEVICE_ATTR_RO(interleave_granularity);
+
+static ssize_t interleave_ways_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct cxl_decoder *cxld = to_cxl_decoder(dev);
+
+ return sysfs_emit(buf, "%d\n", cxld->interleave_ways);
+}
+
+static DEVICE_ATTR_RO(interleave_ways);
+
+static ssize_t qos_class_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev);
+
+ return sysfs_emit(buf, "%d\n", cxlrd->qos_class);
+}
+static DEVICE_ATTR_RO(qos_class);
+
+static struct attribute *cxl_decoder_base_attrs[] = {
+ &dev_attr_start.attr,
+ &dev_attr_size.attr,
+ &dev_attr_locked.attr,
+ &dev_attr_interleave_granularity.attr,
+ &dev_attr_interleave_ways.attr,
+ NULL,
+};
+
+static struct attribute_group cxl_decoder_base_attribute_group = {
+ .attrs = cxl_decoder_base_attrs,
+};
+
+static struct attribute *cxl_decoder_root_attrs[] = {
+ &dev_attr_cap_pmem.attr,
+ &dev_attr_cap_ram.attr,
+ &dev_attr_cap_type2.attr,
+ &dev_attr_cap_type3.attr,
+ &dev_attr_target_list.attr,
+ &dev_attr_qos_class.attr,
+ SET_CXL_REGION_ATTR(create_pmem_region)
+ SET_CXL_REGION_ATTR(create_ram_region)
+ SET_CXL_REGION_ATTR(delete_region)
+ NULL,
+};
+
+static bool can_create_pmem(struct cxl_root_decoder *cxlrd)
+{
+ unsigned long flags = CXL_DECODER_F_TYPE3 | CXL_DECODER_F_PMEM;
+
+ return (cxlrd->cxlsd.cxld.flags & flags) == flags;
+}
+
+static bool can_create_ram(struct cxl_root_decoder *cxlrd)
+{
+ unsigned long flags = CXL_DECODER_F_TYPE3 | CXL_DECODER_F_RAM;
+
+ return (cxlrd->cxlsd.cxld.flags & flags) == flags;
+}
+
+static umode_t cxl_root_decoder_visible(struct kobject *kobj, struct attribute *a, int n)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev);
+
+ if (a == CXL_REGION_ATTR(create_pmem_region) && !can_create_pmem(cxlrd))
+ return 0;
+
+ if (a == CXL_REGION_ATTR(create_ram_region) && !can_create_ram(cxlrd))
+ return 0;
+
+ if (a == CXL_REGION_ATTR(delete_region) &&
+ !(can_create_pmem(cxlrd) || can_create_ram(cxlrd)))
+ return 0;
+
+ return a->mode;
+}
+
+static struct attribute_group cxl_decoder_root_attribute_group = {
+ .attrs = cxl_decoder_root_attrs,
+ .is_visible = cxl_root_decoder_visible,
+};
+
+static const struct attribute_group *cxl_decoder_root_attribute_groups[] = {
+ &cxl_decoder_root_attribute_group,
+ &cxl_decoder_base_attribute_group,
+ &cxl_base_attribute_group,
+ NULL,
+};
+
+static struct attribute *cxl_decoder_switch_attrs[] = {
+ &dev_attr_target_type.attr,
+ &dev_attr_target_list.attr,
+ SET_CXL_REGION_ATTR(region)
+ NULL,
+};
+
+static struct attribute_group cxl_decoder_switch_attribute_group = {
+ .attrs = cxl_decoder_switch_attrs,
+};
+
+static const struct attribute_group *cxl_decoder_switch_attribute_groups[] = {
+ &cxl_decoder_switch_attribute_group,
+ &cxl_decoder_base_attribute_group,
+ &cxl_base_attribute_group,
+ NULL,
+};
+
+static struct attribute *cxl_decoder_endpoint_attrs[] = {
+ &dev_attr_target_type.attr,
+ &dev_attr_mode.attr,
+ &dev_attr_dpa_size.attr,
+ &dev_attr_dpa_resource.attr,
+ SET_CXL_REGION_ATTR(region)
+ NULL,
+};
+
+static struct attribute_group cxl_decoder_endpoint_attribute_group = {
+ .attrs = cxl_decoder_endpoint_attrs,
+};
+
+static const struct attribute_group *cxl_decoder_endpoint_attribute_groups[] = {
+ &cxl_decoder_base_attribute_group,
+ &cxl_decoder_endpoint_attribute_group,
+ &cxl_base_attribute_group,
+ NULL,
+};
+
+static void __cxl_decoder_release(struct cxl_decoder *cxld)
+{
+ struct cxl_port *port = to_cxl_port(cxld->dev.parent);
+
+ ida_free(&port->decoder_ida, cxld->id);
+ put_device(&port->dev);
+}
+
+static void cxl_endpoint_decoder_release(struct device *dev)
+{
+ struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev);
+
+ __cxl_decoder_release(&cxled->cxld);
+ kfree(cxled);
+}
+
+static void cxl_switch_decoder_release(struct device *dev)
+{
+ struct cxl_switch_decoder *cxlsd = to_cxl_switch_decoder(dev);
+
+ __cxl_decoder_release(&cxlsd->cxld);
+ kfree(cxlsd);
+}
+
+struct cxl_root_decoder *to_cxl_root_decoder(struct device *dev)
+{
+ if (dev_WARN_ONCE(dev, !is_root_decoder(dev),
+ "not a cxl_root_decoder device\n"))
+ return NULL;
+ return container_of(dev, struct cxl_root_decoder, cxlsd.cxld.dev);
+}
+EXPORT_SYMBOL_NS_GPL(to_cxl_root_decoder, "CXL");
+
+static void cxl_root_decoder_release(struct device *dev)
+{
+ struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev);
+
+ if (atomic_read(&cxlrd->region_id) >= 0)
+ memregion_free(atomic_read(&cxlrd->region_id));
+ __cxl_decoder_release(&cxlrd->cxlsd.cxld);
+ kfree(cxlrd);
+}
+
+static const struct device_type cxl_decoder_endpoint_type = {
+ .name = "cxl_decoder_endpoint",
+ .release = cxl_endpoint_decoder_release,
+ .groups = cxl_decoder_endpoint_attribute_groups,
+};
+
+static const struct device_type cxl_decoder_switch_type = {
+ .name = "cxl_decoder_switch",
+ .release = cxl_switch_decoder_release,
+ .groups = cxl_decoder_switch_attribute_groups,
+};
+
+static const struct device_type cxl_decoder_root_type = {
+ .name = "cxl_decoder_root",
+ .release = cxl_root_decoder_release,
+ .groups = cxl_decoder_root_attribute_groups,
+};
+
+bool is_endpoint_decoder(struct device *dev)
+{
+ return dev->type == &cxl_decoder_endpoint_type;
+}
+EXPORT_SYMBOL_NS_GPL(is_endpoint_decoder, "CXL");
+
+bool is_root_decoder(struct device *dev)
+{
+ return dev->type == &cxl_decoder_root_type;
+}
+EXPORT_SYMBOL_NS_GPL(is_root_decoder, "CXL");
+
+bool is_switch_decoder(struct device *dev)
+{
+ return is_root_decoder(dev) || dev->type == &cxl_decoder_switch_type;
+}
+EXPORT_SYMBOL_NS_GPL(is_switch_decoder, "CXL");
+
+struct cxl_decoder *to_cxl_decoder(struct device *dev)
+{
+ if (dev_WARN_ONCE(dev,
+ !is_switch_decoder(dev) && !is_endpoint_decoder(dev),
+ "not a cxl_decoder device\n"))
+ return NULL;
+ return container_of(dev, struct cxl_decoder, dev);
+}
+EXPORT_SYMBOL_NS_GPL(to_cxl_decoder, "CXL");
+
+struct cxl_endpoint_decoder *to_cxl_endpoint_decoder(struct device *dev)
+{
+ if (dev_WARN_ONCE(dev, !is_endpoint_decoder(dev),
+ "not a cxl_endpoint_decoder device\n"))
+ return NULL;
+ return container_of(dev, struct cxl_endpoint_decoder, cxld.dev);
+}
+EXPORT_SYMBOL_NS_GPL(to_cxl_endpoint_decoder, "CXL");
+
+struct cxl_switch_decoder *to_cxl_switch_decoder(struct device *dev)
+{
+ if (dev_WARN_ONCE(dev, !is_switch_decoder(dev),
+ "not a cxl_switch_decoder device\n"))
+ return NULL;
+ return container_of(dev, struct cxl_switch_decoder, cxld.dev);
+}
+EXPORT_SYMBOL_NS_GPL(to_cxl_switch_decoder, "CXL");
+
+static void cxl_ep_release(struct cxl_ep *ep)
+{
+ put_device(ep->ep);
+ kfree(ep);
+}
+
+static void cxl_ep_remove(struct cxl_port *port, struct cxl_ep *ep)
+{
+ if (!ep)
+ return;
+ xa_erase(&port->endpoints, (unsigned long) ep->ep);
+ cxl_ep_release(ep);
+}
+
+static void cxl_port_release(struct device *dev)
+{
+ struct cxl_port *port = to_cxl_port(dev);
+ unsigned long index;
+ struct cxl_ep *ep;
+
+ xa_for_each(&port->endpoints, index, ep)
+ cxl_ep_remove(port, ep);
+ xa_destroy(&port->endpoints);
+ xa_destroy(&port->dports);
+ xa_destroy(&port->regions);
+ ida_free(&cxl_port_ida, port->id);
+ if (is_cxl_root(port))
+ kfree(to_cxl_root(port));
+ else
+ kfree(port);
+}
+
+static ssize_t decoders_committed_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct cxl_port *port = to_cxl_port(dev);
+
+ guard(rwsem_read)(&cxl_rwsem.region);
+ return sysfs_emit(buf, "%d\n", cxl_num_decoders_committed(port));
+}
+
+static DEVICE_ATTR_RO(decoders_committed);
+
+static struct attribute *cxl_port_attrs[] = {
+ &dev_attr_decoders_committed.attr,
+ NULL,
+};
+
+static struct attribute_group cxl_port_attribute_group = {
+ .attrs = cxl_port_attrs,
+};
+
+static const struct attribute_group *cxl_port_attribute_groups[] = {
+ &cxl_base_attribute_group,
+ &cxl_port_attribute_group,
+ NULL,
+};
+
+static const struct device_type cxl_port_type = {
+ .name = "cxl_port",
+ .release = cxl_port_release,
+ .groups = cxl_port_attribute_groups,
+};
+
+bool is_cxl_port(const struct device *dev)
+{
+ return dev->type == &cxl_port_type;
+}
+EXPORT_SYMBOL_NS_GPL(is_cxl_port, "CXL");
+
+struct cxl_port *to_cxl_port(const struct device *dev)
+{
+ if (dev_WARN_ONCE(dev, dev->type != &cxl_port_type,
+ "not a cxl_port device\n"))
+ return NULL;
+ return container_of(dev, struct cxl_port, dev);
+}
+EXPORT_SYMBOL_NS_GPL(to_cxl_port, "CXL");
+
+struct cxl_port *parent_port_of(struct cxl_port *port)
+{
+ if (!port || !port->parent_dport)
+ return NULL;
+ return port->parent_dport->port;
+}
+
+static void unregister_port(void *_port)
+{
+ struct cxl_port *port = _port;
+ struct cxl_port *parent = parent_port_of(port);
+ struct device *lock_dev;
+
+ /*
+ * CXL root port's and the first level of ports are unregistered
+ * under the platform firmware device lock, all other ports are
+ * unregistered while holding their parent port lock.
+ */
+ if (!parent)
+ lock_dev = port->uport_dev;
+ else if (is_cxl_root(parent))
+ lock_dev = parent->uport_dev;
+ else
+ lock_dev = &parent->dev;
+
+ device_lock_assert(lock_dev);
+ port->dead = true;
+ device_unregister(&port->dev);
+}
+
+static void cxl_unlink_uport(void *_port)
+{
+ struct cxl_port *port = _port;
+
+ sysfs_remove_link(&port->dev.kobj, "uport");
+}
+
+static int devm_cxl_link_uport(struct device *host, struct cxl_port *port)
+{
+ int rc;
+
+ rc = sysfs_create_link(&port->dev.kobj, &port->uport_dev->kobj,
+ "uport");
+ if (rc)
+ return rc;
+ return devm_add_action_or_reset(host, cxl_unlink_uport, port);
+}
+
+static void cxl_unlink_parent_dport(void *_port)
+{
+ struct cxl_port *port = _port;
+
+ sysfs_remove_link(&port->dev.kobj, "parent_dport");
+}
+
+static int devm_cxl_link_parent_dport(struct device *host,
+ struct cxl_port *port,
+ struct cxl_dport *parent_dport)
+{
+ int rc;
+
+ if (!parent_dport)
+ return 0;
+
+ rc = sysfs_create_link(&port->dev.kobj, &parent_dport->dport_dev->kobj,
+ "parent_dport");
+ if (rc)
+ return rc;
+ return devm_add_action_or_reset(host, cxl_unlink_parent_dport, port);
+}
+
+static struct lock_class_key cxl_port_key;
+
+static struct cxl_port *cxl_port_alloc(struct device *uport_dev,
+ struct cxl_dport *parent_dport)
+{
+ struct cxl_root *cxl_root __free(kfree) = NULL;
+ struct cxl_port *port, *_port __free(kfree) = NULL;
+ struct device *dev;
+ int rc;
+
+ /* No parent_dport, root cxl_port */
+ if (!parent_dport) {
+ cxl_root = kzalloc(sizeof(*cxl_root), GFP_KERNEL);
+ if (!cxl_root)
+ return ERR_PTR(-ENOMEM);
+ } else {
+ _port = kzalloc(sizeof(*port), GFP_KERNEL);
+ if (!_port)
+ return ERR_PTR(-ENOMEM);
+ }
+
+ rc = ida_alloc(&cxl_port_ida, GFP_KERNEL);
+ if (rc < 0)
+ return ERR_PTR(rc);
+
+ if (cxl_root)
+ port = &no_free_ptr(cxl_root)->port;
+ else
+ port = no_free_ptr(_port);
+
+ port->id = rc;
+ port->uport_dev = uport_dev;
+
+ /*
+ * The top-level cxl_port "cxl_root" does not have a cxl_port as
+ * its parent and it does not have any corresponding component
+ * registers as its decode is described by a fixed platform
+ * description.
+ */
+ dev = &port->dev;
+ if (parent_dport) {
+ struct cxl_port *parent_port = parent_dport->port;
+ struct cxl_port *iter;
+
+ dev->parent = &parent_port->dev;
+ port->depth = parent_port->depth + 1;
+ port->parent_dport = parent_dport;
+
+ /*
+ * walk to the host bridge, or the first ancestor that knows
+ * the host bridge
+ */
+ iter = port;
+ while (!iter->host_bridge &&
+ !is_cxl_root(to_cxl_port(iter->dev.parent)))
+ iter = to_cxl_port(iter->dev.parent);
+ if (iter->host_bridge)
+ port->host_bridge = iter->host_bridge;
+ else if (parent_dport->rch)
+ port->host_bridge = parent_dport->dport_dev;
+ else
+ port->host_bridge = iter->uport_dev;
+ dev_dbg(uport_dev, "host-bridge: %s\n",
+ dev_name(port->host_bridge));
+ } else
+ dev->parent = uport_dev;
+
+ ida_init(&port->decoder_ida);
+ port->hdm_end = -1;
+ port->commit_end = -1;
+ xa_init(&port->dports);
+ xa_init(&port->endpoints);
+ xa_init(&port->regions);
+ port->component_reg_phys = CXL_RESOURCE_NONE;
+
+ device_initialize(dev);
+ lockdep_set_class_and_subclass(&dev->mutex, &cxl_port_key, port->depth);
+ device_set_pm_not_required(dev);
+ dev->bus = &cxl_bus_type;
+ dev->type = &cxl_port_type;
+
+ return port;
+}
+
+static int cxl_setup_comp_regs(struct device *host, struct cxl_register_map *map,
+ resource_size_t component_reg_phys)
+{
+ *map = (struct cxl_register_map) {
+ .host = host,
+ .reg_type = CXL_REGLOC_RBI_EMPTY,
+ .resource = component_reg_phys,
+ };
+
+ if (component_reg_phys == CXL_RESOURCE_NONE)
+ return 0;
+
+ map->reg_type = CXL_REGLOC_RBI_COMPONENT;
+ map->max_size = CXL_COMPONENT_REG_BLOCK_SIZE;
+
+ return cxl_setup_regs(map);
+}
+
+static int cxl_port_setup_regs(struct cxl_port *port,
+ resource_size_t component_reg_phys)
+{
+ if (dev_is_platform(port->uport_dev))
+ return 0;
+ return cxl_setup_comp_regs(&port->dev, &port->reg_map,
+ component_reg_phys);
+}
+
+static int cxl_dport_setup_regs(struct device *host, struct cxl_dport *dport,
+ resource_size_t component_reg_phys)
+{
+ int rc;
+
+ if (dev_is_platform(dport->dport_dev))
+ return 0;
+
+ /*
+ * Use @dport->dport_dev as the context for error messages during
+ * register probing, and fix up @host after the fact, since @host may
+ * be NULL.
+ */
+ rc = cxl_setup_comp_regs(dport->dport_dev, &dport->reg_map,
+ component_reg_phys);
+ dport->reg_map.host = host;
+ return rc;
+}
+
+DEFINE_SHOW_ATTRIBUTE(einj_cxl_available_error_type);
+
+static int cxl_einj_inject(void *data, u64 type)
+{
+ struct cxl_dport *dport = data;
+
+ if (dport->rch)
+ return einj_cxl_inject_rch_error(dport->rcrb.base, type);
+
+ return einj_cxl_inject_error(to_pci_dev(dport->dport_dev), type);
+}
+DEFINE_DEBUGFS_ATTRIBUTE(cxl_einj_inject_fops, NULL, cxl_einj_inject,
+ "0x%llx\n");
+
+static void cxl_debugfs_create_dport_dir(struct cxl_dport *dport)
+{
+ struct dentry *dir;
+
+ if (!einj_cxl_is_initialized())
+ return;
+
+ /*
+ * dport_dev needs to be a PCIe port for CXL 2.0+ ports because
+ * EINJ expects a dport SBDF to be specified for 2.0 error injection.
+ */
+ if (!dport->rch && !dev_is_pci(dport->dport_dev))
+ return;
+
+ dir = cxl_debugfs_create_dir(dev_name(dport->dport_dev));
+
+ debugfs_create_file("einj_inject", 0200, dir, dport,
+ &cxl_einj_inject_fops);
+}
+
+static int cxl_port_add(struct cxl_port *port,
+ resource_size_t component_reg_phys,
+ struct cxl_dport *parent_dport)
+{
+ struct device *dev __free(put_device) = &port->dev;
+ int rc;
+
+ if (is_cxl_memdev(port->uport_dev)) {
+ struct cxl_memdev *cxlmd = to_cxl_memdev(port->uport_dev);
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+
+ rc = dev_set_name(dev, "endpoint%d", port->id);
+ if (rc)
+ return rc;
+
+ /*
+ * The endpoint driver already enumerated the component and RAS
+ * registers. Reuse that enumeration while prepping them to be
+ * mapped by the cxl_port driver.
+ */
+ port->reg_map = cxlds->reg_map;
+ port->reg_map.host = &port->dev;
+ cxlmd->endpoint = port;
+ } else if (parent_dport) {
+ rc = dev_set_name(dev, "port%d", port->id);
+ if (rc)
+ return rc;
+
+ port->component_reg_phys = component_reg_phys;
+ } else {
+ rc = dev_set_name(dev, "root%d", port->id);
+ if (rc)
+ return rc;
+ }
+
+ rc = device_add(dev);
+ if (rc)
+ return rc;
+
+ /* Inhibit the scope-based put_device() cleanup now that device_add() succeeded */
+ dev = NULL;
+ return 0;
+}
+
+static struct cxl_port *__devm_cxl_add_port(struct device *host,
+ struct device *uport_dev,
+ resource_size_t component_reg_phys,
+ struct cxl_dport *parent_dport)
+{
+ struct cxl_port *port;
+ int rc;
+
+ port = cxl_port_alloc(uport_dev, parent_dport);
+ if (IS_ERR(port))
+ return port;
+
+ rc = cxl_port_add(port, component_reg_phys, parent_dport);
+ if (rc)
+ return ERR_PTR(rc);
+
+ rc = devm_add_action_or_reset(host, unregister_port, port);
+ if (rc)
+ return ERR_PTR(rc);
+
+ rc = devm_cxl_link_uport(host, port);
+ if (rc)
+ return ERR_PTR(rc);
+
+ rc = devm_cxl_link_parent_dport(host, port, parent_dport);
+ if (rc)
+ return ERR_PTR(rc);
+
+ if (parent_dport && dev_is_pci(uport_dev))
+ port->pci_latency = cxl_pci_get_latency(to_pci_dev(uport_dev));
+
+ return port;
+}
+
+/**
+ * devm_cxl_add_port - register a cxl_port in CXL memory decode hierarchy
+ * @host: host device for devm operations
+ * @uport_dev: "physical" device implementing this upstream port
+ * @component_reg_phys: (optional) for configurable cxl_port instances
+ * @parent_dport: next hop up in the CXL memory decode hierarchy
+ */
+struct cxl_port *devm_cxl_add_port(struct device *host,
+ struct device *uport_dev,
+ resource_size_t component_reg_phys,
+ struct cxl_dport *parent_dport)
+{
+ struct cxl_port *port, *parent_port;
+
+ port = __devm_cxl_add_port(host, uport_dev, component_reg_phys,
+ parent_dport);
+
+ parent_port = parent_dport ? parent_dport->port : NULL;
+ if (IS_ERR(port)) {
+ dev_dbg(uport_dev, "Failed to add%s%s%s: %ld\n",
+ parent_port ? " port to " : "",
+ parent_port ? dev_name(&parent_port->dev) : "",
+ parent_port ? "" : " root port",
+ PTR_ERR(port));
+ } else {
+ dev_dbg(uport_dev, "%s added%s%s%s\n",
+ dev_name(&port->dev),
+ parent_port ? " to " : "",
+ parent_port ? dev_name(&parent_port->dev) : "",
+ parent_port ? "" : " (root port)");
+ }
+
+ return port;
+}
+EXPORT_SYMBOL_NS_GPL(devm_cxl_add_port, "CXL");
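+
+/*
+ * Illustrative usage sketch (not part of this patch): how a hypothetical
+ * enumeration path that already holds a parent dport might register the
+ * next cxl_port down the hierarchy. The function name and the use of
+ * CXL_RESOURCE_NONE for the component registers are assumptions for the
+ * example.
+ */
+static int __maybe_unused example_add_port(struct device *host,
+ struct device *uport_dev,
+ struct cxl_dport *parent_dport)
+{
+ struct cxl_port *port;
+
+ /* Pass CXL_RESOURCE_NONE when no component registers are known */
+ port = devm_cxl_add_port(host, uport_dev, CXL_RESOURCE_NONE,
+ parent_dport);
+ if (IS_ERR(port))
+ return PTR_ERR(port);
+
+ /* Teardown is devm-managed against @host via unregister_port() */
+ return 0;
+}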
+
+struct cxl_root *devm_cxl_add_root(struct device *host,
+ const struct cxl_root_ops *ops)
+{
+ struct cxl_root *cxl_root;
+ struct cxl_port *port;
+
+ port = devm_cxl_add_port(host, host, CXL_RESOURCE_NONE, NULL);
+ if (IS_ERR(port))
+ return ERR_CAST(port);
+
+ cxl_root = to_cxl_root(port);
+ cxl_root->ops = ops;
+ return cxl_root;
+}
+EXPORT_SYMBOL_NS_GPL(devm_cxl_add_root, "CXL");
+
+struct pci_bus *cxl_port_to_pci_bus(struct cxl_port *port)
+{
+ /* There is no pci_bus associated with a CXL platform-root port */
+ if (is_cxl_root(port))
+ return NULL;
+
+ if (dev_is_pci(port->uport_dev)) {
+ struct pci_dev *pdev = to_pci_dev(port->uport_dev);
+
+ return pdev->subordinate;
+ }
+
+ return xa_load(&cxl_root_buses, (unsigned long)port->uport_dev);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_port_to_pci_bus, "CXL");
+
+static void unregister_pci_bus(void *uport_dev)
+{
+ xa_erase(&cxl_root_buses, (unsigned long)uport_dev);
+}
+
+int devm_cxl_register_pci_bus(struct device *host, struct device *uport_dev,
+ struct pci_bus *bus)
+{
+ int rc;
+
+ if (dev_is_pci(uport_dev))
+ return -EINVAL;
+
+ rc = xa_insert(&cxl_root_buses, (unsigned long)uport_dev, bus,
+ GFP_KERNEL);
+ if (rc)
+ return rc;
+ return devm_add_action_or_reset(host, unregister_pci_bus, uport_dev);
+}
+EXPORT_SYMBOL_NS_GPL(devm_cxl_register_pci_bus, "CXL");
+
+static bool dev_is_cxl_root_child(struct device *dev)
+{
+ struct cxl_port *port, *parent;
+
+ if (!is_cxl_port(dev))
+ return false;
+
+ port = to_cxl_port(dev);
+ if (is_cxl_root(port))
+ return false;
+
+ parent = to_cxl_port(port->dev.parent);
+ if (is_cxl_root(parent))
+ return true;
+
+ return false;
+}
+
+struct cxl_root *find_cxl_root(struct cxl_port *port)
+{
+ struct cxl_port *iter = port;
+
+ while (iter && !is_cxl_root(iter))
+ iter = to_cxl_port(iter->dev.parent);
+
+ if (!iter)
+ return NULL;
+ get_device(&iter->dev);
+ return to_cxl_root(iter);
+}
+EXPORT_SYMBOL_NS_GPL(find_cxl_root, "CXL");
+
+static struct cxl_dport *find_dport(struct cxl_port *port, int id)
+{
+ struct cxl_dport *dport;
+ unsigned long index;
+
+ device_lock_assert(&port->dev);
+ xa_for_each(&port->dports, index, dport)
+ if (dport->port_id == id)
+ return dport;
+ return NULL;
+}
+
+static int add_dport(struct cxl_port *port, struct cxl_dport *dport)
+{
+ struct cxl_dport *dup;
+ int rc;
+
+ device_lock_assert(&port->dev);
+ dup = find_dport(port, dport->port_id);
+ if (dup) {
+ dev_err(&port->dev,
+ "unable to add dport%d-%s non-unique port id (%s)\n",
+ dport->port_id, dev_name(dport->dport_dev),
+ dev_name(dup->dport_dev));
+ return -EBUSY;
+ }
+
+ rc = xa_insert(&port->dports, (unsigned long)dport->dport_dev, dport,
+ GFP_KERNEL);
+ if (rc)
+ return rc;
+
+ port->nr_dports++;
+ return 0;
+}
+
+/*
+ * Since root-level CXL dports cannot be enumerated by PCI they are not
+ * enumerated by the common port driver that acquires the port lock over
+ * dport add/remove. Instead, root dports are manually added by a
+ * platform driver and cond_cxl_root_lock() is used to take the missing
+ * port lock in that case.
+ */
+static void cond_cxl_root_lock(struct cxl_port *port)
+{
+ if (is_cxl_root(port))
+ device_lock(&port->dev);
+}
+
+static void cond_cxl_root_unlock(struct cxl_port *port)
+{
+ if (is_cxl_root(port))
+ device_unlock(&port->dev);
+}
+
+static void cxl_dport_remove(void *data)
+{
+ struct cxl_dport *dport = data;
+ struct cxl_port *port = dport->port;
+
+ xa_erase(&port->dports, (unsigned long) dport->dport_dev);
+ put_device(dport->dport_dev);
+}
+
+static void cxl_dport_unlink(void *data)
+{
+ struct cxl_dport *dport = data;
+ struct cxl_port *port = dport->port;
+ char link_name[CXL_TARGET_STRLEN];
+
+ sprintf(link_name, "dport%d", dport->port_id);
+ sysfs_remove_link(&port->dev.kobj, link_name);
+}
+
+static struct cxl_dport *
+__devm_cxl_add_dport(struct cxl_port *port, struct device *dport_dev,
+ int port_id, resource_size_t component_reg_phys,
+ resource_size_t rcrb)
+{
+ char link_name[CXL_TARGET_STRLEN];
+ struct cxl_dport *dport;
+ struct device *host;
+ int rc;
+
+ if (is_cxl_root(port))
+ host = port->uport_dev;
+ else
+ host = &port->dev;
+
+ if (!host->driver) {
+ dev_WARN_ONCE(&port->dev, 1, "dport:%s bad devm context\n",
+ dev_name(dport_dev));
+ return ERR_PTR(-ENXIO);
+ }
+
+ if (snprintf(link_name, CXL_TARGET_STRLEN, "dport%d", port_id) >=
+ CXL_TARGET_STRLEN)
+ return ERR_PTR(-EINVAL);
+
+ dport = devm_kzalloc(host, sizeof(*dport), GFP_KERNEL);
+ if (!dport)
+ return ERR_PTR(-ENOMEM);
+
+ dport->dport_dev = dport_dev;
+ dport->port_id = port_id;
+ dport->port = port;
+
+ if (rcrb == CXL_RESOURCE_NONE) {
+ rc = cxl_dport_setup_regs(&port->dev, dport,
+ component_reg_phys);
+ if (rc)
+ return ERR_PTR(rc);
+ } else {
+ dport->rcrb.base = rcrb;
+ component_reg_phys = __rcrb_to_component(dport_dev, &dport->rcrb,
+ CXL_RCRB_DOWNSTREAM);
+ if (component_reg_phys == CXL_RESOURCE_NONE) {
+ dev_warn(dport_dev, "Invalid Component Registers in RCRB");
+ return ERR_PTR(-ENXIO);
+ }
+
+ /*
+ * RCH @dport is not ready to map until associated with its
+ * memdev
+ */
+ rc = cxl_dport_setup_regs(NULL, dport, component_reg_phys);
+ if (rc)
+ return ERR_PTR(rc);
+
+ dport->rch = true;
+ }
+
+ if (component_reg_phys != CXL_RESOURCE_NONE)
+ dev_dbg(dport_dev, "Component Registers found for dport: %pa\n",
+ &component_reg_phys);
+
+ cond_cxl_root_lock(port);
+ rc = add_dport(port, dport);
+ cond_cxl_root_unlock(port);
+ if (rc)
+ return ERR_PTR(rc);
+
+ /*
+ * Set up the port registers when the first dport shows up. Having
+ * a dport also means that there is at least one active link.
+ */
+ if (port->nr_dports == 1 &&
+ port->component_reg_phys != CXL_RESOURCE_NONE) {
+ rc = cxl_port_setup_regs(port, port->component_reg_phys);
+ if (rc) {
+ xa_erase(&port->dports, (unsigned long)dport->dport_dev);
+ return ERR_PTR(rc);
+ }
+ port->component_reg_phys = CXL_RESOURCE_NONE;
+ }
+
+ get_device(dport_dev);
+ rc = devm_add_action_or_reset(host, cxl_dport_remove, dport);
+ if (rc)
+ return ERR_PTR(rc);
+
+ rc = sysfs_create_link(&port->dev.kobj, &dport_dev->kobj, link_name);
+ if (rc)
+ return ERR_PTR(rc);
+
+ rc = devm_add_action_or_reset(host, cxl_dport_unlink, dport);
+ if (rc)
+ return ERR_PTR(rc);
+
+ if (dev_is_pci(dport_dev))
+ dport->link_latency = cxl_pci_get_latency(to_pci_dev(dport_dev));
+
+ cxl_debugfs_create_dport_dir(dport);
+
+ return dport;
+}
+
+/**
+ * devm_cxl_add_dport - append VH downstream port data to a cxl_port
+ * @port: the cxl_port that references this dport
+ * @dport_dev: firmware or PCI device representing the dport
+ * @port_id: identifier for this dport in a decoder's target list
+ * @component_reg_phys: optional location of CXL component registers
+ *
+ * Note that dports are appended to the devm release actions of either
+ * the port's host (for root ports) or the port itself (for switch
+ * ports).
+ */
+struct cxl_dport *devm_cxl_add_dport(struct cxl_port *port,
+ struct device *dport_dev, int port_id,
+ resource_size_t component_reg_phys)
+{
+ struct cxl_dport *dport;
+
+ dport = __devm_cxl_add_dport(port, dport_dev, port_id,
+ component_reg_phys, CXL_RESOURCE_NONE);
+ if (IS_ERR(dport)) {
+ dev_dbg(dport_dev, "failed to add dport to %s: %ld\n",
+ dev_name(&port->dev), PTR_ERR(dport));
+ } else {
+ dev_dbg(dport_dev, "dport added to %s\n",
+ dev_name(&port->dev));
+ }
+
+ return dport;
+}
+EXPORT_SYMBOL_NS_GPL(devm_cxl_add_dport, "CXL");
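+
+/*
+ * Illustrative sketch (not part of this patch): registering a VH dport
+ * for a PCI downstream port device under an existing cxl_port. The
+ * function name and the use of the PCI slot number as the port id are
+ * assumptions for the example.
+ */
+static int __maybe_unused example_add_pci_dport(struct cxl_port *port,
+ struct pci_dev *pdev)
+{
+ struct cxl_dport *dport;
+
+ /* Component registers, if any, would be discovered by the caller */
+ dport = devm_cxl_add_dport(port, &pdev->dev, PCI_SLOT(pdev->devfn),
+ CXL_RESOURCE_NONE);
+ if (IS_ERR(dport))
+ return PTR_ERR(dport);
+
+ return 0;
+}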
+
+/**
+ * devm_cxl_add_rch_dport - append RCH downstream port data to a cxl_port
+ * @port: the cxl_port that references this dport
+ * @dport_dev: firmware or PCI device representing the dport
+ * @port_id: identifier for this dport in a decoder's target list
+ * @rcrb: mandatory location of a Root Complex Register Block
+ *
+ * See CXL 3.0 9.11.8 CXL Devices Attached to an RCH
+ */
+struct cxl_dport *devm_cxl_add_rch_dport(struct cxl_port *port,
+ struct device *dport_dev, int port_id,
+ resource_size_t rcrb)
+{
+ struct cxl_dport *dport;
+
+ if (rcrb == CXL_RESOURCE_NONE) {
+ dev_dbg(&port->dev, "failed to add RCH dport, missing RCRB\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ dport = __devm_cxl_add_dport(port, dport_dev, port_id,
+ CXL_RESOURCE_NONE, rcrb);
+ if (IS_ERR(dport)) {
+ dev_dbg(dport_dev, "failed to add RCH dport to %s: %ld\n",
+ dev_name(&port->dev), PTR_ERR(dport));
+ } else {
+ dev_dbg(dport_dev, "RCH dport added to %s\n",
+ dev_name(&port->dev));
+ }
+
+ return dport;
+}
+EXPORT_SYMBOL_NS_GPL(devm_cxl_add_rch_dport, "CXL");
+
+static int add_ep(struct cxl_ep *new)
+{
+ struct cxl_port *port = new->dport->port;
+
+ guard(device)(&port->dev);
+ if (port->dead)
+ return -ENXIO;
+
+ return xa_insert(&port->endpoints, (unsigned long)new->ep,
+ new, GFP_KERNEL);
+}
+
+/**
+ * cxl_add_ep - register an endpoint's interest in a port
+ * @dport: the dport that routes to @ep_dev
+ * @ep_dev: device representing the endpoint
+ *
+ * Intermediate CXL ports are scanned based on the arrival of endpoints.
+ * When those endpoints depart, the port can be destroyed once all
+ * endpoints that care about that port have been removed.
+ */
+static int cxl_add_ep(struct cxl_dport *dport, struct device *ep_dev)
+{
+ struct cxl_ep *ep;
+ int rc;
+
+ ep = kzalloc(sizeof(*ep), GFP_KERNEL);
+ if (!ep)
+ return -ENOMEM;
+
+ ep->ep = get_device(ep_dev);
+ ep->dport = dport;
+
+ rc = add_ep(ep);
+ if (rc)
+ cxl_ep_release(ep);
+ return rc;
+}
+
+struct cxl_find_port_ctx {
+ const struct device *dport_dev;
+ const struct cxl_port *parent_port;
+ struct cxl_dport **dport;
+};
+
+static int match_port_by_dport(struct device *dev, const void *data)
+{
+ const struct cxl_find_port_ctx *ctx = data;
+ struct cxl_dport *dport;
+ struct cxl_port *port;
+
+ if (!is_cxl_port(dev))
+ return 0;
+ if (ctx->parent_port && dev->parent != &ctx->parent_port->dev)
+ return 0;
+
+ port = to_cxl_port(dev);
+ dport = cxl_find_dport_by_dev(port, ctx->dport_dev);
+ if (ctx->dport)
+ *ctx->dport = dport;
+ return dport != NULL;
+}
+
+static struct cxl_port *__find_cxl_port(struct cxl_find_port_ctx *ctx)
+{
+ struct device *dev;
+
+ if (!ctx->dport_dev)
+ return NULL;
+
+ dev = bus_find_device(&cxl_bus_type, NULL, ctx, match_port_by_dport);
+ if (dev)
+ return to_cxl_port(dev);
+ return NULL;
+}
+
+static struct cxl_port *find_cxl_port(struct device *dport_dev,
+ struct cxl_dport **dport)
+{
+ struct cxl_find_port_ctx ctx = {
+ .dport_dev = dport_dev,
+ .dport = dport,
+ };
+ struct cxl_port *port;
+
+ port = __find_cxl_port(&ctx);
+ return port;
+}
+
+/*
+ * All users of grandparent() use it to walk a PCIe-like switch port
+ * hierarchy. A PCIe switch is composed of a bridge device representing
+ * the upstream switch port and N bridges representing downstream switch
+ * ports. When bridges stack, the grandparent of a downstream switch port
+ * is another downstream switch port in the immediate ancestor switch.
+ */
+static struct device *grandparent(struct device *dev)
+{
+ if (dev && dev->parent)
+ return dev->parent->parent;
+ return NULL;
+}
+
+static struct device *endpoint_host(struct cxl_port *endpoint)
+{
+ struct cxl_port *port = to_cxl_port(endpoint->dev.parent);
+
+ if (is_cxl_root(port))
+ return port->uport_dev;
+ return &port->dev;
+}
+
+static void delete_endpoint(void *data)
+{
+ struct cxl_memdev *cxlmd = data;
+ struct cxl_port *endpoint = cxlmd->endpoint;
+ struct device *host = endpoint_host(endpoint);
+
+ scoped_guard(device, host) {
+ if (host->driver && !endpoint->dead) {
+ devm_release_action(host, cxl_unlink_parent_dport, endpoint);
+ devm_release_action(host, cxl_unlink_uport, endpoint);
+ devm_release_action(host, unregister_port, endpoint);
+ }
+ cxlmd->endpoint = NULL;
+ }
+ put_device(&endpoint->dev);
+ put_device(host);
+}
+
+int cxl_endpoint_autoremove(struct cxl_memdev *cxlmd, struct cxl_port *endpoint)
+{
+ struct device *host = endpoint_host(endpoint);
+ struct device *dev = &cxlmd->dev;
+
+ get_device(host);
+ get_device(&endpoint->dev);
+ cxlmd->depth = endpoint->depth;
+ return devm_add_action_or_reset(dev, delete_endpoint, cxlmd);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_endpoint_autoremove, "CXL");
+
+/*
+ * The natural end of life of a non-root 'cxl_port' is when its parent port goes
+ * through a ->remove() event ("top-down" unregistration). The unnatural trigger
+ * for a port to be unregistered is when all memdevs beneath that port have gone
+ * through ->remove(). This "bottom-up" removal selectively removes individual
+ * child ports manually. This depends on devm_cxl_add_port() to not change its
+ * devm action registration order, and for dports to have already been
+ * destroyed by del_dports().
+ */
+static void delete_switch_port(struct cxl_port *port)
+{
+ devm_release_action(port->dev.parent, cxl_unlink_parent_dport, port);
+ devm_release_action(port->dev.parent, cxl_unlink_uport, port);
+ devm_release_action(port->dev.parent, unregister_port, port);
+}
+
+static void del_dport(struct cxl_dport *dport)
+{
+ struct cxl_port *port = dport->port;
+
+ devm_release_action(&port->dev, cxl_dport_unlink, dport);
+ devm_release_action(&port->dev, cxl_dport_remove, dport);
+ devm_kfree(&port->dev, dport);
+}
+
+static void del_dports(struct cxl_port *port)
+{
+ struct cxl_dport *dport;
+ unsigned long index;
+
+ device_lock_assert(&port->dev);
+
+ xa_for_each(&port->dports, index, dport)
+ del_dport(dport);
+}
+
+struct detach_ctx {
+ struct cxl_memdev *cxlmd;
+ int depth;
+};
+
+static int port_has_memdev(struct device *dev, const void *data)
+{
+ const struct detach_ctx *ctx = data;
+ struct cxl_port *port;
+
+ if (!is_cxl_port(dev))
+ return 0;
+
+ port = to_cxl_port(dev);
+ if (port->depth != ctx->depth)
+ return 0;
+
+ return !!cxl_ep_load(port, ctx->cxlmd);
+}
+
+static void cxl_detach_ep(void *data)
+{
+ struct cxl_memdev *cxlmd = data;
+
+ for (int i = cxlmd->depth - 1; i >= 1; i--) {
+ struct cxl_port *port, *parent_port;
+ struct detach_ctx ctx = {
+ .cxlmd = cxlmd,
+ .depth = i,
+ };
+ struct cxl_ep *ep;
+ bool died = false;
+
+ struct device *dev __free(put_device) =
+ bus_find_device(&cxl_bus_type, NULL, &ctx, port_has_memdev);
+ if (!dev)
+ continue;
+ port = to_cxl_port(dev);
+
+ parent_port = to_cxl_port(port->dev.parent);
+ device_lock(&parent_port->dev);
+ device_lock(&port->dev);
+ ep = cxl_ep_load(port, cxlmd);
+ dev_dbg(&cxlmd->dev, "disconnect %s from %s\n",
+ ep ? dev_name(ep->ep) : "", dev_name(&port->dev));
+ cxl_ep_remove(port, ep);
+ if (ep && !port->dead && xa_empty(&port->endpoints) &&
+ !is_cxl_root(parent_port) && parent_port->dev.driver) {
+ /*
+ * This was the last ep attached to a dynamically
+ * enumerated port. Block new cxl_add_ep() and garbage
+ * collect the port.
+ */
+ died = true;
+ port->dead = true;
+ del_dports(port);
+ }
+ device_unlock(&port->dev);
+
+ if (died) {
+ dev_dbg(&cxlmd->dev, "delete %s\n",
+ dev_name(&port->dev));
+ delete_switch_port(port);
+ }
+ device_unlock(&parent_port->dev);
+ }
+}
+
+static resource_size_t find_component_registers(struct device *dev)
+{
+ struct cxl_register_map map;
+ struct pci_dev *pdev;
+
+ /*
+ * Theoretically, CXL component registers can be hosted on a
+ * non-PCI device; in practice, only cxl_test hits this case.
+ */
+ if (!dev_is_pci(dev))
+ return CXL_RESOURCE_NONE;
+
+ pdev = to_pci_dev(dev);
+
+ cxl_find_regblock(pdev, CXL_REGLOC_RBI_COMPONENT, &map);
+ return map.resource;
+}
+
+static int match_port_by_uport(struct device *dev, const void *data)
+{
+ const struct device *uport_dev = data;
+ struct cxl_port *port;
+
+ if (!is_cxl_port(dev))
+ return 0;
+
+ port = to_cxl_port(dev);
+ return uport_dev == port->uport_dev;
+}
+
+/*
+ * Function takes a device reference on the port device. Caller should do a
+ * put_device() when done.
+ */
+static struct cxl_port *find_cxl_port_by_uport(struct device *uport_dev)
+{
+ struct device *dev;
+
+ dev = bus_find_device(&cxl_bus_type, NULL, uport_dev, match_port_by_uport);
+ if (dev)
+ return to_cxl_port(dev);
+ return NULL;
+}
+
+static int update_decoder_targets(struct device *dev, void *data)
+{
+ struct cxl_dport *dport = data;
+ struct cxl_switch_decoder *cxlsd;
+ struct cxl_decoder *cxld;
+ int i;
+
+ if (!is_switch_decoder(dev))
+ return 0;
+
+ cxlsd = to_cxl_switch_decoder(dev);
+ cxld = &cxlsd->cxld;
+ guard(rwsem_write)(&cxl_rwsem.region);
+
+ for (i = 0; i < cxld->interleave_ways; i++) {
+ if (cxld->target_map[i] == dport->port_id) {
+ cxlsd->target[i] = dport;
+ dev_dbg(dev, "dport%d found in target list, index %d\n",
+ dport->port_id, i);
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+DEFINE_FREE(del_cxl_dport, struct cxl_dport *, if (!IS_ERR_OR_NULL(_T)) del_dport(_T))
+static struct cxl_dport *cxl_port_add_dport(struct cxl_port *port,
+ struct device *dport_dev)
+{
+ struct cxl_dport *dport;
+ int rc;
+
+ device_lock_assert(&port->dev);
+ if (!port->dev.driver)
+ return ERR_PTR(-ENXIO);
+
+ dport = cxl_find_dport_by_dev(port, dport_dev);
+ if (dport) {
+ dev_dbg(&port->dev, "dport%d:%s already exists\n",
+ dport->port_id, dev_name(dport_dev));
+ return ERR_PTR(-EBUSY);
+ }
+
+ struct cxl_dport *new_dport __free(del_cxl_dport) =
+ devm_cxl_add_dport_by_dev(port, dport_dev);
+ if (IS_ERR(new_dport))
+ return new_dport;
+
+ cxl_switch_parse_cdat(new_dport);
+
+ if (ida_is_empty(&port->decoder_ida)) {
+ rc = devm_cxl_switch_port_decoders_setup(port);
+ if (rc)
+ return ERR_PTR(rc);
+ dev_dbg(&port->dev, "first dport%d:%s added with decoders\n",
+ new_dport->port_id, dev_name(dport_dev));
+ return no_free_ptr(new_dport);
+ }
+
+ /* New dport added, update the decoder targets */
+ device_for_each_child(&port->dev, new_dport, update_decoder_targets);
+
+ dev_dbg(&port->dev, "dport%d:%s added\n", new_dport->port_id,
+ dev_name(dport_dev));
+
+ return no_free_ptr(new_dport);
+}
+
+static struct cxl_dport *devm_cxl_create_port(struct device *ep_dev,
+ struct cxl_port *parent_port,
+ struct cxl_dport *parent_dport,
+ struct device *uport_dev,
+ struct device *dport_dev)
+{
+ resource_size_t component_reg_phys;
+
+ device_lock_assert(&parent_port->dev);
+ if (!parent_port->dev.driver) {
+ dev_warn(ep_dev,
+ "port %s:%s:%s disabled, failed to enumerate CXL.mem\n",
+ dev_name(&parent_port->dev), dev_name(uport_dev),
+ dev_name(dport_dev));
+ }
+
+ struct cxl_port *port __free(put_cxl_port) =
+ find_cxl_port_by_uport(uport_dev);
+ if (!port) {
+ component_reg_phys = find_component_registers(uport_dev);
+ port = devm_cxl_add_port(&parent_port->dev, uport_dev,
+ component_reg_phys, parent_dport);
+ if (IS_ERR(port))
+ return ERR_CAST(port);
+
+ /*
+ * Retry to make sure a port is found. A port device
+ * reference is taken.
+ */
+ port = find_cxl_port_by_uport(uport_dev);
+ if (!port)
+ return ERR_PTR(-ENODEV);
+
+ dev_dbg(ep_dev, "created port %s:%s\n",
+ dev_name(&port->dev), dev_name(port->uport_dev));
+ } else {
+ /*
+ * The port was created right before this function was
+ * called. Signal the caller to deal with it.
+ */
+ return ERR_PTR(-EAGAIN);
+ }
+
+ guard(device)(&port->dev);
+ return cxl_port_add_dport(port, dport_dev);
+}
+
+static int add_port_attach_ep(struct cxl_memdev *cxlmd,
+ struct device *uport_dev,
+ struct device *dport_dev)
+{
+ struct device *dparent = grandparent(dport_dev);
+ struct cxl_dport *dport, *parent_dport;
+ int rc;
+
+ if (is_cxl_host_bridge(dparent)) {
+ /*
+ * The iteration reached the topology root without finding the
+ * CXL-root 'cxl_port' on a previous iteration; fail for now to
+ * be re-probed after the platform driver attaches.
+ */
+ dev_dbg(&cxlmd->dev, "%s is a root dport\n",
+ dev_name(dport_dev));
+ return -ENXIO;
+ }
+
+ struct cxl_port *parent_port __free(put_cxl_port) =
+ find_cxl_port_by_uport(dparent->parent);
+ if (!parent_port) {
+ /* iterate to create this parent_port */
+ return -EAGAIN;
+ }
+
+ scoped_guard(device, &parent_port->dev) {
+ parent_dport = cxl_find_dport_by_dev(parent_port, dparent);
+ if (!parent_dport) {
+ parent_dport = cxl_port_add_dport(parent_port, dparent);
+ if (IS_ERR(parent_dport))
+ return PTR_ERR(parent_dport);
+ }
+
+ dport = devm_cxl_create_port(&cxlmd->dev, parent_port,
+ parent_dport, uport_dev,
+ dport_dev);
+ if (IS_ERR(dport)) {
+ /* Port already exists, restart iteration */
+ if (PTR_ERR(dport) == -EAGAIN)
+ return 0;
+ return PTR_ERR(dport);
+ }
+ }
+
+ rc = cxl_add_ep(dport, &cxlmd->dev);
+ if (rc == -EBUSY) {
+ /*
+ * "can't" happen, but this error code means
+ * something to the caller, so translate it.
+ */
+ rc = -ENXIO;
+ }
+
+ return rc;
+}
+
+static struct cxl_dport *find_or_add_dport(struct cxl_port *port,
+ struct device *dport_dev)
+{
+ struct cxl_dport *dport;
+
+ device_lock_assert(&port->dev);
+ dport = cxl_find_dport_by_dev(port, dport_dev);
+ if (!dport) {
+ dport = cxl_port_add_dport(port, dport_dev);
+ if (IS_ERR(dport))
+ return dport;
+
+ /* New dport added, restart iteration */
+ return ERR_PTR(-EAGAIN);
+ }
+
+ return dport;
+}
+
+int devm_cxl_enumerate_ports(struct cxl_memdev *cxlmd)
+{
+ struct device *dev = &cxlmd->dev;
+ struct device *iter;
+ int rc;
+
+ /*
+ * Skip intermediate port enumeration in the RCH case, there
+ * are no ports in between a host bridge and an endpoint.
+ */
+ if (cxlmd->cxlds->rcd)
+ return 0;
+
+ rc = devm_add_action_or_reset(&cxlmd->dev, cxl_detach_ep, cxlmd);
+ if (rc)
+ return rc;
+
+ /*
+ * Scan for and add all cxl_ports in this device's ancestry.
+ * Repeat until no more ports are added. Abort if a port add
+ * attempt fails.
+ */
+retry:
+ for (iter = dev; iter; iter = grandparent(iter)) {
+ struct device *dport_dev = grandparent(iter);
+ struct device *uport_dev;
+ struct cxl_dport *dport;
+
+ if (is_cxl_host_bridge(dport_dev))
+ return 0;
+
+ uport_dev = dport_dev->parent;
+ if (!uport_dev) {
+ dev_warn(dev, "at %s no parent for dport: %s\n",
+ dev_name(iter), dev_name(dport_dev));
+ return -ENXIO;
+ }
+
+ dev_dbg(dev, "scan: iter: %s dport_dev: %s parent: %s\n",
+ dev_name(iter), dev_name(dport_dev),
+ dev_name(uport_dev));
+ struct cxl_port *port __free(put_cxl_port) =
+ find_cxl_port_by_uport(uport_dev);
+ if (port) {
+ dev_dbg(&cxlmd->dev,
+ "found already registered port %s:%s\n",
+ dev_name(&port->dev),
+ dev_name(port->uport_dev));
+
+ /*
+ * A root port enumerated by cxl_acpi without a dport
+ * will have the dport added here.
+ */
+ scoped_guard(device, &port->dev) {
+ dport = find_or_add_dport(port, dport_dev);
+ if (IS_ERR(dport)) {
+ if (PTR_ERR(dport) == -EAGAIN)
+ goto retry;
+ return PTR_ERR(dport);
+ }
+ }
+
+ rc = cxl_add_ep(dport, &cxlmd->dev);
+
+ /*
+ * If the endpoint already exists in the port's list,
+ * that's OK; it was added on a previous pass.
+ * Otherwise, retry in add_port_attach_ep() after taking
+ * the parent_port lock, as the current port may be in
+ * the process of being reaped.
+ */
+ if (rc && rc != -EBUSY)
+ return rc;
+
+ cxl_gpf_port_setup(dport);
+
+ /* Any more ports to add between this one and the root? */
+ if (!dev_is_cxl_root_child(&port->dev))
+ continue;
+
+ return 0;
+ }
+
+ rc = add_port_attach_ep(cxlmd, uport_dev, dport_dev);
+ /* port missing, try to add parent */
+ if (rc == -EAGAIN)
+ continue;
+ /* failed to add ep or port */
+ if (rc)
+ return rc;
+ /* port added, new descendants possible, start over */
+ goto retry;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(devm_cxl_enumerate_ports, "CXL");
+
+struct cxl_port *cxl_pci_find_port(struct pci_dev *pdev,
+ struct cxl_dport **dport)
+{
+ return find_cxl_port(pdev->dev.parent, dport);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_pci_find_port, "CXL");
+
+struct cxl_port *cxl_mem_find_port(struct cxl_memdev *cxlmd,
+ struct cxl_dport **dport)
+{
+ return find_cxl_port(grandparent(&cxlmd->dev), dport);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_mem_find_port, "CXL");
+
+static int decoder_populate_targets(struct cxl_switch_decoder *cxlsd,
+ struct cxl_port *port)
+{
+ struct cxl_decoder *cxld = &cxlsd->cxld;
+ int i;
+
+ device_lock_assert(&port->dev);
+
+ if (xa_empty(&port->dports))
+ return 0;
+
+ guard(rwsem_write)(&cxl_rwsem.region);
+ for (i = 0; i < cxlsd->cxld.interleave_ways; i++) {
+ struct cxl_dport *dport = find_dport(port, cxld->target_map[i]);
+
+ if (!dport) {
+ /* dport may be activated later */
+ continue;
+ }
+ cxlsd->target[i] = dport;
+ }
+
+ return 0;
+}
+
+static struct lock_class_key cxl_decoder_key;
+
+/**
+ * cxl_decoder_init - Common decoder setup / initialization
+ * @port: owning port of this decoder
+ * @cxld: common decoder properties to initialize
+ *
+ * A port may contain one or more decoders. Each of those decoders
+ * enables some address space for CXL.mem utilization. A decoder is
+ * expected to be configured by the caller before registering it via
+ * cxl_decoder_add().
+ */
+static int cxl_decoder_init(struct cxl_port *port, struct cxl_decoder *cxld)
+{
+ struct device *dev;
+ int rc;
+
+ rc = ida_alloc(&port->decoder_ida, GFP_KERNEL);
+ if (rc < 0)
+ return rc;
+
+ /* need parent to stick around to release the id */
+ get_device(&port->dev);
+ cxld->id = rc;
+
+ dev = &cxld->dev;
+ device_initialize(dev);
+ lockdep_set_class(&dev->mutex, &cxl_decoder_key);
+ device_set_pm_not_required(dev);
+ dev->parent = &port->dev;
+ dev->bus = &cxl_bus_type;
+
+ /* Pre-initialize an "empty" decoder */
+ cxld->interleave_ways = 1;
+ cxld->interleave_granularity = PAGE_SIZE;
+ cxld->target_type = CXL_DECODER_HOSTONLYMEM;
+ cxld->hpa_range = (struct range) {
+ .start = 0,
+ .end = -1,
+ };
+
+ return 0;
+}
+
+static int cxl_switch_decoder_init(struct cxl_port *port,
+ struct cxl_switch_decoder *cxlsd,
+ int nr_targets)
+{
+ if (nr_targets > CXL_DECODER_MAX_INTERLEAVE)
+ return -EINVAL;
+
+ cxlsd->nr_targets = nr_targets;
+ return cxl_decoder_init(port, &cxlsd->cxld);
+}
+
+/**
+ * cxl_root_decoder_alloc - Allocate a root level decoder
+ * @port: owning CXL root of this decoder
+ * @nr_targets: static number of downstream targets
+ *
+ * Return: A new cxl decoder to be registered by cxl_decoder_add(). A
+ * 'CXL root' decoder is one that decodes from a top-level / static platform
+ * firmware description of CXL resources into a CXL standard decode
+ * topology.
+ */
+struct cxl_root_decoder *cxl_root_decoder_alloc(struct cxl_port *port,
+ unsigned int nr_targets)
+{
+ struct cxl_root_decoder *cxlrd;
+ struct cxl_switch_decoder *cxlsd;
+ struct cxl_decoder *cxld;
+ int rc;
+
+ if (!is_cxl_root(port))
+ return ERR_PTR(-EINVAL);
+
+ cxlrd = kzalloc(struct_size(cxlrd, cxlsd.target, nr_targets),
+ GFP_KERNEL);
+ if (!cxlrd)
+ return ERR_PTR(-ENOMEM);
+
+ cxlsd = &cxlrd->cxlsd;
+ rc = cxl_switch_decoder_init(port, cxlsd, nr_targets);
+ if (rc) {
+ kfree(cxlrd);
+ return ERR_PTR(rc);
+ }
+
+ mutex_init(&cxlrd->range_lock);
+
+ cxld = &cxlsd->cxld;
+ cxld->dev.type = &cxl_decoder_root_type;
+ /*
+ * cxl_root_decoder_release() special cases negative ids to
+ * detect memregion_alloc() failures.
+ */
+ atomic_set(&cxlrd->region_id, -1);
+ rc = memregion_alloc(GFP_KERNEL);
+ if (rc < 0) {
+ put_device(&cxld->dev);
+ return ERR_PTR(rc);
+ }
+
+ atomic_set(&cxlrd->region_id, rc);
+ cxlrd->qos_class = CXL_QOS_CLASS_INVALID;
+ return cxlrd;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_root_decoder_alloc, "CXL");
+
+/**
+ * cxl_switch_decoder_alloc - Allocate a switch level decoder
+ * @port: owning CXL switch port of this decoder
+ * @nr_targets: max number of dynamically addressable downstream targets
+ *
+ * Return: A new cxl decoder to be registered by cxl_decoder_add(). A
+ * 'switch' decoder is any decoder that can be enumerated by PCIe
+ * topology and the HDM Decoder Capability. This includes the decoders
+ * that sit between Switch Upstream Ports / Switch Downstream Ports and
+ * Host Bridges / Root Ports.
+ */
+struct cxl_switch_decoder *cxl_switch_decoder_alloc(struct cxl_port *port,
+ unsigned int nr_targets)
+{
+ struct cxl_switch_decoder *cxlsd;
+ struct cxl_decoder *cxld;
+ int rc;
+
+ if (is_cxl_root(port) || is_cxl_endpoint(port))
+ return ERR_PTR(-EINVAL);
+
+ cxlsd = kzalloc(struct_size(cxlsd, target, nr_targets), GFP_KERNEL);
+ if (!cxlsd)
+ return ERR_PTR(-ENOMEM);
+
+ rc = cxl_switch_decoder_init(port, cxlsd, nr_targets);
+ if (rc) {
+ kfree(cxlsd);
+ return ERR_PTR(rc);
+ }
+
+ cxld = &cxlsd->cxld;
+ cxld->dev.type = &cxl_decoder_switch_type;
+ return cxlsd;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_switch_decoder_alloc, "CXL");
+
+/**
+ * cxl_endpoint_decoder_alloc - Allocate an endpoint decoder
+ * @port: owning port of this decoder
+ *
+ * Return: A new cxl decoder to be registered by cxl_decoder_add()
+ */
+struct cxl_endpoint_decoder *cxl_endpoint_decoder_alloc(struct cxl_port *port)
+{
+ struct cxl_endpoint_decoder *cxled;
+ struct cxl_decoder *cxld;
+ int rc;
+
+ if (!is_cxl_endpoint(port))
+ return ERR_PTR(-EINVAL);
+
+ cxled = kzalloc(sizeof(*cxled), GFP_KERNEL);
+ if (!cxled)
+ return ERR_PTR(-ENOMEM);
+
+ cxled->pos = -1;
+ cxled->part = -1;
+ cxld = &cxled->cxld;
+ rc = cxl_decoder_init(port, cxld);
+ if (rc) {
+ kfree(cxled);
+ return ERR_PTR(rc);
+ }
+
+ cxld->dev.type = &cxl_decoder_endpoint_type;
+ return cxled;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_endpoint_decoder_alloc, "CXL");
+
+/**
+ * cxl_decoder_add_locked - Add a decoder with targets
+ * @cxld: The cxl decoder allocated by cxl_<type>_decoder_alloc()
+ *
+ * Certain types of decoders may not have any targets. The main example of this
+ * is an endpoint device. A more awkward example is a host bridge whose root
+ * ports get hot-added (technically possible, though unlikely).
+ *
+ * This is the locked variant of cxl_decoder_add().
+ *
+ * Context: Process context. Expects the device lock of the port that owns the
+ * @cxld to be held.
+ *
+ * Return: Negative error code if the decoder wasn't properly configured; else
+ * returns 0.
+ */
+int cxl_decoder_add_locked(struct cxl_decoder *cxld)
+{
+ struct cxl_port *port;
+ struct device *dev;
+ int rc;
+
+ if (WARN_ON_ONCE(!cxld))
+ return -EINVAL;
+
+ if (WARN_ON_ONCE(IS_ERR(cxld)))
+ return PTR_ERR(cxld);
+
+ if (cxld->interleave_ways < 1)
+ return -EINVAL;
+
+ dev = &cxld->dev;
+
+ port = to_cxl_port(cxld->dev.parent);
+ if (!is_endpoint_decoder(dev)) {
+ struct cxl_switch_decoder *cxlsd = to_cxl_switch_decoder(dev);
+
+ rc = decoder_populate_targets(cxlsd, port);
+ if (rc && (cxld->flags & CXL_DECODER_F_ENABLE)) {
+ dev_err(&port->dev,
+ "Failed to populate active decoder targets\n");
+ return rc;
+ }
+ }
+
+ rc = dev_set_name(dev, "decoder%d.%d", port->id, cxld->id);
+ if (rc)
+ return rc;
+
+ return device_add(dev);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_decoder_add_locked, "CXL");
+
+/**
+ * cxl_decoder_add - Add a decoder with targets
+ * @cxld: The cxl decoder allocated by cxl_<type>_decoder_alloc()
+ *
+ * This is the unlocked variant of cxl_decoder_add_locked().
+ * See cxl_decoder_add_locked().
+ *
+ * Context: Process context. Takes and releases the device lock of the port that
+ * owns the @cxld.
+ */
+int cxl_decoder_add(struct cxl_decoder *cxld)
+{
+ struct cxl_port *port;
+
+ if (WARN_ON_ONCE(!cxld))
+ return -EINVAL;
+
+ if (WARN_ON_ONCE(IS_ERR(cxld)))
+ return PTR_ERR(cxld);
+
+ port = to_cxl_port(cxld->dev.parent);
+
+ guard(device)(&port->dev);
+ return cxl_decoder_add_locked(cxld);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_decoder_add, "CXL");
+
+static void cxld_unregister(void *dev)
+{
+ if (is_endpoint_decoder(dev))
+ cxl_decoder_detach(NULL, to_cxl_endpoint_decoder(dev), -1,
+ DETACH_INVALIDATE);
+
+ device_unregister(dev);
+}
+
+int cxl_decoder_autoremove(struct device *host, struct cxl_decoder *cxld)
+{
+ return devm_add_action_or_reset(host, cxld_unregister, &cxld->dev);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_decoder_autoremove, "CXL");
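+
+/*
+ * Illustrative sketch (not part of this patch): the expected
+ * alloc -> configure -> add -> autoremove flow for a switch decoder.
+ * The function name, the hard-coded two-way interleave, and the example
+ * HPA range are assumptions; real callers derive these from the HDM
+ * decoder registers or platform firmware.
+ */
+static int __maybe_unused example_register_switch_decoder(struct device *host,
+ struct cxl_port *port)
+{
+ struct cxl_switch_decoder *cxlsd;
+ struct cxl_decoder *cxld;
+ int rc;
+
+ cxlsd = cxl_switch_decoder_alloc(port, 2);
+ if (IS_ERR(cxlsd))
+ return PTR_ERR(cxlsd);
+
+ cxld = &cxlsd->cxld;
+ cxld->interleave_ways = 2;
+ cxld->interleave_granularity = SZ_4K;
+ cxld->target_map[0] = 0;
+ cxld->target_map[1] = 1;
+ cxld->hpa_range = (struct range) {
+ .start = SZ_1G,
+ .end = SZ_1G + SZ_256M - 1,
+ };
+
+ rc = cxl_decoder_add(cxld);
+ if (rc) {
+ put_device(&cxld->dev);
+ return rc;
+ }
+
+ /* Unregister the decoder when @host is removed */
+ return cxl_decoder_autoremove(host, cxld);
+}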
+
+/**
+ * __cxl_driver_register - register a driver for the cxl bus
+ * @cxl_drv: cxl driver structure to attach
+ * @owner: owning module/driver
+ * @modname: KBUILD_MODNAME for parent driver
+ */
+int __cxl_driver_register(struct cxl_driver *cxl_drv, struct module *owner,
+ const char *modname)
+{
+ if (!cxl_drv->probe) {
+ pr_debug("%s ->probe() must be specified\n", modname);
+ return -EINVAL;
+ }
+
+ if (!cxl_drv->name) {
+ pr_debug("%s ->name must be specified\n", modname);
+ return -EINVAL;
+ }
+
+ if (!cxl_drv->id) {
+ pr_debug("%s ->id must be specified\n", modname);
+ return -EINVAL;
+ }
+
+ cxl_drv->drv.bus = &cxl_bus_type;
+ cxl_drv->drv.owner = owner;
+ cxl_drv->drv.mod_name = modname;
+ cxl_drv->drv.name = cxl_drv->name;
+
+ return driver_register(&cxl_drv->drv);
+}
+EXPORT_SYMBOL_NS_GPL(__cxl_driver_register, "CXL");
+
+void cxl_driver_unregister(struct cxl_driver *cxl_drv)
+{
+ driver_unregister(&cxl_drv->drv);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_driver_unregister, "CXL");
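+
+/*
+ * Illustrative sketch (not part of this patch): the minimum a cxl bus
+ * driver must provide before registration. The probe stub, driver name,
+ * and the choice of CXL_DEVICE_PORT as the device id are assumptions
+ * for the example; the real id definitions live in cxl.h.
+ */
+static int __maybe_unused example_probe(struct device *dev)
+{
+ return 0;
+}
+
+static struct cxl_driver example_cxl_driver __maybe_unused = {
+ .name = "example_cxl",
+ .probe = example_probe,
+ .id = CXL_DEVICE_PORT,
+};
+
+static int __maybe_unused example_register(void)
+{
+ /* cxl_driver_register() in cxl.h wraps this with THIS_MODULE */
+ return __cxl_driver_register(&example_cxl_driver, THIS_MODULE,
+ KBUILD_MODNAME);
+}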
+
+static int cxl_bus_uevent(const struct device *dev, struct kobj_uevent_env *env)
+{
+ return add_uevent_var(env, "MODALIAS=" CXL_MODALIAS_FMT,
+ cxl_device_id(dev));
+}
+
+static int cxl_bus_match(struct device *dev, const struct device_driver *drv)
+{
+ return cxl_device_id(dev) == to_cxl_drv(drv)->id;
+}
+
+static int cxl_bus_probe(struct device *dev)
+{
+ int rc;
+
+ rc = to_cxl_drv(dev->driver)->probe(dev);
+ dev_dbg(dev, "probe: %d\n", rc);
+ return rc;
+}
+
+static void cxl_bus_remove(struct device *dev)
+{
+ struct cxl_driver *cxl_drv = to_cxl_drv(dev->driver);
+
+ if (cxl_drv->remove)
+ cxl_drv->remove(dev);
+}
+
+static struct workqueue_struct *cxl_bus_wq;
+
+static int cxl_rescan_attach(struct device *dev, void *data)
+{
+ int rc = device_attach(dev);
+
+ dev_vdbg(dev, "rescan: %s\n", rc ? "attach" : "detached");
+
+ return 0;
+}
+
+static void cxl_bus_rescan_queue(struct work_struct *w)
+{
+ bus_for_each_dev(&cxl_bus_type, NULL, NULL, cxl_rescan_attach);
+}
+
+void cxl_bus_rescan(void)
+{
+ static DECLARE_WORK(rescan_work, cxl_bus_rescan_queue);
+
+ queue_work(cxl_bus_wq, &rescan_work);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_bus_rescan, "CXL");
+
+void cxl_bus_drain(void)
+{
+ drain_workqueue(cxl_bus_wq);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_bus_drain, "CXL");
+
+bool schedule_cxl_memdev_detach(struct cxl_memdev *cxlmd)
+{
+ return queue_work(cxl_bus_wq, &cxlmd->detach_work);
+}
+EXPORT_SYMBOL_NS_GPL(schedule_cxl_memdev_detach, "CXL");
+
+static void add_latency(struct access_coordinate *c, long latency)
+{
+ for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) {
+ c[i].write_latency += latency;
+ c[i].read_latency += latency;
+ }
+}
+
+static bool coordinates_valid(struct access_coordinate *c)
+{
+ for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) {
+ if (c[i].read_bandwidth && c[i].write_bandwidth &&
+ c[i].read_latency && c[i].write_latency)
+ continue;
+ return false;
+ }
+
+ return true;
+}
+
+static void set_min_bandwidth(struct access_coordinate *c, unsigned int bw)
+{
+ for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) {
+ c[i].write_bandwidth = min(c[i].write_bandwidth, bw);
+ c[i].read_bandwidth = min(c[i].read_bandwidth, bw);
+ }
+}
+
+static void set_access_coordinates(struct access_coordinate *out,
+ struct access_coordinate *in)
+{
+ for (int i = 0; i < ACCESS_COORDINATE_MAX; i++)
+ out[i] = in[i];
+}
+
+static bool parent_port_is_cxl_root(struct cxl_port *port)
+{
+ return is_cxl_root(to_cxl_port(port->dev.parent));
+}
+
+/**
+ * cxl_endpoint_get_perf_coordinates - Retrieve performance numbers stored in dports
+ * of CXL path
+ * @port: endpoint cxl_port
+ * @coord: output performance data
+ *
+ * Return: errno on failure, 0 on success.
+ */
+int cxl_endpoint_get_perf_coordinates(struct cxl_port *port,
+ struct access_coordinate *coord)
+{
+ struct cxl_memdev *cxlmd = to_cxl_memdev(port->uport_dev);
+ struct access_coordinate c[] = {
+ {
+ .read_bandwidth = UINT_MAX,
+ .write_bandwidth = UINT_MAX,
+ },
+ {
+ .read_bandwidth = UINT_MAX,
+ .write_bandwidth = UINT_MAX,
+ },
+ };
+ struct cxl_port *iter = port;
+ struct cxl_dport *dport;
+ struct pci_dev *pdev;
+ struct device *dev;
+ unsigned int bw;
+ bool is_cxl_root;
+
+ if (!is_cxl_endpoint(port))
+ return -EINVAL;
+
+ /*
+ * Skip the calculation for an RCD. The expectation is that HMAT already
+ * covers the RCD case since an RCH does not support hotplug.
+ */
+ if (cxlmd->cxlds->rcd)
+ return 0;
+
+ /*
+ * Exit the loop when the parent port of the current iter port is the
+ * CXL root. The iterative loop starts at the endpoint and gathers the
+ * latency of the CXL link from the current device/port to the connected
+ * downstream port on each iteration.
+ */
+ do {
+ dport = iter->parent_dport;
+ iter = to_cxl_port(iter->dev.parent);
+ is_cxl_root = parent_port_is_cxl_root(iter);
+
+ /*
+ * There is no valid access_coordinate for a root port since root ports
+ * do not have CDAT, so they need to be skipped.
+ */
+ if (!is_cxl_root) {
+ if (!coordinates_valid(dport->coord))
+ return -EINVAL;
+ cxl_coordinates_combine(c, c, dport->coord);
+ }
+ add_latency(c, dport->link_latency);
+ } while (!is_cxl_root);
+
+ dport = iter->parent_dport;
+ /* Retrieve HB coords */
+ if (!coordinates_valid(dport->coord))
+ return -EINVAL;
+ cxl_coordinates_combine(c, c, dport->coord);
+
+ dev = port->uport_dev->parent;
+ if (!dev_is_pci(dev))
+ return -ENODEV;
+
+ /* Get the calculated PCI paths bandwidth */
+ pdev = to_pci_dev(dev);
+ bw = pcie_bandwidth_available(pdev, NULL, NULL, NULL);
+ if (bw == 0)
+ return -ENXIO;
+ bw /= BITS_PER_BYTE;
+
+ set_min_bandwidth(c, bw);
+ set_access_coordinates(coord, c);
+
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_endpoint_get_perf_coordinates, "CXL");
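+
+/*
+ * Illustrative sketch (not part of this patch): querying the aggregated
+ * path performance for an endpoint port. The function and variable names
+ * are assumptions for the example.
+ */
+static void __maybe_unused example_report_perf(struct cxl_port *endpoint)
+{
+ struct access_coordinate coord[ACCESS_COORDINATE_MAX];
+
+ /* Only valid for endpoint ports; returns -EINVAL otherwise */
+ if (cxl_endpoint_get_perf_coordinates(endpoint, coord))
+ return;
+
+ dev_dbg(&endpoint->dev, "cpu read bandwidth: %u read latency: %u\n",
+ coord[ACCESS_COORDINATE_CPU].read_bandwidth,
+ coord[ACCESS_COORDINATE_CPU].read_latency);
+}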
+
+int cxl_port_get_switch_dport_bandwidth(struct cxl_port *port,
+ struct access_coordinate *c)
+{
+ struct cxl_dport *dport = port->parent_dport;
+
+ /* Check this port is connected to a switch DSP and not an RP */
+ if (parent_port_is_cxl_root(to_cxl_port(port->dev.parent)))
+ return -ENODEV;
+
+ if (!coordinates_valid(dport->coord))
+ return -EINVAL;
+
+ for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) {
+ c[i].read_bandwidth = dport->coord[i].read_bandwidth;
+ c[i].write_bandwidth = dport->coord[i].write_bandwidth;
+ }
+
+ return 0;
+}
+
+/* for user tooling to ensure port disable work has completed */
+static ssize_t flush_store(const struct bus_type *bus, const char *buf, size_t count)
+{
+ if (sysfs_streq(buf, "1")) {
+ flush_workqueue(cxl_bus_wq);
+ return count;
+ }
+
+ return -EINVAL;
+}
+
+static BUS_ATTR_WO(flush);
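+
+/*
+ * Usage note (illustrative): with the attribute above exposed via
+ * cxl_bus_attribute_groups, user tooling can wait for queued port
+ * teardown work with: echo 1 > /sys/bus/cxl/flush
+ */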
+
+static struct attribute *cxl_bus_attributes[] = {
+ &bus_attr_flush.attr,
+ NULL,
+};
+
+static struct attribute_group cxl_bus_attribute_group = {
+ .attrs = cxl_bus_attributes,
+};
+
+static const struct attribute_group *cxl_bus_attribute_groups[] = {
+ &cxl_bus_attribute_group,
+ NULL,
+};
+
+const struct bus_type cxl_bus_type = {
+ .name = "cxl",
+ .uevent = cxl_bus_uevent,
+ .match = cxl_bus_match,
+ .probe = cxl_bus_probe,
+ .remove = cxl_bus_remove,
+ .bus_groups = cxl_bus_attribute_groups,
+};
+EXPORT_SYMBOL_NS_GPL(cxl_bus_type, "CXL");
+
+static struct dentry *cxl_debugfs;
+
+struct dentry *cxl_debugfs_create_dir(const char *dir)
+{
+ return debugfs_create_dir(dir, cxl_debugfs);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_debugfs_create_dir, "CXL");
+
+static __init int cxl_core_init(void)
+{
+ int rc;
+
+ cxl_debugfs = debugfs_create_dir("cxl", NULL);
+
+ if (einj_cxl_is_initialized())
+ debugfs_create_file("einj_types", 0400, cxl_debugfs, NULL,
+ &einj_cxl_available_error_type_fops);
+
+ cxl_mbox_init();
+
+ rc = cxl_memdev_init();
+ if (rc)
+ return rc;
+
+ cxl_bus_wq = alloc_ordered_workqueue("cxl_port", 0);
+ if (!cxl_bus_wq) {
+ rc = -ENOMEM;
+ goto err_wq;
+ }
+
+ rc = bus_register(&cxl_bus_type);
+ if (rc)
+ goto err_bus;
+
+ rc = cxl_region_init();
+ if (rc)
+ goto err_region;
+
+ rc = cxl_ras_init();
+ if (rc)
+ goto err_ras;
+
+ return 0;
+
+err_ras:
+ cxl_region_exit();
+err_region:
+ bus_unregister(&cxl_bus_type);
+err_bus:
+ destroy_workqueue(cxl_bus_wq);
+err_wq:
+ cxl_memdev_exit();
+ return rc;
+}
+
+static void cxl_core_exit(void)
+{
+ cxl_ras_exit();
+ cxl_region_exit();
+ bus_unregister(&cxl_bus_type);
+ destroy_workqueue(cxl_bus_wq);
+ cxl_memdev_exit();
+ debugfs_remove_recursive(cxl_debugfs);
+}
+
+subsys_initcall(cxl_core_init);
+module_exit(cxl_core_exit);
+MODULE_DESCRIPTION("CXL: Core Compute Express Link support");
+MODULE_LICENSE("GPL v2");
+MODULE_IMPORT_NS("CXL");
diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c
new file mode 100644
index 000000000000..2731ba3a0799
--- /dev/null
+++ b/drivers/cxl/core/ras.c
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2025 AMD Corporation. All rights reserved. */
+
+#include <linux/pci.h>
+#include <linux/aer.h>
+#include <cxl/event.h>
+#include <cxlmem.h>
+#include "trace.h"
+
+static void cxl_cper_trace_corr_port_prot_err(struct pci_dev *pdev,
+ struct cxl_ras_capability_regs ras_cap)
+{
+ u32 status = ras_cap.cor_status & ~ras_cap.cor_mask;
+
+ trace_cxl_port_aer_correctable_error(&pdev->dev, status);
+}
+
+static void cxl_cper_trace_uncorr_port_prot_err(struct pci_dev *pdev,
+ struct cxl_ras_capability_regs ras_cap)
+{
+ u32 status = ras_cap.uncor_status & ~ras_cap.uncor_mask;
+ u32 fe;
+
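+	/*
+	 * With multiple uncorrectable errors pending, report the one recorded
+	 * by the First Error Pointer in the RAS Capability Control register;
+	 * otherwise the lone status bit is the first error.
+	 */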
+ if (hweight32(status) > 1)
+ fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
+ ras_cap.cap_control));
+ else
+ fe = status;
+
+ trace_cxl_port_aer_uncorrectable_error(&pdev->dev, status, fe,
+ ras_cap.header_log);
+}
+
+static void cxl_cper_trace_corr_prot_err(struct cxl_memdev *cxlmd,
+ struct cxl_ras_capability_regs ras_cap)
+{
+ u32 status = ras_cap.cor_status & ~ras_cap.cor_mask;
+
+ trace_cxl_aer_correctable_error(cxlmd, status);
+}
+
+static void
+cxl_cper_trace_uncorr_prot_err(struct cxl_memdev *cxlmd,
+ struct cxl_ras_capability_regs ras_cap)
+{
+ u32 status = ras_cap.uncor_status & ~ras_cap.uncor_mask;
+ u32 fe;
+
+ if (hweight32(status) > 1)
+ fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK,
+ ras_cap.cap_control));
+ else
+ fe = status;
+
+ trace_cxl_aer_uncorrectable_error(cxlmd, status, fe,
+ ras_cap.header_log);
+}
+
+static int match_memdev_by_parent(struct device *dev, const void *uport)
+{
+ if (is_cxl_memdev(dev) && dev->parent == uport)
+ return 1;
+ return 0;
+}
+
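+/*
+ * Handle one CPER CXL protocol error record. Records are queued to a kfifo
+ * by the firmware-first (APEI/GHES) notification path and drained here in
+ * process context via cxl_cper_prot_err_work.
+ */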
+static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data)
+{
+ unsigned int devfn = PCI_DEVFN(data->prot_err.agent_addr.device,
+ data->prot_err.agent_addr.function);
+ struct pci_dev *pdev __free(pci_dev_put) =
+ pci_get_domain_bus_and_slot(data->prot_err.agent_addr.segment,
+ data->prot_err.agent_addr.bus,
+ devfn);
+ struct cxl_memdev *cxlmd;
+ int port_type;
+
+ if (!pdev)
+ return;
+
+ port_type = pci_pcie_type(pdev);
+ if (port_type == PCI_EXP_TYPE_ROOT_PORT ||
+ port_type == PCI_EXP_TYPE_DOWNSTREAM ||
+ port_type == PCI_EXP_TYPE_UPSTREAM) {
+ if (data->severity == AER_CORRECTABLE)
+ cxl_cper_trace_corr_port_prot_err(pdev, data->ras_cap);
+ else
+ cxl_cper_trace_uncorr_port_prot_err(pdev, data->ras_cap);
+
+ return;
+ }
+
+ guard(device)(&pdev->dev);
+ if (!pdev->dev.driver)
+ return;
+
+ struct device *mem_dev __free(put_device) = bus_find_device(
+ &cxl_bus_type, NULL, pdev, match_memdev_by_parent);
+ if (!mem_dev)
+ return;
+
+ cxlmd = to_cxl_memdev(mem_dev);
+ if (data->severity == AER_CORRECTABLE)
+ cxl_cper_trace_corr_prot_err(cxlmd, data->ras_cap);
+ else
+ cxl_cper_trace_uncorr_prot_err(cxlmd, data->ras_cap);
+}
+
+static void cxl_cper_prot_err_work_fn(struct work_struct *work)
+{
+ struct cxl_cper_prot_err_work_data wd;
+
+ while (cxl_cper_prot_err_kfifo_get(&wd))
+ cxl_cper_handle_prot_err(&wd);
+}
+static DECLARE_WORK(cxl_cper_prot_err_work, cxl_cper_prot_err_work_fn);
+
+int cxl_ras_init(void)
+{
+ return cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work);
+}
+
+void cxl_ras_exit(void)
+{
+ cxl_cper_unregister_prot_err_work(&cxl_cper_prot_err_work);
+ cancel_work_sync(&cxl_cper_prot_err_work);
+}
diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
new file mode 100644
index 000000000000..ae899f68551f
--- /dev/null
+++ b/drivers/cxl/core/region.c
@@ -0,0 +1,4003 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2022 Intel Corporation. All rights reserved. */
+#include <linux/memregion.h>
+#include <linux/genalloc.h>
+#include <linux/debugfs.h>
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/memory.h>
+#include <linux/slab.h>
+#include <linux/uuid.h>
+#include <linux/sort.h>
+#include <linux/idr.h>
+#include <linux/memory-tiers.h>
+#include <linux/string_choices.h>
+#include <cxlmem.h>
+#include <cxl.h>
+#include "core.h"
+
+/**
+ * DOC: cxl core region
+ *
+ * CXL Regions represent mapped memory capacity in system physical address
+ * space. Whereas the CXL Root Decoders identify the bounds of potential CXL
+ * Memory ranges, Regions represent the active mapped capacity by the HDM
+ * Decoder Capability structures throughout the Host Bridges, Switches, and
+ * Endpoints in the topology.
+ *
+ * Region configuration has ordering constraints. UUID may be set at any time
+ * but is only visible for persistent regions.
+ * 1. Interleave granularity
+ * 2. Interleave size
+ * 3. Decoder targets
+ */
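+
+/*
+ * A minimal sketch of that ordering via sysfs (illustrative names; the region
+ * itself is first allocated from a root decoder's create_pmem_region or
+ * create_ram_region attribute):
+ *
+ *   cd /sys/bus/cxl/devices/region0
+ *   uuidgen > uuid                      # pmem regions only
+ *   echo 256 > interleave_granularity
+ *   echo 2 > interleave_ways
+ *   echo $((512 << 20)) > size
+ *   echo decoder3.0 > target0
+ *   echo decoder4.0 > target1
+ *   echo 1 > commit
+ */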
+
+/*
+ * nodemask that records, per node, whether the access_coordinates for the
+ * node have been updated by the CXL memory hotplug notifier.
+ */
+static nodemask_t nodemask_region_seen = NODE_MASK_NONE;
+
+static struct cxl_region *to_cxl_region(struct device *dev);
+
+#define __ACCESS_ATTR_RO(_level, _name) { \
+ .attr = { .name = __stringify(_name), .mode = 0444 }, \
+ .show = _name##_access##_level##_show, \
+}
+
+#define ACCESS_DEVICE_ATTR_RO(level, name) \
+ struct device_attribute dev_attr_access##level##_##name = __ACCESS_ATTR_RO(level, name)
+
+#define ACCESS_ATTR_RO(level, attrib) \
+static ssize_t attrib##_access##level##_show(struct device *dev, \
+ struct device_attribute *attr, \
+ char *buf) \
+{ \
+ struct cxl_region *cxlr = to_cxl_region(dev); \
+ \
+ if (cxlr->coord[level].attrib == 0) \
+ return -ENOENT; \
+ \
+ return sysfs_emit(buf, "%u\n", cxlr->coord[level].attrib); \
+} \
+static ACCESS_DEVICE_ATTR_RO(level, attrib)
+
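+/*
+ * These helpers expand to per-region sysfs attributes; e.g.
+ * ACCESS_ATTR_RO(0, read_bandwidth) surfaces as the region's
+ * access0/read_bandwidth attribute via the "access0" / "access1" groups
+ * below.
+ */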
+ACCESS_ATTR_RO(0, read_bandwidth);
+ACCESS_ATTR_RO(0, read_latency);
+ACCESS_ATTR_RO(0, write_bandwidth);
+ACCESS_ATTR_RO(0, write_latency);
+
+#define ACCESS_ATTR_DECLARE(level, attrib) \
+ (&dev_attr_access##level##_##attrib.attr)
+
+static struct attribute *access0_coordinate_attrs[] = {
+ ACCESS_ATTR_DECLARE(0, read_bandwidth),
+ ACCESS_ATTR_DECLARE(0, write_bandwidth),
+ ACCESS_ATTR_DECLARE(0, read_latency),
+ ACCESS_ATTR_DECLARE(0, write_latency),
+ NULL
+};
+
+ACCESS_ATTR_RO(1, read_bandwidth);
+ACCESS_ATTR_RO(1, read_latency);
+ACCESS_ATTR_RO(1, write_bandwidth);
+ACCESS_ATTR_RO(1, write_latency);
+
+static struct attribute *access1_coordinate_attrs[] = {
+ ACCESS_ATTR_DECLARE(1, read_bandwidth),
+ ACCESS_ATTR_DECLARE(1, write_bandwidth),
+ ACCESS_ATTR_DECLARE(1, read_latency),
+ ACCESS_ATTR_DECLARE(1, write_latency),
+ NULL
+};
+
+#define ACCESS_VISIBLE(level) \
+static umode_t cxl_region_access##level##_coordinate_visible( \
+ struct kobject *kobj, struct attribute *a, int n) \
+{ \
+ struct device *dev = kobj_to_dev(kobj); \
+ struct cxl_region *cxlr = to_cxl_region(dev); \
+ \
+ if (a == &dev_attr_access##level##_read_latency.attr && \
+ cxlr->coord[level].read_latency == 0) \
+ return 0; \
+ \
+ if (a == &dev_attr_access##level##_write_latency.attr && \
+ cxlr->coord[level].write_latency == 0) \
+ return 0; \
+ \
+ if (a == &dev_attr_access##level##_read_bandwidth.attr && \
+ cxlr->coord[level].read_bandwidth == 0) \
+ return 0; \
+ \
+ if (a == &dev_attr_access##level##_write_bandwidth.attr && \
+ cxlr->coord[level].write_bandwidth == 0) \
+ return 0; \
+ \
+ return a->mode; \
+}
+
+ACCESS_VISIBLE(0);
+ACCESS_VISIBLE(1);
+
+static const struct attribute_group cxl_region_access0_coordinate_group = {
+ .name = "access0",
+ .attrs = access0_coordinate_attrs,
+ .is_visible = cxl_region_access0_coordinate_visible,
+};
+
+static const struct attribute_group *get_cxl_region_access0_group(void)
+{
+ return &cxl_region_access0_coordinate_group;
+}
+
+static const struct attribute_group cxl_region_access1_coordinate_group = {
+ .name = "access1",
+ .attrs = access1_coordinate_attrs,
+ .is_visible = cxl_region_access1_coordinate_visible,
+};
+
+static const struct attribute_group *get_cxl_region_access1_group(void)
+{
+ return &cxl_region_access1_coordinate_group;
+}
+
+static ssize_t uuid_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct cxl_region *cxlr = to_cxl_region(dev);
+ struct cxl_region_params *p = &cxlr->params;
+ ssize_t rc;
+
+ ACQUIRE(rwsem_read_intr, region_rwsem)(&cxl_rwsem.region);
+ if ((rc = ACQUIRE_ERR(rwsem_read_intr, &region_rwsem)))
+ return rc;
+ if (cxlr->mode != CXL_PARTMODE_PMEM)
+ return sysfs_emit(buf, "\n");
+ return sysfs_emit(buf, "%pUb\n", &p->uuid);
+}
+
+static int is_dup(struct device *match, void *data)
+{
+ struct cxl_region_params *p;
+ struct cxl_region *cxlr;
+ uuid_t *uuid = data;
+
+ if (!is_cxl_region(match))
+ return 0;
+
+ lockdep_assert_held(&cxl_rwsem.region);
+ cxlr = to_cxl_region(match);
+ p = &cxlr->params;
+
+ if (uuid_equal(&p->uuid, uuid)) {
+ dev_dbg(match, "already has uuid: %pUb\n", uuid);
+ return -EBUSY;
+ }
+
+ return 0;
+}
+
+static ssize_t uuid_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ struct cxl_region *cxlr = to_cxl_region(dev);
+ struct cxl_region_params *p = &cxlr->params;
+ uuid_t temp;
+ ssize_t rc;
+
+ if (len != UUID_STRING_LEN + 1)
+ return -EINVAL;
+
+ rc = uuid_parse(buf, &temp);
+ if (rc)
+ return rc;
+
+ if (uuid_is_null(&temp))
+ return -EINVAL;
+
+ ACQUIRE(rwsem_write_kill, region_rwsem)(&cxl_rwsem.region);
+ if ((rc = ACQUIRE_ERR(rwsem_write_kill, &region_rwsem)))
+ return rc;
+
+ if (uuid_equal(&p->uuid, &temp))
+ return len;
+
+ if (p->state >= CXL_CONFIG_ACTIVE)
+ return -EBUSY;
+
+ rc = bus_for_each_dev(&cxl_bus_type, NULL, &temp, is_dup);
+ if (rc < 0)
+ return rc;
+
+ uuid_copy(&p->uuid, &temp);
+
+ return len;
+}
+static DEVICE_ATTR_RW(uuid);
+
+static struct cxl_region_ref *cxl_rr_load(struct cxl_port *port,
+ struct cxl_region *cxlr)
+{
+ return xa_load(&port->regions, (unsigned long)cxlr);
+}
+
+static int cxl_region_invalidate_memregion(struct cxl_region *cxlr)
+{
+ if (!cpu_cache_has_invalidate_memregion()) {
+ if (IS_ENABLED(CONFIG_CXL_REGION_INVALIDATION_TEST)) {
+ dev_info_once(
+ &cxlr->dev,
+ "Bypassing cpu_cache_invalidate_memregion() for testing!\n");
+ return 0;
+ }
+ dev_WARN(&cxlr->dev,
+ "Failed to synchronize CPU cache state\n");
+ return -ENXIO;
+ }
+
+ if (!cxlr->params.res)
+ return -ENXIO;
+ cpu_cache_invalidate_memregion(cxlr->params.res->start,
+ resource_size(cxlr->params.res));
+ return 0;
+}
+
+static void cxl_region_decode_reset(struct cxl_region *cxlr, int count)
+{
+ struct cxl_region_params *p = &cxlr->params;
+ int i;
+
+ if (test_bit(CXL_REGION_F_LOCK, &cxlr->flags))
+ return;
+
+ /*
+	 * Before region teardown, attempt to flush and evict any data cached
+	 * for this region, or scream loudly about missing arch / platform support
+ * for CXL teardown.
+ */
+ cxl_region_invalidate_memregion(cxlr);
+
+ for (i = count - 1; i >= 0; i--) {
+ struct cxl_endpoint_decoder *cxled = p->targets[i];
+ struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+ struct cxl_port *iter = cxled_to_port(cxled);
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+ struct cxl_ep *ep;
+
+ if (cxlds->rcd)
+ goto endpoint_reset;
+
+ while (!is_cxl_root(to_cxl_port(iter->dev.parent)))
+ iter = to_cxl_port(iter->dev.parent);
+
+ for (ep = cxl_ep_load(iter, cxlmd); iter;
+ iter = ep->next, ep = cxl_ep_load(iter, cxlmd)) {
+ struct cxl_region_ref *cxl_rr;
+ struct cxl_decoder *cxld;
+
+ cxl_rr = cxl_rr_load(iter, cxlr);
+ cxld = cxl_rr->decoder;
+ if (cxld->reset)
+ cxld->reset(cxld);
+ set_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags);
+ }
+
+endpoint_reset:
+ cxled->cxld.reset(&cxled->cxld);
+ set_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags);
+ }
+
+ /* all decoders associated with this region have been torn down */
+ clear_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags);
+}
+
+static int commit_decoder(struct cxl_decoder *cxld)
+{
+ struct cxl_switch_decoder *cxlsd = NULL;
+
+ if (cxld->commit)
+ return cxld->commit(cxld);
+
+ if (is_switch_decoder(&cxld->dev))
+ cxlsd = to_cxl_switch_decoder(&cxld->dev);
+
+ if (dev_WARN_ONCE(&cxld->dev, !cxlsd || cxlsd->nr_targets > 1,
+ "->commit() is required\n"))
+ return -ENXIO;
+ return 0;
+}
+
+static int cxl_region_decode_commit(struct cxl_region *cxlr)
+{
+ struct cxl_region_params *p = &cxlr->params;
+ int i, rc = 0;
+
+ for (i = 0; i < p->nr_targets; i++) {
+ struct cxl_endpoint_decoder *cxled = p->targets[i];
+ struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+ struct cxl_region_ref *cxl_rr;
+ struct cxl_decoder *cxld;
+ struct cxl_port *iter;
+ struct cxl_ep *ep;
+
+ /* commit bottom up */
+ for (iter = cxled_to_port(cxled); !is_cxl_root(iter);
+ iter = to_cxl_port(iter->dev.parent)) {
+ cxl_rr = cxl_rr_load(iter, cxlr);
+ cxld = cxl_rr->decoder;
+ rc = commit_decoder(cxld);
+ if (rc)
+ break;
+ }
+
+ if (rc) {
+ /* programming @iter failed, teardown */
+ for (ep = cxl_ep_load(iter, cxlmd); ep && iter;
+ iter = ep->next, ep = cxl_ep_load(iter, cxlmd)) {
+ cxl_rr = cxl_rr_load(iter, cxlr);
+ cxld = cxl_rr->decoder;
+ if (cxld->reset)
+ cxld->reset(cxld);
+ }
+
+ cxled->cxld.reset(&cxled->cxld);
+ goto err;
+ }
+ }
+
+ return 0;
+
+err:
+ /* undo the targets that were successfully committed */
+ cxl_region_decode_reset(cxlr, i);
+ return rc;
+}
+
+static int queue_reset(struct cxl_region *cxlr)
+{
+ struct cxl_region_params *p = &cxlr->params;
+ int rc;
+
+ ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region);
+ if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem)))
+ return rc;
+
+ /* Already in the requested state? */
+ if (p->state < CXL_CONFIG_COMMIT)
+ return 0;
+
+ p->state = CXL_CONFIG_RESET_PENDING;
+
+ return 0;
+}
+
+static int __commit(struct cxl_region *cxlr)
+{
+ struct cxl_region_params *p = &cxlr->params;
+ int rc;
+
+ ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region);
+ if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem)))
+ return rc;
+
+ /* Already in the requested state? */
+ if (p->state >= CXL_CONFIG_COMMIT)
+ return 0;
+
+ /* Not ready to commit? */
+ if (p->state < CXL_CONFIG_ACTIVE)
+ return -ENXIO;
+
+ /*
+ * Invalidate caches before region setup to drop any speculative
+ * consumption of this address space
+ */
+ rc = cxl_region_invalidate_memregion(cxlr);
+ if (rc)
+ return rc;
+
+ rc = cxl_region_decode_commit(cxlr);
+ if (rc)
+ return rc;
+
+ p->state = CXL_CONFIG_COMMIT;
+
+ return 0;
+}
+
+static ssize_t commit_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ struct cxl_region *cxlr = to_cxl_region(dev);
+ struct cxl_region_params *p = &cxlr->params;
+ bool commit;
+ ssize_t rc;
+
+ rc = kstrtobool(buf, &commit);
+ if (rc)
+ return rc;
+
+ if (commit) {
+ rc = __commit(cxlr);
+ if (rc)
+ return rc;
+ return len;
+ }
+
+ if (test_bit(CXL_REGION_F_LOCK, &cxlr->flags))
+ return -EPERM;
+
+ rc = queue_reset(cxlr);
+ if (rc)
+ return rc;
+
+ /*
+	 * Unmap the region and depend on the reset-pending state to ensure
+ * it does not go active again until post reset
+ */
+ device_release_driver(&cxlr->dev);
+
+ /*
+ * With the reset pending take cxl_rwsem.region unconditionally
+ * to ensure the reset gets handled before returning.
+ */
+ guard(rwsem_write)(&cxl_rwsem.region);
+
+ /*
+ * Revalidate that the reset is still pending in case another
+ * thread already handled this reset.
+ */
+ if (p->state == CXL_CONFIG_RESET_PENDING) {
+ cxl_region_decode_reset(cxlr, p->interleave_ways);
+ p->state = CXL_CONFIG_ACTIVE;
+ }
+
+ return len;
+}
+
+static ssize_t commit_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct cxl_region *cxlr = to_cxl_region(dev);
+ struct cxl_region_params *p = &cxlr->params;
+ ssize_t rc;
+
+ ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
+ if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
+ return rc;
+ return sysfs_emit(buf, "%d\n", p->state >= CXL_CONFIG_COMMIT);
+}
+static DEVICE_ATTR_RW(commit);
+
+static ssize_t interleave_ways_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct cxl_region *cxlr = to_cxl_region(dev);
+ struct cxl_region_params *p = &cxlr->params;
+ int rc;
+
+ ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
+ if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
+ return rc;
+ return sysfs_emit(buf, "%d\n", p->interleave_ways);
+}
+
+static const struct attribute_group *get_cxl_region_target_group(void);
+
+static ssize_t interleave_ways_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev->parent);
+ struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld;
+ struct cxl_region *cxlr = to_cxl_region(dev);
+ struct cxl_region_params *p = &cxlr->params;
+ unsigned int val, save;
+ int rc;
+ u8 iw;
+
+ rc = kstrtouint(buf, 0, &val);
+ if (rc)
+ return rc;
+
+ rc = ways_to_eiw(val, &iw);
+ if (rc)
+ return rc;
+
+ /*
+ * Even for x3, x6, and x12 interleaves the region interleave must be a
+ * power of 2 multiple of the host bridge interleave.
+ */
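+	/*
+	 * e.g. (illustrative) a x3 root decoder permits region ways of 3, 6,
+	 * or 12 (1x, 2x, 4x the host bridge interleave), while 9 is rejected
+	 * since 9 / 3 = 3 is not a power of 2.
+	 */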
+ if (!is_power_of_2(val / cxld->interleave_ways) ||
+ (val % cxld->interleave_ways)) {
+ dev_dbg(&cxlr->dev, "invalid interleave: %d\n", val);
+ return -EINVAL;
+ }
+
+ ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region);
+ if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem)))
+ return rc;
+
+ if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE)
+ return -EBUSY;
+
+ save = p->interleave_ways;
+ p->interleave_ways = val;
+ rc = sysfs_update_group(&cxlr->dev.kobj, get_cxl_region_target_group());
+ if (rc) {
+ p->interleave_ways = save;
+ return rc;
+ }
+
+ return len;
+}
+static DEVICE_ATTR_RW(interleave_ways);
+
+static ssize_t interleave_granularity_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct cxl_region *cxlr = to_cxl_region(dev);
+ struct cxl_region_params *p = &cxlr->params;
+ int rc;
+
+ ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
+ if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
+ return rc;
+ return sysfs_emit(buf, "%d\n", p->interleave_granularity);
+}
+
+static ssize_t interleave_granularity_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev->parent);
+ struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld;
+ struct cxl_region *cxlr = to_cxl_region(dev);
+ struct cxl_region_params *p = &cxlr->params;
+ int rc, val;
+ u16 ig;
+
+ rc = kstrtoint(buf, 0, &val);
+ if (rc)
+ return rc;
+
+ rc = granularity_to_eig(val, &ig);
+ if (rc)
+ return rc;
+
+ /*
+ * When the host-bridge is interleaved, disallow region granularity !=
+ * root granularity. Regions with a granularity less than the root
+ * interleave result in needing multiple endpoints to support a single
+ * slot in the interleave (possible to support in the future). Regions
+ * with a granularity greater than the root interleave result in invalid
+ * DPA translations (invalid to support).
+ */
+ if (cxld->interleave_ways > 1 && val != cxld->interleave_granularity)
+ return -EINVAL;
+
+ ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region);
+ if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem)))
+ return rc;
+
+ if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE)
+ return -EBUSY;
+
+ p->interleave_granularity = val;
+
+ return len;
+}
+static DEVICE_ATTR_RW(interleave_granularity);
+
+static ssize_t resource_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct cxl_region *cxlr = to_cxl_region(dev);
+ struct cxl_region_params *p = &cxlr->params;
+ u64 resource = -1ULL;
+ int rc;
+
+ ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
+ if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
+ return rc;
+
+ if (p->res)
+ resource = p->res->start;
+ return sysfs_emit(buf, "%#llx\n", resource);
+}
+static DEVICE_ATTR_RO(resource);
+
+static ssize_t mode_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct cxl_region *cxlr = to_cxl_region(dev);
+ const char *desc;
+
+ if (cxlr->mode == CXL_PARTMODE_RAM)
+ desc = "ram";
+ else if (cxlr->mode == CXL_PARTMODE_PMEM)
+ desc = "pmem";
+ else
+ desc = "";
+
+ return sysfs_emit(buf, "%s\n", desc);
+}
+static DEVICE_ATTR_RO(mode);
+
+static int alloc_hpa(struct cxl_region *cxlr, resource_size_t size)
+{
+ struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
+ struct cxl_region_params *p = &cxlr->params;
+ struct resource *res;
+ u64 remainder = 0;
+
+ lockdep_assert_held_write(&cxl_rwsem.region);
+
+ /* Nothing to do... */
+ if (p->res && resource_size(p->res) == size)
+ return 0;
+
+ /* To change size the old size must be freed first */
+ if (p->res)
+ return -EBUSY;
+
+ if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE)
+ return -EBUSY;
+
+ /* ways, granularity and uuid (if PMEM) need to be set before HPA */
+ if (!p->interleave_ways || !p->interleave_granularity ||
+ (cxlr->mode == CXL_PARTMODE_PMEM && uuid_is_null(&p->uuid)))
+ return -ENXIO;
+
+ div64_u64_rem(size, (u64)SZ_256M * p->interleave_ways, &remainder);
+ if (remainder)
+ return -EINVAL;
+
+ res = alloc_free_mem_region(cxlrd->res, size, SZ_256M,
+ dev_name(&cxlr->dev));
+ if (IS_ERR(res)) {
+ dev_dbg(&cxlr->dev,
+ "HPA allocation error (%ld) for size:%pap in %s %pr\n",
+ PTR_ERR(res), &size, cxlrd->res->name, cxlrd->res);
+ return PTR_ERR(res);
+ }
+
+ p->res = res;
+ p->state = CXL_CONFIG_INTERLEAVE_ACTIVE;
+
+ return 0;
+}
+
+static void cxl_region_iomem_release(struct cxl_region *cxlr)
+{
+ struct cxl_region_params *p = &cxlr->params;
+
+ if (device_is_registered(&cxlr->dev))
+ lockdep_assert_held_write(&cxl_rwsem.region);
+ if (p->res) {
+ /*
+ * Autodiscovered regions may not have been able to insert their
+ * resource.
+ */
+ if (p->res->parent)
+ remove_resource(p->res);
+ kfree(p->res);
+ p->res = NULL;
+ }
+}
+
+static int free_hpa(struct cxl_region *cxlr)
+{
+ struct cxl_region_params *p = &cxlr->params;
+
+ lockdep_assert_held_write(&cxl_rwsem.region);
+
+ if (!p->res)
+ return 0;
+
+ if (p->state >= CXL_CONFIG_ACTIVE)
+ return -EBUSY;
+
+ cxl_region_iomem_release(cxlr);
+ p->state = CXL_CONFIG_IDLE;
+ return 0;
+}
+
+static ssize_t size_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ struct cxl_region *cxlr = to_cxl_region(dev);
+ u64 val;
+ int rc;
+
+ rc = kstrtou64(buf, 0, &val);
+ if (rc)
+ return rc;
+
+ ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region);
+ if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem)))
+ return rc;
+
+ if (val)
+ rc = alloc_hpa(cxlr, val);
+ else
+ rc = free_hpa(cxlr);
+
+ if (rc)
+ return rc;
+
+ return len;
+}
+
+static ssize_t size_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct cxl_region *cxlr = to_cxl_region(dev);
+ struct cxl_region_params *p = &cxlr->params;
+ u64 size = 0;
+ ssize_t rc;
+
+ ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
+ if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
+ return rc;
+ if (p->res)
+ size = resource_size(p->res);
+ return sysfs_emit(buf, "%#llx\n", size);
+}
+static DEVICE_ATTR_RW(size);
+
+static ssize_t extended_linear_cache_size_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct cxl_region *cxlr = to_cxl_region(dev);
+ struct cxl_region_params *p = &cxlr->params;
+ ssize_t rc;
+
+ ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
+ if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
+ return rc;
+ return sysfs_emit(buf, "%#llx\n", p->cache_size);
+}
+static DEVICE_ATTR_RO(extended_linear_cache_size);
+
+static struct attribute *cxl_region_attrs[] = {
+ &dev_attr_uuid.attr,
+ &dev_attr_commit.attr,
+ &dev_attr_interleave_ways.attr,
+ &dev_attr_interleave_granularity.attr,
+ &dev_attr_resource.attr,
+ &dev_attr_size.attr,
+ &dev_attr_mode.attr,
+ &dev_attr_extended_linear_cache_size.attr,
+ NULL,
+};
+
+static umode_t cxl_region_visible(struct kobject *kobj, struct attribute *a,
+ int n)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct cxl_region *cxlr = to_cxl_region(dev);
+
+ /*
+ * Support tooling that expects to find a 'uuid' attribute for all
+ * regions regardless of mode.
+ */
+ if (a == &dev_attr_uuid.attr && cxlr->mode != CXL_PARTMODE_PMEM)
+ return 0444;
+
+ /*
+ * Don't display extended linear cache attribute if there is no
+ * extended linear cache.
+ */
+ if (a == &dev_attr_extended_linear_cache_size.attr &&
+ cxlr->params.cache_size == 0)
+ return 0;
+
+ return a->mode;
+}
+
+static const struct attribute_group cxl_region_group = {
+ .attrs = cxl_region_attrs,
+ .is_visible = cxl_region_visible,
+};
+
+static size_t show_targetN(struct cxl_region *cxlr, char *buf, int pos)
+{
+ struct cxl_region_params *p = &cxlr->params;
+ struct cxl_endpoint_decoder *cxled;
+ int rc;
+
+ ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
+ if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
+ return rc;
+
+ if (pos >= p->interleave_ways) {
+ dev_dbg(&cxlr->dev, "position %d out of range %d\n", pos,
+ p->interleave_ways);
+ return -ENXIO;
+ }
+
+ cxled = p->targets[pos];
+ if (!cxled)
+ return sysfs_emit(buf, "\n");
+ return sysfs_emit(buf, "%s\n", dev_name(&cxled->cxld.dev));
+}
+
+static int check_commit_order(struct device *dev, void *data)
+{
+ struct cxl_decoder *cxld = to_cxl_decoder(dev);
+
+ /*
+ * if port->commit_end is not the only free decoder, then out of
+ * order shutdown has occurred, block further allocations until
+ * that is resolved
+ */
+ if (((cxld->flags & CXL_DECODER_F_ENABLE) == 0))
+ return -EBUSY;
+ return 0;
+}
+
+static int match_free_decoder(struct device *dev, const void *data)
+{
+ struct cxl_port *port = to_cxl_port(dev->parent);
+ struct cxl_decoder *cxld;
+ int rc;
+
+ if (!is_switch_decoder(dev))
+ return 0;
+
+ cxld = to_cxl_decoder(dev);
+
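+	/*
+	 * Decoders must be allocated and committed in instance order; e.g. if
+	 * port->commit_end is 1, only the decoder with id 2 is eligible here.
+	 */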
+ if (cxld->id != port->commit_end + 1)
+ return 0;
+
+ if (cxld->region) {
+ dev_dbg(dev->parent,
+ "next decoder to commit (%s) is already reserved (%s)\n",
+ dev_name(dev), dev_name(&cxld->region->dev));
+ return 0;
+ }
+
+ rc = device_for_each_child_reverse_from(dev->parent, dev, NULL,
+ check_commit_order);
+ if (rc) {
+ dev_dbg(dev->parent,
+ "unable to allocate %s due to out of order shutdown\n",
+ dev_name(dev));
+ return 0;
+ }
+ return 1;
+}
+
+static bool spa_maps_hpa(const struct cxl_region_params *p,
+ const struct range *range)
+{
+ if (!p->res)
+ return false;
+
+ /*
+	 * The extended linear cache region is constructed with a 1:1 ratio
+ * where the SPA maps equal amounts of DRAM and CXL HPA capacity with
+ * CXL decoders at the high end of the SPA range.
+ */
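+	/*
+	 * e.g. (illustrative) a 2GB SPA window with a 1GB cache_size: DRAM
+	 * backs [res->start, res->start + 1GB) and the CXL decoders map the
+	 * upper half, so range->start == res->start + cache_size and
+	 * range->end == res->end.
+	 */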
+ return p->res->start + p->cache_size == range->start &&
+ p->res->end == range->end;
+}
+
+static int match_auto_decoder(struct device *dev, const void *data)
+{
+ const struct cxl_region_params *p = data;
+ struct cxl_decoder *cxld;
+ struct range *r;
+
+ if (!is_switch_decoder(dev))
+ return 0;
+
+ cxld = to_cxl_decoder(dev);
+ r = &cxld->hpa_range;
+
+ if (spa_maps_hpa(p, r))
+ return 1;
+
+ return 0;
+}
+
+/**
+ * cxl_port_pick_region_decoder() - assign or lookup a decoder for a region
+ * @port: a port in the ancestry of the endpoint implied by @cxled
+ * @cxled: endpoint decoder to be, or currently, mapped by @port
+ * @cxlr: region to establish, or validate, decode @port
+ *
+ * In the region creation path cxl_port_pick_region_decoder() is an
+ * allocator to find a free port. In the region assembly path, it is
+ * recalling the decoder that platform firmware picked for validation
+ * purposes.
+ *
+ * The result is recorded in a 'struct cxl_region_ref' in @port.
+ */
+static struct cxl_decoder *
+cxl_port_pick_region_decoder(struct cxl_port *port,
+ struct cxl_endpoint_decoder *cxled,
+ struct cxl_region *cxlr)
+{
+ struct device *dev;
+
+ if (port == cxled_to_port(cxled))
+ return &cxled->cxld;
+
+ if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags))
+ dev = device_find_child(&port->dev, &cxlr->params,
+ match_auto_decoder);
+ else
+ dev = device_find_child(&port->dev, NULL, match_free_decoder);
+ if (!dev)
+ return NULL;
+ /*
+	 * This decoder stays registered as long as the endpoint decoder is
+ * registered, and endpoint decoder unregistration holds the
+ * cxl_rwsem.region over unregister events, so no need to hold on to
+ * this extra reference.
+ */
+ put_device(dev);
+ return to_cxl_decoder(dev);
+}
+
+static bool auto_order_ok(struct cxl_port *port, struct cxl_region *cxlr_iter,
+ struct cxl_decoder *cxld)
+{
+ struct cxl_region_ref *rr = cxl_rr_load(port, cxlr_iter);
+ struct cxl_decoder *cxld_iter = rr->decoder;
+
+ /*
+ * Allow the out of order assembly of auto-discovered regions.
+ * Per CXL Spec 3.1 8.2.4.20.12 software must commit decoders
+ * in HPA order. Confirm that the decoder with the lesser HPA
+ * starting address has the lesser id.
+ */
+ dev_dbg(&cxld->dev, "check for HPA violation %s:%d < %s:%d\n",
+ dev_name(&cxld->dev), cxld->id,
+ dev_name(&cxld_iter->dev), cxld_iter->id);
+
+ if (cxld_iter->id > cxld->id)
+ return true;
+
+ return false;
+}
+
+static struct cxl_region_ref *
+alloc_region_ref(struct cxl_port *port, struct cxl_region *cxlr,
+ struct cxl_endpoint_decoder *cxled,
+ struct cxl_decoder *cxld)
+{
+ struct cxl_region_params *p = &cxlr->params;
+ struct cxl_region_ref *cxl_rr, *iter;
+ unsigned long index;
+ int rc;
+
+ xa_for_each(&port->regions, index, iter) {
+ struct cxl_region_params *ip = &iter->region->params;
+
+ if (!ip->res || ip->res->start < p->res->start)
+ continue;
+
+ if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) {
+ if (auto_order_ok(port, iter->region, cxld))
+ continue;
+ }
+ dev_dbg(&cxlr->dev, "%s: HPA order violation %s:%pr vs %pr\n",
+ dev_name(&port->dev),
+ dev_name(&iter->region->dev), ip->res, p->res);
+
+ return ERR_PTR(-EBUSY);
+ }
+
+ cxl_rr = kzalloc(sizeof(*cxl_rr), GFP_KERNEL);
+ if (!cxl_rr)
+ return ERR_PTR(-ENOMEM);
+ cxl_rr->port = port;
+ cxl_rr->region = cxlr;
+ cxl_rr->nr_targets = 1;
+ xa_init(&cxl_rr->endpoints);
+
+ rc = xa_insert(&port->regions, (unsigned long)cxlr, cxl_rr, GFP_KERNEL);
+ if (rc) {
+ dev_dbg(&cxlr->dev,
+ "%s: failed to track region reference: %d\n",
+ dev_name(&port->dev), rc);
+ kfree(cxl_rr);
+ return ERR_PTR(rc);
+ }
+
+ return cxl_rr;
+}
+
+static void cxl_rr_free_decoder(struct cxl_region_ref *cxl_rr)
+{
+ struct cxl_region *cxlr = cxl_rr->region;
+ struct cxl_decoder *cxld = cxl_rr->decoder;
+
+ if (!cxld)
+ return;
+
+ dev_WARN_ONCE(&cxlr->dev, cxld->region != cxlr, "region mismatch\n");
+ if (cxld->region == cxlr) {
+ cxld->region = NULL;
+ put_device(&cxlr->dev);
+ }
+}
+
+static void free_region_ref(struct cxl_region_ref *cxl_rr)
+{
+ struct cxl_port *port = cxl_rr->port;
+ struct cxl_region *cxlr = cxl_rr->region;
+
+ cxl_rr_free_decoder(cxl_rr);
+ xa_erase(&port->regions, (unsigned long)cxlr);
+ xa_destroy(&cxl_rr->endpoints);
+ kfree(cxl_rr);
+}
+
+static int cxl_rr_ep_add(struct cxl_region_ref *cxl_rr,
+ struct cxl_endpoint_decoder *cxled)
+{
+ int rc;
+ struct cxl_port *port = cxl_rr->port;
+ struct cxl_region *cxlr = cxl_rr->region;
+ struct cxl_decoder *cxld = cxl_rr->decoder;
+ struct cxl_ep *ep = cxl_ep_load(port, cxled_to_memdev(cxled));
+
+ if (ep) {
+ rc = xa_insert(&cxl_rr->endpoints, (unsigned long)cxled, ep,
+ GFP_KERNEL);
+ if (rc)
+ return rc;
+ }
+ cxl_rr->nr_eps++;
+
+ if (!cxld->region) {
+ cxld->region = cxlr;
+ get_device(&cxlr->dev);
+ }
+
+ return 0;
+}
+
+static int cxl_rr_assign_decoder(struct cxl_port *port, struct cxl_region *cxlr,
+ struct cxl_endpoint_decoder *cxled,
+ struct cxl_region_ref *cxl_rr,
+ struct cxl_decoder *cxld)
+{
+ if (cxld->region) {
+ dev_dbg(&cxlr->dev, "%s: %s already attached to %s\n",
+ dev_name(&port->dev), dev_name(&cxld->dev),
+ dev_name(&cxld->region->dev));
+ return -EBUSY;
+ }
+
+ /*
+ * Endpoints should already match the region type, but backstop that
+ * assumption with an assertion. Switch-decoders change mapping-type
+ * based on what is mapped when they are assigned to a region.
+ */
+ dev_WARN_ONCE(&cxlr->dev,
+ port == cxled_to_port(cxled) &&
+ cxld->target_type != cxlr->type,
+ "%s:%s mismatch decoder type %d -> %d\n",
+ dev_name(&cxled_to_memdev(cxled)->dev),
+ dev_name(&cxld->dev), cxld->target_type, cxlr->type);
+ cxld->target_type = cxlr->type;
+ cxl_rr->decoder = cxld;
+ return 0;
+}
+
+static void cxl_region_set_lock(struct cxl_region *cxlr,
+ struct cxl_decoder *cxld)
+{
+ if (!test_bit(CXL_DECODER_F_LOCK, &cxld->flags))
+ return;
+
+ set_bit(CXL_REGION_F_LOCK, &cxlr->flags);
+ clear_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags);
+}
+
+/**
+ * cxl_port_attach_region() - track a region's interest in a port by endpoint
+ * @port: port to add a new region reference 'struct cxl_region_ref'
+ * @cxlr: region to attach to @port
+ * @cxled: endpoint decoder used to create or further pin a region reference
+ * @pos: interleave position of @cxled in @cxlr
+ *
+ * The attach event is an opportunity to validate CXL decode setup
+ * constraints and record metadata needed for programming HDM decoders,
+ * in particular decoder target lists.
+ *
+ * The steps are:
+ *
+ * - validate that there are no other regions with a higher HPA already
+ * associated with @port
+ * - establish a region reference if one is not already present
+ *
+ * - additionally allocate a decoder instance that will host @cxlr on
+ * @port
+ *
+ * - pin the region reference by the endpoint
+ * - account for how many entries in @port's target list are needed to
+ * cover all of the added endpoints.
+ */
+static int cxl_port_attach_region(struct cxl_port *port,
+ struct cxl_region *cxlr,
+ struct cxl_endpoint_decoder *cxled, int pos)
+{
+ struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+ struct cxl_ep *ep = cxl_ep_load(port, cxlmd);
+ struct cxl_region_ref *cxl_rr;
+ bool nr_targets_inc = false;
+ struct cxl_decoder *cxld;
+ unsigned long index;
+ int rc = -EBUSY;
+
+ lockdep_assert_held_write(&cxl_rwsem.region);
+
+ cxl_rr = cxl_rr_load(port, cxlr);
+ if (cxl_rr) {
+ struct cxl_ep *ep_iter;
+ int found = 0;
+
+ /*
+ * Walk the existing endpoints that have been attached to
+ * @cxlr at @port and see if they share the same 'next' port
+		 * in the downstream direction. I.e. endpoints that share a
+		 * common upstream switch.
+ */
+ xa_for_each(&cxl_rr->endpoints, index, ep_iter) {
+ if (ep_iter == ep)
+ continue;
+ if (ep_iter->next == ep->next) {
+ found++;
+ break;
+ }
+ }
+
+ /*
+ * New target port, or @port is an endpoint port that always
+ * accounts its own local decode as a target.
+ */
+ if (!found || !ep->next) {
+ cxl_rr->nr_targets++;
+ nr_targets_inc = true;
+ }
+ } else {
+ struct cxl_decoder *cxld;
+
+ cxld = cxl_port_pick_region_decoder(port, cxled, cxlr);
+ if (!cxld) {
+ dev_dbg(&cxlr->dev, "%s: no decoder available\n",
+ dev_name(&port->dev));
+ return -EBUSY;
+ }
+
+ cxl_rr = alloc_region_ref(port, cxlr, cxled, cxld);
+ if (IS_ERR(cxl_rr)) {
+ dev_dbg(&cxlr->dev,
+ "%s: failed to allocate region reference\n",
+ dev_name(&port->dev));
+ return PTR_ERR(cxl_rr);
+ }
+ nr_targets_inc = true;
+
+ rc = cxl_rr_assign_decoder(port, cxlr, cxled, cxl_rr, cxld);
+ if (rc)
+ goto out_erase;
+ }
+ cxld = cxl_rr->decoder;
+
+ /*
+ * the number of targets should not exceed the target_count
+ * of the decoder
+ */
+ if (is_switch_decoder(&cxld->dev)) {
+ struct cxl_switch_decoder *cxlsd;
+
+ cxlsd = to_cxl_switch_decoder(&cxld->dev);
+ if (cxl_rr->nr_targets > cxlsd->nr_targets) {
+ dev_dbg(&cxlr->dev,
+ "%s:%s %s add: %s:%s @ %d overflows targets: %d\n",
+ dev_name(port->uport_dev), dev_name(&port->dev),
+ dev_name(&cxld->dev), dev_name(&cxlmd->dev),
+ dev_name(&cxled->cxld.dev), pos,
+ cxlsd->nr_targets);
+ rc = -ENXIO;
+ goto out_erase;
+ }
+ }
+
+ cxl_region_set_lock(cxlr, cxld);
+
+ rc = cxl_rr_ep_add(cxl_rr, cxled);
+ if (rc) {
+ dev_dbg(&cxlr->dev,
+ "%s: failed to track endpoint %s:%s reference\n",
+ dev_name(&port->dev), dev_name(&cxlmd->dev),
+ dev_name(&cxld->dev));
+ goto out_erase;
+ }
+
+ dev_dbg(&cxlr->dev,
+ "%s:%s %s add: %s:%s @ %d next: %s nr_eps: %d nr_targets: %d\n",
+ dev_name(port->uport_dev), dev_name(&port->dev),
+ dev_name(&cxld->dev), dev_name(&cxlmd->dev),
+ dev_name(&cxled->cxld.dev), pos,
+ ep ? ep->next ? dev_name(ep->next->uport_dev) :
+ dev_name(&cxlmd->dev) :
+ "none",
+ cxl_rr->nr_eps, cxl_rr->nr_targets);
+
+ return 0;
+out_erase:
+ if (nr_targets_inc)
+ cxl_rr->nr_targets--;
+ if (cxl_rr->nr_eps == 0)
+ free_region_ref(cxl_rr);
+ return rc;
+}
+
+static void cxl_port_detach_region(struct cxl_port *port,
+ struct cxl_region *cxlr,
+ struct cxl_endpoint_decoder *cxled)
+{
+ struct cxl_region_ref *cxl_rr;
+ struct cxl_ep *ep = NULL;
+
+ lockdep_assert_held_write(&cxl_rwsem.region);
+
+ cxl_rr = cxl_rr_load(port, cxlr);
+ if (!cxl_rr)
+ return;
+
+ /*
+ * Endpoint ports do not carry cxl_ep references, and they
+ * never target more than one endpoint by definition
+ */
+ if (cxl_rr->decoder == &cxled->cxld)
+ cxl_rr->nr_eps--;
+ else
+ ep = xa_erase(&cxl_rr->endpoints, (unsigned long)cxled);
+ if (ep) {
+ struct cxl_ep *ep_iter;
+ unsigned long index;
+ int found = 0;
+
+ cxl_rr->nr_eps--;
+ xa_for_each(&cxl_rr->endpoints, index, ep_iter) {
+ if (ep_iter->next == ep->next) {
+ found++;
+ break;
+ }
+ }
+ if (!found)
+ cxl_rr->nr_targets--;
+ }
+
+ if (cxl_rr->nr_eps == 0)
+ free_region_ref(cxl_rr);
+}
+
+static int check_last_peer(struct cxl_endpoint_decoder *cxled,
+ struct cxl_ep *ep, struct cxl_region_ref *cxl_rr,
+ int distance)
+{
+ struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+ struct cxl_region *cxlr = cxl_rr->region;
+ struct cxl_region_params *p = &cxlr->params;
+ struct cxl_endpoint_decoder *cxled_peer;
+ struct cxl_port *port = cxl_rr->port;
+ struct cxl_memdev *cxlmd_peer;
+ struct cxl_ep *ep_peer;
+ int pos = cxled->pos;
+
+ /*
+ * If this position wants to share a dport with the last endpoint mapped
+ * then that endpoint, at index 'position - distance', must also be
+ * mapped by this dport.
+ */
+ if (pos < distance) {
+ dev_dbg(&cxlr->dev, "%s:%s: cannot host %s:%s at %d\n",
+ dev_name(port->uport_dev), dev_name(&port->dev),
+ dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), pos);
+ return -ENXIO;
+ }
+ cxled_peer = p->targets[pos - distance];
+ cxlmd_peer = cxled_to_memdev(cxled_peer);
+ ep_peer = cxl_ep_load(port, cxlmd_peer);
+ if (ep->dport != ep_peer->dport) {
+ dev_dbg(&cxlr->dev,
+ "%s:%s: %s:%s pos %d mismatched peer %s:%s\n",
+ dev_name(port->uport_dev), dev_name(&port->dev),
+ dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), pos,
+ dev_name(&cxlmd_peer->dev),
+ dev_name(&cxled_peer->cxld.dev));
+ return -ENXIO;
+ }
+
+ return 0;
+}
+
+static int check_interleave_cap(struct cxl_decoder *cxld, int iw, int ig)
+{
+ struct cxl_port *port = to_cxl_port(cxld->dev.parent);
+ struct cxl_hdm *cxlhdm = dev_get_drvdata(&port->dev);
+ unsigned int interleave_mask;
+ u8 eiw;
+ u16 eig;
+ int high_pos, low_pos;
+
+ if (!test_bit(iw, &cxlhdm->iw_cap_mask))
+ return -ENXIO;
+ /*
+	 * Per CXL specification r3.1 (8.2.4.20.13 Decoder Protection),
+ * if eiw < 8:
+ * DPAOFFSET[51: eig + 8] = HPAOFFSET[51: eig + 8 + eiw]
+ * DPAOFFSET[eig + 7: 0] = HPAOFFSET[eig + 7: 0]
+ *
+	 * when eiw is 0, all the bits of HPAOFFSET[51: 0] are used and there
+	 * are no interleave bits.
+ *
+ * if eiw >= 8:
+ * DPAOFFSET[51: eig + 8] = HPAOFFSET[51: eig + eiw] / 3
+ * DPAOFFSET[eig + 7: 0] = HPAOFFSET[eig + 7: 0]
+ *
+	 * when eiw is 8, all the bits of HPAOFFSET[51: 0] are used and there
+	 * are no interleave bits.
+ */
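+	/*
+	 * Worked example (illustrative values): iw = 4, ig = 256 gives
+	 * eiw = 2, eig = 0, so high_pos = 2 + 0 + 7 = 9, low_pos = 0 + 8 = 8,
+	 * and interleave_mask covers HPA bits [9:8], the bits that select the
+	 * interleave target; those bits must be present in the decoder's
+	 * advertised interleave capability.
+	 */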
+ ways_to_eiw(iw, &eiw);
+ if (eiw == 0 || eiw == 8)
+ return 0;
+
+ granularity_to_eig(ig, &eig);
+ if (eiw > 8)
+ high_pos = eiw + eig - 1;
+ else
+ high_pos = eiw + eig + 7;
+ low_pos = eig + 8;
+ interleave_mask = GENMASK(high_pos, low_pos);
+ if (interleave_mask & ~cxlhdm->interleave_mask)
+ return -ENXIO;
+
+ return 0;
+}
+
+static int cxl_port_setup_targets(struct cxl_port *port,
+ struct cxl_region *cxlr,
+ struct cxl_endpoint_decoder *cxled)
+{
+ struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
+ int parent_iw, parent_ig, ig, iw, rc, pos = cxled->pos;
+ struct cxl_port *parent_port = to_cxl_port(port->dev.parent);
+ struct cxl_region_ref *cxl_rr = cxl_rr_load(port, cxlr);
+ struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+ struct cxl_ep *ep = cxl_ep_load(port, cxlmd);
+ struct cxl_region_params *p = &cxlr->params;
+ struct cxl_decoder *cxld = cxl_rr->decoder;
+ struct cxl_switch_decoder *cxlsd;
+ struct cxl_port *iter = port;
+ u16 eig, peig;
+ u8 eiw, peiw;
+
+ /*
+ * While root level decoders support x3, x6, x12, switch level
+ * decoders only support powers of 2 up to x16.
+ */
+ if (!is_power_of_2(cxl_rr->nr_targets)) {
+ dev_dbg(&cxlr->dev, "%s:%s: invalid target count %d\n",
+ dev_name(port->uport_dev), dev_name(&port->dev),
+ cxl_rr->nr_targets);
+ return -EINVAL;
+ }
+
+ cxlsd = to_cxl_switch_decoder(&cxld->dev);
+ if (cxl_rr->nr_targets_set) {
+ int i, distance = 1;
+ struct cxl_region_ref *cxl_rr_iter;
+
+ /*
+ * The "distance" between peer downstream ports represents which
+ * endpoint positions in the region interleave a given port can
+ * host.
+ *
+ * For example, at the root of a hierarchy the distance is
+ * always 1 as every index targets a different host-bridge. At
+ * each subsequent switch level those ports map every Nth region
+ * position where N is the width of the switch == distance.
+ */
+ do {
+ cxl_rr_iter = cxl_rr_load(iter, cxlr);
+ distance *= cxl_rr_iter->nr_targets;
+ iter = to_cxl_port(iter->dev.parent);
+ } while (!is_cxl_root(iter));
+ distance *= cxlrd->cxlsd.cxld.interleave_ways;
+
+ for (i = 0; i < cxl_rr->nr_targets_set; i++)
+ if (ep->dport == cxlsd->target[i]) {
+ rc = check_last_peer(cxled, ep, cxl_rr,
+ distance);
+ if (rc)
+ return rc;
+ goto out_target_set;
+ }
+ goto add_target;
+ }
+
+ if (is_cxl_root(parent_port)) {
+ /*
+ * Root decoder IG is always set to value in CFMWS which
+ * may be different than this region's IG. We can use the
+ * region's IG here since interleave_granularity_store()
+ * does not allow interleaved host-bridges with
+ * root IG != region IG.
+ */
+ parent_ig = p->interleave_granularity;
+ parent_iw = cxlrd->cxlsd.cxld.interleave_ways;
+ /*
+ * For purposes of address bit routing, use power-of-2 math for
+ * switch ports.
+ */
+ if (!is_power_of_2(parent_iw))
+ parent_iw /= 3;
+ } else {
+ struct cxl_region_ref *parent_rr;
+ struct cxl_decoder *parent_cxld;
+
+ parent_rr = cxl_rr_load(parent_port, cxlr);
+ parent_cxld = parent_rr->decoder;
+ parent_ig = parent_cxld->interleave_granularity;
+ parent_iw = parent_cxld->interleave_ways;
+ }
+
+ rc = granularity_to_eig(parent_ig, &peig);
+ if (rc) {
+ dev_dbg(&cxlr->dev, "%s:%s: invalid parent granularity: %d\n",
+ dev_name(parent_port->uport_dev),
+ dev_name(&parent_port->dev), parent_ig);
+ return rc;
+ }
+
+ rc = ways_to_eiw(parent_iw, &peiw);
+ if (rc) {
+ dev_dbg(&cxlr->dev, "%s:%s: invalid parent interleave: %d\n",
+ dev_name(parent_port->uport_dev),
+ dev_name(&parent_port->dev), parent_iw);
+ return rc;
+ }
+
+ iw = cxl_rr->nr_targets;
+ rc = ways_to_eiw(iw, &eiw);
+ if (rc) {
+ dev_dbg(&cxlr->dev, "%s:%s: invalid port interleave: %d\n",
+ dev_name(port->uport_dev), dev_name(&port->dev), iw);
+ return rc;
+ }
+
+ /*
+ * Interleave granularity is a multiple of @parent_port granularity.
+ * Multiplier is the parent port interleave ways.
+ */
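+	/*
+	 * e.g. (illustrative) parent_ig = 256 and parent_iw = 2 yields a
+	 * 512-byte granularity (eig = 1) for this port's decoder.
+	 */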
+ rc = granularity_to_eig(parent_ig * parent_iw, &eig);
+ if (rc) {
+ dev_dbg(&cxlr->dev,
+ "%s: invalid granularity calculation (%d * %d)\n",
+ dev_name(&parent_port->dev), parent_ig, parent_iw);
+ return rc;
+ }
+
+ rc = eig_to_granularity(eig, &ig);
+ if (rc) {
+ dev_dbg(&cxlr->dev, "%s:%s: invalid interleave: %d\n",
+ dev_name(port->uport_dev), dev_name(&port->dev),
+ 256 << eig);
+ return rc;
+ }
+
+ if (iw > 8 || iw > cxlsd->nr_targets) {
+ dev_dbg(&cxlr->dev,
+ "%s:%s:%s: ways: %d overflows targets: %d\n",
+ dev_name(port->uport_dev), dev_name(&port->dev),
+ dev_name(&cxld->dev), iw, cxlsd->nr_targets);
+ return -ENXIO;
+ }
+
+ if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) {
+ if (cxld->interleave_ways != iw ||
+ (iw > 1 && cxld->interleave_granularity != ig) ||
+ !spa_maps_hpa(p, &cxld->hpa_range) ||
+ ((cxld->flags & CXL_DECODER_F_ENABLE) == 0)) {
+ dev_err(&cxlr->dev,
+ "%s:%s %s expected iw: %d ig: %d %pr\n",
+ dev_name(port->uport_dev), dev_name(&port->dev),
+ __func__, iw, ig, p->res);
+ dev_err(&cxlr->dev,
+ "%s:%s %s got iw: %d ig: %d state: %s %#llx:%#llx\n",
+ dev_name(port->uport_dev), dev_name(&port->dev),
+ __func__, cxld->interleave_ways,
+ cxld->interleave_granularity,
+ str_enabled_disabled(cxld->flags & CXL_DECODER_F_ENABLE),
+ cxld->hpa_range.start, cxld->hpa_range.end);
+ return -ENXIO;
+ }
+ } else {
+ rc = check_interleave_cap(cxld, iw, ig);
+ if (rc) {
+ dev_dbg(&cxlr->dev,
+ "%s:%s iw: %d ig: %d is not supported\n",
+ dev_name(port->uport_dev),
+ dev_name(&port->dev), iw, ig);
+ return rc;
+ }
+
+ cxld->interleave_ways = iw;
+ cxld->interleave_granularity = ig;
+ cxld->hpa_range = (struct range) {
+ .start = p->res->start,
+ .end = p->res->end,
+ };
+ }
+ dev_dbg(&cxlr->dev, "%s:%s iw: %d ig: %d\n", dev_name(port->uport_dev),
+ dev_name(&port->dev), iw, ig);
+add_target:
+ if (cxl_rr->nr_targets_set == cxl_rr->nr_targets) {
+ dev_dbg(&cxlr->dev,
+ "%s:%s: targets full trying to add %s:%s at %d\n",
+ dev_name(port->uport_dev), dev_name(&port->dev),
+ dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), pos);
+ return -ENXIO;
+ }
+ if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) {
+ if (cxlsd->target[cxl_rr->nr_targets_set] != ep->dport) {
+ dev_dbg(&cxlr->dev, "%s:%s: %s expected %s at %d\n",
+ dev_name(port->uport_dev), dev_name(&port->dev),
+ dev_name(&cxlsd->cxld.dev),
+ dev_name(ep->dport->dport_dev),
+ cxl_rr->nr_targets_set);
+ return -ENXIO;
+ }
+ } else {
+ cxlsd->target[cxl_rr->nr_targets_set] = ep->dport;
+ cxlsd->cxld.target_map[cxl_rr->nr_targets_set] = ep->dport->port_id;
+ }
+ cxl_rr->nr_targets_set++;
+out_target_set:
+ dev_dbg(&cxlr->dev, "%s:%s target[%d] = %s for %s:%s @ %d\n",
+ dev_name(port->uport_dev), dev_name(&port->dev),
+ cxl_rr->nr_targets_set - 1, dev_name(ep->dport->dport_dev),
+ dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), pos);
+
+ return 0;
+}
+
+static void cxl_port_reset_targets(struct cxl_port *port,
+ struct cxl_region *cxlr)
+{
+ struct cxl_region_ref *cxl_rr = cxl_rr_load(port, cxlr);
+ struct cxl_decoder *cxld;
+
+ /*
+ * After the last endpoint has been detached the entire cxl_rr may now
+ * be gone.
+ */
+ if (!cxl_rr)
+ return;
+ cxl_rr->nr_targets_set = 0;
+
+ cxld = cxl_rr->decoder;
+ cxld->hpa_range = (struct range) {
+ .start = 0,
+ .end = -1,
+ };
+}
+
+static void cxl_region_teardown_targets(struct cxl_region *cxlr)
+{
+ struct cxl_region_params *p = &cxlr->params;
+ struct cxl_endpoint_decoder *cxled;
+ struct cxl_dev_state *cxlds;
+ struct cxl_memdev *cxlmd;
+ struct cxl_port *iter;
+ struct cxl_ep *ep;
+ int i;
+
+ /*
+ * In the auto-discovery case skip automatic teardown since the
+ * address space is already active
+ */
+ if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags))
+ return;
+
+ for (i = 0; i < p->nr_targets; i++) {
+ cxled = p->targets[i];
+ cxlmd = cxled_to_memdev(cxled);
+ cxlds = cxlmd->cxlds;
+
+ if (cxlds->rcd)
+ continue;
+
+ iter = cxled_to_port(cxled);
+ while (!is_cxl_root(to_cxl_port(iter->dev.parent)))
+ iter = to_cxl_port(iter->dev.parent);
+
+ for (ep = cxl_ep_load(iter, cxlmd); iter;
+ iter = ep->next, ep = cxl_ep_load(iter, cxlmd))
+ cxl_port_reset_targets(iter, cxlr);
+ }
+}
+
+static int cxl_region_setup_targets(struct cxl_region *cxlr)
+{
+ struct cxl_region_params *p = &cxlr->params;
+ struct cxl_endpoint_decoder *cxled;
+ struct cxl_dev_state *cxlds;
+ int i, rc, rch = 0, vh = 0;
+ struct cxl_memdev *cxlmd;
+ struct cxl_port *iter;
+ struct cxl_ep *ep;
+
+ for (i = 0; i < p->nr_targets; i++) {
+ cxled = p->targets[i];
+ cxlmd = cxled_to_memdev(cxled);
+ cxlds = cxlmd->cxlds;
+
+ /* validate that all targets agree on topology */
+ if (!cxlds->rcd) {
+ vh++;
+ } else {
+ rch++;
+ continue;
+ }
+
+ iter = cxled_to_port(cxled);
+ while (!is_cxl_root(to_cxl_port(iter->dev.parent)))
+ iter = to_cxl_port(iter->dev.parent);
+
+ /*
+ * Descend the topology tree programming / validating
+ * targets while looking for conflicts.
+ */
+ for (ep = cxl_ep_load(iter, cxlmd); iter;
+ iter = ep->next, ep = cxl_ep_load(iter, cxlmd)) {
+ rc = cxl_port_setup_targets(iter, cxlr, cxled);
+ if (rc) {
+ cxl_region_teardown_targets(cxlr);
+ return rc;
+ }
+ }
+ }
+
+ if (rch && vh) {
+ dev_err(&cxlr->dev, "mismatched CXL topologies detected\n");
+ cxl_region_teardown_targets(cxlr);
+ return -ENXIO;
+ }
+
+ return 0;
+}
+
+static int cxl_region_validate_position(struct cxl_region *cxlr,
+ struct cxl_endpoint_decoder *cxled,
+ int pos)
+{
+ struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+ struct cxl_region_params *p = &cxlr->params;
+ int i;
+
+ if (pos < 0 || pos >= p->interleave_ways) {
+ dev_dbg(&cxlr->dev, "position %d out of range %d\n", pos,
+ p->interleave_ways);
+ return -ENXIO;
+ }
+
+ if (p->targets[pos] == cxled)
+ return 0;
+
+ if (p->targets[pos]) {
+ struct cxl_endpoint_decoder *cxled_target = p->targets[pos];
+ struct cxl_memdev *cxlmd_target = cxled_to_memdev(cxled_target);
+
+ dev_dbg(&cxlr->dev, "position %d already assigned to %s:%s\n",
+ pos, dev_name(&cxlmd_target->dev),
+ dev_name(&cxled_target->cxld.dev));
+ return -EBUSY;
+ }
+
+ for (i = 0; i < p->interleave_ways; i++) {
+ struct cxl_endpoint_decoder *cxled_target;
+ struct cxl_memdev *cxlmd_target;
+
+ cxled_target = p->targets[i];
+ if (!cxled_target)
+ continue;
+
+ cxlmd_target = cxled_to_memdev(cxled_target);
+ if (cxlmd_target == cxlmd) {
+ dev_dbg(&cxlr->dev,
+ "%s already specified at position %d via: %s\n",
+ dev_name(&cxlmd->dev), pos,
+ dev_name(&cxled_target->cxld.dev));
+ return -EBUSY;
+ }
+ }
+
+ return 0;
+}
+
+static int cxl_region_attach_position(struct cxl_region *cxlr,
+ struct cxl_root_decoder *cxlrd,
+ struct cxl_endpoint_decoder *cxled,
+ const struct cxl_dport *dport, int pos)
+{
+ struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+ struct cxl_switch_decoder *cxlsd = &cxlrd->cxlsd;
+ struct cxl_decoder *cxld = &cxlsd->cxld;
+ int iw = cxld->interleave_ways;
+ struct cxl_port *iter;
+ int rc;
+
+ if (dport != cxlrd->cxlsd.target[pos % iw]) {
+ dev_dbg(&cxlr->dev, "%s:%s invalid target position for %s\n",
+ dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
+ dev_name(&cxlrd->cxlsd.cxld.dev));
+ return -ENXIO;
+ }
+
+ for (iter = cxled_to_port(cxled); !is_cxl_root(iter);
+ iter = to_cxl_port(iter->dev.parent)) {
+ rc = cxl_port_attach_region(iter, cxlr, cxled, pos);
+ if (rc)
+ goto err;
+ }
+
+ return 0;
+
+err:
+ for (iter = cxled_to_port(cxled); !is_cxl_root(iter);
+ iter = to_cxl_port(iter->dev.parent))
+ cxl_port_detach_region(iter, cxlr, cxled);
+ return rc;
+}
+
+static int cxl_region_attach_auto(struct cxl_region *cxlr,
+ struct cxl_endpoint_decoder *cxled, int pos)
+{
+ struct cxl_region_params *p = &cxlr->params;
+
+ if (cxled->state != CXL_DECODER_STATE_AUTO) {
+ dev_err(&cxlr->dev,
+ "%s: unable to add decoder to autodetected region\n",
+ dev_name(&cxled->cxld.dev));
+ return -EINVAL;
+ }
+
+ if (pos >= 0) {
+ dev_dbg(&cxlr->dev, "%s: expected auto position, not %d\n",
+ dev_name(&cxled->cxld.dev), pos);
+ return -EINVAL;
+ }
+
+ if (p->nr_targets >= p->interleave_ways) {
+ dev_err(&cxlr->dev, "%s: no more target slots available\n",
+ dev_name(&cxled->cxld.dev));
+ return -ENXIO;
+ }
+
+ /*
+ * Temporarily record the endpoint decoder into the target array. Yes,
+ * this means that userspace can view devices in the wrong position
+ * before the region activates, and must be careful to understand when
+ * it might be racing region autodiscovery.
+ */
+ pos = p->nr_targets;
+ p->targets[pos] = cxled;
+ cxled->pos = pos;
+ p->nr_targets++;
+
+ return 0;
+}
+
+static int cmp_interleave_pos(const void *a, const void *b)
+{
+ struct cxl_endpoint_decoder *cxled_a = *(typeof(cxled_a) *)a;
+ struct cxl_endpoint_decoder *cxled_b = *(typeof(cxled_b) *)b;
+
+ return cxled_a->pos - cxled_b->pos;
+}
+
+static int match_switch_decoder_by_range(struct device *dev,
+ const void *data)
+{
+ struct cxl_switch_decoder *cxlsd;
+ const struct range *r1, *r2 = data;
+
+ if (!is_switch_decoder(dev))
+ return 0;
+
+ cxlsd = to_cxl_switch_decoder(dev);
+ r1 = &cxlsd->cxld.hpa_range;
+
+ if (is_root_decoder(dev))
+ return range_contains(r1, r2);
+ return (r1->start == r2->start && r1->end == r2->end);
+}
+
+static int find_pos_and_ways(struct cxl_port *port, struct range *range,
+ int *pos, int *ways)
+{
+ struct cxl_switch_decoder *cxlsd;
+ struct cxl_port *parent;
+ struct device *dev;
+ int rc = -ENXIO;
+
+ parent = parent_port_of(port);
+ if (!parent)
+ return rc;
+
+ dev = device_find_child(&parent->dev, range,
+ match_switch_decoder_by_range);
+ if (!dev) {
+ dev_err(port->uport_dev,
+ "failed to find decoder mapping %#llx-%#llx\n",
+ range->start, range->end);
+ return rc;
+ }
+ cxlsd = to_cxl_switch_decoder(dev);
+ *ways = cxlsd->cxld.interleave_ways;
+
+ for (int i = 0; i < *ways; i++) {
+ if (cxlsd->target[i] == port->parent_dport) {
+ *pos = i;
+ rc = 0;
+ break;
+ }
+ }
+ put_device(dev);
+
+ if (rc)
+ dev_err(port->uport_dev,
+ "failed to find %s:%s in target list of %s\n",
+ dev_name(&port->dev),
+ dev_name(port->parent_dport->dport_dev),
+ dev_name(&cxlsd->cxld.dev));
+
+ return rc;
+}
+
+/**
+ * cxl_calc_interleave_pos() - calculate an endpoint position in a region
+ * @cxled: endpoint decoder member of given region
+ *
+ * The endpoint position is calculated by traversing the topology from
+ * the endpoint to the root decoder and iteratively applying this
+ * calculation:
+ *
+ * position = position * parent_ways + parent_pos;
+ *
+ * ...where @position is inferred from switch and root decoder target lists.
+ *
+ * Return: position >= 0 on success
+ * -ENXIO on failure
+ */
+static int cxl_calc_interleave_pos(struct cxl_endpoint_decoder *cxled)
+{
+ struct cxl_port *iter, *port = cxled_to_port(cxled);
+ struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+ struct range *range = &cxled->cxld.hpa_range;
+ int parent_ways = 0, parent_pos = 0, pos = 0;
+ int rc;
+
+ /*
+ * Example: the expected interleave order of the 4-way region shown
+ * below is: mem0, mem2, mem1, mem3
+ *
+ * root_port
+ * / \
+ * host_bridge_0 host_bridge_1
+ * | | | |
+ * mem0 mem1 mem2 mem3
+ *
+ * In the example the calculator will iterate twice. The first iteration
+ * uses the mem position in the host-bridge and the ways of the host-
+ * bridge to generate the first, or local, position. The second
+ * iteration uses the host-bridge position in the root_port and the ways
+ * of the root_port to refine the position.
+ *
+ * A trace of the calculation per endpoint looks like this:
+ * mem0: pos = 0 * 2 + 0 mem2: pos = 0 * 2 + 0
+ * pos = 0 * 2 + 0 pos = 0 * 2 + 1
+ * pos: 0 pos: 1
+ *
+ * mem1: pos = 0 * 2 + 1 mem3: pos = 0 * 2 + 1
+ * pos = 1 * 2 + 0 pos = 1 * 2 + 1
+ * pos: 2 pos = 3
+ *
+ * Note that while this example is simple, the method applies to more
+ * complex topologies, including those with switches.
+ */
+
+ /* Iterate from endpoint to root_port refining the position */
+ for (iter = port; iter; iter = parent_port_of(iter)) {
+ if (is_cxl_root(iter))
+ break;
+
+ rc = find_pos_and_ways(iter, range, &parent_pos, &parent_ways);
+ if (rc)
+ return rc;
+
+ pos = pos * parent_ways + parent_pos;
+ }
+
+ dev_dbg(&cxlmd->dev,
+ "decoder:%s parent:%s port:%s range:%#llx-%#llx pos:%d\n",
+ dev_name(&cxled->cxld.dev), dev_name(cxlmd->dev.parent),
+ dev_name(&port->dev), range->start, range->end, pos);
+
+ return pos;
+}
+
+static int cxl_region_sort_targets(struct cxl_region *cxlr)
+{
+ struct cxl_region_params *p = &cxlr->params;
+ int i, rc = 0;
+
+ for (i = 0; i < p->nr_targets; i++) {
+ struct cxl_endpoint_decoder *cxled = p->targets[i];
+
+ cxled->pos = cxl_calc_interleave_pos(cxled);
+ /*
+ * Record that sorting failed, but still continue to calc
+ * cxled->pos so that follow-on code paths can reliably
+ * do p->targets[cxled->pos] to self-reference their entry.
+ */
+ if (cxled->pos < 0)
+ rc = -ENXIO;
+ }
+ /* Keep the cxlr target list in interleave position order */
+ sort(p->targets, p->nr_targets, sizeof(p->targets[0]),
+ cmp_interleave_pos, NULL);
+
+ dev_dbg(&cxlr->dev, "region sort %s\n", rc ? "failed" : "successful");
+ return rc;
+}
+
+static int cxl_region_attach(struct cxl_region *cxlr,
+ struct cxl_endpoint_decoder *cxled, int pos)
+{
+ struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
+ struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+ struct cxl_region_params *p = &cxlr->params;
+ struct cxl_port *ep_port, *root_port;
+ struct cxl_dport *dport;
+ int rc = -ENXIO;
+
+ rc = check_interleave_cap(&cxled->cxld, p->interleave_ways,
+ p->interleave_granularity);
+ if (rc) {
+ dev_dbg(&cxlr->dev, "%s iw: %d ig: %d is not supported\n",
+ dev_name(&cxled->cxld.dev), p->interleave_ways,
+ p->interleave_granularity);
+ return rc;
+ }
+
+ if (cxled->part < 0) {
+ dev_dbg(&cxlr->dev, "%s dead\n", dev_name(&cxled->cxld.dev));
+ return -ENODEV;
+ }
+
+ if (cxlds->part[cxled->part].mode != cxlr->mode) {
+ dev_dbg(&cxlr->dev, "%s region mode: %d mismatch\n",
+ dev_name(&cxled->cxld.dev), cxlr->mode);
+ return -EINVAL;
+ }
+
+ /* all full of members, or interleave config not established? */
+ if (p->state > CXL_CONFIG_INTERLEAVE_ACTIVE) {
+ dev_dbg(&cxlr->dev, "region already active\n");
+ return -EBUSY;
+ }
+
+ if (p->state < CXL_CONFIG_INTERLEAVE_ACTIVE) {
+ dev_dbg(&cxlr->dev, "interleave config missing\n");
+ return -ENXIO;
+ }
+
+ if (p->nr_targets >= p->interleave_ways) {
+ dev_dbg(&cxlr->dev, "region already has %d endpoints\n",
+ p->nr_targets);
+ return -EINVAL;
+ }
+
+ ep_port = cxled_to_port(cxled);
+ root_port = cxlrd_to_port(cxlrd);
+ dport = cxl_find_dport_by_dev(root_port, ep_port->host_bridge);
+ if (!dport) {
+ dev_dbg(&cxlr->dev, "%s:%s invalid target for %s\n",
+ dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
+ dev_name(cxlr->dev.parent));
+ return -ENXIO;
+ }
+
+ if (cxled->cxld.target_type != cxlr->type) {
+ dev_dbg(&cxlr->dev, "%s:%s type mismatch: %d vs %d\n",
+ dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
+ cxled->cxld.target_type, cxlr->type);
+ return -ENXIO;
+ }
+
+ if (!cxled->dpa_res) {
+ dev_dbg(&cxlr->dev, "%s:%s: missing DPA allocation.\n",
+ dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev));
+ return -ENXIO;
+ }
+
+ if (resource_size(cxled->dpa_res) * p->interleave_ways + p->cache_size !=
+ resource_size(p->res)) {
+ dev_dbg(&cxlr->dev,
+ "%s:%s-size-%#llx * ways-%d + cache-%#llx != region-size-%#llx\n",
+ dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
+ (u64)resource_size(cxled->dpa_res), p->interleave_ways,
+ (u64)p->cache_size, (u64)resource_size(p->res));
+ return -EINVAL;
+ }
+
+ cxl_region_perf_data_calculate(cxlr, cxled);
+
+ if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) {
+ int i;
+
+ rc = cxl_region_attach_auto(cxlr, cxled, pos);
+ if (rc)
+ return rc;
+
+ /* await more targets to arrive... */
+ if (p->nr_targets < p->interleave_ways)
+ return 0;
+
+ /*
+ * All targets are here, which implies all PCI enumeration that
+ * affects this region has been completed. Walk the topology to
+ * sort the devices into their relative region decode position.
+ */
+ rc = cxl_region_sort_targets(cxlr);
+ if (rc)
+ return rc;
+
+ for (i = 0; i < p->nr_targets; i++) {
+ cxled = p->targets[i];
+ ep_port = cxled_to_port(cxled);
+ dport = cxl_find_dport_by_dev(root_port,
+ ep_port->host_bridge);
+ rc = cxl_region_attach_position(cxlr, cxlrd, cxled,
+ dport, i);
+ if (rc)
+ return rc;
+ }
+
+ rc = cxl_region_setup_targets(cxlr);
+ if (rc)
+ return rc;
+
+ /*
+ * If target setup succeeds in the autodiscovery case
+ * then the region is already committed.
+ */
+ p->state = CXL_CONFIG_COMMIT;
+ cxl_region_shared_upstream_bandwidth_update(cxlr);
+
+ return 0;
+ }
+
+ rc = cxl_region_validate_position(cxlr, cxled, pos);
+ if (rc)
+ return rc;
+
+ rc = cxl_region_attach_position(cxlr, cxlrd, cxled, dport, pos);
+ if (rc)
+ return rc;
+
+ p->targets[pos] = cxled;
+ cxled->pos = pos;
+ p->nr_targets++;
+
+ if (p->nr_targets == p->interleave_ways) {
+ rc = cxl_region_setup_targets(cxlr);
+ if (rc)
+ return rc;
+ p->state = CXL_CONFIG_ACTIVE;
+ cxl_region_shared_upstream_bandwidth_update(cxlr);
+ }
+
+ cxled->cxld.interleave_ways = p->interleave_ways;
+ cxled->cxld.interleave_granularity = p->interleave_granularity;
+ cxled->cxld.hpa_range = (struct range) {
+ .start = p->res->start,
+ .end = p->res->end,
+ };
+
+ if (p->nr_targets != p->interleave_ways)
+ return 0;
+
+ /*
+ * Test the auto-discovery position calculator function
+ * against this successfully created user-defined region.
+ * A fail message here means that this interleave config
+ * will fail when presented as CXL_REGION_F_AUTO.
+ */
+ for (int i = 0; i < p->nr_targets; i++) {
+ struct cxl_endpoint_decoder *cxled = p->targets[i];
+ int test_pos;
+
+ test_pos = cxl_calc_interleave_pos(cxled);
+ dev_dbg(&cxled->cxld.dev,
+ "Test cxl_calc_interleave_pos(): %s test_pos:%d cxled->pos:%d\n",
+ (test_pos == cxled->pos) ? "success" : "fail",
+ test_pos, cxled->pos);
+ }
+
+ return 0;
+}
+
+static struct cxl_region *
+__cxl_decoder_detach(struct cxl_region *cxlr,
+ struct cxl_endpoint_decoder *cxled, int pos,
+ enum cxl_detach_mode mode)
+{
+ struct cxl_region_params *p;
+
+ lockdep_assert_held_write(&cxl_rwsem.region);
+
+ if (!cxled) {
+ p = &cxlr->params;
+
+ if (pos >= p->interleave_ways) {
+ dev_dbg(&cxlr->dev, "position %d out of range %d\n",
+ pos, p->interleave_ways);
+ return NULL;
+ }
+
+ if (!p->targets[pos])
+ return NULL;
+ cxled = p->targets[pos];
+ } else {
+ cxlr = cxled->cxld.region;
+ if (!cxlr)
+ return NULL;
+ p = &cxlr->params;
+ }
+
+ if (mode == DETACH_INVALIDATE)
+ cxled->part = -1;
+
+ if (p->state > CXL_CONFIG_ACTIVE) {
+ cxl_region_decode_reset(cxlr, p->interleave_ways);
+ p->state = CXL_CONFIG_ACTIVE;
+ }
+
+ for (struct cxl_port *iter = cxled_to_port(cxled); !is_cxl_root(iter);
+ iter = to_cxl_port(iter->dev.parent))
+ cxl_port_detach_region(iter, cxlr, cxled);
+
+ if (cxled->pos < 0 || cxled->pos >= p->interleave_ways ||
+ p->targets[cxled->pos] != cxled) {
+ struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+
+ dev_WARN_ONCE(&cxlr->dev, 1, "expected %s:%s at position %d\n",
+ dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
+ cxled->pos);
+ return NULL;
+ }
+
+ if (p->state == CXL_CONFIG_ACTIVE) {
+ p->state = CXL_CONFIG_INTERLEAVE_ACTIVE;
+ cxl_region_teardown_targets(cxlr);
+ }
+ p->targets[cxled->pos] = NULL;
+ p->nr_targets--;
+ cxled->cxld.hpa_range = (struct range) {
+ .start = 0,
+ .end = -1,
+ };
+
+ get_device(&cxlr->dev);
+ return cxlr;
+}
+
+/*
+ * Clean up a decoder's interest in a region. There are 2 cases to
+ * handle: removing an unknown @cxled from a known position in a region
+ * (detach_target()), or removing a known @cxled from an unknown @cxlr
+ * (cxld_unregister()).
+ *
+ * When the detachment finds a region, release the region driver.
+ */
+int cxl_decoder_detach(struct cxl_region *cxlr,
+ struct cxl_endpoint_decoder *cxled, int pos,
+ enum cxl_detach_mode mode)
+{
+ struct cxl_region *detach;
+
+ /* when the decoder is being destroyed lock unconditionally */
+ if (mode == DETACH_INVALIDATE) {
+ guard(rwsem_write)(&cxl_rwsem.region);
+ detach = __cxl_decoder_detach(cxlr, cxled, pos, mode);
+ } else {
+ int rc;
+
+ ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region);
+ if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem)))
+ return rc;
+ detach = __cxl_decoder_detach(cxlr, cxled, pos, mode);
+ }
+
+ if (detach) {
+ device_release_driver(&detach->dev);
+ put_device(&detach->dev);
+ }
+ return 0;
+}
+
+static int __attach_target(struct cxl_region *cxlr,
+ struct cxl_endpoint_decoder *cxled, int pos,
+ unsigned int state)
+{
+ int rc;
+
+ if (state == TASK_INTERRUPTIBLE) {
+ ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region);
+ if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem)))
+ return rc;
+ guard(rwsem_read)(&cxl_rwsem.dpa);
+ return cxl_region_attach(cxlr, cxled, pos);
+ }
+ guard(rwsem_write)(&cxl_rwsem.region);
+ guard(rwsem_read)(&cxl_rwsem.dpa);
+ return cxl_region_attach(cxlr, cxled, pos);
+}
+
+static int attach_target(struct cxl_region *cxlr,
+ struct cxl_endpoint_decoder *cxled, int pos,
+ unsigned int state)
+{
+ int rc = __attach_target(cxlr, cxled, pos, state);
+
+ if (rc == 0)
+ return 0;
+
+ dev_warn(cxled->cxld.dev.parent, "failed to attach %s to %s: %d\n",
+ dev_name(&cxled->cxld.dev), dev_name(&cxlr->dev), rc);
+ return rc;
+}
+
+static int detach_target(struct cxl_region *cxlr, int pos)
+{
+ return cxl_decoder_detach(cxlr, NULL, pos, DETACH_ONLY);
+}
+
+static size_t store_targetN(struct cxl_region *cxlr, const char *buf, int pos,
+ size_t len)
+{
+ int rc;
+
+ if (sysfs_streq(buf, "\n"))
+ rc = detach_target(cxlr, pos);
+ else {
+ struct device *dev;
+
+ dev = bus_find_device_by_name(&cxl_bus_type, NULL, buf);
+ if (!dev)
+ return -ENODEV;
+
+ if (!is_endpoint_decoder(dev)) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ rc = attach_target(cxlr, to_cxl_endpoint_decoder(dev), pos,
+ TASK_INTERRUPTIBLE);
+out:
+ put_device(dev);
+ }
+
+ if (rc < 0)
+ return rc;
+ return len;
+}
+
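+/*
+ * Illustrative sysfs usage (path assumed, not defined by this patch):
+ * writing an endpoint decoder name, e.g. "decoder5.0", to
+ * /sys/bus/cxl/devices/regionX/targetN attaches that decoder at
+ * position N; writing an empty string (a bare newline) detaches
+ * whatever occupies position N.
+ */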
+#define TARGET_ATTR_RW(n) \
+static ssize_t target##n##_show( \
+ struct device *dev, struct device_attribute *attr, char *buf) \
+{ \
+ return show_targetN(to_cxl_region(dev), buf, (n)); \
+} \
+static ssize_t target##n##_store(struct device *dev, \
+ struct device_attribute *attr, \
+ const char *buf, size_t len) \
+{ \
+ return store_targetN(to_cxl_region(dev), buf, (n), len); \
+} \
+static DEVICE_ATTR_RW(target##n)
+
+TARGET_ATTR_RW(0);
+TARGET_ATTR_RW(1);
+TARGET_ATTR_RW(2);
+TARGET_ATTR_RW(3);
+TARGET_ATTR_RW(4);
+TARGET_ATTR_RW(5);
+TARGET_ATTR_RW(6);
+TARGET_ATTR_RW(7);
+TARGET_ATTR_RW(8);
+TARGET_ATTR_RW(9);
+TARGET_ATTR_RW(10);
+TARGET_ATTR_RW(11);
+TARGET_ATTR_RW(12);
+TARGET_ATTR_RW(13);
+TARGET_ATTR_RW(14);
+TARGET_ATTR_RW(15);
+
+static struct attribute *target_attrs[] = {
+ &dev_attr_target0.attr,
+ &dev_attr_target1.attr,
+ &dev_attr_target2.attr,
+ &dev_attr_target3.attr,
+ &dev_attr_target4.attr,
+ &dev_attr_target5.attr,
+ &dev_attr_target6.attr,
+ &dev_attr_target7.attr,
+ &dev_attr_target8.attr,
+ &dev_attr_target9.attr,
+ &dev_attr_target10.attr,
+ &dev_attr_target11.attr,
+ &dev_attr_target12.attr,
+ &dev_attr_target13.attr,
+ &dev_attr_target14.attr,
+ &dev_attr_target15.attr,
+ NULL,
+};
+
+static umode_t cxl_region_target_visible(struct kobject *kobj,
+ struct attribute *a, int n)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct cxl_region *cxlr = to_cxl_region(dev);
+ struct cxl_region_params *p = &cxlr->params;
+
+ if (n < p->interleave_ways)
+ return a->mode;
+ return 0;
+}
+
+static const struct attribute_group cxl_region_target_group = {
+ .attrs = target_attrs,
+ .is_visible = cxl_region_target_visible,
+};
+
+static const struct attribute_group *get_cxl_region_target_group(void)
+{
+ return &cxl_region_target_group;
+}
+
+static const struct attribute_group *region_groups[] = {
+ &cxl_base_attribute_group,
+ &cxl_region_group,
+ &cxl_region_target_group,
+ &cxl_region_access0_coordinate_group,
+ &cxl_region_access1_coordinate_group,
+ NULL,
+};
+
+static void cxl_region_release(struct device *dev)
+{
+ struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev->parent);
+ struct cxl_region *cxlr = to_cxl_region(dev);
+ int id = atomic_read(&cxlrd->region_id);
+
+ /*
+ * Try to reuse the recently idled id rather than the cached
+ * next id to prevent the region id space from increasing
+ * unnecessarily.
+ */
+ if (cxlr->id < id)
+ if (atomic_try_cmpxchg(&cxlrd->region_id, &id, cxlr->id)) {
+ memregion_free(id);
+ goto out;
+ }
+
+ memregion_free(cxlr->id);
+out:
+ put_device(dev->parent);
+ kfree(cxlr);
+}
+
+const struct device_type cxl_region_type = {
+ .name = "cxl_region",
+ .release = cxl_region_release,
+ .groups = region_groups
+};
+
+bool is_cxl_region(struct device *dev)
+{
+ return dev->type == &cxl_region_type;
+}
+EXPORT_SYMBOL_NS_GPL(is_cxl_region, "CXL");
+
+static struct cxl_region *to_cxl_region(struct device *dev)
+{
+ if (dev_WARN_ONCE(dev, dev->type != &cxl_region_type,
+ "not a cxl_region device\n"))
+ return NULL;
+
+ return container_of(dev, struct cxl_region, dev);
+}
+
+static void unregister_region(void *_cxlr)
+{
+ struct cxl_region *cxlr = _cxlr;
+ struct cxl_region_params *p = &cxlr->params;
+ int i;
+
+ device_del(&cxlr->dev);
+
+	/*
+	 * Now that region sysfs is shut down, the parameter block is
+	 * read-only, so there is no need to hold the region rwsem to
+	 * access the region parameters.
+	 */
+ for (i = 0; i < p->interleave_ways; i++)
+ detach_target(cxlr, i);
+
+ cxl_region_iomem_release(cxlr);
+ put_device(&cxlr->dev);
+}
+
+static struct lock_class_key cxl_region_key;
+
+static struct cxl_region *cxl_region_alloc(struct cxl_root_decoder *cxlrd, int id)
+{
+ struct cxl_region *cxlr;
+ struct device *dev;
+
+ cxlr = kzalloc(sizeof(*cxlr), GFP_KERNEL);
+ if (!cxlr) {
+ memregion_free(id);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ dev = &cxlr->dev;
+ device_initialize(dev);
+ lockdep_set_class(&dev->mutex, &cxl_region_key);
+ dev->parent = &cxlrd->cxlsd.cxld.dev;
+ /*
+ * Keep root decoder pinned through cxl_region_release to fixup
+ * region id allocations
+ */
+ get_device(dev->parent);
+ device_set_pm_not_required(dev);
+ dev->bus = &cxl_bus_type;
+ dev->type = &cxl_region_type;
+ cxlr->id = id;
+ cxl_region_set_lock(cxlr, &cxlrd->cxlsd.cxld);
+
+ return cxlr;
+}
+
+static bool cxl_region_update_coordinates(struct cxl_region *cxlr, int nid)
+{
+ int cset = 0;
+ int rc;
+
+ for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) {
+ if (cxlr->coord[i].read_bandwidth) {
+ node_update_perf_attrs(nid, &cxlr->coord[i], i);
+ cset++;
+ }
+ }
+
+ if (!cset)
+ return false;
+
+ rc = sysfs_update_group(&cxlr->dev.kobj, get_cxl_region_access0_group());
+ if (rc)
+ dev_dbg(&cxlr->dev, "Failed to update access0 group\n");
+
+ rc = sysfs_update_group(&cxlr->dev.kobj, get_cxl_region_access1_group());
+ if (rc)
+ dev_dbg(&cxlr->dev, "Failed to update access1 group\n");
+
+ return true;
+}
+
+static int cxl_region_perf_attrs_callback(struct notifier_block *nb,
+ unsigned long action, void *arg)
+{
+ struct cxl_region *cxlr = container_of(nb, struct cxl_region,
+ node_notifier);
+ struct node_notify *nn = arg;
+ int nid = nn->nid;
+ int region_nid;
+
+ if (action != NODE_ADDED_FIRST_MEMORY)
+ return NOTIFY_DONE;
+
+ /*
+ * No need to hold cxl_rwsem.region; region parameters are stable
+ * within the cxl_region driver.
+ */
+ region_nid = phys_to_target_node(cxlr->params.res->start);
+ if (nid != region_nid)
+ return NOTIFY_DONE;
+
+ /* No action needed if node bit already set */
+ if (node_test_and_set(nid, nodemask_region_seen))
+ return NOTIFY_DONE;
+
+ if (!cxl_region_update_coordinates(cxlr, nid))
+ return NOTIFY_DONE;
+
+ return NOTIFY_OK;
+}
+
+static int cxl_region_calculate_adistance(struct notifier_block *nb,
+ unsigned long nid, void *data)
+{
+ struct cxl_region *cxlr = container_of(nb, struct cxl_region,
+ adist_notifier);
+ struct access_coordinate *perf;
+ int *adist = data;
+ int region_nid;
+
+ /*
+ * No need to hold cxl_rwsem.region; region parameters are stable
+ * within the cxl_region driver.
+ */
+ region_nid = phys_to_target_node(cxlr->params.res->start);
+ if (nid != region_nid)
+ return NOTIFY_OK;
+
+ perf = &cxlr->coord[ACCESS_COORDINATE_CPU];
+
+ if (mt_perf_to_adistance(perf, adist))
+ return NOTIFY_OK;
+
+ return NOTIFY_STOP;
+}
+
+/**
+ * devm_cxl_add_region() - Add a region to a decoder
+ * @cxlrd: root decoder
+ * @id: memregion id to create; memregion_free() is called on failure
+ * @mode: mode for the endpoint decoders of this region
+ * @type: select whether this is an expander or accelerator (type-2 or type-3)
+ *
+ * This is the second step of region initialization. Regions exist within an
+ * address space which is mapped by a @cxlrd.
+ *
+ * Return: the new region on success, or an ERR_PTR() encoded error on failure.
+ * The region will be named "regionZ" where Z is the unique region number.
+ */
+static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd,
+ int id,
+ enum cxl_partition_mode mode,
+ enum cxl_decoder_type type)
+{
+ struct cxl_port *port = to_cxl_port(cxlrd->cxlsd.cxld.dev.parent);
+ struct cxl_region *cxlr;
+ struct device *dev;
+ int rc;
+
+ cxlr = cxl_region_alloc(cxlrd, id);
+ if (IS_ERR(cxlr))
+ return cxlr;
+ cxlr->mode = mode;
+ cxlr->type = type;
+
+ dev = &cxlr->dev;
+ rc = dev_set_name(dev, "region%d", id);
+ if (rc)
+ goto err;
+
+ rc = device_add(dev);
+ if (rc)
+ goto err;
+
+ rc = devm_add_action_or_reset(port->uport_dev, unregister_region, cxlr);
+ if (rc)
+ return ERR_PTR(rc);
+
+ dev_dbg(port->uport_dev, "%s: created %s\n",
+ dev_name(&cxlrd->cxlsd.cxld.dev), dev_name(dev));
+ return cxlr;
+
+err:
+ put_device(dev);
+ return ERR_PTR(rc);
+}
+
+static ssize_t __create_region_show(struct cxl_root_decoder *cxlrd, char *buf)
+{
+ return sysfs_emit(buf, "region%u\n", atomic_read(&cxlrd->region_id));
+}
+
+static ssize_t create_pmem_region_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return __create_region_show(to_cxl_root_decoder(dev), buf);
+}
+
+static ssize_t create_ram_region_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return __create_region_show(to_cxl_root_decoder(dev), buf);
+}
+
+static struct cxl_region *__create_region(struct cxl_root_decoder *cxlrd,
+ enum cxl_partition_mode mode, int id)
+{
+ int rc;
+
+ switch (mode) {
+ case CXL_PARTMODE_RAM:
+ case CXL_PARTMODE_PMEM:
+ break;
+ default:
+ dev_err(&cxlrd->cxlsd.cxld.dev, "unsupported mode %d\n", mode);
+ return ERR_PTR(-EINVAL);
+ }
+
+ rc = memregion_alloc(GFP_KERNEL);
+ if (rc < 0)
+ return ERR_PTR(rc);
+
+ if (atomic_cmpxchg(&cxlrd->region_id, id, rc) != id) {
+ memregion_free(rc);
+ return ERR_PTR(-EBUSY);
+ }
+
+ return devm_cxl_add_region(cxlrd, id, mode, CXL_DECODER_HOSTONLYMEM);
+}
+
+static ssize_t create_region_store(struct device *dev, const char *buf,
+ size_t len, enum cxl_partition_mode mode)
+{
+ struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev);
+ struct cxl_region *cxlr;
+ int rc, id;
+
+ rc = sscanf(buf, "region%d\n", &id);
+ if (rc != 1)
+ return -EINVAL;
+
+ cxlr = __create_region(cxlrd, mode, id);
+ if (IS_ERR(cxlr))
+ return PTR_ERR(cxlr);
+
+ return len;
+}
+
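+/*
+ * Illustrative usage (path assumed): reading create_pmem_region or
+ * create_ram_region under the root decoder returns the next region
+ * name, e.g. "region0"; writing that same name back creates the
+ * region. A stale name loses the region_id cmpxchg in
+ * __create_region() and fails with -EBUSY.
+ */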
+static ssize_t create_pmem_region_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return create_region_store(dev, buf, len, CXL_PARTMODE_PMEM);
+}
+DEVICE_ATTR_RW(create_pmem_region);
+
+static ssize_t create_ram_region_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return create_region_store(dev, buf, len, CXL_PARTMODE_RAM);
+}
+DEVICE_ATTR_RW(create_ram_region);
+
+static ssize_t region_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct cxl_decoder *cxld = to_cxl_decoder(dev);
+ ssize_t rc;
+
+ ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
+ if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
+ return rc;
+
+ if (cxld->region)
+ return sysfs_emit(buf, "%s\n", dev_name(&cxld->region->dev));
+ return sysfs_emit(buf, "\n");
+}
+DEVICE_ATTR_RO(region);
+
+static struct cxl_region *
+cxl_find_region_by_name(struct cxl_root_decoder *cxlrd, const char *name)
+{
+ struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld;
+ struct device *region_dev;
+
+ region_dev = device_find_child_by_name(&cxld->dev, name);
+ if (!region_dev)
+ return ERR_PTR(-ENODEV);
+
+ return to_cxl_region(region_dev);
+}
+
+static ssize_t delete_region_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev);
+ struct cxl_port *port = to_cxl_port(dev->parent);
+ struct cxl_region *cxlr;
+
+ cxlr = cxl_find_region_by_name(cxlrd, buf);
+ if (IS_ERR(cxlr))
+ return PTR_ERR(cxlr);
+
+ devm_release_action(port->uport_dev, unregister_region, cxlr);
+ put_device(&cxlr->dev);
+
+ return len;
+}
+DEVICE_ATTR_WO(delete_region);
+
+static void cxl_pmem_region_release(struct device *dev)
+{
+ struct cxl_pmem_region *cxlr_pmem = to_cxl_pmem_region(dev);
+ int i;
+
+ for (i = 0; i < cxlr_pmem->nr_mappings; i++) {
+ struct cxl_memdev *cxlmd = cxlr_pmem->mapping[i].cxlmd;
+
+ put_device(&cxlmd->dev);
+ }
+
+ kfree(cxlr_pmem);
+}
+
+static const struct attribute_group *cxl_pmem_region_attribute_groups[] = {
+ &cxl_base_attribute_group,
+ NULL,
+};
+
+const struct device_type cxl_pmem_region_type = {
+ .name = "cxl_pmem_region",
+ .release = cxl_pmem_region_release,
+ .groups = cxl_pmem_region_attribute_groups,
+};
+
+bool is_cxl_pmem_region(struct device *dev)
+{
+ return dev->type == &cxl_pmem_region_type;
+}
+EXPORT_SYMBOL_NS_GPL(is_cxl_pmem_region, "CXL");
+
+struct cxl_pmem_region *to_cxl_pmem_region(struct device *dev)
+{
+ if (dev_WARN_ONCE(dev, !is_cxl_pmem_region(dev),
+ "not a cxl_pmem_region device\n"))
+ return NULL;
+ return container_of(dev, struct cxl_pmem_region, dev);
+}
+EXPORT_SYMBOL_NS_GPL(to_cxl_pmem_region, "CXL");
+
+struct cxl_poison_context {
+ struct cxl_port *port;
+ int part;
+ u64 offset;
+};
+
+static int cxl_get_poison_unmapped(struct cxl_memdev *cxlmd,
+ struct cxl_poison_context *ctx)
+{
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+ const struct resource *res;
+ struct resource *p, *last;
+ u64 offset, length;
+ int rc = 0;
+
+ if (ctx->part < 0)
+ return 0;
+
+ /*
+ * Collect poison for the remaining unmapped resources after
+ * poison is collected by committed endpoints decoders.
+ */
+ for (int i = ctx->part; i < cxlds->nr_partitions; i++) {
+ res = &cxlds->part[i].res;
+ for (p = res->child, last = NULL; p; p = p->sibling)
+ last = p;
+ if (last)
+ offset = last->end + 1;
+ else
+ offset = res->start;
+ length = res->end - offset + 1;
+ if (!length)
+ break;
+ rc = cxl_mem_get_poison(cxlmd, offset, length, NULL);
+ if (rc == -EFAULT && cxlds->part[i].mode == CXL_PARTMODE_RAM)
+ continue;
+ if (rc)
+ break;
+ }
+
+ return rc;
+}
+
+static int poison_by_decoder(struct device *dev, void *arg)
+{
+ struct cxl_poison_context *ctx = arg;
+ struct cxl_endpoint_decoder *cxled;
+ enum cxl_partition_mode mode;
+ struct cxl_dev_state *cxlds;
+ struct cxl_memdev *cxlmd;
+ u64 offset, length;
+ int rc = 0;
+
+ if (!is_endpoint_decoder(dev))
+ return rc;
+
+ cxled = to_cxl_endpoint_decoder(dev);
+ if (!cxled->dpa_res)
+ return rc;
+
+ cxlmd = cxled_to_memdev(cxled);
+ cxlds = cxlmd->cxlds;
+ mode = cxlds->part[cxled->part].mode;
+
+ if (cxled->skip) {
+ offset = cxled->dpa_res->start - cxled->skip;
+ length = cxled->skip;
+ rc = cxl_mem_get_poison(cxlmd, offset, length, NULL);
+ if (rc == -EFAULT && mode == CXL_PARTMODE_RAM)
+ rc = 0;
+ if (rc)
+ return rc;
+ }
+
+ offset = cxled->dpa_res->start;
+ length = cxled->dpa_res->end - offset + 1;
+ rc = cxl_mem_get_poison(cxlmd, offset, length, cxled->cxld.region);
+ if (rc == -EFAULT && mode == CXL_PARTMODE_RAM)
+ rc = 0;
+ if (rc)
+ return rc;
+
+ /* Iterate until commit_end is reached */
+ if (cxled->cxld.id == ctx->port->commit_end) {
+ ctx->offset = cxled->dpa_res->end + 1;
+ ctx->part = cxled->part;
+ return 1;
+ }
+
+ return 0;
+}
+
+int cxl_get_poison_by_endpoint(struct cxl_port *port)
+{
+ struct cxl_poison_context ctx;
+ int rc = 0;
+
+ ctx = (struct cxl_poison_context) {
+ .port = port,
+ .part = -1,
+ };
+
+ rc = device_for_each_child(&port->dev, &ctx, poison_by_decoder);
+ if (rc == 1)
+ rc = cxl_get_poison_unmapped(to_cxl_memdev(port->uport_dev),
+ &ctx);
+
+ return rc;
+}
+
+struct cxl_dpa_to_region_context {
+ struct cxl_region *cxlr;
+ u64 dpa;
+};
+
+static int __cxl_dpa_to_region(struct device *dev, void *arg)
+{
+ struct cxl_dpa_to_region_context *ctx = arg;
+ struct cxl_endpoint_decoder *cxled;
+ struct cxl_region *cxlr;
+ u64 dpa = ctx->dpa;
+
+ if (!is_endpoint_decoder(dev))
+ return 0;
+
+ cxled = to_cxl_endpoint_decoder(dev);
+ if (!cxled || !cxled->dpa_res || !resource_size(cxled->dpa_res))
+ return 0;
+
+ if (!cxl_resource_contains_addr(cxled->dpa_res, dpa))
+ return 0;
+
+ /*
+ * Stop the region search (return 1) when an endpoint mapping is
+	 * found. The region may not be fully constructed yet, so a
+	 * non-NULL cxlr in the context structure is not guaranteed.
+ */
+ cxlr = cxled->cxld.region;
+ if (cxlr)
+ dev_dbg(dev, "dpa:0x%llx mapped in region:%s\n", dpa,
+ dev_name(&cxlr->dev));
+ else
+ dev_dbg(dev, "dpa:0x%llx mapped in endpoint:%s\n", dpa,
+ dev_name(dev));
+
+ ctx->cxlr = cxlr;
+
+ return 1;
+}
+
+struct cxl_region *cxl_dpa_to_region(const struct cxl_memdev *cxlmd, u64 dpa)
+{
+ struct cxl_dpa_to_region_context ctx;
+ struct cxl_port *port;
+
+ ctx = (struct cxl_dpa_to_region_context) {
+ .dpa = dpa,
+ };
+ port = cxlmd->endpoint;
+ if (port && is_cxl_endpoint(port) && cxl_num_decoders_committed(port))
+ device_for_each_child(&port->dev, &ctx, __cxl_dpa_to_region);
+
+ return ctx.cxlr;
+}
+
+static bool cxl_is_hpa_in_chunk(u64 hpa, struct cxl_region *cxlr, int pos)
+{
+ struct cxl_region_params *p = &cxlr->params;
+ int gran = p->interleave_granularity;
+ int ways = p->interleave_ways;
+ u64 offset;
+
+	/* Is the hpa in the expected chunk for its position (pos)? */
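+	/*
+	 * Illustrative example: with gran = 256 and ways = 2, position 1
+	 * owns bytes [256, 512) of every 512-byte interleave line, so a
+	 * valid hpa offset must land in that window of its line.
+	 */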
+ offset = hpa - p->res->start;
+ offset = do_div(offset, gran * ways);
+ if ((offset >= pos * gran) && (offset < (pos + 1) * gran))
+ return true;
+
+ dev_dbg(&cxlr->dev,
+ "Addr trans fail: hpa 0x%llx not in expected chunk\n", hpa);
+
+ return false;
+}
+
+#define CXL_POS_ZERO 0
+/**
+ * cxl_validate_translation_params() - Validate interleave translation parameters
+ * @eiw: encoded interleave ways
+ * @eig: encoded interleave granularity
+ * @pos: position in interleave
+ *
+ * Callers pass CXL_POS_ZERO when no position parameter needs validating.
+ *
+ * Returns: 0 on success, -EINVAL on first invalid parameter
+ */
+int cxl_validate_translation_params(u8 eiw, u16 eig, int pos)
+{
+ int ways, gran;
+
+ if (eiw_to_ways(eiw, &ways)) {
+ pr_debug("%s: invalid eiw=%u\n", __func__, eiw);
+ return -EINVAL;
+ }
+ if (eig_to_granularity(eig, &gran)) {
+ pr_debug("%s: invalid eig=%u\n", __func__, eig);
+ return -EINVAL;
+ }
+ if (pos < 0 || pos >= ways) {
+ pr_debug("%s: invalid pos=%d for ways=%u\n", __func__, pos,
+ ways);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_FOR_MODULES(cxl_validate_translation_params, "cxl_translate");
+
+u64 cxl_calculate_dpa_offset(u64 hpa_offset, u8 eiw, u16 eig)
+{
+ u64 dpa_offset, bits_lower, bits_upper, temp;
+ int ret;
+
+ ret = cxl_validate_translation_params(eiw, eig, CXL_POS_ZERO);
+ if (ret)
+ return ULLONG_MAX;
+
+ /*
+ * DPA offset: CXL Spec 3.2 Section 8.2.4.20.13
+ * Lower bits [IG+7:0] pass through unchanged
+ * (eiw < 8)
+ * Per spec: DPAOffset[51:IG+8] = (HPAOffset[51:IG+IW+8] >> IW)
+ * Clear the position bits to isolate upper section, then
+ * reverse the left shift by eiw that occurred during DPA->HPA
+ * (eiw >= 8)
+ * Per spec: DPAOffset[51:IG+8] = HPAOffset[51:IG+IW] / 3
+ * Extract upper bits from the correct bit range and divide by 3
+ * to recover the original DPA upper bits
+ */
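+	/*
+	 * Illustrative example (not from the spec): with eig = 0 (256B
+	 * granularity) and eiw = 1 (2 ways), hpa_offset 0x300 keeps
+	 * bits_lower 0x00, the remaining upper bits 0x200 shift right by
+	 * eiw to give dpa_offset 0x100, i.e. the device's second 256B
+	 * chunk.
+	 */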
+ bits_lower = hpa_offset & GENMASK_ULL(eig + 7, 0);
+ if (eiw < 8) {
+		temp = hpa_offset & ~GENMASK_ULL(eig + eiw + 8 - 1, 0);
+ dpa_offset = temp >> eiw;
+ } else {
+ bits_upper = div64_u64(hpa_offset >> (eig + eiw), 3);
+ dpa_offset = bits_upper << (eig + 8);
+ }
+ dpa_offset |= bits_lower;
+
+ return dpa_offset;
+}
+EXPORT_SYMBOL_FOR_MODULES(cxl_calculate_dpa_offset, "cxl_translate");
+
+int cxl_calculate_position(u64 hpa_offset, u8 eiw, u16 eig)
+{
+ unsigned int ways = 0;
+ u64 shifted, rem;
+ int pos, ret;
+
+ ret = cxl_validate_translation_params(eiw, eig, CXL_POS_ZERO);
+ if (ret)
+ return ret;
+
+ if (!eiw)
+ /* position is 0 if no interleaving */
+ return 0;
+
+ /*
+ * Interleave position: CXL Spec 3.2 Section 8.2.4.20.13
+ * eiw < 8
+ * Position is in the IW bits at HPA_OFFSET[IG+8+IW-1:IG+8].
+ * Per spec "remove IW bits starting with bit position IG+8"
+ * eiw >= 8
+ * Position is not explicitly stored in HPA_OFFSET bits. It is
+ * derived from the modulo operation of the upper bits using
+ * the total number of interleave ways.
+ */
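+	/*
+	 * Illustrative example: with eig = 0 and eiw = 1 (2 ways),
+	 * hpa_offset 0x300 yields pos = (0x300 >> 8) & 0x1 = 1, matching
+	 * the cxl_calculate_dpa_offset() example above.
+	 */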
+ if (eiw < 8) {
+ pos = (hpa_offset >> (eig + 8)) & GENMASK(eiw - 1, 0);
+ } else {
+ shifted = hpa_offset >> (eig + 8);
+ eiw_to_ways(eiw, &ways);
+ div64_u64_rem(shifted, ways, &rem);
+ pos = rem;
+ }
+
+ return pos;
+}
+EXPORT_SYMBOL_FOR_MODULES(cxl_calculate_position, "cxl_translate");
+
+u64 cxl_calculate_hpa_offset(u64 dpa_offset, int pos, u8 eiw, u16 eig)
+{
+ u64 mask_upper, hpa_offset, bits_upper;
+ int ret;
+
+ ret = cxl_validate_translation_params(eiw, eig, pos);
+ if (ret)
+ return ULLONG_MAX;
+
+ /*
+ * The device position in the region interleave set was removed
+ * from the offset at HPA->DPA translation. To reconstruct the
+ * HPA, place the 'pos' in the offset.
+ *
+ * The placement of 'pos' in the HPA is determined by interleave
+ * ways and granularity and is defined in the CXL Spec 3.0 Section
+ * 8.2.4.19.13 Implementation Note: Device Decode Logic
+ */
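+	/*
+	 * Illustrative example: reversing the cxl_calculate_dpa_offset()
+	 * example, dpa_offset 0x100 with pos = 1, eiw = 1, eig = 0 gives
+	 * (0x100 << 1) | (1 << 8) = 0x300, recovering the original
+	 * hpa_offset.
+	 */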
+
+ mask_upper = GENMASK_ULL(51, eig + 8);
+
+ if (eiw < 8) {
+ hpa_offset = (dpa_offset & mask_upper) << eiw;
+ hpa_offset |= pos << (eig + 8);
+ } else {
+ bits_upper = (dpa_offset & mask_upper) >> (eig + 8);
+ bits_upper = bits_upper * 3;
+ hpa_offset = ((bits_upper << (eiw - 8)) + pos) << (eig + 8);
+ }
+
+ /* The lower bits remain unchanged */
+ hpa_offset |= dpa_offset & GENMASK_ULL(eig + 7, 0);
+
+ return hpa_offset;
+}
+EXPORT_SYMBOL_FOR_MODULES(cxl_calculate_hpa_offset, "cxl_translate");
+
+u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd,
+ u64 dpa)
+{
+ struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
+ struct cxl_region_params *p = &cxlr->params;
+ struct cxl_endpoint_decoder *cxled = NULL;
+ u64 dpa_offset, hpa_offset, hpa;
+ u16 eig = 0;
+ u8 eiw = 0;
+ int pos;
+
+ for (int i = 0; i < p->nr_targets; i++) {
+ if (cxlmd == cxled_to_memdev(p->targets[i])) {
+ cxled = p->targets[i];
+ break;
+ }
+ }
+ if (!cxled)
+ return ULLONG_MAX;
+
+ pos = cxled->pos;
+ ways_to_eiw(p->interleave_ways, &eiw);
+ granularity_to_eig(p->interleave_granularity, &eig);
+
+ dpa_offset = dpa - cxl_dpa_resource_start(cxled);
+ hpa_offset = cxl_calculate_hpa_offset(dpa_offset, pos, eiw, eig);
+
+ /* Apply the hpa_offset to the region base address */
+ hpa = hpa_offset + p->res->start + p->cache_size;
+
+ /* Root decoder translation overrides typical modulo decode */
+ if (cxlrd->ops.hpa_to_spa)
+ hpa = cxlrd->ops.hpa_to_spa(cxlrd, hpa);
+
+ if (!cxl_resource_contains_addr(p->res, hpa)) {
+ dev_dbg(&cxlr->dev,
+ "Addr trans fail: hpa 0x%llx not in region\n", hpa);
+ return ULLONG_MAX;
+ }
+
+ /* Simple chunk check, by pos & gran, only applies to modulo decodes */
+ if (!cxlrd->ops.hpa_to_spa && !cxl_is_hpa_in_chunk(hpa, cxlr, pos))
+ return ULLONG_MAX;
+
+ return hpa;
+}
+
+struct dpa_result {
+ struct cxl_memdev *cxlmd;
+ u64 dpa;
+};
+
+static int region_offset_to_dpa_result(struct cxl_region *cxlr, u64 offset,
+ struct dpa_result *result)
+{
+ struct cxl_region_params *p = &cxlr->params;
+ struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
+ struct cxl_endpoint_decoder *cxled;
+ u64 hpa, hpa_offset, dpa_offset;
+ u16 eig = 0;
+ u8 eiw = 0;
+ int pos;
+
+ lockdep_assert_held(&cxl_rwsem.region);
+ lockdep_assert_held(&cxl_rwsem.dpa);
+
+ /* Input validation ensures valid ways and gran */
+ granularity_to_eig(p->interleave_granularity, &eig);
+ ways_to_eiw(p->interleave_ways, &eiw);
+
+	/*
+	 * If the root decoder has an SPA to CXL HPA callback, use it.
+	 * Otherwise CXL HPA is assumed to equal SPA.
+	 */
+ if (cxlrd->ops.spa_to_hpa) {
+ hpa = cxlrd->ops.spa_to_hpa(cxlrd, p->res->start + offset);
+ hpa_offset = hpa - p->res->start;
+ } else {
+ hpa_offset = offset;
+ }
+
+ pos = cxl_calculate_position(hpa_offset, eiw, eig);
+ if (pos < 0 || pos >= p->nr_targets) {
+ dev_dbg(&cxlr->dev, "Invalid position %d for %d targets\n",
+ pos, p->nr_targets);
+ return -ENXIO;
+ }
+
+ dpa_offset = cxl_calculate_dpa_offset(hpa_offset, eiw, eig);
+
+ /* Look-up and return the result: a memdev and a DPA */
+ for (int i = 0; i < p->nr_targets; i++) {
+ cxled = p->targets[i];
+ if (cxled->pos != pos)
+ continue;
+ result->cxlmd = cxled_to_memdev(cxled);
+ result->dpa = cxl_dpa_resource_start(cxled) + dpa_offset;
+
+ return 0;
+ }
+ dev_err(&cxlr->dev, "No device found for position %d\n", pos);
+
+ return -ENXIO;
+}
+
+static struct lock_class_key cxl_pmem_region_key;
+
+static int cxl_pmem_region_alloc(struct cxl_region *cxlr)
+{
+ struct cxl_region_params *p = &cxlr->params;
+ struct cxl_nvdimm_bridge *cxl_nvb;
+ struct device *dev;
+ int i;
+
+ guard(rwsem_read)(&cxl_rwsem.region);
+ if (p->state != CXL_CONFIG_COMMIT)
+ return -ENXIO;
+
+ struct cxl_pmem_region *cxlr_pmem __free(kfree) =
+ kzalloc(struct_size(cxlr_pmem, mapping, p->nr_targets), GFP_KERNEL);
+ if (!cxlr_pmem)
+ return -ENOMEM;
+
+ cxlr_pmem->hpa_range.start = p->res->start;
+ cxlr_pmem->hpa_range.end = p->res->end;
+
+ /* Snapshot the region configuration underneath the cxl_rwsem.region */
+ cxlr_pmem->nr_mappings = p->nr_targets;
+ for (i = 0; i < p->nr_targets; i++) {
+ struct cxl_endpoint_decoder *cxled = p->targets[i];
+ struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+ struct cxl_pmem_region_mapping *m = &cxlr_pmem->mapping[i];
+
+ /*
+ * Regions never span CXL root devices, so by definition the
+ * bridge for one device is the same for all.
+ */
+ if (i == 0) {
+ cxl_nvb = cxl_find_nvdimm_bridge(cxlmd->endpoint);
+ if (!cxl_nvb)
+ return -ENODEV;
+ cxlr->cxl_nvb = cxl_nvb;
+ }
+ m->cxlmd = cxlmd;
+ get_device(&cxlmd->dev);
+ m->start = cxled->dpa_res->start;
+ m->size = resource_size(cxled->dpa_res);
+ m->position = i;
+ }
+
+ dev = &cxlr_pmem->dev;
+ device_initialize(dev);
+ lockdep_set_class(&dev->mutex, &cxl_pmem_region_key);
+ device_set_pm_not_required(dev);
+ dev->parent = &cxlr->dev;
+ dev->bus = &cxl_bus_type;
+ dev->type = &cxl_pmem_region_type;
+ cxlr_pmem->cxlr = cxlr;
+ cxlr->cxlr_pmem = no_free_ptr(cxlr_pmem);
+
+ return 0;
+}
+
+static void cxl_dax_region_release(struct device *dev)
+{
+ struct cxl_dax_region *cxlr_dax = to_cxl_dax_region(dev);
+
+ kfree(cxlr_dax);
+}
+
+static const struct attribute_group *cxl_dax_region_attribute_groups[] = {
+ &cxl_base_attribute_group,
+ NULL,
+};
+
+const struct device_type cxl_dax_region_type = {
+ .name = "cxl_dax_region",
+ .release = cxl_dax_region_release,
+ .groups = cxl_dax_region_attribute_groups,
+};
+
+static bool is_cxl_dax_region(struct device *dev)
+{
+ return dev->type == &cxl_dax_region_type;
+}
+
+struct cxl_dax_region *to_cxl_dax_region(struct device *dev)
+{
+ if (dev_WARN_ONCE(dev, !is_cxl_dax_region(dev),
+ "not a cxl_dax_region device\n"))
+ return NULL;
+ return container_of(dev, struct cxl_dax_region, dev);
+}
+EXPORT_SYMBOL_NS_GPL(to_cxl_dax_region, "CXL");
+
+static struct lock_class_key cxl_dax_region_key;
+
+static struct cxl_dax_region *cxl_dax_region_alloc(struct cxl_region *cxlr)
+{
+ struct cxl_region_params *p = &cxlr->params;
+ struct cxl_dax_region *cxlr_dax;
+ struct device *dev;
+
+ guard(rwsem_read)(&cxl_rwsem.region);
+ if (p->state != CXL_CONFIG_COMMIT)
+ return ERR_PTR(-ENXIO);
+
+ cxlr_dax = kzalloc(sizeof(*cxlr_dax), GFP_KERNEL);
+ if (!cxlr_dax)
+ return ERR_PTR(-ENOMEM);
+
+ cxlr_dax->hpa_range.start = p->res->start;
+ cxlr_dax->hpa_range.end = p->res->end;
+
+ dev = &cxlr_dax->dev;
+ cxlr_dax->cxlr = cxlr;
+ device_initialize(dev);
+ lockdep_set_class(&dev->mutex, &cxl_dax_region_key);
+ device_set_pm_not_required(dev);
+ dev->parent = &cxlr->dev;
+ dev->bus = &cxl_bus_type;
+ dev->type = &cxl_dax_region_type;
+
+ return cxlr_dax;
+}
+
+static void cxlr_pmem_unregister(void *_cxlr_pmem)
+{
+ struct cxl_pmem_region *cxlr_pmem = _cxlr_pmem;
+ struct cxl_region *cxlr = cxlr_pmem->cxlr;
+ struct cxl_nvdimm_bridge *cxl_nvb = cxlr->cxl_nvb;
+
+ /*
+ * Either the bridge is in ->remove() context under the device_lock(),
+ * or cxlr_release_nvdimm() is cancelling the bridge's release action
+ * for @cxlr_pmem and doing it itself (while manually holding the bridge
+ * lock).
+ */
+ device_lock_assert(&cxl_nvb->dev);
+ cxlr->cxlr_pmem = NULL;
+ cxlr_pmem->cxlr = NULL;
+ device_unregister(&cxlr_pmem->dev);
+}
+
+static void cxlr_release_nvdimm(void *_cxlr)
+{
+ struct cxl_region *cxlr = _cxlr;
+ struct cxl_nvdimm_bridge *cxl_nvb = cxlr->cxl_nvb;
+
+ scoped_guard(device, &cxl_nvb->dev) {
+ if (cxlr->cxlr_pmem)
+ devm_release_action(&cxl_nvb->dev, cxlr_pmem_unregister,
+ cxlr->cxlr_pmem);
+ }
+ cxlr->cxl_nvb = NULL;
+ put_device(&cxl_nvb->dev);
+}
+
+/**
+ * devm_cxl_add_pmem_region() - add a cxl_region-to-nd_region bridge
+ * @cxlr: parent CXL region for this pmem region bridge device
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+static int devm_cxl_add_pmem_region(struct cxl_region *cxlr)
+{
+ struct cxl_pmem_region *cxlr_pmem;
+ struct cxl_nvdimm_bridge *cxl_nvb;
+ struct device *dev;
+ int rc;
+
+ rc = cxl_pmem_region_alloc(cxlr);
+ if (rc)
+ return rc;
+ cxlr_pmem = cxlr->cxlr_pmem;
+ cxl_nvb = cxlr->cxl_nvb;
+
+ dev = &cxlr_pmem->dev;
+ rc = dev_set_name(dev, "pmem_region%d", cxlr->id);
+ if (rc)
+ goto err;
+
+ rc = device_add(dev);
+ if (rc)
+ goto err;
+
+ dev_dbg(&cxlr->dev, "%s: register %s\n", dev_name(dev->parent),
+ dev_name(dev));
+
+ scoped_guard(device, &cxl_nvb->dev) {
+ if (cxl_nvb->dev.driver)
+ rc = devm_add_action_or_reset(&cxl_nvb->dev,
+ cxlr_pmem_unregister,
+ cxlr_pmem);
+ else
+ rc = -ENXIO;
+ }
+
+ if (rc)
+ goto err_bridge;
+
+ /* @cxlr carries a reference on @cxl_nvb until cxlr_release_nvdimm */
+ return devm_add_action_or_reset(&cxlr->dev, cxlr_release_nvdimm, cxlr);
+
+err:
+ put_device(dev);
+err_bridge:
+ put_device(&cxl_nvb->dev);
+ cxlr->cxl_nvb = NULL;
+ return rc;
+}
+
+static void cxlr_dax_unregister(void *_cxlr_dax)
+{
+ struct cxl_dax_region *cxlr_dax = _cxlr_dax;
+
+ device_unregister(&cxlr_dax->dev);
+}
+
+static int devm_cxl_add_dax_region(struct cxl_region *cxlr)
+{
+ struct cxl_dax_region *cxlr_dax;
+ struct device *dev;
+ int rc;
+
+ cxlr_dax = cxl_dax_region_alloc(cxlr);
+ if (IS_ERR(cxlr_dax))
+ return PTR_ERR(cxlr_dax);
+
+ dev = &cxlr_dax->dev;
+ rc = dev_set_name(dev, "dax_region%d", cxlr->id);
+ if (rc)
+ goto err;
+
+ rc = device_add(dev);
+ if (rc)
+ goto err;
+
+ dev_dbg(&cxlr->dev, "%s: register %s\n", dev_name(dev->parent),
+ dev_name(dev));
+
+ return devm_add_action_or_reset(&cxlr->dev, cxlr_dax_unregister,
+ cxlr_dax);
+err:
+ put_device(dev);
+ return rc;
+}
+
+static int match_decoder_by_range(struct device *dev, const void *data)
+{
+ const struct range *r1, *r2 = data;
+ struct cxl_decoder *cxld;
+
+ if (!is_switch_decoder(dev))
+ return 0;
+
+ cxld = to_cxl_decoder(dev);
+ r1 = &cxld->hpa_range;
+ return range_contains(r1, r2);
+}
+
+static struct cxl_decoder *
+cxl_port_find_switch_decoder(struct cxl_port *port, struct range *hpa)
+{
+ struct device *cxld_dev = device_find_child(&port->dev, hpa,
+ match_decoder_by_range);
+
+ return cxld_dev ? to_cxl_decoder(cxld_dev) : NULL;
+}
+
+static struct cxl_root_decoder *
+cxl_find_root_decoder(struct cxl_endpoint_decoder *cxled)
+{
+ struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+ struct cxl_port *port = cxled_to_port(cxled);
+ struct cxl_root *cxl_root __free(put_cxl_root) = find_cxl_root(port);
+ struct cxl_decoder *root, *cxld = &cxled->cxld;
+ struct range *hpa = &cxld->hpa_range;
+
+ root = cxl_port_find_switch_decoder(&cxl_root->port, hpa);
+ if (!root) {
+ dev_err(cxlmd->dev.parent,
+ "%s:%s no CXL window for range %#llx:%#llx\n",
+ dev_name(&cxlmd->dev), dev_name(&cxld->dev),
+ cxld->hpa_range.start, cxld->hpa_range.end);
+ return NULL;
+ }
+
+ return to_cxl_root_decoder(&root->dev);
+}
+
+static int match_region_by_range(struct device *dev, const void *data)
+{
+ struct cxl_region_params *p;
+ struct cxl_region *cxlr;
+ const struct range *r = data;
+
+ if (!is_cxl_region(dev))
+ return 0;
+
+ cxlr = to_cxl_region(dev);
+ p = &cxlr->params;
+
+ guard(rwsem_read)(&cxl_rwsem.region);
+ return spa_maps_hpa(p, r);
+}
+
+static int cxl_extended_linear_cache_resize(struct cxl_region *cxlr,
+ struct resource *res)
+{
+ struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
+ struct cxl_region_params *p = &cxlr->params;
+ resource_size_t size = resource_size(res);
+ resource_size_t cache_size, start;
+
+ cache_size = cxlrd->cache_size;
+ if (!cache_size)
+ return 0;
+
+ if (size != cache_size) {
+ dev_warn(&cxlr->dev,
+ "Extended Linear Cache size %pa != CXL size %pa. No Support!",
+ &cache_size, &size);
+ return -ENXIO;
+ }
+
+ /*
+ * Move the start of the range to where the cache range starts. The
+ * implementation assumes that the cache range is in front of the
+ * CXL range. This is not dictated by the HMAT spec but is how the
+ * current known implementation is configured.
+ *
+ * The cache range is expected to be within the CFMWS. The adjusted
+ * res->start should not be less than cxlrd->res->start.
+ */
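+	/*
+	 * Illustrative example: a 64GB DRAM cache fronting a 64GB CXL
+	 * window moves res->start down by 64GB so the region resource
+	 * covers the cache alias plus the CXL-backed portion.
+	 */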
+ start = res->start - cache_size;
+ if (start < cxlrd->res->start)
+ return -ENXIO;
+
+ res->start = start;
+ p->cache_size = cache_size;
+
+ return 0;
+}
+
+static int __construct_region(struct cxl_region *cxlr,
+ struct cxl_root_decoder *cxlrd,
+ struct cxl_endpoint_decoder *cxled)
+{
+ struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+ struct range *hpa = &cxled->cxld.hpa_range;
+ struct cxl_region_params *p;
+ struct resource *res;
+ int rc;
+
+ guard(rwsem_write)(&cxl_rwsem.region);
+ p = &cxlr->params;
+ if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE) {
+ dev_err(cxlmd->dev.parent,
+ "%s:%s: %s autodiscovery interrupted\n",
+ dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
+ __func__);
+ return -EBUSY;
+ }
+
+ set_bit(CXL_REGION_F_AUTO, &cxlr->flags);
+
+ res = kmalloc(sizeof(*res), GFP_KERNEL);
+ if (!res)
+ return -ENOMEM;
+
+ *res = DEFINE_RES_MEM_NAMED(hpa->start, range_len(hpa),
+ dev_name(&cxlr->dev));
+
+ rc = cxl_extended_linear_cache_resize(cxlr, res);
+ if (rc && rc != -EOPNOTSUPP) {
+		/*
+		 * Failing to resize for the extended linear cache does not
+		 * prevent the region from functioning. It only causes
+		 * 'cxl list' to report an incorrect region size.
+		 */
+ dev_warn(cxlmd->dev.parent,
+ "Extended linear cache calculation failed rc:%d\n", rc);
+ }
+
+ rc = sysfs_update_group(&cxlr->dev.kobj, &cxl_region_group);
+ if (rc)
+ return rc;
+
+ rc = insert_resource(cxlrd->res, res);
+ if (rc) {
+ /*
+ * Platform-firmware may not have split resources like "System
+		 * RAM" on CXL window boundaries, see cxl_region_iomem_release()
+ */
+ dev_warn(cxlmd->dev.parent,
+ "%s:%s: %s %s cannot insert resource\n",
+ dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
+ __func__, dev_name(&cxlr->dev));
+ }
+
+ p->res = res;
+ p->interleave_ways = cxled->cxld.interleave_ways;
+ p->interleave_granularity = cxled->cxld.interleave_granularity;
+ p->state = CXL_CONFIG_INTERLEAVE_ACTIVE;
+
+ rc = sysfs_update_group(&cxlr->dev.kobj, get_cxl_region_target_group());
+ if (rc)
+ return rc;
+
+ dev_dbg(cxlmd->dev.parent, "%s:%s: %s %s res: %pr iw: %d ig: %d\n",
+ dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), __func__,
+ dev_name(&cxlr->dev), p->res, p->interleave_ways,
+ p->interleave_granularity);
+
+ /* ...to match put_device() in cxl_add_to_region() */
+ get_device(&cxlr->dev);
+
+ return 0;
+}
+
+/* Establish an empty region covering the given HPA range */
+static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd,
+ struct cxl_endpoint_decoder *cxled)
+{
+ struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+ struct cxl_port *port = cxlrd_to_port(cxlrd);
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+ int rc, part = READ_ONCE(cxled->part);
+ struct cxl_region *cxlr;
+
+ do {
+ cxlr = __create_region(cxlrd, cxlds->part[part].mode,
+ atomic_read(&cxlrd->region_id));
+ } while (IS_ERR(cxlr) && PTR_ERR(cxlr) == -EBUSY);
+
+ if (IS_ERR(cxlr)) {
+ dev_err(cxlmd->dev.parent,
+ "%s:%s: %s failed assign region: %ld\n",
+ dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
+ __func__, PTR_ERR(cxlr));
+ return cxlr;
+ }
+
+ rc = __construct_region(cxlr, cxlrd, cxled);
+ if (rc) {
+ devm_release_action(port->uport_dev, unregister_region, cxlr);
+ return ERR_PTR(rc);
+ }
+
+ return cxlr;
+}
+
+static struct cxl_region *
+cxl_find_region_by_range(struct cxl_root_decoder *cxlrd, struct range *hpa)
+{
+ struct device *region_dev;
+
+ region_dev = device_find_child(&cxlrd->cxlsd.cxld.dev, hpa,
+ match_region_by_range);
+ if (!region_dev)
+ return NULL;
+
+ return to_cxl_region(region_dev);
+}
+
+int cxl_add_to_region(struct cxl_endpoint_decoder *cxled)
+{
+ struct range *hpa = &cxled->cxld.hpa_range;
+ struct cxl_region_params *p;
+ bool attach = false;
+ int rc;
+
+ struct cxl_root_decoder *cxlrd __free(put_cxl_root_decoder) =
+ cxl_find_root_decoder(cxled);
+ if (!cxlrd)
+ return -ENXIO;
+
+ /*
+ * Ensure that if multiple threads race to construct_region() for @hpa
+ * one does the construction and the others add to that.
+ */
+ mutex_lock(&cxlrd->range_lock);
+ struct cxl_region *cxlr __free(put_cxl_region) =
+ cxl_find_region_by_range(cxlrd, hpa);
+ if (!cxlr)
+ cxlr = construct_region(cxlrd, cxled);
+ mutex_unlock(&cxlrd->range_lock);
+
+ rc = PTR_ERR_OR_ZERO(cxlr);
+ if (rc)
+ return rc;
+
+ attach_target(cxlr, cxled, -1, TASK_UNINTERRUPTIBLE);
+
+ scoped_guard(rwsem_read, &cxl_rwsem.region) {
+ p = &cxlr->params;
+ attach = p->state == CXL_CONFIG_COMMIT;
+ }
+
+ if (attach) {
+ /*
+		 * If device_attach() fails, the range may still be active via
+		 * the platform-firmware memory map; otherwise the driver for
+		 * regions is local to this file, so driver matching can't fail.
+ */
+ if (device_attach(&cxlr->dev) < 0)
+ dev_err(&cxlr->dev, "failed to enable, range: %pr\n",
+ p->res);
+ }
+
+ return rc;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_add_to_region, "CXL");
+
+u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint, u64 spa)
+{
+ struct cxl_region_ref *iter;
+ unsigned long index;
+
+ if (!endpoint)
+ return ~0ULL;
+
+ guard(rwsem_write)(&cxl_rwsem.region);
+
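+	/*
+	 * With an extended linear cache the region spans two equal halves
+	 * of p->cache_size bytes that alias each other. Illustrative
+	 * example: in a 128GB region with a 64GB cache, spa and
+	 * spa +/- 64GB reference the same data.
+	 */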
+ xa_for_each(&endpoint->regions, index, iter) {
+ struct cxl_region_params *p = &iter->region->params;
+
+ if (cxl_resource_contains_addr(p->res, spa)) {
+ if (!p->cache_size)
+ return ~0ULL;
+
+ if (spa >= p->res->start + p->cache_size)
+ return spa - p->cache_size;
+
+ return spa + p->cache_size;
+ }
+ }
+
+ return ~0ULL;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_port_get_spa_cache_alias, "CXL");
+
+static int is_system_ram(struct resource *res, void *arg)
+{
+ struct cxl_region *cxlr = arg;
+ struct cxl_region_params *p = &cxlr->params;
+
+ dev_dbg(&cxlr->dev, "%pr has System RAM: %pr\n", p->res, res);
+ return 1;
+}
+
+static void shutdown_notifiers(void *_cxlr)
+{
+ struct cxl_region *cxlr = _cxlr;
+
+ unregister_node_notifier(&cxlr->node_notifier);
+ unregister_mt_adistance_algorithm(&cxlr->adist_notifier);
+}
+
+static void remove_debugfs(void *dentry)
+{
+ debugfs_remove_recursive(dentry);
+}
+
+static int validate_region_offset(struct cxl_region *cxlr, u64 offset)
+{
+ struct cxl_region_params *p = &cxlr->params;
+ resource_size_t region_size;
+ u64 hpa;
+
+ if (offset < p->cache_size) {
+ dev_err(&cxlr->dev,
+ "Offset %#llx is within extended linear cache %pa\n",
+ offset, &p->cache_size);
+ return -EINVAL;
+ }
+
+ region_size = resource_size(p->res);
+ if (offset >= region_size) {
+ dev_err(&cxlr->dev, "Offset %#llx exceeds region size %pa\n",
+ offset, &region_size);
+ return -EINVAL;
+ }
+
+ hpa = p->res->start + offset;
+ if (hpa < p->res->start || hpa > p->res->end) {
+ dev_err(&cxlr->dev, "HPA %#llx not in region %pr\n", hpa,
+ p->res);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int cxl_region_debugfs_poison_inject(void *data, u64 offset)
+{
+ struct dpa_result result = { .dpa = ULLONG_MAX, .cxlmd = NULL };
+ struct cxl_region *cxlr = data;
+ int rc;
+
+ ACQUIRE(rwsem_read_intr, region_rwsem)(&cxl_rwsem.region);
+ if ((rc = ACQUIRE_ERR(rwsem_read_intr, &region_rwsem)))
+ return rc;
+
+ ACQUIRE(rwsem_read_intr, dpa_rwsem)(&cxl_rwsem.dpa);
+ if ((rc = ACQUIRE_ERR(rwsem_read_intr, &dpa_rwsem)))
+ return rc;
+
+ if (validate_region_offset(cxlr, offset))
+ return -EINVAL;
+
+ offset -= cxlr->params.cache_size;
+ rc = region_offset_to_dpa_result(cxlr, offset, &result);
+ if (rc || !result.cxlmd || result.dpa == ULLONG_MAX) {
+ dev_dbg(&cxlr->dev,
+ "Failed to resolve DPA for region offset %#llx rc %d\n",
+ offset, rc);
+
+ return rc ? rc : -EINVAL;
+ }
+
+ return cxl_inject_poison_locked(result.cxlmd, result.dpa);
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(cxl_poison_inject_fops, NULL,
+ cxl_region_debugfs_poison_inject, "%llx\n");
+
+static int cxl_region_debugfs_poison_clear(void *data, u64 offset)
+{
+ struct dpa_result result = { .dpa = ULLONG_MAX, .cxlmd = NULL };
+ struct cxl_region *cxlr = data;
+ int rc;
+
+ ACQUIRE(rwsem_read_intr, region_rwsem)(&cxl_rwsem.region);
+ if ((rc = ACQUIRE_ERR(rwsem_read_intr, &region_rwsem)))
+ return rc;
+
+ ACQUIRE(rwsem_read_intr, dpa_rwsem)(&cxl_rwsem.dpa);
+ if ((rc = ACQUIRE_ERR(rwsem_read_intr, &dpa_rwsem)))
+ return rc;
+
+ if (validate_region_offset(cxlr, offset))
+ return -EINVAL;
+
+ offset -= cxlr->params.cache_size;
+ rc = region_offset_to_dpa_result(cxlr, offset, &result);
+ if (rc || !result.cxlmd || result.dpa == ULLONG_MAX) {
+ dev_dbg(&cxlr->dev,
+ "Failed to resolve DPA for region offset %#llx rc %d\n",
+ offset, rc);
+
+ return rc ? rc : -EINVAL;
+ }
+
+ return cxl_clear_poison_locked(result.cxlmd, result.dpa);
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(cxl_poison_clear_fops, NULL,
+ cxl_region_debugfs_poison_clear, "%llx\n");
+
+static int cxl_region_can_probe(struct cxl_region *cxlr)
+{
+ struct cxl_region_params *p = &cxlr->params;
+ int rc;
+
+ ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
+ if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem))) {
+ dev_dbg(&cxlr->dev, "probe interrupted\n");
+ return rc;
+ }
+
+ if (p->state < CXL_CONFIG_COMMIT) {
+ dev_dbg(&cxlr->dev, "config state: %d\n", p->state);
+ return -ENXIO;
+ }
+
+ if (test_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags)) {
+ dev_err(&cxlr->dev,
+ "failed to activate, re-commit region and retry\n");
+ return -ENXIO;
+ }
+
+ return 0;
+}
+
+static int cxl_region_probe(struct device *dev)
+{
+ struct cxl_region *cxlr = to_cxl_region(dev);
+ struct cxl_region_params *p = &cxlr->params;
+ bool poison_supported = true;
+ int rc;
+
+ rc = cxl_region_can_probe(cxlr);
+ if (rc)
+ return rc;
+
+ /*
+ * From this point on any path that changes the region's state away from
+ * CXL_CONFIG_COMMIT is also responsible for releasing the driver.
+ */
+
+ cxlr->node_notifier.notifier_call = cxl_region_perf_attrs_callback;
+ cxlr->node_notifier.priority = CXL_CALLBACK_PRI;
+ register_node_notifier(&cxlr->node_notifier);
+
+ cxlr->adist_notifier.notifier_call = cxl_region_calculate_adistance;
+ cxlr->adist_notifier.priority = 100;
+ register_mt_adistance_algorithm(&cxlr->adist_notifier);
+
+ rc = devm_add_action_or_reset(&cxlr->dev, shutdown_notifiers, cxlr);
+ if (rc)
+ return rc;
+
+ /* Create poison attributes if all memdevs support the capabilities */
+ for (int i = 0; i < p->nr_targets; i++) {
+ struct cxl_endpoint_decoder *cxled = p->targets[i];
+ struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+
+ if (!cxl_memdev_has_poison_cmd(cxlmd, CXL_POISON_ENABLED_INJECT) ||
+ !cxl_memdev_has_poison_cmd(cxlmd, CXL_POISON_ENABLED_CLEAR)) {
+ poison_supported = false;
+ break;
+ }
+ }
+
+ if (poison_supported) {
+ struct dentry *dentry;
+
+ dentry = cxl_debugfs_create_dir(dev_name(dev));
+ debugfs_create_file("inject_poison", 0200, dentry, cxlr,
+ &cxl_poison_inject_fops);
+ debugfs_create_file("clear_poison", 0200, dentry, cxlr,
+ &cxl_poison_clear_fops);
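+		/*
+		 * Illustrative path (assumed, not defined here): the files
+		 * land under the cxl debugfs root, e.g.
+		 * /sys/kernel/debug/cxl/regionX/{inject,clear}_poison, and
+		 * take a region byte offset that is translated to a memdev
+		 * DPA before issuing the poison command.
+		 */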
+ rc = devm_add_action_or_reset(dev, remove_debugfs, dentry);
+ if (rc)
+ return rc;
+ }
+
+ switch (cxlr->mode) {
+ case CXL_PARTMODE_PMEM:
+ rc = devm_cxl_region_edac_register(cxlr);
+ if (rc)
+ dev_dbg(&cxlr->dev, "CXL EDAC registration for region_id=%d failed\n",
+ cxlr->id);
+
+ return devm_cxl_add_pmem_region(cxlr);
+ case CXL_PARTMODE_RAM:
+ rc = devm_cxl_region_edac_register(cxlr);
+ if (rc)
+ dev_dbg(&cxlr->dev, "CXL EDAC registration for region_id=%d failed\n",
+ cxlr->id);
+
+ /*
+		 * The region cannot be managed by CXL if any portion of
+		 * it is already online as 'System RAM'
+ */
+ if (walk_iomem_res_desc(IORES_DESC_NONE,
+ IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
+ p->res->start, p->res->end, cxlr,
+ is_system_ram) > 0)
+ return 0;
+ return devm_cxl_add_dax_region(cxlr);
+ default:
+ dev_dbg(&cxlr->dev, "unsupported region mode: %d\n",
+ cxlr->mode);
+ return -ENXIO;
+ }
+}
+
+static struct cxl_driver cxl_region_driver = {
+ .name = "cxl_region",
+ .probe = cxl_region_probe,
+ .id = CXL_DEVICE_REGION,
+};
+
+int cxl_region_init(void)
+{
+ return cxl_driver_register(&cxl_region_driver);
+}
+
+void cxl_region_exit(void)
+{
+ cxl_driver_unregister(&cxl_region_driver);
+}
+
+MODULE_IMPORT_NS("CXL");
+MODULE_IMPORT_NS("DEVMEM");
+MODULE_ALIAS_CXL(CXL_DEVICE_REGION);
diff --git a/drivers/cxl/core/regs.c b/drivers/cxl/core/regs.c
new file mode 100644
index 000000000000..5ca7b0eed568
--- /dev/null
+++ b/drivers/cxl/core/regs.c
@@ -0,0 +1,644 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2020 Intel Corporation. */
+#include <linux/io-64-nonatomic-lo-hi.h>
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <cxlmem.h>
+#include <cxlpci.h>
+#include <pmu.h>
+
+#include "core.h"
+
+/**
+ * DOC: cxl registers
+ *
+ * CXL device capabilities are enumerated by PCI DVSEC (Designated
+ * Vendor-Specific Extended Capability) structures and / or descriptors
+ * provided by platform firmware.
+ * They can be defined as a set like the device and component registers
+ * mandated by CXL Section 8.1.12.2 Memory Device PCIe Capabilities and
+ * Extended Capabilities, or they can be individual capabilities
+ * appended to bridged and endpoint devices.
+ *
+ * Provide common infrastructure for enumerating and mapping these
+ * discrete capabilities.
+ */
+
+/**
+ * cxl_probe_component_regs() - Detect CXL Component register blocks
+ * @dev: Host device of the @base mapping
+ * @base: Mapping containing the HDM Decoder Capability Header
+ * @map: Map object describing the register block information found
+ *
+ * See CXL 2.0 8.2.4 Component Register Layout and Definition
+ * See CXL 2.0 8.2.5.5 CXL Device Register Interface
+ *
+ * Probe for component register information and return it in map object.
+ */
+void cxl_probe_component_regs(struct device *dev, void __iomem *base,
+ struct cxl_component_reg_map *map)
+{
+ int cap, cap_count;
+ u32 cap_array;
+
+ *map = (struct cxl_component_reg_map) { 0 };
+
+ /*
+ * CXL.cache and CXL.mem registers are at offset 0x1000 as defined in
+ * CXL 2.0 8.2.4 Table 141.
+ */
+ base += CXL_CM_OFFSET;
+
+ cap_array = readl(base + CXL_CM_CAP_HDR_OFFSET);
+
+ if (FIELD_GET(CXL_CM_CAP_HDR_ID_MASK, cap_array) != CM_CAP_HDR_CAP_ID) {
+ dev_dbg(dev,
+ "Couldn't locate the CXL.cache and CXL.mem capability array header.\n");
+ return;
+ }
+
+ /* It's assumed that future versions will be backward compatible */
+ cap_count = FIELD_GET(CXL_CM_CAP_HDR_ARRAY_SIZE_MASK, cap_array);
+
+ for (cap = 1; cap <= cap_count; cap++) {
+ void __iomem *register_block;
+ struct cxl_reg_map *rmap;
+ u16 cap_id, offset;
+ u32 length, hdr;
+
+ hdr = readl(base + cap * 0x4);
+
+ cap_id = FIELD_GET(CXL_CM_CAP_HDR_ID_MASK, hdr);
+ offset = FIELD_GET(CXL_CM_CAP_PTR_MASK, hdr);
+ register_block = base + offset;
+ hdr = readl(register_block);
+
+ rmap = NULL;
+ switch (cap_id) {
+ case CXL_CM_CAP_CAP_ID_HDM: {
+ int decoder_cnt;
+
+ dev_dbg(dev, "found HDM decoder capability (0x%x)\n",
+ offset);
+
+ decoder_cnt = cxl_hdm_decoder_count(hdr);
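+			/*
+			 * HDM Decoder Capability layout: a 0x10 byte
+			 * header block followed by 0x20 bytes of
+			 * registers per decoder.
+			 */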
+ length = 0x20 * decoder_cnt + 0x10;
+ rmap = &map->hdm_decoder;
+ break;
+ }
+ case CXL_CM_CAP_CAP_ID_RAS:
+ dev_dbg(dev, "found RAS capability (0x%x)\n",
+ offset);
+ length = CXL_RAS_CAPABILITY_LENGTH;
+ rmap = &map->ras;
+ break;
+ default:
+ dev_dbg(dev, "Unknown CM cap ID: %d (0x%x)\n", cap_id,
+ offset);
+ break;
+ }
+
+ if (!rmap)
+ continue;
+ rmap->valid = true;
+ rmap->id = cap_id;
+ rmap->offset = CXL_CM_OFFSET + offset;
+ rmap->size = length;
+ }
+}
+EXPORT_SYMBOL_NS_GPL(cxl_probe_component_regs, "CXL");
+
+/**
+ * cxl_probe_device_regs() - Detect CXL Device register blocks
+ * @dev: Host device of the @base mapping
+ * @base: Mapping of CXL 2.0 8.2.8 CXL Device Register Interface
+ * @map: Map object describing the register block information found
+ *
+ * Probe for device register information and return it in map object.
+ */
+void cxl_probe_device_regs(struct device *dev, void __iomem *base,
+ struct cxl_device_reg_map *map)
+{
+ int cap, cap_count;
+ u64 cap_array;
+
+ *map = (struct cxl_device_reg_map){ 0 };
+
+ cap_array = readq(base + CXLDEV_CAP_ARRAY_OFFSET);
+ if (FIELD_GET(CXLDEV_CAP_ARRAY_ID_MASK, cap_array) !=
+ CXLDEV_CAP_ARRAY_CAP_ID)
+ return;
+
+ cap_count = FIELD_GET(CXLDEV_CAP_ARRAY_COUNT_MASK, cap_array);
+
+ for (cap = 1; cap <= cap_count; cap++) {
+ struct cxl_reg_map *rmap;
+ u32 offset, length;
+ u16 cap_id;
+
+ cap_id = FIELD_GET(CXLDEV_CAP_HDR_CAP_ID_MASK,
+ readl(base + cap * 0x10));
+ offset = readl(base + cap * 0x10 + 0x4);
+ length = readl(base + cap * 0x10 + 0x8);
+
+ rmap = NULL;
+ switch (cap_id) {
+ case CXLDEV_CAP_CAP_ID_DEVICE_STATUS:
+ dev_dbg(dev, "found Status capability (0x%x)\n", offset);
+ rmap = &map->status;
+ break;
+ case CXLDEV_CAP_CAP_ID_PRIMARY_MAILBOX:
+ dev_dbg(dev, "found Mailbox capability (0x%x)\n", offset);
+ rmap = &map->mbox;
+ break;
+ case CXLDEV_CAP_CAP_ID_SECONDARY_MAILBOX:
+ dev_dbg(dev, "found Secondary Mailbox capability (0x%x)\n", offset);
+ break;
+ case CXLDEV_CAP_CAP_ID_MEMDEV:
+ dev_dbg(dev, "found Memory Device capability (0x%x)\n", offset);
+ rmap = &map->memdev;
+ break;
+ default:
+ if (cap_id >= 0x8000)
+ dev_dbg(dev, "Vendor cap ID: %#x offset: %#x\n", cap_id, offset);
+ else
+ dev_dbg(dev, "Unknown cap ID: %#x offset: %#x\n", cap_id, offset);
+ break;
+ }
+
+ if (!rmap)
+ continue;
+ rmap->valid = true;
+ rmap->id = cap_id;
+ rmap->offset = offset;
+ rmap->size = length;
+ }
+}
+EXPORT_SYMBOL_NS_GPL(cxl_probe_device_regs, "CXL");
+
+void __iomem *devm_cxl_iomap_block(struct device *dev, resource_size_t addr,
+ resource_size_t length)
+{
+ void __iomem *ret_val;
+ struct resource *res;
+
+ if (WARN_ON_ONCE(addr == CXL_RESOURCE_NONE))
+ return NULL;
+
+ res = devm_request_mem_region(dev, addr, length, dev_name(dev));
+ if (!res) {
+ resource_size_t end = addr + length - 1;
+
+ dev_err(dev, "Failed to request region %pa-%pa\n", &addr, &end);
+ return NULL;
+ }
+
+ ret_val = devm_ioremap(dev, addr, length);
+ if (!ret_val)
+ dev_err(dev, "Failed to map region %pr\n", res);
+
+ return ret_val;
+}
+
+int cxl_map_component_regs(const struct cxl_register_map *map,
+ struct cxl_component_regs *regs,
+ unsigned long map_mask)
+{
+ struct device *host = map->host;
+ struct mapinfo {
+ const struct cxl_reg_map *rmap;
+ void __iomem **addr;
+ } mapinfo[] = {
+ { &map->component_map.hdm_decoder, &regs->hdm_decoder },
+ { &map->component_map.ras, &regs->ras },
+ };
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(mapinfo); i++) {
+ struct mapinfo *mi = &mapinfo[i];
+ resource_size_t addr;
+ resource_size_t length;
+
+ if (!mi->rmap->valid)
+ continue;
+ if (!test_bit(mi->rmap->id, &map_mask))
+ continue;
+ addr = map->resource + mi->rmap->offset;
+ length = mi->rmap->size;
+ *(mi->addr) = devm_cxl_iomap_block(host, addr, length);
+ if (!*(mi->addr))
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_map_component_regs, "CXL");
+
+int cxl_map_device_regs(const struct cxl_register_map *map,
+ struct cxl_device_regs *regs)
+{
+ struct device *host = map->host;
+ resource_size_t phys_addr = map->resource;
+ struct mapinfo {
+ const struct cxl_reg_map *rmap;
+ void __iomem **addr;
+ } mapinfo[] = {
+ { &map->device_map.status, &regs->status, },
+ { &map->device_map.mbox, &regs->mbox, },
+ { &map->device_map.memdev, &regs->memdev, },
+ };
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(mapinfo); i++) {
+ struct mapinfo *mi = &mapinfo[i];
+ resource_size_t length;
+ resource_size_t addr;
+
+ if (!mi->rmap->valid)
+ continue;
+
+ addr = phys_addr + mi->rmap->offset;
+ length = mi->rmap->size;
+ *(mi->addr) = devm_cxl_iomap_block(host, addr, length);
+ if (!*(mi->addr))
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_map_device_regs, "CXL");
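cxl_map_component_regs() only maps the sub-blocks whose capability IDs are set in @map_mask, so callers opt in per capability. A hedged sketch (the wrapper name is hypothetical, and @map is assumed to have already been populated by cxl_probe_component_regs()):

/* Illustrative only: map just the HDM decoder and RAS sub-blocks. */
static int map_component_example(struct cxl_register_map *map,
				 struct cxl_component_regs *regs)
{
	unsigned long map_mask = BIT(CXL_CM_CAP_CAP_ID_HDM) |
				 BIT(CXL_CM_CAP_CAP_ID_RAS);

	/* Sub-blocks not marked valid by the probe are silently skipped */
	return cxl_map_component_regs(map, regs, map_mask);
}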
+
+static bool cxl_decode_regblock(struct pci_dev *pdev, u32 reg_lo, u32 reg_hi,
+ struct cxl_register_map *map)
+{
+ u8 reg_type = FIELD_GET(CXL_DVSEC_REG_LOCATOR_BLOCK_ID_MASK, reg_lo);
+ int bar = FIELD_GET(CXL_DVSEC_REG_LOCATOR_BIR_MASK, reg_lo);
+ u64 offset = ((u64)reg_hi << 32) |
+ (reg_lo & CXL_DVSEC_REG_LOCATOR_BLOCK_OFF_LOW_MASK);
+
+ if (offset > pci_resource_len(pdev, bar)) {
+ dev_warn(&pdev->dev,
+ "BAR%d: %pr: too small (offset: %pa, type: %d)\n", bar,
+ &pdev->resource[bar], &offset, reg_type);
+ return false;
+ }
+
+ map->reg_type = reg_type;
+ map->resource = pci_resource_start(pdev, bar) + offset;
+ map->max_size = pci_resource_len(pdev, bar) - offset;
+ return true;
+}
+
+/*
+ * __cxl_find_regblock_instance() - Locate a register block or count instances by type / index
+ * Use CXL_INSTANCES_COUNT for @index if counting instances.
+ *
+ * __cxl_find_regblock_instance() returns:
+ * 0 - the register block instance at @index was found and recorded in @map.
+ * >= 0 - the number of matching instances, when @index is CXL_INSTANCES_COUNT.
+ * < 0 - a negative error code otherwise.
+ */
+static int __cxl_find_regblock_instance(struct pci_dev *pdev, enum cxl_regloc_type type,
+ struct cxl_register_map *map, int index)
+{
+ u32 regloc_size, regblocks;
+ int instance = 0;
+ int regloc, i;
+
+ *map = (struct cxl_register_map) {
+ .host = &pdev->dev,
+ .resource = CXL_RESOURCE_NONE,
+ };
+
+ regloc = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL,
+ CXL_DVSEC_REG_LOCATOR);
+ if (!regloc)
+ return -ENXIO;
+
+ pci_read_config_dword(pdev, regloc + PCI_DVSEC_HEADER1, &regloc_size);
+ regloc_size = FIELD_GET(PCI_DVSEC_HEADER1_LENGTH_MASK, regloc_size);
+
+ regloc += CXL_DVSEC_REG_LOCATOR_BLOCK1_OFFSET;
+ regblocks = (regloc_size - CXL_DVSEC_REG_LOCATOR_BLOCK1_OFFSET) / 8;
+
+ for (i = 0; i < regblocks; i++, regloc += 8) {
+ u32 reg_lo, reg_hi;
+
+ pci_read_config_dword(pdev, regloc, &reg_lo);
+ pci_read_config_dword(pdev, regloc + 4, &reg_hi);
+
+ if (!cxl_decode_regblock(pdev, reg_lo, reg_hi, map))
+ continue;
+
+ if (map->reg_type == type) {
+ if (index == instance)
+ return 0;
+ instance++;
+ }
+ }
+
+ map->resource = CXL_RESOURCE_NONE;
+ if (index == CXL_INSTANCES_COUNT)
+ return instance;
+
+ return -ENODEV;
+}
+
+/**
+ * cxl_find_regblock_instance() - Locate a register block by type / index
+ * @pdev: The CXL PCI device to enumerate.
+ * @type: Register Block Indicator id
+ * @map: Enumeration output, clobbered on error
+ * @index: Index of the desired register block instance, in the order the
+ * blocks appear in the Register Locator DVSEC.
+ *
+ * Return: 0 if register block enumerated, negative error code otherwise
+ *
+ * The Register Locator DVSEC may point to one or more register blocks;
+ * search for them by @type and @index.
+ */
+int cxl_find_regblock_instance(struct pci_dev *pdev, enum cxl_regloc_type type,
+ struct cxl_register_map *map, unsigned int index)
+{
+ return __cxl_find_regblock_instance(pdev, type, map, index);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_find_regblock_instance, "CXL");
+
+/**
+ * cxl_find_regblock() - Locate register blocks by type
+ * @pdev: The CXL PCI device to enumerate.
+ * @type: Register Block Indicator id
+ * @map: Enumeration output, clobbered on error
+ *
+ * Return: 0 if register block enumerated, negative error code otherwise
+ *
+ * The Register Locator DVSEC may point to one or more register blocks;
+ * search for the first instance of @type.
+ */
+int cxl_find_regblock(struct pci_dev *pdev, enum cxl_regloc_type type,
+ struct cxl_register_map *map)
+{
+ return __cxl_find_regblock_instance(pdev, type, map, 0);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_find_regblock, "CXL");
+
+/**
+ * cxl_count_regblock() - Count instances of a given regblock type.
+ * @pdev: The CXL PCI device to enumerate.
+ * @type: Register Block Indicator id
+ *
+ * Some register block types may be repeated; count how many instances of
+ * @type are present.
+ *
+ * Return: non-negative count of matching regblocks, negative error code otherwise.
+ */
+int cxl_count_regblock(struct pci_dev *pdev, enum cxl_regloc_type type)
+{
+ struct cxl_register_map map;
+
+ return __cxl_find_regblock_instance(pdev, type, &map, CXL_INSTANCES_COUNT);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_count_regblock, "CXL");
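Taken together, the helpers above give the usual enumeration flow: locate a register block via the Register Locator DVSEC, probe its layout, then map the sub-blocks of interest. A hedged sketch of that flow for the memory-device block (the wrapper function name is hypothetical, not part of this patch):

/* Illustrative only: typical use of the regblock helpers in this file. */
static int setup_memdev_regs_example(struct pci_dev *pdev,
				     struct cxl_device_regs *regs)
{
	struct cxl_register_map map;
	int rc;

	/* Find the first memory device register block in the DVSEC */
	rc = cxl_find_regblock(pdev, CXL_REGLOC_RBI_MEMDEV, &map);
	if (rc)
		return rc;

	/* Temporarily map the block and record which capabilities exist */
	rc = cxl_setup_regs(&map);
	if (rc)
		return rc;

	/* Persistently (devm) map the status, mailbox, and memdev registers */
	return cxl_map_device_regs(&map, regs);
}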
+
+int cxl_map_pmu_regs(struct cxl_register_map *map, struct cxl_pmu_regs *regs)
+{
+ struct device *dev = map->host;
+ resource_size_t phys_addr;
+
+ phys_addr = map->resource;
+ regs->pmu = devm_cxl_iomap_block(dev, phys_addr, CXL_PMU_REGMAP_SIZE);
+ if (!regs->pmu)
+ return -ENOMEM;
+
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_map_pmu_regs, "CXL");
+
+static int cxl_map_regblock(struct cxl_register_map *map)
+{
+ struct device *host = map->host;
+
+ map->base = ioremap(map->resource, map->max_size);
+ if (!map->base) {
+ dev_err(host, "failed to map registers\n");
+ return -ENOMEM;
+ }
+
+ dev_dbg(host, "Mapped CXL Memory Device resource %pa\n", &map->resource);
+ return 0;
+}
+
+static void cxl_unmap_regblock(struct cxl_register_map *map)
+{
+ iounmap(map->base);
+ map->base = NULL;
+}
+
+static int cxl_probe_regs(struct cxl_register_map *map)
+{
+ struct cxl_component_reg_map *comp_map;
+ struct cxl_device_reg_map *dev_map;
+ struct device *host = map->host;
+ void __iomem *base = map->base;
+
+ switch (map->reg_type) {
+ case CXL_REGLOC_RBI_COMPONENT:
+ comp_map = &map->component_map;
+ cxl_probe_component_regs(host, base, comp_map);
+ dev_dbg(host, "Set up component registers\n");
+ break;
+ case CXL_REGLOC_RBI_MEMDEV:
+ dev_map = &map->device_map;
+ cxl_probe_device_regs(host, base, dev_map);
+ if (!dev_map->status.valid || !dev_map->mbox.valid ||
+ !dev_map->memdev.valid) {
+ dev_err(host, "registers not found: %s%s%s\n",
+ !dev_map->status.valid ? "status " : "",
+ !dev_map->mbox.valid ? "mbox " : "",
+ !dev_map->memdev.valid ? "memdev " : "");
+ return -ENXIO;
+ }
+
+ dev_dbg(host, "Probing device registers...\n");
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+int cxl_setup_regs(struct cxl_register_map *map)
+{
+ int rc;
+
+ rc = cxl_map_regblock(map);
+ if (rc)
+ return rc;
+
+ rc = cxl_probe_regs(map);
+ cxl_unmap_regblock(map);
+
+ return rc;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_setup_regs, "CXL");
+
+u16 cxl_rcrb_to_aer(struct device *dev, resource_size_t rcrb)
+{
+ void __iomem *addr;
+ u16 offset = 0;
+ u32 cap_hdr;
+
+ if (WARN_ON_ONCE(rcrb == CXL_RESOURCE_NONE))
+ return 0;
+
+ if (!request_mem_region(rcrb, SZ_4K, dev_name(dev)))
+ return 0;
+
+ addr = ioremap(rcrb, SZ_4K);
+ if (!addr)
+ goto out;
+
+ cap_hdr = readl(addr + offset);
+ while (PCI_EXT_CAP_ID(cap_hdr) != PCI_EXT_CAP_ID_ERR) {
+ offset = PCI_EXT_CAP_NEXT(cap_hdr);
+
+ /* Offset 0 terminates capability list. */
+ if (!offset)
+ break;
+ cap_hdr = readl(addr + offset);
+ }
+
+ if (offset)
+ dev_dbg(dev, "found AER extended capability (0x%x)\n", offset);
+
+ iounmap(addr);
+out:
+ release_mem_region(rcrb, SZ_4K);
+
+ return offset;
+}
+
+static resource_size_t cxl_rcrb_to_linkcap(struct device *dev, struct cxl_dport *dport)
+{
+ resource_size_t rcrb = dport->rcrb.base;
+ void __iomem *addr;
+ u32 cap_hdr;
+ u16 offset;
+
+ if (!request_mem_region(rcrb, SZ_4K, "CXL RCRB"))
+ return CXL_RESOURCE_NONE;
+
+ addr = ioremap(rcrb, SZ_4K);
+ if (!addr) {
+ dev_err(dev, "Failed to map region %pr\n", addr);
+ release_mem_region(rcrb, SZ_4K);
+ return CXL_RESOURCE_NONE;
+ }
+
+ offset = FIELD_GET(PCI_RCRB_CAP_LIST_ID_MASK, readw(addr + PCI_CAPABILITY_LIST));
+ cap_hdr = readl(addr + offset);
+ while ((FIELD_GET(PCI_RCRB_CAP_HDR_ID_MASK, cap_hdr)) != PCI_CAP_ID_EXP) {
+ offset = FIELD_GET(PCI_RCRB_CAP_HDR_NEXT_MASK, cap_hdr);
+ if (offset == 0 || offset > SZ_4K) {
+ offset = 0;
+ break;
+ }
+ cap_hdr = readl(addr + offset);
+ }
+
+ iounmap(addr);
+ release_mem_region(rcrb, SZ_4K);
+ if (!offset)
+ return CXL_RESOURCE_NONE;
+
+ return offset;
+}
+
+int cxl_dport_map_rcd_linkcap(struct pci_dev *pdev, struct cxl_dport *dport)
+{
+ void __iomem *dport_pcie_cap = NULL;
+ resource_size_t pos;
+ struct cxl_rcrb_info *ri;
+
+ ri = &dport->rcrb;
+ pos = cxl_rcrb_to_linkcap(&pdev->dev, dport);
+ if (pos == CXL_RESOURCE_NONE)
+ return -ENXIO;
+
+ dport_pcie_cap = devm_cxl_iomap_block(&pdev->dev,
+ ri->base + pos,
+ PCI_CAP_EXP_SIZEOF);
+ dport->regs.rcd_pcie_cap = dport_pcie_cap;
+
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_dport_map_rcd_linkcap, "CXL");
+
+resource_size_t __rcrb_to_component(struct device *dev, struct cxl_rcrb_info *ri,
+ enum cxl_rcrb which)
+{
+ resource_size_t component_reg_phys;
+ resource_size_t rcrb = ri->base;
+ void __iomem *addr;
+ u32 bar0, bar1;
+ u32 id;
+
+ if (which == CXL_RCRB_UPSTREAM)
+ rcrb += SZ_4K;
+
+ /*
+ * The RCRB's BAR[0..1] point to the component block containing the
+ * CXL subsystem component registers. MEMBAR extraction follows the
+ * PCI Base spec here, esp. 64-bit extraction and memory range
+ * alignment (6.0, 7.5.1.2.1).
+ */
+ if (!request_mem_region(rcrb, SZ_4K, "CXL RCRB"))
+ return CXL_RESOURCE_NONE;
+ addr = ioremap(rcrb, SZ_4K);
+ if (!addr) {
+ dev_err(dev, "Failed to map region %pr\n", addr);
+ release_mem_region(rcrb, SZ_4K);
+ return CXL_RESOURCE_NONE;
+ }
+
+ id = readl(addr + PCI_VENDOR_ID);
+ bar0 = readl(addr + PCI_BASE_ADDRESS_0);
+ bar1 = readl(addr + PCI_BASE_ADDRESS_1);
+ iounmap(addr);
+ release_mem_region(rcrb, SZ_4K);
+
+ /*
+ * Sanity check, see CXL 3.0 Figure 9-8 CXL Device that Does Not
+ * Remap Upstream Port and Component Registers
+ */
+ if (id == U32_MAX) {
+ if (which == CXL_RCRB_DOWNSTREAM)
+ dev_err(dev, "Failed to access Downstream Port RCRB\n");
+ return CXL_RESOURCE_NONE;
+ }
+ /* The RCRB is a Memory Window, and the MEM_TYPE_1M bit is obsolete */
+ if (bar0 & (PCI_BASE_ADDRESS_MEM_TYPE_1M | PCI_BASE_ADDRESS_SPACE_IO))
+ return CXL_RESOURCE_NONE;
+
+ component_reg_phys = bar0 & PCI_BASE_ADDRESS_MEM_MASK;
+ if (bar0 & PCI_BASE_ADDRESS_MEM_TYPE_64)
+ component_reg_phys |= ((u64)bar1) << 32;
+
+ if (!component_reg_phys)
+ return CXL_RESOURCE_NONE;
+
+ /* MEMBAR is block size (64k) aligned. */
+ if (!IS_ALIGNED(component_reg_phys, CXL_COMPONENT_REG_BLOCK_SIZE))
+ return CXL_RESOURCE_NONE;
+
+ return component_reg_phys;
+}
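For illustration only, a worked example of the MEMBAR extraction above with made-up BAR values (a hypothetical helper, not part of this patch):

/* Worked example of combining BAR[0..1] into a 64-bit component base. */
static void rcrb_membar_example(void)
{
	u32 bar0 = 0xfe000004;	/* 64-bit, non-prefetchable memory BAR */
	u32 bar1 = 0x000000c0;	/* upper 32 bits of the 64-bit address */
	u64 component_reg_phys;

	component_reg_phys = bar0 & PCI_BASE_ADDRESS_MEM_MASK;
	if (bar0 & PCI_BASE_ADDRESS_MEM_TYPE_64)
		component_reg_phys |= (u64)bar1 << 32;

	/* Result 0xc0fe000000 is 64KiB aligned, so it passes the checks above */
}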
+
+resource_size_t cxl_rcd_component_reg_phys(struct device *dev,
+ struct cxl_dport *dport)
+{
+ if (!dport->rch)
+ return CXL_RESOURCE_NONE;
+ return __rcrb_to_component(dev, &dport->rcrb, CXL_RCRB_UPSTREAM);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_rcd_component_reg_phys, "CXL");
diff --git a/drivers/cxl/core/suspend.c b/drivers/cxl/core/suspend.c
new file mode 100644
index 000000000000..29aa5cc5e565
--- /dev/null
+++ b/drivers/cxl/core/suspend.c
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2022 Intel Corporation. All rights reserved. */
+#include <linux/atomic.h>
+#include <linux/export.h>
+#include "cxlmem.h"
+
+static atomic_t mem_active;
+
+bool cxl_mem_active(void)
+{
+ return atomic_read(&mem_active) != 0;
+}
+
+void cxl_mem_active_inc(void)
+{
+ atomic_inc(&mem_active);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_mem_active_inc, "CXL");
+
+void cxl_mem_active_dec(void)
+{
+ atomic_dec(&mem_active);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_mem_active_dec, "CXL");
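The counter above is a simple reference count: it is meant to be incremented for the lifetime of an active CXL memory device so that other code (for example, suspend policy checks) can ask whether any such device is present. A hypothetical sketch of the intended pairing; the probe/remove/policy functions shown are placeholders, not from this patch:

/* Hypothetical pairing of the active-count helpers above. */
static int example_memdev_probe(void)
{
	/* Declare an active CXL memory device for the life of the binding */
	cxl_mem_active_inc();
	return 0;
}

static void example_memdev_remove(void)
{
	cxl_mem_active_dec();
}

static bool example_can_deep_suspend(void)
{
	/* Assumed policy: avoid deep suspend while CXL memory is active */
	return !cxl_mem_active();
}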
diff --git a/drivers/cxl/core/trace.c b/drivers/cxl/core/trace.c
new file mode 100644
index 000000000000..7f2a9dd0d0e3
--- /dev/null
+++ b/drivers/cxl/core/trace.c
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2022 Intel Corporation. All rights reserved. */
+
+#include <cxl.h>
+#include "core.h"
+
+#define CREATE_TRACE_POINTS
+#include "trace.h"
diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h
new file mode 100644
index 000000000000..a972e4ef1936
--- /dev/null
+++ b/drivers/cxl/core/trace.h
@@ -0,0 +1,1105 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2022 Intel Corporation. All rights reserved. */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM cxl
+
+#if !defined(_CXL_EVENTS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _CXL_EVENTS_H
+
+#include <linux/tracepoint.h>
+#include <linux/pci.h>
+#include <linux/unaligned.h>
+
+#include <cxl.h>
+#include <cxlmem.h>
+#include "core.h"
+
+#define CXL_RAS_UC_CACHE_DATA_PARITY BIT(0)
+#define CXL_RAS_UC_CACHE_ADDR_PARITY BIT(1)
+#define CXL_RAS_UC_CACHE_BE_PARITY BIT(2)
+#define CXL_RAS_UC_CACHE_DATA_ECC BIT(3)
+#define CXL_RAS_UC_MEM_DATA_PARITY BIT(4)
+#define CXL_RAS_UC_MEM_ADDR_PARITY BIT(5)
+#define CXL_RAS_UC_MEM_BE_PARITY BIT(6)
+#define CXL_RAS_UC_MEM_DATA_ECC BIT(7)
+#define CXL_RAS_UC_REINIT_THRESH BIT(8)
+#define CXL_RAS_UC_RSVD_ENCODE BIT(9)
+#define CXL_RAS_UC_POISON BIT(10)
+#define CXL_RAS_UC_RECV_OVERFLOW BIT(11)
+#define CXL_RAS_UC_INTERNAL_ERR BIT(14)
+#define CXL_RAS_UC_IDE_TX_ERR BIT(15)
+#define CXL_RAS_UC_IDE_RX_ERR BIT(16)
+
+#define show_uc_errs(status) __print_flags(status, " | ", \
+ { CXL_RAS_UC_CACHE_DATA_PARITY, "Cache Data Parity Error" }, \
+ { CXL_RAS_UC_CACHE_ADDR_PARITY, "Cache Address Parity Error" }, \
+ { CXL_RAS_UC_CACHE_BE_PARITY, "Cache Byte Enable Parity Error" }, \
+ { CXL_RAS_UC_CACHE_DATA_ECC, "Cache Data ECC Error" }, \
+ { CXL_RAS_UC_MEM_DATA_PARITY, "Memory Data Parity Error" }, \
+ { CXL_RAS_UC_MEM_ADDR_PARITY, "Memory Address Parity Error" }, \
+ { CXL_RAS_UC_MEM_BE_PARITY, "Memory Byte Enable Parity Error" }, \
+ { CXL_RAS_UC_MEM_DATA_ECC, "Memory Data ECC Error" }, \
+ { CXL_RAS_UC_REINIT_THRESH, "REINIT Threshold Hit" }, \
+ { CXL_RAS_UC_RSVD_ENCODE, "Received Unrecognized Encoding" }, \
+ { CXL_RAS_UC_POISON, "Received Poison From Peer" }, \
+ { CXL_RAS_UC_RECV_OVERFLOW, "Receiver Overflow" }, \
+ { CXL_RAS_UC_INTERNAL_ERR, "Component Specific Error" }, \
+ { CXL_RAS_UC_IDE_TX_ERR, "IDE Tx Error" }, \
+ { CXL_RAS_UC_IDE_RX_ERR, "IDE Rx Error" } \
+)
+
+TRACE_EVENT(cxl_port_aer_uncorrectable_error,
+ TP_PROTO(struct device *dev, u32 status, u32 fe, u32 *hl),
+ TP_ARGS(dev, status, fe, hl),
+ TP_STRUCT__entry(
+ __string(device, dev_name(dev))
+ __string(host, dev_name(dev->parent))
+ __field(u32, status)
+ __field(u32, first_error)
+ __array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
+ ),
+ TP_fast_assign(
+ __assign_str(device);
+ __assign_str(host);
+ __entry->status = status;
+ __entry->first_error = fe;
+ /*
+ * Embed the 512B headerlog data for user app retrieval and
+ * parsing, but no need to print this in the trace buffer.
+ */
+ memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);
+ ),
+ TP_printk("device=%s host=%s status: '%s' first_error: '%s'",
+ __get_str(device), __get_str(host),
+ show_uc_errs(__entry->status),
+ show_uc_errs(__entry->first_error)
+ )
+);
+
+TRACE_EVENT(cxl_aer_uncorrectable_error,
+ TP_PROTO(const struct cxl_memdev *cxlmd, u32 status, u32 fe, u32 *hl),
+ TP_ARGS(cxlmd, status, fe, hl),
+ TP_STRUCT__entry(
+ __string(memdev, dev_name(&cxlmd->dev))
+ __string(host, dev_name(cxlmd->dev.parent))
+ __field(u64, serial)
+ __field(u32, status)
+ __field(u32, first_error)
+ __array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
+ ),
+ TP_fast_assign(
+ __assign_str(memdev);
+ __assign_str(host);
+ __entry->serial = cxlmd->cxlds->serial;
+ __entry->status = status;
+ __entry->first_error = fe;
+ /*
+ * Embed the 512B headerlog data for user app retrieval and
+ * parsing, but no need to print this in the trace buffer.
+ */
+ memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE);
+ ),
+ TP_printk("memdev=%s host=%s serial=%lld: status: '%s' first_error: '%s'",
+ __get_str(memdev), __get_str(host), __entry->serial,
+ show_uc_errs(__entry->status),
+ show_uc_errs(__entry->first_error)
+ )
+);
+
+#define CXL_RAS_CE_CACHE_DATA_ECC BIT(0)
+#define CXL_RAS_CE_MEM_DATA_ECC BIT(1)
+#define CXL_RAS_CE_CRC_THRESH BIT(2)
+#define CLX_RAS_CE_RETRY_THRESH BIT(3)
+#define CXL_RAS_CE_CACHE_POISON BIT(4)
+#define CXL_RAS_CE_MEM_POISON BIT(5)
+#define CXL_RAS_CE_PHYS_LAYER_ERR BIT(6)
+
+#define show_ce_errs(status) __print_flags(status, " | ", \
+ { CXL_RAS_CE_CACHE_DATA_ECC, "Cache Data ECC Error" }, \
+ { CXL_RAS_CE_MEM_DATA_ECC, "Memory Data ECC Error" }, \
+ { CXL_RAS_CE_CRC_THRESH, "CRC Threshold Hit" }, \
+ { CLX_RAS_CE_RETRY_THRESH, "Retry Threshold" }, \
+ { CXL_RAS_CE_CACHE_POISON, "Received Cache Poison From Peer" }, \
+ { CXL_RAS_CE_MEM_POISON, "Received Memory Poison From Peer" }, \
+ { CXL_RAS_CE_PHYS_LAYER_ERR, "Received Error From Physical Layer" } \
+)
+
+TRACE_EVENT(cxl_port_aer_correctable_error,
+ TP_PROTO(struct device *dev, u32 status),
+ TP_ARGS(dev, status),
+ TP_STRUCT__entry(
+ __string(device, dev_name(dev))
+ __string(host, dev_name(dev->parent))
+ __field(u32, status)
+ ),
+ TP_fast_assign(
+ __assign_str(device);
+ __assign_str(host);
+ __entry->status = status;
+ ),
+ TP_printk("device=%s host=%s status='%s'",
+ __get_str(device), __get_str(host),
+ show_ce_errs(__entry->status)
+ )
+);
+
+TRACE_EVENT(cxl_aer_correctable_error,
+ TP_PROTO(const struct cxl_memdev *cxlmd, u32 status),
+ TP_ARGS(cxlmd, status),
+ TP_STRUCT__entry(
+ __string(memdev, dev_name(&cxlmd->dev))
+ __string(host, dev_name(cxlmd->dev.parent))
+ __field(u64, serial)
+ __field(u32, status)
+ ),
+ TP_fast_assign(
+ __assign_str(memdev);
+ __assign_str(host);
+ __entry->serial = cxlmd->cxlds->serial;
+ __entry->status = status;
+ ),
+ TP_printk("memdev=%s host=%s serial=%lld: status: '%s'",
+ __get_str(memdev), __get_str(host), __entry->serial,
+ show_ce_errs(__entry->status)
+ )
+);
+
+#define cxl_event_log_type_str(type) \
+ __print_symbolic(type, \
+ { CXL_EVENT_TYPE_INFO, "Informational" }, \
+ { CXL_EVENT_TYPE_WARN, "Warning" }, \
+ { CXL_EVENT_TYPE_FAIL, "Failure" }, \
+ { CXL_EVENT_TYPE_FATAL, "Fatal" })
+
+TRACE_EVENT(cxl_overflow,
+
+ TP_PROTO(const struct cxl_memdev *cxlmd, enum cxl_event_log_type log,
+ struct cxl_get_event_payload *payload),
+
+ TP_ARGS(cxlmd, log, payload),
+
+ TP_STRUCT__entry(
+ __string(memdev, dev_name(&cxlmd->dev))
+ __string(host, dev_name(cxlmd->dev.parent))
+ __field(int, log)
+ __field(u64, serial)
+ __field(u64, first_ts)
+ __field(u64, last_ts)
+ __field(u16, count)
+ ),
+
+ TP_fast_assign(
+ __assign_str(memdev);
+ __assign_str(host);
+ __entry->serial = cxlmd->cxlds->serial;
+ __entry->log = log;
+ __entry->count = le16_to_cpu(payload->overflow_err_count);
+ __entry->first_ts = le64_to_cpu(payload->first_overflow_timestamp);
+ __entry->last_ts = le64_to_cpu(payload->last_overflow_timestamp);
+ ),
+
+ TP_printk("memdev=%s host=%s serial=%lld: log=%s : %u records from %llu to %llu",
+ __get_str(memdev), __get_str(host), __entry->serial,
+ cxl_event_log_type_str(__entry->log), __entry->count,
+ __entry->first_ts, __entry->last_ts)
+
+);
+
+/*
+ * Common Event Record Format
+ * CXL 3.0 section 8.2.9.2.1; Table 8-42
+ */
+#define CXL_EVENT_RECORD_FLAG_PERMANENT BIT(2)
+#define CXL_EVENT_RECORD_FLAG_MAINT_NEEDED BIT(3)
+#define CXL_EVENT_RECORD_FLAG_PERF_DEGRADED BIT(4)
+#define CXL_EVENT_RECORD_FLAG_HW_REPLACE BIT(5)
+#define CXL_EVENT_RECORD_FLAG_MAINT_OP_SUB_CLASS_VALID BIT(6)
+#define CXL_EVENT_RECORD_FLAG_LD_ID_VALID BIT(7)
+#define CXL_EVENT_RECORD_FLAG_HEAD_ID_VALID BIT(8)
+#define show_hdr_flags(flags) __print_flags(flags, " | ", \
+ { CXL_EVENT_RECORD_FLAG_PERMANENT, "PERMANENT_CONDITION" }, \
+ { CXL_EVENT_RECORD_FLAG_MAINT_NEEDED, "MAINTENANCE_NEEDED" }, \
+ { CXL_EVENT_RECORD_FLAG_PERF_DEGRADED, "PERFORMANCE_DEGRADED" }, \
+ { CXL_EVENT_RECORD_FLAG_HW_REPLACE, "HARDWARE_REPLACEMENT_NEEDED" }, \
+ { CXL_EVENT_RECORD_FLAG_MAINT_OP_SUB_CLASS_VALID, "MAINT_OP_SUB_CLASS_VALID" }, \
+ { CXL_EVENT_RECORD_FLAG_LD_ID_VALID, "LD_ID_VALID" }, \
+ { CXL_EVENT_RECORD_FLAG_HEAD_ID_VALID, "HEAD_ID_VALID" } \
+)
+
+/*
+ * Define macros for the common header of each CXL event.
+ *
+ * Tracepoints using these macros must do 3 things:
+ *
+ * 1) Add CXL_EVT_TP_entry to TP_STRUCT__entry
+ * 2) Use CXL_EVT_TP_fast_assign within TP_fast_assign;
+ * pass the cxl_memdev, log type, and CXL event header
+ * NOTE: The uuid must be assigned by the specific trace event
+ * 3) Use CXL_EVT_TP_printk() instead of TP_printk()
+ *
+ * See the generic_event tracepoint as an example.
+ */
+#define CXL_EVT_TP_entry \
+ __string(memdev, dev_name(&cxlmd->dev)) \
+ __string(host, dev_name(cxlmd->dev.parent)) \
+ __field(int, log) \
+ __field_struct(uuid_t, hdr_uuid) \
+ __field(u64, serial) \
+ __field(u32, hdr_flags) \
+ __field(u16, hdr_handle) \
+ __field(u16, hdr_related_handle) \
+ __field(u64, hdr_timestamp) \
+ __field(u8, hdr_length) \
+ __field(u8, hdr_maint_op_class) \
+ __field(u8, hdr_maint_op_sub_class) \
+ __field(u16, hdr_ld_id) \
+ __field(u8, hdr_head_id)
+
+#define CXL_EVT_TP_fast_assign(cxlmd, l, hdr) \
+ __assign_str(memdev); \
+ __assign_str(host); \
+ __entry->log = (l); \
+ __entry->serial = (cxlmd)->cxlds->serial; \
+ __entry->hdr_length = (hdr).length; \
+ __entry->hdr_flags = get_unaligned_le24((hdr).flags); \
+ __entry->hdr_handle = le16_to_cpu((hdr).handle); \
+ __entry->hdr_related_handle = le16_to_cpu((hdr).related_handle); \
+ __entry->hdr_timestamp = le64_to_cpu((hdr).timestamp); \
+ __entry->hdr_maint_op_class = (hdr).maint_op_class; \
+ __entry->hdr_maint_op_sub_class = (hdr).maint_op_sub_class; \
+ __entry->hdr_ld_id = le16_to_cpu((hdr).ld_id); \
+ __entry->hdr_head_id = (hdr).head_id
+
+#define CXL_EVT_TP_printk(fmt, ...) \
+ TP_printk("memdev=%s host=%s serial=%lld log=%s : time=%llu uuid=%pUb " \
+ "len=%d flags='%s' handle=%x related_handle=%x " \
+ "maint_op_class=%u maint_op_sub_class=%u " \
+ "ld_id=%x head_id=%x : " fmt, \
+ __get_str(memdev), __get_str(host), __entry->serial, \
+ cxl_event_log_type_str(__entry->log), \
+ __entry->hdr_timestamp, &__entry->hdr_uuid, __entry->hdr_length,\
+ show_hdr_flags(__entry->hdr_flags), __entry->hdr_handle, \
+ __entry->hdr_related_handle, __entry->hdr_maint_op_class, \
+ __entry->hdr_maint_op_sub_class, \
+ __entry->hdr_ld_id, __entry->hdr_head_id, \
+ ##__VA_ARGS__)
+
+TRACE_EVENT(cxl_generic_event,
+
+ TP_PROTO(const struct cxl_memdev *cxlmd, enum cxl_event_log_type log,
+ const uuid_t *uuid, struct cxl_event_generic *gen_rec),
+
+ TP_ARGS(cxlmd, log, uuid, gen_rec),
+
+ TP_STRUCT__entry(
+ CXL_EVT_TP_entry
+ __array(u8, data, CXL_EVENT_RECORD_DATA_LENGTH)
+ ),
+
+ TP_fast_assign(
+ CXL_EVT_TP_fast_assign(cxlmd, log, gen_rec->hdr);
+ memcpy(&__entry->hdr_uuid, uuid, sizeof(uuid_t));
+ memcpy(__entry->data, gen_rec->data, CXL_EVENT_RECORD_DATA_LENGTH);
+ ),
+
+ CXL_EVT_TP_printk("%s",
+ __print_hex(__entry->data, CXL_EVENT_RECORD_DATA_LENGTH))
+);
+
+/*
+ * Physical Address field masks
+ *
+ * General Media Event Record
+ * CXL rev 3.0 Section 8.2.9.2.1.1; Table 8-43
+ *
+ * DRAM Event Record
+ * CXL rev 3.0 section 8.2.9.2.1.2; Table 8-44
+ */
+#define CXL_DPA_FLAGS_MASK GENMASK(1, 0)
+#define CXL_DPA_MASK GENMASK_ULL(63, 6)
+
+#define CXL_DPA_VOLATILE BIT(0)
+#define CXL_DPA_NOT_REPAIRABLE BIT(1)
+#define show_dpa_flags(flags) __print_flags(flags, "|", \
+ { CXL_DPA_VOLATILE, "VOLATILE" }, \
+ { CXL_DPA_NOT_REPAIRABLE, "NOT_REPAIRABLE" } \
+)
+
+/*
+ * Component ID Format
+ * CXL 3.1 section 8.2.9.2.1; Table 8-44
+ */
+#define CXL_PLDM_COMPONENT_ID_ENTITY_VALID BIT(0)
+#define CXL_PLDM_COMPONENT_ID_RES_VALID BIT(1)
+
+#define show_comp_id_pldm_flags(flags) __print_flags(flags, " | ", \
+ { CXL_PLDM_COMPONENT_ID_ENTITY_VALID, "PLDM Entity ID" }, \
+ { CXL_PLDM_COMPONENT_ID_RES_VALID, "Resource ID" } \
+)
+
+#define show_pldm_entity_id(flags, valid_comp_id, valid_id_format, comp_id) \
+ (flags & valid_comp_id && flags & valid_id_format) ? \
+ (comp_id[0] & CXL_PLDM_COMPONENT_ID_ENTITY_VALID) ? \
+ __print_hex(&comp_id[1], 6) : "0x00" : "0x00"
+
+#define show_pldm_resource_id(flags, valid_comp_id, valid_id_format, comp_id) \
+ (flags & valid_comp_id && flags & valid_id_format) ? \
+ (comp_id[0] & CXL_PLDM_COMPONENT_ID_RES_VALID) ? \
+ __print_hex(&comp_id[7], 4) : "0x00" : "0x00"
+
+/*
+ * General Media Event Record - GMER
+ * CXL rev 3.1 Section 8.2.9.2.1.1; Table 8-45
+ */
+#define CXL_GMER_EVT_DESC_UNCORECTABLE_EVENT BIT(0)
+#define CXL_GMER_EVT_DESC_THRESHOLD_EVENT BIT(1)
+#define CXL_GMER_EVT_DESC_POISON_LIST_OVERFLOW BIT(2)
+#define show_event_desc_flags(flags) __print_flags(flags, "|", \
+ { CXL_GMER_EVT_DESC_UNCORECTABLE_EVENT, "UNCORRECTABLE_EVENT" }, \
+ { CXL_GMER_EVT_DESC_THRESHOLD_EVENT, "THRESHOLD_EVENT" }, \
+ { CXL_GMER_EVT_DESC_POISON_LIST_OVERFLOW, "POISON_LIST_OVERFLOW" } \
+)
+
+#define CXL_GMER_MEM_EVT_TYPE_ECC_ERROR 0x00
+#define CXL_GMER_MEM_EVT_TYPE_INV_ADDR 0x01
+#define CXL_GMER_MEM_EVT_TYPE_DATA_PATH_ERROR 0x02
+#define CXL_GMER_MEM_EVT_TYPE_TE_STATE_VIOLATION 0x03
+#define CXL_GMER_MEM_EVT_TYPE_SCRUB_MEDIA_ECC_ERROR 0x04
+#define CXL_GMER_MEM_EVT_TYPE_AP_CME_COUNTER_EXPIRE 0x05
+#define CXL_GMER_MEM_EVT_TYPE_CKID_VIOLATION 0x06
+#define show_gmer_mem_event_type(type) __print_symbolic(type, \
+ { CXL_GMER_MEM_EVT_TYPE_ECC_ERROR, "ECC Error" }, \
+ { CXL_GMER_MEM_EVT_TYPE_INV_ADDR, "Invalid Address" }, \
+ { CXL_GMER_MEM_EVT_TYPE_DATA_PATH_ERROR, "Data Path Error" }, \
+ { CXL_GMER_MEM_EVT_TYPE_TE_STATE_VIOLATION, "TE State Violation" }, \
+ { CXL_GMER_MEM_EVT_TYPE_SCRUB_MEDIA_ECC_ERROR, "Scrub Media ECC Error" }, \
+ { CXL_GMER_MEM_EVT_TYPE_AP_CME_COUNTER_EXPIRE, "Adv Prog CME Counter Expiration" }, \
+ { CXL_GMER_MEM_EVT_TYPE_CKID_VIOLATION, "CKID Violation" } \
+)
+
+#define CXL_GMER_TRANS_UNKNOWN 0x00
+#define CXL_GMER_TRANS_HOST_READ 0x01
+#define CXL_GMER_TRANS_HOST_WRITE 0x02
+#define CXL_GMER_TRANS_HOST_SCAN_MEDIA 0x03
+#define CXL_GMER_TRANS_HOST_INJECT_POISON 0x04
+#define CXL_GMER_TRANS_INTERNAL_MEDIA_SCRUB 0x05
+#define CXL_GMER_TRANS_INTERNAL_MEDIA_MANAGEMENT 0x06
+#define CXL_GMER_TRANS_INTERNAL_MEDIA_ECS 0x07
+#define CXL_GMER_TRANS_MEDIA_INITIALIZATION 0x08
+#define show_trans_type(type) __print_symbolic(type, \
+ { CXL_GMER_TRANS_UNKNOWN, "Unknown" }, \
+ { CXL_GMER_TRANS_HOST_READ, "Host Read" }, \
+ { CXL_GMER_TRANS_HOST_WRITE, "Host Write" }, \
+ { CXL_GMER_TRANS_HOST_SCAN_MEDIA, "Host Scan Media" }, \
+ { CXL_GMER_TRANS_HOST_INJECT_POISON, "Host Inject Poison" }, \
+ { CXL_GMER_TRANS_INTERNAL_MEDIA_SCRUB, "Internal Media Scrub" }, \
+ { CXL_GMER_TRANS_INTERNAL_MEDIA_MANAGEMENT, "Internal Media Management" }, \
+ { CXL_GMER_TRANS_INTERNAL_MEDIA_ECS, "Internal Media Error Check Scrub" }, \
+ { CXL_GMER_TRANS_MEDIA_INITIALIZATION, "Media Initialization" } \
+)
+
+#define CXL_GMER_VALID_CHANNEL BIT(0)
+#define CXL_GMER_VALID_RANK BIT(1)
+#define CXL_GMER_VALID_DEVICE BIT(2)
+#define CXL_GMER_VALID_COMPONENT BIT(3)
+#define CXL_GMER_VALID_COMPONENT_ID_FORMAT BIT(4)
+#define show_valid_flags(flags) __print_flags(flags, "|", \
+ { CXL_GMER_VALID_CHANNEL, "CHANNEL" }, \
+ { CXL_GMER_VALID_RANK, "RANK" }, \
+ { CXL_GMER_VALID_DEVICE, "DEVICE" }, \
+ { CXL_GMER_VALID_COMPONENT, "COMPONENT" }, \
+ { CXL_GMER_VALID_COMPONENT_ID_FORMAT, "COMPONENT PLDM FORMAT" } \
+)
+
+#define CXL_GMER_CME_EV_FLAG_CME_MULTIPLE_MEDIA BIT(0)
+#define CXL_GMER_CME_EV_FLAG_THRESHOLD_EXCEEDED BIT(1)
+#define show_cme_threshold_ev_flags(flags) __print_flags(flags, "|", \
+ { \
+ CXL_GMER_CME_EV_FLAG_CME_MULTIPLE_MEDIA, \
+ "Corrected Memory Errors in Multiple Media Components" \
+ }, { \
+ CXL_GMER_CME_EV_FLAG_THRESHOLD_EXCEEDED, \
+ "Exceeded Programmable Threshold" \
+ } \
+)
+
+#define CXL_GMER_MEM_EVT_SUB_TYPE_NOT_REPORTED 0x00
+#define CXL_GMER_MEM_EVT_SUB_TYPE_INTERNAL_DATAPATH_ERROR 0x01
+#define CXL_GMER_MEM_EVT_SUB_TYPE_MEDIA_LINK_COMMAND_TRAINING_ERROR 0x02
+#define CXL_GMER_MEM_EVT_SUB_TYPE_MEDIA_LINK_CONTROL_TRAINING_ERROR 0x03
+#define CXL_GMER_MEM_EVT_SUB_TYPE_MEDIA_LINK_DATA_TRAINING_ERROR 0x04
+#define CXL_GMER_MEM_EVT_SUB_TYPE_MEDIA_LINK_CRC_ERROR 0x05
+#define show_mem_event_sub_type(sub_type) __print_symbolic(sub_type, \
+ { CXL_GMER_MEM_EVT_SUB_TYPE_NOT_REPORTED, "Not Reported" }, \
+ { CXL_GMER_MEM_EVT_SUB_TYPE_INTERNAL_DATAPATH_ERROR, "Internal Datapath Error" }, \
+ { \
+ CXL_GMER_MEM_EVT_SUB_TYPE_MEDIA_LINK_COMMAND_TRAINING_ERROR, \
+ "Media Link Command Training Error" \
+ }, { \
+ CXL_GMER_MEM_EVT_SUB_TYPE_MEDIA_LINK_CONTROL_TRAINING_ERROR, \
+ "Media Link Control Training Error" \
+ }, { \
+ CXL_GMER_MEM_EVT_SUB_TYPE_MEDIA_LINK_DATA_TRAINING_ERROR, \
+ "Media Link Data Training Error" \
+ }, { \
+ CXL_GMER_MEM_EVT_SUB_TYPE_MEDIA_LINK_CRC_ERROR, "Media Link CRC Error" \
+ } \
+)
+
+TRACE_EVENT(cxl_general_media,
+
+ TP_PROTO(const struct cxl_memdev *cxlmd, enum cxl_event_log_type log,
+ struct cxl_region *cxlr, u64 hpa, u64 hpa_alias0,
+ struct cxl_event_gen_media *rec),
+
+ TP_ARGS(cxlmd, log, cxlr, hpa, hpa_alias0, rec),
+
+ TP_STRUCT__entry(
+ CXL_EVT_TP_entry
+ /* General Media */
+ __field(u64, dpa)
+ __field(u8, descriptor)
+ __field(u8, type)
+ __field(u8, transaction_type)
+ __field(u8, channel)
+ __field(u32, device)
+ __array(u8, comp_id, CXL_EVENT_GEN_MED_COMP_ID_SIZE)
+ /* Following are out of order to pack trace record */
+ __field(u64, hpa)
+ __field(u64, hpa_alias0)
+ __field_struct(uuid_t, region_uuid)
+ __field(u16, validity_flags)
+ __field(u8, rank)
+ __field(u8, dpa_flags)
+ __field(u32, cme_count)
+ __field(u8, sub_type)
+ __field(u8, cme_threshold_ev_flags)
+ __string(region_name, cxlr ? dev_name(&cxlr->dev) : "")
+ ),
+
+ TP_fast_assign(
+ CXL_EVT_TP_fast_assign(cxlmd, log, rec->media_hdr.hdr);
+ __entry->hdr_uuid = CXL_EVENT_GEN_MEDIA_UUID;
+
+ /* General Media */
+ __entry->dpa = le64_to_cpu(rec->media_hdr.phys_addr);
+ __entry->dpa_flags = __entry->dpa & CXL_DPA_FLAGS_MASK;
+ /* Mask after flags have been parsed */
+ __entry->dpa &= CXL_DPA_MASK;
+ __entry->descriptor = rec->media_hdr.descriptor;
+ __entry->type = rec->media_hdr.type;
+ __entry->sub_type = rec->sub_type;
+ __entry->transaction_type = rec->media_hdr.transaction_type;
+ __entry->channel = rec->media_hdr.channel;
+ __entry->rank = rec->media_hdr.rank;
+ __entry->device = get_unaligned_le24(rec->device);
+ memcpy(__entry->comp_id, &rec->component_id,
+ CXL_EVENT_GEN_MED_COMP_ID_SIZE);
+ __entry->validity_flags = get_unaligned_le16(&rec->media_hdr.validity_flags);
+ __entry->hpa = hpa;
+ __entry->hpa_alias0 = hpa_alias0;
+ if (cxlr) {
+ __assign_str(region_name);
+ uuid_copy(&__entry->region_uuid, &cxlr->params.uuid);
+ } else {
+ __assign_str(region_name);
+ uuid_copy(&__entry->region_uuid, &uuid_null);
+ }
+ __entry->cme_threshold_ev_flags = rec->cme_threshold_ev_flags;
+ if (rec->media_hdr.descriptor & CXL_GMER_EVT_DESC_THRESHOLD_EVENT)
+ __entry->cme_count = get_unaligned_le24(rec->cme_count);
+ else
+ __entry->cme_count = 0;
+ ),
+
+ CXL_EVT_TP_printk("dpa=%llx dpa_flags='%s' " \
+ "descriptor='%s' type='%s' sub_type='%s' " \
+ "transaction_type='%s' channel=%u rank=%u " \
+ "device=%x validity_flags='%s' " \
+ "comp_id=%s comp_id_pldm_valid_flags='%s' " \
+ "pldm_entity_id=%s pldm_resource_id=%s " \
+ "hpa=%llx hpa_alias0=%llx region=%s region_uuid=%pUb " \
+ "cme_threshold_ev_flags='%s' cme_count=%u",
+ __entry->dpa, show_dpa_flags(__entry->dpa_flags),
+ show_event_desc_flags(__entry->descriptor),
+ show_gmer_mem_event_type(__entry->type),
+ show_mem_event_sub_type(__entry->sub_type),
+ show_trans_type(__entry->transaction_type),
+ __entry->channel, __entry->rank, __entry->device,
+ show_valid_flags(__entry->validity_flags),
+ __print_hex(__entry->comp_id, CXL_EVENT_GEN_MED_COMP_ID_SIZE),
+ show_comp_id_pldm_flags(__entry->comp_id[0]),
+ show_pldm_entity_id(__entry->validity_flags, CXL_GMER_VALID_COMPONENT,
+ CXL_GMER_VALID_COMPONENT_ID_FORMAT, __entry->comp_id),
+ show_pldm_resource_id(__entry->validity_flags, CXL_GMER_VALID_COMPONENT,
+ CXL_GMER_VALID_COMPONENT_ID_FORMAT, __entry->comp_id),
+ __entry->hpa, __entry->hpa_alias0, __get_str(region_name), &__entry->region_uuid,
+ show_cme_threshold_ev_flags(__entry->cme_threshold_ev_flags), __entry->cme_count
+ )
+);
+
+/*
+ * DRAM Event Record - DER
+ *
+ * CXL rev 3.1 section 8.2.9.2.1.2; Table 8-46
+ */
+/*
+ * DRAM Event Record defines many fields the same as the General Media Event
+ * Record. Reuse those definitions as appropriate.
+ */
+#define CXL_DER_MEM_EVT_TYPE_ECC_ERROR 0x00
+#define CXL_DER_MEM_EVT_TYPE_SCRUB_MEDIA_ECC_ERROR 0x01
+#define CXL_DER_MEM_EVT_TYPE_INV_ADDR 0x02
+#define CXL_DER_MEM_EVT_TYPE_DATA_PATH_ERROR 0x03
+#define CXL_DER_MEM_EVT_TYPE_TE_STATE_VIOLATION 0x04
+#define CXL_DER_MEM_EVT_TYPE_AP_CME_COUNTER_EXPIRE 0x05
+#define CXL_DER_MEM_EVT_TYPE_CKID_VIOLATION 0x06
+#define show_dram_mem_event_type(type) __print_symbolic(type, \
+ { CXL_DER_MEM_EVT_TYPE_ECC_ERROR, "ECC Error" }, \
+ { CXL_DER_MEM_EVT_TYPE_SCRUB_MEDIA_ECC_ERROR, "Scrub Media ECC Error" }, \
+ { CXL_DER_MEM_EVT_TYPE_INV_ADDR, "Invalid Address" }, \
+ { CXL_DER_MEM_EVT_TYPE_DATA_PATH_ERROR, "Data Path Error" }, \
+ { CXL_DER_MEM_EVT_TYPE_TE_STATE_VIOLATION, "TE State Violation" }, \
+ { CXL_DER_MEM_EVT_TYPE_AP_CME_COUNTER_EXPIRE, "Adv Prog CME Counter Expiration" }, \
+ { CXL_DER_MEM_EVT_TYPE_CKID_VIOLATION, "CKID Violation" } \
+)
+
+#define CXL_DER_VALID_CHANNEL BIT(0)
+#define CXL_DER_VALID_RANK BIT(1)
+#define CXL_DER_VALID_NIBBLE BIT(2)
+#define CXL_DER_VALID_BANK_GROUP BIT(3)
+#define CXL_DER_VALID_BANK BIT(4)
+#define CXL_DER_VALID_ROW BIT(5)
+#define CXL_DER_VALID_COLUMN BIT(6)
+#define CXL_DER_VALID_CORRECTION_MASK BIT(7)
+#define CXL_DER_VALID_COMPONENT BIT(8)
+#define CXL_DER_VALID_COMPONENT_ID_FORMAT BIT(9)
+#define CXL_DER_VALID_SUB_CHANNEL BIT(10)
+#define show_dram_valid_flags(flags) __print_flags(flags, "|", \
+ { CXL_DER_VALID_CHANNEL, "CHANNEL" }, \
+ { CXL_DER_VALID_RANK, "RANK" }, \
+ { CXL_DER_VALID_NIBBLE, "NIBBLE" }, \
+ { CXL_DER_VALID_BANK_GROUP, "BANK GROUP" }, \
+ { CXL_DER_VALID_BANK, "BANK" }, \
+ { CXL_DER_VALID_ROW, "ROW" }, \
+ { CXL_DER_VALID_COLUMN, "COLUMN" }, \
+ { CXL_DER_VALID_CORRECTION_MASK, "CORRECTION MASK" }, \
+ { CXL_DER_VALID_COMPONENT, "COMPONENT" }, \
+ { CXL_DER_VALID_COMPONENT_ID_FORMAT, "COMPONENT PLDM FORMAT" }, \
+ { CXL_DER_VALID_SUB_CHANNEL, "SUB CHANNEL" } \
+)
+
+TRACE_EVENT(cxl_dram,
+
+ TP_PROTO(const struct cxl_memdev *cxlmd, enum cxl_event_log_type log,
+ struct cxl_region *cxlr, u64 hpa, u64 hpa_alias0,
+ struct cxl_event_dram *rec),
+
+ TP_ARGS(cxlmd, log, cxlr, hpa, hpa_alias0, rec),
+
+ TP_STRUCT__entry(
+ CXL_EVT_TP_entry
+ /* DRAM */
+ __field(u64, dpa)
+ __field(u8, descriptor)
+ __field(u8, type)
+ __field(u8, transaction_type)
+ __field(u8, channel)
+ __field(u16, validity_flags)
+ __field(u16, column) /* Out of order to pack trace record */
+ __field(u32, nibble_mask)
+ __field(u32, row)
+ __array(u8, cor_mask, CXL_EVENT_DER_CORRECTION_MASK_SIZE)
+ __field(u64, hpa)
+ __field(u64, hpa_alias0)
+ __field_struct(uuid_t, region_uuid)
+ __field(u8, rank) /* Out of order to pack trace record */
+ __field(u8, bank_group) /* Out of order to pack trace record */
+ __field(u8, bank) /* Out of order to pack trace record */
+ __field(u8, dpa_flags) /* Out of order to pack trace record */
+ /* Following are out of order to pack trace record */
+ __array(u8, comp_id, CXL_EVENT_GEN_MED_COMP_ID_SIZE)
+ __field(u32, cvme_count)
+ __field(u8, sub_type)
+ __field(u8, sub_channel)
+ __field(u8, cme_threshold_ev_flags)
+ __string(region_name, cxlr ? dev_name(&cxlr->dev) : "")
+ ),
+
+ TP_fast_assign(
+ CXL_EVT_TP_fast_assign(cxlmd, log, rec->media_hdr.hdr);
+ __entry->hdr_uuid = CXL_EVENT_DRAM_UUID;
+
+ /* DRAM */
+ __entry->dpa = le64_to_cpu(rec->media_hdr.phys_addr);
+ __entry->dpa_flags = __entry->dpa & CXL_DPA_FLAGS_MASK;
+ __entry->dpa &= CXL_DPA_MASK;
+ __entry->descriptor = rec->media_hdr.descriptor;
+ __entry->type = rec->media_hdr.type;
+ __entry->sub_type = rec->sub_type;
+ __entry->transaction_type = rec->media_hdr.transaction_type;
+ __entry->validity_flags = get_unaligned_le16(rec->media_hdr.validity_flags);
+ __entry->channel = rec->media_hdr.channel;
+ __entry->rank = rec->media_hdr.rank;
+ __entry->nibble_mask = get_unaligned_le24(rec->nibble_mask);
+ __entry->bank_group = rec->bank_group;
+ __entry->bank = rec->bank;
+ __entry->row = get_unaligned_le24(rec->row);
+ __entry->column = get_unaligned_le16(rec->column);
+ memcpy(__entry->cor_mask, &rec->correction_mask,
+ CXL_EVENT_DER_CORRECTION_MASK_SIZE);
+ __entry->hpa = hpa;
+ __entry->hpa_alias0 = hpa_alias0;
+ if (cxlr) {
+ __assign_str(region_name);
+ uuid_copy(&__entry->region_uuid, &cxlr->params.uuid);
+ } else {
+ __assign_str(region_name);
+ uuid_copy(&__entry->region_uuid, &uuid_null);
+ }
+ memcpy(__entry->comp_id, &rec->component_id,
+ CXL_EVENT_GEN_MED_COMP_ID_SIZE);
+ __entry->sub_channel = rec->sub_channel;
+ __entry->cme_threshold_ev_flags = rec->cme_threshold_ev_flags;
+ if (rec->media_hdr.descriptor & CXL_GMER_EVT_DESC_THRESHOLD_EVENT)
+ __entry->cvme_count = get_unaligned_le24(rec->cvme_count);
+ else
+ __entry->cvme_count = 0;
+ ),
+
+ CXL_EVT_TP_printk("dpa=%llx dpa_flags='%s' descriptor='%s' type='%s' sub_type='%s' " \
+ "transaction_type='%s' channel=%u rank=%u nibble_mask=%x " \
+ "bank_group=%u bank=%u row=%u column=%u cor_mask=%s " \
+ "validity_flags='%s' " \
+ "comp_id=%s comp_id_pldm_valid_flags='%s' " \
+ "pldm_entity_id=%s pldm_resource_id=%s " \
+ "hpa=%llx hpa_alias0=%llx region=%s region_uuid=%pUb " \
+ "sub_channel=%u cme_threshold_ev_flags='%s' cvme_count=%u",
+ __entry->dpa, show_dpa_flags(__entry->dpa_flags),
+ show_event_desc_flags(__entry->descriptor),
+ show_dram_mem_event_type(__entry->type),
+ show_mem_event_sub_type(__entry->sub_type),
+ show_trans_type(__entry->transaction_type),
+ __entry->channel, __entry->rank, __entry->nibble_mask,
+ __entry->bank_group, __entry->bank,
+ __entry->row, __entry->column,
+ __print_hex(__entry->cor_mask, CXL_EVENT_DER_CORRECTION_MASK_SIZE),
+ show_dram_valid_flags(__entry->validity_flags),
+ __print_hex(__entry->comp_id, CXL_EVENT_GEN_MED_COMP_ID_SIZE),
+ show_comp_id_pldm_flags(__entry->comp_id[0]),
+ show_pldm_entity_id(__entry->validity_flags, CXL_DER_VALID_COMPONENT,
+ CXL_DER_VALID_COMPONENT_ID_FORMAT, __entry->comp_id),
+ show_pldm_resource_id(__entry->validity_flags, CXL_DER_VALID_COMPONENT,
+ CXL_DER_VALID_COMPONENT_ID_FORMAT, __entry->comp_id),
+ __entry->hpa, __entry->hpa_alias0, __get_str(region_name), &__entry->region_uuid,
+ __entry->sub_channel, show_cme_threshold_ev_flags(__entry->cme_threshold_ev_flags),
+ __entry->cvme_count
+ )
+);
+
+/*
+ * Memory Module Event Record - MMER
+ *
+ * CXL rev 3.1 section 8.2.9.2.1.3; Table 8-47
+ */
+#define CXL_MMER_HEALTH_STATUS_CHANGE 0x00
+#define CXL_MMER_MEDIA_STATUS_CHANGE 0x01
+#define CXL_MMER_LIFE_USED_CHANGE 0x02
+#define CXL_MMER_TEMP_CHANGE 0x03
+#define CXL_MMER_DATA_PATH_ERROR 0x04
+#define CXL_MMER_LSA_ERROR 0x05
+#define CXL_MMER_UNRECOV_SIDEBAND_BUS_ERROR 0x06
+#define CXL_MMER_MEMORY_MEDIA_FRU_ERROR 0x07
+#define CXL_MMER_POWER_MANAGEMENT_FAULT 0x08
+#define show_dev_evt_type(type) __print_symbolic(type, \
+ { CXL_MMER_HEALTH_STATUS_CHANGE, "Health Status Change" }, \
+ { CXL_MMER_MEDIA_STATUS_CHANGE, "Media Status Change" }, \
+ { CXL_MMER_LIFE_USED_CHANGE, "Life Used Change" }, \
+ { CXL_MMER_TEMP_CHANGE, "Temperature Change" }, \
+ { CXL_MMER_DATA_PATH_ERROR, "Data Path Error" }, \
+ { CXL_MMER_LSA_ERROR, "LSA Error" }, \
+ { CXL_MMER_UNRECOV_SIDEBAND_BUS_ERROR, "Unrecoverable Internal Sideband Bus Error" }, \
+ { CXL_MMER_MEMORY_MEDIA_FRU_ERROR, "Memory Media FRU Error" }, \
+ { CXL_MMER_POWER_MANAGEMENT_FAULT, "Power Management Fault" } \
+)
+
+/*
+ * Device Health Information - DHI
+ *
+ * CXL rev 3.1 section 8.2.9.9.3.1; Table 8-133
+ */
+#define CXL_DHI_HS_MAINTENANCE_NEEDED BIT(0)
+#define CXL_DHI_HS_PERFORMANCE_DEGRADED BIT(1)
+#define CXL_DHI_HS_HW_REPLACEMENT_NEEDED BIT(2)
+#define CXL_DHI_HS_MEM_CAPACITY_DEGRADED BIT(3)
+#define show_health_status_flags(flags) __print_flags(flags, "|", \
+ { CXL_DHI_HS_MAINTENANCE_NEEDED, "MAINTENANCE_NEEDED" }, \
+ { CXL_DHI_HS_PERFORMANCE_DEGRADED, "PERFORMANCE_DEGRADED" }, \
+ { CXL_DHI_HS_HW_REPLACEMENT_NEEDED, "REPLACEMENT_NEEDED" }, \
+ { CXL_DHI_HS_MEM_CAPACITY_DEGRADED, "MEM_CAPACITY_DEGRADED" } \
+)
+
+#define CXL_DHI_MS_NORMAL 0x00
+#define CXL_DHI_MS_NOT_READY 0x01
+#define CXL_DHI_MS_WRITE_PERSISTENCY_LOST 0x02
+#define CXL_DHI_MS_ALL_DATA_LOST 0x03
+#define CXL_DHI_MS_WRITE_PERSISTENCY_LOSS_EVENT_POWER_LOSS 0x04
+#define CXL_DHI_MS_WRITE_PERSISTENCY_LOSS_EVENT_SHUTDOWN 0x05
+#define CXL_DHI_MS_WRITE_PERSISTENCY_LOSS_IMMINENT 0x06
+#define CXL_DHI_MS_WRITE_ALL_DATA_LOSS_EVENT_POWER_LOSS 0x07
+#define CXL_DHI_MS_WRITE_ALL_DATA_LOSS_EVENT_SHUTDOWN 0x08
+#define CXL_DHI_MS_WRITE_ALL_DATA_LOSS_IMMINENT 0x09
+#define show_media_status(ms) __print_symbolic(ms, \
+ { CXL_DHI_MS_NORMAL, \
+ "Normal" }, \
+ { CXL_DHI_MS_NOT_READY, \
+ "Not Ready" }, \
+ { CXL_DHI_MS_WRITE_PERSISTENCY_LOST, \
+ "Write Persistency Lost" }, \
+ { CXL_DHI_MS_ALL_DATA_LOST, \
+ "All Data Lost" }, \
+ { CXL_DHI_MS_WRITE_PERSISTENCY_LOSS_EVENT_POWER_LOSS, \
+ "Write Persistency Loss in the Event of Power Loss" }, \
+ { CXL_DHI_MS_WRITE_PERSISTENCY_LOSS_EVENT_SHUTDOWN, \
+ "Write Persistency Loss in Event of Shutdown" }, \
+ { CXL_DHI_MS_WRITE_PERSISTENCY_LOSS_IMMINENT, \
+ "Write Persistency Loss Imminent" }, \
+ { CXL_DHI_MS_WRITE_ALL_DATA_LOSS_EVENT_POWER_LOSS, \
+ "All Data Loss in Event of Power Loss" }, \
+ { CXL_DHI_MS_WRITE_ALL_DATA_LOSS_EVENT_SHUTDOWN, \
+ "All Data loss in the Event of Shutdown" }, \
+ { CXL_DHI_MS_WRITE_ALL_DATA_LOSS_IMMINENT, \
+ "All Data Loss Imminent" } \
+)
+
+#define CXL_DHI_AS_NORMAL 0x0
+#define CXL_DHI_AS_WARNING 0x1
+#define CXL_DHI_AS_CRITICAL 0x2
+#define show_two_bit_status(as) __print_symbolic(as, \
+ { CXL_DHI_AS_NORMAL, "Normal" }, \
+ { CXL_DHI_AS_WARNING, "Warning" }, \
+ { CXL_DHI_AS_CRITICAL, "Critical" } \
+)
+#define show_one_bit_status(as) __print_symbolic(as, \
+ { CXL_DHI_AS_NORMAL, "Normal" }, \
+ { CXL_DHI_AS_WARNING, "Warning" } \
+)
+
+#define CXL_DHI_AS_LIFE_USED(as) (as & 0x3)
+#define CXL_DHI_AS_DEV_TEMP(as) ((as & 0xC) >> 2)
+#define CXL_DHI_AS_COR_VOL_ERR_CNT(as) ((as & 0x10) >> 4)
+#define CXL_DHI_AS_COR_PER_ERR_CNT(as) ((as & 0x20) >> 5)
+
+#define CXL_MMER_VALID_COMPONENT BIT(0)
+#define CXL_MMER_VALID_COMPONENT_ID_FORMAT BIT(1)
+#define show_mem_module_valid_flags(flags) __print_flags(flags, "|", \
+ { CXL_MMER_VALID_COMPONENT, "COMPONENT" }, \
+ { CXL_MMER_VALID_COMPONENT_ID_FORMAT, "COMPONENT PLDM FORMAT" } \
+)
+#define CXL_MMER_DEV_EVT_SUB_TYPE_NOT_REPORTED 0x00
+#define CXL_MMER_DEV_EVT_SUB_TYPE_INVALID_CONFIG_DATA 0x01
+#define CXL_MMER_DEV_EVT_SUB_TYPE_UNSUPP_CONFIG_DATA 0x02
+#define CXL_MMER_DEV_EVT_SUB_TYPE_UNSUPP_MEM_MEDIA_FRU 0x03
+#define show_dev_event_sub_type(sub_type) __print_symbolic(sub_type, \
+ { CXL_MMER_DEV_EVT_SUB_TYPE_NOT_REPORTED, "Not Reported" }, \
+ { CXL_MMER_DEV_EVT_SUB_TYPE_INVALID_CONFIG_DATA, "Invalid Config Data" }, \
+ { CXL_MMER_DEV_EVT_SUB_TYPE_UNSUPP_CONFIG_DATA, "Unsupported Config Data" }, \
+ { \
+ CXL_MMER_DEV_EVT_SUB_TYPE_UNSUPP_MEM_MEDIA_FRU, \
+ "Unsupported Memory Media FRU" \
+ } \
+)
+
+TRACE_EVENT(cxl_memory_module,
+
+ TP_PROTO(const struct cxl_memdev *cxlmd, enum cxl_event_log_type log,
+ struct cxl_event_mem_module *rec),
+
+ TP_ARGS(cxlmd, log, rec),
+
+ TP_STRUCT__entry(
+ CXL_EVT_TP_entry
+
+ /* Memory Module Event */
+ __field(u8, event_type)
+
+ /* Device Health Info */
+ __field(u8, health_status)
+ __field(u8, media_status)
+ __field(u8, life_used)
+ __field(u32, dirty_shutdown_cnt)
+ __field(u32, cor_vol_err_cnt)
+ __field(u32, cor_per_err_cnt)
+ __field(s16, device_temp)
+ __field(u8, add_status)
+ __field(u8, event_sub_type)
+ __array(u8, comp_id, CXL_EVENT_GEN_MED_COMP_ID_SIZE)
+ __field(u16, validity_flags)
+ ),
+
+ TP_fast_assign(
+ CXL_EVT_TP_fast_assign(cxlmd, log, rec->hdr);
+ __entry->hdr_uuid = CXL_EVENT_MEM_MODULE_UUID;
+
+ /* Memory Module Event */
+ __entry->event_type = rec->event_type;
+ __entry->event_sub_type = rec->event_sub_type;
+
+ /* Device Health Info */
+ __entry->health_status = rec->info.health_status;
+ __entry->media_status = rec->info.media_status;
+ __entry->life_used = rec->info.life_used;
+ __entry->dirty_shutdown_cnt = get_unaligned_le32(rec->info.dirty_shutdown_cnt);
+ __entry->cor_vol_err_cnt = get_unaligned_le32(rec->info.cor_vol_err_cnt);
+ __entry->cor_per_err_cnt = get_unaligned_le32(rec->info.cor_per_err_cnt);
+ __entry->device_temp = get_unaligned_le16(rec->info.device_temp);
+ __entry->add_status = rec->info.add_status;
+ __entry->validity_flags = get_unaligned_le16(rec->validity_flags);
+ memcpy(__entry->comp_id, &rec->component_id,
+ CXL_EVENT_GEN_MED_COMP_ID_SIZE);
+ ),
+
+ CXL_EVT_TP_printk("event_type='%s' event_sub_type='%s' health_status='%s' " \
+ "media_status='%s' as_life_used=%s as_dev_temp=%s as_cor_vol_err_cnt=%s " \
+ "as_cor_per_err_cnt=%s life_used=%u device_temp=%d " \
+ "dirty_shutdown_cnt=%u cor_vol_err_cnt=%u cor_per_err_cnt=%u " \
+ "validity_flags='%s' " \
+ "comp_id=%s comp_id_pldm_valid_flags='%s' " \
+ "pldm_entity_id=%s pldm_resource_id=%s",
+ show_dev_evt_type(__entry->event_type),
+ show_dev_event_sub_type(__entry->event_sub_type),
+ show_health_status_flags(__entry->health_status),
+ show_media_status(__entry->media_status),
+ show_two_bit_status(CXL_DHI_AS_LIFE_USED(__entry->add_status)),
+ show_two_bit_status(CXL_DHI_AS_DEV_TEMP(__entry->add_status)),
+ show_one_bit_status(CXL_DHI_AS_COR_VOL_ERR_CNT(__entry->add_status)),
+ show_one_bit_status(CXL_DHI_AS_COR_PER_ERR_CNT(__entry->add_status)),
+ __entry->life_used, __entry->device_temp,
+ __entry->dirty_shutdown_cnt, __entry->cor_vol_err_cnt,
+ __entry->cor_per_err_cnt,
+ show_mem_module_valid_flags(__entry->validity_flags),
+ __print_hex(__entry->comp_id, CXL_EVENT_GEN_MED_COMP_ID_SIZE),
+ show_comp_id_pldm_flags(__entry->comp_id[0]),
+ show_pldm_entity_id(__entry->validity_flags, CXL_MMER_VALID_COMPONENT,
+ CXL_MMER_VALID_COMPONENT_ID_FORMAT, __entry->comp_id),
+ show_pldm_resource_id(__entry->validity_flags, CXL_MMER_VALID_COMPONENT,
+ CXL_MMER_VALID_COMPONENT_ID_FORMAT, __entry->comp_id)
+ )
+);
+
+/*
+ * Memory Sparing Event Record - MSER
+ *
+ * CXL rev 3.2 section 8.2.10.2.1.4; Table 8-60
+ */
+#define CXL_MSER_QUERY_RESOURCE_FLAG BIT(0)
+#define CXL_MSER_HARD_SPARING_FLAG BIT(1)
+#define CXL_MSER_DEV_INITED_FLAG BIT(2)
+#define show_mem_sparing_flags(flags) __print_flags(flags, "|", \
+ { CXL_MSER_QUERY_RESOURCE_FLAG, "Query Resources" }, \
+ { CXL_MSER_HARD_SPARING_FLAG, "Hard Sparing" }, \
+ { CXL_MSER_DEV_INITED_FLAG, "Device Initiated Sparing" } \
+)
+
+#define CXL_MSER_VALID_CHANNEL BIT(0)
+#define CXL_MSER_VALID_RANK BIT(1)
+#define CXL_MSER_VALID_NIBBLE BIT(2)
+#define CXL_MSER_VALID_BANK_GROUP BIT(3)
+#define CXL_MSER_VALID_BANK BIT(4)
+#define CXL_MSER_VALID_ROW BIT(5)
+#define CXL_MSER_VALID_COLUMN BIT(6)
+#define CXL_MSER_VALID_COMPONENT_ID BIT(7)
+#define CXL_MSER_VALID_COMPONENT_ID_FORMAT BIT(8)
+#define CXL_MSER_VALID_SUB_CHANNEL BIT(9)
+#define show_mem_sparing_valid_flags(flags) __print_flags(flags, "|", \
+ { CXL_MSER_VALID_CHANNEL, "CHANNEL" }, \
+ { CXL_MSER_VALID_RANK, "RANK" }, \
+ { CXL_MSER_VALID_NIBBLE, "NIBBLE" }, \
+ { CXL_MSER_VALID_BANK_GROUP, "BANK GROUP" }, \
+ { CXL_MSER_VALID_BANK, "BANK" }, \
+ { CXL_MSER_VALID_ROW, "ROW" }, \
+ { CXL_MSER_VALID_COLUMN, "COLUMN" }, \
+ { CXL_MSER_VALID_COMPONENT_ID, "COMPONENT ID" }, \
+ { CXL_MSER_VALID_COMPONENT_ID_FORMAT, "COMPONENT ID PLDM FORMAT" }, \
+ { CXL_MSER_VALID_SUB_CHANNEL, "SUB CHANNEL" } \
+)
+
+TRACE_EVENT(cxl_memory_sparing,
+
+ TP_PROTO(const struct cxl_memdev *cxlmd, enum cxl_event_log_type log,
+ struct cxl_event_mem_sparing *rec),
+
+ TP_ARGS(cxlmd, log, rec),
+
+ TP_STRUCT__entry(
+ CXL_EVT_TP_entry
+
+ /* Memory Sparing Event */
+ __field(u8, flags)
+ __field(u8, result)
+ __field(u16, validity_flags)
+ __field(u16, res_avail)
+ __field(u8, channel)
+ __field(u8, rank)
+ __field(u32, nibble_mask)
+ __field(u8, bank_group)
+ __field(u8, bank)
+ __field(u32, row)
+ __field(u16, column)
+ __field(u8, sub_channel)
+ __array(u8, comp_id, CXL_EVENT_GEN_MED_COMP_ID_SIZE)
+ ),
+
+ TP_fast_assign(
+ CXL_EVT_TP_fast_assign(cxlmd, log, rec->hdr);
+ __entry->hdr_uuid = CXL_EVENT_MEM_SPARING_UUID;
+
+ /* Memory Sparing Event */
+ __entry->flags = rec->flags;
+ __entry->result = rec->result;
+ __entry->validity_flags = le16_to_cpu(rec->validity_flags);
+ __entry->res_avail = le16_to_cpu(rec->res_avail);
+ __entry->channel = rec->channel;
+ __entry->rank = rec->rank;
+ __entry->nibble_mask = get_unaligned_le24(rec->nibble_mask);
+ __entry->bank_group = rec->bank_group;
+ __entry->bank = rec->bank;
+ __entry->row = get_unaligned_le24(rec->row);
+ __entry->column = le16_to_cpu(rec->column);
+ __entry->sub_channel = rec->sub_channel;
+ memcpy(__entry->comp_id, &rec->component_id,
+ CXL_EVENT_GEN_MED_COMP_ID_SIZE);
+ ),
+
+ CXL_EVT_TP_printk("flags='%s' result=%u validity_flags='%s' " \
+ "spare resource avail=%u channel=%u rank=%u " \
+ "nibble_mask=%x bank_group=%u bank=%u " \
+ "row=%u column=%u sub_channel=%u " \
+ "comp_id=%s comp_id_pldm_valid_flags='%s' " \
+ "pldm_entity_id=%s pldm_resource_id=%s",
+ show_mem_sparing_flags(__entry->flags),
+ __entry->result,
+ show_mem_sparing_valid_flags(__entry->validity_flags),
+ __entry->res_avail, __entry->channel, __entry->rank,
+ __entry->nibble_mask, __entry->bank_group, __entry->bank,
+ __entry->row, __entry->column, __entry->sub_channel,
+ __print_hex(__entry->comp_id, CXL_EVENT_GEN_MED_COMP_ID_SIZE),
+ show_comp_id_pldm_flags(__entry->comp_id[0]),
+ show_pldm_entity_id(__entry->validity_flags, CXL_MSER_VALID_COMPONENT_ID,
+ CXL_MSER_VALID_COMPONENT_ID_FORMAT, __entry->comp_id),
+ show_pldm_resource_id(__entry->validity_flags, CXL_MSER_VALID_COMPONENT_ID,
+ CXL_MSER_VALID_COMPONENT_ID_FORMAT, __entry->comp_id)
+ )
+);
+
+#define show_poison_trace_type(type) \
+ __print_symbolic(type, \
+ { CXL_POISON_TRACE_LIST, "List" }, \
+ { CXL_POISON_TRACE_INJECT, "Inject" }, \
+ { CXL_POISON_TRACE_CLEAR, "Clear" })
+
+#define __show_poison_source(source) \
+ __print_symbolic(source, \
+ { CXL_POISON_SOURCE_UNKNOWN, "Unknown" }, \
+ { CXL_POISON_SOURCE_EXTERNAL, "External" }, \
+ { CXL_POISON_SOURCE_INTERNAL, "Internal" }, \
+ { CXL_POISON_SOURCE_INJECTED, "Injected" }, \
+ { CXL_POISON_SOURCE_VENDOR, "Vendor" })
+
+#define show_poison_source(source) \
+ (((source > CXL_POISON_SOURCE_INJECTED) && \
+ (source != CXL_POISON_SOURCE_VENDOR)) ? "Reserved" \
+ : __show_poison_source(source))
+
+#define show_poison_flags(flags) \
+ __print_flags(flags, "|", \
+ { CXL_POISON_FLAG_MORE, "More" }, \
+ { CXL_POISON_FLAG_OVERFLOW, "Overflow" }, \
+ { CXL_POISON_FLAG_SCANNING, "Scanning" })
+
+#define __cxl_poison_addr(record) \
+ (le64_to_cpu(record->address))
+#define cxl_poison_record_dpa(record) \
+ (__cxl_poison_addr(record) & CXL_POISON_START_MASK)
+#define cxl_poison_record_source(record) \
+ (__cxl_poison_addr(record) & CXL_POISON_SOURCE_MASK)
+#define cxl_poison_record_dpa_length(record) \
+ (le32_to_cpu(record->length) * CXL_POISON_LEN_MULT)
+#define cxl_poison_overflow(flags, time) \
+ (flags & CXL_POISON_FLAG_OVERFLOW ? le64_to_cpu(time) : 0)
+
+TRACE_EVENT(cxl_poison,
+
+ TP_PROTO(struct cxl_memdev *cxlmd, struct cxl_region *cxlr,
+ const struct cxl_poison_record *record, u8 flags,
+ __le64 overflow_ts, enum cxl_poison_trace_type trace_type),
+
+ TP_ARGS(cxlmd, cxlr, record, flags, overflow_ts, trace_type),
+
+ TP_STRUCT__entry(
+ __string(memdev, dev_name(&cxlmd->dev))
+ __string(host, dev_name(cxlmd->dev.parent))
+ __field(u64, serial)
+ __field(u8, trace_type)
+ __string(region, cxlr ? dev_name(&cxlr->dev) : "")
+ __field(u64, overflow_ts)
+ __field(u64, hpa)
+ __field(u64, hpa_alias0)
+ __field(u64, dpa)
+ __field(u32, dpa_length)
+ __array(char, uuid, 16)
+ __field(u8, source)
+ __field(u8, flags)
+ ),
+
+ TP_fast_assign(
+ __assign_str(memdev);
+ __assign_str(host);
+ __entry->serial = cxlmd->cxlds->serial;
+ __entry->overflow_ts = cxl_poison_overflow(flags, overflow_ts);
+ __entry->dpa = cxl_poison_record_dpa(record);
+ __entry->dpa_length = cxl_poison_record_dpa_length(record);
+ __entry->source = cxl_poison_record_source(record);
+ __entry->trace_type = trace_type;
+ __entry->flags = flags;
+ if (cxlr) {
+ __assign_str(region);
+ memcpy(__entry->uuid, &cxlr->params.uuid, 16);
+ __entry->hpa = cxl_dpa_to_hpa(cxlr, cxlmd,
+ __entry->dpa);
+ if (__entry->hpa != ULLONG_MAX && cxlr->params.cache_size)
+ __entry->hpa_alias0 = __entry->hpa -
+ cxlr->params.cache_size;
+ else
+ __entry->hpa_alias0 = ULLONG_MAX;
+ } else {
+ __assign_str(region);
+ memset(__entry->uuid, 0, 16);
+ __entry->hpa = ULLONG_MAX;
+ __entry->hpa_alias0 = ULLONG_MAX;
+ }
+ ),
+
+ TP_printk("memdev=%s host=%s serial=%lld trace_type=%s region=%s " \
+ "region_uuid=%pU hpa=0x%llx hpa_alias0=0x%llx dpa=0x%llx " \
+ "dpa_length=0x%x source=%s flags=%s overflow_time=%llu",
+ __get_str(memdev),
+ __get_str(host),
+ __entry->serial,
+ show_poison_trace_type(__entry->trace_type),
+ __get_str(region),
+ __entry->uuid,
+ __entry->hpa,
+ __entry->hpa_alias0,
+ __entry->dpa,
+ __entry->dpa_length,
+ show_poison_source(__entry->source),
+ show_poison_flags(__entry->flags),
+ __entry->overflow_ts
+ )
+);
+
+#endif /* _CXL_EVENTS_H */
+
+#define TRACE_INCLUDE_FILE trace
+#include <trace/define_trace.h>
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index b6bda39a59e3..ba17fa86d249 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -6,8 +6,14 @@
#include <linux/libnvdimm.h>
#include <linux/bitfield.h>
+#include <linux/notifier.h>
#include <linux/bitops.h>
+#include <linux/log2.h>
+#include <linux/node.h>
#include <linux/io.h>
+#include <linux/range.h>
+
+extern const struct nvdimm_security_ops *cxl_security_ops;
/**
* DOC: cxl objects
@@ -17,6 +23,9 @@
* (port-driver, region-driver, nvdimm object-drivers... etc).
*/
+/* CXL 2.0 8.2.4 CXL Component Register Layout and Definition */
+#define CXL_COMPONENT_REG_BLOCK_SIZE SZ_64K
+
/* CXL 2.0 8.2.5 CXL.cache and CXL.mem Registers*/
#define CXL_CM_OFFSET 0x1000
#define CXL_CM_CAP_HDR_OFFSET 0x0
@@ -29,6 +38,7 @@
#define CXL_CM_CAP_HDR_ARRAY_SIZE_MASK GENMASK(31, 24)
#define CXL_CM_CAP_PTR_MASK GENMASK(31, 20)
+#define CXL_CM_CAP_CAP_ID_RAS 0x2
#define CXL_CM_CAP_CAP_ID_HDM 0x5
#define CXL_CM_CAP_CAP_HDM_VERSION 1
@@ -36,11 +46,32 @@
#define CXL_HDM_DECODER_CAP_OFFSET 0x0
#define CXL_HDM_DECODER_COUNT_MASK GENMASK(3, 0)
#define CXL_HDM_DECODER_TARGET_COUNT_MASK GENMASK(7, 4)
-#define CXL_HDM_DECODER0_BASE_LOW_OFFSET 0x10
-#define CXL_HDM_DECODER0_BASE_HIGH_OFFSET 0x14
-#define CXL_HDM_DECODER0_SIZE_LOW_OFFSET 0x18
-#define CXL_HDM_DECODER0_SIZE_HIGH_OFFSET 0x1c
-#define CXL_HDM_DECODER0_CTRL_OFFSET 0x20
+#define CXL_HDM_DECODER_INTERLEAVE_11_8 BIT(8)
+#define CXL_HDM_DECODER_INTERLEAVE_14_12 BIT(9)
+#define CXL_HDM_DECODER_INTERLEAVE_3_6_12_WAY BIT(11)
+#define CXL_HDM_DECODER_INTERLEAVE_16_WAY BIT(12)
+#define CXL_HDM_DECODER_CTRL_OFFSET 0x4
+#define CXL_HDM_DECODER_ENABLE BIT(1)
+#define CXL_HDM_DECODER0_BASE_LOW_OFFSET(i) (0x20 * (i) + 0x10)
+#define CXL_HDM_DECODER0_BASE_HIGH_OFFSET(i) (0x20 * (i) + 0x14)
+#define CXL_HDM_DECODER0_SIZE_LOW_OFFSET(i) (0x20 * (i) + 0x18)
+#define CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(i) (0x20 * (i) + 0x1c)
+#define CXL_HDM_DECODER0_CTRL_OFFSET(i) (0x20 * (i) + 0x20)
+#define CXL_HDM_DECODER0_CTRL_IG_MASK GENMASK(3, 0)
+#define CXL_HDM_DECODER0_CTRL_IW_MASK GENMASK(7, 4)
+#define CXL_HDM_DECODER0_CTRL_LOCK BIT(8)
+#define CXL_HDM_DECODER0_CTRL_COMMIT BIT(9)
+#define CXL_HDM_DECODER0_CTRL_COMMITTED BIT(10)
+#define CXL_HDM_DECODER0_CTRL_COMMIT_ERROR BIT(11)
+#define CXL_HDM_DECODER0_CTRL_HOSTONLY BIT(12)
+#define CXL_HDM_DECODER0_TL_LOW(i) (0x20 * (i) + 0x24)
+#define CXL_HDM_DECODER0_TL_HIGH(i) (0x20 * (i) + 0x28)
+#define CXL_HDM_DECODER0_SKIP_LOW(i) CXL_HDM_DECODER0_TL_LOW(i)
+#define CXL_HDM_DECODER0_SKIP_HIGH(i) CXL_HDM_DECODER0_TL_HIGH(i)
+
+/* HDM decoder control register constants CXL 3.0 8.2.5.19.7 */
+#define CXL_DECODER_MIN_GRANULARITY 256
+#define CXL_DECODER_MAX_ENCODED_IG 6
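
Worked example of the register layout above: each HDM decoder instance occupies a 0x20-byte stride, so decoder 2's control register sits at CXL_HDM_DECODER0_CTRL_OFFSET(2) = 0x20 * 2 + 0x20 = 0x60 from the HDM capability base, with its target-list/skip registers immediately following at 0x64 and 0x68.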
static inline int cxl_hdm_decoder_count(u32 cap_hdr)
{
@@ -49,6 +80,77 @@ static inline int cxl_hdm_decoder_count(u32 cap_hdr)
return val ? val * 2 : 1;
}
+/* Encode defined in CXL 2.0 8.2.5.12.7 HDM Decoder Control Register */
+static inline int eig_to_granularity(u16 eig, unsigned int *granularity)
+{
+ if (eig > CXL_DECODER_MAX_ENCODED_IG)
+ return -EINVAL;
+ *granularity = CXL_DECODER_MIN_GRANULARITY << eig;
+ return 0;
+}
+
+/* Encode defined in CXL ECN "3, 6, 12 and 16-way memory Interleaving" */
+static inline int eiw_to_ways(u8 eiw, unsigned int *ways)
+{
+ switch (eiw) {
+ case 0 ... 4:
+ *ways = 1 << eiw;
+ break;
+ case 8 ... 10:
+ *ways = 3 << (eiw - 8);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static inline int granularity_to_eig(int granularity, u16 *eig)
+{
+ if (granularity > SZ_16K || granularity < CXL_DECODER_MIN_GRANULARITY ||
+ !is_power_of_2(granularity))
+ return -EINVAL;
+ *eig = ilog2(granularity) - 8;
+ return 0;
+}
+
+static inline int ways_to_eiw(unsigned int ways, u8 *eiw)
+{
+ if (ways > 16)
+ return -EINVAL;
+ if (is_power_of_2(ways)) {
+ *eiw = ilog2(ways);
+ return 0;
+ }
+ if (ways % 3)
+ return -EINVAL;
+ ways /= 3;
+ if (!is_power_of_2(ways))
+ return -EINVAL;
+ *eiw = ilog2(ways) + 8;
+ return 0;
+}
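
The four helpers above round-trip between natural units and the encodings defined by the HDM Decoder Control Register and the 3/6/12/16-way interleave ECN. A minimal sketch exercising them (illustrative function name, not part of this patch); a 6-way, 1024-byte-granularity configuration encodes as eiw=9, eig=2:

static int example_interleave_roundtrip(void)
{
        unsigned int ways, granularity;
        u16 eig;
        u8 eiw;

        if (ways_to_eiw(6, &eiw) || eiw_to_ways(eiw, &ways))
                return -EINVAL;         /* eiw == 9, ways == 6 */

        if (granularity_to_eig(SZ_1K, &eig) ||
            eig_to_granularity(eig, &granularity))
                return -EINVAL;         /* eig == 2, granularity == SZ_1K */

        return 0;
}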
+
+/* RAS Registers CXL 2.0 8.2.5.9 CXL RAS Capability Structure */
+#define CXL_RAS_UNCORRECTABLE_STATUS_OFFSET 0x0
+#define CXL_RAS_UNCORRECTABLE_STATUS_MASK (GENMASK(16, 14) | GENMASK(11, 0))
+#define CXL_RAS_UNCORRECTABLE_MASK_OFFSET 0x4
+#define CXL_RAS_UNCORRECTABLE_MASK_MASK (GENMASK(16, 14) | GENMASK(11, 0))
+#define CXL_RAS_UNCORRECTABLE_MASK_F256B_MASK BIT(8)
+#define CXL_RAS_UNCORRECTABLE_SEVERITY_OFFSET 0x8
+#define CXL_RAS_UNCORRECTABLE_SEVERITY_MASK (GENMASK(16, 14) | GENMASK(11, 0))
+#define CXL_RAS_CORRECTABLE_STATUS_OFFSET 0xC
+#define CXL_RAS_CORRECTABLE_STATUS_MASK GENMASK(6, 0)
+#define CXL_RAS_CORRECTABLE_MASK_OFFSET 0x10
+#define CXL_RAS_CORRECTABLE_MASK_MASK GENMASK(6, 0)
+#define CXL_RAS_CAP_CONTROL_OFFSET 0x14
+#define CXL_RAS_CAP_CONTROL_FE_MASK GENMASK(5, 0)
+#define CXL_RAS_HEADER_LOG_OFFSET 0x18
+#define CXL_RAS_CAPABILITY_LENGTH 0x58
+#define CXL_HEADERLOG_SIZE SZ_512
+#define CXL_HEADERLOG_SIZE_U32 SZ_512 / sizeof(u32)
+
/* CXL 2.0 8.2.8.1 Device Capabilities Array Register */
#define CXLDEV_CAP_ARRAY_OFFSET 0x0
#define CXLDEV_CAP_ARRAY_CAP_ID 0
@@ -62,75 +164,98 @@ static inline int cxl_hdm_decoder_count(u32 cap_hdr)
#define CXLDEV_CAP_CAP_ID_SECONDARY_MAILBOX 0x3
#define CXLDEV_CAP_CAP_ID_MEMDEV 0x4000
+/* CXL 3.0 8.2.8.3.1 Event Status Register */
+#define CXLDEV_DEV_EVENT_STATUS_OFFSET 0x00
+#define CXLDEV_EVENT_STATUS_INFO BIT(0)
+#define CXLDEV_EVENT_STATUS_WARN BIT(1)
+#define CXLDEV_EVENT_STATUS_FAIL BIT(2)
+#define CXLDEV_EVENT_STATUS_FATAL BIT(3)
+
+#define CXLDEV_EVENT_STATUS_ALL (CXLDEV_EVENT_STATUS_INFO | \
+ CXLDEV_EVENT_STATUS_WARN | \
+ CXLDEV_EVENT_STATUS_FAIL | \
+ CXLDEV_EVENT_STATUS_FATAL)
+
+/* CXL rev 3.0 section 8.2.9.2.4; Table 8-52 */
+#define CXLDEV_EVENT_INT_MODE_MASK GENMASK(1, 0)
+#define CXLDEV_EVENT_INT_MSGNUM_MASK GENMASK(7, 4)
+
/* CXL 2.0 8.2.8.4 Mailbox Registers */
#define CXLDEV_MBOX_CAPS_OFFSET 0x00
#define CXLDEV_MBOX_CAP_PAYLOAD_SIZE_MASK GENMASK(4, 0)
+#define CXLDEV_MBOX_CAP_BG_CMD_IRQ BIT(6)
+#define CXLDEV_MBOX_CAP_IRQ_MSGNUM_MASK GENMASK(10, 7)
#define CXLDEV_MBOX_CTRL_OFFSET 0x04
#define CXLDEV_MBOX_CTRL_DOORBELL BIT(0)
+#define CXLDEV_MBOX_CTRL_BG_CMD_IRQ BIT(2)
#define CXLDEV_MBOX_CMD_OFFSET 0x08
#define CXLDEV_MBOX_CMD_COMMAND_OPCODE_MASK GENMASK_ULL(15, 0)
#define CXLDEV_MBOX_CMD_PAYLOAD_LENGTH_MASK GENMASK_ULL(36, 16)
#define CXLDEV_MBOX_STATUS_OFFSET 0x10
+#define CXLDEV_MBOX_STATUS_BG_CMD BIT(0)
#define CXLDEV_MBOX_STATUS_RET_CODE_MASK GENMASK_ULL(47, 32)
#define CXLDEV_MBOX_BG_CMD_STATUS_OFFSET 0x18
+#define CXLDEV_MBOX_BG_CMD_COMMAND_OPCODE_MASK GENMASK_ULL(15, 0)
+#define CXLDEV_MBOX_BG_CMD_COMMAND_PCT_MASK GENMASK_ULL(22, 16)
+#define CXLDEV_MBOX_BG_CMD_COMMAND_RC_MASK GENMASK_ULL(47, 32)
+#define CXLDEV_MBOX_BG_CMD_COMMAND_VENDOR_MASK GENMASK_ULL(63, 48)
#define CXLDEV_MBOX_PAYLOAD_OFFSET 0x20
-#define CXL_COMPONENT_REGS() \
- void __iomem *hdm_decoder
-
-#define CXL_DEVICE_REGS() \
- void __iomem *status; \
- void __iomem *mbox; \
- void __iomem *memdev
-
-/* See note for 'struct cxl_regs' for the rationale of this organization */
-/*
- * CXL_COMPONENT_REGS - Common set of CXL Component register block base pointers
- * @hdm_decoder: CXL 2.0 8.2.5.12 CXL HDM Decoder Capability Structure
- */
-struct cxl_component_regs {
- CXL_COMPONENT_REGS();
-};
-
-/* See note for 'struct cxl_regs' for the rationale of this organization */
-/*
- * CXL_DEVICE_REGS - Common set of CXL Device register block base pointers
- * @status: CXL 2.0 8.2.8.3 Device Status Registers
- * @mbox: CXL 2.0 8.2.8.4 Mailbox Registers
- * @memdev: CXL 2.0 8.2.8.5 Memory Device Registers
- */
-struct cxl_device_regs {
- CXL_DEVICE_REGS();
-};
-
/*
- * Note, the anonymous union organization allows for per
- * register-block-type helper routines, without requiring block-type
- * agnostic code to include the prefix.
+ * Using struct_group() allows for per register-block-type helper routines,
+ * without requiring block-type agnostic code to include the prefix.
*/
struct cxl_regs {
- union {
- struct {
- CXL_COMPONENT_REGS();
- };
- struct cxl_component_regs component;
- };
- union {
- struct {
- CXL_DEVICE_REGS();
- };
- struct cxl_device_regs device_regs;
- };
+ /*
+ * Common set of CXL Component register block base pointers
+ * @hdm_decoder: CXL 2.0 8.2.5.12 CXL HDM Decoder Capability Structure
+ * @ras: CXL 2.0 8.2.5.9 CXL RAS Capability Structure
+ */
+ struct_group_tagged(cxl_component_regs, component,
+ void __iomem *hdm_decoder;
+ void __iomem *ras;
+ );
+ /*
+ * Common set of CXL Device register block base pointers
+ * @status: CXL 2.0 8.2.8.3 Device Status Registers
+ * @mbox: CXL 2.0 8.2.8.4 Mailbox Registers
+ * @memdev: CXL 2.0 8.2.8.5 Memory Device Registers
+ */
+ struct_group_tagged(cxl_device_regs, device_regs,
+ void __iomem *status, *mbox, *memdev;
+ );
+
+ struct_group_tagged(cxl_pmu_regs, pmu_regs,
+ void __iomem *pmu;
+ );
+
+ /*
+ * RCH downstream port specific RAS register
+ * @aer: CXL 3.0 8.2.1.1 RCH Downstream Port RCRB
+ */
+ struct_group_tagged(cxl_rch_regs, rch_regs,
+ void __iomem *dport_aer;
+ );
+
+ /*
+ * RCD upstream port specific PCIe cap register
+ * @pcie_cap: CXL 3.0 8.2.1.2 RCD Upstream Port RCRB
+ */
+ struct_group_tagged(cxl_rcd_regs, rcd_regs,
+ void __iomem *rcd_pcie_cap;
+ );
};
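
As the comment above notes, struct_group_tagged() lets register-block-specific helpers take only their sub-struct while block-agnostic code still names members without a tag prefix. A minimal sketch, using an illustrative helper that is not part of this patch:

/* Takes just the device register group... */
static bool example_mbox_idle(struct cxl_device_regs *dev_regs)
{
        u32 ctrl = readl(dev_regs->mbox + CXLDEV_MBOX_CTRL_OFFSET);

        return !(ctrl & CXLDEV_MBOX_CTRL_DOORBELL);
}

/* ...while callers holding a full struct cxl_regs pass &regs->device_regs,
 * or reference regs->mbox directly. */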
struct cxl_reg_map {
bool valid;
+ int id;
unsigned long offset;
unsigned long size;
};
struct cxl_component_reg_map {
struct cxl_reg_map hdm_decoder;
+ struct cxl_reg_map ras;
};
struct cxl_device_reg_map {
@@ -139,14 +264,31 @@ struct cxl_device_reg_map {
struct cxl_reg_map memdev;
};
+struct cxl_pmu_reg_map {
+ struct cxl_reg_map pmu;
+};
+
+/**
+ * struct cxl_register_map - DVSEC harvested register block mapping parameters
+ * @host: device for devm operations and logging
+ * @base: virtual base of the register-block-BAR + @block_offset
+ * @resource: physical resource base of the register block
+ * @max_size: maximum mapping size to perform register search
+ * @reg_type: see enum cxl_regloc_type
+ * @component_map: cxl_reg_map for component registers
+ * @device_map: cxl_reg_maps for device registers
+ * @pmu_map: cxl_reg_maps for CXL Performance Monitoring Units
+ */
struct cxl_register_map {
- struct list_head list;
- u64 block_offset;
+ struct device *host;
+ void __iomem *base;
+ resource_size_t resource;
+ resource_size_t max_size;
u8 reg_type;
- u8 barno;
union {
struct cxl_component_reg_map component_map;
struct cxl_device_reg_map device_map;
+ struct cxl_pmu_reg_map pmu_map;
};
};
@@ -154,12 +296,25 @@ void cxl_probe_component_regs(struct device *dev, void __iomem *base,
struct cxl_component_reg_map *map);
void cxl_probe_device_regs(struct device *dev, void __iomem *base,
struct cxl_device_reg_map *map);
-int cxl_map_component_regs(struct pci_dev *pdev,
+int cxl_map_component_regs(const struct cxl_register_map *map,
struct cxl_component_regs *regs,
- struct cxl_register_map *map);
-int cxl_map_device_regs(struct pci_dev *pdev,
- struct cxl_device_regs *regs,
- struct cxl_register_map *map);
+ unsigned long map_mask);
+int cxl_map_device_regs(const struct cxl_register_map *map,
+ struct cxl_device_regs *regs);
+int cxl_map_pmu_regs(struct cxl_register_map *map, struct cxl_pmu_regs *regs);
+
+#define CXL_INSTANCES_COUNT -1
+enum cxl_regloc_type;
+int cxl_count_regblock(struct pci_dev *pdev, enum cxl_regloc_type type);
+int cxl_find_regblock_instance(struct pci_dev *pdev, enum cxl_regloc_type type,
+ struct cxl_register_map *map, unsigned int index);
+int cxl_find_regblock(struct pci_dev *pdev, enum cxl_regloc_type type,
+ struct cxl_register_map *map);
+int cxl_setup_regs(struct cxl_register_map *map);
+struct cxl_dport;
+resource_size_t cxl_rcd_component_reg_phys(struct device *dev,
+ struct cxl_dport *dport);
+int cxl_dport_map_rcd_linkcap(struct pci_dev *pdev, struct cxl_dport *dport);
#define CXL_RESOURCE_NONE ((resource_size_t) -1)
#define CXL_TARGET_STRLEN 20
@@ -168,62 +323,274 @@ int cxl_map_device_regs(struct pci_dev *pdev,
* cxl_decoder flags that define the type of memory / devices this
* decoder supports as well as configuration lock status See "CXL 2.0
* 8.2.5.12.7 CXL HDM Decoder 0 Control Register" for details.
+ * Additionally indicate whether decoder settings were autodetected or
+ * user-customized.
*/
#define CXL_DECODER_F_RAM BIT(0)
#define CXL_DECODER_F_PMEM BIT(1)
#define CXL_DECODER_F_TYPE2 BIT(2)
#define CXL_DECODER_F_TYPE3 BIT(3)
#define CXL_DECODER_F_LOCK BIT(4)
-#define CXL_DECODER_F_MASK GENMASK(4, 0)
+#define CXL_DECODER_F_ENABLE BIT(5)
+#define CXL_DECODER_F_MASK GENMASK(5, 0)
enum cxl_decoder_type {
- CXL_DECODER_ACCELERATOR = 2,
- CXL_DECODER_EXPANDER = 3,
+ CXL_DECODER_DEVMEM = 2,
+ CXL_DECODER_HOSTONLYMEM = 3,
};
+/*
+ * Current specification goes up to 8, double that seems a reasonable
+ * software max for the foreseeable future
+ */
+#define CXL_DECODER_MAX_INTERLEAVE 16
+
+#define CXL_QOS_CLASS_INVALID -1
+
/**
- * struct cxl_decoder - CXL address range decode configuration
+ * struct cxl_decoder - Common CXL HDM Decoder Attributes
* @dev: this decoder's device
* @id: kernel device name id
- * @range: address range considered by this decoder
+ * @hpa_range: Host physical address range mapped by this decoder
* @interleave_ways: number of cxl_dports in this decode
* @interleave_granularity: data stride per dport
* @target_type: accelerator vs expander (type2 vs type3) selector
+ * @region: currently assigned region for this decoder
* @flags: memory type capabilities and locking
- * @target: active ordered target list in current decoder configuration
- */
+ * @target_map: cached copy of hardware port-id list, available at init
+ * before all @dport objects have been instantiated. While
+ * dport id is 8-bit, CFMWS interleave targets are 32-bit.
+ * @commit: device/decoder-type specific callback to commit settings to hw
+ * @reset: device/decoder-type specific callback to reset hw settings
+ */
struct cxl_decoder {
struct device dev;
int id;
- struct range range;
+ struct range hpa_range;
int interleave_ways;
int interleave_granularity;
enum cxl_decoder_type target_type;
+ struct cxl_region *region;
unsigned long flags;
+ u32 target_map[CXL_DECODER_MAX_INTERLEAVE];
+ int (*commit)(struct cxl_decoder *cxld);
+ void (*reset)(struct cxl_decoder *cxld);
+};
+
+/*
+ * Track whether this decoder is reserved for region autodiscovery, or
+ * free for userspace provisioning.
+ */
+enum cxl_decoder_state {
+ CXL_DECODER_STATE_MANUAL,
+ CXL_DECODER_STATE_AUTO,
+};
+
+/**
+ * struct cxl_endpoint_decoder - Endpoint / SPA to DPA decoder
+ * @cxld: base cxl_decoder_object
+ * @dpa_res: actively claimed DPA span of this decoder
+ * @skip: offset into @dpa_res where @cxld.hpa_range maps
+ * @state: autodiscovery state
+ * @part: partition index this decoder maps
+ * @pos: interleave position in @cxld.region
+ */
+struct cxl_endpoint_decoder {
+ struct cxl_decoder cxld;
+ struct resource *dpa_res;
+ resource_size_t skip;
+ enum cxl_decoder_state state;
+ int part;
+ int pos;
+};
+
+/**
+ * struct cxl_switch_decoder - Switch specific CXL HDM Decoder
+ * @cxld: base cxl_decoder object
+ * @nr_targets: number of elements in @target
+ * @target: active ordered target list in current decoder configuration
+ *
+ * The 'switch' decoder type represents the decoder instances of cxl_ports that
+ * route from the root of a CXL memory decode topology to the endpoints. They
+ * come in two flavors, root-level decoders, statically defined by platform
+ * firmware, and mid-level decoders, where interleave-granularity,
+ * interleave-width, and the target list are mutable.
+ */
+struct cxl_switch_decoder {
+ struct cxl_decoder cxld;
+ int nr_targets;
struct cxl_dport *target[];
};
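
Each specialized decoder embeds struct cxl_decoder as a member, so the to_cxl_*_decoder() helpers declared later in this header amount to type-checked container_of() downcasts. A minimal sketch of the pattern (not the exact core implementation, which also validates the device type):

static struct cxl_switch_decoder *
example_to_switch_decoder(struct cxl_decoder *cxld)
{
        return container_of(cxld, struct cxl_switch_decoder, cxld);
}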
+struct cxl_root_decoder;
+/**
+ * struct cxl_rd_ops - CXL root decoder callback operations
+ * @hpa_to_spa: Convert host physical address to system physical address
+ * @spa_to_hpa: Convert system physical address to host physical address
+ */
+struct cxl_rd_ops {
+ u64 (*hpa_to_spa)(struct cxl_root_decoder *cxlrd, u64 hpa);
+ u64 (*spa_to_hpa)(struct cxl_root_decoder *cxlrd, u64 spa);
+};
-enum cxl_nvdimm_brige_state {
- CXL_NVB_NEW,
- CXL_NVB_DEAD,
- CXL_NVB_ONLINE,
- CXL_NVB_OFFLINE,
+/**
+ * struct cxl_root_decoder - Static platform CXL address decoder
+ * @res: host / parent resource for region allocations
+ * @cache_size: extended linear cache size if one exists, otherwise zero.
+ * @region_id: region id for next region provisioning event
+ * @platform_data: platform specific configuration data
+ * @range_lock: sync region autodiscovery by address range
+ * @qos_class: QoS performance class cookie
+ * @ops: CXL root decoder operations
+ * @cxlsd: base cxl switch decoder
+ */
+struct cxl_root_decoder {
+ struct resource *res;
+ resource_size_t cache_size;
+ atomic_t region_id;
+ void *platform_data;
+ struct mutex range_lock;
+ int qos_class;
+ struct cxl_rd_ops ops;
+ struct cxl_switch_decoder cxlsd;
+};
+
+/*
+ * enum cxl_config_state - State machine for region configuration
+ * @CXL_CONFIG_IDLE: Any sysfs attribute can be written freely
+ * @CXL_CONFIG_INTERLEAVE_ACTIVE: region size has been set, no more
+ * changes to interleave_ways or interleave_granularity
+ * @CXL_CONFIG_ACTIVE: All targets have been added and the region is now
+ * active
+ * @CXL_CONFIG_RESET_PENDING: see commit_store()
+ * @CXL_CONFIG_COMMIT: Soft-config has been committed to hardware
+ */
+enum cxl_config_state {
+ CXL_CONFIG_IDLE,
+ CXL_CONFIG_INTERLEAVE_ACTIVE,
+ CXL_CONFIG_ACTIVE,
+ CXL_CONFIG_RESET_PENDING,
+ CXL_CONFIG_COMMIT,
+};
+
+/**
+ * struct cxl_region_params - region settings
+ * @state: allow the driver to lockdown further parameter changes
+ * @uuid: unique id for persistent regions
+ * @interleave_ways: number of endpoints in the region
+ * @interleave_granularity: capacity each endpoint contributes to a stripe
+ * @res: allocated iomem capacity for this region
+ * @targets: active ordered targets in current decoder configuration
+ * @nr_targets: number of targets
+ * @cache_size: extended linear cache size if one exists, otherwise zero.
+ *
+ * State transitions are protected by cxl_rwsem.region
+ */
+struct cxl_region_params {
+ enum cxl_config_state state;
+ uuid_t uuid;
+ int interleave_ways;
+ int interleave_granularity;
+ struct resource *res;
+ struct cxl_endpoint_decoder *targets[CXL_DECODER_MAX_INTERLEAVE];
+ int nr_targets;
+ resource_size_t cache_size;
+};
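
For a committed region the span allocated in @res is striped across @nr_targets endpoint decoders, @interleave_granularity bytes at a time, so each endpoint backs resource_size(res) / interleave_ways bytes of device physical capacity. A minimal sketch of that arithmetic (illustrative helper, not part of this patch):

static resource_size_t
example_per_endpoint_capacity(const struct cxl_region_params *p)
{
        if (!p->res || !p->interleave_ways)
                return 0;
        return resource_size(p->res) / p->interleave_ways;
}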
+
+enum cxl_partition_mode {
+ CXL_PARTMODE_RAM,
+ CXL_PARTMODE_PMEM,
+};
+
+/*
+ * Indicate whether this region has been assembled by autodetection or
+ * userspace assembly. Prevent endpoint decoders outside of automatic
+ * detection from being added to the region.
+ */
+#define CXL_REGION_F_AUTO 0
+
+/*
+ * Require that a committed region successfully complete a teardown once
+ * any of its associated decoders have been torn down. This maintains
+ * the commit state for the region since there are committed decoders,
+ * but blocks cxl_region_probe().
+ */
+#define CXL_REGION_F_NEEDS_RESET 1
+
+/*
+ * Indicate whether this region is locked due to 1 or more decoders that have
+ * been locked. The approach of all or nothing is taken with regard to the
+ * locked attribute. CXL_REGION_F_NEEDS_RESET should not be set if this flag is
+ * set.
+ */
+#define CXL_REGION_F_LOCK 2
+
+/**
+ * struct cxl_region - CXL region
+ * @dev: This region's device
+ * @id: This region's id. Id is globally unique across all regions
+ * @mode: Operational mode of the mapped capacity
+ * @type: Endpoint decoder target type
+ * @cxl_nvb: nvdimm bridge for coordinating @cxlr_pmem setup / shutdown
+ * @cxlr_pmem: (for pmem regions) cached copy of the nvdimm bridge
+ * @flags: Region state flags
+ * @params: active + config params for the region
+ * @coord: QoS access coordinates for the region
+ * @node_notifier: notifier for setting the access coordinates to node
+ * @adist_notifier: notifier for calculating the abstract distance of node
+ */
+struct cxl_region {
+ struct device dev;
+ int id;
+ enum cxl_partition_mode mode;
+ enum cxl_decoder_type type;
+ struct cxl_nvdimm_bridge *cxl_nvb;
+ struct cxl_pmem_region *cxlr_pmem;
+ unsigned long flags;
+ struct cxl_region_params params;
+ struct access_coordinate coord[ACCESS_COORDINATE_MAX];
+ struct notifier_block node_notifier;
+ struct notifier_block adist_notifier;
};
struct cxl_nvdimm_bridge {
+ int id;
struct device dev;
struct cxl_port *port;
struct nvdimm_bus *nvdimm_bus;
struct nvdimm_bus_descriptor nd_desc;
- struct work_struct state_work;
- enum cxl_nvdimm_brige_state state;
};
+#define CXL_DEV_ID_LEN 19
+
struct cxl_nvdimm {
struct device dev;
struct cxl_memdev *cxlmd;
- struct nvdimm *nvdimm;
+ u8 dev_id[CXL_DEV_ID_LEN]; /* for nvdimm, string of 'serial' */
+ u64 dirty_shutdowns;
+};
+
+struct cxl_pmem_region_mapping {
+ struct cxl_memdev *cxlmd;
+ struct cxl_nvdimm *cxl_nvd;
+ u64 start;
+ u64 size;
+ int position;
+};
+
+struct cxl_pmem_region {
+ struct device dev;
+ struct cxl_region *cxlr;
+ struct nd_region *nd_region;
+ struct range hpa_range;
+ int nr_mappings;
+ struct cxl_pmem_region_mapping mapping[];
+};
+
+struct cxl_dax_region {
+ struct device dev;
+ struct cxl_region *cxlr;
+ struct range hpa_range;
};
/**
@@ -231,68 +598,266 @@ struct cxl_nvdimm {
* downstream port devices to construct a CXL memory
* decode hierarchy.
* @dev: this port's device
- * @uport: PCI or platform device implementing the upstream port capability
+ * @uport_dev: PCI or platform device implementing the upstream port capability
+ * @host_bridge: Shortcut to the platform attach point for this port
* @id: id for port device-name
* @dports: cxl_dport instances referenced by decoders
+ * @endpoints: cxl_ep instances, endpoints that are a descendant of this port
+ * @regions: cxl_region_ref instances, regions mapped by this port
+ * @parent_dport: dport that points to this port in the parent
* @decoder_ida: allocator for decoder ids
- * @component_reg_phys: component register capability base address (optional)
+ * @reg_map: component and ras register mapping parameters
+ * @nr_dports: number of entries in @dports
+ * @hdm_end: track last allocated HDM decoder instance for allocation ordering
+ * @commit_end: cursor to track highest committed decoder for commit ordering
+ * @dead: last ep has been removed, force port re-creation
+ * @depth: How deep this port is relative to the root. depth 0 is the root.
+ * @cdat: Cached CDAT data
+ * @cdat_available: Should a CDAT attribute be available in sysfs
+ * @pci_latency: Upstream latency in picoseconds
+ * @component_reg_phys: Physical address of component register
*/
struct cxl_port {
struct device dev;
- struct device *uport;
+ struct device *uport_dev;
+ struct device *host_bridge;
int id;
- struct list_head dports;
+ struct xarray dports;
+ struct xarray endpoints;
+ struct xarray regions;
+ struct cxl_dport *parent_dport;
struct ida decoder_ida;
+ struct cxl_register_map reg_map;
+ int nr_dports;
+ int hdm_end;
+ int commit_end;
+ bool dead;
+ unsigned int depth;
+ struct cxl_cdat {
+ void *table;
+ size_t length;
+ } cdat;
+ bool cdat_available;
+ long pci_latency;
resource_size_t component_reg_phys;
};
/**
+ * struct cxl_root - logical collection of root cxl_port items
+ *
+ * @port: cxl_port member
+ * @ops: cxl root operations
+ */
+struct cxl_root {
+ struct cxl_port port;
+ const struct cxl_root_ops *ops;
+};
+
+static inline struct cxl_root *
+to_cxl_root(const struct cxl_port *port)
+{
+ return container_of(port, struct cxl_root, port);
+}
+
+struct cxl_root_ops {
+ int (*qos_class)(struct cxl_root *cxl_root,
+ struct access_coordinate *coord, int entries,
+ int *qos_class);
+};
+
+static inline struct cxl_dport *
+cxl_find_dport_by_dev(struct cxl_port *port, const struct device *dport_dev)
+{
+ return xa_load(&port->dports, (unsigned long)dport_dev);
+}
+
+struct cxl_rcrb_info {
+ resource_size_t base;
+ u16 aer_cap;
+};
+
+/**
* struct cxl_dport - CXL downstream port
- * @dport: PCI bridge or firmware device representing the downstream link
+ * @dport_dev: PCI bridge or firmware device representing the downstream link
+ * @reg_map: component and ras register mapping parameters
* @port_id: unique hardware identifier for dport in decoder target list
- * @component_reg_phys: downstream port component registers
+ * @rcrb: Data about the Root Complex Register Block layout
+ * @rch: Indicate whether this dport was enumerated in RCH or VH mode
* @port: reference to cxl_port that contains this downstream port
- * @list: node for a cxl_port's list of cxl_dport instances
+ * @regs: Dport parsed register blocks
+ * @coord: access coordinates (bandwidth and latency performance attributes)
+ * @link_latency: calculated PCIe downstream latency
+ * @gpf_dvsec: Cached GPF port DVSEC
*/
struct cxl_dport {
- struct device *dport;
+ struct device *dport_dev;
+ struct cxl_register_map reg_map;
int port_id;
- resource_size_t component_reg_phys;
+ struct cxl_rcrb_info rcrb;
+ bool rch;
struct cxl_port *port;
- struct list_head list;
+ struct cxl_regs regs;
+ struct access_coordinate coord[ACCESS_COORDINATE_MAX];
+ long link_latency;
+ int gpf_dvsec;
};
-struct cxl_port *to_cxl_port(struct device *dev);
-struct cxl_port *devm_cxl_add_port(struct device *host, struct device *uport,
- resource_size_t component_reg_phys,
- struct cxl_port *parent_port);
-
-int cxl_add_dport(struct cxl_port *port, struct device *dport, int port_id,
- resource_size_t component_reg_phys);
+/**
+ * struct cxl_ep - track an endpoint's interest in a port
+ * @ep: device that hosts a generic CXL endpoint (expander or accelerator)
+ * @dport: which dport routes to this endpoint on @port
+ * @next: cxl switch port across the link attached to @dport, NULL if
+ * attached to an endpoint
+ */
+struct cxl_ep {
+ struct device *ep;
+ struct cxl_dport *dport;
+ struct cxl_port *next;
+};
-struct cxl_decoder *to_cxl_decoder(struct device *dev);
-bool is_root_decoder(struct device *dev);
-struct cxl_decoder *
-devm_cxl_add_decoder(struct device *host, struct cxl_port *port, int nr_targets,
- resource_size_t base, resource_size_t len,
- int interleave_ways, int interleave_granularity,
- enum cxl_decoder_type type, unsigned long flags);
+/**
+ * struct cxl_region_ref - track a region's interest in a port
+ * @port: point in topology to install this reference
+ * @decoder: decoder assigned for @region in @port
+ * @region: region for this reference
+ * @endpoints: cxl_ep references for region members beneath @port
+ * @nr_targets_set: track how many targets have been programmed during setup
+ * @nr_eps: number of endpoints beneath @port
+ * @nr_targets: number of distinct targets needed to reach @nr_eps
+ */
+struct cxl_region_ref {
+ struct cxl_port *port;
+ struct cxl_decoder *decoder;
+ struct cxl_region *region;
+ struct xarray endpoints;
+ int nr_targets_set;
+ int nr_eps;
+ int nr_targets;
+};
/*
- * Per the CXL specification (8.2.5.12 CXL HDM Decoder Capability Structure)
- * single ported host-bridges need not publish a decoder capability when a
- * passthrough decode can be assumed, i.e. all transactions that the uport sees
- * are claimed and passed to the single dport. Default the range a 0-base
- * 0-length until the first CXL region is activated.
+ * The platform firmware device hosting the root is also the top of the
+ * CXL port topology. All other CXL ports have another CXL port as their
+ * parent and their ->uport_dev / host device is out-of-line of the port
+ * ancestry.
*/
-static inline struct cxl_decoder *
-devm_cxl_add_passthrough_decoder(struct device *host, struct cxl_port *port)
+static inline bool is_cxl_root(struct cxl_port *port)
{
- return devm_cxl_add_decoder(host, port, 1, 0, 0, 1, PAGE_SIZE,
- CXL_DECODER_EXPANDER, 0);
+ return port->uport_dev == port->dev.parent;
}
-extern struct bus_type cxl_bus_type;
+/* Address translation functions exported to cxl_translate test module only */
+int cxl_validate_translation_params(u8 eiw, u16 eig, int pos);
+u64 cxl_calculate_hpa_offset(u64 dpa_offset, int pos, u8 eiw, u16 eig);
+u64 cxl_calculate_dpa_offset(u64 hpa_offset, u8 eiw, u16 eig);
+int cxl_calculate_position(u64 hpa_offset, u8 eiw, u16 eig);
+struct cxl_cxims_data {
+ int nr_maps;
+ u64 xormaps[] __counted_by(nr_maps);
+};
+
+#if IS_ENABLED(CONFIG_CXL_ACPI)
+u64 cxl_do_xormap_calc(struct cxl_cxims_data *cximsd, u64 addr, int hbiw);
+#else
+static inline u64 cxl_do_xormap_calc(struct cxl_cxims_data *cximsd, u64 addr, int hbiw)
+{
+ return ULLONG_MAX;
+}
+#endif
+
+int cxl_num_decoders_committed(struct cxl_port *port);
+bool is_cxl_port(const struct device *dev);
+struct cxl_port *to_cxl_port(const struct device *dev);
+struct cxl_port *parent_port_of(struct cxl_port *port);
+void cxl_port_commit_reap(struct cxl_decoder *cxld);
+struct pci_bus;
+int devm_cxl_register_pci_bus(struct device *host, struct device *uport_dev,
+ struct pci_bus *bus);
+struct pci_bus *cxl_port_to_pci_bus(struct cxl_port *port);
+struct cxl_port *devm_cxl_add_port(struct device *host,
+ struct device *uport_dev,
+ resource_size_t component_reg_phys,
+ struct cxl_dport *parent_dport);
+struct cxl_root *devm_cxl_add_root(struct device *host,
+ const struct cxl_root_ops *ops);
+struct cxl_root *find_cxl_root(struct cxl_port *port);
+
+DEFINE_FREE(put_cxl_root, struct cxl_root *, if (_T) put_device(&_T->port.dev))
+DEFINE_FREE(put_cxl_port, struct cxl_port *, if (!IS_ERR_OR_NULL(_T)) put_device(&_T->dev))
+DEFINE_FREE(put_cxl_root_decoder, struct cxl_root_decoder *, if (!IS_ERR_OR_NULL(_T)) put_device(&_T->cxlsd.cxld.dev))
+DEFINE_FREE(put_cxl_region, struct cxl_region *, if (!IS_ERR_OR_NULL(_T)) put_device(&_T->dev))
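
The DEFINE_FREE() classes above pair with the __free() cleanup attribute so a reference taken in a scope is dropped automatically on every exit path. A minimal usage sketch (illustrative function, not part of this patch):

static int example_root_depth(struct cxl_port *port)
{
        struct cxl_root *cxl_root __free(put_cxl_root) = find_cxl_root(port);

        if (!cxl_root)
                return -ENXIO;

        /* put_device() on the root port runs automatically at return */
        return cxl_root->port.depth;
}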
+
+int devm_cxl_enumerate_ports(struct cxl_memdev *cxlmd);
+void cxl_bus_rescan(void);
+void cxl_bus_drain(void);
+struct cxl_port *cxl_pci_find_port(struct pci_dev *pdev,
+ struct cxl_dport **dport);
+struct cxl_port *cxl_mem_find_port(struct cxl_memdev *cxlmd,
+ struct cxl_dport **dport);
+bool schedule_cxl_memdev_detach(struct cxl_memdev *cxlmd);
+
+struct cxl_dport *devm_cxl_add_dport(struct cxl_port *port,
+ struct device *dport, int port_id,
+ resource_size_t component_reg_phys);
+struct cxl_dport *devm_cxl_add_rch_dport(struct cxl_port *port,
+ struct device *dport_dev, int port_id,
+ resource_size_t rcrb);
+
+#ifdef CONFIG_PCIEAER_CXL
+void cxl_setup_parent_dport(struct device *host, struct cxl_dport *dport);
+void cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host);
+#else
+static inline void cxl_dport_init_ras_reporting(struct cxl_dport *dport,
+ struct device *host) { }
+#endif
+
+struct cxl_decoder *to_cxl_decoder(struct device *dev);
+struct cxl_root_decoder *to_cxl_root_decoder(struct device *dev);
+struct cxl_switch_decoder *to_cxl_switch_decoder(struct device *dev);
+struct cxl_endpoint_decoder *to_cxl_endpoint_decoder(struct device *dev);
+bool is_root_decoder(struct device *dev);
+bool is_switch_decoder(struct device *dev);
+bool is_endpoint_decoder(struct device *dev);
+struct cxl_root_decoder *cxl_root_decoder_alloc(struct cxl_port *port,
+ unsigned int nr_targets);
+struct cxl_switch_decoder *cxl_switch_decoder_alloc(struct cxl_port *port,
+ unsigned int nr_targets);
+int cxl_decoder_add(struct cxl_decoder *cxld);
+struct cxl_endpoint_decoder *cxl_endpoint_decoder_alloc(struct cxl_port *port);
+int cxl_decoder_add_locked(struct cxl_decoder *cxld);
+int cxl_decoder_autoremove(struct device *host, struct cxl_decoder *cxld);
+static inline int cxl_root_decoder_autoremove(struct device *host,
+ struct cxl_root_decoder *cxlrd)
+{
+ return cxl_decoder_autoremove(host, &cxlrd->cxlsd.cxld);
+}
+int cxl_endpoint_autoremove(struct cxl_memdev *cxlmd, struct cxl_port *endpoint);
+
+/**
+ * struct cxl_endpoint_dvsec_info - Cached DVSEC info
+ * @mem_enabled: cached value of mem_enabled in the DVSEC at init time
+ * @ranges: Number of active HDM ranges this device uses.
+ * @port: endpoint port associated with this info instance
+ * @dvsec_range: cached attributes of the ranges in the DVSEC, PCIE_DEVICE
+ */
+struct cxl_endpoint_dvsec_info {
+ bool mem_enabled;
+ int ranges;
+ struct cxl_port *port;
+ struct range dvsec_range[2];
+};
+
+int devm_cxl_switch_port_decoders_setup(struct cxl_port *port);
+int __devm_cxl_switch_port_decoders_setup(struct cxl_port *port);
+int devm_cxl_endpoint_decoders_setup(struct cxl_port *port);
+
+struct cxl_dev_state;
+int cxl_dvsec_rr_decode(struct cxl_dev_state *cxlds,
+ struct cxl_endpoint_dvsec_info *info);
+
+bool is_cxl_region(struct device *dev);
+
+extern const struct bus_type cxl_bus_type;
struct cxl_driver {
const char *name;
@@ -302,18 +867,25 @@ struct cxl_driver {
int id;
};
-static inline struct cxl_driver *to_cxl_drv(struct device_driver *drv)
-{
- return container_of(drv, struct cxl_driver, drv);
-}
+#define to_cxl_drv(__drv) container_of_const(__drv, struct cxl_driver, drv)
int __cxl_driver_register(struct cxl_driver *cxl_drv, struct module *owner,
const char *modname);
#define cxl_driver_register(x) __cxl_driver_register(x, THIS_MODULE, KBUILD_MODNAME)
void cxl_driver_unregister(struct cxl_driver *cxl_drv);
+#define module_cxl_driver(__cxl_driver) \
+ module_driver(__cxl_driver, cxl_driver_register, cxl_driver_unregister)
+
#define CXL_DEVICE_NVDIMM_BRIDGE 1
#define CXL_DEVICE_NVDIMM 2
+#define CXL_DEVICE_PORT 3
+#define CXL_DEVICE_ROOT 4
+#define CXL_DEVICE_MEMORY_EXPANDER 5
+#define CXL_DEVICE_REGION 6
+#define CXL_DEVICE_PMEM_REGION 7
+#define CXL_DEVICE_DAX_REGION 8
+#define CXL_DEVICE_PMU 9
#define MODULE_ALIAS_CXL(type) MODULE_ALIAS("cxl:t" __stringify(type) "*")
#define CXL_MODALIAS_FMT "cxl:t%d"
@@ -323,5 +895,84 @@ struct cxl_nvdimm_bridge *devm_cxl_add_nvdimm_bridge(struct device *host,
struct cxl_port *port);
struct cxl_nvdimm *to_cxl_nvdimm(struct device *dev);
bool is_cxl_nvdimm(struct device *dev);
-int devm_cxl_add_nvdimm(struct device *host, struct cxl_memdev *cxlmd);
+int devm_cxl_add_nvdimm(struct cxl_port *parent_port, struct cxl_memdev *cxlmd);
+struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct cxl_port *port);
+
+#ifdef CONFIG_CXL_REGION
+bool is_cxl_pmem_region(struct device *dev);
+struct cxl_pmem_region *to_cxl_pmem_region(struct device *dev);
+int cxl_add_to_region(struct cxl_endpoint_decoder *cxled);
+struct cxl_dax_region *to_cxl_dax_region(struct device *dev);
+u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint, u64 spa);
+#else
+static inline bool is_cxl_pmem_region(struct device *dev)
+{
+ return false;
+}
+static inline struct cxl_pmem_region *to_cxl_pmem_region(struct device *dev)
+{
+ return NULL;
+}
+static inline int cxl_add_to_region(struct cxl_endpoint_decoder *cxled)
+{
+ return 0;
+}
+static inline struct cxl_dax_region *to_cxl_dax_region(struct device *dev)
+{
+ return NULL;
+}
+static inline u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint,
+ u64 spa)
+{
+ return 0;
+}
+#endif
+
+void cxl_endpoint_parse_cdat(struct cxl_port *port);
+void cxl_switch_parse_cdat(struct cxl_dport *dport);
+
+int cxl_endpoint_get_perf_coordinates(struct cxl_port *port,
+ struct access_coordinate *coord);
+void cxl_region_perf_data_calculate(struct cxl_region *cxlr,
+ struct cxl_endpoint_decoder *cxled);
+void cxl_region_shared_upstream_bandwidth_update(struct cxl_region *cxlr);
+
+void cxl_memdev_update_perf(struct cxl_memdev *cxlmd);
+
+void cxl_coordinates_combine(struct access_coordinate *out,
+ struct access_coordinate *c1,
+ struct access_coordinate *c2);
+
+bool cxl_endpoint_decoder_reset_detected(struct cxl_port *port);
+struct cxl_dport *devm_cxl_add_dport_by_dev(struct cxl_port *port,
+ struct device *dport_dev);
+struct cxl_dport *__devm_cxl_add_dport_by_dev(struct cxl_port *port,
+ struct device *dport_dev);
+
+/*
+ * Unit test builds overrides this to __weak, find the 'strong' version
+ * of these symbols in tools/testing/cxl/.
+ */
+#ifndef __mock
+#define __mock static
+#endif
+
+u16 cxl_gpf_get_dvsec(struct device *dev);
+
+/*
+ * Declaration for functions that are mocked by cxl_test that are called by
+ * cxl_core. The respective functions are defined as __foo() and called by
+ * cxl_core as foo(). The macros below ensure that those functions
+ * exist as foo(). See tools/testing/cxl/cxl_core_exports.c and
+ * tools/testing/cxl/exports.h for setting up the mock functions. The dance
+ * is done to avoid a circular dependency where cxl_core calls a function that
+ * ends up being a mock function and goes to cxl_test where it calls a
+ * cxl_core function.
+ */
+#ifndef CXL_TEST_ENABLE
+#define DECLARE_TESTABLE(x) __##x
+#define devm_cxl_add_dport_by_dev DECLARE_TESTABLE(devm_cxl_add_dport_by_dev)
+#define devm_cxl_switch_port_decoders_setup DECLARE_TESTABLE(devm_cxl_switch_port_decoders_setup)
+#endif
+
#endif /* __CXL_H__ */
diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
new file mode 100644
index 000000000000..434031a0c1f7
--- /dev/null
+++ b/drivers/cxl/cxlmem.h
@@ -0,0 +1,931 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright(c) 2020-2021 Intel Corporation. */
+#ifndef __CXL_MEM_H__
+#define __CXL_MEM_H__
+#include <uapi/linux/cxl_mem.h>
+#include <linux/pci.h>
+#include <linux/cdev.h>
+#include <linux/uuid.h>
+#include <linux/node.h>
+#include <cxl/event.h>
+#include <cxl/mailbox.h>
+#include "cxl.h"
+
+/* CXL 2.0 8.2.8.5.1.1 Memory Device Status Register */
+#define CXLMDEV_STATUS_OFFSET 0x0
+#define CXLMDEV_DEV_FATAL BIT(0)
+#define CXLMDEV_FW_HALT BIT(1)
+#define CXLMDEV_STATUS_MEDIA_STATUS_MASK GENMASK(3, 2)
+#define CXLMDEV_MS_NOT_READY 0
+#define CXLMDEV_MS_READY 1
+#define CXLMDEV_MS_ERROR 2
+#define CXLMDEV_MS_DISABLED 3
+#define CXLMDEV_READY(status) \
+ (FIELD_GET(CXLMDEV_STATUS_MEDIA_STATUS_MASK, status) == \
+ CXLMDEV_MS_READY)
+#define CXLMDEV_MBOX_IF_READY BIT(4)
+#define CXLMDEV_RESET_NEEDED_MASK GENMASK(7, 5)
+#define CXLMDEV_RESET_NEEDED_NOT 0
+#define CXLMDEV_RESET_NEEDED_COLD 1
+#define CXLMDEV_RESET_NEEDED_WARM 2
+#define CXLMDEV_RESET_NEEDED_HOT 3
+#define CXLMDEV_RESET_NEEDED_CXL 4
+#define CXLMDEV_RESET_NEEDED(status) \
+ (FIELD_GET(CXLMDEV_RESET_NEEDED_MASK, status) != \
+ CXLMDEV_RESET_NEEDED_NOT)
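
The field helpers above decode the Memory Device Status register. A minimal sketch of a media-ready check built on them, assuming the memdev register block has already been mapped into cxlds->regs.memdev (illustrative helper, not part of this patch):

static bool example_media_ready(struct cxl_dev_state *cxlds)
{
        u64 md_status = readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET);

        return CXLMDEV_READY(md_status);
}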
+
+/**
+ * struct cxl_memdev - CXL bus object representing a Type-3 Memory Device
+ * @dev: driver core device object
+ * @cdev: char dev core object for ioctl operations
+ * @cxlds: The device state backing this device
+ * @detach_work: active memdev lost a port in its ancestry
+ * @cxl_nvb: coordinate removal of @cxl_nvd if present
+ * @cxl_nvd: optional bridge to an nvdimm if the device supports pmem
+ * @endpoint: connection to the CXL port topology for this memory device
+ * @id: id number of this memdev instance.
+ * @depth: endpoint port depth
+ * @scrub_cycle: current scrub cycle set for this device
+ * @scrub_region_id: id number of a backed region (if any) for which the
+ * current scrub cycle is set
+ * @err_rec_array: List of xarrays storing the memdev error records used to
+ * check that the attributes for a memory repair operation are from
+ * the current boot.
+ */
+struct cxl_memdev {
+ struct device dev;
+ struct cdev cdev;
+ struct cxl_dev_state *cxlds;
+ struct work_struct detach_work;
+ struct cxl_nvdimm_bridge *cxl_nvb;
+ struct cxl_nvdimm *cxl_nvd;
+ struct cxl_port *endpoint;
+ int id;
+ int depth;
+ u8 scrub_cycle;
+ int scrub_region_id;
+ void *err_rec_array;
+};
+
+static inline struct cxl_memdev *to_cxl_memdev(struct device *dev)
+{
+ return container_of(dev, struct cxl_memdev, dev);
+}
+
+static inline struct cxl_port *cxled_to_port(struct cxl_endpoint_decoder *cxled)
+{
+ return to_cxl_port(cxled->cxld.dev.parent);
+}
+
+static inline struct cxl_port *cxlrd_to_port(struct cxl_root_decoder *cxlrd)
+{
+ return to_cxl_port(cxlrd->cxlsd.cxld.dev.parent);
+}
+
+static inline struct cxl_memdev *
+cxled_to_memdev(struct cxl_endpoint_decoder *cxled)
+{
+ struct cxl_port *port = to_cxl_port(cxled->cxld.dev.parent);
+
+ return to_cxl_memdev(port->uport_dev);
+}
+
+bool is_cxl_memdev(const struct device *dev);
+static inline bool is_cxl_endpoint(struct cxl_port *port)
+{
+ return is_cxl_memdev(port->uport_dev);
+}
+
+struct cxl_memdev *devm_cxl_add_memdev(struct device *host,
+ struct cxl_dev_state *cxlds);
+int devm_cxl_sanitize_setup_notifier(struct device *host,
+ struct cxl_memdev *cxlmd);
+struct cxl_memdev_state;
+int devm_cxl_setup_fw_upload(struct device *host, struct cxl_memdev_state *mds);
+int devm_cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
+ resource_size_t base, resource_size_t len,
+ resource_size_t skipped);
+
+#define CXL_NR_PARTITIONS_MAX 2
+
+struct cxl_dpa_info {
+ u64 size;
+ struct cxl_dpa_part_info {
+ struct range range;
+ enum cxl_partition_mode mode;
+ } part[CXL_NR_PARTITIONS_MAX];
+ int nr_partitions;
+};
+
+int cxl_dpa_setup(struct cxl_dev_state *cxlds, const struct cxl_dpa_info *info);
+
+static inline struct cxl_ep *cxl_ep_load(struct cxl_port *port,
+ struct cxl_memdev *cxlmd)
+{
+ if (!port)
+ return NULL;
+
+ return xa_load(&port->endpoints, (unsigned long)&cxlmd->dev);
+}
+
+/*
+ * Per CXL 3.0 Section 8.2.8.4.5.1
+ */
+#define CMD_CMD_RC_TABLE \
+ C(SUCCESS, 0, NULL), \
+ C(BACKGROUND, -ENXIO, "background cmd started successfully"), \
+ C(INPUT, -ENXIO, "cmd input was invalid"), \
+ C(UNSUPPORTED, -ENXIO, "cmd is not supported"), \
+ C(INTERNAL, -ENXIO, "internal device error"), \
+ C(RETRY, -ENXIO, "temporary error, retry once"), \
+ C(BUSY, -ENXIO, "ongoing background operation"), \
+ C(MEDIADISABLED, -ENXIO, "media access is disabled"), \
+ C(FWINPROGRESS, -ENXIO, "one FW package can be transferred at a time"), \
+ C(FWOOO, -ENXIO, "FW package content was transferred out of order"), \
+ C(FWAUTH, -ENXIO, "FW package authentication failed"), \
+ C(FWSLOT, -ENXIO, "FW slot is not supported for requested operation"), \
+ C(FWROLLBACK, -ENXIO, "rolled back to the previous active FW"), \
+ C(FWRESET, -ENXIO, "FW failed to activate, needs cold reset"), \
+ C(HANDLE, -ENXIO, "one or more Event Record Handles were invalid"), \
+ C(PADDR, -EFAULT, "physical address specified is invalid"), \
+ C(POISONLMT, -EBUSY, "poison injection limit has been reached"), \
+ C(MEDIAFAILURE, -ENXIO, "permanent issue with the media"), \
+ C(ABORT, -ENXIO, "background cmd was aborted by device"), \
+ C(SECURITY, -ENXIO, "not valid in the current security state"), \
+ C(PASSPHRASE, -ENXIO, "phrase doesn't match current set passphrase"), \
+ C(MBUNSUPPORTED, -ENXIO, "unsupported on the mailbox it was issued on"),\
+ C(PAYLOADLEN, -ENXIO, "invalid payload length"), \
+ C(LOG, -ENXIO, "invalid or unsupported log page"), \
+ C(INTERRUPTED, -ENXIO, "asynchronous event occurred"), \
+ C(FEATUREVERSION, -ENXIO, "unsupported feature version"), \
+ C(FEATURESELVALUE, -ENXIO, "unsupported feature selection value"), \
+ C(FEATURETRANSFERIP, -ENXIO, "feature transfer in progress"), \
+ C(FEATURETRANSFEROOO, -ENXIO, "feature transfer out of order"), \
+ C(RESOURCEEXHAUSTED, -ENXIO, "resources are exhausted"), \
+ C(EXTLIST, -ENXIO, "invalid Extent List"), \
+
+#undef C
+#define C(a, b, c) CXL_MBOX_CMD_RC_##a
+enum { CMD_CMD_RC_TABLE };
+#undef C
+#define C(a, b, c) { b, c }
+struct cxl_mbox_cmd_rc {
+ int err;
+ const char *desc;
+};
+
+static const
+struct cxl_mbox_cmd_rc cxl_mbox_cmd_rctable[] = { CMD_CMD_RC_TABLE };
+#undef C
+
+static inline const char *cxl_mbox_cmd_rc2str(struct cxl_mbox_cmd *mbox_cmd)
+{
+ return cxl_mbox_cmd_rctable[mbox_cmd->return_code].desc;
+}
+
+static inline int cxl_mbox_cmd_rc2errno(struct cxl_mbox_cmd *mbox_cmd)
+{
+ return cxl_mbox_cmd_rctable[mbox_cmd->return_code].err;
+}
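
CMD_CMD_RC_TABLE is an X-macro: C() is redefined before each expansion, first to generate the CXL_MBOX_CMD_RC_* enum and then the { err, desc } initializers, which keeps the enum and cxl_mbox_cmd_rctable[] in lockstep. A minimal sketch of how the two accessors might be consumed (illustrative helper, not part of this patch):

static void example_log_mbox_rc(struct device *dev, struct cxl_mbox_cmd *cmd)
{
        dev_dbg(dev, "mbox rc %d: %s\n",
                cxl_mbox_cmd_rc2errno(cmd), cxl_mbox_cmd_rc2str(cmd));
}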
+
+/*
+ * CXL 2.0 - Memory capacity multiplier
+ * See Section 8.2.9.5
+ *
+ * Volatile, Persistent, and Partition capacities are specified to be in
+ * multiples of 256MB - define a multiplier to convert to/from bytes.
+ */
+#define CXL_CAPACITY_MULTIPLIER SZ_256M
+
+/*
+ * Event Interrupt Policy
+ *
+ * CXL rev 3.0 section 8.2.9.2.4; Table 8-52
+ */
+enum cxl_event_int_mode {
+ CXL_INT_NONE = 0x00,
+ CXL_INT_MSI_MSIX = 0x01,
+ CXL_INT_FW = 0x02
+};
+struct cxl_event_interrupt_policy {
+ u8 info_settings;
+ u8 warn_settings;
+ u8 failure_settings;
+ u8 fatal_settings;
+} __packed;
+
+/**
+ * struct cxl_event_state - Event log driver state
+ *
+ * @buf: Buffer to receive event data
+ * @log_lock: Serialize event_buf and log use
+ */
+struct cxl_event_state {
+ struct cxl_get_event_payload *buf;
+ struct mutex log_lock;
+};
+
+/* Device enabled poison commands */
+enum poison_cmd_enabled_bits {
+ CXL_POISON_ENABLED_LIST,
+ CXL_POISON_ENABLED_INJECT,
+ CXL_POISON_ENABLED_CLEAR,
+ CXL_POISON_ENABLED_SCAN_CAPS,
+ CXL_POISON_ENABLED_SCAN_MEDIA,
+ CXL_POISON_ENABLED_SCAN_RESULTS,
+ CXL_POISON_ENABLED_MAX
+};
+
+/* Device enabled security commands */
+enum security_cmd_enabled_bits {
+ CXL_SEC_ENABLED_SANITIZE,
+ CXL_SEC_ENABLED_SECURE_ERASE,
+ CXL_SEC_ENABLED_GET_SECURITY_STATE,
+ CXL_SEC_ENABLED_SET_PASSPHRASE,
+ CXL_SEC_ENABLED_DISABLE_PASSPHRASE,
+ CXL_SEC_ENABLED_UNLOCK,
+ CXL_SEC_ENABLED_FREEZE_SECURITY,
+ CXL_SEC_ENABLED_PASSPHRASE_SECURE_ERASE,
+ CXL_SEC_ENABLED_MAX
+};
+
+/**
+ * struct cxl_poison_state - Driver poison state info
+ *
+ * @max_errors: Maximum media error records held in device cache
+ * @enabled_cmds: All poison commands enabled in the CEL
+ * @list_out: The poison list payload returned by device
+ * @mutex: Protect reads of the poison list
+ *
+ * Reads of the poison list are synchronized to ensure that a reader
+ * does not get an incomplete list because their request overlapped
+ * (was interrupted or preceded by) another read request of the same
+ * DPA range. CXL Spec 3.0 Section 8.2.9.8.4.1
+ */
+struct cxl_poison_state {
+ u32 max_errors;
+ DECLARE_BITMAP(enabled_cmds, CXL_POISON_ENABLED_MAX);
+ struct cxl_mbox_poison_out *list_out;
+ struct mutex mutex; /* Protect reads of poison list */
+};
+
+/*
+ * Get FW Info
+ * CXL rev 3.0 section 8.2.9.3.1; Table 8-56
+ */
+struct cxl_mbox_get_fw_info {
+ u8 num_slots;
+ u8 slot_info;
+ u8 activation_cap;
+ u8 reserved[13];
+ char slot_1_revision[16];
+ char slot_2_revision[16];
+ char slot_3_revision[16];
+ char slot_4_revision[16];
+} __packed;
+
+#define CXL_FW_INFO_SLOT_INFO_CUR_MASK GENMASK(2, 0)
+#define CXL_FW_INFO_SLOT_INFO_NEXT_MASK GENMASK(5, 3)
+#define CXL_FW_INFO_SLOT_INFO_NEXT_SHIFT 3
+#define CXL_FW_INFO_ACTIVATION_CAP_HAS_LIVE_ACTIVATE BIT(0)
+
+/*
+ * Transfer FW Input Payload
+ * CXL rev 3.0 section 8.2.9.3.2; Table 8-57
+ */
+struct cxl_mbox_transfer_fw {
+ u8 action;
+ u8 slot;
+ u8 reserved[2];
+ __le32 offset;
+ u8 reserved2[0x78];
+ u8 data[];
+} __packed;
+
+#define CXL_FW_TRANSFER_ACTION_FULL 0x0
+#define CXL_FW_TRANSFER_ACTION_INITIATE 0x1
+#define CXL_FW_TRANSFER_ACTION_CONTINUE 0x2
+#define CXL_FW_TRANSFER_ACTION_END 0x3
+#define CXL_FW_TRANSFER_ACTION_ABORT 0x4
+
+/*
+ * CXL rev 3.0 section 8.2.9.3.2 mandates 128-byte alignment for FW packages
+ * and for each part transferred in a Transfer FW command.
+ */
+#define CXL_FW_TRANSFER_ALIGNMENT 128
+
+/*
+ * Activate FW Input Payload
+ * CXL rev 3.0 section 8.2.9.3.3; Table 8-58
+ */
+struct cxl_mbox_activate_fw {
+ u8 action;
+ u8 slot;
+} __packed;
+
+#define CXL_FW_ACTIVATE_ONLINE 0x0
+#define CXL_FW_ACTIVATE_OFFLINE 0x1
+
+/* FW state bits */
+#define CXL_FW_STATE_BITS 32
+#define CXL_FW_CANCEL 0
+
+/**
+ * struct cxl_fw_state - Firmware upload / activation state
+ *
+ * @state: fw_uploader state bitmask
+ * @oneshot: whether the fw upload fits in a single transfer
+ * @num_slots: Number of FW slots available
+ * @cur_slot: Slot number currently active
+ * @next_slot: Slot number for the new firmware
+ */
+struct cxl_fw_state {
+ DECLARE_BITMAP(state, CXL_FW_STATE_BITS);
+ bool oneshot;
+ int num_slots;
+ int cur_slot;
+ int next_slot;
+};
+
+/**
+ * struct cxl_security_state - Device security state
+ *
+ * @state: state of last security operation
+ * @enabled_cmds: All security commands enabled in the CEL
+ * @poll_tmo_secs: polling timeout
+ * @sanitize_active: sanitize completion pending
+ * @poll_dwork: polling work item
+ * @sanitize_node: sanitation sysfs file to notify
+ */
+struct cxl_security_state {
+ unsigned long state;
+ DECLARE_BITMAP(enabled_cmds, CXL_SEC_ENABLED_MAX);
+ int poll_tmo_secs;
+ bool sanitize_active;
+ struct delayed_work poll_dwork;
+ struct kernfs_node *sanitize_node;
+};
+
+/*
+ * enum cxl_devtype - delineate type-2 from a generic type-3 device
+ * @CXL_DEVTYPE_DEVMEM - Vendor specific CXL Type-2 device implementing HDM-D or
+ * HDM-DB, no requirement that this device implements a
+ * mailbox, or other memory-device-standard manageability
+ * flows.
+ * @CXL_DEVTYPE_CLASSMEM - Common class definition of a CXL Type-3 device with
+ * HDM-H and class-mandatory memory device registers
+ */
+enum cxl_devtype {
+ CXL_DEVTYPE_DEVMEM,
+ CXL_DEVTYPE_CLASSMEM,
+};
+
+/**
+ * struct cxl_dpa_perf - DPA performance property entry
+ * @dpa_range: range for DPA address
+ * @coord: QoS performance data (i.e. latency, bandwidth)
+ * @cdat_coord: raw QoS performance data from CDAT
+ * @qos_class: QoS Class cookies
+ */
+struct cxl_dpa_perf {
+ struct range dpa_range;
+ struct access_coordinate coord[ACCESS_COORDINATE_MAX];
+ struct access_coordinate cdat_coord[ACCESS_COORDINATE_MAX];
+ int qos_class;
+};
+
+/**
+ * struct cxl_dpa_partition - DPA partition descriptor
+ * @res: shortcut to the partition in the DPA resource tree (cxlds->dpa_res)
+ * @perf: performance attributes of the partition from CDAT
+ * @mode: operation mode for the DPA capacity, e.g. ram, pmem, dynamic...
+ */
+struct cxl_dpa_partition {
+ struct resource res;
+ struct cxl_dpa_perf perf;
+ enum cxl_partition_mode mode;
+};
+
+/**
+ * struct cxl_dev_state - The driver device state
+ *
+ * cxl_dev_state represents the CXL driver/device state. It provides an
+ * interface to mailbox commands as well as some cached data about the device.
+ * Currently only memory devices are represented.
+ *
+ * @dev: The device associated with this CXL state
+ * @cxlmd: The device representing the CXL.mem capabilities of @dev
+ * @reg_map: component and ras register mapping parameters
+ * @regs: Parsed register blocks
+ * @cxl_dvsec: Offset to the PCIe device DVSEC
+ * @rcd: operating in RCD mode (CXL 3.0 9.11.8 CXL Devices Attached to an RCH)
+ * @media_ready: Indicate whether the device media is usable
+ * @dpa_res: Overall DPA resource tree for the device
+ * @part: DPA partition array
+ * @nr_partitions: Number of DPA partitions
+ * @serial: PCIe Device Serial Number
+ * @type: Generic Memory Class device or Vendor Specific Memory device
+ * @cxl_mbox: CXL mailbox context
+ * @cxlfs: CXL features context
+ */
+struct cxl_dev_state {
+ struct device *dev;
+ struct cxl_memdev *cxlmd;
+ struct cxl_register_map reg_map;
+ struct cxl_regs regs;
+ int cxl_dvsec;
+ bool rcd;
+ bool media_ready;
+ struct resource dpa_res;
+ struct cxl_dpa_partition part[CXL_NR_PARTITIONS_MAX];
+ unsigned int nr_partitions;
+ u64 serial;
+ enum cxl_devtype type;
+ struct cxl_mailbox cxl_mbox;
+#ifdef CONFIG_CXL_FEATURES
+ struct cxl_features_state *cxlfs;
+#endif
+};
+
+static inline resource_size_t cxl_pmem_size(struct cxl_dev_state *cxlds)
+{
+ /*
+ * Static PMEM may be at partition index 0 when there is no static RAM
+ * capacity.
+ */
+ for (int i = 0; i < cxlds->nr_partitions; i++)
+ if (cxlds->part[i].mode == CXL_PARTMODE_PMEM)
+ return resource_size(&cxlds->part[i].res);
+ return 0;
+}
+
+static inline struct cxl_dev_state *mbox_to_cxlds(struct cxl_mailbox *cxl_mbox)
+{
+ return dev_get_drvdata(cxl_mbox->host);
+}
+
+/**
+ * struct cxl_memdev_state - Generic Type-3 Memory Device Class driver data
+ *
+ * CXL 8.1.12.1 PCI Header - Class Code Register Memory Device defines
+ * common memory device functionality like the presence of a mailbox and
+ * the functionality related to that like Identify Memory Device and Get
+ * Partition Info
+ * @cxlds: Core driver state common across Type-2 and Type-3 devices
+ * @lsa_size: Size of Label Storage Area
+ * (CXL 2.0 8.2.9.5.1.1 Identify Memory Device)
+ * @firmware_version: Firmware version for the memory device.
+ * @total_bytes: sum of all possible capacities
+ * @volatile_only_bytes: hard volatile capacity
+ * @persistent_only_bytes: hard persistent capacity
+ * @partition_align_bytes: alignment size for partition-able capacity
+ * @active_volatile_bytes: sum of hard + soft volatile
+ * @active_persistent_bytes: sum of hard + soft persistent
+ * @event: event log driver state
+ * @poison: poison driver state info
+ * @security: security driver state info
+ * @fw: firmware upload / activation state
+ * @mce_notifier: MCE notifier
+ *
+ * See CXL 3.0 8.2.9.8.2 Capacity Configuration and Label Storage for
+ * details on capacity parameters.
+ */
+struct cxl_memdev_state {
+ struct cxl_dev_state cxlds;
+ size_t lsa_size;
+ char firmware_version[0x10];
+ u64 total_bytes;
+ u64 volatile_only_bytes;
+ u64 persistent_only_bytes;
+ u64 partition_align_bytes;
+ u64 active_volatile_bytes;
+ u64 active_persistent_bytes;
+
+ struct cxl_event_state event;
+ struct cxl_poison_state poison;
+ struct cxl_security_state security;
+ struct cxl_fw_state fw;
+ struct notifier_block mce_notifier;
+};
+
+static inline struct cxl_memdev_state *
+to_cxl_memdev_state(struct cxl_dev_state *cxlds)
+{
+ if (cxlds->type != CXL_DEVTYPE_CLASSMEM)
+ return NULL;
+ return container_of(cxlds, struct cxl_memdev_state, cxlds);
+}
+
+enum cxl_opcode {
+ CXL_MBOX_OP_INVALID = 0x0000,
+ CXL_MBOX_OP_RAW = CXL_MBOX_OP_INVALID,
+ CXL_MBOX_OP_GET_EVENT_RECORD = 0x0100,
+ CXL_MBOX_OP_CLEAR_EVENT_RECORD = 0x0101,
+ CXL_MBOX_OP_GET_EVT_INT_POLICY = 0x0102,
+ CXL_MBOX_OP_SET_EVT_INT_POLICY = 0x0103,
+ CXL_MBOX_OP_GET_FW_INFO = 0x0200,
+ CXL_MBOX_OP_TRANSFER_FW = 0x0201,
+ CXL_MBOX_OP_ACTIVATE_FW = 0x0202,
+ CXL_MBOX_OP_GET_TIMESTAMP = 0x0300,
+ CXL_MBOX_OP_SET_TIMESTAMP = 0x0301,
+ CXL_MBOX_OP_GET_SUPPORTED_LOGS = 0x0400,
+ CXL_MBOX_OP_GET_LOG = 0x0401,
+ CXL_MBOX_OP_GET_LOG_CAPS = 0x0402,
+ CXL_MBOX_OP_CLEAR_LOG = 0x0403,
+ CXL_MBOX_OP_GET_SUP_LOG_SUBLIST = 0x0405,
+ CXL_MBOX_OP_GET_SUPPORTED_FEATURES = 0x0500,
+ CXL_MBOX_OP_GET_FEATURE = 0x0501,
+ CXL_MBOX_OP_SET_FEATURE = 0x0502,
+ CXL_MBOX_OP_DO_MAINTENANCE = 0x0600,
+ CXL_MBOX_OP_IDENTIFY = 0x4000,
+ CXL_MBOX_OP_GET_PARTITION_INFO = 0x4100,
+ CXL_MBOX_OP_SET_PARTITION_INFO = 0x4101,
+ CXL_MBOX_OP_GET_LSA = 0x4102,
+ CXL_MBOX_OP_SET_LSA = 0x4103,
+ CXL_MBOX_OP_GET_HEALTH_INFO = 0x4200,
+ CXL_MBOX_OP_GET_ALERT_CONFIG = 0x4201,
+ CXL_MBOX_OP_SET_ALERT_CONFIG = 0x4202,
+ CXL_MBOX_OP_GET_SHUTDOWN_STATE = 0x4203,
+ CXL_MBOX_OP_SET_SHUTDOWN_STATE = 0x4204,
+ CXL_MBOX_OP_GET_POISON = 0x4300,
+ CXL_MBOX_OP_INJECT_POISON = 0x4301,
+ CXL_MBOX_OP_CLEAR_POISON = 0x4302,
+ CXL_MBOX_OP_GET_SCAN_MEDIA_CAPS = 0x4303,
+ CXL_MBOX_OP_SCAN_MEDIA = 0x4304,
+ CXL_MBOX_OP_GET_SCAN_MEDIA = 0x4305,
+ CXL_MBOX_OP_SANITIZE = 0x4400,
+ CXL_MBOX_OP_SECURE_ERASE = 0x4401,
+ CXL_MBOX_OP_GET_SECURITY_STATE = 0x4500,
+ CXL_MBOX_OP_SET_PASSPHRASE = 0x4501,
+ CXL_MBOX_OP_DISABLE_PASSPHRASE = 0x4502,
+ CXL_MBOX_OP_UNLOCK = 0x4503,
+ CXL_MBOX_OP_FREEZE_SECURITY = 0x4504,
+ CXL_MBOX_OP_PASSPHRASE_SECURE_ERASE = 0x4505,
+ CXL_MBOX_OP_MAX = 0x10000
+};
+
+#define DEFINE_CXL_CEL_UUID \
+ UUID_INIT(0xda9c0b5, 0xbf41, 0x4b78, 0x8f, 0x79, 0x96, 0xb1, 0x62, \
+ 0x3b, 0x3f, 0x17)
+
+#define DEFINE_CXL_VENDOR_DEBUG_UUID \
+ UUID_INIT(0x5e1819d9, 0x11a9, 0x400c, 0x81, 0x1f, 0xd6, 0x07, 0x19, \
+ 0x40, 0x3d, 0x86)
+
+struct cxl_mbox_get_supported_logs {
+ __le16 entries;
+ u8 rsvd[6];
+ struct cxl_gsl_entry {
+ uuid_t uuid;
+ __le32 size;
+ } __packed entry[];
+} __packed;
+
+struct cxl_cel_entry {
+ __le16 opcode;
+ __le16 effect;
+} __packed;
+
+struct cxl_mbox_get_log {
+ uuid_t uuid;
+ __le32 offset;
+ __le32 length;
+} __packed;
+
+/* See CXL 2.0 Table 175 Identify Memory Device Output Payload */
+struct cxl_mbox_identify {
+ char fw_revision[0x10];
+ __le64 total_capacity;
+ __le64 volatile_capacity;
+ __le64 persistent_capacity;
+ __le64 partition_align;
+ __le16 info_event_log_size;
+ __le16 warning_event_log_size;
+ __le16 failure_event_log_size;
+ __le16 fatal_event_log_size;
+ __le32 lsa_size;
+ u8 poison_list_max_mer[3];
+ __le16 inject_poison_limit;
+ u8 poison_caps;
+ u8 qos_telemetry_caps;
+} __packed;
+
+/*
+ * General Media Event Record UUID
+ * CXL rev 3.0 Section 8.2.9.2.1.1; Table 8-43
+ */
+#define CXL_EVENT_GEN_MEDIA_UUID \
+ UUID_INIT(0xfbcd0a77, 0xc260, 0x417f, 0x85, 0xa9, 0x08, 0x8b, 0x16, \
+ 0x21, 0xeb, 0xa6)
+
+/*
+ * DRAM Event Record UUID
+ * CXL rev 3.0 section 8.2.9.2.1.2; Table 8-44
+ */
+#define CXL_EVENT_DRAM_UUID \
+ UUID_INIT(0x601dcbb3, 0x9c06, 0x4eab, 0xb8, 0xaf, 0x4e, 0x9b, 0xfb, \
+ 0x5c, 0x96, 0x24)
+
+/*
+ * Memory Module Event Record UUID
+ * CXL rev 3.0 section 8.2.9.2.1.3; Table 8-45
+ */
+#define CXL_EVENT_MEM_MODULE_UUID \
+ UUID_INIT(0xfe927475, 0xdd59, 0x4339, 0xa5, 0x86, 0x79, 0xba, 0xb1, \
+ 0x13, 0xb7, 0x74)
+
+/*
+ * Memory Sparing Event Record UUID
+ * CXL rev 3.2 section 8.2.10.2.1.4: Table 8-60
+ */
+#define CXL_EVENT_MEM_SPARING_UUID \
+ UUID_INIT(0xe71f3a40, 0x2d29, 0x4092, 0x8a, 0x39, 0x4d, 0x1c, 0x96, \
+ 0x6c, 0x7c, 0x65)
+
+/*
+ * Get Event Records output payload
+ * CXL rev 3.0 section 8.2.9.2.2; Table 8-50
+ */
+#define CXL_GET_EVENT_FLAG_OVERFLOW BIT(0)
+#define CXL_GET_EVENT_FLAG_MORE_RECORDS BIT(1)
+struct cxl_get_event_payload {
+ u8 flags;
+ u8 reserved1;
+ __le16 overflow_err_count;
+ __le64 first_overflow_timestamp;
+ __le64 last_overflow_timestamp;
+ __le16 record_count;
+ u8 reserved2[10];
+ struct cxl_event_record_raw records[];
+} __packed;
+
+/*
+ * CXL rev 3.0 section 8.2.9.2.2; Table 8-49
+ */
+enum cxl_event_log_type {
+ CXL_EVENT_TYPE_INFO = 0x00,
+ CXL_EVENT_TYPE_WARN,
+ CXL_EVENT_TYPE_FAIL,
+ CXL_EVENT_TYPE_FATAL,
+ CXL_EVENT_TYPE_MAX
+};
+
+/*
+ * Clear Event Records input payload
+ * CXL rev 3.0 section 8.2.9.2.3; Table 8-51
+ */
+struct cxl_mbox_clear_event_payload {
+ u8 event_log; /* enum cxl_event_log_type */
+ u8 clear_flags;
+ u8 nr_recs;
+ u8 reserved[3];
+ __le16 handles[];
+} __packed;
+#define CXL_CLEAR_EVENT_MAX_HANDLES U8_MAX
+
+struct cxl_mbox_get_partition_info {
+ __le64 active_volatile_cap;
+ __le64 active_persistent_cap;
+ __le64 next_volatile_cap;
+ __le64 next_persistent_cap;
+} __packed;
+
+struct cxl_mbox_get_lsa {
+ __le32 offset;
+ __le32 length;
+} __packed;
+
+struct cxl_mbox_set_lsa {
+ __le32 offset;
+ __le32 reserved;
+ u8 data[];
+} __packed;
+
+struct cxl_mbox_set_partition_info {
+ __le64 volatile_capacity;
+ u8 flags;
+} __packed;
+
+#define CXL_SET_PARTITION_IMMEDIATE_FLAG BIT(0)
+
+/* Get Health Info Output Payload CXL 3.2 Spec 8.2.10.9.3.1 Table 8-148 */
+struct cxl_mbox_get_health_info_out {
+ u8 health_status;
+ u8 media_status;
+ u8 additional_status;
+ u8 life_used;
+ __le16 device_temperature;
+ __le32 dirty_shutdown_cnt;
+ __le32 corrected_volatile_error_cnt;
+ __le32 corrected_persistent_error_cnt;
+} __packed;
+
+/* Set Shutdown State Input Payload CXL 3.2 Spec 8.2.10.9.3.5 Table 8-152 */
+struct cxl_mbox_set_shutdown_state_in {
+ u8 state;
+} __packed;
+
+/* Set Timestamp CXL 3.0 Spec 8.2.9.4.2 */
+struct cxl_mbox_set_timestamp_in {
+ __le64 timestamp;
+
+} __packed;
+
+/* Get Poison List CXL 3.0 Spec 8.2.9.8.4.1 */
+struct cxl_mbox_poison_in {
+ __le64 offset;
+ __le64 length;
+} __packed;
+
+struct cxl_mbox_poison_out {
+ u8 flags;
+ u8 rsvd1;
+ __le64 overflow_ts;
+ __le16 count;
+ u8 rsvd2[20];
+ struct cxl_poison_record {
+ __le64 address;
+ __le32 length;
+ __le32 rsvd;
+ } __packed record[];
+} __packed;
+
+/*
+ * Get Poison List address field encodes the starting
+ * address of poison, and the source of the poison.
+ */
+#define CXL_POISON_START_MASK GENMASK_ULL(63, 6)
+#define CXL_POISON_SOURCE_MASK GENMASK(2, 0)
+
+/* Get Poison List record length is in units of 64 bytes */
+#define CXL_POISON_LEN_MULT 64
+
+/* Kernel defined maximum for a list of poison errors */
+#define CXL_POISON_LIST_MAX 1024
+
+/* Get Poison List: Payload out flags */
+#define CXL_POISON_FLAG_MORE BIT(0)
+#define CXL_POISON_FLAG_OVERFLOW BIT(1)
+#define CXL_POISON_FLAG_SCANNING BIT(2)
+
+/* Get Poison List: Poison Source */
+#define CXL_POISON_SOURCE_UNKNOWN 0
+#define CXL_POISON_SOURCE_EXTERNAL 1
+#define CXL_POISON_SOURCE_INTERNAL 2
+#define CXL_POISON_SOURCE_INJECTED 3
+#define CXL_POISON_SOURCE_VENDOR 7
+
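To make the address-field encoding concrete: the upper bits carry the 64-byte-aligned starting DPA and the low bits carry the poison source, while the record length counts 64-byte units. A decoding sketch, not part of the patch; helper names are hypothetical and FIELD_GET() is assumed from <linux/bitfield.h>.

static inline u64 example_poison_dpa(const struct cxl_poison_record *rec)
{
	/* Editorial sketch: strip the source bits to get the starting DPA */
	return le64_to_cpu(rec->address) & CXL_POISON_START_MASK;
}

static inline u8 example_poison_source(const struct cxl_poison_record *rec)
{
	/* Editorial sketch: low bits select UNKNOWN/EXTERNAL/.../VENDOR */
	return FIELD_GET(CXL_POISON_SOURCE_MASK, le64_to_cpu(rec->address));
}

static inline u64 example_poison_len_bytes(const struct cxl_poison_record *rec)
{
	/* Editorial sketch: length is reported in 64-byte units */
	return (u64)le32_to_cpu(rec->length) * CXL_POISON_LEN_MULT;
}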
+/* Inject & Clear Poison CXL 3.0 Spec 8.2.9.8.4.2/3 */
+struct cxl_mbox_inject_poison {
+ __le64 address;
+};
+
+/* Clear Poison CXL 3.0 Spec 8.2.9.8.4.3 */
+struct cxl_mbox_clear_poison {
+ __le64 address;
+ u8 write_data[CXL_POISON_LEN_MULT];
+} __packed;
+
+/**
+ * struct cxl_mem_command - Driver representation of a memory device command
+ * @info: Command information as it exists for the UAPI
+ * @opcode: The actual bits used for the mailbox protocol
+ * @flags: Set of flags affecting driver behavior.
+ *
+ * * %CXL_CMD_FLAG_FORCE_ENABLE: In cases of error, commands with this flag
+ * will be enabled by the driver regardless of what hardware may have
+ * advertised.
+ *
+ * The cxl_mem_command is the driver's internal representation of commands that
+ * are supported by the driver. Some of these commands may not be supported by
+ * the hardware. The driver will use @info to validate the fields passed in by
+ * the user and then submit the @opcode to the hardware.
+ *
+ * See struct cxl_command_info.
+ */
+struct cxl_mem_command {
+ struct cxl_command_info info;
+ enum cxl_opcode opcode;
+ u32 flags;
+#define CXL_CMD_FLAG_FORCE_ENABLE BIT(0)
+};
+
+#define CXL_PMEM_SEC_STATE_USER_PASS_SET 0x01
+#define CXL_PMEM_SEC_STATE_MASTER_PASS_SET 0x02
+#define CXL_PMEM_SEC_STATE_LOCKED 0x04
+#define CXL_PMEM_SEC_STATE_FROZEN 0x08
+#define CXL_PMEM_SEC_STATE_USER_PLIMIT 0x10
+#define CXL_PMEM_SEC_STATE_MASTER_PLIMIT 0x20
+
+/* set passphrase input payload */
+struct cxl_set_pass {
+ u8 type;
+ u8 reserved[31];
+ /* CXL field using NVDIMM define, same length */
+ u8 old_pass[NVDIMM_PASSPHRASE_LEN];
+ u8 new_pass[NVDIMM_PASSPHRASE_LEN];
+} __packed;
+
+/* disable passphrase input payload */
+struct cxl_disable_pass {
+ u8 type;
+ u8 reserved[31];
+ u8 pass[NVDIMM_PASSPHRASE_LEN];
+} __packed;
+
+/* passphrase secure erase payload */
+struct cxl_pass_erase {
+ u8 type;
+ u8 reserved[31];
+ u8 pass[NVDIMM_PASSPHRASE_LEN];
+} __packed;
+
+enum {
+ CXL_PMEM_SEC_PASS_MASTER = 0,
+ CXL_PMEM_SEC_PASS_USER,
+};
+
+int cxl_internal_send_cmd(struct cxl_mailbox *cxl_mbox,
+ struct cxl_mbox_cmd *cmd);
+int cxl_dev_state_identify(struct cxl_memdev_state *mds);
+int cxl_await_media_ready(struct cxl_dev_state *cxlds);
+int cxl_enumerate_cmds(struct cxl_memdev_state *mds);
+int cxl_mem_dpa_fetch(struct cxl_memdev_state *mds, struct cxl_dpa_info *info);
+struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev);
+void set_exclusive_cxl_commands(struct cxl_memdev_state *mds,
+ unsigned long *cmds);
+void clear_exclusive_cxl_commands(struct cxl_memdev_state *mds,
+ unsigned long *cmds);
+void cxl_mem_get_event_records(struct cxl_memdev_state *mds, u32 status);
+void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
+ enum cxl_event_log_type type,
+ enum cxl_event_type event_type,
+ const uuid_t *uuid, union cxl_event *evt);
+int cxl_get_dirty_count(struct cxl_memdev_state *mds, u32 *count);
+int cxl_arm_dirty_shutdown(struct cxl_memdev_state *mds);
+int cxl_set_timestamp(struct cxl_memdev_state *mds);
+int cxl_poison_state_init(struct cxl_memdev_state *mds);
+int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len,
+ struct cxl_region *cxlr);
+int cxl_trigger_poison_list(struct cxl_memdev *cxlmd);
+int cxl_inject_poison(struct cxl_memdev *cxlmd, u64 dpa);
+int cxl_clear_poison(struct cxl_memdev *cxlmd, u64 dpa);
+int cxl_inject_poison_locked(struct cxl_memdev *cxlmd, u64 dpa);
+int cxl_clear_poison_locked(struct cxl_memdev *cxlmd, u64 dpa);
+
+#ifdef CONFIG_CXL_EDAC_MEM_FEATURES
+int devm_cxl_memdev_edac_register(struct cxl_memdev *cxlmd);
+int devm_cxl_region_edac_register(struct cxl_region *cxlr);
+int cxl_store_rec_gen_media(struct cxl_memdev *cxlmd, union cxl_event *evt);
+int cxl_store_rec_dram(struct cxl_memdev *cxlmd, union cxl_event *evt);
+void devm_cxl_memdev_edac_release(struct cxl_memdev *cxlmd);
+#else
+static inline int devm_cxl_memdev_edac_register(struct cxl_memdev *cxlmd)
+{ return 0; }
+static inline int devm_cxl_region_edac_register(struct cxl_region *cxlr)
+{ return 0; }
+static inline int cxl_store_rec_gen_media(struct cxl_memdev *cxlmd,
+ union cxl_event *evt)
+{ return 0; }
+static inline int cxl_store_rec_dram(struct cxl_memdev *cxlmd,
+ union cxl_event *evt)
+{ return 0; }
+static inline void devm_cxl_memdev_edac_release(struct cxl_memdev *cxlmd)
+{ return; }
+#endif
+
+#ifdef CONFIG_CXL_SUSPEND
+void cxl_mem_active_inc(void);
+void cxl_mem_active_dec(void);
+#else
+static inline void cxl_mem_active_inc(void)
+{
+}
+static inline void cxl_mem_active_dec(void)
+{
+}
+#endif
+
+int cxl_mem_sanitize(struct cxl_memdev *cxlmd, u16 cmd);
+
+/**
+ * struct cxl_hdm - HDM Decoder registers and cached / decoded capabilities
+ * @regs: mapped registers, see devm_cxl_setup_hdm()
+ * @decoder_count: number of decoders for this port
+ * @target_count: for switch decoders, max downstream port targets
+ * @interleave_mask: interleave granularity capability, see check_interleave_cap()
+ * @iw_cap_mask: bitmask of supported interleave ways, see check_interleave_cap()
+ * @port: mapped cxl_port, see devm_cxl_setup_hdm()
+ */
+struct cxl_hdm {
+ struct cxl_component_regs regs;
+ unsigned int decoder_count;
+ unsigned int target_count;
+ unsigned int interleave_mask;
+ unsigned long iw_cap_mask;
+ struct cxl_port *port;
+};
+
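For example, region setup code could consult these cached capabilities before programming a decoder. A minimal sketch under the assumption, taken from the field description above, that @iw_cap_mask is indexed by the interleave-ways value itself; the helper name is hypothetical and not part of the patch.

static inline bool example_ways_supported(const struct cxl_hdm *cxlhdm, int ways)
{
	/* Editorial sketch: bit N set means N-way interleave is supported */
	return test_bit(ways, &cxlhdm->iw_cap_mask);
}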
+struct seq_file;
+struct dentry *cxl_debugfs_create_dir(const char *dir);
+void cxl_dpa_debug(struct seq_file *file, struct cxl_dev_state *cxlds);
+#endif /* __CXL_MEM_H__ */
diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
new file mode 100644
index 000000000000..1d526bea8431
--- /dev/null
+++ b/drivers/cxl/cxlpci.h
@@ -0,0 +1,135 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright(c) 2020 Intel Corporation. All rights reserved. */
+#ifndef __CXL_PCI_H__
+#define __CXL_PCI_H__
+#include <linux/pci.h>
+#include "cxl.h"
+
+#define CXL_MEMORY_PROGIF 0x10
+
+/*
+ * See section 8.1 Configuration Space Registers in the CXL 2.0
+ * Specification. Names are taken straight from the specification with "CXL" and
+ * "DVSEC" redundancies removed. When obvious, abbreviations may be used.
+ */
+#define PCI_DVSEC_HEADER1_LENGTH_MASK GENMASK(31, 20)
+
+/* CXL 2.0 8.1.3: PCIe DVSEC for CXL Device */
+#define CXL_DVSEC_PCIE_DEVICE 0
+#define CXL_DVSEC_CAP_OFFSET 0xA
+#define CXL_DVSEC_MEM_CAPABLE BIT(2)
+#define CXL_DVSEC_HDM_COUNT_MASK GENMASK(5, 4)
+#define CXL_DVSEC_CTRL_OFFSET 0xC
+#define CXL_DVSEC_MEM_ENABLE BIT(2)
+#define CXL_DVSEC_RANGE_SIZE_HIGH(i) (0x18 + (i * 0x10))
+#define CXL_DVSEC_RANGE_SIZE_LOW(i) (0x1C + (i * 0x10))
+#define CXL_DVSEC_MEM_INFO_VALID BIT(0)
+#define CXL_DVSEC_MEM_ACTIVE BIT(1)
+#define CXL_DVSEC_MEM_SIZE_LOW_MASK GENMASK(31, 28)
+#define CXL_DVSEC_RANGE_BASE_HIGH(i) (0x20 + (i * 0x10))
+#define CXL_DVSEC_RANGE_BASE_LOW(i) (0x24 + (i * 0x10))
+#define CXL_DVSEC_MEM_BASE_LOW_MASK GENMASK(31, 28)
+
+#define CXL_DVSEC_RANGE_MAX 2
+
+/* CXL 2.0 8.1.4: Non-CXL Function Map DVSEC */
+#define CXL_DVSEC_FUNCTION_MAP 2
+
+/* CXL 2.0 8.1.5: CXL 2.0 Extensions DVSEC for Ports */
+#define CXL_DVSEC_PORT_EXTENSIONS 3
+
+/* CXL 2.0 8.1.6: GPF DVSEC for CXL Port */
+#define CXL_DVSEC_PORT_GPF 4
+#define CXL_DVSEC_PORT_GPF_PHASE_1_CONTROL_OFFSET 0x0C
+#define CXL_DVSEC_PORT_GPF_PHASE_1_TMO_BASE_MASK GENMASK(3, 0)
+#define CXL_DVSEC_PORT_GPF_PHASE_1_TMO_SCALE_MASK GENMASK(11, 8)
+#define CXL_DVSEC_PORT_GPF_PHASE_2_CONTROL_OFFSET 0xE
+#define CXL_DVSEC_PORT_GPF_PHASE_2_TMO_BASE_MASK GENMASK(3, 0)
+#define CXL_DVSEC_PORT_GPF_PHASE_2_TMO_SCALE_MASK GENMASK(11, 8)
+
+/* CXL 2.0 8.1.7: GPF DVSEC for CXL Device */
+#define CXL_DVSEC_DEVICE_GPF 5
+
+/* CXL 2.0 8.1.8: PCIe DVSEC for Flex Bus Port */
+#define CXL_DVSEC_PCIE_FLEXBUS_PORT 7
+
+/* CXL 2.0 8.1.9: Register Locator DVSEC */
+#define CXL_DVSEC_REG_LOCATOR 8
+#define CXL_DVSEC_REG_LOCATOR_BLOCK1_OFFSET 0xC
+#define CXL_DVSEC_REG_LOCATOR_BIR_MASK GENMASK(2, 0)
+#define CXL_DVSEC_REG_LOCATOR_BLOCK_ID_MASK GENMASK(15, 8)
+#define CXL_DVSEC_REG_LOCATOR_BLOCK_OFF_LOW_MASK GENMASK(31, 16)
+
+/*
+ * NOTE: Currently all the functions which are enabled for CXL require their
+ * vectors to be in the first 16. Use this as the default max.
+ */
+#define CXL_PCI_DEFAULT_MAX_VECTORS 16
+
+/* Register Block Identifier (RBI) */
+enum cxl_regloc_type {
+ CXL_REGLOC_RBI_EMPTY = 0,
+ CXL_REGLOC_RBI_COMPONENT,
+ CXL_REGLOC_RBI_VIRT,
+ CXL_REGLOC_RBI_MEMDEV,
+ CXL_REGLOC_RBI_PMU,
+ CXL_REGLOC_RBI_TYPES
+};
+
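Putting the Register Locator masks above together: each entry packs a BAR indicator, a Register Block Identifier, and the low bits of the block offset into one dword (the offset is therefore 64 KiB aligned), with the upper offset bits in the following dword. A decoding sketch, not part of the patch; the helper name is hypothetical and FIELD_GET() is assumed from <linux/bitfield.h>.

static inline void example_decode_regblock(u32 reg_lo, u32 reg_hi, u8 *bar,
					   u64 *offset, u8 *type)
{
	/* Editorial sketch: split one Register Block entry into its fields */
	*bar = FIELD_GET(CXL_DVSEC_REG_LOCATOR_BIR_MASK, reg_lo);
	*type = FIELD_GET(CXL_DVSEC_REG_LOCATOR_BLOCK_ID_MASK, reg_lo);
	*offset = ((u64)reg_hi << 32) |
		  (reg_lo & CXL_DVSEC_REG_LOCATOR_BLOCK_OFF_LOW_MASK);
}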
+/*
+ * Table Access DOE, CDAT Read Entry Response
+ *
+ * Spec refs:
+ *
+ * CXL 3.1 8.1.11, Table 8-14: Read Entry Response
+ * CDAT Specification 1.03: 2 CDAT Data Structures
+ */
+
+struct cdat_header {
+ __le32 length;
+ u8 revision;
+ u8 checksum;
+ u8 reserved[6];
+ __le32 sequence;
+} __packed;
+
+struct cdat_entry_header {
+ u8 type;
+ u8 reserved;
+ __le16 length;
+} __packed;
+
+/*
+ * The DOE CDAT read response contains a CDAT read entry (either the
+ * CDAT header or a structure).
+ */
+union cdat_data {
+ struct cdat_header header;
+ struct cdat_entry_header entry;
+} __packed;
+
+/* There is an additional CDAT response header of 4 bytes. */
+struct cdat_doe_rsp {
+ __le32 doe_header;
+ u8 data[];
+} __packed;
+
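Since the 4-byte DOE response header precedes the actual CDAT bytes, the read entry is simply the data that follows it. A minimal sketch, not part of the patch; the helper name is hypothetical.

static inline union cdat_data *example_cdat_entry(struct cdat_doe_rsp *rsp)
{
	/* Editorial sketch: the CDAT read entry starts right after doe_header */
	return (union cdat_data *)rsp->data;
}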
+/*
+ * CXL v3.0 6.2.3 Table 6-4
+ * The table indicates that if PCIe Flit Mode is set, then CXL is in 256B flit
+ * mode; otherwise it is in 68B flit mode.
+ */
+static inline bool cxl_pci_flit_256(struct pci_dev *pdev)
+{
+ u16 lnksta2;
+
+ pcie_capability_read_word(pdev, PCI_EXP_LNKSTA2, &lnksta2);
+ return lnksta2 & PCI_EXP_LNKSTA2_FLIT;
+}
+
+struct cxl_dev_state;
+void read_cdat_data(struct cxl_port *port);
+void cxl_cor_error_detected(struct pci_dev *pdev);
+pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
+ pci_channel_state_t state);
+#endif /* __CXL_PCI_H__ */
diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c
new file mode 100644
index 000000000000..6e6777b7bafb
--- /dev/null
+++ b/drivers/cxl/mem.c
@@ -0,0 +1,265 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2022 Intel Corporation. All rights reserved. */
+#include <linux/debugfs.h>
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+
+#include "cxlmem.h"
+#include "cxlpci.h"
+
+/**
+ * DOC: cxl mem
+ *
+ * CXL memory endpoint devices and switches are CXL-capable devices that
+ * participate in the CXL.mem protocol. Their functionality builds on top of
+ * the CXL.io protocol, which allows enumerating and configuring components via
+ * standard PCI mechanisms.
+ *
+ * The cxl_mem driver is responsible for kicking off the enumeration of this
+ * CXL.mem capability. Upon detecting a CXL-capable endpoint, the driver walks
+ * up to find the platform-specific port it is connected to and determines
+ * whether there are intervening switches in the path. If there are switches, a
+ * secondary action is to enumerate those (implemented in cxl_core). Finally,
+ * the cxl_mem driver adds the device it is bound to as a CXL endpoint-port for
+ * use in higher-level operations.
+ */
+
+static void enable_suspend(void *data)
+{
+ cxl_mem_active_dec();
+}
+
+static void remove_debugfs(void *dentry)
+{
+ debugfs_remove_recursive(dentry);
+}
+
+static int cxl_mem_dpa_show(struct seq_file *file, void *data)
+{
+ struct device *dev = file->private;
+ struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
+
+ cxl_dpa_debug(file, cxlmd->cxlds);
+
+ return 0;
+}
+
+static int devm_cxl_add_endpoint(struct device *host, struct cxl_memdev *cxlmd,
+ struct cxl_dport *parent_dport)
+{
+ struct cxl_port *parent_port = parent_dport->port;
+ struct cxl_port *endpoint, *iter, *down;
+ int rc;
+
+ /*
+	 * Now that the path to the root is established, record all the
+	 * intervening ports in the chain.
+ */
+ for (iter = parent_port, down = NULL; !is_cxl_root(iter);
+ down = iter, iter = to_cxl_port(iter->dev.parent)) {
+ struct cxl_ep *ep;
+
+ ep = cxl_ep_load(iter, cxlmd);
+ ep->next = down;
+ }
+
+ /* Note: endpoint port component registers are derived from @cxlds */
+ endpoint = devm_cxl_add_port(host, &cxlmd->dev, CXL_RESOURCE_NONE,
+ parent_dport);
+ if (IS_ERR(endpoint))
+ return PTR_ERR(endpoint);
+
+ rc = cxl_endpoint_autoremove(cxlmd, endpoint);
+ if (rc)
+ return rc;
+
+ if (!endpoint->dev.driver) {
+ dev_err(&cxlmd->dev, "%s failed probe\n",
+ dev_name(&endpoint->dev));
+ return -ENXIO;
+ }
+
+ return 0;
+}
+
+static int cxl_debugfs_poison_inject(void *data, u64 dpa)
+{
+ struct cxl_memdev *cxlmd = data;
+
+ return cxl_inject_poison(cxlmd, dpa);
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(cxl_poison_inject_fops, NULL,
+ cxl_debugfs_poison_inject, "%llx\n");
+
+static int cxl_debugfs_poison_clear(void *data, u64 dpa)
+{
+ struct cxl_memdev *cxlmd = data;
+
+ return cxl_clear_poison(cxlmd, dpa);
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(cxl_poison_clear_fops, NULL,
+ cxl_debugfs_poison_clear, "%llx\n");
+
+static int cxl_mem_probe(struct device *dev)
+{
+ struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
+ struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds);
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+ struct device *endpoint_parent;
+ struct cxl_dport *dport;
+ struct dentry *dentry;
+ int rc;
+
+ if (!cxlds->media_ready)
+ return -EBUSY;
+
+ /*
+ * Someone is trying to reattach this device after it lost its port
+ * connection (an endpoint port previously registered by this memdev was
+ * disabled). This racy check is ok because if the port is still gone,
+ * no harm done, and if the port hierarchy comes back it will re-trigger
+ * this probe. Port rescan and memdev detach work share the same
+ * single-threaded workqueue.
+ */
+ if (work_pending(&cxlmd->detach_work))
+ return -EBUSY;
+
+ dentry = cxl_debugfs_create_dir(dev_name(dev));
+ debugfs_create_devm_seqfile(dev, "dpamem", dentry, cxl_mem_dpa_show);
+
+ if (test_bit(CXL_POISON_ENABLED_INJECT, mds->poison.enabled_cmds))
+ debugfs_create_file("inject_poison", 0200, dentry, cxlmd,
+ &cxl_poison_inject_fops);
+ if (test_bit(CXL_POISON_ENABLED_CLEAR, mds->poison.enabled_cmds))
+ debugfs_create_file("clear_poison", 0200, dentry, cxlmd,
+ &cxl_poison_clear_fops);
+
+ rc = devm_add_action_or_reset(dev, remove_debugfs, dentry);
+ if (rc)
+ return rc;
+
+ rc = devm_cxl_enumerate_ports(cxlmd);
+ if (rc)
+ return rc;
+
+ struct cxl_port *parent_port __free(put_cxl_port) =
+ cxl_mem_find_port(cxlmd, &dport);
+ if (!parent_port) {
+ dev_err(dev, "CXL port topology not found\n");
+ return -ENXIO;
+ }
+
+ if (cxl_pmem_size(cxlds) && IS_ENABLED(CONFIG_CXL_PMEM)) {
+ rc = devm_cxl_add_nvdimm(parent_port, cxlmd);
+ if (rc) {
+ if (rc == -ENODEV)
+ dev_info(dev, "PMEM disabled by platform\n");
+ return rc;
+ }
+ }
+
+ if (dport->rch)
+ endpoint_parent = parent_port->uport_dev;
+ else
+ endpoint_parent = &parent_port->dev;
+
+ cxl_dport_init_ras_reporting(dport, dev);
+
+ scoped_guard(device, endpoint_parent) {
+ if (!endpoint_parent->driver) {
+ dev_err(dev, "CXL port topology %s not enabled\n",
+ dev_name(endpoint_parent));
+ return -ENXIO;
+ }
+
+ rc = devm_cxl_add_endpoint(endpoint_parent, cxlmd, dport);
+ if (rc)
+ return rc;
+ }
+
+ rc = devm_cxl_memdev_edac_register(cxlmd);
+ if (rc)
+ dev_dbg(dev, "CXL memdev EDAC registration failed rc=%d\n", rc);
+
+	 * The kernel may be operating out of CXL memory on this device;
+	 * there is no spec-defined way to determine whether this device
+	 * preserves contents over suspend, and there is no simple way
+	 * to arrange for the suspend image to avoid CXL memory, which
+	 * would set up a circular dependency between PCI resume and save
+	 * state restoration.
+ * state restoration.
+ *
+ * TODO: support suspend when all the regions this device is
+ * hosting are locked and covered by the system address map,
+ * i.e. platform firmware owns restoring the HDM configuration
+ * that it locked.
+ */
+ cxl_mem_active_inc();
+ return devm_add_action_or_reset(dev, enable_suspend, NULL);
+}
+
+static ssize_t trigger_poison_list_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ bool trigger;
+ int rc;
+
+ if (kstrtobool(buf, &trigger) || !trigger)
+ return -EINVAL;
+
+ rc = cxl_trigger_poison_list(to_cxl_memdev(dev));
+
+ return rc ? rc : len;
+}
+static DEVICE_ATTR_WO(trigger_poison_list);
+
+static umode_t cxl_mem_visible(struct kobject *kobj, struct attribute *a, int n)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
+ struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds);
+
+ if (a == &dev_attr_trigger_poison_list.attr)
+ if (!test_bit(CXL_POISON_ENABLED_LIST,
+ mds->poison.enabled_cmds))
+ return 0;
+
+ return a->mode;
+}
+
+static struct attribute *cxl_mem_attrs[] = {
+ &dev_attr_trigger_poison_list.attr,
+ NULL
+};
+
+static struct attribute_group cxl_mem_group = {
+ .attrs = cxl_mem_attrs,
+ .is_visible = cxl_mem_visible,
+};
+
+__ATTRIBUTE_GROUPS(cxl_mem);
+
+static struct cxl_driver cxl_mem_driver = {
+ .name = "cxl_mem",
+ .probe = cxl_mem_probe,
+ .id = CXL_DEVICE_MEMORY_EXPANDER,
+ .drv = {
+ .dev_groups = cxl_mem_groups,
+ },
+};
+
+module_cxl_driver(cxl_mem_driver);
+
+MODULE_DESCRIPTION("CXL: Memory Expansion");
+MODULE_LICENSE("GPL v2");
+MODULE_IMPORT_NS("CXL");
+MODULE_ALIAS_CXL(CXL_DEVICE_MEMORY_EXPANDER);
+/*
+ * devm_cxl_add_endpoint() wants to validate port driver attach immediately
+ * after endpoint registration.
+ */
+MODULE_SOFTDEP("pre: cxl_port");
diff --git a/drivers/cxl/mem.h b/drivers/cxl/mem.h
deleted file mode 100644
index 8f02d02b26b4..000000000000
--- a/drivers/cxl/mem.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/* Copyright(c) 2020-2021 Intel Corporation. */
-#ifndef __CXL_MEM_H__
-#define __CXL_MEM_H__
-#include <linux/cdev.h>
-#include "cxl.h"
-
-/* CXL 2.0 8.2.8.5.1.1 Memory Device Status Register */
-#define CXLMDEV_STATUS_OFFSET 0x0
-#define CXLMDEV_DEV_FATAL BIT(0)
-#define CXLMDEV_FW_HALT BIT(1)
-#define CXLMDEV_STATUS_MEDIA_STATUS_MASK GENMASK(3, 2)
-#define CXLMDEV_MS_NOT_READY 0
-#define CXLMDEV_MS_READY 1
-#define CXLMDEV_MS_ERROR 2
-#define CXLMDEV_MS_DISABLED 3
-#define CXLMDEV_READY(status) \
- (FIELD_GET(CXLMDEV_STATUS_MEDIA_STATUS_MASK, status) == \
- CXLMDEV_MS_READY)
-#define CXLMDEV_MBOX_IF_READY BIT(4)
-#define CXLMDEV_RESET_NEEDED_MASK GENMASK(7, 5)
-#define CXLMDEV_RESET_NEEDED_NOT 0
-#define CXLMDEV_RESET_NEEDED_COLD 1
-#define CXLMDEV_RESET_NEEDED_WARM 2
-#define CXLMDEV_RESET_NEEDED_HOT 3
-#define CXLMDEV_RESET_NEEDED_CXL 4
-#define CXLMDEV_RESET_NEEDED(status) \
- (FIELD_GET(CXLMDEV_RESET_NEEDED_MASK, status) != \
- CXLMDEV_RESET_NEEDED_NOT)
-
-/*
- * An entire PCI topology full of devices should be enough for any
- * config
- */
-#define CXL_MEM_MAX_DEVS 65536
-
-/**
- * struct cxl_memdev - CXL bus object representing a Type-3 Memory Device
- * @dev: driver core device object
- * @cdev: char dev core object for ioctl operations
- * @cxlm: pointer to the parent device driver data
- * @id: id number of this memdev instance.
- */
-struct cxl_memdev {
- struct device dev;
- struct cdev cdev;
- struct cxl_mem *cxlm;
- int id;
-};
-
-/**
- * struct cxl_mem - A CXL memory device
- * @pdev: The PCI device associated with this CXL device.
- * @cxlmd: Logical memory device chardev / interface
- * @regs: Parsed register blocks
- * @payload_size: Size of space for payload
- * (CXL 2.0 8.2.8.4.3 Mailbox Capabilities Register)
- * @lsa_size: Size of Label Storage Area
- * (CXL 2.0 8.2.9.5.1.1 Identify Memory Device)
- * @mbox_mutex: Mutex to synchronize mailbox access.
- * @firmware_version: Firmware version for the memory device.
- * @enabled_cmds: Hardware commands found enabled in CEL.
- * @pmem_range: Persistent memory capacity information.
- * @ram_range: Volatile memory capacity information.
- */
-struct cxl_mem {
- struct pci_dev *pdev;
- struct cxl_memdev *cxlmd;
-
- struct cxl_regs regs;
-
- size_t payload_size;
- size_t lsa_size;
- struct mutex mbox_mutex; /* Protects device mailbox and firmware */
- char firmware_version[0x10];
- unsigned long *enabled_cmds;
-
- struct range pmem_range;
- struct range ram_range;
-};
-#endif /* __CXL_MEM_H__ */
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index 4cf351a3cf99..0be4e508affe 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -1,275 +1,181 @@
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright(c) 2020 Intel Corporation. All rights reserved. */
-#include <uapi/linux/cxl_mem.h>
-#include <linux/security.h>
-#include <linux/debugfs.h>
+#include <linux/unaligned.h>
+#include <linux/io-64-nonatomic-lo-hi.h>
+#include <linux/moduleparam.h>
#include <linux/module.h>
+#include <linux/delay.h>
#include <linux/sizes.h>
#include <linux/mutex.h>
#include <linux/list.h>
-#include <linux/cdev.h>
-#include <linux/idr.h>
#include <linux/pci.h>
+#include <linux/aer.h>
#include <linux/io.h>
-#include <linux/io-64-nonatomic-lo-hi.h>
-#include "pci.h"
+#include <cxl/mailbox.h>
+#include "cxlmem.h"
+#include "cxlpci.h"
#include "cxl.h"
-#include "mem.h"
+#include "pmu.h"
/**
* DOC: cxl pci
*
* This implements the PCI exclusive functionality for a CXL device as it is
* defined by the Compute Express Link specification. CXL devices may surface
- * certain functionality even if it isn't CXL enabled.
+ * certain functionality even if it isn't CXL enabled. While this driver is
+ * focused on the PCI-specific aspects of a CXL device, it binds to the
+ * CXL memory device class code specifically, and therefore the implementation
+ * of cxl_pci is centered on CXL memory devices.
*
* The driver has several responsibilities, mainly:
* - Create the memX device and register on the CXL bus.
* - Enumerate device's register interface and map them.
- * - Probe the device attributes to establish sysfs interface.
- * - Provide an IOCTL interface to userspace to communicate with the device for
- * things like firmware update.
+ * - Register an nvdimm bridge device with cxl_core.
+ * - Register a CXL mailbox with cxl_core.
*/
-#define cxl_doorbell_busy(cxlm) \
- (readl((cxlm)->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET) & \
+#define cxl_doorbell_busy(cxlds) \
+ (readl((cxlds)->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET) & \
CXLDEV_MBOX_CTRL_DOORBELL)
/* CXL 2.0 - 8.2.8.4 */
#define CXL_MAILBOX_TIMEOUT_MS (2 * HZ)
-enum opcode {
- CXL_MBOX_OP_INVALID = 0x0000,
- CXL_MBOX_OP_RAW = CXL_MBOX_OP_INVALID,
- CXL_MBOX_OP_GET_FW_INFO = 0x0200,
- CXL_MBOX_OP_ACTIVATE_FW = 0x0202,
- CXL_MBOX_OP_GET_SUPPORTED_LOGS = 0x0400,
- CXL_MBOX_OP_GET_LOG = 0x0401,
- CXL_MBOX_OP_IDENTIFY = 0x4000,
- CXL_MBOX_OP_GET_PARTITION_INFO = 0x4100,
- CXL_MBOX_OP_SET_PARTITION_INFO = 0x4101,
- CXL_MBOX_OP_GET_LSA = 0x4102,
- CXL_MBOX_OP_SET_LSA = 0x4103,
- CXL_MBOX_OP_GET_HEALTH_INFO = 0x4200,
- CXL_MBOX_OP_GET_ALERT_CONFIG = 0x4201,
- CXL_MBOX_OP_SET_ALERT_CONFIG = 0x4202,
- CXL_MBOX_OP_GET_SHUTDOWN_STATE = 0x4203,
- CXL_MBOX_OP_SET_SHUTDOWN_STATE = 0x4204,
- CXL_MBOX_OP_GET_POISON = 0x4300,
- CXL_MBOX_OP_INJECT_POISON = 0x4301,
- CXL_MBOX_OP_CLEAR_POISON = 0x4302,
- CXL_MBOX_OP_GET_SCAN_MEDIA_CAPS = 0x4303,
- CXL_MBOX_OP_SCAN_MEDIA = 0x4304,
- CXL_MBOX_OP_GET_SCAN_MEDIA = 0x4305,
- CXL_MBOX_OP_MAX = 0x10000
-};
-
-/**
- * struct mbox_cmd - A command to be submitted to hardware.
- * @opcode: (input) The command set and command submitted to hardware.
- * @payload_in: (input) Pointer to the input payload.
- * @payload_out: (output) Pointer to the output payload. Must be allocated by
- * the caller.
- * @size_in: (input) Number of bytes to load from @payload_in.
- * @size_out: (input) Max number of bytes loaded into @payload_out.
- * (output) Number of bytes generated by the device. For fixed size
- * outputs commands this is always expected to be deterministic. For
- * variable sized output commands, it tells the exact number of bytes
- * written.
- * @return_code: (output) Error code returned from hardware.
- *
- * This is the primary mechanism used to send commands to the hardware.
- * All the fields except @payload_* correspond exactly to the fields described in
- * Command Register section of the CXL 2.0 8.2.8.4.5. @payload_in and
- * @payload_out are written to, and read from the Command Payload Registers
- * defined in CXL 2.0 8.2.8.4.8.
- */
-struct mbox_cmd {
- u16 opcode;
- void *payload_in;
- void *payload_out;
- size_t size_in;
- size_t size_out;
- u16 return_code;
-#define CXL_MBOX_SUCCESS 0
-};
-
-static int cxl_mem_major;
-static DEFINE_IDA(cxl_memdev_ida);
-static DECLARE_RWSEM(cxl_memdev_rwsem);
-static struct dentry *cxl_debugfs;
-static bool cxl_raw_allow_all;
-
-enum {
- CEL_UUID,
- VENDOR_DEBUG_UUID,
-};
-
-/* See CXL 2.0 Table 170. Get Log Input Payload */
-static const uuid_t log_uuid[] = {
- [CEL_UUID] = UUID_INIT(0xda9c0b5, 0xbf41, 0x4b78, 0x8f, 0x79, 0x96,
- 0xb1, 0x62, 0x3b, 0x3f, 0x17),
- [VENDOR_DEBUG_UUID] = UUID_INIT(0xe1819d9, 0x11a9, 0x400c, 0x81, 0x1f,
- 0xd6, 0x07, 0x19, 0x40, 0x3d, 0x86),
-};
-
-/**
- * struct cxl_mem_command - Driver representation of a memory device command
- * @info: Command information as it exists for the UAPI
- * @opcode: The actual bits used for the mailbox protocol
- * @flags: Set of flags effecting driver behavior.
- *
- * * %CXL_CMD_FLAG_FORCE_ENABLE: In cases of error, commands with this flag
- * will be enabled by the driver regardless of what hardware may have
- * advertised.
- *
- * The cxl_mem_command is the driver's internal representation of commands that
- * are supported by the driver. Some of these commands may not be supported by
- * the hardware. The driver will use @info to validate the fields passed in by
- * the user then submit the @opcode to the hardware.
- *
- * See struct cxl_command_info.
- */
-struct cxl_mem_command {
- struct cxl_command_info info;
- enum opcode opcode;
- u32 flags;
-#define CXL_CMD_FLAG_NONE 0
-#define CXL_CMD_FLAG_FORCE_ENABLE BIT(0)
-};
-
-#define CXL_CMD(_id, sin, sout, _flags) \
- [CXL_MEM_COMMAND_ID_##_id] = { \
- .info = { \
- .id = CXL_MEM_COMMAND_ID_##_id, \
- .size_in = sin, \
- .size_out = sout, \
- }, \
- .opcode = CXL_MBOX_OP_##_id, \
- .flags = _flags, \
- }
-
-/*
- * This table defines the supported mailbox commands for the driver. This table
- * is made up of a UAPI structure. Non-negative values as parameters in the
- * table will be validated against the user's input. For example, if size_in is
- * 0, and the user passed in 1, it is an error.
- */
-static struct cxl_mem_command mem_commands[CXL_MEM_COMMAND_ID_MAX] = {
- CXL_CMD(IDENTIFY, 0, 0x43, CXL_CMD_FLAG_FORCE_ENABLE),
-#ifdef CONFIG_CXL_MEM_RAW_COMMANDS
- CXL_CMD(RAW, ~0, ~0, 0),
-#endif
- CXL_CMD(GET_SUPPORTED_LOGS, 0, ~0, CXL_CMD_FLAG_FORCE_ENABLE),
- CXL_CMD(GET_FW_INFO, 0, 0x50, 0),
- CXL_CMD(GET_PARTITION_INFO, 0, 0x20, 0),
- CXL_CMD(GET_LSA, 0x8, ~0, 0),
- CXL_CMD(GET_HEALTH_INFO, 0, 0x12, 0),
- CXL_CMD(GET_LOG, 0x18, ~0, CXL_CMD_FLAG_FORCE_ENABLE),
- CXL_CMD(SET_PARTITION_INFO, 0x0a, 0, 0),
- CXL_CMD(SET_LSA, ~0, 0, 0),
- CXL_CMD(GET_ALERT_CONFIG, 0, 0x10, 0),
- CXL_CMD(SET_ALERT_CONFIG, 0xc, 0, 0),
- CXL_CMD(GET_SHUTDOWN_STATE, 0, 0x1, 0),
- CXL_CMD(SET_SHUTDOWN_STATE, 0x1, 0, 0),
- CXL_CMD(GET_POISON, 0x10, ~0, 0),
- CXL_CMD(INJECT_POISON, 0x8, 0, 0),
- CXL_CMD(CLEAR_POISON, 0x48, 0, 0),
- CXL_CMD(GET_SCAN_MEDIA_CAPS, 0x10, 0x4, 0),
- CXL_CMD(SCAN_MEDIA, 0x11, 0, 0),
- CXL_CMD(GET_SCAN_MEDIA, 0, ~0, 0),
-};
-
-/*
- * Commands that RAW doesn't permit. The rationale for each:
- *
- * CXL_MBOX_OP_ACTIVATE_FW: Firmware activation requires adjustment /
- * coordination of transaction timeout values at the root bridge level.
- *
- * CXL_MBOX_OP_SET_PARTITION_INFO: The device memory map may change live
- * and needs to be coordinated with HDM updates.
- *
- * CXL_MBOX_OP_SET_LSA: The label storage area may be cached by the
- * driver and any writes from userspace invalidates those contents.
- *
- * CXL_MBOX_OP_SET_SHUTDOWN_STATE: Set shutdown state assumes no writes
- * to the device after it is marked clean, userspace can not make that
- * assertion.
- *
- * CXL_MBOX_OP_[GET_]SCAN_MEDIA: The kernel provides a native error list that
- * is kept up to date with patrol notifications and error management.
- */
-static u16 cxl_disabled_raw_commands[] = {
- CXL_MBOX_OP_ACTIVATE_FW,
- CXL_MBOX_OP_SET_PARTITION_INFO,
- CXL_MBOX_OP_SET_LSA,
- CXL_MBOX_OP_SET_SHUTDOWN_STATE,
- CXL_MBOX_OP_SCAN_MEDIA,
- CXL_MBOX_OP_GET_SCAN_MEDIA,
-};
-
/*
- * Command sets that RAW doesn't permit. All opcodes in this set are
- * disabled because they pass plain text security payloads over the
- * user/kernel boundary. This functionality is intended to be wrapped
- * behind the keys ABI which allows for encrypted payloads in the UAPI
+ * CXL 2.0 ECN "Add Mailbox Ready Time" defines a capability field to
+ * dictate how long to wait for the mailbox to become ready. The new
+ * field allows the device to tell software the amount of time to wait
+ * before the mailbox is ready. Per the spec, this field theoretically allows
+ * for up to 255 seconds. 255 seconds is unreasonably long; it's longer
+ * than the maximum SATA port link recovery wait. Default to 60 seconds
+ * until someone builds a CXL device that needs more time in practice.
*/
-static u8 security_command_sets[] = {
- 0x44, /* Sanitize */
- 0x45, /* Persistent Memory Data-at-rest Security */
- 0x46, /* Security Passthrough */
-};
-
-#define cxl_for_each_cmd(cmd) \
- for ((cmd) = &mem_commands[0]; \
- ((cmd) - mem_commands) < ARRAY_SIZE(mem_commands); (cmd)++)
+static unsigned short mbox_ready_timeout = 60;
+module_param(mbox_ready_timeout, ushort, 0644);
+MODULE_PARM_DESC(mbox_ready_timeout, "seconds to wait for mailbox ready");
-#define cxl_cmd_count ARRAY_SIZE(mem_commands)
-
-static int cxl_mem_wait_for_doorbell(struct cxl_mem *cxlm)
+static int cxl_pci_mbox_wait_for_doorbell(struct cxl_dev_state *cxlds)
{
const unsigned long start = jiffies;
unsigned long end = start;
- while (cxl_doorbell_busy(cxlm)) {
+ while (cxl_doorbell_busy(cxlds)) {
end = jiffies;
if (time_after(end, start + CXL_MAILBOX_TIMEOUT_MS)) {
/* Check again in case preempted before timeout test */
- if (!cxl_doorbell_busy(cxlm))
+ if (!cxl_doorbell_busy(cxlds))
break;
return -ETIMEDOUT;
}
cpu_relax();
}
- dev_dbg(&cxlm->pdev->dev, "Doorbell wait took %dms",
+ dev_dbg(cxlds->dev, "Doorbell wait took %dms",
jiffies_to_msecs(end) - jiffies_to_msecs(start));
return 0;
}
-static bool cxl_is_security_command(u16 opcode)
+#define cxl_err(dev, status, msg) \
+ dev_err_ratelimited(dev, msg ", device state %s%s\n", \
+ status & CXLMDEV_DEV_FATAL ? " fatal" : "", \
+ status & CXLMDEV_FW_HALT ? " firmware-halt" : "")
+
+#define cxl_cmd_err(dev, cmd, status, msg) \
+ dev_err_ratelimited(dev, msg " (opcode: %#x), device state %s%s\n", \
+ (cmd)->opcode, \
+ status & CXLMDEV_DEV_FATAL ? " fatal" : "", \
+ status & CXLMDEV_FW_HALT ? " firmware-halt" : "")
+
+/*
+ * Threaded irq dev_id's must be globally unique. cxl_dev_id provides a unique
+ * wrapper object for each irq within the same cxlds.
+ */
+struct cxl_dev_id {
+ struct cxl_dev_state *cxlds;
+};
+
+static int cxl_request_irq(struct cxl_dev_state *cxlds, int irq,
+ irq_handler_t thread_fn)
+{
+ struct device *dev = cxlds->dev;
+ struct cxl_dev_id *dev_id;
+
+ dev_id = devm_kzalloc(dev, sizeof(*dev_id), GFP_KERNEL);
+ if (!dev_id)
+ return -ENOMEM;
+ dev_id->cxlds = cxlds;
+
+ return devm_request_threaded_irq(dev, irq, NULL, thread_fn,
+ IRQF_SHARED | IRQF_ONESHOT, NULL,
+ dev_id);
+}
+
+static bool cxl_mbox_background_complete(struct cxl_dev_state *cxlds)
{
- int i;
+ u64 reg;
- for (i = 0; i < ARRAY_SIZE(security_command_sets); i++)
- if (security_command_sets[i] == (opcode >> 8))
- return true;
- return false;
+ reg = readq(cxlds->regs.mbox + CXLDEV_MBOX_BG_CMD_STATUS_OFFSET);
+ return FIELD_GET(CXLDEV_MBOX_BG_CMD_COMMAND_PCT_MASK, reg) == 100;
}
-static void cxl_mem_mbox_timeout(struct cxl_mem *cxlm,
- struct mbox_cmd *mbox_cmd)
+static irqreturn_t cxl_pci_mbox_irq(int irq, void *id)
{
- struct device *dev = &cxlm->pdev->dev;
+ u64 reg;
+ u16 opcode;
+ struct cxl_dev_id *dev_id = id;
+ struct cxl_dev_state *cxlds = dev_id->cxlds;
+ struct cxl_mailbox *cxl_mbox = &cxlds->cxl_mbox;
+ struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
+
+ if (!cxl_mbox_background_complete(cxlds))
+ return IRQ_NONE;
+
+ reg = readq(cxlds->regs.mbox + CXLDEV_MBOX_BG_CMD_STATUS_OFFSET);
+ opcode = FIELD_GET(CXLDEV_MBOX_BG_CMD_COMMAND_OPCODE_MASK, reg);
+ if (opcode == CXL_MBOX_OP_SANITIZE) {
+ mutex_lock(&cxl_mbox->mbox_mutex);
+ if (mds->security.sanitize_node)
+ mod_delayed_work(system_percpu_wq, &mds->security.poll_dwork, 0);
+ mutex_unlock(&cxl_mbox->mbox_mutex);
+ } else {
+ /* short-circuit the wait in __cxl_pci_mbox_send_cmd() */
+ rcuwait_wake_up(&cxl_mbox->mbox_wait);
+ }
- dev_dbg(dev, "Mailbox command (opcode: %#x size: %zub) timed out\n",
- mbox_cmd->opcode, mbox_cmd->size_in);
+ return IRQ_HANDLED;
+}
+
+/*
+ * Sanitization operation polling mode.
+ */
+static void cxl_mbox_sanitize_work(struct work_struct *work)
+{
+ struct cxl_memdev_state *mds =
+ container_of(work, typeof(*mds), security.poll_dwork.work);
+ struct cxl_dev_state *cxlds = &mds->cxlds;
+ struct cxl_mailbox *cxl_mbox = &cxlds->cxl_mbox;
+
+ mutex_lock(&cxl_mbox->mbox_mutex);
+ if (cxl_mbox_background_complete(cxlds)) {
+ mds->security.poll_tmo_secs = 0;
+ if (mds->security.sanitize_node)
+ sysfs_notify_dirent(mds->security.sanitize_node);
+ mds->security.sanitize_active = false;
+
+ dev_dbg(cxlds->dev, "Sanitization operation ended\n");
+ } else {
+ int timeout = mds->security.poll_tmo_secs + 10;
+
+ mds->security.poll_tmo_secs = min(15 * 60, timeout);
+ schedule_delayed_work(&mds->security.poll_dwork, timeout * HZ);
+ }
+ mutex_unlock(&cxl_mbox->mbox_mutex);
}
/**
- * __cxl_mem_mbox_send_cmd() - Execute a mailbox command
- * @cxlm: The CXL memory device to communicate with.
+ * __cxl_pci_mbox_send_cmd() - Execute a mailbox command
+ * @cxl_mbox: CXL mailbox context
* @mbox_cmd: Command to send to the memory device.
*
* Context: Any context. Expects mbox_mutex to be held.
@@ -289,15 +195,18 @@ static void cxl_mem_mbox_timeout(struct cxl_mem *cxlm,
* not need to coordinate with each other. The driver only uses the primary
* mailbox.
*/
-static int __cxl_mem_mbox_send_cmd(struct cxl_mem *cxlm,
- struct mbox_cmd *mbox_cmd)
+static int __cxl_pci_mbox_send_cmd(struct cxl_mailbox *cxl_mbox,
+ struct cxl_mbox_cmd *mbox_cmd)
{
- void __iomem *payload = cxlm->regs.mbox + CXLDEV_MBOX_PAYLOAD_OFFSET;
+ struct cxl_dev_state *cxlds = mbox_to_cxlds(cxl_mbox);
+ struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
+ void __iomem *payload = cxlds->regs.mbox + CXLDEV_MBOX_PAYLOAD_OFFSET;
+ struct device *dev = cxlds->dev;
u64 cmd_reg, status_reg;
size_t out_len;
int rc;
- lockdep_assert_held(&cxlm->mbox_mutex);
+ lockdep_assert_held(&cxl_mbox->mbox_mutex);
/*
* Here are the steps from 8.2.8.4 of the CXL 2.0 spec.
@@ -317,12 +226,25 @@ static int __cxl_mem_mbox_send_cmd(struct cxl_mem *cxlm,
*/
/* #1 */
- if (cxl_doorbell_busy(cxlm)) {
- dev_err_ratelimited(&cxlm->pdev->dev,
- "Mailbox re-busy after acquiring\n");
+ if (cxl_doorbell_busy(cxlds)) {
+ u64 md_status =
+ readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET);
+
+ cxl_cmd_err(cxlds->dev, mbox_cmd, md_status,
+ "mailbox queue busy");
return -EBUSY;
}
+ /*
+ * With sanitize polling, hardware might be done and the poller still
+	 * not be in sync. Ensure no new command comes in until the poller has
+	 * caught up. Keep the hardware semantics and only allow device health
+	 * status queries.
+ */
+ if (mds->security.poll_tmo_secs > 0) {
+ if (mbox_cmd->opcode != CXL_MBOX_OP_GET_HEALTH_INFO)
+ return -EBUSY;
+ }
+
cmd_reg = FIELD_PREP(CXLDEV_MBOX_CMD_COMMAND_OPCODE_MASK,
mbox_cmd->opcode);
if (mbox_cmd->size_in) {
@@ -335,32 +257,100 @@ static int __cxl_mem_mbox_send_cmd(struct cxl_mem *cxlm,
}
/* #2, #3 */
- writeq(cmd_reg, cxlm->regs.mbox + CXLDEV_MBOX_CMD_OFFSET);
+ writeq(cmd_reg, cxlds->regs.mbox + CXLDEV_MBOX_CMD_OFFSET);
/* #4 */
- dev_dbg(&cxlm->pdev->dev, "Sending command\n");
+ dev_dbg(dev, "Sending command: 0x%04x\n", mbox_cmd->opcode);
writel(CXLDEV_MBOX_CTRL_DOORBELL,
- cxlm->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET);
+ cxlds->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET);
/* #5 */
- rc = cxl_mem_wait_for_doorbell(cxlm);
+ rc = cxl_pci_mbox_wait_for_doorbell(cxlds);
if (rc == -ETIMEDOUT) {
- cxl_mem_mbox_timeout(cxlm, mbox_cmd);
+ u64 md_status = readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET);
+
+ cxl_cmd_err(cxlds->dev, mbox_cmd, md_status, "mailbox timeout");
return rc;
}
/* #6 */
- status_reg = readq(cxlm->regs.mbox + CXLDEV_MBOX_STATUS_OFFSET);
+ status_reg = readq(cxlds->regs.mbox + CXLDEV_MBOX_STATUS_OFFSET);
mbox_cmd->return_code =
FIELD_GET(CXLDEV_MBOX_STATUS_RET_CODE_MASK, status_reg);
- if (mbox_cmd->return_code != 0) {
- dev_dbg(&cxlm->pdev->dev, "Mailbox operation had an error\n");
- return 0;
+ /*
+ * Handle the background command in a synchronous manner.
+ *
+ * All other mailbox commands will serialize/queue on the mbox_mutex,
+	 * which we currently hold. Furthermore, this guarantees that
+ * cxl_mbox_background_complete() checks are safe amongst each other,
+ * in that no new bg operation can occur in between.
+ *
+ * Background operations are timesliced in accordance with the nature
+ * of the command. In the event of timeout, the mailbox state is
+ * indeterminate until the next successful command submission and the
+ * driver can get back in sync with the hardware state.
+ */
+ if (mbox_cmd->return_code == CXL_MBOX_CMD_RC_BACKGROUND) {
+ u64 bg_status_reg;
+ int i, timeout;
+
+ /*
+ * Sanitization is a special case which monopolizes the device
+ * and cannot be timesliced. Handle asynchronously instead,
+ * and allow userspace to poll(2) for completion.
+ */
+ if (mbox_cmd->opcode == CXL_MBOX_OP_SANITIZE) {
+ if (mds->security.sanitize_active)
+ return -EBUSY;
+
+ /* give first timeout a second */
+ timeout = 1;
+ mds->security.poll_tmo_secs = timeout;
+ mds->security.sanitize_active = true;
+ schedule_delayed_work(&mds->security.poll_dwork,
+ timeout * HZ);
+ dev_dbg(dev, "Sanitization operation started\n");
+ goto success;
+ }
+
+ dev_dbg(dev, "Mailbox background operation (0x%04x) started\n",
+ mbox_cmd->opcode);
+
+ timeout = mbox_cmd->poll_interval_ms;
+ for (i = 0; i < mbox_cmd->poll_count; i++) {
+ if (rcuwait_wait_event_timeout(&cxl_mbox->mbox_wait,
+ cxl_mbox_background_complete(cxlds),
+ TASK_UNINTERRUPTIBLE,
+ msecs_to_jiffies(timeout)) > 0)
+ break;
+ }
+
+ if (!cxl_mbox_background_complete(cxlds)) {
+ dev_err(dev, "timeout waiting for background (%d ms)\n",
+ timeout * mbox_cmd->poll_count);
+ return -ETIMEDOUT;
+ }
+
+ bg_status_reg = readq(cxlds->regs.mbox +
+ CXLDEV_MBOX_BG_CMD_STATUS_OFFSET);
+ mbox_cmd->return_code =
+ FIELD_GET(CXLDEV_MBOX_BG_CMD_COMMAND_RC_MASK,
+ bg_status_reg);
+ dev_dbg(dev,
+ "Mailbox background operation (0x%04x) completed\n",
+ mbox_cmd->opcode);
+ }
+
+ if (mbox_cmd->return_code != CXL_MBOX_CMD_RC_SUCCESS) {
+ dev_dbg(dev, "Mailbox operation had an error: %s\n",
+ cxl_mbox_cmd_rc2str(mbox_cmd));
+ return 0; /* completed but caller must check return_code */
}
+success:
/* #7 */
- cmd_reg = readq(cxlm->regs.mbox + CXLDEV_MBOX_CMD_OFFSET);
+ cmd_reg = readq(cxlds->regs.mbox + CXLDEV_MBOX_CMD_OFFSET);
out_len = FIELD_GET(CXLDEV_MBOX_CMD_PAYLOAD_LENGTH_MASK, cmd_reg);
/* #8 */
@@ -372,8 +362,9 @@ static int __cxl_mem_mbox_send_cmd(struct cxl_mem *cxlm,
* have requested less data than the hardware supplied even
* within spec.
*/
- size_t n = min3(mbox_cmd->size_out, cxlm->payload_size, out_len);
+ size_t n;
+ n = min3(mbox_cmd->size_out, cxl_mbox->payload_size, out_len);
memcpy_fromio(mbox_cmd->payload_out, payload, n);
mbox_cmd->size_out = n;
} else {
@@ -383,1299 +374,833 @@ static int __cxl_mem_mbox_send_cmd(struct cxl_mem *cxlm,
return 0;
}
-/**
- * cxl_mem_mbox_get() - Acquire exclusive access to the mailbox.
- * @cxlm: The memory device to gain access to.
- *
- * Context: Any context. Takes the mbox_mutex.
- * Return: 0 if exclusive access was acquired.
- */
-static int cxl_mem_mbox_get(struct cxl_mem *cxlm)
+static int cxl_pci_mbox_send(struct cxl_mailbox *cxl_mbox,
+ struct cxl_mbox_cmd *cmd)
{
- struct device *dev = &cxlm->pdev->dev;
- u64 md_status;
int rc;
- mutex_lock_io(&cxlm->mbox_mutex);
-
- /*
- * XXX: There is some amount of ambiguity in the 2.0 version of the spec
- * around the mailbox interface ready (8.2.8.5.1.1). The purpose of the
- * bit is to allow firmware running on the device to notify the driver
- * that it's ready to receive commands. It is unclear if the bit needs
- * to be read for each transaction mailbox, ie. the firmware can switch
- * it on and off as needed. Second, there is no defined timeout for
- * mailbox ready, like there is for the doorbell interface.
- *
- * Assumptions:
- * 1. The firmware might toggle the Mailbox Interface Ready bit, check
- * it for every command.
- *
- * 2. If the doorbell is clear, the firmware should have first set the
- * Mailbox Interface Ready bit. Therefore, waiting for the doorbell
- * to be ready is sufficient.
- */
- rc = cxl_mem_wait_for_doorbell(cxlm);
- if (rc) {
- dev_warn(dev, "Mailbox interface not ready\n");
- goto out;
- }
-
- md_status = readq(cxlm->regs.memdev + CXLMDEV_STATUS_OFFSET);
- if (!(md_status & CXLMDEV_MBOX_IF_READY && CXLMDEV_READY(md_status))) {
- dev_err(dev, "mbox: reported doorbell ready, but not mbox ready\n");
- rc = -EBUSY;
- goto out;
- }
-
- /*
- * Hardware shouldn't allow a ready status but also have failure bits
- * set. Spit out an error, this should be a bug report
- */
- rc = -EFAULT;
- if (md_status & CXLMDEV_DEV_FATAL) {
- dev_err(dev, "mbox: reported ready, but fatal\n");
- goto out;
- }
- if (md_status & CXLMDEV_FW_HALT) {
- dev_err(dev, "mbox: reported ready, but halted\n");
- goto out;
- }
- if (CXLMDEV_RESET_NEEDED(md_status)) {
- dev_err(dev, "mbox: reported ready, but reset needed\n");
- goto out;
- }
+ mutex_lock(&cxl_mbox->mbox_mutex);
+ rc = __cxl_pci_mbox_send_cmd(cxl_mbox, cmd);
+ mutex_unlock(&cxl_mbox->mbox_mutex);
- /* with lock held */
- return 0;
-
-out:
- mutex_unlock(&cxlm->mbox_mutex);
return rc;
}
-/**
- * cxl_mem_mbox_put() - Release exclusive access to the mailbox.
- * @cxlm: The CXL memory device to communicate with.
- *
- * Context: Any context. Expects mbox_mutex to be held.
- */
-static void cxl_mem_mbox_put(struct cxl_mem *cxlm)
+static int cxl_pci_setup_mailbox(struct cxl_memdev_state *mds, bool irq_avail)
{
- mutex_unlock(&cxlm->mbox_mutex);
-}
+ struct cxl_dev_state *cxlds = &mds->cxlds;
+ struct cxl_mailbox *cxl_mbox = &cxlds->cxl_mbox;
+ const int cap = readl(cxlds->regs.mbox + CXLDEV_MBOX_CAPS_OFFSET);
+ struct device *dev = cxlds->dev;
+ unsigned long timeout;
+ int irq, msgnum;
+ u64 md_status;
+ u32 ctrl;
-/**
- * handle_mailbox_cmd_from_user() - Dispatch a mailbox command for userspace.
- * @cxlm: The CXL memory device to communicate with.
- * @cmd: The validated command.
- * @in_payload: Pointer to userspace's input payload.
- * @out_payload: Pointer to userspace's output payload.
- * @size_out: (Input) Max payload size to copy out.
- * (Output) Payload size hardware generated.
- * @retval: Hardware generated return code from the operation.
- *
- * Return:
- * * %0 - Mailbox transaction succeeded. This implies the mailbox
- * protocol completed successfully not that the operation itself
- * was successful.
- * * %-ENOMEM - Couldn't allocate a bounce buffer.
- * * %-EFAULT - Something happened with copy_to/from_user.
- * * %-EINTR - Mailbox acquisition interrupted.
- * * %-EXXX - Transaction level failures.
- *
- * Creates the appropriate mailbox command and dispatches it on behalf of a
- * userspace request. The input and output payloads are copied between
- * userspace.
- *
- * See cxl_send_cmd().
- */
-static int handle_mailbox_cmd_from_user(struct cxl_mem *cxlm,
- const struct cxl_mem_command *cmd,
- u64 in_payload, u64 out_payload,
- s32 *size_out, u32 *retval)
-{
- struct device *dev = &cxlm->pdev->dev;
- struct mbox_cmd mbox_cmd = {
- .opcode = cmd->opcode,
- .size_in = cmd->info.size_in,
- .size_out = cmd->info.size_out,
- };
- int rc;
+ timeout = jiffies + mbox_ready_timeout * HZ;
+ do {
+ md_status = readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET);
+ if (md_status & CXLMDEV_MBOX_IF_READY)
+ break;
+ if (msleep_interruptible(100))
+ break;
+ } while (!time_after(jiffies, timeout));
- if (cmd->info.size_out) {
- mbox_cmd.payload_out = kvzalloc(cmd->info.size_out, GFP_KERNEL);
- if (!mbox_cmd.payload_out)
- return -ENOMEM;
+ if (!(md_status & CXLMDEV_MBOX_IF_READY)) {
+ cxl_err(dev, md_status, "timeout awaiting mailbox ready");
+ return -ETIMEDOUT;
}
- if (cmd->info.size_in) {
- mbox_cmd.payload_in = vmemdup_user(u64_to_user_ptr(in_payload),
- cmd->info.size_in);
- if (IS_ERR(mbox_cmd.payload_in)) {
- kvfree(mbox_cmd.payload_out);
- return PTR_ERR(mbox_cmd.payload_in);
- }
+ /*
+	 * A command may be in flight from a previous driver instance
+	 * (think kexec), so do one doorbell wait here so that
+	 * __cxl_pci_mbox_send_cmd() can assume that it is the only
+ * source for future doorbell busy events.
+ */
+ if (cxl_pci_mbox_wait_for_doorbell(cxlds) != 0) {
+ cxl_err(dev, md_status, "timeout awaiting mailbox idle");
+ return -ETIMEDOUT;
}
- rc = cxl_mem_mbox_get(cxlm);
- if (rc)
- goto out;
-
- dev_dbg(dev,
- "Submitting %s command for user\n"
- "\topcode: %x\n"
- "\tsize: %ub\n",
- cxl_command_names[cmd->info.id].name, mbox_cmd.opcode,
- cmd->info.size_in);
-
- dev_WARN_ONCE(dev, cmd->info.id == CXL_MEM_COMMAND_ID_RAW,
- "raw command path used\n");
-
- rc = __cxl_mem_mbox_send_cmd(cxlm, &mbox_cmd);
- cxl_mem_mbox_put(cxlm);
- if (rc)
- goto out;
+ cxl_mbox->mbox_send = cxl_pci_mbox_send;
+ cxl_mbox->payload_size =
+ 1 << FIELD_GET(CXLDEV_MBOX_CAP_PAYLOAD_SIZE_MASK, cap);
/*
- * @size_out contains the max size that's allowed to be written back out
- * to userspace. While the payload may have written more output than
- * this it will have to be ignored.
+ * CXL 2.0 8.2.8.4.3 Mailbox Capabilities Register
+ *
+ * If the size is too small, mandatory commands will not work and so
+ * there's no point in going forward. If the size is too large, there's
+	 * no harm in soft-limiting it.
*/
- if (mbox_cmd.size_out) {
- dev_WARN_ONCE(dev, mbox_cmd.size_out > *size_out,
- "Invalid return size\n");
- if (copy_to_user(u64_to_user_ptr(out_payload),
- mbox_cmd.payload_out, mbox_cmd.size_out)) {
- rc = -EFAULT;
- goto out;
- }
+ cxl_mbox->payload_size = min_t(size_t, cxl_mbox->payload_size, SZ_1M);
+ if (cxl_mbox->payload_size < 256) {
+ dev_err(dev, "Mailbox is too small (%zub)",
+ cxl_mbox->payload_size);
+ return -ENXIO;
}
- *size_out = mbox_cmd.size_out;
- *retval = mbox_cmd.return_code;
+ dev_dbg(dev, "Mailbox payload sized %zu", cxl_mbox->payload_size);
-out:
- kvfree(mbox_cmd.payload_in);
- kvfree(mbox_cmd.payload_out);
- return rc;
-}
-
-static bool cxl_mem_raw_command_allowed(u16 opcode)
-{
- int i;
+ INIT_DELAYED_WORK(&mds->security.poll_dwork, cxl_mbox_sanitize_work);
- if (!IS_ENABLED(CONFIG_CXL_MEM_RAW_COMMANDS))
- return false;
-
- if (security_locked_down(LOCKDOWN_NONE))
- return false;
+ /* background command interrupts are optional */
+ if (!(cap & CXLDEV_MBOX_CAP_BG_CMD_IRQ) || !irq_avail)
+ return 0;
- if (cxl_raw_allow_all)
- return true;
+ msgnum = FIELD_GET(CXLDEV_MBOX_CAP_IRQ_MSGNUM_MASK, cap);
+ irq = pci_irq_vector(to_pci_dev(cxlds->dev), msgnum);
+ if (irq < 0)
+ return 0;
- if (cxl_is_security_command(opcode))
- return false;
+ if (cxl_request_irq(cxlds, irq, cxl_pci_mbox_irq))
+ return 0;
- for (i = 0; i < ARRAY_SIZE(cxl_disabled_raw_commands); i++)
- if (cxl_disabled_raw_commands[i] == opcode)
- return false;
+ dev_dbg(cxlds->dev, "Mailbox interrupts enabled\n");
+ /* enable background command mbox irq support */
+ ctrl = readl(cxlds->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET);
+ ctrl |= CXLDEV_MBOX_CTRL_BG_CMD_IRQ;
+ writel(ctrl, cxlds->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET);
- return true;
+ return 0;
}
-/**
- * cxl_validate_cmd_from_user() - Check fields for CXL_MEM_SEND_COMMAND.
- * @cxlm: &struct cxl_mem device whose mailbox will be used.
- * @send_cmd: &struct cxl_send_command copied in from userspace.
- * @out_cmd: Sanitized and populated &struct cxl_mem_command.
- *
- * Return:
- * * %0 - @out_cmd is ready to send.
- * * %-ENOTTY - Invalid command specified.
- * * %-EINVAL - Reserved fields or invalid values were used.
- * * %-ENOMEM - Input or output buffer wasn't sized properly.
- * * %-EPERM - Attempted to use a protected command.
- *
- * The result of this command is a fully validated command in @out_cmd that is
- * safe to send to the hardware.
- *
- * See handle_mailbox_cmd_from_user()
+/*
+ * Assume that any RCIEP that emits the CXL memory expander class code
+ * is an RCD
*/
-static int cxl_validate_cmd_from_user(struct cxl_mem *cxlm,
- const struct cxl_send_command *send_cmd,
- struct cxl_mem_command *out_cmd)
+static bool is_cxl_restricted(struct pci_dev *pdev)
{
- const struct cxl_command_info *info;
- struct cxl_mem_command *c;
-
- if (send_cmd->id == 0 || send_cmd->id >= CXL_MEM_COMMAND_ID_MAX)
- return -ENOTTY;
-
- /*
- * The user can never specify an input payload larger than what hardware
- * supports, but output can be arbitrarily large (simply write out as
- * much data as the hardware provides).
- */
- if (send_cmd->in.size > cxlm->payload_size)
- return -EINVAL;
-
- /*
- * Checks are bypassed for raw commands but a WARN/taint will occur
- * later in the callchain
- */
- if (send_cmd->id == CXL_MEM_COMMAND_ID_RAW) {
- const struct cxl_mem_command temp = {
- .info = {
- .id = CXL_MEM_COMMAND_ID_RAW,
- .flags = 0,
- .size_in = send_cmd->in.size,
- .size_out = send_cmd->out.size,
- },
- .opcode = send_cmd->raw.opcode
- };
-
- if (send_cmd->raw.rsvd)
- return -EINVAL;
-
- /*
- * Unlike supported commands, the output size of RAW commands
- * gets passed along without further checking, so it must be
- * validated here.
- */
- if (send_cmd->out.size > cxlm->payload_size)
- return -EINVAL;
-
- if (!cxl_mem_raw_command_allowed(send_cmd->raw.opcode))
- return -EPERM;
-
- memcpy(out_cmd, &temp, sizeof(temp));
-
- return 0;
- }
-
- if (send_cmd->flags & ~CXL_MEM_COMMAND_FLAG_MASK)
- return -EINVAL;
-
- if (send_cmd->rsvd)
- return -EINVAL;
-
- if (send_cmd->in.rsvd || send_cmd->out.rsvd)
- return -EINVAL;
+ return pci_pcie_type(pdev) == PCI_EXP_TYPE_RC_END;
+}
- /* Convert user's command into the internal representation */
- c = &mem_commands[send_cmd->id];
- info = &c->info;
+static int cxl_rcrb_get_comp_regs(struct pci_dev *pdev,
+ struct cxl_register_map *map,
+ struct cxl_dport *dport)
+{
+ resource_size_t component_reg_phys;
- /* Check that the command is enabled for hardware */
- if (!test_bit(info->id, cxlm->enabled_cmds))
- return -ENOTTY;
+ *map = (struct cxl_register_map) {
+ .host = &pdev->dev,
+ .resource = CXL_RESOURCE_NONE,
+ };
- /* Check the input buffer is the expected size */
- if (info->size_in >= 0 && info->size_in != send_cmd->in.size)
- return -ENOMEM;
+ struct cxl_port *port __free(put_cxl_port) =
+ cxl_pci_find_port(pdev, &dport);
+ if (!port)
+ return -EPROBE_DEFER;
- /* Check the output buffer is at least large enough */
- if (info->size_out >= 0 && send_cmd->out.size < info->size_out)
- return -ENOMEM;
+ component_reg_phys = cxl_rcd_component_reg_phys(&pdev->dev, dport);
+ if (component_reg_phys == CXL_RESOURCE_NONE)
+ return -ENXIO;
- memcpy(out_cmd, c, sizeof(*c));
- out_cmd->info.size_in = send_cmd->in.size;
- /*
- * XXX: out_cmd->info.size_out will be controlled by the driver, and the
- * specified number of bytes @send_cmd->out.size will be copied back out
- * to userspace.
- */
+ map->resource = component_reg_phys;
+ map->reg_type = CXL_REGLOC_RBI_COMPONENT;
+ map->max_size = CXL_COMPONENT_REG_BLOCK_SIZE;
return 0;
}
-static int cxl_query_cmd(struct cxl_memdev *cxlmd,
- struct cxl_mem_query_commands __user *q)
+static int cxl_pci_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type,
+ struct cxl_register_map *map)
{
- struct device *dev = &cxlmd->dev;
- struct cxl_mem_command *cmd;
- u32 n_commands;
- int j = 0;
-
- dev_dbg(dev, "Query IOCTL\n");
-
- if (get_user(n_commands, &q->n_commands))
- return -EFAULT;
+ int rc;
- /* returns the total number if 0 elements are requested. */
- if (n_commands == 0)
- return put_user(cxl_cmd_count, &q->n_commands);
+ rc = cxl_find_regblock(pdev, type, map);
/*
- * otherwise, return max(n_commands, total commands) cxl_command_info
- * structures.
+ * If the Register Locator DVSEC does not exist, check if it
+ * is an RCH and try to extract the Component Registers from
+ * an RCRB.
*/
- cxl_for_each_cmd(cmd) {
- const struct cxl_command_info *info = &cmd->info;
+ if (rc && type == CXL_REGLOC_RBI_COMPONENT && is_cxl_restricted(pdev)) {
+ struct cxl_dport *dport;
+ struct cxl_port *port __free(put_cxl_port) =
+ cxl_pci_find_port(pdev, &dport);
+ if (!port)
+ return -EPROBE_DEFER;
+
+ rc = cxl_rcrb_get_comp_regs(pdev, map, dport);
+ if (rc)
+ return rc;
- if (copy_to_user(&q->commands[j++], info, sizeof(*info)))
- return -EFAULT;
+ rc = cxl_dport_map_rcd_linkcap(pdev, dport);
+ if (rc)
+ return rc;
- if (j == n_commands)
- break;
+ } else if (rc) {
+ return rc;
}
- return 0;
+ return cxl_setup_regs(map);
}
-static int cxl_send_cmd(struct cxl_memdev *cxlmd,
- struct cxl_send_command __user *s)
+static int cxl_pci_ras_unmask(struct pci_dev *pdev)
{
- struct cxl_mem *cxlm = cxlmd->cxlm;
- struct device *dev = &cxlmd->dev;
- struct cxl_send_command send;
- struct cxl_mem_command c;
+ struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
+ void __iomem *addr;
+ u32 orig_val, val, mask;
+ u16 cap;
int rc;
- dev_dbg(dev, "Send IOCTL\n");
+ if (!cxlds->regs.ras) {
+ dev_dbg(&pdev->dev, "No RAS registers.\n");
+ return 0;
+ }
- if (copy_from_user(&send, s, sizeof(send)))
- return -EFAULT;
+ /* BIOS has PCIe AER error control */
+ if (!pcie_aer_is_native(pdev))
+ return 0;
- rc = cxl_validate_cmd_from_user(cxlmd->cxlm, &send, &c);
+ rc = pcie_capability_read_word(pdev, PCI_EXP_DEVCTL, &cap);
if (rc)
return rc;
- /* Prepare to handle a full payload for variable sized output */
- if (c.info.size_out < 0)
- c.info.size_out = cxlm->payload_size;
-
- rc = handle_mailbox_cmd_from_user(cxlm, &c, send.in.payload,
- send.out.payload, &send.out.size,
- &send.retval);
- if (rc)
- return rc;
+ if (cap & PCI_EXP_DEVCTL_URRE) {
+ addr = cxlds->regs.ras + CXL_RAS_UNCORRECTABLE_MASK_OFFSET;
+ orig_val = readl(addr);
+
+ mask = CXL_RAS_UNCORRECTABLE_MASK_MASK |
+ CXL_RAS_UNCORRECTABLE_MASK_F256B_MASK;
+ val = orig_val & ~mask;
+ writel(val, addr);
+ dev_dbg(&pdev->dev,
+ "Uncorrectable RAS Errors Mask: %#x -> %#x\n",
+ orig_val, val);
+ }
- if (copy_to_user(s, &send, sizeof(send)))
- return -EFAULT;
+ if (cap & PCI_EXP_DEVCTL_CERE) {
+ addr = cxlds->regs.ras + CXL_RAS_CORRECTABLE_MASK_OFFSET;
+ orig_val = readl(addr);
+ val = orig_val & ~CXL_RAS_CORRECTABLE_MASK_MASK;
+ writel(val, addr);
+ dev_dbg(&pdev->dev, "Correctable RAS Errors Mask: %#x -> %#x\n",
+ orig_val, val);
+ }
return 0;
}
-static long __cxl_memdev_ioctl(struct cxl_memdev *cxlmd, unsigned int cmd,
- unsigned long arg)
+static void free_event_buf(void *buf)
{
- switch (cmd) {
- case CXL_MEM_QUERY_COMMANDS:
- return cxl_query_cmd(cxlmd, (void __user *)arg);
- case CXL_MEM_SEND_COMMAND:
- return cxl_send_cmd(cxlmd, (void __user *)arg);
- default:
- return -ENOTTY;
- }
+ kvfree(buf);
}
-static long cxl_memdev_ioctl(struct file *file, unsigned int cmd,
- unsigned long arg)
+/*
+ * There is a single buffer for reading event logs from the mailbox. All logs
+ * share this buffer protected by the mds->event_log_lock.
+ */
+static int cxl_mem_alloc_event_buf(struct cxl_memdev_state *mds)
{
- struct cxl_memdev *cxlmd = file->private_data;
- int rc = -ENXIO;
+ struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
+ struct cxl_get_event_payload *buf;
- down_read(&cxl_memdev_rwsem);
- if (cxlmd->cxlm)
- rc = __cxl_memdev_ioctl(cxlmd, cmd, arg);
- up_read(&cxl_memdev_rwsem);
+ buf = kvmalloc(cxl_mbox->payload_size, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+ mds->event.buf = buf;
- return rc;
+ return devm_add_action_or_reset(mds->cxlds.dev, free_event_buf, buf);
}
-static int cxl_memdev_open(struct inode *inode, struct file *file)
+static bool cxl_alloc_irq_vectors(struct pci_dev *pdev)
{
- struct cxl_memdev *cxlmd =
- container_of(inode->i_cdev, typeof(*cxlmd), cdev);
-
- get_device(&cxlmd->dev);
- file->private_data = cxlmd;
+ int nvecs;
- return 0;
+ /*
+	 * Per CXL 3.0 3.1.1 CXL.io Endpoint, a function on a CXL device must
+	 * not generate INTx messages if that function participates in
+	 * CXL.cache or CXL.mem.
+	 *
+	 * Additionally, pci_alloc_irq_vectors() arranges for
+	 * pci_free_irq_vectors() to be called automatically despite not
+	 * being a pcim_* helper. See pci_setup_msi_context().
+ */
+ nvecs = pci_alloc_irq_vectors(pdev, 1, CXL_PCI_DEFAULT_MAX_VECTORS,
+ PCI_IRQ_MSIX | PCI_IRQ_MSI);
+ if (nvecs < 1) {
+ dev_dbg(&pdev->dev, "Failed to alloc irq vectors: %d\n", nvecs);
+ return false;
+ }
+ return true;
}
-static int cxl_memdev_release_file(struct inode *inode, struct file *file)
+static irqreturn_t cxl_event_thread(int irq, void *id)
{
- struct cxl_memdev *cxlmd =
- container_of(inode->i_cdev, typeof(*cxlmd), cdev);
+ struct cxl_dev_id *dev_id = id;
+ struct cxl_dev_state *cxlds = dev_id->cxlds;
+ struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
+ u32 status;
- put_device(&cxlmd->dev);
+ do {
+ /*
+ * CXL 3.0 8.2.8.3.1: The lower 32 bits are the status;
+ * ignore the reserved upper 32 bits
+ */
+ status = readl(cxlds->regs.status + CXLDEV_DEV_EVENT_STATUS_OFFSET);
+ /* Ignore logs unknown to the driver */
+ status &= CXLDEV_EVENT_STATUS_ALL;
+ if (!status)
+ break;
+ cxl_mem_get_event_records(mds, status);
+ cond_resched();
+ } while (status);
- return 0;
+ return IRQ_HANDLED;
}
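For reference, the device event status register consumed above packs one pending bit per event log. A minimal decode sketch, assuming the per-log CXLDEV_EVENT_STATUS_* single-bit definitions that make up CXLDEV_EVENT_STATUS_ALL in cxlmem.h (the helper name is illustrative and not part of this patch):

static void cxl_event_status_decode_sketch(u32 status)
{
	/* One pending bit per event log; macro names assumed from cxlmem.h */
	if (status & CXLDEV_EVENT_STATUS_INFO)
		pr_debug("Informational event log has pending records\n");
	if (status & CXLDEV_EVENT_STATUS_WARN)
		pr_debug("Warning event log has pending records\n");
	if (status & CXLDEV_EVENT_STATUS_FAIL)
		pr_debug("Failure event log has pending records\n");
	if (status & CXLDEV_EVENT_STATUS_FATAL)
		pr_debug("Fatal event log has pending records\n");
}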
-static const struct file_operations cxl_memdev_fops = {
- .owner = THIS_MODULE,
- .unlocked_ioctl = cxl_memdev_ioctl,
- .open = cxl_memdev_open,
- .release = cxl_memdev_release_file,
- .compat_ioctl = compat_ptr_ioctl,
- .llseek = noop_llseek,
-};
-
-static inline struct cxl_mem_command *cxl_mem_find_command(u16 opcode)
+static int cxl_event_req_irq(struct cxl_dev_state *cxlds, u8 setting)
{
- struct cxl_mem_command *c;
+ struct pci_dev *pdev = to_pci_dev(cxlds->dev);
+ int irq;
- cxl_for_each_cmd(c)
- if (c->opcode == opcode)
- return c;
+ if (FIELD_GET(CXLDEV_EVENT_INT_MODE_MASK, setting) != CXL_INT_MSI_MSIX)
+ return -ENXIO;
+
+ irq = pci_irq_vector(pdev,
+ FIELD_GET(CXLDEV_EVENT_INT_MSGNUM_MASK, setting));
+ if (irq < 0)
+ return irq;
- return NULL;
+ return cxl_request_irq(cxlds, irq, cxl_event_thread);
}
-/**
- * cxl_mem_mbox_send_cmd() - Send a mailbox command to a memory device.
- * @cxlm: The CXL memory device to communicate with.
- * @opcode: Opcode for the mailbox command.
- * @in: The input payload for the mailbox command.
- * @in_size: The length of the input payload
- * @out: Caller allocated buffer for the output.
- * @out_size: Expected size of output.
- *
- * Context: Any context. Will acquire and release mbox_mutex.
- * Return:
- * * %>=0 - Number of bytes returned in @out.
- * * %-E2BIG - Payload is too large for hardware.
- * * %-EBUSY - Couldn't acquire exclusive mailbox access.
- * * %-EFAULT - Hardware error occurred.
- * * %-ENXIO - Command completed, but device reported an error.
- * * %-EIO - Unexpected output size.
- *
- * Mailbox commands may execute successfully yet the device itself reported an
- * error. While this distinction can be useful for commands from userspace, the
- * kernel will only be able to use results when both are successful.
- *
- * See __cxl_mem_mbox_send_cmd()
- */
-static int cxl_mem_mbox_send_cmd(struct cxl_mem *cxlm, u16 opcode,
- void *in, size_t in_size,
- void *out, size_t out_size)
+static int cxl_event_get_int_policy(struct cxl_memdev_state *mds,
+ struct cxl_event_interrupt_policy *policy)
{
- const struct cxl_mem_command *cmd = cxl_mem_find_command(opcode);
- struct mbox_cmd mbox_cmd = {
- .opcode = opcode,
- .payload_in = in,
- .size_in = in_size,
- .size_out = out_size,
- .payload_out = out,
+ struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
+ struct cxl_mbox_cmd mbox_cmd = {
+ .opcode = CXL_MBOX_OP_GET_EVT_INT_POLICY,
+ .payload_out = policy,
+ .size_out = sizeof(*policy),
};
int rc;
- if (out_size > cxlm->payload_size)
- return -E2BIG;
-
- rc = cxl_mem_mbox_get(cxlm);
- if (rc)
- return rc;
-
- rc = __cxl_mem_mbox_send_cmd(cxlm, &mbox_cmd);
- cxl_mem_mbox_put(cxlm);
- if (rc)
- return rc;
-
- /* TODO: Map return code to proper kernel style errno */
- if (mbox_cmd.return_code != CXL_MBOX_SUCCESS)
- return -ENXIO;
-
- /*
- * Variable sized commands can't be validated and so it's up to the
- * caller to do that if they wish.
- */
- if (cmd->info.size_out >= 0 && mbox_cmd.size_out != out_size)
- return -EIO;
+ rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
+ if (rc < 0)
+ dev_err(mds->cxlds.dev,
+			"Failed to get event interrupt policy: %d", rc);
- return 0;
+ return rc;
}
-static int cxl_mem_setup_mailbox(struct cxl_mem *cxlm)
+static int cxl_event_config_msgnums(struct cxl_memdev_state *mds,
+ struct cxl_event_interrupt_policy *policy)
{
- const int cap = readl(cxlm->regs.mbox + CXLDEV_MBOX_CAPS_OFFSET);
+ struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
+ struct cxl_mbox_cmd mbox_cmd;
+ int rc;
- cxlm->payload_size =
- 1 << FIELD_GET(CXLDEV_MBOX_CAP_PAYLOAD_SIZE_MASK, cap);
+ *policy = (struct cxl_event_interrupt_policy) {
+ .info_settings = CXL_INT_MSI_MSIX,
+ .warn_settings = CXL_INT_MSI_MSIX,
+ .failure_settings = CXL_INT_MSI_MSIX,
+ .fatal_settings = CXL_INT_MSI_MSIX,
+ };
- /*
- * CXL 2.0 8.2.8.4.3 Mailbox Capabilities Register
- *
- * If the size is too small, mandatory commands will not work and so
- * there's no point in going forward. If the size is too large, there's
- * no harm is soft limiting it.
- */
- cxlm->payload_size = min_t(size_t, cxlm->payload_size, SZ_1M);
- if (cxlm->payload_size < 256) {
- dev_err(&cxlm->pdev->dev, "Mailbox is too small (%zub)",
- cxlm->payload_size);
- return -ENXIO;
- }
+ mbox_cmd = (struct cxl_mbox_cmd) {
+ .opcode = CXL_MBOX_OP_SET_EVT_INT_POLICY,
+ .payload_in = policy,
+ .size_in = sizeof(*policy),
+ };
- dev_dbg(&cxlm->pdev->dev, "Mailbox payload sized %zu",
- cxlm->payload_size);
+ rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
+ if (rc < 0) {
+		dev_err(mds->cxlds.dev, "Failed to set event interrupt policy: %d",
+ rc);
+ return rc;
+ }
- return 0;
+ /* Retrieve final interrupt settings */
+ return cxl_event_get_int_policy(mds, policy);
}
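Each per-log "settings" byte in the policy above carries an interrupt mode and an MSI/MSI-X message number. A minimal decode sketch, reusing the CXLDEV_EVENT_INT_MODE_MASK / CXLDEV_EVENT_INT_MSGNUM_MASK fields that cxl_event_req_irq() and cxl_event_int_is_fw() rely on (the helper name is illustrative only):

static void cxl_event_setting_decode_sketch(u8 setting)
{
	u8 mode = FIELD_GET(CXLDEV_EVENT_INT_MODE_MASK, setting);
	u8 msgnum = FIELD_GET(CXLDEV_EVENT_INT_MSGNUM_MASK, setting);

	/* CXL_INT_MSI_MSIX is the only mode this driver programs */
	pr_debug("event interrupt mode %u, message number %u\n", mode, msgnum);
}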
-static struct cxl_mem *cxl_mem_create(struct pci_dev *pdev)
+static int cxl_event_irqsetup(struct cxl_memdev_state *mds)
{
- struct device *dev = &pdev->dev;
- struct cxl_mem *cxlm;
+ struct cxl_dev_state *cxlds = &mds->cxlds;
+ struct cxl_event_interrupt_policy policy;
+ int rc;
- cxlm = devm_kzalloc(dev, sizeof(*cxlm), GFP_KERNEL);
- if (!cxlm) {
- dev_err(dev, "No memory available\n");
- return ERR_PTR(-ENOMEM);
- }
+ rc = cxl_event_config_msgnums(mds, &policy);
+ if (rc)
+ return rc;
- mutex_init(&cxlm->mbox_mutex);
- cxlm->pdev = pdev;
- cxlm->enabled_cmds =
- devm_kmalloc_array(dev, BITS_TO_LONGS(cxl_cmd_count),
- sizeof(unsigned long),
- GFP_KERNEL | __GFP_ZERO);
- if (!cxlm->enabled_cmds) {
- dev_err(dev, "No memory available for bitmap\n");
- return ERR_PTR(-ENOMEM);
+ rc = cxl_event_req_irq(cxlds, policy.info_settings);
+ if (rc) {
+ dev_err(cxlds->dev, "Failed to get interrupt for event Info log\n");
+ return rc;
}
- return cxlm;
-}
-
-static void __iomem *cxl_mem_map_regblock(struct cxl_mem *cxlm,
- u8 bar, u64 offset)
-{
- struct pci_dev *pdev = cxlm->pdev;
- struct device *dev = &pdev->dev;
- void __iomem *addr;
-
- /* Basic sanity check that BAR is big enough */
- if (pci_resource_len(pdev, bar) < offset) {
- dev_err(dev, "BAR%d: %pr: too small (offset: %#llx)\n", bar,
- &pdev->resource[bar], (unsigned long long)offset);
- return IOMEM_ERR_PTR(-ENXIO);
+ rc = cxl_event_req_irq(cxlds, policy.warn_settings);
+ if (rc) {
+ dev_err(cxlds->dev, "Failed to get interrupt for event Warn log\n");
+ return rc;
}
- addr = pci_iomap(pdev, bar, 0);
- if (!addr) {
- dev_err(dev, "failed to map registers\n");
- return addr;
+ rc = cxl_event_req_irq(cxlds, policy.failure_settings);
+ if (rc) {
+ dev_err(cxlds->dev, "Failed to get interrupt for event Failure log\n");
+ return rc;
}
- dev_dbg(dev, "Mapped CXL Memory Device resource bar %u @ %#llx\n",
- bar, offset);
+ rc = cxl_event_req_irq(cxlds, policy.fatal_settings);
+ if (rc) {
+ dev_err(cxlds->dev, "Failed to get interrupt for event Fatal log\n");
+ return rc;
+ }
- return addr;
+ return 0;
}
-static void cxl_mem_unmap_regblock(struct cxl_mem *cxlm, void __iomem *base)
+static bool cxl_event_int_is_fw(u8 setting)
{
- pci_iounmap(cxlm->pdev, base);
+ u8 mode = FIELD_GET(CXLDEV_EVENT_INT_MODE_MASK, setting);
+
+ return mode == CXL_INT_FW;
}
-static int cxl_mem_dvsec(struct pci_dev *pdev, int dvsec)
+static int cxl_event_config(struct pci_host_bridge *host_bridge,
+ struct cxl_memdev_state *mds, bool irq_avail)
{
- int pos;
+ struct cxl_event_interrupt_policy policy;
+ int rc;
- pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_DVSEC);
- if (!pos)
+ /*
+ * When BIOS maintains CXL error reporting control, it will process
+ * event records. Only one agent can do so.
+ */
+ if (!host_bridge->native_cxl_error)
return 0;
- while (pos) {
- u16 vendor, id;
+ if (!irq_avail) {
+		dev_info(mds->cxlds.dev, "No interrupt support, disabling event processing.\n");
+ return 0;
+ }
- pci_read_config_word(pdev, pos + PCI_DVSEC_HEADER1, &vendor);
- pci_read_config_word(pdev, pos + PCI_DVSEC_HEADER2, &id);
- if (vendor == PCI_DVSEC_VENDOR_ID_CXL && dvsec == id)
- return pos;
+ rc = cxl_event_get_int_policy(mds, &policy);
+ if (rc)
+ return rc;
- pos = pci_find_next_ext_capability(pdev, pos,
- PCI_EXT_CAP_ID_DVSEC);
+ if (cxl_event_int_is_fw(policy.info_settings) ||
+ cxl_event_int_is_fw(policy.warn_settings) ||
+ cxl_event_int_is_fw(policy.failure_settings) ||
+ cxl_event_int_is_fw(policy.fatal_settings)) {
+ dev_err(mds->cxlds.dev,
+ "FW still in control of Event Logs despite _OSC settings\n");
+ return -EBUSY;
}
- return 0;
-}
-
-static int cxl_probe_regs(struct cxl_mem *cxlm, void __iomem *base,
- struct cxl_register_map *map)
-{
- struct pci_dev *pdev = cxlm->pdev;
- struct device *dev = &pdev->dev;
- struct cxl_component_reg_map *comp_map;
- struct cxl_device_reg_map *dev_map;
-
- switch (map->reg_type) {
- case CXL_REGLOC_RBI_COMPONENT:
- comp_map = &map->component_map;
- cxl_probe_component_regs(dev, base, comp_map);
- if (!comp_map->hdm_decoder.valid) {
- dev_err(dev, "HDM decoder registers not found\n");
- return -ENXIO;
- }
+ rc = cxl_mem_alloc_event_buf(mds);
+ if (rc)
+ return rc;
- dev_dbg(dev, "Set up component registers\n");
- break;
- case CXL_REGLOC_RBI_MEMDEV:
- dev_map = &map->device_map;
- cxl_probe_device_regs(dev, base, dev_map);
- if (!dev_map->status.valid || !dev_map->mbox.valid ||
- !dev_map->memdev.valid) {
- dev_err(dev, "registers not found: %s%s%s\n",
- !dev_map->status.valid ? "status " : "",
- !dev_map->mbox.valid ? "status " : "",
- !dev_map->memdev.valid ? "status " : "");
- return -ENXIO;
- }
+ rc = cxl_event_irqsetup(mds);
+ if (rc)
+ return rc;
- dev_dbg(dev, "Probing device registers...\n");
- break;
- default:
- break;
- }
+ cxl_mem_get_event_records(mds, CXLDEV_EVENT_STATUS_ALL);
return 0;
}
-static int cxl_map_regs(struct cxl_mem *cxlm, struct cxl_register_map *map)
+static int cxl_pci_type3_init_mailbox(struct cxl_dev_state *cxlds)
{
- struct pci_dev *pdev = cxlm->pdev;
- struct device *dev = &pdev->dev;
+ int rc;
- switch (map->reg_type) {
- case CXL_REGLOC_RBI_COMPONENT:
- cxl_map_component_regs(pdev, &cxlm->regs.component, map);
- dev_dbg(dev, "Mapping component registers...\n");
- break;
- case CXL_REGLOC_RBI_MEMDEV:
- cxl_map_device_regs(pdev, &cxlm->regs.device_regs, map);
- dev_dbg(dev, "Probing device registers...\n");
- break;
- default:
- break;
- }
+ /*
+	 * Fail the init if there's no mailbox. For a Type 3 device this is out of spec.
+ */
+ if (!cxlds->reg_map.device_map.mbox.valid)
+ return -ENODEV;
- return 0;
-}
+ rc = cxl_mailbox_init(&cxlds->cxl_mbox, cxlds->dev);
+ if (rc)
+ return rc;
-static void cxl_decode_register_block(u32 reg_lo, u32 reg_hi,
- u8 *bar, u64 *offset, u8 *reg_type)
-{
- *offset = ((u64)reg_hi << 32) | (reg_lo & CXL_REGLOC_ADDR_MASK);
- *bar = FIELD_GET(CXL_REGLOC_BIR_MASK, reg_lo);
- *reg_type = FIELD_GET(CXL_REGLOC_RBI_MASK, reg_lo);
+ return 0;
}
-/**
- * cxl_mem_setup_regs() - Setup necessary MMIO.
- * @cxlm: The CXL memory device to communicate with.
- *
- * Return: 0 if all necessary registers mapped.
- *
- * A memory device is required by spec to implement a certain set of MMIO
- * regions. The purpose of this function is to enumerate and map those
- * registers.
- */
-static int cxl_mem_setup_regs(struct cxl_mem *cxlm)
+static ssize_t rcd_pcie_cap_emit(struct device *dev, u16 offset, char *buf, size_t width)
{
- struct pci_dev *pdev = cxlm->pdev;
- struct device *dev = &pdev->dev;
- u32 regloc_size, regblocks;
- void __iomem *base;
- int regloc, i;
- struct cxl_register_map *map, *n;
- LIST_HEAD(register_maps);
- int ret = 0;
-
- regloc = cxl_mem_dvsec(pdev, PCI_DVSEC_ID_CXL_REGLOC_DVSEC_ID);
- if (!regloc) {
- dev_err(dev, "register location dvsec not found\n");
+ struct cxl_dev_state *cxlds = dev_get_drvdata(dev);
+ struct cxl_memdev *cxlmd = cxlds->cxlmd;
+ struct device *root_dev;
+ struct cxl_dport *dport;
+ struct cxl_port *root __free(put_cxl_port) =
+ cxl_mem_find_port(cxlmd, &dport);
+
+ if (!root)
return -ENXIO;
- }
-
- if (pci_request_mem_regions(pdev, pci_name(pdev)))
- return -ENODEV;
-
- /* Get the size of the Register Locator DVSEC */
- pci_read_config_dword(pdev, regloc + PCI_DVSEC_HEADER1, &regloc_size);
- regloc_size = FIELD_GET(PCI_DVSEC_HEADER1_LENGTH_MASK, regloc_size);
-
- regloc += PCI_DVSEC_ID_CXL_REGLOC_BLOCK1_OFFSET;
- regblocks = (regloc_size - PCI_DVSEC_ID_CXL_REGLOC_BLOCK1_OFFSET) / 8;
-
- for (i = 0; i < regblocks; i++, regloc += 8) {
- u32 reg_lo, reg_hi;
- u8 reg_type;
- u64 offset;
- u8 bar;
-
- map = kzalloc(sizeof(*map), GFP_KERNEL);
- if (!map) {
- ret = -ENOMEM;
- goto free_maps;
- }
-
- list_add(&map->list, &register_maps);
-
- pci_read_config_dword(pdev, regloc, &reg_lo);
- pci_read_config_dword(pdev, regloc + 4, &reg_hi);
-
- cxl_decode_register_block(reg_lo, reg_hi, &bar, &offset,
- &reg_type);
- dev_dbg(dev, "Found register block in bar %u @ 0x%llx of type %u\n",
- bar, offset, reg_type);
-
- base = cxl_mem_map_regblock(cxlm, bar, offset);
- if (!base) {
- ret = -ENOMEM;
- goto free_maps;
- }
-
- map->barno = bar;
- map->block_offset = offset;
- map->reg_type = reg_type;
-
- ret = cxl_probe_regs(cxlm, base + offset, map);
-
- /* Always unmap the regblock regardless of probe success */
- cxl_mem_unmap_regblock(cxlm, base);
-
- if (ret)
- goto free_maps;
- }
+ root_dev = root->uport_dev;
+ if (!root_dev)
+ return -ENXIO;
- pci_release_mem_regions(pdev);
+ if (!dport->regs.rcd_pcie_cap)
+ return -ENXIO;
- list_for_each_entry(map, &register_maps, list) {
- ret = cxl_map_regs(cxlm, map);
- if (ret)
- goto free_maps;
- }
+ guard(device)(root_dev);
+ if (!root_dev->driver)
+ return -ENXIO;
-free_maps:
- list_for_each_entry_safe(map, n, &register_maps, list) {
- list_del(&map->list);
- kfree(map);
+ switch (width) {
+ case 2:
+ return sysfs_emit(buf, "%#x\n",
+ readw(dport->regs.rcd_pcie_cap + offset));
+ case 4:
+ return sysfs_emit(buf, "%#x\n",
+ readl(dport->regs.rcd_pcie_cap + offset));
+ default:
+ return -EINVAL;
}
-
- return ret;
-}
-
-static struct cxl_memdev *to_cxl_memdev(struct device *dev)
-{
- return container_of(dev, struct cxl_memdev, dev);
}
-static void cxl_memdev_release(struct device *dev)
+static ssize_t rcd_link_cap_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
- struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
-
- ida_free(&cxl_memdev_ida, cxlmd->id);
- kfree(cxlmd);
+ return rcd_pcie_cap_emit(dev, PCI_EXP_LNKCAP, buf, sizeof(u32));
}
+static DEVICE_ATTR_RO(rcd_link_cap);
-static char *cxl_memdev_devnode(struct device *dev, umode_t *mode, kuid_t *uid,
- kgid_t *gid)
+static ssize_t rcd_link_ctrl_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
- return kasprintf(GFP_KERNEL, "cxl/%s", dev_name(dev));
+ return rcd_pcie_cap_emit(dev, PCI_EXP_LNKCTL, buf, sizeof(u16));
}
+static DEVICE_ATTR_RO(rcd_link_ctrl);
-static ssize_t firmware_version_show(struct device *dev,
- struct device_attribute *attr, char *buf)
+static ssize_t rcd_link_status_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
- struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
- struct cxl_mem *cxlm = cxlmd->cxlm;
-
- return sysfs_emit(buf, "%.16s\n", cxlm->firmware_version);
+ return rcd_pcie_cap_emit(dev, PCI_EXP_LNKSTA, buf, sizeof(u16));
}
-static DEVICE_ATTR_RO(firmware_version);
+static DEVICE_ATTR_RO(rcd_link_status);
-static ssize_t payload_max_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
- struct cxl_mem *cxlm = cxlmd->cxlm;
-
- return sysfs_emit(buf, "%zu\n", cxlm->payload_size);
-}
-static DEVICE_ATTR_RO(payload_max);
+static struct attribute *cxl_rcd_attrs[] = {
+ &dev_attr_rcd_link_cap.attr,
+ &dev_attr_rcd_link_ctrl.attr,
+ &dev_attr_rcd_link_status.attr,
+ NULL
+};
-static ssize_t label_storage_size_show(struct device *dev,
- struct device_attribute *attr, char *buf)
+static umode_t cxl_rcd_visible(struct kobject *kobj, struct attribute *a, int n)
{
- struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
- struct cxl_mem *cxlm = cxlmd->cxlm;
-
- return sysfs_emit(buf, "%zu\n", cxlm->lsa_size);
-}
-static DEVICE_ATTR_RO(label_storage_size);
+ struct device *dev = kobj_to_dev(kobj);
+ struct pci_dev *pdev = to_pci_dev(dev);
-static ssize_t ram_size_show(struct device *dev, struct device_attribute *attr,
- char *buf)
-{
- struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
- struct cxl_mem *cxlm = cxlmd->cxlm;
- unsigned long long len = range_len(&cxlm->ram_range);
+ if (is_cxl_restricted(pdev))
+ return a->mode;
- return sysfs_emit(buf, "%#llx\n", len);
+ return 0;
}
-static struct device_attribute dev_attr_ram_size =
- __ATTR(size, 0444, ram_size_show, NULL);
+static struct attribute_group cxl_rcd_group = {
+ .attrs = cxl_rcd_attrs,
+ .is_visible = cxl_rcd_visible,
+};
+__ATTRIBUTE_GROUPS(cxl_rcd);
-static ssize_t pmem_size_show(struct device *dev, struct device_attribute *attr,
- char *buf)
+static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
- struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
- struct cxl_mem *cxlm = cxlmd->cxlm;
- unsigned long long len = range_len(&cxlm->pmem_range);
-
- return sysfs_emit(buf, "%#llx\n", len);
-}
+ struct pci_host_bridge *host_bridge = pci_find_host_bridge(pdev->bus);
+ struct cxl_dpa_info range_info = { 0 };
+ struct cxl_memdev_state *mds;
+ struct cxl_dev_state *cxlds;
+ struct cxl_register_map map;
+ struct cxl_memdev *cxlmd;
+ int rc, pmu_count;
+ unsigned int i;
+ bool irq_avail;
-static struct device_attribute dev_attr_pmem_size =
- __ATTR(size, 0444, pmem_size_show, NULL);
+ /*
+	 * Double check the anonymous union trickery in struct cxl_regs.
+	 * FIXME: switch to struct_group()
+ */
+ BUILD_BUG_ON(offsetof(struct cxl_regs, memdev) !=
+ offsetof(struct cxl_regs, device_regs.memdev));
-static struct attribute *cxl_memdev_attributes[] = {
- &dev_attr_firmware_version.attr,
- &dev_attr_payload_max.attr,
- &dev_attr_label_storage_size.attr,
- NULL,
-};
+ rc = pcim_enable_device(pdev);
+ if (rc)
+ return rc;
+ pci_set_master(pdev);
+
+ mds = cxl_memdev_state_create(&pdev->dev);
+ if (IS_ERR(mds))
+ return PTR_ERR(mds);
+ cxlds = &mds->cxlds;
+ pci_set_drvdata(pdev, cxlds);
+
+ cxlds->rcd = is_cxl_restricted(pdev);
+ cxlds->serial = pci_get_dsn(pdev);
+ cxlds->cxl_dvsec = pci_find_dvsec_capability(
+ pdev, PCI_VENDOR_ID_CXL, CXL_DVSEC_PCIE_DEVICE);
+ if (!cxlds->cxl_dvsec)
+ dev_warn(&pdev->dev,
+ "Device DVSEC not present, skip CXL.mem init\n");
+
+ rc = cxl_pci_setup_regs(pdev, CXL_REGLOC_RBI_MEMDEV, &map);
+ if (rc)
+ return rc;
-static struct attribute *cxl_memdev_pmem_attributes[] = {
- &dev_attr_pmem_size.attr,
- NULL,
-};
+ rc = cxl_map_device_regs(&map, &cxlds->regs.device_regs);
+ if (rc)
+ return rc;
-static struct attribute *cxl_memdev_ram_attributes[] = {
- &dev_attr_ram_size.attr,
- NULL,
-};
+ /*
+ * If the component registers can't be found, the cxl_pci driver may
+	 * still be useful for management functions, so don't return an error.
+ */
+ rc = cxl_pci_setup_regs(pdev, CXL_REGLOC_RBI_COMPONENT,
+ &cxlds->reg_map);
+ if (rc)
+ dev_warn(&pdev->dev, "No component registers (%d)\n", rc);
+ else if (!cxlds->reg_map.component_map.ras.valid)
+ dev_dbg(&pdev->dev, "RAS registers not found\n");
-static struct attribute_group cxl_memdev_attribute_group = {
- .attrs = cxl_memdev_attributes,
-};
+ rc = cxl_map_component_regs(&cxlds->reg_map, &cxlds->regs.component,
+ BIT(CXL_CM_CAP_CAP_ID_RAS));
+ if (rc)
+ dev_dbg(&pdev->dev, "Failed to map RAS capability.\n");
-static struct attribute_group cxl_memdev_ram_attribute_group = {
- .name = "ram",
- .attrs = cxl_memdev_ram_attributes,
-};
+ rc = cxl_pci_type3_init_mailbox(cxlds);
+ if (rc)
+ return rc;
-static struct attribute_group cxl_memdev_pmem_attribute_group = {
- .name = "pmem",
- .attrs = cxl_memdev_pmem_attributes,
-};
+ rc = cxl_await_media_ready(cxlds);
+ if (rc == 0)
+ cxlds->media_ready = true;
+ else
+ dev_warn(&pdev->dev, "Media not active (%d)\n", rc);
-static const struct attribute_group *cxl_memdev_attribute_groups[] = {
- &cxl_memdev_attribute_group,
- &cxl_memdev_ram_attribute_group,
- &cxl_memdev_pmem_attribute_group,
- NULL,
-};
+ irq_avail = cxl_alloc_irq_vectors(pdev);
-static const struct device_type cxl_memdev_type = {
- .name = "cxl_memdev",
- .release = cxl_memdev_release,
- .devnode = cxl_memdev_devnode,
- .groups = cxl_memdev_attribute_groups,
-};
+ rc = cxl_pci_setup_mailbox(mds, irq_avail);
+ if (rc)
+ return rc;
-static void cxl_memdev_shutdown(struct cxl_memdev *cxlmd)
-{
- down_write(&cxl_memdev_rwsem);
- cxlmd->cxlm = NULL;
- up_write(&cxl_memdev_rwsem);
-}
+ rc = cxl_enumerate_cmds(mds);
+ if (rc)
+ return rc;
-static void cxl_memdev_unregister(void *_cxlmd)
-{
- struct cxl_memdev *cxlmd = _cxlmd;
- struct device *dev = &cxlmd->dev;
+ rc = cxl_set_timestamp(mds);
+ if (rc)
+ return rc;
- cdev_device_del(&cxlmd->cdev, dev);
- cxl_memdev_shutdown(cxlmd);
- put_device(dev);
-}
+ rc = cxl_poison_state_init(mds);
+ if (rc)
+ return rc;
-static struct cxl_memdev *cxl_memdev_alloc(struct cxl_mem *cxlm)
-{
- struct pci_dev *pdev = cxlm->pdev;
- struct cxl_memdev *cxlmd;
- struct device *dev;
- struct cdev *cdev;
- int rc;
+ rc = cxl_dev_state_identify(mds);
+ if (rc)
+ return rc;
- cxlmd = kzalloc(sizeof(*cxlmd), GFP_KERNEL);
- if (!cxlmd)
- return ERR_PTR(-ENOMEM);
+ rc = cxl_mem_dpa_fetch(mds, &range_info);
+ if (rc)
+ return rc;
- rc = ida_alloc_range(&cxl_memdev_ida, 0, CXL_MEM_MAX_DEVS, GFP_KERNEL);
- if (rc < 0)
- goto err;
- cxlmd->id = rc;
-
- dev = &cxlmd->dev;
- device_initialize(dev);
- dev->parent = &pdev->dev;
- dev->bus = &cxl_bus_type;
- dev->devt = MKDEV(cxl_mem_major, cxlmd->id);
- dev->type = &cxl_memdev_type;
- device_set_pm_not_required(dev);
-
- cdev = &cxlmd->cdev;
- cdev_init(cdev, &cxl_memdev_fops);
- return cxlmd;
-
-err:
- kfree(cxlmd);
- return ERR_PTR(rc);
-}
+ rc = cxl_dpa_setup(cxlds, &range_info);
+ if (rc)
+ return rc;
-static struct cxl_memdev *devm_cxl_add_memdev(struct device *host,
- struct cxl_mem *cxlm)
-{
- struct cxl_memdev *cxlmd;
- struct device *dev;
- struct cdev *cdev;
- int rc;
+ rc = devm_cxl_setup_features(cxlds);
+ if (rc)
+ dev_dbg(&pdev->dev, "No CXL Features discovered\n");
- cxlmd = cxl_memdev_alloc(cxlm);
+ cxlmd = devm_cxl_add_memdev(&pdev->dev, cxlds);
if (IS_ERR(cxlmd))
- return cxlmd;
+ return PTR_ERR(cxlmd);
- dev = &cxlmd->dev;
- rc = dev_set_name(dev, "mem%d", cxlmd->id);
+ rc = devm_cxl_setup_fw_upload(&pdev->dev, mds);
if (rc)
- goto err;
-
- /*
- * Activate ioctl operations, no cxl_memdev_rwsem manipulation
- * needed as this is ordered with cdev_add() publishing the device.
- */
- cxlmd->cxlm = cxlm;
+ return rc;
- cdev = &cxlmd->cdev;
- rc = cdev_device_add(cdev, dev);
+ rc = devm_cxl_sanitize_setup_notifier(&pdev->dev, cxlmd);
if (rc)
- goto err;
+ return rc;
- rc = devm_add_action_or_reset(host, cxl_memdev_unregister, cxlmd);
+ rc = devm_cxl_setup_fwctl(&pdev->dev, cxlmd);
if (rc)
- return ERR_PTR(rc);
- return cxlmd;
+ dev_dbg(&pdev->dev, "No CXL FWCTL setup\n");
-err:
- /*
- * The cdev was briefly live, shutdown any ioctl operations that
- * saw that state.
- */
- cxl_memdev_shutdown(cxlmd);
- put_device(dev);
- return ERR_PTR(rc);
-}
+ pmu_count = cxl_count_regblock(pdev, CXL_REGLOC_RBI_PMU);
+ if (pmu_count < 0)
+ return pmu_count;
-static int cxl_xfer_log(struct cxl_mem *cxlm, uuid_t *uuid, u32 size, u8 *out)
-{
- u32 remaining = size;
- u32 offset = 0;
-
- while (remaining) {
- u32 xfer_size = min_t(u32, remaining, cxlm->payload_size);
- struct cxl_mbox_get_log {
- uuid_t uuid;
- __le32 offset;
- __le32 length;
- } __packed log = {
- .uuid = *uuid,
- .offset = cpu_to_le32(offset),
- .length = cpu_to_le32(xfer_size)
- };
- int rc;
-
- rc = cxl_mem_mbox_send_cmd(cxlm, CXL_MBOX_OP_GET_LOG, &log,
- sizeof(log), out, xfer_size);
- if (rc < 0)
- return rc;
-
- out += xfer_size;
- remaining -= xfer_size;
- offset += xfer_size;
- }
-
- return 0;
-}
+ for (i = 0; i < pmu_count; i++) {
+ struct cxl_pmu_regs pmu_regs;
-/**
- * cxl_walk_cel() - Walk through the Command Effects Log.
- * @cxlm: Device.
- * @size: Length of the Command Effects Log.
- * @cel: CEL
- *
- * Iterate over each entry in the CEL and determine if the driver supports the
- * command. If so, the command is enabled for the device and can be used later.
- */
-static void cxl_walk_cel(struct cxl_mem *cxlm, size_t size, u8 *cel)
-{
- struct cel_entry {
- __le16 opcode;
- __le16 effect;
- } __packed * cel_entry;
- const int cel_entries = size / sizeof(*cel_entry);
- int i;
-
- cel_entry = (struct cel_entry *)cel;
-
- for (i = 0; i < cel_entries; i++) {
- u16 opcode = le16_to_cpu(cel_entry[i].opcode);
- struct cxl_mem_command *cmd = cxl_mem_find_command(opcode);
-
- if (!cmd) {
- dev_dbg(&cxlm->pdev->dev,
- "Opcode 0x%04x unsupported by driver", opcode);
- continue;
+ rc = cxl_find_regblock_instance(pdev, CXL_REGLOC_RBI_PMU, &map, i);
+ if (rc) {
+ dev_dbg(&pdev->dev, "Could not find PMU regblock\n");
+ break;
}
- set_bit(cmd->info.id, cxlm->enabled_cmds);
- }
-}
-
-struct cxl_mbox_get_supported_logs {
- __le16 entries;
- u8 rsvd[6];
- struct gsl_entry {
- uuid_t uuid;
- __le32 size;
- } __packed entry[];
-} __packed;
-
-static struct cxl_mbox_get_supported_logs *cxl_get_gsl(struct cxl_mem *cxlm)
-{
- struct cxl_mbox_get_supported_logs *ret;
- int rc;
-
- ret = kvmalloc(cxlm->payload_size, GFP_KERNEL);
- if (!ret)
- return ERR_PTR(-ENOMEM);
-
- rc = cxl_mem_mbox_send_cmd(cxlm, CXL_MBOX_OP_GET_SUPPORTED_LOGS, NULL,
- 0, ret, cxlm->payload_size);
- if (rc < 0) {
- kvfree(ret);
- return ERR_PTR(rc);
- }
-
- return ret;
-}
-
-/**
- * cxl_mem_enumerate_cmds() - Enumerate commands for a device.
- * @cxlm: The device.
- *
- * Returns 0 if enumerate completed successfully.
- *
- * CXL devices have optional support for certain commands. This function will
- * determine the set of supported commands for the hardware and update the
- * enabled_cmds bitmap in the @cxlm.
- */
-static int cxl_mem_enumerate_cmds(struct cxl_mem *cxlm)
-{
- struct cxl_mbox_get_supported_logs *gsl;
- struct device *dev = &cxlm->pdev->dev;
- struct cxl_mem_command *cmd;
- int i, rc;
-
- gsl = cxl_get_gsl(cxlm);
- if (IS_ERR(gsl))
- return PTR_ERR(gsl);
-
- rc = -ENOENT;
- for (i = 0; i < le16_to_cpu(gsl->entries); i++) {
- u32 size = le32_to_cpu(gsl->entry[i].size);
- uuid_t uuid = gsl->entry[i].uuid;
- u8 *log;
-
- dev_dbg(dev, "Found LOG type %pU of size %d", &uuid, size);
-
- if (!uuid_equal(&uuid, &log_uuid[CEL_UUID]))
- continue;
-
- log = kvmalloc(size, GFP_KERNEL);
- if (!log) {
- rc = -ENOMEM;
- goto out;
+ rc = cxl_map_pmu_regs(&map, &pmu_regs);
+ if (rc) {
+ dev_dbg(&pdev->dev, "Could not map PMU regs\n");
+ break;
}
- rc = cxl_xfer_log(cxlm, &uuid, size, log);
+ rc = devm_cxl_pmu_add(cxlds->dev, &pmu_regs, cxlmd->id, i, CXL_PMU_MEMDEV);
if (rc) {
- kvfree(log);
- goto out;
+ dev_dbg(&pdev->dev, "Could not add PMU instance\n");
+ break;
}
+ }
- cxl_walk_cel(cxlm, size, log);
- kvfree(log);
+ rc = cxl_event_config(host_bridge, mds, irq_avail);
+ if (rc)
+ return rc;
- /* In case CEL was bogus, enable some default commands. */
- cxl_for_each_cmd(cmd)
- if (cmd->flags & CXL_CMD_FLAG_FORCE_ENABLE)
- set_bit(cmd->info.id, cxlm->enabled_cmds);
+ if (cxl_pci_ras_unmask(pdev))
+ dev_dbg(&pdev->dev, "No RAS reporting unmasked\n");
- /* Found the required CEL */
- rc = 0;
- }
+ pci_save_state(pdev);
-out:
- kvfree(gsl);
return rc;
}
-/**
- * cxl_mem_identify() - Send the IDENTIFY command to the device.
- * @cxlm: The device to identify.
- *
- * Return: 0 if identify was executed successfully.
- *
- * This will dispatch the identify command to the device and on success populate
- * structures to be exported to sysfs.
- */
-static int cxl_mem_identify(struct cxl_mem *cxlm)
-{
- /* See CXL 2.0 Table 175 Identify Memory Device Output Payload */
- struct cxl_mbox_identify {
- char fw_revision[0x10];
- __le64 total_capacity;
- __le64 volatile_capacity;
- __le64 persistent_capacity;
- __le64 partition_align;
- __le16 info_event_log_size;
- __le16 warning_event_log_size;
- __le16 failure_event_log_size;
- __le16 fatal_event_log_size;
- __le32 lsa_size;
- u8 poison_list_max_mer[3];
- __le16 inject_poison_limit;
- u8 poison_caps;
- u8 qos_telemetry_caps;
- } __packed id;
- int rc;
-
- rc = cxl_mem_mbox_send_cmd(cxlm, CXL_MBOX_OP_IDENTIFY, NULL, 0, &id,
- sizeof(id));
- if (rc < 0)
- return rc;
-
- /*
- * TODO: enumerate DPA map, as 'ram' and 'pmem' do not alias.
- * For now, only the capacity is exported in sysfs
- */
- cxlm->ram_range.start = 0;
- cxlm->ram_range.end = le64_to_cpu(id.volatile_capacity) * SZ_256M - 1;
-
- cxlm->pmem_range.start = 0;
- cxlm->pmem_range.end =
- le64_to_cpu(id.persistent_capacity) * SZ_256M - 1;
+static const struct pci_device_id cxl_mem_pci_tbl[] = {
+ /* PCI class code for CXL.mem Type-3 Devices */
+ { PCI_DEVICE_CLASS((PCI_CLASS_MEMORY_CXL << 8 | CXL_MEMORY_PROGIF), ~0)},
+ { /* terminate list */ },
+};
+MODULE_DEVICE_TABLE(pci, cxl_mem_pci_tbl);
- cxlm->lsa_size = le32_to_cpu(id.lsa_size);
- memcpy(cxlm->firmware_version, id.fw_revision, sizeof(id.fw_revision));
+static pci_ers_result_t cxl_slot_reset(struct pci_dev *pdev)
+{
+ struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
+ struct cxl_memdev *cxlmd = cxlds->cxlmd;
+ struct device *dev = &cxlmd->dev;
- return 0;
+ dev_info(&pdev->dev, "%s: restart CXL.mem after slot reset\n",
+ dev_name(dev));
+ pci_restore_state(pdev);
+ if (device_attach(dev) <= 0)
+ return PCI_ERS_RESULT_DISCONNECT;
+ return PCI_ERS_RESULT_RECOVERED;
}
-static int cxl_mem_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+static void cxl_error_resume(struct pci_dev *pdev)
{
- struct cxl_memdev *cxlmd;
- struct cxl_mem *cxlm;
- int rc;
-
- rc = pcim_enable_device(pdev);
- if (rc)
- return rc;
-
- cxlm = cxl_mem_create(pdev);
- if (IS_ERR(cxlm))
- return PTR_ERR(cxlm);
-
- rc = cxl_mem_setup_regs(cxlm);
- if (rc)
- return rc;
-
- rc = cxl_mem_setup_mailbox(cxlm);
- if (rc)
- return rc;
-
- rc = cxl_mem_enumerate_cmds(cxlm);
- if (rc)
- return rc;
-
- rc = cxl_mem_identify(cxlm);
- if (rc)
- return rc;
+ struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
+ struct cxl_memdev *cxlmd = cxlds->cxlmd;
+ struct device *dev = &cxlmd->dev;
- cxlmd = devm_cxl_add_memdev(&pdev->dev, cxlm);
- if (IS_ERR(cxlmd))
- return PTR_ERR(cxlmd);
+ dev_info(&pdev->dev, "%s: error resume %s\n", dev_name(dev),
+ dev->driver ? "successful" : "failed");
+}
- if (range_len(&cxlm->pmem_range) && IS_ENABLED(CONFIG_CXL_PMEM))
- rc = devm_cxl_add_nvdimm(&pdev->dev, cxlmd);
+static void cxl_reset_done(struct pci_dev *pdev)
+{
+ struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
+ struct cxl_memdev *cxlmd = cxlds->cxlmd;
+ struct device *dev = &pdev->dev;
- return rc;
+ /*
+	 * An FLR is not expected to touch the HDM decoders and related
+	 * registers, whereas an SBR wipes all device configuration.
+	 * Issue a warning if an active decoder existed before the reset
+	 * but no longer does.
+ */
+ guard(device)(&cxlmd->dev);
+ if (cxlmd->endpoint &&
+ cxl_endpoint_decoder_reset_detected(cxlmd->endpoint)) {
+		dev_crit(dev, "SBR happened without removing memory regions first.\n");
+ dev_crit(dev, "System may be unstable if regions hosted system memory.\n");
+ add_taint(TAINT_USER, LOCKDEP_STILL_OK);
+ }
}
-static const struct pci_device_id cxl_mem_pci_tbl[] = {
- /* PCI class code for CXL.mem Type-3 Devices */
- { PCI_DEVICE_CLASS((PCI_CLASS_MEMORY_CXL << 8 | CXL_MEMORY_PROGIF), ~0)},
- { /* terminate list */ },
+static const struct pci_error_handlers cxl_error_handlers = {
+ .error_detected = cxl_error_detected,
+ .slot_reset = cxl_slot_reset,
+ .resume = cxl_error_resume,
+ .cor_error_detected = cxl_cor_error_detected,
+ .reset_done = cxl_reset_done,
};
-MODULE_DEVICE_TABLE(pci, cxl_mem_pci_tbl);
-static struct pci_driver cxl_mem_driver = {
+static struct pci_driver cxl_pci_driver = {
.name = KBUILD_MODNAME,
.id_table = cxl_mem_pci_tbl,
- .probe = cxl_mem_probe,
+ .probe = cxl_pci_probe,
+ .err_handler = &cxl_error_handlers,
+ .dev_groups = cxl_rcd_groups,
.driver = {
.probe_type = PROBE_PREFER_ASYNCHRONOUS,
},
};
-static __init int cxl_mem_init(void)
+#define CXL_EVENT_HDR_FLAGS_REC_SEVERITY GENMASK(1, 0)
+static void cxl_handle_cper_event(enum cxl_event_type ev_type,
+ struct cxl_cper_event_rec *rec)
{
- struct dentry *mbox_debugfs;
- dev_t devt;
- int rc;
+ struct cper_cxl_event_devid *device_id = &rec->hdr.device_id;
+ struct pci_dev *pdev __free(pci_dev_put) = NULL;
+ enum cxl_event_log_type log_type;
+ struct cxl_dev_state *cxlds;
+ unsigned int devfn;
+ u32 hdr_flags;
+
+ pr_debug("CPER event %d for device %u:%u:%u.%u\n", ev_type,
+ device_id->segment_num, device_id->bus_num,
+ device_id->device_num, device_id->func_num);
+
+ devfn = PCI_DEVFN(device_id->device_num, device_id->func_num);
+ pdev = pci_get_domain_bus_and_slot(device_id->segment_num,
+ device_id->bus_num, devfn);
+ if (!pdev)
+ return;
+
+ guard(device)(&pdev->dev);
+ if (pdev->driver != &cxl_pci_driver)
+ return;
+
+ cxlds = pci_get_drvdata(pdev);
+ if (!cxlds)
+ return;
+
+ /* Fabricate a log type */
+ hdr_flags = get_unaligned_le24(rec->event.generic.hdr.flags);
+ log_type = FIELD_GET(CXL_EVENT_HDR_FLAGS_REC_SEVERITY, hdr_flags);
+
+ cxl_event_trace_record(cxlds->cxlmd, log_type, ev_type,
+ &uuid_null, &rec->event);
+}
- /* Double check the anonymous union trickery in struct cxl_regs */
- BUILD_BUG_ON(offsetof(struct cxl_regs, memdev) !=
- offsetof(struct cxl_regs, device_regs.memdev));
+static void cxl_cper_work_fn(struct work_struct *work)
+{
+ struct cxl_cper_work_data wd;
- rc = alloc_chrdev_region(&devt, 0, CXL_MEM_MAX_DEVS, "cxl");
- if (rc)
- return rc;
+ while (cxl_cper_kfifo_get(&wd))
+ cxl_handle_cper_event(wd.event_type, &wd.rec);
+}
+static DECLARE_WORK(cxl_cper_work, cxl_cper_work_fn);
- cxl_mem_major = MAJOR(devt);
+static int __init cxl_pci_driver_init(void)
+{
+ int rc;
- rc = pci_register_driver(&cxl_mem_driver);
- if (rc) {
- unregister_chrdev_region(MKDEV(cxl_mem_major, 0),
- CXL_MEM_MAX_DEVS);
+ rc = pci_register_driver(&cxl_pci_driver);
+ if (rc)
return rc;
- }
- cxl_debugfs = debugfs_create_dir("cxl", NULL);
- mbox_debugfs = debugfs_create_dir("mbox", cxl_debugfs);
- debugfs_create_bool("raw_allow_all", 0600, mbox_debugfs,
- &cxl_raw_allow_all);
+ rc = cxl_cper_register_work(&cxl_cper_work);
+ if (rc)
+ pci_unregister_driver(&cxl_pci_driver);
- return 0;
+ return rc;
}
-static __exit void cxl_mem_exit(void)
+static void __exit cxl_pci_driver_exit(void)
{
- debugfs_remove_recursive(cxl_debugfs);
- pci_unregister_driver(&cxl_mem_driver);
- unregister_chrdev_region(MKDEV(cxl_mem_major, 0), CXL_MEM_MAX_DEVS);
+ cxl_cper_unregister_work(&cxl_cper_work);
+ cancel_work_sync(&cxl_cper_work);
+ pci_unregister_driver(&cxl_pci_driver);
}
+module_init(cxl_pci_driver_init);
+module_exit(cxl_pci_driver_exit);
+MODULE_DESCRIPTION("CXL: PCI manageability");
MODULE_LICENSE("GPL v2");
-module_init(cxl_mem_init);
-module_exit(cxl_mem_exit);
-MODULE_IMPORT_NS(CXL);
+MODULE_IMPORT_NS("CXL");
diff --git a/drivers/cxl/pci.h b/drivers/cxl/pci.h
deleted file mode 100644
index dad7a831f65f..000000000000
--- a/drivers/cxl/pci.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/* Copyright(c) 2020 Intel Corporation. All rights reserved. */
-#ifndef __CXL_PCI_H__
-#define __CXL_PCI_H__
-
-#define CXL_MEMORY_PROGIF 0x10
-
-/*
- * See section 8.1 Configuration Space Registers in the CXL 2.0
- * Specification
- */
-#define PCI_DVSEC_HEADER1_LENGTH_MASK GENMASK(31, 20)
-#define PCI_DVSEC_VENDOR_ID_CXL 0x1E98
-#define PCI_DVSEC_ID_CXL 0x0
-
-#define PCI_DVSEC_ID_CXL_REGLOC_DVSEC_ID 0x8
-#define PCI_DVSEC_ID_CXL_REGLOC_BLOCK1_OFFSET 0xC
-
-/* BAR Indicator Register (BIR) */
-#define CXL_REGLOC_BIR_MASK GENMASK(2, 0)
-
-/* Register Block Identifier (RBI) */
-#define CXL_REGLOC_RBI_MASK GENMASK(15, 8)
-#define CXL_REGLOC_RBI_EMPTY 0
-#define CXL_REGLOC_RBI_COMPONENT 1
-#define CXL_REGLOC_RBI_VIRT 2
-#define CXL_REGLOC_RBI_MEMDEV 3
-
-#define CXL_REGLOC_ADDR_MASK GENMASK(31, 16)
-
-#endif /* __CXL_PCI_H__ */
diff --git a/drivers/cxl/pmem.c b/drivers/cxl/pmem.c
index 0088e41dd2f3..e197883690ef 100644
--- a/drivers/cxl/pmem.c
+++ b/drivers/cxl/pmem.c
@@ -1,230 +1,542 @@
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright(c) 2021 Intel Corporation. All rights reserved. */
#include <linux/libnvdimm.h>
+#include <linux/unaligned.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/ndctl.h>
#include <linux/async.h>
#include <linux/slab.h>
-#include "mem.h"
+#include <linux/nd.h>
+#include "cxlmem.h"
#include "cxl.h"
-/*
- * Ordered workqueue for cxl nvdimm device arrival and departure
- * to coordinate bus rescans when a bridge arrives and trigger remove
- * operations when the bridge is removed.
- */
-static struct workqueue_struct *cxl_pmem_wq;
+static __read_mostly DECLARE_BITMAP(exclusive_cmds, CXL_MEM_COMMAND_ID_MAX);
+
+static void clear_exclusive(void *mds)
+{
+ clear_exclusive_cxl_commands(mds, exclusive_cmds);
+}
static void unregister_nvdimm(void *nvdimm)
{
nvdimm_delete(nvdimm);
}
-static int match_nvdimm_bridge(struct device *dev, const void *data)
+static ssize_t provider_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ struct nvdimm *nvdimm = to_nvdimm(dev);
+ struct cxl_nvdimm *cxl_nvd = nvdimm_provider_data(nvdimm);
+
+ return sysfs_emit(buf, "%s\n", dev_name(&cxl_nvd->dev));
+}
+static DEVICE_ATTR_RO(provider);
+
+static ssize_t id_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ struct nvdimm *nvdimm = to_nvdimm(dev);
+ struct cxl_nvdimm *cxl_nvd = nvdimm_provider_data(nvdimm);
+ struct cxl_dev_state *cxlds = cxl_nvd->cxlmd->cxlds;
+
+ return sysfs_emit(buf, "%lld\n", cxlds->serial);
+}
+static DEVICE_ATTR_RO(id);
+
+static ssize_t dirty_shutdown_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
- return strcmp(dev_name(dev), "nvdimm-bridge") == 0;
+ struct nvdimm *nvdimm = to_nvdimm(dev);
+ struct cxl_nvdimm *cxl_nvd = nvdimm_provider_data(nvdimm);
+
+ return sysfs_emit(buf, "%llu\n", cxl_nvd->dirty_shutdowns);
}
+static DEVICE_ATTR_RO(dirty_shutdown);
+
+static struct attribute *cxl_dimm_attributes[] = {
+ &dev_attr_id.attr,
+ &dev_attr_provider.attr,
+ &dev_attr_dirty_shutdown.attr,
+ NULL
+};
-static struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(void)
+#define CXL_INVALID_DIRTY_SHUTDOWN_COUNT ULLONG_MAX
+static umode_t cxl_dimm_visible(struct kobject *kobj,
+ struct attribute *a, int n)
{
- struct device *dev;
+ if (a == &dev_attr_dirty_shutdown.attr) {
+ struct device *dev = kobj_to_dev(kobj);
+ struct nvdimm *nvdimm = to_nvdimm(dev);
+ struct cxl_nvdimm *cxl_nvd = nvdimm_provider_data(nvdimm);
+
+ if (cxl_nvd->dirty_shutdowns ==
+ CXL_INVALID_DIRTY_SHUTDOWN_COUNT)
+ return 0;
+ }
- dev = bus_find_device(&cxl_bus_type, NULL, NULL, match_nvdimm_bridge);
- if (!dev)
- return NULL;
- return to_cxl_nvdimm_bridge(dev);
+ return a->mode;
+}
+
+static const struct attribute_group cxl_dimm_attribute_group = {
+ .name = "cxl",
+ .attrs = cxl_dimm_attributes,
+ .is_visible = cxl_dimm_visible
+};
+
+static const struct attribute_group *cxl_dimm_attribute_groups[] = {
+ &cxl_dimm_attribute_group,
+ NULL
+};
+
+static void cxl_nvdimm_arm_dirty_shutdown_tracking(struct cxl_nvdimm *cxl_nvd)
+{
+ struct cxl_memdev *cxlmd = cxl_nvd->cxlmd;
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+ struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
+ struct device *dev = &cxl_nvd->dev;
+ u32 count;
+
+ /*
+	 * Dirty tracking is enabled and exposed to the user only when:
+	 * - dirty shutdown on the device can be set, and
+	 * - the device has a Device GPF DVSEC (albeit unused), and
+ * - the Get Health Info cmd can retrieve the device's dirty count.
+ */
+ cxl_nvd->dirty_shutdowns = CXL_INVALID_DIRTY_SHUTDOWN_COUNT;
+
+ if (cxl_arm_dirty_shutdown(mds)) {
+ dev_warn(dev, "GPF: could not set dirty shutdown state\n");
+ return;
+ }
+
+ if (!cxl_gpf_get_dvsec(cxlds->dev))
+ return;
+
+ if (cxl_get_dirty_count(mds, &count)) {
+ dev_warn(dev, "GPF: could not retrieve dirty count\n");
+ return;
+ }
+
+ cxl_nvd->dirty_shutdowns = count;
}
static int cxl_nvdimm_probe(struct device *dev)
{
struct cxl_nvdimm *cxl_nvd = to_cxl_nvdimm(dev);
- struct cxl_nvdimm_bridge *cxl_nvb;
- unsigned long flags = 0;
+ struct cxl_memdev *cxlmd = cxl_nvd->cxlmd;
+ struct cxl_nvdimm_bridge *cxl_nvb = cxlmd->cxl_nvb;
+ struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds);
+ unsigned long flags = 0, cmd_mask = 0;
struct nvdimm *nvdimm;
- int rc = -ENXIO;
-
- cxl_nvb = cxl_find_nvdimm_bridge();
- if (!cxl_nvb)
- return -ENXIO;
+ int rc;
- device_lock(&cxl_nvb->dev);
- if (!cxl_nvb->nvdimm_bus)
- goto out;
+ set_exclusive_cxl_commands(mds, exclusive_cmds);
+ rc = devm_add_action_or_reset(dev, clear_exclusive, mds);
+ if (rc)
+ return rc;
set_bit(NDD_LABELING, &flags);
- nvdimm = nvdimm_create(cxl_nvb->nvdimm_bus, cxl_nvd, NULL, flags, 0, 0,
- NULL);
- if (!nvdimm)
- goto out;
+ set_bit(NDD_REGISTER_SYNC, &flags);
+ set_bit(ND_CMD_GET_CONFIG_SIZE, &cmd_mask);
+ set_bit(ND_CMD_GET_CONFIG_DATA, &cmd_mask);
+ set_bit(ND_CMD_SET_CONFIG_DATA, &cmd_mask);
- rc = devm_add_action_or_reset(dev, unregister_nvdimm, nvdimm);
-out:
- device_unlock(&cxl_nvb->dev);
- put_device(&cxl_nvb->dev);
+ /*
+	 * Set dirty shutdown now, with the expectation that the device
+	 * clears it upon a successful GPF flow. The exception to this
+ * is upon Viral detection, per CXL 3.2 section 12.4.2.
+ */
+ cxl_nvdimm_arm_dirty_shutdown_tracking(cxl_nvd);
- return rc;
+ nvdimm = __nvdimm_create(cxl_nvb->nvdimm_bus, cxl_nvd,
+ cxl_dimm_attribute_groups, flags,
+ cmd_mask, 0, NULL, cxl_nvd->dev_id,
+ cxl_security_ops, NULL);
+ if (!nvdimm)
+ return -ENOMEM;
+
+ dev_set_drvdata(dev, nvdimm);
+ return devm_add_action_or_reset(dev, unregister_nvdimm, nvdimm);
}
static struct cxl_driver cxl_nvdimm_driver = {
.name = "cxl_nvdimm",
.probe = cxl_nvdimm_probe,
.id = CXL_DEVICE_NVDIMM,
+ .drv = {
+ .suppress_bind_attrs = true,
+ },
};
-static int cxl_pmem_ctl(struct nvdimm_bus_descriptor *nd_desc,
- struct nvdimm *nvdimm, unsigned int cmd, void *buf,
- unsigned int buf_len, int *cmd_rc)
+static int cxl_pmem_get_config_size(struct cxl_memdev_state *mds,
+ struct nd_cmd_get_config_size *cmd,
+ unsigned int buf_len)
{
- return -ENOTTY;
-}
+ struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
-static bool online_nvdimm_bus(struct cxl_nvdimm_bridge *cxl_nvb)
-{
- if (cxl_nvb->nvdimm_bus)
- return true;
- cxl_nvb->nvdimm_bus =
- nvdimm_bus_register(&cxl_nvb->dev, &cxl_nvb->nd_desc);
- return cxl_nvb->nvdimm_bus != NULL;
+ if (sizeof(*cmd) > buf_len)
+ return -EINVAL;
+
+ *cmd = (struct nd_cmd_get_config_size){
+ .config_size = mds->lsa_size,
+ .max_xfer =
+ cxl_mbox->payload_size - sizeof(struct cxl_mbox_set_lsa),
+ };
+
+ return 0;
}
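As a worked example of the max_xfer calculation above: assuming the core caps the mailbox payload at 1 MiB and that struct cxl_mbox_set_lsa is an 8-byte input header (a 4-byte offset plus 4 reserved bytes ahead of the data), a single config-data transfer is limited to 1048576 - 8 = 1048568 bytes; smaller mailbox payloads shrink max_xfer accordingly.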
-static int cxl_nvdimm_release_driver(struct device *dev, void *data)
+static int cxl_pmem_get_config_data(struct cxl_memdev_state *mds,
+ struct nd_cmd_get_config_data_hdr *cmd,
+ unsigned int buf_len)
{
- if (!is_cxl_nvdimm(dev))
- return 0;
- device_release_driver(dev);
- return 0;
+ struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
+ struct cxl_mbox_get_lsa get_lsa;
+ struct cxl_mbox_cmd mbox_cmd;
+ int rc;
+
+ if (sizeof(*cmd) > buf_len)
+ return -EINVAL;
+ if (struct_size(cmd, out_buf, cmd->in_length) > buf_len)
+ return -EINVAL;
+
+ get_lsa = (struct cxl_mbox_get_lsa) {
+ .offset = cpu_to_le32(cmd->in_offset),
+ .length = cpu_to_le32(cmd->in_length),
+ };
+ mbox_cmd = (struct cxl_mbox_cmd) {
+ .opcode = CXL_MBOX_OP_GET_LSA,
+ .payload_in = &get_lsa,
+ .size_in = sizeof(get_lsa),
+ .size_out = cmd->in_length,
+ .payload_out = cmd->out_buf,
+ };
+
+ rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
+ cmd->status = 0;
+
+ return rc;
}
-static void offline_nvdimm_bus(struct nvdimm_bus *nvdimm_bus)
+static int cxl_pmem_set_config_data(struct cxl_memdev_state *mds,
+ struct nd_cmd_set_config_hdr *cmd,
+ unsigned int buf_len)
{
- if (!nvdimm_bus)
- return;
+ struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
+ struct cxl_mbox_set_lsa *set_lsa;
+ struct cxl_mbox_cmd mbox_cmd;
+ int rc;
+
+ if (sizeof(*cmd) > buf_len)
+ return -EINVAL;
+
+ /* 4-byte status follows the input data in the payload */
+ if (size_add(struct_size(cmd, in_buf, cmd->in_length), 4) > buf_len)
+ return -EINVAL;
+
+ set_lsa =
+ kvzalloc(struct_size(set_lsa, data, cmd->in_length), GFP_KERNEL);
+ if (!set_lsa)
+ return -ENOMEM;
+
+ *set_lsa = (struct cxl_mbox_set_lsa) {
+ .offset = cpu_to_le32(cmd->in_offset),
+ };
+ memcpy(set_lsa->data, cmd->in_buf, cmd->in_length);
+ mbox_cmd = (struct cxl_mbox_cmd) {
+ .opcode = CXL_MBOX_OP_SET_LSA,
+ .payload_in = set_lsa,
+ .size_in = struct_size(set_lsa, data, cmd->in_length),
+ };
+
+ rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
/*
- * Set the state of cxl_nvdimm devices to unbound / idle before
- * nvdimm_bus_unregister() rips the nvdimm objects out from
- * underneath them.
+	 * Set the "firmware" status (4 packed bytes at the end of the input
+	 * payload).
*/
- bus_for_each_dev(&cxl_bus_type, NULL, NULL, cxl_nvdimm_release_driver);
- nvdimm_bus_unregister(nvdimm_bus);
+ put_unaligned(0, (u32 *) &cmd->in_buf[cmd->in_length]);
+ kvfree(set_lsa);
+
+ return rc;
}
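For reference, the size checks in cxl_pmem_set_config_data() follow the libnvdimm ND_CMD_SET_CONFIG_DATA buffer layout, sketched below from include/uapi/linux/ndctl.h (the field comments are editorial, matching the put_unaligned() of the trailing status above):

struct nd_cmd_set_config_hdr {
	__u32 in_offset;	/* byte offset into the LSA */
	__u32 in_length;	/* number of data bytes that follow */
	__u8 in_buf[];		/* in_length bytes of label data, followed by a
				 * 4-byte "firmware" status written back by the
				 * command handler */
} __packed;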
-static void cxl_nvb_update_state(struct work_struct *work)
-{
- struct cxl_nvdimm_bridge *cxl_nvb =
- container_of(work, typeof(*cxl_nvb), state_work);
- struct nvdimm_bus *victim_bus = NULL;
- bool release = false, rescan = false;
-
- device_lock(&cxl_nvb->dev);
- switch (cxl_nvb->state) {
- case CXL_NVB_ONLINE:
- if (!online_nvdimm_bus(cxl_nvb)) {
- dev_err(&cxl_nvb->dev,
- "failed to establish nvdimm bus\n");
- release = true;
- } else
- rescan = true;
- break;
- case CXL_NVB_OFFLINE:
- case CXL_NVB_DEAD:
- victim_bus = cxl_nvb->nvdimm_bus;
- cxl_nvb->nvdimm_bus = NULL;
- break;
+static int cxl_pmem_nvdimm_ctl(struct nvdimm *nvdimm, unsigned int cmd,
+ void *buf, unsigned int buf_len)
+{
+ struct cxl_nvdimm *cxl_nvd = nvdimm_provider_data(nvdimm);
+ unsigned long cmd_mask = nvdimm_cmd_mask(nvdimm);
+ struct cxl_memdev *cxlmd = cxl_nvd->cxlmd;
+ struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds);
+
+ if (!test_bit(cmd, &cmd_mask))
+ return -ENOTTY;
+
+ switch (cmd) {
+ case ND_CMD_GET_CONFIG_SIZE:
+ return cxl_pmem_get_config_size(mds, buf, buf_len);
+ case ND_CMD_GET_CONFIG_DATA:
+ return cxl_pmem_get_config_data(mds, buf, buf_len);
+ case ND_CMD_SET_CONFIG_DATA:
+ return cxl_pmem_set_config_data(mds, buf, buf_len);
default:
- break;
+ return -ENOTTY;
}
- device_unlock(&cxl_nvb->dev);
+}
- if (release)
- device_release_driver(&cxl_nvb->dev);
- if (rescan) {
- int rc = bus_rescan_devices(&cxl_bus_type);
+static int cxl_pmem_ctl(struct nvdimm_bus_descriptor *nd_desc,
+ struct nvdimm *nvdimm, unsigned int cmd, void *buf,
+ unsigned int buf_len, int *cmd_rc)
+{
+ /*
+ * No firmware response to translate, let the transport error
+ * code take precedence.
+ */
+ *cmd_rc = 0;
- dev_dbg(&cxl_nvb->dev, "rescan: %d\n", rc);
- }
- offline_nvdimm_bus(victim_bus);
+ if (!nvdimm)
+ return -ENOTTY;
+ return cxl_pmem_nvdimm_ctl(nvdimm, cmd, buf, buf_len);
+}
- put_device(&cxl_nvb->dev);
+static int detach_nvdimm(struct device *dev, void *data)
+{
+ struct cxl_nvdimm *cxl_nvd;
+ bool release = false;
+
+ if (!is_cxl_nvdimm(dev))
+ return 0;
+
+ scoped_guard(device, dev) {
+ if (dev->driver) {
+ cxl_nvd = to_cxl_nvdimm(dev);
+ if (cxl_nvd->cxlmd && cxl_nvd->cxlmd->cxl_nvb == data)
+ release = true;
+ }
+ }
+ if (release)
+ device_release_driver(dev);
+ return 0;
}
-static void cxl_nvdimm_bridge_remove(struct device *dev)
+static void unregister_nvdimm_bus(void *_cxl_nvb)
{
- struct cxl_nvdimm_bridge *cxl_nvb = to_cxl_nvdimm_bridge(dev);
+ struct cxl_nvdimm_bridge *cxl_nvb = _cxl_nvb;
+ struct nvdimm_bus *nvdimm_bus = cxl_nvb->nvdimm_bus;
- if (cxl_nvb->state == CXL_NVB_ONLINE)
- cxl_nvb->state = CXL_NVB_OFFLINE;
- if (queue_work(cxl_pmem_wq, &cxl_nvb->state_work))
- get_device(&cxl_nvb->dev);
+ bus_for_each_dev(&cxl_bus_type, NULL, cxl_nvb, detach_nvdimm);
+
+ cxl_nvb->nvdimm_bus = NULL;
+ nvdimm_bus_unregister(nvdimm_bus);
}
static int cxl_nvdimm_bridge_probe(struct device *dev)
{
struct cxl_nvdimm_bridge *cxl_nvb = to_cxl_nvdimm_bridge(dev);
- if (cxl_nvb->state == CXL_NVB_DEAD)
- return -ENXIO;
-
- if (cxl_nvb->state == CXL_NVB_NEW) {
- cxl_nvb->nd_desc = (struct nvdimm_bus_descriptor) {
- .provider_name = "CXL",
- .module = THIS_MODULE,
- .ndctl = cxl_pmem_ctl,
- };
+ cxl_nvb->nd_desc = (struct nvdimm_bus_descriptor) {
+ .provider_name = "CXL",
+ .module = THIS_MODULE,
+ .ndctl = cxl_pmem_ctl,
+ };
- INIT_WORK(&cxl_nvb->state_work, cxl_nvb_update_state);
- }
+ cxl_nvb->nvdimm_bus =
+ nvdimm_bus_register(&cxl_nvb->dev, &cxl_nvb->nd_desc);
- cxl_nvb->state = CXL_NVB_ONLINE;
- if (queue_work(cxl_pmem_wq, &cxl_nvb->state_work))
- get_device(&cxl_nvb->dev);
+ if (!cxl_nvb->nvdimm_bus)
+ return -ENOMEM;
- return 0;
+ return devm_add_action_or_reset(dev, unregister_nvdimm_bus, cxl_nvb);
}
static struct cxl_driver cxl_nvdimm_bridge_driver = {
.name = "cxl_nvdimm_bridge",
.probe = cxl_nvdimm_bridge_probe,
- .remove = cxl_nvdimm_bridge_remove,
.id = CXL_DEVICE_NVDIMM_BRIDGE,
+ .drv = {
+ .suppress_bind_attrs = true,
+ },
+};
+
+static void unregister_nvdimm_region(void *nd_region)
+{
+ nvdimm_region_delete(nd_region);
+}
+
+static void cxlr_pmem_remove_resource(void *res)
+{
+ remove_resource(res);
+}
+
+struct cxl_pmem_region_info {
+ u64 offset;
+ u64 serial;
+};
+
+static int cxl_pmem_region_probe(struct device *dev)
+{
+ struct nd_mapping_desc mappings[CXL_DECODER_MAX_INTERLEAVE];
+ struct cxl_pmem_region *cxlr_pmem = to_cxl_pmem_region(dev);
+ struct cxl_region *cxlr = cxlr_pmem->cxlr;
+ struct cxl_nvdimm_bridge *cxl_nvb = cxlr->cxl_nvb;
+ struct cxl_pmem_region_info *info = NULL;
+ struct nd_interleave_set *nd_set;
+ struct nd_region_desc ndr_desc;
+ struct cxl_nvdimm *cxl_nvd;
+ struct nvdimm *nvdimm;
+ struct resource *res;
+ int rc, i = 0;
+
+ memset(&mappings, 0, sizeof(mappings));
+ memset(&ndr_desc, 0, sizeof(ndr_desc));
+
+ res = devm_kzalloc(dev, sizeof(*res), GFP_KERNEL);
+ if (!res)
+ return -ENOMEM;
+
+ res->name = "Persistent Memory";
+ res->start = cxlr_pmem->hpa_range.start;
+ res->end = cxlr_pmem->hpa_range.end;
+ res->flags = IORESOURCE_MEM;
+ res->desc = IORES_DESC_PERSISTENT_MEMORY;
+
+ rc = insert_resource(&iomem_resource, res);
+ if (rc)
+ return rc;
+
+ rc = devm_add_action_or_reset(dev, cxlr_pmem_remove_resource, res);
+ if (rc)
+ return rc;
+
+ ndr_desc.res = res;
+ ndr_desc.provider_data = cxlr_pmem;
+
+ ndr_desc.numa_node = memory_add_physaddr_to_nid(res->start);
+ ndr_desc.target_node = phys_to_target_node(res->start);
+ if (ndr_desc.target_node == NUMA_NO_NODE) {
+ ndr_desc.target_node = ndr_desc.numa_node;
+ dev_dbg(&cxlr->dev, "changing target node from %d to %d",
+ NUMA_NO_NODE, ndr_desc.target_node);
+ }
+
+ nd_set = devm_kzalloc(dev, sizeof(*nd_set), GFP_KERNEL);
+ if (!nd_set)
+ return -ENOMEM;
+
+ ndr_desc.memregion = cxlr->id;
+ set_bit(ND_REGION_CXL, &ndr_desc.flags);
+ set_bit(ND_REGION_PERSIST_MEMCTRL, &ndr_desc.flags);
+
+ info = kmalloc_array(cxlr_pmem->nr_mappings, sizeof(*info), GFP_KERNEL);
+ if (!info)
+ return -ENOMEM;
+
+ for (i = 0; i < cxlr_pmem->nr_mappings; i++) {
+ struct cxl_pmem_region_mapping *m = &cxlr_pmem->mapping[i];
+ struct cxl_memdev *cxlmd = m->cxlmd;
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+
+ cxl_nvd = cxlmd->cxl_nvd;
+ nvdimm = dev_get_drvdata(&cxl_nvd->dev);
+ if (!nvdimm) {
+ dev_dbg(dev, "[%d]: %s: no nvdimm found\n", i,
+ dev_name(&cxlmd->dev));
+ rc = -ENODEV;
+ goto out_nvd;
+ }
+
+ if (cxlds->serial == 0) {
+ /* include missing alongside invalid in this error message. */
+ dev_err(dev, "%s: invalid or missing serial number\n",
+ dev_name(&cxlmd->dev));
+ rc = -ENXIO;
+ goto out_nvd;
+ }
+ info[i].serial = cxlds->serial;
+ info[i].offset = m->start;
+
+ m->cxl_nvd = cxl_nvd;
+ mappings[i] = (struct nd_mapping_desc) {
+ .nvdimm = nvdimm,
+ .start = m->start,
+ .size = m->size,
+ .position = i,
+ };
+ }
+ ndr_desc.num_mappings = cxlr_pmem->nr_mappings;
+ ndr_desc.mapping = mappings;
+
+ /*
+	 * TODO: enable CXL labels, which remove the need for an 'interleave-set cookie'
+ */
+ nd_set->cookie1 =
+ nd_fletcher64(info, sizeof(*info) * cxlr_pmem->nr_mappings, 0);
+ nd_set->cookie2 = nd_set->cookie1;
+ ndr_desc.nd_set = nd_set;
+
+ cxlr_pmem->nd_region =
+ nvdimm_pmem_region_create(cxl_nvb->nvdimm_bus, &ndr_desc);
+ if (!cxlr_pmem->nd_region) {
+ rc = -ENOMEM;
+ goto out_nvd;
+ }
+
+ rc = devm_add_action_or_reset(dev, unregister_nvdimm_region,
+ cxlr_pmem->nd_region);
+out_nvd:
+ kfree(info);
+
+ return rc;
+}
+
+static struct cxl_driver cxl_pmem_region_driver = {
+ .name = "cxl_pmem_region",
+ .probe = cxl_pmem_region_probe,
+ .id = CXL_DEVICE_PMEM_REGION,
+ .drv = {
+ .suppress_bind_attrs = true,
+ },
};
static __init int cxl_pmem_init(void)
{
int rc;
- cxl_pmem_wq = alloc_ordered_workqueue("cxl_pmem", 0);
- if (!cxl_pmem_wq)
- return -ENXIO;
+ set_bit(CXL_MEM_COMMAND_ID_SET_SHUTDOWN_STATE, exclusive_cmds);
+ set_bit(CXL_MEM_COMMAND_ID_SET_LSA, exclusive_cmds);
rc = cxl_driver_register(&cxl_nvdimm_bridge_driver);
if (rc)
- goto err_bridge;
+ return rc;
rc = cxl_driver_register(&cxl_nvdimm_driver);
if (rc)
goto err_nvdimm;
+ rc = cxl_driver_register(&cxl_pmem_region_driver);
+ if (rc)
+ goto err_region;
+
return 0;
+err_region:
+ cxl_driver_unregister(&cxl_nvdimm_driver);
err_nvdimm:
cxl_driver_unregister(&cxl_nvdimm_bridge_driver);
-err_bridge:
- destroy_workqueue(cxl_pmem_wq);
return rc;
}
static __exit void cxl_pmem_exit(void)
{
+ cxl_driver_unregister(&cxl_pmem_region_driver);
cxl_driver_unregister(&cxl_nvdimm_driver);
cxl_driver_unregister(&cxl_nvdimm_bridge_driver);
- destroy_workqueue(cxl_pmem_wq);
}
+MODULE_DESCRIPTION("CXL PMEM: Persistent Memory Support");
MODULE_LICENSE("GPL v2");
module_init(cxl_pmem_init);
module_exit(cxl_pmem_exit);
-MODULE_IMPORT_NS(CXL);
+MODULE_IMPORT_NS("CXL");
MODULE_ALIAS_CXL(CXL_DEVICE_NVDIMM_BRIDGE);
MODULE_ALIAS_CXL(CXL_DEVICE_NVDIMM);
+MODULE_ALIAS_CXL(CXL_DEVICE_PMEM_REGION);
diff --git a/drivers/cxl/pmu.h b/drivers/cxl/pmu.h
new file mode 100644
index 000000000000..b1e9bcd9f28c
--- /dev/null
+++ b/drivers/cxl/pmu.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright(c) 2023 Huawei
+ * CXL Specification rev 3.0 Section 8.2.7 (CPMU Register Interface)
+ */
+#ifndef CXL_PMU_H
+#define CXL_PMU_H
+#include <linux/device.h>
+
+enum cxl_pmu_type {
+ CXL_PMU_MEMDEV,
+};
+
+#define CXL_PMU_REGMAP_SIZE 0xe00 /* Table 8-32 CXL 3.0 specification */
+struct cxl_pmu {
+ struct device dev;
+ void __iomem *base;
+ int assoc_id;
+ int index;
+ enum cxl_pmu_type type;
+};
+
+#define to_cxl_pmu(dev) container_of(dev, struct cxl_pmu, dev)
+struct cxl_pmu_regs;
+int devm_cxl_pmu_add(struct device *parent, struct cxl_pmu_regs *regs,
+ int assoc_id, int idx, enum cxl_pmu_type type);
+
+#endif
diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c
new file mode 100644
index 000000000000..51c8f2f84717
--- /dev/null
+++ b/drivers/cxl/port.c
@@ -0,0 +1,182 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2022 Intel Corporation. All rights reserved. */
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include "cxlmem.h"
+#include "cxlpci.h"
+
+/**
+ * DOC: cxl port
+ *
+ * The port driver enumerates dports via PCI and scans for HDM
+ * (Host-managed Device Memory) decoder resources via the
+ * @component_reg_phys value passed in by the agent that registered the
+ * port. All descendant ports of a CXL root port (described by platform
+ * firmware) are managed in this driver's context. Each driver instance
+ * is responsible for tearing down the driver context of its immediate
+ * descendant ports. The locking for this is validated by
+ * CONFIG_PROVE_CXL_LOCKING.
+ *
+ * The primary service this driver provides is presenting APIs to other
+ * drivers to utilize the decoders, and indicating to userspace (via bind
+ * status) the connectivity of the CXL.mem protocol throughout the
+ * PCIe topology.
+ */
+
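+/*
+ * Illustrative note (not part of this patch): the "bind status"
+ * indication mentioned above is visible through standard driver-model
+ * sysfs links, e.g. a connected port shows
+ *
+ *	/sys/bus/cxl/devices/portN/driver -> .../bus/cxl/drivers/cxl_port
+ *
+ * while a port without CXL.mem connectivity remains unbound. Paths are
+ * illustrative.
+ */
+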
+static void schedule_detach(void *cxlmd)
+{
+ schedule_cxl_memdev_detach(cxlmd);
+}
+
+static int discover_region(struct device *dev, void *unused)
+{
+ struct cxl_endpoint_decoder *cxled;
+ int rc;
+
+ if (!is_endpoint_decoder(dev))
+ return 0;
+
+ cxled = to_cxl_endpoint_decoder(dev);
+ if ((cxled->cxld.flags & CXL_DECODER_F_ENABLE) == 0)
+ return 0;
+
+ if (cxled->state != CXL_DECODER_STATE_AUTO)
+ return 0;
+
+ /*
+ * Region enumeration is opportunistic; if this add-event fails,
+ * continue to the next endpoint decoder.
+ */
+ rc = cxl_add_to_region(cxled);
+ if (rc)
+ dev_dbg(dev, "failed to add to region: %#llx-%#llx\n",
+ cxled->cxld.hpa_range.start, cxled->cxld.hpa_range.end);
+
+ return 0;
+}
+
+static int cxl_switch_port_probe(struct cxl_port *port)
+{
+ /* Reset nr_dports for rebind of driver */
+ port->nr_dports = 0;
+
+ /* Cache the data early to ensure is_visible() works */
+ read_cdat_data(port);
+
+ return 0;
+}
+
+static int cxl_endpoint_port_probe(struct cxl_port *port)
+{
+ struct cxl_memdev *cxlmd = to_cxl_memdev(port->uport_dev);
+ int rc;
+
+ /* Cache the data early to ensure is_visible() works */
+ read_cdat_data(port);
+ cxl_endpoint_parse_cdat(port);
+
+ get_device(&cxlmd->dev);
+ rc = devm_add_action_or_reset(&port->dev, schedule_detach, cxlmd);
+ if (rc)
+ return rc;
+
+ rc = devm_cxl_endpoint_decoders_setup(port);
+ if (rc)
+ return rc;
+
+ /*
+ * Now that all endpoint decoders are successfully enumerated, try to
+ * assemble regions from committed decoders
+ */
+ device_for_each_child(&port->dev, NULL, discover_region);
+
+ return 0;
+}
+
+static int cxl_port_probe(struct device *dev)
+{
+ struct cxl_port *port = to_cxl_port(dev);
+
+ if (is_cxl_endpoint(port))
+ return cxl_endpoint_port_probe(port);
+ return cxl_switch_port_probe(port);
+}
+
+static ssize_t CDAT_read(struct file *filp, struct kobject *kobj,
+ const struct bin_attribute *bin_attr, char *buf,
+ loff_t offset, size_t count)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct cxl_port *port = to_cxl_port(dev);
+
+ if (!port->cdat_available)
+ return -ENXIO;
+
+ if (!port->cdat.table)
+ return 0;
+
+ return memory_read_from_buffer(buf, count, &offset,
+ port->cdat.table,
+ port->cdat.length);
+}
+
+static const BIN_ATTR_ADMIN_RO(CDAT, 0);
+
+static umode_t cxl_port_bin_attr_is_visible(struct kobject *kobj,
+ const struct bin_attribute *attr, int i)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct cxl_port *port = to_cxl_port(dev);
+
+ if ((attr == &bin_attr_CDAT) && port->cdat_available)
+ return attr->attr.mode;
+
+ return 0;
+}
+
+static const struct bin_attribute *const cxl_cdat_bin_attributes[] = {
+ &bin_attr_CDAT,
+ NULL,
+};
+
+static const struct attribute_group cxl_cdat_attribute_group = {
+ .bin_attrs = cxl_cdat_bin_attributes,
+ .is_bin_visible = cxl_port_bin_attr_is_visible,
+};
+
+static const struct attribute_group *cxl_port_attribute_groups[] = {
+ &cxl_cdat_attribute_group,
+ NULL,
+};
+
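+/*
+ * Illustrative usage (not part of this patch): when a CDAT was
+ * retrieved for a port or endpoint, the raw table is exported as a
+ * read-only binary attribute that tooling can dump, e.g.:
+ *
+ *	hexdump -C /sys/bus/cxl/devices/endpointN/CDAT
+ *
+ * The attribute is hidden (see cxl_port_bin_attr_is_visible()) when no
+ * CDAT is available; the device name is illustrative.
+ */
+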
+static struct cxl_driver cxl_port_driver = {
+ .name = "cxl_port",
+ .probe = cxl_port_probe,
+ .id = CXL_DEVICE_PORT,
+ .drv = {
+ .dev_groups = cxl_port_attribute_groups,
+ },
+};
+
+static int __init cxl_port_init(void)
+{
+ return cxl_driver_register(&cxl_port_driver);
+}
+/*
+ * Be ready to immediately enable ports emitted by the platform CXL root
+ * (e.g. cxl_acpi) when CONFIG_CXL_PORT=y.
+ */
+subsys_initcall(cxl_port_init);
+
+static void __exit cxl_port_exit(void)
+{
+ cxl_driver_unregister(&cxl_port_driver);
+}
+module_exit(cxl_port_exit);
+
+MODULE_DESCRIPTION("CXL: Port enumeration and services");
+MODULE_LICENSE("GPL v2");
+MODULE_IMPORT_NS("CXL");
+MODULE_ALIAS_CXL(CXL_DEVICE_PORT);
diff --git a/drivers/cxl/security.c b/drivers/cxl/security.c
new file mode 100644
index 000000000000..ab793e8577c7
--- /dev/null
+++ b/drivers/cxl/security.c
@@ -0,0 +1,206 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2022 Intel Corporation. All rights reserved. */
+#include <linux/libnvdimm.h>
+#include <linux/unaligned.h>
+#include <linux/module.h>
+#include <linux/async.h>
+#include <linux/slab.h>
+#include <linux/memregion.h>
+#include "cxlmem.h"
+#include "cxl.h"
+
+static unsigned long cxl_pmem_get_security_flags(struct nvdimm *nvdimm,
+ enum nvdimm_passphrase_type ptype)
+{
+ struct cxl_nvdimm *cxl_nvd = nvdimm_provider_data(nvdimm);
+ struct cxl_memdev *cxlmd = cxl_nvd->cxlmd;
+ struct cxl_mailbox *cxl_mbox = &cxlmd->cxlds->cxl_mbox;
+ struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds);
+ unsigned long security_flags = 0;
+ struct cxl_get_security_output {
+ __le32 flags;
+ } out;
+ struct cxl_mbox_cmd mbox_cmd;
+ u32 sec_out;
+ int rc;
+
+ mbox_cmd = (struct cxl_mbox_cmd) {
+ .opcode = CXL_MBOX_OP_GET_SECURITY_STATE,
+ .size_out = sizeof(out),
+ .payload_out = &out,
+ };
+
+ rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
+ if (rc < 0)
+ return 0;
+
+ sec_out = le32_to_cpu(out.flags);
+ /* cache security state */
+ mds->security.state = sec_out;
+
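+ /* Translate the device security state (sec_out) into LIBNVDIMM flags */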
+ if (ptype == NVDIMM_MASTER) {
+ if (sec_out & CXL_PMEM_SEC_STATE_MASTER_PASS_SET)
+ set_bit(NVDIMM_SECURITY_UNLOCKED, &security_flags);
+ else
+ set_bit(NVDIMM_SECURITY_DISABLED, &security_flags);
+ if (sec_out & CXL_PMEM_SEC_STATE_MASTER_PLIMIT)
+ set_bit(NVDIMM_SECURITY_FROZEN, &security_flags);
+ return security_flags;
+ }
+
+ if (sec_out & CXL_PMEM_SEC_STATE_USER_PASS_SET) {
+ if (sec_out & CXL_PMEM_SEC_STATE_FROZEN ||
+ sec_out & CXL_PMEM_SEC_STATE_USER_PLIMIT)
+ set_bit(NVDIMM_SECURITY_FROZEN, &security_flags);
+
+ if (sec_out & CXL_PMEM_SEC_STATE_LOCKED)
+ set_bit(NVDIMM_SECURITY_LOCKED, &security_flags);
+ else
+ set_bit(NVDIMM_SECURITY_UNLOCKED, &security_flags);
+ } else {
+ set_bit(NVDIMM_SECURITY_DISABLED, &security_flags);
+ }
+
+ return security_flags;
+}
+
+static int cxl_pmem_security_change_key(struct nvdimm *nvdimm,
+ const struct nvdimm_key_data *old_data,
+ const struct nvdimm_key_data *new_data,
+ enum nvdimm_passphrase_type ptype)
+{
+ struct cxl_nvdimm *cxl_nvd = nvdimm_provider_data(nvdimm);
+ struct cxl_memdev *cxlmd = cxl_nvd->cxlmd;
+ struct cxl_mailbox *cxl_mbox = &cxlmd->cxlds->cxl_mbox;
+ struct cxl_mbox_cmd mbox_cmd;
+ struct cxl_set_pass set_pass;
+
+ set_pass = (struct cxl_set_pass) {
+ .type = ptype == NVDIMM_MASTER ? CXL_PMEM_SEC_PASS_MASTER :
+ CXL_PMEM_SEC_PASS_USER,
+ };
+ memcpy(set_pass.old_pass, old_data->data, NVDIMM_PASSPHRASE_LEN);
+ memcpy(set_pass.new_pass, new_data->data, NVDIMM_PASSPHRASE_LEN);
+
+ mbox_cmd = (struct cxl_mbox_cmd) {
+ .opcode = CXL_MBOX_OP_SET_PASSPHRASE,
+ .size_in = sizeof(set_pass),
+ .payload_in = &set_pass,
+ };
+
+ return cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
+}
+
+static int __cxl_pmem_security_disable(struct nvdimm *nvdimm,
+ const struct nvdimm_key_data *key_data,
+ enum nvdimm_passphrase_type ptype)
+{
+ struct cxl_nvdimm *cxl_nvd = nvdimm_provider_data(nvdimm);
+ struct cxl_memdev *cxlmd = cxl_nvd->cxlmd;
+ struct cxl_mailbox *cxl_mbox = &cxlmd->cxlds->cxl_mbox;
+ struct cxl_disable_pass dis_pass;
+ struct cxl_mbox_cmd mbox_cmd;
+
+ dis_pass = (struct cxl_disable_pass) {
+ .type = ptype == NVDIMM_MASTER ? CXL_PMEM_SEC_PASS_MASTER :
+ CXL_PMEM_SEC_PASS_USER,
+ };
+ memcpy(dis_pass.pass, key_data->data, NVDIMM_PASSPHRASE_LEN);
+
+ mbox_cmd = (struct cxl_mbox_cmd) {
+ .opcode = CXL_MBOX_OP_DISABLE_PASSPHRASE,
+ .size_in = sizeof(dis_pass),
+ .payload_in = &dis_pass,
+ };
+
+ return cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
+}
+
+static int cxl_pmem_security_disable(struct nvdimm *nvdimm,
+ const struct nvdimm_key_data *key_data)
+{
+ return __cxl_pmem_security_disable(nvdimm, key_data, NVDIMM_USER);
+}
+
+static int cxl_pmem_security_disable_master(struct nvdimm *nvdimm,
+ const struct nvdimm_key_data *key_data)
+{
+ return __cxl_pmem_security_disable(nvdimm, key_data, NVDIMM_MASTER);
+}
+
+static int cxl_pmem_security_freeze(struct nvdimm *nvdimm)
+{
+ struct cxl_nvdimm *cxl_nvd = nvdimm_provider_data(nvdimm);
+ struct cxl_memdev *cxlmd = cxl_nvd->cxlmd;
+ struct cxl_mailbox *cxl_mbox = &cxlmd->cxlds->cxl_mbox;
+ struct cxl_mbox_cmd mbox_cmd = {
+ .opcode = CXL_MBOX_OP_FREEZE_SECURITY,
+ };
+
+ return cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
+}
+
+static int cxl_pmem_security_unlock(struct nvdimm *nvdimm,
+ const struct nvdimm_key_data *key_data)
+{
+ struct cxl_nvdimm *cxl_nvd = nvdimm_provider_data(nvdimm);
+ struct cxl_memdev *cxlmd = cxl_nvd->cxlmd;
+ struct cxl_mailbox *cxl_mbox = &cxlmd->cxlds->cxl_mbox;
+ u8 pass[NVDIMM_PASSPHRASE_LEN];
+ struct cxl_mbox_cmd mbox_cmd;
+ int rc;
+
+ memcpy(pass, key_data->data, NVDIMM_PASSPHRASE_LEN);
+ mbox_cmd = (struct cxl_mbox_cmd) {
+ .opcode = CXL_MBOX_OP_UNLOCK,
+ .size_in = NVDIMM_PASSPHRASE_LEN,
+ .payload_in = pass,
+ };
+
+ rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
+ if (rc < 0)
+ return rc;
+
+ return 0;
+}
+
+static int cxl_pmem_security_passphrase_erase(struct nvdimm *nvdimm,
+ const struct nvdimm_key_data *key,
+ enum nvdimm_passphrase_type ptype)
+{
+ struct cxl_nvdimm *cxl_nvd = nvdimm_provider_data(nvdimm);
+ struct cxl_memdev *cxlmd = cxl_nvd->cxlmd;
+ struct cxl_mailbox *cxl_mbox = &cxlmd->cxlds->cxl_mbox;
+ struct cxl_mbox_cmd mbox_cmd;
+ struct cxl_pass_erase erase;
+ int rc;
+
+ erase = (struct cxl_pass_erase) {
+ .type = ptype == NVDIMM_MASTER ? CXL_PMEM_SEC_PASS_MASTER :
+ CXL_PMEM_SEC_PASS_USER,
+ };
+ memcpy(erase.pass, key->data, NVDIMM_PASSPHRASE_LEN);
+ mbox_cmd = (struct cxl_mbox_cmd) {
+ .opcode = CXL_MBOX_OP_PASSPHRASE_SECURE_ERASE,
+ .size_in = sizeof(erase),
+ .payload_in = &erase,
+ };
+
+ rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
+ if (rc < 0)
+ return rc;
+
+ return 0;
+}
+
+static const struct nvdimm_security_ops __cxl_security_ops = {
+ .get_flags = cxl_pmem_get_security_flags,
+ .change_key = cxl_pmem_security_change_key,
+ .disable = cxl_pmem_security_disable,
+ .freeze = cxl_pmem_security_freeze,
+ .unlock = cxl_pmem_security_unlock,
+ .erase = cxl_pmem_security_passphrase_erase,
+ .disable_master = cxl_pmem_security_disable_master,
+};
+
+const struct nvdimm_security_ops *cxl_security_ops = &__cxl_security_ops;
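+
+/*
+ * Illustrative wiring (not part of this file): cxl_nvdimm_probe() in
+ * drivers/cxl/pmem.c hands this table to the nvdimm core, roughly
+ *
+ *	nvdimm = __nvdimm_create(cxl_nvb->nvdimm_bus, cxl_nvd, ...,
+ *				 cxl_security_ops, NULL);
+ *
+ * so the LIBNVDIMM security sysfs interface drives the mailbox commands
+ * implemented above (argument list abbreviated).
+ */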