Diffstat (limited to 'drivers/cxl/core')
-rw-r--r--  drivers/cxl/core/Makefile    |   22
-rw-r--r--  drivers/cxl/core/cdat.c      | 1074
-rw-r--r--  drivers/cxl/core/core.h      |  169
-rw-r--r--  drivers/cxl/core/edac.c      | 2109
-rw-r--r--  drivers/cxl/core/features.c  |  706
-rw-r--r--  drivers/cxl/core/hdm.c       | 1287
-rw-r--r--  drivers/cxl/core/mbox.c      | 1552
-rw-r--r--  drivers/cxl/core/mce.c       |   65
-rw-r--r--  drivers/cxl/core/mce.h       |   20
-rw-r--r--  drivers/cxl/core/memdev.c    | 1161
-rw-r--r--  drivers/cxl/core/pci.c       | 1189
-rw-r--r--  drivers/cxl/core/pmem.c      |  287
-rw-r--r--  drivers/cxl/core/pmu.c       |   68
-rw-r--r--  drivers/cxl/core/port.c      | 2532
-rw-r--r--  drivers/cxl/core/ras.c       |  126
-rw-r--r--  drivers/cxl/core/region.c    | 4003
-rw-r--r--  drivers/cxl/core/regs.c      |  644
-rw-r--r--  drivers/cxl/core/suspend.c   |   24
-rw-r--r--  drivers/cxl/core/trace.c     |    8
-rw-r--r--  drivers/cxl/core/trace.h     | 1105
20 files changed, 18151 insertions, 0 deletions
diff --git a/drivers/cxl/core/Makefile b/drivers/cxl/core/Makefile new file mode 100644 index 000000000000..5ad8fef210b5 --- /dev/null +++ b/drivers/cxl/core/Makefile @@ -0,0 +1,22 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_CXL_BUS) += cxl_core.o +obj-$(CONFIG_CXL_SUSPEND) += suspend.o + +ccflags-y += -I$(srctree)/drivers/cxl +CFLAGS_trace.o = -DTRACE_INCLUDE_PATH=. -I$(src) + +cxl_core-y := port.o +cxl_core-y += pmem.o +cxl_core-y += regs.o +cxl_core-y += memdev.o +cxl_core-y += mbox.o +cxl_core-y += pci.o +cxl_core-y += hdm.o +cxl_core-y += pmu.o +cxl_core-y += cdat.o +cxl_core-y += ras.o +cxl_core-$(CONFIG_TRACING) += trace.o +cxl_core-$(CONFIG_CXL_REGION) += region.o +cxl_core-$(CONFIG_CXL_MCE) += mce.o +cxl_core-$(CONFIG_CXL_FEATURES) += features.o +cxl_core-$(CONFIG_CXL_EDAC_MEM_FEATURES) += edac.o diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c new file mode 100644 index 000000000000..7120b5f2e31f --- /dev/null +++ b/drivers/cxl/core/cdat.c @@ -0,0 +1,1074 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2023 Intel Corporation. All rights reserved. */ +#include <linux/acpi.h> +#include <linux/xarray.h> +#include <linux/fw_table.h> +#include <linux/node.h> +#include <linux/overflow.h> +#include "cxlpci.h" +#include "cxlmem.h" +#include "core.h" +#include "cxl.h" + +struct dsmas_entry { + struct range dpa_range; + u8 handle; + struct access_coordinate coord[ACCESS_COORDINATE_MAX]; + struct access_coordinate cdat_coord[ACCESS_COORDINATE_MAX]; + int entries; + int qos_class; +}; + +static u32 cdat_normalize(u16 entry, u64 base, u8 type) +{ + u32 value; + + /* + * Check for invalid and overflow values + */ + if (entry == 0xffff || !entry) + return 0; + if (base > (UINT_MAX / (entry))) + return 0; + + /* + * CDAT fields follow the format of HMAT fields. See table 5 Device + * Scoped Latency and Bandwidth Information Structure in Coherent Device + * Attribute Table (CDAT) Specification v1.01. 
+ */ + value = entry * base; + switch (type) { + case ACPI_HMAT_ACCESS_LATENCY: + case ACPI_HMAT_READ_LATENCY: + case ACPI_HMAT_WRITE_LATENCY: + value = DIV_ROUND_UP(value, 1000); + break; + default: + break; + } + return value; +} + +static int cdat_dsmas_handler(union acpi_subtable_headers *header, void *arg, + const unsigned long end) +{ + struct acpi_cdat_header *hdr = &header->cdat; + struct acpi_cdat_dsmas *dsmas; + int size = sizeof(*hdr) + sizeof(*dsmas); + struct xarray *dsmas_xa = arg; + struct dsmas_entry *dent; + u16 len; + int rc; + + len = le16_to_cpu((__force __le16)hdr->length); + if (len != size || (unsigned long)hdr + len > end) { + pr_warn("Malformed DSMAS table length: (%u:%u)\n", size, len); + return -EINVAL; + } + + /* Skip common header */ + dsmas = (struct acpi_cdat_dsmas *)(hdr + 1); + + dent = kzalloc(sizeof(*dent), GFP_KERNEL); + if (!dent) + return -ENOMEM; + + dent->handle = dsmas->dsmad_handle; + dent->dpa_range.start = le64_to_cpu((__force __le64)dsmas->dpa_base_address); + dent->dpa_range.end = le64_to_cpu((__force __le64)dsmas->dpa_base_address) + + le64_to_cpu((__force __le64)dsmas->dpa_length) - 1; + + rc = xa_insert(dsmas_xa, dent->handle, dent, GFP_KERNEL); + if (rc) { + kfree(dent); + return rc; + } + + return 0; +} + +static void __cxl_access_coordinate_set(struct access_coordinate *coord, + int access, unsigned int val) +{ + switch (access) { + case ACPI_HMAT_ACCESS_LATENCY: + coord->read_latency = val; + coord->write_latency = val; + break; + case ACPI_HMAT_READ_LATENCY: + coord->read_latency = val; + break; + case ACPI_HMAT_WRITE_LATENCY: + coord->write_latency = val; + break; + case ACPI_HMAT_ACCESS_BANDWIDTH: + coord->read_bandwidth = val; + coord->write_bandwidth = val; + break; + case ACPI_HMAT_READ_BANDWIDTH: + coord->read_bandwidth = val; + break; + case ACPI_HMAT_WRITE_BANDWIDTH: + coord->write_bandwidth = val; + break; + } +} + +static void cxl_access_coordinate_set(struct access_coordinate *coord, + int access, unsigned int val) +{ + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) + __cxl_access_coordinate_set(&coord[i], access, val); +} + +static int cdat_dslbis_handler(union acpi_subtable_headers *header, void *arg, + const unsigned long end) +{ + struct acpi_cdat_header *hdr = &header->cdat; + struct acpi_cdat_dslbis *dslbis; + int size = sizeof(*hdr) + sizeof(*dslbis); + struct xarray *dsmas_xa = arg; + struct dsmas_entry *dent; + __le64 le_base; + __le16 le_val; + u64 val; + u16 len; + + len = le16_to_cpu((__force __le16)hdr->length); + if (len != size || (unsigned long)hdr + len > end) { + pr_warn("Malformed DSLBIS table length: (%u:%u)\n", size, len); + return -EINVAL; + } + + /* Skip common header */ + dslbis = (struct acpi_cdat_dslbis *)(hdr + 1); + + /* Skip unrecognized data type */ + if (dslbis->data_type > ACPI_HMAT_WRITE_BANDWIDTH) + return 0; + + /* Not a memory type, skip */ + if ((dslbis->flags & ACPI_HMAT_MEMORY_HIERARCHY) != ACPI_HMAT_MEMORY) + return 0; + + dent = xa_load(dsmas_xa, dslbis->handle); + if (!dent) { + pr_warn("No matching DSMAS entry for DSLBIS entry.\n"); + return 0; + } + + le_base = (__force __le64)dslbis->entry_base_unit; + le_val = (__force __le16)dslbis->entry[0]; + val = cdat_normalize(le16_to_cpu(le_val), le64_to_cpu(le_base), + dslbis->data_type); + + cxl_access_coordinate_set(dent->cdat_coord, dslbis->data_type, val); + + return 0; +} + +static int cdat_table_parse_output(int rc) +{ + if (rc < 0) + return rc; + if (rc == 0) + return -ENOENT; + + return 0; +} + +static int 
cxl_cdat_endpoint_process(struct cxl_port *port, + struct xarray *dsmas_xa) +{ + int rc; + + rc = cdat_table_parse(ACPI_CDAT_TYPE_DSMAS, cdat_dsmas_handler, + dsmas_xa, port->cdat.table, port->cdat.length); + rc = cdat_table_parse_output(rc); + if (rc) + return rc; + + rc = cdat_table_parse(ACPI_CDAT_TYPE_DSLBIS, cdat_dslbis_handler, + dsmas_xa, port->cdat.table, port->cdat.length); + return cdat_table_parse_output(rc); +} + +static int cxl_port_perf_data_calculate(struct cxl_port *port, + struct xarray *dsmas_xa) +{ + struct access_coordinate ep_c[ACCESS_COORDINATE_MAX]; + struct dsmas_entry *dent; + int valid_entries = 0; + unsigned long index; + int rc; + + rc = cxl_endpoint_get_perf_coordinates(port, ep_c); + if (rc) { + dev_dbg(&port->dev, "Failed to retrieve ep perf coordinates.\n"); + return rc; + } + + struct cxl_root *cxl_root __free(put_cxl_root) = find_cxl_root(port); + + if (!cxl_root) + return -ENODEV; + + if (!cxl_root->ops || !cxl_root->ops->qos_class) + return -EOPNOTSUPP; + + xa_for_each(dsmas_xa, index, dent) { + int qos_class; + + cxl_coordinates_combine(dent->coord, dent->cdat_coord, ep_c); + dent->entries = 1; + rc = cxl_root->ops->qos_class(cxl_root, + &dent->coord[ACCESS_COORDINATE_CPU], + 1, &qos_class); + if (rc != 1) + continue; + + valid_entries++; + dent->qos_class = qos_class; + } + + if (!valid_entries) + return -ENOENT; + + return 0; +} + +static void update_perf_entry(struct device *dev, struct dsmas_entry *dent, + struct cxl_dpa_perf *dpa_perf) +{ + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) { + dpa_perf->coord[i] = dent->coord[i]; + dpa_perf->cdat_coord[i] = dent->cdat_coord[i]; + } + dpa_perf->dpa_range = dent->dpa_range; + dpa_perf->qos_class = dent->qos_class; + dev_dbg(dev, + "DSMAS: dpa: %pra qos: %d read_bw: %d write_bw %d read_lat: %d write_lat: %d\n", + &dent->dpa_range, dpa_perf->qos_class, + dent->coord[ACCESS_COORDINATE_CPU].read_bandwidth, + dent->coord[ACCESS_COORDINATE_CPU].write_bandwidth, + dent->coord[ACCESS_COORDINATE_CPU].read_latency, + dent->coord[ACCESS_COORDINATE_CPU].write_latency); +} + +static void cxl_memdev_set_qos_class(struct cxl_dev_state *cxlds, + struct xarray *dsmas_xa) +{ + struct device *dev = cxlds->dev; + struct dsmas_entry *dent; + unsigned long index; + + xa_for_each(dsmas_xa, index, dent) { + bool found = false; + + for (int i = 0; i < cxlds->nr_partitions; i++) { + struct resource *res = &cxlds->part[i].res; + struct range range = { + .start = res->start, + .end = res->end, + }; + + if (range_contains(&range, &dent->dpa_range)) { + update_perf_entry(dev, dent, + &cxlds->part[i].perf); + found = true; + break; + } + } + + if (!found) + dev_dbg(dev, "no partition for dsmas dpa: %pra\n", + &dent->dpa_range); + } +} + +static int match_cxlrd_qos_class(struct device *dev, void *data) +{ + int dev_qos_class = *(int *)data; + struct cxl_root_decoder *cxlrd; + + if (!is_root_decoder(dev)) + return 0; + + cxlrd = to_cxl_root_decoder(dev); + if (cxlrd->qos_class == CXL_QOS_CLASS_INVALID) + return 0; + + if (cxlrd->qos_class == dev_qos_class) + return 1; + + return 0; +} + +static void reset_dpa_perf(struct cxl_dpa_perf *dpa_perf) +{ + *dpa_perf = (struct cxl_dpa_perf) { + .qos_class = CXL_QOS_CLASS_INVALID, + }; +} + +static bool cxl_qos_match(struct cxl_port *root_port, + struct cxl_dpa_perf *dpa_perf) +{ + if (dpa_perf->qos_class == CXL_QOS_CLASS_INVALID) + return false; + + if (!device_for_each_child(&root_port->dev, &dpa_perf->qos_class, + match_cxlrd_qos_class)) + return false; + + return true; +} + +static int 
match_cxlrd_hb(struct device *dev, void *data) +{ + struct device *host_bridge = data; + struct cxl_switch_decoder *cxlsd; + struct cxl_root_decoder *cxlrd; + + if (!is_root_decoder(dev)) + return 0; + + cxlrd = to_cxl_root_decoder(dev); + cxlsd = &cxlrd->cxlsd; + + guard(rwsem_read)(&cxl_rwsem.region); + for (int i = 0; i < cxlsd->nr_targets; i++) { + if (cxlsd->target[i] && host_bridge == cxlsd->target[i]->dport_dev) + return 1; + } + + return 0; +} + +static void cxl_qos_class_verify(struct cxl_memdev *cxlmd) +{ + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct cxl_port *root_port; + + struct cxl_root *cxl_root __free(put_cxl_root) = + find_cxl_root(cxlmd->endpoint); + + /* + * No need to reset_dpa_perf() here as find_cxl_root() is guaranteed to + * succeed when called in the cxl_endpoint_port_probe() path. + */ + if (!cxl_root) + return; + + root_port = &cxl_root->port; + + /* + * Save userspace from needing to check if a qos class has any matches + * by hiding qos class info if the memdev is not mapped by a root + * decoder, or the partition class does not match any root decoder + * class. + */ + if (!device_for_each_child(&root_port->dev, + cxlmd->endpoint->host_bridge, + match_cxlrd_hb)) { + for (int i = 0; i < cxlds->nr_partitions; i++) { + struct cxl_dpa_perf *perf = &cxlds->part[i].perf; + + reset_dpa_perf(perf); + } + return; + } + + for (int i = 0; i < cxlds->nr_partitions; i++) { + struct cxl_dpa_perf *perf = &cxlds->part[i].perf; + + if (!cxl_qos_match(root_port, perf)) + reset_dpa_perf(perf); + } +} + +static void discard_dsmas(struct xarray *xa) +{ + unsigned long index; + void *ent; + + xa_for_each(xa, index, ent) { + xa_erase(xa, index); + kfree(ent); + } + xa_destroy(xa); +} +DEFINE_FREE(dsmas, struct xarray *, if (_T) discard_dsmas(_T)) + +void cxl_endpoint_parse_cdat(struct cxl_port *port) +{ + struct cxl_memdev *cxlmd = to_cxl_memdev(port->uport_dev); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct xarray __dsmas_xa; + struct xarray *dsmas_xa __free(dsmas) = &__dsmas_xa; + int rc; + + xa_init(&__dsmas_xa); + if (!port->cdat.table) + return; + + rc = cxl_cdat_endpoint_process(port, dsmas_xa); + if (rc < 0) { + dev_dbg(&port->dev, "Failed to parse CDAT: %d\n", rc); + return; + } + + rc = cxl_port_perf_data_calculate(port, dsmas_xa); + if (rc) { + dev_dbg(&port->dev, "Failed to do perf coord calculations.\n"); + return; + } + + cxl_memdev_set_qos_class(cxlds, dsmas_xa); + cxl_qos_class_verify(cxlmd); + cxl_memdev_update_perf(cxlmd); +} +EXPORT_SYMBOL_NS_GPL(cxl_endpoint_parse_cdat, "CXL"); + +static int cdat_sslbis_handler(union acpi_subtable_headers *header, void *arg, + const unsigned long end) +{ + struct acpi_cdat_sslbis_table { + struct acpi_cdat_header header; + struct acpi_cdat_sslbis sslbis_header; + struct acpi_cdat_sslbe entries[]; + } *tbl = (struct acpi_cdat_sslbis_table *)header; + int size = sizeof(header->cdat) + sizeof(tbl->sslbis_header); + struct acpi_cdat_sslbis *sslbis; + struct cxl_dport *dport = arg; + struct device *dev = &dport->port->dev; + int remain, entries, i; + u16 len; + + len = le16_to_cpu((__force __le16)header->cdat.length); + remain = len - size; + if (!remain || remain % sizeof(tbl->entries[0]) || + (unsigned long)header + len > end) { + dev_warn(dev, "Malformed SSLBIS table length: (%u)\n", len); + return -EINVAL; + } + + sslbis = &tbl->sslbis_header; + /* Unrecognized data type, we can skip */ + if (sslbis->data_type > ACPI_HMAT_WRITE_BANDWIDTH) + return 0; + + entries = remain / sizeof(tbl->entries[0]); + if 
(struct_size(tbl, entries, entries) != len) + return -EINVAL; + + for (i = 0; i < entries; i++) { + u16 x = le16_to_cpu((__force __le16)tbl->entries[i].portx_id); + u16 y = le16_to_cpu((__force __le16)tbl->entries[i].porty_id); + __le64 le_base; + __le16 le_val; + u16 dsp_id; + u64 val; + + switch (x) { + case ACPI_CDAT_SSLBIS_US_PORT: + dsp_id = y; + break; + case ACPI_CDAT_SSLBIS_ANY_PORT: + switch (y) { + case ACPI_CDAT_SSLBIS_US_PORT: + dsp_id = x; + break; + case ACPI_CDAT_SSLBIS_ANY_PORT: + dsp_id = ACPI_CDAT_SSLBIS_ANY_PORT; + break; + default: + dsp_id = y; + break; + } + break; + default: + dsp_id = x; + break; + } + + le_base = (__force __le64)tbl->sslbis_header.entry_base_unit; + le_val = (__force __le16)tbl->entries[i].latency_or_bandwidth; + val = cdat_normalize(le16_to_cpu(le_val), le64_to_cpu(le_base), + sslbis->data_type); + + if (dsp_id == ACPI_CDAT_SSLBIS_ANY_PORT || + dsp_id == dport->port_id) { + cxl_access_coordinate_set(dport->coord, + sslbis->data_type, val); + return 0; + } + } + + return 0; +} + +void cxl_switch_parse_cdat(struct cxl_dport *dport) +{ + struct cxl_port *port = dport->port; + int rc; + + if (!port->cdat.table) + return; + + rc = cdat_table_parse(ACPI_CDAT_TYPE_SSLBIS, cdat_sslbis_handler, + dport, port->cdat.table, port->cdat.length); + rc = cdat_table_parse_output(rc); + if (rc) + dev_dbg(&port->dev, "Failed to parse SSLBIS: %d\n", rc); +} +EXPORT_SYMBOL_NS_GPL(cxl_switch_parse_cdat, "CXL"); + +static void __cxl_coordinates_combine(struct access_coordinate *out, + struct access_coordinate *c1, + struct access_coordinate *c2) +{ + if (c1->write_bandwidth && c2->write_bandwidth) + out->write_bandwidth = min(c1->write_bandwidth, + c2->write_bandwidth); + out->write_latency = c1->write_latency + c2->write_latency; + + if (c1->read_bandwidth && c2->read_bandwidth) + out->read_bandwidth = min(c1->read_bandwidth, + c2->read_bandwidth); + out->read_latency = c1->read_latency + c2->read_latency; +} + +/** + * cxl_coordinates_combine - Combine the two input coordinates + * + * @out: Output coordinate of c1 and c2 combined + * @c1: input coordinates + * @c2: input coordinates + */ +void cxl_coordinates_combine(struct access_coordinate *out, + struct access_coordinate *c1, + struct access_coordinate *c2) +{ + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) + __cxl_coordinates_combine(&out[i], &c1[i], &c2[i]); +} + +MODULE_IMPORT_NS("CXL"); + +static void cxl_bandwidth_add(struct access_coordinate *coord, + struct access_coordinate *c1, + struct access_coordinate *c2) +{ + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) { + coord[i].read_bandwidth = c1[i].read_bandwidth + + c2[i].read_bandwidth; + coord[i].write_bandwidth = c1[i].write_bandwidth + + c2[i].write_bandwidth; + } +} + +static bool dpa_perf_contains(struct cxl_dpa_perf *perf, + struct resource *dpa_res) +{ + struct range dpa = { + .start = dpa_res->start, + .end = dpa_res->end, + }; + + return range_contains(&perf->dpa_range, &dpa); +} + +static struct cxl_dpa_perf *cxled_get_dpa_perf(struct cxl_endpoint_decoder *cxled) +{ + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct cxl_dpa_perf *perf; + + if (cxled->part < 0) + return ERR_PTR(-EINVAL); + perf = &cxlds->part[cxled->part].perf; + + if (!perf) + return ERR_PTR(-EINVAL); + + if (!dpa_perf_contains(perf, cxled->dpa_res)) + return ERR_PTR(-EINVAL); + + return perf; +} + +/* + * Transient context for containing the current calculation of bandwidth when + * doing walking the port hierarchy to 
deal with shared upstream link. + */ +struct cxl_perf_ctx { + struct access_coordinate coord[ACCESS_COORDINATE_MAX]; + struct cxl_port *port; +}; + +/** + * cxl_endpoint_gather_bandwidth - collect all the endpoint bandwidth in an xarray + * @cxlr: CXL region for the bandwidth calculation + * @cxled: endpoint decoder to start on + * @usp_xa: (output) the xarray that collects all the bandwidth coordinates + * indexed by the upstream device with data of 'struct cxl_perf_ctx'. + * @gp_is_root: (output) bool of whether the grandparent is cxl root. + * + * Return: 0 for success or -errno + * + * Collects aggregated endpoint bandwidth and store the bandwidth in + * an xarray indexed by the upstream device of the switch or the RP + * device. Each endpoint consists the minimum of the bandwidth from DSLBIS + * from the endpoint CDAT, the endpoint upstream link bandwidth, and the + * bandwidth from the SSLBIS of the switch CDAT for the switch upstream port to + * the downstream port that's associated with the endpoint. If the + * device is directly connected to a RP, then no SSLBIS is involved. + */ +static int cxl_endpoint_gather_bandwidth(struct cxl_region *cxlr, + struct cxl_endpoint_decoder *cxled, + struct xarray *usp_xa, + bool *gp_is_root) +{ + struct cxl_port *endpoint = to_cxl_port(cxled->cxld.dev.parent); + struct cxl_port *parent_port = to_cxl_port(endpoint->dev.parent); + struct cxl_port *gp_port = to_cxl_port(parent_port->dev.parent); + struct access_coordinate pci_coord[ACCESS_COORDINATE_MAX]; + struct access_coordinate sw_coord[ACCESS_COORDINATE_MAX]; + struct access_coordinate ep_coord[ACCESS_COORDINATE_MAX]; + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct pci_dev *pdev = to_pci_dev(cxlds->dev); + struct cxl_perf_ctx *perf_ctx; + struct cxl_dpa_perf *perf; + unsigned long index; + void *ptr; + int rc; + + if (!dev_is_pci(cxlds->dev)) + return -ENODEV; + + if (cxlds->rcd) + return -ENODEV; + + perf = cxled_get_dpa_perf(cxled); + if (IS_ERR(perf)) + return PTR_ERR(perf); + + *gp_is_root = is_cxl_root(gp_port); + + /* + * If the grandparent is cxl root, then index is the root port, + * otherwise it's the parent switch upstream device. + */ + if (*gp_is_root) + index = (unsigned long)endpoint->parent_dport->dport_dev; + else + index = (unsigned long)parent_port->uport_dev; + + perf_ctx = xa_load(usp_xa, index); + if (!perf_ctx) { + struct cxl_perf_ctx *c __free(kfree) = + kzalloc(sizeof(*perf_ctx), GFP_KERNEL); + + if (!c) + return -ENOMEM; + ptr = xa_store(usp_xa, index, c, GFP_KERNEL); + if (xa_is_err(ptr)) + return xa_err(ptr); + perf_ctx = no_free_ptr(c); + perf_ctx->port = parent_port; + } + + /* Direct upstream link from EP bandwidth */ + rc = cxl_pci_get_bandwidth(pdev, pci_coord); + if (rc < 0) + return rc; + + /* + * Min of upstream link bandwidth and Endpoint CDAT bandwidth from + * DSLBIS. + */ + cxl_coordinates_combine(ep_coord, pci_coord, perf->cdat_coord); + + /* + * If grandparent port is root, then there's no switch involved and + * the endpoint is connected to a root port. + */ + if (!*gp_is_root) { + /* + * Retrieve the switch SSLBIS for switch downstream port + * associated with the endpoint bandwidth. 
+ */ + rc = cxl_port_get_switch_dport_bandwidth(endpoint, sw_coord); + if (rc) + return rc; + + /* + * Min of the earlier coordinates with the switch SSLBIS + * bandwidth + */ + cxl_coordinates_combine(ep_coord, ep_coord, sw_coord); + } + + /* + * Aggregate the computed bandwidth with the current aggregated bandwidth + * of the endpoints with the same switch upstream device or RP. + */ + cxl_bandwidth_add(perf_ctx->coord, perf_ctx->coord, ep_coord); + + return 0; +} + +static void free_perf_xa(struct xarray *xa) +{ + struct cxl_perf_ctx *ctx; + unsigned long index; + + if (!xa) + return; + + xa_for_each(xa, index, ctx) + kfree(ctx); + xa_destroy(xa); + kfree(xa); +} +DEFINE_FREE(free_perf_xa, struct xarray *, if (_T) free_perf_xa(_T)) + +/** + * cxl_switch_gather_bandwidth - collect all the bandwidth at switch level in an xarray + * @cxlr: The region being operated on + * @input_xa: xarray indexed by upstream device of a switch with data of 'struct + * cxl_perf_ctx' + * @gp_is_root: (output) bool of whether the grandparent is cxl root. + * + * Return: a xarray of resulting cxl_perf_ctx per parent switch or root port + * or ERR_PTR(-errno) + * + * Iterate through the xarray. Take the minimum of the downstream calculated + * bandwidth, the upstream link bandwidth, and the SSLBIS of the upstream + * switch if exists. Sum the resulting bandwidth under the switch upstream + * device or a RP device. The function can be iterated over multiple switches + * if the switches are present. + */ +static struct xarray *cxl_switch_gather_bandwidth(struct cxl_region *cxlr, + struct xarray *input_xa, + bool *gp_is_root) +{ + struct xarray *res_xa __free(free_perf_xa) = + kzalloc(sizeof(*res_xa), GFP_KERNEL); + struct access_coordinate coords[ACCESS_COORDINATE_MAX]; + struct cxl_perf_ctx *ctx, *us_ctx; + unsigned long index, us_index; + int dev_count = 0; + int gp_count = 0; + void *ptr; + int rc; + + if (!res_xa) + return ERR_PTR(-ENOMEM); + xa_init(res_xa); + + xa_for_each(input_xa, index, ctx) { + struct device *dev = (struct device *)index; + struct cxl_port *port = ctx->port; + struct cxl_port *parent_port = to_cxl_port(port->dev.parent); + struct cxl_port *gp_port = to_cxl_port(parent_port->dev.parent); + struct cxl_dport *dport = port->parent_dport; + bool is_root = false; + + dev_count++; + if (is_cxl_root(gp_port)) { + is_root = true; + gp_count++; + } + + /* + * If the grandparent is cxl root, then index is the root port, + * otherwise it's the parent switch upstream device. + */ + if (is_root) + us_index = (unsigned long)port->parent_dport->dport_dev; + else + us_index = (unsigned long)parent_port->uport_dev; + + us_ctx = xa_load(res_xa, us_index); + if (!us_ctx) { + struct cxl_perf_ctx *n __free(kfree) = + kzalloc(sizeof(*n), GFP_KERNEL); + + if (!n) + return ERR_PTR(-ENOMEM); + + ptr = xa_store(res_xa, us_index, n, GFP_KERNEL); + if (xa_is_err(ptr)) + return ERR_PTR(xa_err(ptr)); + us_ctx = no_free_ptr(n); + us_ctx->port = parent_port; + } + + /* + * If the device isn't an upstream PCIe port, there's something + * wrong with the topology. + */ + if (!dev_is_pci(dev)) + return ERR_PTR(-EINVAL); + + /* Retrieve the upstream link bandwidth */ + rc = cxl_pci_get_bandwidth(to_pci_dev(dev), coords); + if (rc) + return ERR_PTR(-ENXIO); + + /* + * Take the min of downstream bandwidth and the upstream link + * bandwidth. 
+ */ + cxl_coordinates_combine(coords, coords, ctx->coord); + + /* + * Take the min of the calculated bandwidth and the upstream + * switch SSLBIS bandwidth if there's a parent switch + */ + if (!is_root) + cxl_coordinates_combine(coords, coords, dport->coord); + + /* + * Aggregate the calculated bandwidth common to an upstream + * switch. + */ + cxl_bandwidth_add(us_ctx->coord, us_ctx->coord, coords); + } + + /* Asymmetric topology detected. */ + if (gp_count) { + if (gp_count != dev_count) { + dev_dbg(&cxlr->dev, + "Asymmetric hierarchy detected, bandwidth not updated\n"); + return ERR_PTR(-EOPNOTSUPP); + } + *gp_is_root = true; + } + + return no_free_ptr(res_xa); +} + +/** + * cxl_rp_gather_bandwidth - handle the root port level bandwidth collection + * @xa: the xarray that holds the cxl_perf_ctx that has the bandwidth calculated + * below each root port device. + * + * Return: xarray that holds cxl_perf_ctx per host bridge or ERR_PTR(-errno) + */ +static struct xarray *cxl_rp_gather_bandwidth(struct xarray *xa) +{ + struct xarray *hb_xa __free(free_perf_xa) = + kzalloc(sizeof(*hb_xa), GFP_KERNEL); + struct cxl_perf_ctx *ctx; + unsigned long index; + + if (!hb_xa) + return ERR_PTR(-ENOMEM); + xa_init(hb_xa); + + xa_for_each(xa, index, ctx) { + struct cxl_port *port = ctx->port; + unsigned long hb_index = (unsigned long)port->uport_dev; + struct cxl_perf_ctx *hb_ctx; + void *ptr; + + hb_ctx = xa_load(hb_xa, hb_index); + if (!hb_ctx) { + struct cxl_perf_ctx *n __free(kfree) = + kzalloc(sizeof(*n), GFP_KERNEL); + + if (!n) + return ERR_PTR(-ENOMEM); + ptr = xa_store(hb_xa, hb_index, n, GFP_KERNEL); + if (xa_is_err(ptr)) + return ERR_PTR(xa_err(ptr)); + hb_ctx = no_free_ptr(n); + hb_ctx->port = port; + } + + cxl_bandwidth_add(hb_ctx->coord, hb_ctx->coord, ctx->coord); + } + + return no_free_ptr(hb_xa); +} + +/** + * cxl_hb_gather_bandwidth - handle the host bridge level bandwidth collection + * @xa: the xarray that holds the cxl_perf_ctx that has the bandwidth calculated + * below each host bridge. 
+ * + * Return: xarray that holds cxl_perf_ctx per ACPI0017 device or ERR_PTR(-errno) + */ +static struct xarray *cxl_hb_gather_bandwidth(struct xarray *xa) +{ + struct xarray *mw_xa __free(free_perf_xa) = + kzalloc(sizeof(*mw_xa), GFP_KERNEL); + struct cxl_perf_ctx *ctx; + unsigned long index; + + if (!mw_xa) + return ERR_PTR(-ENOMEM); + xa_init(mw_xa); + + xa_for_each(xa, index, ctx) { + struct cxl_port *port = ctx->port; + struct cxl_port *parent_port; + struct cxl_perf_ctx *mw_ctx; + struct cxl_dport *dport; + unsigned long mw_index; + void *ptr; + + parent_port = to_cxl_port(port->dev.parent); + mw_index = (unsigned long)parent_port->uport_dev; + + mw_ctx = xa_load(mw_xa, mw_index); + if (!mw_ctx) { + struct cxl_perf_ctx *n __free(kfree) = + kzalloc(sizeof(*n), GFP_KERNEL); + + if (!n) + return ERR_PTR(-ENOMEM); + ptr = xa_store(mw_xa, mw_index, n, GFP_KERNEL); + if (xa_is_err(ptr)) + return ERR_PTR(xa_err(ptr)); + mw_ctx = no_free_ptr(n); + } + + dport = port->parent_dport; + cxl_coordinates_combine(ctx->coord, ctx->coord, dport->coord); + cxl_bandwidth_add(mw_ctx->coord, mw_ctx->coord, ctx->coord); + } + + return no_free_ptr(mw_xa); +} + +/** + * cxl_region_update_bandwidth - Update the bandwidth access coordinates of a region + * @cxlr: The region being operated on + * @input_xa: xarray holds cxl_perf_ctx with calculated bandwidth per ACPI0017 instance + */ +static void cxl_region_update_bandwidth(struct cxl_region *cxlr, + struct xarray *input_xa) +{ + struct access_coordinate coord[ACCESS_COORDINATE_MAX]; + struct cxl_perf_ctx *ctx; + unsigned long index; + + memset(coord, 0, sizeof(coord)); + xa_for_each(input_xa, index, ctx) + cxl_bandwidth_add(coord, coord, ctx->coord); + + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) { + cxlr->coord[i].read_bandwidth = coord[i].read_bandwidth; + cxlr->coord[i].write_bandwidth = coord[i].write_bandwidth; + } +} + +/** + * cxl_region_shared_upstream_bandwidth_update - Recalculate the bandwidth for + * the region + * @cxlr: the cxl region to recalculate + * + * The function walks the topology from bottom up and calculates the bandwidth. It + * starts at the endpoints, processes at the switches if any, processes at the rootport + * level, at the host bridge level, and finally aggregates at the region. + */ +void cxl_region_shared_upstream_bandwidth_update(struct cxl_region *cxlr) +{ + struct xarray *working_xa; + int root_count = 0; + bool is_root; + int rc; + + lockdep_assert_held(&cxl_rwsem.dpa); + + struct xarray *usp_xa __free(free_perf_xa) = + kzalloc(sizeof(*usp_xa), GFP_KERNEL); + + if (!usp_xa) + return; + + xa_init(usp_xa); + + /* Collect bandwidth data from all the endpoints. */ + for (int i = 0; i < cxlr->params.nr_targets; i++) { + struct cxl_endpoint_decoder *cxled = cxlr->params.targets[i]; + + is_root = false; + rc = cxl_endpoint_gather_bandwidth(cxlr, cxled, usp_xa, &is_root); + if (rc) + return; + root_count += is_root; + } + + /* Detect asymmetric hierarchy with some direct attached endpoints. */ + if (root_count && root_count != cxlr->params.nr_targets) { + dev_dbg(&cxlr->dev, + "Asymmetric hierarchy detected, bandwidth not updated\n"); + return; + } + + /* + * Walk up one or more switches to deal with the bandwidth of the + * switches if they exist. Endpoints directly attached to RPs skip + * over this part. 
+ */ + if (!root_count) { + do { + working_xa = cxl_switch_gather_bandwidth(cxlr, usp_xa, + &is_root); + if (IS_ERR(working_xa)) + return; + free_perf_xa(usp_xa); + usp_xa = working_xa; + } while (!is_root); + } + + /* Handle the bandwidth at the root port of the hierarchy */ + working_xa = cxl_rp_gather_bandwidth(usp_xa); + if (IS_ERR(working_xa)) + return; + free_perf_xa(usp_xa); + usp_xa = working_xa; + + /* Handle the bandwidth at the host bridge of the hierarchy */ + working_xa = cxl_hb_gather_bandwidth(usp_xa); + if (IS_ERR(working_xa)) + return; + free_perf_xa(usp_xa); + usp_xa = working_xa; + + /* + * Aggregate all the bandwidth collected per CFMWS (ACPI0017) and + * update the region bandwidth with the final calculated values. + */ + cxl_region_update_bandwidth(cxlr, usp_xa); +} + +void cxl_region_perf_data_calculate(struct cxl_region *cxlr, + struct cxl_endpoint_decoder *cxled) +{ + struct cxl_dpa_perf *perf; + + lockdep_assert_held(&cxl_rwsem.dpa); + + perf = cxled_get_dpa_perf(cxled); + if (IS_ERR(perf)) + return; + + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) { + /* Get total bandwidth and the worst latency for the cxl region */ + cxlr->coord[i].read_latency = max_t(unsigned int, + cxlr->coord[i].read_latency, + perf->coord[i].read_latency); + cxlr->coord[i].write_latency = max_t(unsigned int, + cxlr->coord[i].write_latency, + perf->coord[i].write_latency); + cxlr->coord[i].read_bandwidth += perf->coord[i].read_bandwidth; + cxlr->coord[i].write_bandwidth += perf->coord[i].write_bandwidth; + } +} diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h new file mode 100644 index 000000000000..1fb66132b777 --- /dev/null +++ b/drivers/cxl/core/core.h @@ -0,0 +1,169 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright(c) 2020 Intel Corporation. 
*/ + +#ifndef __CXL_CORE_H__ +#define __CXL_CORE_H__ + +#include <cxl/mailbox.h> +#include <linux/rwsem.h> + +extern const struct device_type cxl_nvdimm_bridge_type; +extern const struct device_type cxl_nvdimm_type; +extern const struct device_type cxl_pmu_type; + +extern struct attribute_group cxl_base_attribute_group; + +enum cxl_detach_mode { + DETACH_ONLY, + DETACH_INVALIDATE, +}; + +#ifdef CONFIG_CXL_REGION +extern struct device_attribute dev_attr_create_pmem_region; +extern struct device_attribute dev_attr_create_ram_region; +extern struct device_attribute dev_attr_delete_region; +extern struct device_attribute dev_attr_region; +extern const struct device_type cxl_pmem_region_type; +extern const struct device_type cxl_dax_region_type; +extern const struct device_type cxl_region_type; + +int cxl_decoder_detach(struct cxl_region *cxlr, + struct cxl_endpoint_decoder *cxled, int pos, + enum cxl_detach_mode mode); + +#define CXL_REGION_ATTR(x) (&dev_attr_##x.attr) +#define CXL_REGION_TYPE(x) (&cxl_region_type) +#define SET_CXL_REGION_ATTR(x) (&dev_attr_##x.attr), +#define CXL_PMEM_REGION_TYPE(x) (&cxl_pmem_region_type) +#define CXL_DAX_REGION_TYPE(x) (&cxl_dax_region_type) +int cxl_region_init(void); +void cxl_region_exit(void); +int cxl_get_poison_by_endpoint(struct cxl_port *port); +struct cxl_region *cxl_dpa_to_region(const struct cxl_memdev *cxlmd, u64 dpa); +u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, + u64 dpa); + +#else +static inline u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, + const struct cxl_memdev *cxlmd, u64 dpa) +{ + return ULLONG_MAX; +} +static inline +struct cxl_region *cxl_dpa_to_region(const struct cxl_memdev *cxlmd, u64 dpa) +{ + return NULL; +} +static inline int cxl_get_poison_by_endpoint(struct cxl_port *port) +{ + return 0; +} +static inline int cxl_decoder_detach(struct cxl_region *cxlr, + struct cxl_endpoint_decoder *cxled, + int pos, enum cxl_detach_mode mode) +{ + return 0; +} +static inline int cxl_region_init(void) +{ + return 0; +} +static inline void cxl_region_exit(void) +{ +} +#define CXL_REGION_ATTR(x) NULL +#define CXL_REGION_TYPE(x) NULL +#define SET_CXL_REGION_ATTR(x) +#define CXL_PMEM_REGION_TYPE(x) NULL +#define CXL_DAX_REGION_TYPE(x) NULL +#endif + +struct cxl_send_command; +struct cxl_mem_query_commands; +int cxl_query_cmd(struct cxl_mailbox *cxl_mbox, + struct cxl_mem_query_commands __user *q); +int cxl_send_cmd(struct cxl_mailbox *cxl_mbox, struct cxl_send_command __user *s); +void __iomem *devm_cxl_iomap_block(struct device *dev, resource_size_t addr, + resource_size_t length); + +struct dentry *cxl_debugfs_create_dir(const char *dir); +int cxl_dpa_set_part(struct cxl_endpoint_decoder *cxled, + enum cxl_partition_mode mode); +int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, u64 size); +int cxl_dpa_free(struct cxl_endpoint_decoder *cxled); +resource_size_t cxl_dpa_size(struct cxl_endpoint_decoder *cxled); +resource_size_t cxl_dpa_resource_start(struct cxl_endpoint_decoder *cxled); +bool cxl_resource_contains_addr(const struct resource *res, const resource_size_t addr); + +enum cxl_rcrb { + CXL_RCRB_DOWNSTREAM, + CXL_RCRB_UPSTREAM, +}; +struct cxl_rcrb_info; +resource_size_t __rcrb_to_component(struct device *dev, + struct cxl_rcrb_info *ri, + enum cxl_rcrb which); +u16 cxl_rcrb_to_aer(struct device *dev, resource_size_t rcrb); + +#define PCI_RCRB_CAP_LIST_ID_MASK GENMASK(7, 0) +#define PCI_RCRB_CAP_HDR_ID_MASK GENMASK(7, 0) +#define PCI_RCRB_CAP_HDR_NEXT_MASK GENMASK(15, 8) +#define PCI_CAP_EXP_SIZEOF 0x3c + 
+struct cxl_rwsem { + /* + * All changes to HPA (interleave configuration) occur with this + * lock held for write. + */ + struct rw_semaphore region; + /* + * All changes to a device DPA space occur with this lock held + * for write. + */ + struct rw_semaphore dpa; +}; + +extern struct cxl_rwsem cxl_rwsem; + +int cxl_memdev_init(void); +void cxl_memdev_exit(void); +void cxl_mbox_init(void); + +enum cxl_poison_trace_type { + CXL_POISON_TRACE_LIST, + CXL_POISON_TRACE_INJECT, + CXL_POISON_TRACE_CLEAR, +}; + +enum poison_cmd_enabled_bits; +bool cxl_memdev_has_poison_cmd(struct cxl_memdev *cxlmd, + enum poison_cmd_enabled_bits cmd); + +long cxl_pci_get_latency(struct pci_dev *pdev); +int cxl_pci_get_bandwidth(struct pci_dev *pdev, struct access_coordinate *c); +int cxl_port_get_switch_dport_bandwidth(struct cxl_port *port, + struct access_coordinate *c); + +int cxl_ras_init(void); +void cxl_ras_exit(void); +int cxl_gpf_port_setup(struct cxl_dport *dport); + +struct cxl_hdm; +int cxl_hdm_decode_init(struct cxl_dev_state *cxlds, struct cxl_hdm *cxlhdm, + struct cxl_endpoint_dvsec_info *info); +int cxl_port_get_possible_dports(struct cxl_port *port); + +#ifdef CONFIG_CXL_FEATURES +struct cxl_feat_entry * +cxl_feature_info(struct cxl_features_state *cxlfs, const uuid_t *uuid); +size_t cxl_get_feature(struct cxl_mailbox *cxl_mbox, const uuid_t *feat_uuid, + enum cxl_get_feat_selection selection, + void *feat_out, size_t feat_out_size, u16 offset, + u16 *return_code); +int cxl_set_feature(struct cxl_mailbox *cxl_mbox, const uuid_t *feat_uuid, + u8 feat_version, const void *feat_data, + size_t feat_data_size, u32 feat_flag, u16 offset, + u16 *return_code); +#endif + +#endif /* __CXL_CORE_H__ */ diff --git a/drivers/cxl/core/edac.c b/drivers/cxl/core/edac.c new file mode 100644 index 000000000000..79994ca9bc9f --- /dev/null +++ b/drivers/cxl/core/edac.c @@ -0,0 +1,2109 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * CXL EDAC memory feature driver. + * + * Copyright (c) 2024-2025 HiSilicon Limited. + * + * - Supports functions to configure EDAC features of the + * CXL memory devices. + * - Registers with the EDAC device subsystem driver to expose + * the features sysfs attributes to the user for configuring + * CXL memory RAS feature. + */ + +#include <linux/cleanup.h> +#include <linux/edac.h> +#include <linux/limits.h> +#include <linux/unaligned.h> +#include <linux/xarray.h> +#include <cxl/features.h> +#include <cxl.h> +#include <cxlmem.h> +#include "core.h" +#include "trace.h" + +#define CXL_NR_EDAC_DEV_FEATURES 7 + +#define CXL_SCRUB_NO_REGION -1 + +struct cxl_patrol_scrub_context { + u8 instance; + u16 get_feat_size; + u16 set_feat_size; + u8 get_version; + u8 set_version; + u16 effects; + struct cxl_memdev *cxlmd; + struct cxl_region *cxlr; +}; + +/* + * See CXL spec rev 3.2 @8.2.10.9.11.1 Table 8-222 Device Patrol Scrub Control + * Feature Readable Attributes. + */ +struct cxl_scrub_rd_attrbs { + u8 scrub_cycle_cap; + __le16 scrub_cycle_hours; + u8 scrub_flags; +} __packed; + +/* + * See CXL spec rev 3.2 @8.2.10.9.11.1 Table 8-223 Device Patrol Scrub Control + * Feature Writable Attributes. 
+ */ +struct cxl_scrub_wr_attrbs { + u8 scrub_cycle_hours; + u8 scrub_flags; +} __packed; + +#define CXL_SCRUB_CONTROL_CHANGEABLE BIT(0) +#define CXL_SCRUB_CONTROL_REALTIME BIT(1) +#define CXL_SCRUB_CONTROL_CYCLE_MASK GENMASK(7, 0) +#define CXL_SCRUB_CONTROL_MIN_CYCLE_MASK GENMASK(15, 8) +#define CXL_SCRUB_CONTROL_ENABLE BIT(0) + +#define CXL_GET_SCRUB_CYCLE_CHANGEABLE(cap) \ + FIELD_GET(CXL_SCRUB_CONTROL_CHANGEABLE, cap) +#define CXL_GET_SCRUB_CYCLE(cycle) \ + FIELD_GET(CXL_SCRUB_CONTROL_CYCLE_MASK, cycle) +#define CXL_GET_SCRUB_MIN_CYCLE(cycle) \ + FIELD_GET(CXL_SCRUB_CONTROL_MIN_CYCLE_MASK, cycle) +#define CXL_GET_SCRUB_EN_STS(flags) FIELD_GET(CXL_SCRUB_CONTROL_ENABLE, flags) + +#define CXL_SET_SCRUB_CYCLE(cycle) \ + FIELD_PREP(CXL_SCRUB_CONTROL_CYCLE_MASK, cycle) +#define CXL_SET_SCRUB_EN(en) FIELD_PREP(CXL_SCRUB_CONTROL_ENABLE, en) + +static int cxl_mem_scrub_get_attrbs(struct cxl_mailbox *cxl_mbox, u8 *cap, + u16 *cycle, u8 *flags, u8 *min_cycle) +{ + size_t rd_data_size = sizeof(struct cxl_scrub_rd_attrbs); + size_t data_size; + struct cxl_scrub_rd_attrbs *rd_attrbs __free(kfree) = + kzalloc(rd_data_size, GFP_KERNEL); + if (!rd_attrbs) + return -ENOMEM; + + data_size = cxl_get_feature(cxl_mbox, &CXL_FEAT_PATROL_SCRUB_UUID, + CXL_GET_FEAT_SEL_CURRENT_VALUE, rd_attrbs, + rd_data_size, 0, NULL); + if (!data_size) + return -EIO; + + *cap = rd_attrbs->scrub_cycle_cap; + *cycle = le16_to_cpu(rd_attrbs->scrub_cycle_hours); + *flags = rd_attrbs->scrub_flags; + if (min_cycle) + *min_cycle = CXL_GET_SCRUB_MIN_CYCLE(*cycle); + + return 0; +} + +static int cxl_scrub_get_attrbs(struct cxl_patrol_scrub_context *cxl_ps_ctx, + u8 *cap, u16 *cycle, u8 *flags, u8 *min_cycle) +{ + struct cxl_mailbox *cxl_mbox; + struct cxl_region_params *p; + struct cxl_memdev *cxlmd; + struct cxl_region *cxlr; + u8 min_scrub_cycle = 0; + int i, ret; + + if (!cxl_ps_ctx->cxlr) { + cxl_mbox = &cxl_ps_ctx->cxlmd->cxlds->cxl_mbox; + return cxl_mem_scrub_get_attrbs(cxl_mbox, cap, cycle, + flags, min_cycle); + } + + ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region); + if ((ret = ACQUIRE_ERR(rwsem_read_intr, &rwsem))) + return ret; + + cxlr = cxl_ps_ctx->cxlr; + p = &cxlr->params; + + for (i = 0; i < p->nr_targets; i++) { + struct cxl_endpoint_decoder *cxled = p->targets[i]; + + cxlmd = cxled_to_memdev(cxled); + cxl_mbox = &cxlmd->cxlds->cxl_mbox; + ret = cxl_mem_scrub_get_attrbs(cxl_mbox, cap, cycle, flags, + min_cycle); + if (ret) + return ret; + + /* + * The min_scrub_cycle of a region is the max of minimum scrub + * cycles supported by memdevs that back the region. 
+ */ + if (min_cycle) + min_scrub_cycle = max(*min_cycle, min_scrub_cycle); + } + + if (min_cycle) + *min_cycle = min_scrub_cycle; + + return 0; +} + +static int cxl_scrub_set_attrbs_region(struct device *dev, + struct cxl_patrol_scrub_context *cxl_ps_ctx, + u8 cycle, u8 flags) +{ + struct cxl_scrub_wr_attrbs wr_attrbs; + struct cxl_mailbox *cxl_mbox; + struct cxl_region_params *p; + struct cxl_memdev *cxlmd; + struct cxl_region *cxlr; + int ret, i; + + ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region); + if ((ret = ACQUIRE_ERR(rwsem_read_intr, &rwsem))) + return ret; + + cxlr = cxl_ps_ctx->cxlr; + p = &cxlr->params; + wr_attrbs.scrub_cycle_hours = cycle; + wr_attrbs.scrub_flags = flags; + + for (i = 0; i < p->nr_targets; i++) { + struct cxl_endpoint_decoder *cxled = p->targets[i]; + + cxlmd = cxled_to_memdev(cxled); + cxl_mbox = &cxlmd->cxlds->cxl_mbox; + ret = cxl_set_feature(cxl_mbox, &CXL_FEAT_PATROL_SCRUB_UUID, + cxl_ps_ctx->set_version, &wr_attrbs, + sizeof(wr_attrbs), + CXL_SET_FEAT_FLAG_DATA_SAVED_ACROSS_RESET, + 0, NULL); + if (ret) + return ret; + + if (cycle != cxlmd->scrub_cycle) { + if (cxlmd->scrub_region_id != CXL_SCRUB_NO_REGION) + dev_info(dev, + "Device scrub rate(%d hours) set by region%d rate overwritten by region%d scrub rate(%d hours)\n", + cxlmd->scrub_cycle, + cxlmd->scrub_region_id, cxlr->id, + cycle); + + cxlmd->scrub_cycle = cycle; + cxlmd->scrub_region_id = cxlr->id; + } + } + + return 0; +} + +static int cxl_scrub_set_attrbs_device(struct device *dev, + struct cxl_patrol_scrub_context *cxl_ps_ctx, + u8 cycle, u8 flags) +{ + struct cxl_scrub_wr_attrbs wr_attrbs; + struct cxl_mailbox *cxl_mbox; + struct cxl_memdev *cxlmd; + int ret; + + wr_attrbs.scrub_cycle_hours = cycle; + wr_attrbs.scrub_flags = flags; + + cxlmd = cxl_ps_ctx->cxlmd; + cxl_mbox = &cxlmd->cxlds->cxl_mbox; + ret = cxl_set_feature(cxl_mbox, &CXL_FEAT_PATROL_SCRUB_UUID, + cxl_ps_ctx->set_version, &wr_attrbs, + sizeof(wr_attrbs), + CXL_SET_FEAT_FLAG_DATA_SAVED_ACROSS_RESET, 0, + NULL); + if (ret) + return ret; + + if (cycle != cxlmd->scrub_cycle) { + if (cxlmd->scrub_region_id != CXL_SCRUB_NO_REGION) + dev_info(dev, + "Device scrub rate(%d hours) set by region%d rate overwritten with device local scrub rate(%d hours)\n", + cxlmd->scrub_cycle, cxlmd->scrub_region_id, + cycle); + + cxlmd->scrub_cycle = cycle; + cxlmd->scrub_region_id = CXL_SCRUB_NO_REGION; + } + + return 0; +} + +static int cxl_scrub_set_attrbs(struct device *dev, + struct cxl_patrol_scrub_context *cxl_ps_ctx, + u8 cycle, u8 flags) +{ + if (cxl_ps_ctx->cxlr) + return cxl_scrub_set_attrbs_region(dev, cxl_ps_ctx, cycle, flags); + + return cxl_scrub_set_attrbs_device(dev, cxl_ps_ctx, cycle, flags); +} + +static int cxl_patrol_scrub_get_enabled_bg(struct device *dev, void *drv_data, + bool *enabled) +{ + struct cxl_patrol_scrub_context *ctx = drv_data; + u8 cap, flags; + u16 cycle; + int ret; + + ret = cxl_scrub_get_attrbs(ctx, &cap, &cycle, &flags, NULL); + if (ret) + return ret; + + *enabled = CXL_GET_SCRUB_EN_STS(flags); + + return 0; +} + +static int cxl_patrol_scrub_set_enabled_bg(struct device *dev, void *drv_data, + bool enable) +{ + struct cxl_patrol_scrub_context *ctx = drv_data; + u8 cap, flags, wr_cycle; + u16 rd_cycle; + int ret; + + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + + ret = cxl_scrub_get_attrbs(ctx, &cap, &rd_cycle, &flags, NULL); + if (ret) + return ret; + + wr_cycle = CXL_GET_SCRUB_CYCLE(rd_cycle); + flags = CXL_SET_SCRUB_EN(enable); + + return cxl_scrub_set_attrbs(dev, ctx, wr_cycle, flags); +} + +static int 
cxl_patrol_scrub_get_min_scrub_cycle(struct device *dev, + void *drv_data, u32 *min) +{ + struct cxl_patrol_scrub_context *ctx = drv_data; + u8 cap, flags, min_cycle; + u16 cycle; + int ret; + + ret = cxl_scrub_get_attrbs(ctx, &cap, &cycle, &flags, &min_cycle); + if (ret) + return ret; + + *min = min_cycle * 3600; + + return 0; +} + +static int cxl_patrol_scrub_get_max_scrub_cycle(struct device *dev, + void *drv_data, u32 *max) +{ + *max = U8_MAX * 3600; /* Max set by register size */ + + return 0; +} + +static int cxl_patrol_scrub_get_scrub_cycle(struct device *dev, void *drv_data, + u32 *scrub_cycle_secs) +{ + struct cxl_patrol_scrub_context *ctx = drv_data; + u8 cap, flags; + u16 cycle; + int ret; + + ret = cxl_scrub_get_attrbs(ctx, &cap, &cycle, &flags, NULL); + if (ret) + return ret; + + *scrub_cycle_secs = CXL_GET_SCRUB_CYCLE(cycle) * 3600; + + return 0; +} + +static int cxl_patrol_scrub_set_scrub_cycle(struct device *dev, void *drv_data, + u32 scrub_cycle_secs) +{ + struct cxl_patrol_scrub_context *ctx = drv_data; + u8 scrub_cycle_hours = scrub_cycle_secs / 3600; + u8 cap, wr_cycle, flags, min_cycle; + u16 rd_cycle; + int ret; + + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + + ret = cxl_scrub_get_attrbs(ctx, &cap, &rd_cycle, &flags, &min_cycle); + if (ret) + return ret; + + if (!CXL_GET_SCRUB_CYCLE_CHANGEABLE(cap)) + return -EOPNOTSUPP; + + if (scrub_cycle_hours < min_cycle) { + dev_dbg(dev, "Invalid CXL patrol scrub cycle(%d) to set\n", + scrub_cycle_hours); + dev_dbg(dev, + "Minimum supported CXL patrol scrub cycle in hour %d\n", + min_cycle); + return -EINVAL; + } + wr_cycle = CXL_SET_SCRUB_CYCLE(scrub_cycle_hours); + + return cxl_scrub_set_attrbs(dev, ctx, wr_cycle, flags); +} + +static const struct edac_scrub_ops cxl_ps_scrub_ops = { + .get_enabled_bg = cxl_patrol_scrub_get_enabled_bg, + .set_enabled_bg = cxl_patrol_scrub_set_enabled_bg, + .get_min_cycle = cxl_patrol_scrub_get_min_scrub_cycle, + .get_max_cycle = cxl_patrol_scrub_get_max_scrub_cycle, + .get_cycle_duration = cxl_patrol_scrub_get_scrub_cycle, + .set_cycle_duration = cxl_patrol_scrub_set_scrub_cycle, +}; + +static int cxl_memdev_scrub_init(struct cxl_memdev *cxlmd, + struct edac_dev_feature *ras_feature, + u8 scrub_inst) +{ + struct cxl_patrol_scrub_context *cxl_ps_ctx; + struct cxl_feat_entry *feat_entry; + u8 cap, flags; + u16 cycle; + int rc; + + feat_entry = cxl_feature_info(to_cxlfs(cxlmd->cxlds), + &CXL_FEAT_PATROL_SCRUB_UUID); + if (IS_ERR(feat_entry)) + return -EOPNOTSUPP; + + if (!(le32_to_cpu(feat_entry->flags) & CXL_FEATURE_F_CHANGEABLE)) + return -EOPNOTSUPP; + + cxl_ps_ctx = devm_kzalloc(&cxlmd->dev, sizeof(*cxl_ps_ctx), GFP_KERNEL); + if (!cxl_ps_ctx) + return -ENOMEM; + + *cxl_ps_ctx = (struct cxl_patrol_scrub_context){ + .get_feat_size = le16_to_cpu(feat_entry->get_feat_size), + .set_feat_size = le16_to_cpu(feat_entry->set_feat_size), + .get_version = feat_entry->get_feat_ver, + .set_version = feat_entry->set_feat_ver, + .effects = le16_to_cpu(feat_entry->effects), + .instance = scrub_inst, + .cxlmd = cxlmd, + }; + + rc = cxl_mem_scrub_get_attrbs(&cxlmd->cxlds->cxl_mbox, &cap, &cycle, + &flags, NULL); + if (rc) + return rc; + + cxlmd->scrub_cycle = CXL_GET_SCRUB_CYCLE(cycle); + cxlmd->scrub_region_id = CXL_SCRUB_NO_REGION; + + ras_feature->ft_type = RAS_FEAT_SCRUB; + ras_feature->instance = cxl_ps_ctx->instance; + ras_feature->scrub_ops = &cxl_ps_scrub_ops; + ras_feature->ctx = cxl_ps_ctx; + + return 0; +} + +static int cxl_region_scrub_init(struct cxl_region *cxlr, + struct edac_dev_feature 
*ras_feature, + u8 scrub_inst) +{ + struct cxl_patrol_scrub_context *cxl_ps_ctx; + struct cxl_region_params *p = &cxlr->params; + struct cxl_feat_entry *feat_entry = NULL; + struct cxl_memdev *cxlmd; + u8 cap, flags; + u16 cycle; + int i, rc; + + /* + * The cxl_region_rwsem must be held if the code below is used in a context + * other than when the region is in the probe state, as shown here. + */ + for (i = 0; i < p->nr_targets; i++) { + struct cxl_endpoint_decoder *cxled = p->targets[i]; + + cxlmd = cxled_to_memdev(cxled); + feat_entry = cxl_feature_info(to_cxlfs(cxlmd->cxlds), + &CXL_FEAT_PATROL_SCRUB_UUID); + if (IS_ERR(feat_entry)) + return -EOPNOTSUPP; + + if (!(le32_to_cpu(feat_entry->flags) & + CXL_FEATURE_F_CHANGEABLE)) + return -EOPNOTSUPP; + + rc = cxl_mem_scrub_get_attrbs(&cxlmd->cxlds->cxl_mbox, &cap, + &cycle, &flags, NULL); + if (rc) + return rc; + + cxlmd->scrub_cycle = CXL_GET_SCRUB_CYCLE(cycle); + cxlmd->scrub_region_id = CXL_SCRUB_NO_REGION; + } + + cxl_ps_ctx = devm_kzalloc(&cxlr->dev, sizeof(*cxl_ps_ctx), GFP_KERNEL); + if (!cxl_ps_ctx) + return -ENOMEM; + + *cxl_ps_ctx = (struct cxl_patrol_scrub_context){ + .get_feat_size = le16_to_cpu(feat_entry->get_feat_size), + .set_feat_size = le16_to_cpu(feat_entry->set_feat_size), + .get_version = feat_entry->get_feat_ver, + .set_version = feat_entry->set_feat_ver, + .effects = le16_to_cpu(feat_entry->effects), + .instance = scrub_inst, + .cxlr = cxlr, + }; + + ras_feature->ft_type = RAS_FEAT_SCRUB; + ras_feature->instance = cxl_ps_ctx->instance; + ras_feature->scrub_ops = &cxl_ps_scrub_ops; + ras_feature->ctx = cxl_ps_ctx; + + return 0; +} + +struct cxl_ecs_context { + u16 num_media_frus; + u16 get_feat_size; + u16 set_feat_size; + u8 get_version; + u8 set_version; + u16 effects; + struct cxl_memdev *cxlmd; +}; + +/* + * See CXL spec rev 3.2 @8.2.10.9.11.2 Table 8-225 DDR5 ECS Control Feature + * Readable Attributes. + */ +struct cxl_ecs_fru_rd_attrbs { + u8 ecs_cap; + __le16 ecs_config; + u8 ecs_flags; +} __packed; + +struct cxl_ecs_rd_attrbs { + u8 ecs_log_cap; + struct cxl_ecs_fru_rd_attrbs fru_attrbs[]; +} __packed; + +/* + * See CXL spec rev 3.2 @8.2.10.9.11.2 Table 8-226 DDR5 ECS Control Feature + * Writable Attributes. 
+ */ +struct cxl_ecs_fru_wr_attrbs { + __le16 ecs_config; +} __packed; + +struct cxl_ecs_wr_attrbs { + u8 ecs_log_cap; + struct cxl_ecs_fru_wr_attrbs fru_attrbs[]; +} __packed; + +#define CXL_ECS_LOG_ENTRY_TYPE_MASK GENMASK(1, 0) +#define CXL_ECS_REALTIME_REPORT_CAP_MASK BIT(0) +#define CXL_ECS_THRESHOLD_COUNT_MASK GENMASK(2, 0) +#define CXL_ECS_COUNT_MODE_MASK BIT(3) +#define CXL_ECS_RESET_COUNTER_MASK BIT(4) +#define CXL_ECS_RESET_COUNTER 1 + +enum { + ECS_THRESHOLD_256 = 256, + ECS_THRESHOLD_1024 = 1024, + ECS_THRESHOLD_4096 = 4096, +}; + +enum { + ECS_THRESHOLD_IDX_256 = 3, + ECS_THRESHOLD_IDX_1024 = 4, + ECS_THRESHOLD_IDX_4096 = 5, +}; + +static const u16 ecs_supp_threshold[] = { + [ECS_THRESHOLD_IDX_256] = 256, + [ECS_THRESHOLD_IDX_1024] = 1024, + [ECS_THRESHOLD_IDX_4096] = 4096, +}; + +enum { + ECS_LOG_ENTRY_TYPE_DRAM = 0x0, + ECS_LOG_ENTRY_TYPE_MEM_MEDIA_FRU = 0x1, +}; + +enum cxl_ecs_count_mode { + ECS_MODE_COUNTS_ROWS = 0, + ECS_MODE_COUNTS_CODEWORDS = 1, +}; + +static int cxl_mem_ecs_get_attrbs(struct device *dev, + struct cxl_ecs_context *cxl_ecs_ctx, + int fru_id, u8 *log_cap, u16 *config) +{ + struct cxl_memdev *cxlmd = cxl_ecs_ctx->cxlmd; + struct cxl_mailbox *cxl_mbox = &cxlmd->cxlds->cxl_mbox; + struct cxl_ecs_fru_rd_attrbs *fru_rd_attrbs; + size_t rd_data_size; + size_t data_size; + + rd_data_size = cxl_ecs_ctx->get_feat_size; + + struct cxl_ecs_rd_attrbs *rd_attrbs __free(kvfree) = + kvzalloc(rd_data_size, GFP_KERNEL); + if (!rd_attrbs) + return -ENOMEM; + + data_size = cxl_get_feature(cxl_mbox, &CXL_FEAT_ECS_UUID, + CXL_GET_FEAT_SEL_CURRENT_VALUE, rd_attrbs, + rd_data_size, 0, NULL); + if (!data_size) + return -EIO; + + fru_rd_attrbs = rd_attrbs->fru_attrbs; + *log_cap = rd_attrbs->ecs_log_cap; + *config = le16_to_cpu(fru_rd_attrbs[fru_id].ecs_config); + + return 0; +} + +static int cxl_mem_ecs_set_attrbs(struct device *dev, + struct cxl_ecs_context *cxl_ecs_ctx, + int fru_id, u8 log_cap, u16 config) +{ + struct cxl_memdev *cxlmd = cxl_ecs_ctx->cxlmd; + struct cxl_mailbox *cxl_mbox = &cxlmd->cxlds->cxl_mbox; + struct cxl_ecs_fru_rd_attrbs *fru_rd_attrbs; + struct cxl_ecs_fru_wr_attrbs *fru_wr_attrbs; + size_t rd_data_size, wr_data_size; + u16 num_media_frus, count; + size_t data_size; + + num_media_frus = cxl_ecs_ctx->num_media_frus; + rd_data_size = cxl_ecs_ctx->get_feat_size; + wr_data_size = cxl_ecs_ctx->set_feat_size; + struct cxl_ecs_rd_attrbs *rd_attrbs __free(kvfree) = + kvzalloc(rd_data_size, GFP_KERNEL); + if (!rd_attrbs) + return -ENOMEM; + + data_size = cxl_get_feature(cxl_mbox, &CXL_FEAT_ECS_UUID, + CXL_GET_FEAT_SEL_CURRENT_VALUE, rd_attrbs, + rd_data_size, 0, NULL); + if (!data_size) + return -EIO; + + struct cxl_ecs_wr_attrbs *wr_attrbs __free(kvfree) = + kvzalloc(wr_data_size, GFP_KERNEL); + if (!wr_attrbs) + return -ENOMEM; + + /* + * Fill writable attributes from the current attributes read + * for all the media FRUs. 
+ */ + fru_rd_attrbs = rd_attrbs->fru_attrbs; + fru_wr_attrbs = wr_attrbs->fru_attrbs; + wr_attrbs->ecs_log_cap = log_cap; + for (count = 0; count < num_media_frus; count++) + fru_wr_attrbs[count].ecs_config = + fru_rd_attrbs[count].ecs_config; + + fru_wr_attrbs[fru_id].ecs_config = cpu_to_le16(config); + + return cxl_set_feature(cxl_mbox, &CXL_FEAT_ECS_UUID, + cxl_ecs_ctx->set_version, wr_attrbs, + wr_data_size, + CXL_SET_FEAT_FLAG_DATA_SAVED_ACROSS_RESET, + 0, NULL); +} + +static u8 cxl_get_ecs_log_entry_type(u8 log_cap, u16 config) +{ + return FIELD_GET(CXL_ECS_LOG_ENTRY_TYPE_MASK, log_cap); +} + +static u16 cxl_get_ecs_threshold(u8 log_cap, u16 config) +{ + u8 index = FIELD_GET(CXL_ECS_THRESHOLD_COUNT_MASK, config); + + return ecs_supp_threshold[index]; +} + +static u8 cxl_get_ecs_count_mode(u8 log_cap, u16 config) +{ + return FIELD_GET(CXL_ECS_COUNT_MODE_MASK, config); +} + +#define CXL_ECS_GET_ATTR(attrb) \ + static int cxl_ecs_get_##attrb(struct device *dev, void *drv_data, \ + int fru_id, u32 *val) \ + { \ + struct cxl_ecs_context *ctx = drv_data; \ + u8 log_cap; \ + u16 config; \ + int ret; \ + \ + ret = cxl_mem_ecs_get_attrbs(dev, ctx, fru_id, &log_cap, \ + &config); \ + if (ret) \ + return ret; \ + \ + *val = cxl_get_ecs_##attrb(log_cap, config); \ + \ + return 0; \ + } + +CXL_ECS_GET_ATTR(log_entry_type) +CXL_ECS_GET_ATTR(count_mode) +CXL_ECS_GET_ATTR(threshold) + +static int cxl_set_ecs_log_entry_type(struct device *dev, u8 *log_cap, + u16 *config, u32 val) +{ + if (val != ECS_LOG_ENTRY_TYPE_DRAM && + val != ECS_LOG_ENTRY_TYPE_MEM_MEDIA_FRU) + return -EINVAL; + + *log_cap = FIELD_PREP(CXL_ECS_LOG_ENTRY_TYPE_MASK, val); + + return 0; +} + +static int cxl_set_ecs_threshold(struct device *dev, u8 *log_cap, u16 *config, + u32 val) +{ + *config &= ~CXL_ECS_THRESHOLD_COUNT_MASK; + + switch (val) { + case ECS_THRESHOLD_256: + *config |= FIELD_PREP(CXL_ECS_THRESHOLD_COUNT_MASK, + ECS_THRESHOLD_IDX_256); + break; + case ECS_THRESHOLD_1024: + *config |= FIELD_PREP(CXL_ECS_THRESHOLD_COUNT_MASK, + ECS_THRESHOLD_IDX_1024); + break; + case ECS_THRESHOLD_4096: + *config |= FIELD_PREP(CXL_ECS_THRESHOLD_COUNT_MASK, + ECS_THRESHOLD_IDX_4096); + break; + default: + dev_dbg(dev, "Invalid CXL ECS threshold count(%u) to set\n", + val); + dev_dbg(dev, "Supported ECS threshold counts: %u, %u, %u\n", + ECS_THRESHOLD_256, ECS_THRESHOLD_1024, + ECS_THRESHOLD_4096); + return -EINVAL; + } + + return 0; +} + +static int cxl_set_ecs_count_mode(struct device *dev, u8 *log_cap, u16 *config, + u32 val) +{ + if (val != ECS_MODE_COUNTS_ROWS && val != ECS_MODE_COUNTS_CODEWORDS) { + dev_dbg(dev, "Invalid CXL ECS scrub mode(%d) to set\n", val); + dev_dbg(dev, + "Supported ECS Modes: 0: ECS counts rows with errors," + " 1: ECS counts codewords with errors\n"); + return -EINVAL; + } + + *config &= ~CXL_ECS_COUNT_MODE_MASK; + *config |= FIELD_PREP(CXL_ECS_COUNT_MODE_MASK, val); + + return 0; +} + +static int cxl_set_ecs_reset_counter(struct device *dev, u8 *log_cap, + u16 *config, u32 val) +{ + if (val != CXL_ECS_RESET_COUNTER) + return -EINVAL; + + *config &= ~CXL_ECS_RESET_COUNTER_MASK; + *config |= FIELD_PREP(CXL_ECS_RESET_COUNTER_MASK, val); + + return 0; +} + +#define CXL_ECS_SET_ATTR(attrb) \ + static int cxl_ecs_set_##attrb(struct device *dev, void *drv_data, \ + int fru_id, u32 val) \ + { \ + struct cxl_ecs_context *ctx = drv_data; \ + u8 log_cap; \ + u16 config; \ + int ret; \ + \ + if (!capable(CAP_SYS_RAWIO)) \ + return -EPERM; \ + \ + ret = cxl_mem_ecs_get_attrbs(dev, ctx, fru_id, &log_cap, \ + &config); \ 
+ if (ret) \ + return ret; \ + \ + ret = cxl_set_ecs_##attrb(dev, &log_cap, &config, val); \ + if (ret) \ + return ret; \ + \ + return cxl_mem_ecs_set_attrbs(dev, ctx, fru_id, log_cap, \ + config); \ + } +CXL_ECS_SET_ATTR(log_entry_type) +CXL_ECS_SET_ATTR(count_mode) +CXL_ECS_SET_ATTR(reset_counter) +CXL_ECS_SET_ATTR(threshold) + +static const struct edac_ecs_ops cxl_ecs_ops = { + .get_log_entry_type = cxl_ecs_get_log_entry_type, + .set_log_entry_type = cxl_ecs_set_log_entry_type, + .get_mode = cxl_ecs_get_count_mode, + .set_mode = cxl_ecs_set_count_mode, + .reset = cxl_ecs_set_reset_counter, + .get_threshold = cxl_ecs_get_threshold, + .set_threshold = cxl_ecs_set_threshold, +}; + +static int cxl_memdev_ecs_init(struct cxl_memdev *cxlmd, + struct edac_dev_feature *ras_feature) +{ + struct cxl_ecs_context *cxl_ecs_ctx; + struct cxl_feat_entry *feat_entry; + int num_media_frus; + + feat_entry = + cxl_feature_info(to_cxlfs(cxlmd->cxlds), &CXL_FEAT_ECS_UUID); + if (IS_ERR(feat_entry)) + return -EOPNOTSUPP; + + if (!(le32_to_cpu(feat_entry->flags) & CXL_FEATURE_F_CHANGEABLE)) + return -EOPNOTSUPP; + + num_media_frus = (le16_to_cpu(feat_entry->get_feat_size) - + sizeof(struct cxl_ecs_rd_attrbs)) / + sizeof(struct cxl_ecs_fru_rd_attrbs); + if (!num_media_frus) + return -EOPNOTSUPP; + + cxl_ecs_ctx = + devm_kzalloc(&cxlmd->dev, sizeof(*cxl_ecs_ctx), GFP_KERNEL); + if (!cxl_ecs_ctx) + return -ENOMEM; + + *cxl_ecs_ctx = (struct cxl_ecs_context){ + .get_feat_size = le16_to_cpu(feat_entry->get_feat_size), + .set_feat_size = le16_to_cpu(feat_entry->set_feat_size), + .get_version = feat_entry->get_feat_ver, + .set_version = feat_entry->set_feat_ver, + .effects = le16_to_cpu(feat_entry->effects), + .num_media_frus = num_media_frus, + .cxlmd = cxlmd, + }; + + ras_feature->ft_type = RAS_FEAT_ECS; + ras_feature->ecs_ops = &cxl_ecs_ops; + ras_feature->ctx = cxl_ecs_ctx; + ras_feature->ecs_info.num_media_frus = num_media_frus; + + return 0; +} + +/* + * Perform Maintenance CXL 3.2 Spec 8.2.10.7.1 + */ + +/* + * Perform Maintenance input payload + * CXL rev 3.2 section 8.2.10.7.1 Table 8-117 + */ +struct cxl_mbox_maintenance_hdr { + u8 op_class; + u8 op_subclass; +} __packed; + +static int cxl_perform_maintenance(struct cxl_mailbox *cxl_mbox, u8 class, + u8 subclass, void *data_in, + size_t data_in_size) +{ + struct cxl_memdev_maintenance_pi { + struct cxl_mbox_maintenance_hdr hdr; + u8 data[]; + } __packed; + struct cxl_mbox_cmd mbox_cmd; + size_t hdr_size; + + struct cxl_memdev_maintenance_pi *pi __free(kvfree) = + kvzalloc(cxl_mbox->payload_size, GFP_KERNEL); + if (!pi) + return -ENOMEM; + + pi->hdr.op_class = class; + pi->hdr.op_subclass = subclass; + hdr_size = sizeof(pi->hdr); + /* + * Check minimum mbox payload size is available for + * the maintenance data transfer. + */ + if (hdr_size + data_in_size > cxl_mbox->payload_size) + return -ENOMEM; + + memcpy(pi->data, data_in, data_in_size); + mbox_cmd = (struct cxl_mbox_cmd){ + .opcode = CXL_MBOX_OP_DO_MAINTENANCE, + .size_in = hdr_size + data_in_size, + .payload_in = pi, + }; + + return cxl_internal_send_cmd(cxl_mbox, &mbox_cmd); +} + +/* + * Support for finding a memory operation attributes + * are from the current boot or not. 
+ */ + +struct cxl_mem_err_rec { + struct xarray rec_gen_media; + struct xarray rec_dram; +}; + +enum cxl_mem_repair_type { + CXL_PPR, + CXL_CACHELINE_SPARING, + CXL_ROW_SPARING, + CXL_BANK_SPARING, + CXL_RANK_SPARING, + CXL_REPAIR_MAX, +}; + +/** + * struct cxl_mem_repair_attrbs - CXL memory repair attributes + * @dpa: DPA of memory to repair + * @nibble_mask: nibble mask, identifies one or more nibbles on the memory bus + * @row: row of memory to repair + * @column: column of memory to repair + * @channel: channel of memory to repair + * @sub_channel: sub channel of memory to repair + * @rank: rank of memory to repair + * @bank_group: bank group of memory to repair + * @bank: bank of memory to repair + * @repair_type: repair type. For eg. PPR, memory sparing etc. + */ +struct cxl_mem_repair_attrbs { + u64 dpa; + u32 nibble_mask; + u32 row; + u16 column; + u8 channel; + u8 sub_channel; + u8 rank; + u8 bank_group; + u8 bank; + enum cxl_mem_repair_type repair_type; +}; + +static struct cxl_event_gen_media * +cxl_find_rec_gen_media(struct cxl_memdev *cxlmd, + struct cxl_mem_repair_attrbs *attrbs) +{ + struct cxl_mem_err_rec *array_rec = cxlmd->err_rec_array; + struct cxl_event_gen_media *rec; + + if (!array_rec) + return NULL; + + rec = xa_load(&array_rec->rec_gen_media, attrbs->dpa); + if (!rec) + return NULL; + + if (attrbs->repair_type == CXL_PPR) + return rec; + + return NULL; +} + +static struct cxl_event_dram * +cxl_find_rec_dram(struct cxl_memdev *cxlmd, + struct cxl_mem_repair_attrbs *attrbs) +{ + struct cxl_mem_err_rec *array_rec = cxlmd->err_rec_array; + struct cxl_event_dram *rec; + u16 validity_flags; + + if (!array_rec) + return NULL; + + rec = xa_load(&array_rec->rec_dram, attrbs->dpa); + if (!rec) + return NULL; + + validity_flags = get_unaligned_le16(rec->media_hdr.validity_flags); + if (!(validity_flags & CXL_DER_VALID_CHANNEL) || + !(validity_flags & CXL_DER_VALID_RANK)) + return NULL; + + switch (attrbs->repair_type) { + case CXL_PPR: + if (!(validity_flags & CXL_DER_VALID_NIBBLE) || + get_unaligned_le24(rec->nibble_mask) == attrbs->nibble_mask) + return rec; + break; + case CXL_CACHELINE_SPARING: + if (!(validity_flags & CXL_DER_VALID_BANK_GROUP) || + !(validity_flags & CXL_DER_VALID_BANK) || + !(validity_flags & CXL_DER_VALID_ROW) || + !(validity_flags & CXL_DER_VALID_COLUMN)) + return NULL; + + if (rec->media_hdr.channel == attrbs->channel && + rec->media_hdr.rank == attrbs->rank && + rec->bank_group == attrbs->bank_group && + rec->bank == attrbs->bank && + get_unaligned_le24(rec->row) == attrbs->row && + get_unaligned_le16(rec->column) == attrbs->column && + (!(validity_flags & CXL_DER_VALID_NIBBLE) || + get_unaligned_le24(rec->nibble_mask) == + attrbs->nibble_mask) && + (!(validity_flags & CXL_DER_VALID_SUB_CHANNEL) || + rec->sub_channel == attrbs->sub_channel)) + return rec; + break; + case CXL_ROW_SPARING: + if (!(validity_flags & CXL_DER_VALID_BANK_GROUP) || + !(validity_flags & CXL_DER_VALID_BANK) || + !(validity_flags & CXL_DER_VALID_ROW)) + return NULL; + + if (rec->media_hdr.channel == attrbs->channel && + rec->media_hdr.rank == attrbs->rank && + rec->bank_group == attrbs->bank_group && + rec->bank == attrbs->bank && + get_unaligned_le24(rec->row) == attrbs->row && + (!(validity_flags & CXL_DER_VALID_NIBBLE) || + get_unaligned_le24(rec->nibble_mask) == + attrbs->nibble_mask)) + return rec; + break; + case CXL_BANK_SPARING: + if (!(validity_flags & CXL_DER_VALID_BANK_GROUP) || + !(validity_flags & CXL_DER_VALID_BANK)) + return NULL; + + if 
(rec->media_hdr.channel == attrbs->channel && + rec->media_hdr.rank == attrbs->rank && + rec->bank_group == attrbs->bank_group && + rec->bank == attrbs->bank && + (!(validity_flags & CXL_DER_VALID_NIBBLE) || + get_unaligned_le24(rec->nibble_mask) == + attrbs->nibble_mask)) + return rec; + break; + case CXL_RANK_SPARING: + if (rec->media_hdr.channel == attrbs->channel && + rec->media_hdr.rank == attrbs->rank && + (!(validity_flags & CXL_DER_VALID_NIBBLE) || + get_unaligned_le24(rec->nibble_mask) == + attrbs->nibble_mask)) + return rec; + break; + default: + return NULL; + } + + return NULL; +} + +#define CXL_MAX_STORAGE_DAYS 10 +#define CXL_MAX_STORAGE_TIME_SECS (CXL_MAX_STORAGE_DAYS * 24 * 60 * 60) + +static void cxl_del_expired_gmedia_recs(struct xarray *rec_xarray, + struct cxl_event_gen_media *cur_rec) +{ + u64 cur_ts = le64_to_cpu(cur_rec->media_hdr.hdr.timestamp); + struct cxl_event_gen_media *rec; + unsigned long index; + u64 delta_ts_secs; + + xa_for_each(rec_xarray, index, rec) { + delta_ts_secs = (cur_ts - + le64_to_cpu(rec->media_hdr.hdr.timestamp)) / 1000000000ULL; + if (delta_ts_secs >= CXL_MAX_STORAGE_TIME_SECS) { + xa_erase(rec_xarray, index); + kfree(rec); + } + } +} + +static void cxl_del_expired_dram_recs(struct xarray *rec_xarray, + struct cxl_event_dram *cur_rec) +{ + u64 cur_ts = le64_to_cpu(cur_rec->media_hdr.hdr.timestamp); + struct cxl_event_dram *rec; + unsigned long index; + u64 delta_secs; + + xa_for_each(rec_xarray, index, rec) { + delta_secs = (cur_ts - + le64_to_cpu(rec->media_hdr.hdr.timestamp)) / 1000000000ULL; + if (delta_secs >= CXL_MAX_STORAGE_TIME_SECS) { + xa_erase(rec_xarray, index); + kfree(rec); + } + } +} + +#define CXL_MAX_REC_STORAGE_COUNT 200 + +static void cxl_del_overflow_old_recs(struct xarray *rec_xarray) +{ + void *err_rec; + unsigned long index, count = 0; + + xa_for_each(rec_xarray, index, err_rec) + count++; + + if (count <= CXL_MAX_REC_STORAGE_COUNT) + return; + + count -= CXL_MAX_REC_STORAGE_COUNT; + xa_for_each(rec_xarray, index, err_rec) { + xa_erase(rec_xarray, index); + kfree(err_rec); + count--; + if (!count) + break; + } +} + +int cxl_store_rec_gen_media(struct cxl_memdev *cxlmd, union cxl_event *evt) +{ + struct cxl_mem_err_rec *array_rec = cxlmd->err_rec_array; + struct cxl_event_gen_media *rec; + void *old_rec; + + if (!IS_ENABLED(CONFIG_CXL_EDAC_MEM_REPAIR) || !array_rec) + return 0; + + rec = kmemdup(&evt->gen_media, sizeof(*rec), GFP_KERNEL); + if (!rec) + return -ENOMEM; + + old_rec = xa_store(&array_rec->rec_gen_media, + le64_to_cpu(rec->media_hdr.phys_addr), rec, + GFP_KERNEL); + if (xa_is_err(old_rec)) { + kfree(rec); + return xa_err(old_rec); + } + + kfree(old_rec); + + cxl_del_expired_gmedia_recs(&array_rec->rec_gen_media, rec); + cxl_del_overflow_old_recs(&array_rec->rec_gen_media); + + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_store_rec_gen_media, "CXL"); + +int cxl_store_rec_dram(struct cxl_memdev *cxlmd, union cxl_event *evt) +{ + struct cxl_mem_err_rec *array_rec = cxlmd->err_rec_array; + struct cxl_event_dram *rec; + void *old_rec; + + if (!IS_ENABLED(CONFIG_CXL_EDAC_MEM_REPAIR) || !array_rec) + return 0; + + rec = kmemdup(&evt->dram, sizeof(*rec), GFP_KERNEL); + if (!rec) + return -ENOMEM; + + old_rec = xa_store(&array_rec->rec_dram, + le64_to_cpu(rec->media_hdr.phys_addr), rec, + GFP_KERNEL); + if (xa_is_err(old_rec)) { + kfree(rec); + return xa_err(old_rec); + } + + kfree(old_rec); + + cxl_del_expired_dram_recs(&array_rec->rec_dram, rec); + cxl_del_overflow_old_recs(&array_rec->rec_dram); + + return 0; +} 
+EXPORT_SYMBOL_NS_GPL(cxl_store_rec_dram, "CXL"); + +static bool cxl_is_memdev_memory_online(const struct cxl_memdev *cxlmd) +{ + struct cxl_port *port = cxlmd->endpoint; + + if (port && cxl_num_decoders_committed(port)) + return true; + + return false; +} + +/* + * CXL memory sparing control + */ +enum cxl_mem_sparing_granularity { + CXL_MEM_SPARING_CACHELINE, + CXL_MEM_SPARING_ROW, + CXL_MEM_SPARING_BANK, + CXL_MEM_SPARING_RANK, + CXL_MEM_SPARING_MAX +}; + +struct cxl_mem_sparing_context { + struct cxl_memdev *cxlmd; + uuid_t repair_uuid; + u16 get_feat_size; + u16 set_feat_size; + u16 effects; + u8 instance; + u8 get_version; + u8 set_version; + u8 op_class; + u8 op_subclass; + bool cap_safe_when_in_use; + bool cap_hard_sparing; + bool cap_soft_sparing; + u8 channel; + u8 rank; + u8 bank_group; + u32 nibble_mask; + u64 dpa; + u32 row; + u16 column; + u8 bank; + u8 sub_channel; + enum edac_mem_repair_type repair_type; + bool persist_mode; +}; + +#define CXL_SPARING_RD_CAP_SAFE_IN_USE_MASK BIT(0) +#define CXL_SPARING_RD_CAP_HARD_SPARING_MASK BIT(1) +#define CXL_SPARING_RD_CAP_SOFT_SPARING_MASK BIT(2) + +#define CXL_SPARING_WR_DEVICE_INITIATED_MASK BIT(0) + +#define CXL_SPARING_QUERY_RESOURCE_FLAG BIT(0) +#define CXL_SET_HARD_SPARING_FLAG BIT(1) +#define CXL_SPARING_SUB_CHNL_VALID_FLAG BIT(2) +#define CXL_SPARING_NIB_MASK_VALID_FLAG BIT(3) + +#define CXL_GET_SPARING_SAFE_IN_USE(flags) \ + (FIELD_GET(CXL_SPARING_RD_CAP_SAFE_IN_USE_MASK, \ + flags) ^ 1) +#define CXL_GET_CAP_HARD_SPARING(flags) \ + FIELD_GET(CXL_SPARING_RD_CAP_HARD_SPARING_MASK, \ + flags) +#define CXL_GET_CAP_SOFT_SPARING(flags) \ + FIELD_GET(CXL_SPARING_RD_CAP_SOFT_SPARING_MASK, \ + flags) + +#define CXL_SET_SPARING_QUERY_RESOURCE(val) \ + FIELD_PREP(CXL_SPARING_QUERY_RESOURCE_FLAG, val) +#define CXL_SET_HARD_SPARING(val) \ + FIELD_PREP(CXL_SET_HARD_SPARING_FLAG, val) +#define CXL_SET_SPARING_SUB_CHNL_VALID(val) \ + FIELD_PREP(CXL_SPARING_SUB_CHNL_VALID_FLAG, val) +#define CXL_SET_SPARING_NIB_MASK_VALID(val) \ + FIELD_PREP(CXL_SPARING_NIB_MASK_VALID_FLAG, val) + +/* + * See CXL spec rev 3.2 @8.2.10.7.2.3 Table 8-134 Memory Sparing Feature + * Readable Attributes. + */ +struct cxl_memdev_repair_rd_attrbs_hdr { + u8 max_op_latency; + __le16 op_cap; + __le16 op_mode; + u8 op_class; + u8 op_subclass; + u8 rsvd[9]; +} __packed; + +struct cxl_memdev_sparing_rd_attrbs { + struct cxl_memdev_repair_rd_attrbs_hdr hdr; + u8 rsvd; + __le16 restriction_flags; +} __packed; + +/* + * See CXL spec rev 3.2 @8.2.10.7.1.4 Table 8-120 Memory Sparing Input Payload. 
+ */ +struct cxl_memdev_sparing_in_payload { + u8 flags; + u8 channel; + u8 rank; + u8 nibble_mask[3]; + u8 bank_group; + u8 bank; + u8 row[3]; + __le16 column; + u8 sub_channel; +} __packed; + +static int +cxl_mem_sparing_get_attrbs(struct cxl_mem_sparing_context *cxl_sparing_ctx) +{ + size_t rd_data_size = sizeof(struct cxl_memdev_sparing_rd_attrbs); + struct cxl_memdev *cxlmd = cxl_sparing_ctx->cxlmd; + struct cxl_mailbox *cxl_mbox = &cxlmd->cxlds->cxl_mbox; + u16 restriction_flags; + size_t data_size; + u16 return_code; + struct cxl_memdev_sparing_rd_attrbs *rd_attrbs __free(kfree) = + kzalloc(rd_data_size, GFP_KERNEL); + if (!rd_attrbs) + return -ENOMEM; + + data_size = cxl_get_feature(cxl_mbox, &cxl_sparing_ctx->repair_uuid, + CXL_GET_FEAT_SEL_CURRENT_VALUE, rd_attrbs, + rd_data_size, 0, &return_code); + if (!data_size) + return -EIO; + + cxl_sparing_ctx->op_class = rd_attrbs->hdr.op_class; + cxl_sparing_ctx->op_subclass = rd_attrbs->hdr.op_subclass; + restriction_flags = le16_to_cpu(rd_attrbs->restriction_flags); + cxl_sparing_ctx->cap_safe_when_in_use = + CXL_GET_SPARING_SAFE_IN_USE(restriction_flags); + cxl_sparing_ctx->cap_hard_sparing = + CXL_GET_CAP_HARD_SPARING(restriction_flags); + cxl_sparing_ctx->cap_soft_sparing = + CXL_GET_CAP_SOFT_SPARING(restriction_flags); + + return 0; +} + +static struct cxl_event_dram * +cxl_mem_get_rec_dram(struct cxl_memdev *cxlmd, + struct cxl_mem_sparing_context *ctx) +{ + struct cxl_mem_repair_attrbs attrbs = { 0 }; + + attrbs.dpa = ctx->dpa; + attrbs.channel = ctx->channel; + attrbs.rank = ctx->rank; + attrbs.nibble_mask = ctx->nibble_mask; + switch (ctx->repair_type) { + case EDAC_REPAIR_CACHELINE_SPARING: + attrbs.repair_type = CXL_CACHELINE_SPARING; + attrbs.bank_group = ctx->bank_group; + attrbs.bank = ctx->bank; + attrbs.row = ctx->row; + attrbs.column = ctx->column; + attrbs.sub_channel = ctx->sub_channel; + break; + case EDAC_REPAIR_ROW_SPARING: + attrbs.repair_type = CXL_ROW_SPARING; + attrbs.bank_group = ctx->bank_group; + attrbs.bank = ctx->bank; + attrbs.row = ctx->row; + break; + case EDAC_REPAIR_BANK_SPARING: + attrbs.repair_type = CXL_BANK_SPARING; + attrbs.bank_group = ctx->bank_group; + attrbs.bank = ctx->bank; + break; + case EDAC_REPAIR_RANK_SPARING: + attrbs.repair_type = CXL_RANK_SPARING; + break; + default: + return NULL; + } + + return cxl_find_rec_dram(cxlmd, &attrbs); +} + +static int +cxl_mem_perform_sparing(struct device *dev, + struct cxl_mem_sparing_context *cxl_sparing_ctx) +{ + struct cxl_memdev *cxlmd = cxl_sparing_ctx->cxlmd; + struct cxl_memdev_sparing_in_payload sparing_pi; + struct cxl_event_dram *rec = NULL; + u16 validity_flags = 0; + int ret; + + ACQUIRE(rwsem_read_intr, region_rwsem)(&cxl_rwsem.region); + if ((ret = ACQUIRE_ERR(rwsem_read_intr, ®ion_rwsem))) + return ret; + + ACQUIRE(rwsem_read_intr, dpa_rwsem)(&cxl_rwsem.dpa); + if ((ret = ACQUIRE_ERR(rwsem_read_intr, &dpa_rwsem))) + return ret; + + if (!cxl_sparing_ctx->cap_safe_when_in_use) { + /* Memory to repair must be offline */ + if (cxl_is_memdev_memory_online(cxlmd)) + return -EBUSY; + } else { + if (cxl_is_memdev_memory_online(cxlmd)) { + rec = cxl_mem_get_rec_dram(cxlmd, cxl_sparing_ctx); + if (!rec) + return -EINVAL; + + if (!get_unaligned_le16(rec->media_hdr.validity_flags)) + return -EINVAL; + } + } + + memset(&sparing_pi, 0, sizeof(sparing_pi)); + sparing_pi.flags = CXL_SET_SPARING_QUERY_RESOURCE(0); + if (cxl_sparing_ctx->persist_mode) + sparing_pi.flags |= CXL_SET_HARD_SPARING(1); + + if (rec) + validity_flags = 
get_unaligned_le16(rec->media_hdr.validity_flags); + + switch (cxl_sparing_ctx->repair_type) { + case EDAC_REPAIR_CACHELINE_SPARING: + sparing_pi.column = cpu_to_le16(cxl_sparing_ctx->column); + if (!rec || (validity_flags & CXL_DER_VALID_SUB_CHANNEL)) { + sparing_pi.flags |= CXL_SET_SPARING_SUB_CHNL_VALID(1); + sparing_pi.sub_channel = cxl_sparing_ctx->sub_channel; + } + fallthrough; + case EDAC_REPAIR_ROW_SPARING: + put_unaligned_le24(cxl_sparing_ctx->row, sparing_pi.row); + fallthrough; + case EDAC_REPAIR_BANK_SPARING: + sparing_pi.bank_group = cxl_sparing_ctx->bank_group; + sparing_pi.bank = cxl_sparing_ctx->bank; + fallthrough; + case EDAC_REPAIR_RANK_SPARING: + sparing_pi.rank = cxl_sparing_ctx->rank; + fallthrough; + default: + sparing_pi.channel = cxl_sparing_ctx->channel; + if ((rec && (validity_flags & CXL_DER_VALID_NIBBLE)) || + (!rec && (!cxl_sparing_ctx->nibble_mask || + (cxl_sparing_ctx->nibble_mask & 0xFFFFFF)))) { + sparing_pi.flags |= CXL_SET_SPARING_NIB_MASK_VALID(1); + put_unaligned_le24(cxl_sparing_ctx->nibble_mask, + sparing_pi.nibble_mask); + } + break; + } + + return cxl_perform_maintenance(&cxlmd->cxlds->cxl_mbox, + cxl_sparing_ctx->op_class, + cxl_sparing_ctx->op_subclass, + &sparing_pi, sizeof(sparing_pi)); +} + +static int cxl_mem_sparing_get_repair_type(struct device *dev, void *drv_data, + const char **repair_type) +{ + struct cxl_mem_sparing_context *ctx = drv_data; + + switch (ctx->repair_type) { + case EDAC_REPAIR_CACHELINE_SPARING: + case EDAC_REPAIR_ROW_SPARING: + case EDAC_REPAIR_BANK_SPARING: + case EDAC_REPAIR_RANK_SPARING: + *repair_type = edac_repair_type[ctx->repair_type]; + break; + default: + return -EINVAL; + } + + return 0; +} + +#define CXL_SPARING_GET_ATTR(attrb, data_type) \ + static int cxl_mem_sparing_get_##attrb( \ + struct device *dev, void *drv_data, data_type *val) \ + { \ + struct cxl_mem_sparing_context *ctx = drv_data; \ + \ + *val = ctx->attrb; \ + \ + return 0; \ + } +CXL_SPARING_GET_ATTR(persist_mode, bool) +CXL_SPARING_GET_ATTR(dpa, u64) +CXL_SPARING_GET_ATTR(nibble_mask, u32) +CXL_SPARING_GET_ATTR(bank_group, u32) +CXL_SPARING_GET_ATTR(bank, u32) +CXL_SPARING_GET_ATTR(rank, u32) +CXL_SPARING_GET_ATTR(row, u32) +CXL_SPARING_GET_ATTR(column, u32) +CXL_SPARING_GET_ATTR(channel, u32) +CXL_SPARING_GET_ATTR(sub_channel, u32) + +#define CXL_SPARING_SET_ATTR(attrb, data_type) \ + static int cxl_mem_sparing_set_##attrb(struct device *dev, \ + void *drv_data, data_type val) \ + { \ + struct cxl_mem_sparing_context *ctx = drv_data; \ + \ + ctx->attrb = val; \ + \ + return 0; \ + } +CXL_SPARING_SET_ATTR(nibble_mask, u32) +CXL_SPARING_SET_ATTR(bank_group, u32) +CXL_SPARING_SET_ATTR(bank, u32) +CXL_SPARING_SET_ATTR(rank, u32) +CXL_SPARING_SET_ATTR(row, u32) +CXL_SPARING_SET_ATTR(column, u32) +CXL_SPARING_SET_ATTR(channel, u32) +CXL_SPARING_SET_ATTR(sub_channel, u32) + +static int cxl_mem_sparing_set_persist_mode(struct device *dev, void *drv_data, + bool persist_mode) +{ + struct cxl_mem_sparing_context *ctx = drv_data; + + if ((persist_mode && ctx->cap_hard_sparing) || + (!persist_mode && ctx->cap_soft_sparing)) + ctx->persist_mode = persist_mode; + else + return -EOPNOTSUPP; + + return 0; +} + +static int cxl_get_mem_sparing_safe_when_in_use(struct device *dev, + void *drv_data, bool *safe) +{ + struct cxl_mem_sparing_context *ctx = drv_data; + + *safe = ctx->cap_safe_when_in_use; + + return 0; +} + +static int cxl_mem_sparing_get_min_dpa(struct device *dev, void *drv_data, + u64 *min_dpa) +{ + struct cxl_mem_sparing_context *ctx = drv_data; 
+ struct cxl_memdev *cxlmd = ctx->cxlmd; + struct cxl_dev_state *cxlds = cxlmd->cxlds; + + *min_dpa = cxlds->dpa_res.start; + + return 0; +} + +static int cxl_mem_sparing_get_max_dpa(struct device *dev, void *drv_data, + u64 *max_dpa) +{ + struct cxl_mem_sparing_context *ctx = drv_data; + struct cxl_memdev *cxlmd = ctx->cxlmd; + struct cxl_dev_state *cxlds = cxlmd->cxlds; + + *max_dpa = cxlds->dpa_res.end; + + return 0; +} + +static int cxl_mem_sparing_set_dpa(struct device *dev, void *drv_data, u64 dpa) +{ + struct cxl_mem_sparing_context *ctx = drv_data; + struct cxl_memdev *cxlmd = ctx->cxlmd; + struct cxl_dev_state *cxlds = cxlmd->cxlds; + + if (!cxl_resource_contains_addr(&cxlds->dpa_res, dpa)) + return -EINVAL; + + ctx->dpa = dpa; + + return 0; +} + +static int cxl_do_mem_sparing(struct device *dev, void *drv_data, u32 val) +{ + struct cxl_mem_sparing_context *ctx = drv_data; + + if (val != EDAC_DO_MEM_REPAIR) + return -EINVAL; + + return cxl_mem_perform_sparing(dev, ctx); +} + +#define RANK_OPS \ + .get_repair_type = cxl_mem_sparing_get_repair_type, \ + .get_persist_mode = cxl_mem_sparing_get_persist_mode, \ + .set_persist_mode = cxl_mem_sparing_set_persist_mode, \ + .get_repair_safe_when_in_use = cxl_get_mem_sparing_safe_when_in_use, \ + .get_min_dpa = cxl_mem_sparing_get_min_dpa, \ + .get_max_dpa = cxl_mem_sparing_get_max_dpa, \ + .get_dpa = cxl_mem_sparing_get_dpa, \ + .set_dpa = cxl_mem_sparing_set_dpa, \ + .get_nibble_mask = cxl_mem_sparing_get_nibble_mask, \ + .set_nibble_mask = cxl_mem_sparing_set_nibble_mask, \ + .get_rank = cxl_mem_sparing_get_rank, \ + .set_rank = cxl_mem_sparing_set_rank, \ + .get_channel = cxl_mem_sparing_get_channel, \ + .set_channel = cxl_mem_sparing_set_channel, \ + .do_repair = cxl_do_mem_sparing + +#define BANK_OPS \ + RANK_OPS, .get_bank_group = cxl_mem_sparing_get_bank_group, \ + .set_bank_group = cxl_mem_sparing_set_bank_group, \ + .get_bank = cxl_mem_sparing_get_bank, \ + .set_bank = cxl_mem_sparing_set_bank + +#define ROW_OPS \ + BANK_OPS, .get_row = cxl_mem_sparing_get_row, \ + .set_row = cxl_mem_sparing_set_row + +#define CACHELINE_OPS \ + ROW_OPS, .get_column = cxl_mem_sparing_get_column, \ + .set_column = cxl_mem_sparing_set_column, \ + .get_sub_channel = cxl_mem_sparing_get_sub_channel, \ + .set_sub_channel = cxl_mem_sparing_set_sub_channel + +static const struct edac_mem_repair_ops cxl_rank_sparing_ops = { + RANK_OPS, +}; + +static const struct edac_mem_repair_ops cxl_bank_sparing_ops = { + BANK_OPS, +}; + +static const struct edac_mem_repair_ops cxl_row_sparing_ops = { + ROW_OPS, +}; + +static const struct edac_mem_repair_ops cxl_cacheline_sparing_ops = { + CACHELINE_OPS, +}; + +struct cxl_mem_sparing_desc { + const uuid_t repair_uuid; + enum edac_mem_repair_type repair_type; + const struct edac_mem_repair_ops *repair_ops; +}; + +static const struct cxl_mem_sparing_desc mem_sparing_desc[] = { + { + .repair_uuid = CXL_FEAT_CACHELINE_SPARING_UUID, + .repair_type = EDAC_REPAIR_CACHELINE_SPARING, + .repair_ops = &cxl_cacheline_sparing_ops, + }, + { + .repair_uuid = CXL_FEAT_ROW_SPARING_UUID, + .repair_type = EDAC_REPAIR_ROW_SPARING, + .repair_ops = &cxl_row_sparing_ops, + }, + { + .repair_uuid = CXL_FEAT_BANK_SPARING_UUID, + .repair_type = EDAC_REPAIR_BANK_SPARING, + .repair_ops = &cxl_bank_sparing_ops, + }, + { + .repair_uuid = CXL_FEAT_RANK_SPARING_UUID, + .repair_type = EDAC_REPAIR_RANK_SPARING, + .repair_ops = &cxl_rank_sparing_ops, + }, +}; + +static int cxl_memdev_sparing_init(struct cxl_memdev *cxlmd, + struct edac_dev_feature 
*ras_feature, + const struct cxl_mem_sparing_desc *desc, + u8 repair_inst) +{ + struct cxl_mem_sparing_context *cxl_sparing_ctx; + struct cxl_feat_entry *feat_entry; + int ret; + + feat_entry = cxl_feature_info(to_cxlfs(cxlmd->cxlds), + &desc->repair_uuid); + if (IS_ERR(feat_entry)) + return -EOPNOTSUPP; + + if (!(le32_to_cpu(feat_entry->flags) & CXL_FEATURE_F_CHANGEABLE)) + return -EOPNOTSUPP; + + cxl_sparing_ctx = devm_kzalloc(&cxlmd->dev, sizeof(*cxl_sparing_ctx), + GFP_KERNEL); + if (!cxl_sparing_ctx) + return -ENOMEM; + + *cxl_sparing_ctx = (struct cxl_mem_sparing_context){ + .get_feat_size = le16_to_cpu(feat_entry->get_feat_size), + .set_feat_size = le16_to_cpu(feat_entry->set_feat_size), + .get_version = feat_entry->get_feat_ver, + .set_version = feat_entry->set_feat_ver, + .effects = le16_to_cpu(feat_entry->effects), + .cxlmd = cxlmd, + .repair_type = desc->repair_type, + .instance = repair_inst++, + }; + uuid_copy(&cxl_sparing_ctx->repair_uuid, &desc->repair_uuid); + + ret = cxl_mem_sparing_get_attrbs(cxl_sparing_ctx); + if (ret) + return ret; + + if ((cxl_sparing_ctx->cap_soft_sparing && + cxl_sparing_ctx->cap_hard_sparing) || + cxl_sparing_ctx->cap_soft_sparing) + cxl_sparing_ctx->persist_mode = 0; + else if (cxl_sparing_ctx->cap_hard_sparing) + cxl_sparing_ctx->persist_mode = 1; + else + return -EOPNOTSUPP; + + ras_feature->ft_type = RAS_FEAT_MEM_REPAIR; + ras_feature->instance = cxl_sparing_ctx->instance; + ras_feature->mem_repair_ops = desc->repair_ops; + ras_feature->ctx = cxl_sparing_ctx; + + return 0; +} + +/* + * CXL memory soft PPR & hard PPR control + */ +struct cxl_ppr_context { + uuid_t repair_uuid; + u8 instance; + u16 get_feat_size; + u16 set_feat_size; + u8 get_version; + u8 set_version; + u16 effects; + u8 op_class; + u8 op_subclass; + bool cap_dpa; + bool cap_nib_mask; + bool media_accessible; + bool data_retained; + struct cxl_memdev *cxlmd; + enum edac_mem_repair_type repair_type; + bool persist_mode; + u64 dpa; + u32 nibble_mask; +}; + +/* + * See CXL rev 3.2 @8.2.10.7.2.1 Table 8-128 sPPR Feature Readable Attributes + * + * See CXL rev 3.2 @8.2.10.7.2.2 Table 8-131 hPPR Feature Readable Attributes + */ + +#define CXL_PPR_OP_CAP_DEVICE_INITIATED BIT(0) +#define CXL_PPR_OP_MODE_DEV_INITIATED BIT(0) + +#define CXL_PPR_FLAG_DPA_SUPPORT_MASK BIT(0) +#define CXL_PPR_FLAG_NIB_SUPPORT_MASK BIT(1) +#define CXL_PPR_FLAG_MEM_SPARING_EV_REC_SUPPORT_MASK BIT(2) +#define CXL_PPR_FLAG_DEV_INITED_PPR_AT_BOOT_CAP_MASK BIT(3) + +#define CXL_PPR_RESTRICTION_FLAG_MEDIA_ACCESSIBLE_MASK BIT(0) +#define CXL_PPR_RESTRICTION_FLAG_DATA_RETAINED_MASK BIT(2) + +#define CXL_PPR_SPARING_EV_REC_EN_MASK BIT(0) +#define CXL_PPR_DEV_INITED_PPR_AT_BOOT_EN_MASK BIT(1) + +#define CXL_PPR_GET_CAP_DPA(flags) \ + FIELD_GET(CXL_PPR_FLAG_DPA_SUPPORT_MASK, flags) +#define CXL_PPR_GET_CAP_NIB_MASK(flags) \ + FIELD_GET(CXL_PPR_FLAG_NIB_SUPPORT_MASK, flags) +#define CXL_PPR_GET_MEDIA_ACCESSIBLE(restriction_flags) \ + (FIELD_GET(CXL_PPR_RESTRICTION_FLAG_MEDIA_ACCESSIBLE_MASK, \ + restriction_flags) ^ 1) +#define CXL_PPR_GET_DATA_RETAINED(restriction_flags) \ + (FIELD_GET(CXL_PPR_RESTRICTION_FLAG_DATA_RETAINED_MASK, \ + restriction_flags) ^ 1) + +struct cxl_memdev_ppr_rd_attrbs { + struct cxl_memdev_repair_rd_attrbs_hdr hdr; + u8 ppr_flags; + __le16 restriction_flags; + u8 ppr_op_mode; +} __packed; + +/* + * See CXL rev 3.2 @8.2.10.7.1.2 Table 8-118 sPPR Maintenance Input Payload + * + * See CXL rev 3.2 @8.2.10.7.1.3 Table 8-119 hPPR Maintenance Input Payload + */ +struct cxl_memdev_ppr_maintenance_attrbs 
{ + u8 flags; + __le64 dpa; + u8 nibble_mask[3]; +} __packed; + +static int cxl_mem_ppr_get_attrbs(struct cxl_ppr_context *cxl_ppr_ctx) +{ + size_t rd_data_size = sizeof(struct cxl_memdev_ppr_rd_attrbs); + struct cxl_memdev *cxlmd = cxl_ppr_ctx->cxlmd; + struct cxl_mailbox *cxl_mbox = &cxlmd->cxlds->cxl_mbox; + u16 restriction_flags; + size_t data_size; + u16 return_code; + + struct cxl_memdev_ppr_rd_attrbs *rd_attrbs __free(kfree) = + kmalloc(rd_data_size, GFP_KERNEL); + if (!rd_attrbs) + return -ENOMEM; + + data_size = cxl_get_feature(cxl_mbox, &cxl_ppr_ctx->repair_uuid, + CXL_GET_FEAT_SEL_CURRENT_VALUE, rd_attrbs, + rd_data_size, 0, &return_code); + if (!data_size) + return -EIO; + + cxl_ppr_ctx->op_class = rd_attrbs->hdr.op_class; + cxl_ppr_ctx->op_subclass = rd_attrbs->hdr.op_subclass; + cxl_ppr_ctx->cap_dpa = CXL_PPR_GET_CAP_DPA(rd_attrbs->ppr_flags); + cxl_ppr_ctx->cap_nib_mask = + CXL_PPR_GET_CAP_NIB_MASK(rd_attrbs->ppr_flags); + + restriction_flags = le16_to_cpu(rd_attrbs->restriction_flags); + cxl_ppr_ctx->media_accessible = + CXL_PPR_GET_MEDIA_ACCESSIBLE(restriction_flags); + cxl_ppr_ctx->data_retained = + CXL_PPR_GET_DATA_RETAINED(restriction_flags); + + return 0; +} + +static int cxl_mem_perform_ppr(struct cxl_ppr_context *cxl_ppr_ctx) +{ + struct cxl_memdev_ppr_maintenance_attrbs maintenance_attrbs; + struct cxl_memdev *cxlmd = cxl_ppr_ctx->cxlmd; + struct cxl_mem_repair_attrbs attrbs = { 0 }; + int ret; + + ACQUIRE(rwsem_read_intr, region_rwsem)(&cxl_rwsem.region); + if ((ret = ACQUIRE_ERR(rwsem_read_intr, ®ion_rwsem))) + return ret; + + ACQUIRE(rwsem_read_intr, dpa_rwsem)(&cxl_rwsem.dpa); + if ((ret = ACQUIRE_ERR(rwsem_read_intr, &dpa_rwsem))) + return ret; + + if (!cxl_ppr_ctx->media_accessible || !cxl_ppr_ctx->data_retained) { + /* Memory to repair must be offline */ + if (cxl_is_memdev_memory_online(cxlmd)) + return -EBUSY; + } else { + if (cxl_is_memdev_memory_online(cxlmd)) { + /* Check memory to repair is from the current boot */ + attrbs.repair_type = CXL_PPR; + attrbs.dpa = cxl_ppr_ctx->dpa; + attrbs.nibble_mask = cxl_ppr_ctx->nibble_mask; + if (!cxl_find_rec_dram(cxlmd, &attrbs) && + !cxl_find_rec_gen_media(cxlmd, &attrbs)) + return -EINVAL; + } + } + + memset(&maintenance_attrbs, 0, sizeof(maintenance_attrbs)); + maintenance_attrbs.flags = 0; + maintenance_attrbs.dpa = cpu_to_le64(cxl_ppr_ctx->dpa); + put_unaligned_le24(cxl_ppr_ctx->nibble_mask, + maintenance_attrbs.nibble_mask); + + return cxl_perform_maintenance(&cxlmd->cxlds->cxl_mbox, + cxl_ppr_ctx->op_class, + cxl_ppr_ctx->op_subclass, + &maintenance_attrbs, + sizeof(maintenance_attrbs)); +} + +static int cxl_ppr_get_repair_type(struct device *dev, void *drv_data, + const char **repair_type) +{ + *repair_type = edac_repair_type[EDAC_REPAIR_PPR]; + + return 0; +} + +static int cxl_ppr_get_persist_mode(struct device *dev, void *drv_data, + bool *persist_mode) +{ + struct cxl_ppr_context *cxl_ppr_ctx = drv_data; + + *persist_mode = cxl_ppr_ctx->persist_mode; + + return 0; +} + +static int cxl_get_ppr_safe_when_in_use(struct device *dev, void *drv_data, + bool *safe) +{ + struct cxl_ppr_context *cxl_ppr_ctx = drv_data; + + *safe = cxl_ppr_ctx->media_accessible & cxl_ppr_ctx->data_retained; + + return 0; +} + +static int cxl_ppr_get_min_dpa(struct device *dev, void *drv_data, u64 *min_dpa) +{ + struct cxl_ppr_context *cxl_ppr_ctx = drv_data; + struct cxl_memdev *cxlmd = cxl_ppr_ctx->cxlmd; + struct cxl_dev_state *cxlds = cxlmd->cxlds; + + *min_dpa = cxlds->dpa_res.start; + + return 0; +} + +static int 
cxl_ppr_get_max_dpa(struct device *dev, void *drv_data, u64 *max_dpa) +{ + struct cxl_ppr_context *cxl_ppr_ctx = drv_data; + struct cxl_memdev *cxlmd = cxl_ppr_ctx->cxlmd; + struct cxl_dev_state *cxlds = cxlmd->cxlds; + + *max_dpa = cxlds->dpa_res.end; + + return 0; +} + +static int cxl_ppr_get_dpa(struct device *dev, void *drv_data, u64 *dpa) +{ + struct cxl_ppr_context *cxl_ppr_ctx = drv_data; + + *dpa = cxl_ppr_ctx->dpa; + + return 0; +} + +static int cxl_ppr_set_dpa(struct device *dev, void *drv_data, u64 dpa) +{ + struct cxl_ppr_context *cxl_ppr_ctx = drv_data; + struct cxl_memdev *cxlmd = cxl_ppr_ctx->cxlmd; + struct cxl_dev_state *cxlds = cxlmd->cxlds; + + if (!cxl_resource_contains_addr(&cxlds->dpa_res, dpa)) + return -EINVAL; + + cxl_ppr_ctx->dpa = dpa; + + return 0; +} + +static int cxl_ppr_get_nibble_mask(struct device *dev, void *drv_data, + u32 *nibble_mask) +{ + struct cxl_ppr_context *cxl_ppr_ctx = drv_data; + + *nibble_mask = cxl_ppr_ctx->nibble_mask; + + return 0; +} + +static int cxl_ppr_set_nibble_mask(struct device *dev, void *drv_data, + u32 nibble_mask) +{ + struct cxl_ppr_context *cxl_ppr_ctx = drv_data; + + cxl_ppr_ctx->nibble_mask = nibble_mask; + + return 0; +} + +static int cxl_do_ppr(struct device *dev, void *drv_data, u32 val) +{ + struct cxl_ppr_context *cxl_ppr_ctx = drv_data; + struct cxl_memdev *cxlmd = cxl_ppr_ctx->cxlmd; + struct cxl_dev_state *cxlds = cxlmd->cxlds; + + if (val != EDAC_DO_MEM_REPAIR || + !cxl_resource_contains_addr(&cxlds->dpa_res, cxl_ppr_ctx->dpa)) + return -EINVAL; + + return cxl_mem_perform_ppr(cxl_ppr_ctx); +} + +static const struct edac_mem_repair_ops cxl_sppr_ops = { + .get_repair_type = cxl_ppr_get_repair_type, + .get_persist_mode = cxl_ppr_get_persist_mode, + .get_repair_safe_when_in_use = cxl_get_ppr_safe_when_in_use, + .get_min_dpa = cxl_ppr_get_min_dpa, + .get_max_dpa = cxl_ppr_get_max_dpa, + .get_dpa = cxl_ppr_get_dpa, + .set_dpa = cxl_ppr_set_dpa, + .get_nibble_mask = cxl_ppr_get_nibble_mask, + .set_nibble_mask = cxl_ppr_set_nibble_mask, + .do_repair = cxl_do_ppr, +}; + +static int cxl_memdev_soft_ppr_init(struct cxl_memdev *cxlmd, + struct edac_dev_feature *ras_feature, + u8 repair_inst) +{ + struct cxl_ppr_context *cxl_sppr_ctx; + struct cxl_feat_entry *feat_entry; + int ret; + + feat_entry = cxl_feature_info(to_cxlfs(cxlmd->cxlds), + &CXL_FEAT_SPPR_UUID); + if (IS_ERR(feat_entry)) + return -EOPNOTSUPP; + + if (!(le32_to_cpu(feat_entry->flags) & CXL_FEATURE_F_CHANGEABLE)) + return -EOPNOTSUPP; + + cxl_sppr_ctx = + devm_kzalloc(&cxlmd->dev, sizeof(*cxl_sppr_ctx), GFP_KERNEL); + if (!cxl_sppr_ctx) + return -ENOMEM; + + *cxl_sppr_ctx = (struct cxl_ppr_context){ + .get_feat_size = le16_to_cpu(feat_entry->get_feat_size), + .set_feat_size = le16_to_cpu(feat_entry->set_feat_size), + .get_version = feat_entry->get_feat_ver, + .set_version = feat_entry->set_feat_ver, + .effects = le16_to_cpu(feat_entry->effects), + .cxlmd = cxlmd, + .repair_type = EDAC_REPAIR_PPR, + .persist_mode = 0, + .instance = repair_inst, + }; + uuid_copy(&cxl_sppr_ctx->repair_uuid, &CXL_FEAT_SPPR_UUID); + + ret = cxl_mem_ppr_get_attrbs(cxl_sppr_ctx); + if (ret) + return ret; + + ras_feature->ft_type = RAS_FEAT_MEM_REPAIR; + ras_feature->instance = cxl_sppr_ctx->instance; + ras_feature->mem_repair_ops = &cxl_sppr_ops; + ras_feature->ctx = cxl_sppr_ctx; + + return 0; +} + +int devm_cxl_memdev_edac_register(struct cxl_memdev *cxlmd) +{ + struct edac_dev_feature ras_features[CXL_NR_EDAC_DEV_FEATURES]; + int num_ras_features = 0; + u8 repair_inst = 0; + int rc; + 
+ if (IS_ENABLED(CONFIG_CXL_EDAC_SCRUB)) { + rc = cxl_memdev_scrub_init(cxlmd, &ras_features[num_ras_features], 0); + if (rc < 0 && rc != -EOPNOTSUPP) + return rc; + + if (rc != -EOPNOTSUPP) + num_ras_features++; + } + + if (IS_ENABLED(CONFIG_CXL_EDAC_ECS)) { + rc = cxl_memdev_ecs_init(cxlmd, &ras_features[num_ras_features]); + if (rc < 0 && rc != -EOPNOTSUPP) + return rc; + + if (rc != -EOPNOTSUPP) + num_ras_features++; + } + + if (IS_ENABLED(CONFIG_CXL_EDAC_MEM_REPAIR)) { + for (int i = 0; i < CXL_MEM_SPARING_MAX; i++) { + rc = cxl_memdev_sparing_init(cxlmd, + &ras_features[num_ras_features], + &mem_sparing_desc[i], repair_inst); + if (rc == -EOPNOTSUPP) + continue; + if (rc < 0) + return rc; + + repair_inst++; + num_ras_features++; + } + + rc = cxl_memdev_soft_ppr_init(cxlmd, &ras_features[num_ras_features], + repair_inst); + if (rc < 0 && rc != -EOPNOTSUPP) + return rc; + + if (rc != -EOPNOTSUPP) { + repair_inst++; + num_ras_features++; + } + + if (repair_inst) { + struct cxl_mem_err_rec *array_rec = + devm_kzalloc(&cxlmd->dev, sizeof(*array_rec), + GFP_KERNEL); + if (!array_rec) + return -ENOMEM; + + xa_init(&array_rec->rec_gen_media); + xa_init(&array_rec->rec_dram); + cxlmd->err_rec_array = array_rec; + } + } + + if (!num_ras_features) + return -EINVAL; + + char *cxl_dev_name __free(kfree) = + kasprintf(GFP_KERNEL, "cxl_%s", dev_name(&cxlmd->dev)); + if (!cxl_dev_name) + return -ENOMEM; + + return edac_dev_register(&cxlmd->dev, cxl_dev_name, NULL, + num_ras_features, ras_features); +} +EXPORT_SYMBOL_NS_GPL(devm_cxl_memdev_edac_register, "CXL"); + +int devm_cxl_region_edac_register(struct cxl_region *cxlr) +{ + struct edac_dev_feature ras_features[CXL_NR_EDAC_DEV_FEATURES]; + int num_ras_features = 0; + int rc; + + if (!IS_ENABLED(CONFIG_CXL_EDAC_SCRUB)) + return 0; + + rc = cxl_region_scrub_init(cxlr, &ras_features[num_ras_features], 0); + if (rc < 0) + return rc; + + num_ras_features++; + + char *cxl_dev_name __free(kfree) = + kasprintf(GFP_KERNEL, "cxl_%s", dev_name(&cxlr->dev)); + if (!cxl_dev_name) + return -ENOMEM; + + return edac_dev_register(&cxlr->dev, cxl_dev_name, NULL, + num_ras_features, ras_features); +} +EXPORT_SYMBOL_NS_GPL(devm_cxl_region_edac_register, "CXL"); + +void devm_cxl_memdev_edac_release(struct cxl_memdev *cxlmd) +{ + struct cxl_mem_err_rec *array_rec = cxlmd->err_rec_array; + struct cxl_event_gen_media *rec_gen_media; + struct cxl_event_dram *rec_dram; + unsigned long index; + + if (!IS_ENABLED(CONFIG_CXL_EDAC_MEM_REPAIR) || !array_rec) + return; + + xa_for_each(&array_rec->rec_dram, index, rec_dram) + kfree(rec_dram); + xa_destroy(&array_rec->rec_dram); + + xa_for_each(&array_rec->rec_gen_media, index, rec_gen_media) + kfree(rec_gen_media); + xa_destroy(&array_rec->rec_gen_media); +} +EXPORT_SYMBOL_NS_GPL(devm_cxl_memdev_edac_release, "CXL"); diff --git a/drivers/cxl/core/features.c b/drivers/cxl/core/features.c new file mode 100644 index 000000000000..4bc484b46f43 --- /dev/null +++ b/drivers/cxl/core/features.c @@ -0,0 +1,706 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2024-2025 Intel Corporation. All rights reserved. 
*/ +#include <linux/fwctl.h> +#include <linux/device.h> +#include <cxl/mailbox.h> +#include <cxl/features.h> +#include <uapi/fwctl/cxl.h> +#include "cxl.h" +#include "core.h" +#include "cxlmem.h" + +/** + * DOC: cxl features + * + * CXL Features: + * A CXL device that includes a mailbox supports commands that allows + * listing, getting, and setting of optionally defined features such + * as memory sparing or post package sparing. Vendors may define custom + * features for the device. + */ + +/* All the features below are exclusive to the kernel */ +static const uuid_t cxl_exclusive_feats[] = { + CXL_FEAT_PATROL_SCRUB_UUID, + CXL_FEAT_ECS_UUID, + CXL_FEAT_SPPR_UUID, + CXL_FEAT_HPPR_UUID, + CXL_FEAT_CACHELINE_SPARING_UUID, + CXL_FEAT_ROW_SPARING_UUID, + CXL_FEAT_BANK_SPARING_UUID, + CXL_FEAT_RANK_SPARING_UUID, +}; + +static bool is_cxl_feature_exclusive_by_uuid(const uuid_t *uuid) +{ + for (int i = 0; i < ARRAY_SIZE(cxl_exclusive_feats); i++) { + if (uuid_equal(uuid, &cxl_exclusive_feats[i])) + return true; + } + + return false; +} + +static bool is_cxl_feature_exclusive(struct cxl_feat_entry *entry) +{ + return is_cxl_feature_exclusive_by_uuid(&entry->uuid); +} + +struct cxl_features_state *to_cxlfs(struct cxl_dev_state *cxlds) +{ + return cxlds->cxlfs; +} +EXPORT_SYMBOL_NS_GPL(to_cxlfs, "CXL"); + +static int cxl_get_supported_features_count(struct cxl_mailbox *cxl_mbox) +{ + struct cxl_mbox_get_sup_feats_out mbox_out; + struct cxl_mbox_get_sup_feats_in mbox_in; + struct cxl_mbox_cmd mbox_cmd; + int rc; + + memset(&mbox_in, 0, sizeof(mbox_in)); + mbox_in.count = cpu_to_le32(sizeof(mbox_out)); + memset(&mbox_out, 0, sizeof(mbox_out)); + mbox_cmd = (struct cxl_mbox_cmd) { + .opcode = CXL_MBOX_OP_GET_SUPPORTED_FEATURES, + .size_in = sizeof(mbox_in), + .payload_in = &mbox_in, + .size_out = sizeof(mbox_out), + .payload_out = &mbox_out, + .min_out = sizeof(mbox_out), + }; + rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd); + if (rc < 0) + return rc; + + return le16_to_cpu(mbox_out.supported_feats); +} + +static struct cxl_feat_entries * +get_supported_features(struct cxl_features_state *cxlfs) +{ + int remain_feats, max_size, max_feats, start, rc, hdr_size; + struct cxl_mailbox *cxl_mbox = &cxlfs->cxlds->cxl_mbox; + int feat_size = sizeof(struct cxl_feat_entry); + struct cxl_mbox_get_sup_feats_in mbox_in; + struct cxl_feat_entry *entry; + struct cxl_mbox_cmd mbox_cmd; + int user_feats = 0; + int count; + + count = cxl_get_supported_features_count(cxl_mbox); + if (count <= 0) + return NULL; + + struct cxl_feat_entries *entries __free(kvfree) = + kvmalloc(struct_size(entries, ent, count), GFP_KERNEL); + if (!entries) + return NULL; + + struct cxl_mbox_get_sup_feats_out *mbox_out __free(kvfree) = + kvmalloc(cxl_mbox->payload_size, GFP_KERNEL); + if (!mbox_out) + return NULL; + + hdr_size = struct_size(mbox_out, ents, 0); + max_size = cxl_mbox->payload_size - hdr_size; + /* max feat entries that can fit in mailbox max payload size */ + max_feats = max_size / feat_size; + entry = entries->ent; + + start = 0; + remain_feats = count; + do { + int retrieved, alloc_size, copy_feats; + int num_entries; + + if (remain_feats > max_feats) { + alloc_size = struct_size(mbox_out, ents, max_feats); + remain_feats = remain_feats - max_feats; + copy_feats = max_feats; + } else { + alloc_size = struct_size(mbox_out, ents, remain_feats); + copy_feats = remain_feats; + remain_feats = 0; + } + + memset(&mbox_in, 0, sizeof(mbox_in)); + mbox_in.count = cpu_to_le32(alloc_size); + mbox_in.start_idx = cpu_to_le16(start); + 
memset(mbox_out, 0, alloc_size); + mbox_cmd = (struct cxl_mbox_cmd) { + .opcode = CXL_MBOX_OP_GET_SUPPORTED_FEATURES, + .size_in = sizeof(mbox_in), + .payload_in = &mbox_in, + .size_out = alloc_size, + .payload_out = mbox_out, + .min_out = hdr_size, + }; + rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd); + if (rc < 0) + return NULL; + + if (mbox_cmd.size_out <= hdr_size) + return NULL; + + /* + * Make sure retrieved out buffer is multiple of feature + * entries. + */ + retrieved = mbox_cmd.size_out - hdr_size; + if (retrieved % feat_size) + return NULL; + + num_entries = le16_to_cpu(mbox_out->num_entries); + /* + * If the reported output entries * defined entry size != + * retrieved output bytes, then the output package is incorrect. + */ + if (num_entries * feat_size != retrieved) + return NULL; + + memcpy(entry, mbox_out->ents, retrieved); + for (int i = 0; i < num_entries; i++) { + if (!is_cxl_feature_exclusive(entry + i)) + user_feats++; + } + entry += num_entries; + /* + * If the number of output entries is less than expected, add the + * remaining entries to the next batch. + */ + remain_feats += copy_feats - num_entries; + start += num_entries; + } while (remain_feats); + + entries->num_features = count; + entries->num_user_features = user_feats; + + return no_free_ptr(entries); +} + +static void free_cxlfs(void *_cxlfs) +{ + struct cxl_features_state *cxlfs = _cxlfs; + struct cxl_dev_state *cxlds = cxlfs->cxlds; + + cxlds->cxlfs = NULL; + kvfree(cxlfs->entries); + kfree(cxlfs); +} + +/** + * devm_cxl_setup_features() - Allocate and initialize features context + * @cxlds: CXL device context + * + * Return 0 on success or -errno on failure. + */ +int devm_cxl_setup_features(struct cxl_dev_state *cxlds) +{ + struct cxl_mailbox *cxl_mbox = &cxlds->cxl_mbox; + + if (cxl_mbox->feat_cap < CXL_FEATURES_RO) + return -ENODEV; + + struct cxl_features_state *cxlfs __free(kfree) = + kzalloc(sizeof(*cxlfs), GFP_KERNEL); + if (!cxlfs) + return -ENOMEM; + + cxlfs->cxlds = cxlds; + + cxlfs->entries = get_supported_features(cxlfs); + if (!cxlfs->entries) + return -ENOMEM; + + cxlds->cxlfs = cxlfs; + + return devm_add_action_or_reset(cxlds->dev, free_cxlfs, no_free_ptr(cxlfs)); +} +EXPORT_SYMBOL_NS_GPL(devm_cxl_setup_features, "CXL"); + +size_t cxl_get_feature(struct cxl_mailbox *cxl_mbox, const uuid_t *feat_uuid, + enum cxl_get_feat_selection selection, + void *feat_out, size_t feat_out_size, u16 offset, + u16 *return_code) +{ + size_t data_to_rd_size, size_out; + struct cxl_mbox_get_feat_in pi; + struct cxl_mbox_cmd mbox_cmd; + size_t data_rcvd_size = 0; + int rc; + + if (return_code) + *return_code = CXL_MBOX_CMD_RC_INPUT; + + if (!feat_out || !feat_out_size) + return 0; + + size_out = min(feat_out_size, cxl_mbox->payload_size); + uuid_copy(&pi.uuid, feat_uuid); + pi.selection = selection; + do { + data_to_rd_size = min(feat_out_size - data_rcvd_size, + cxl_mbox->payload_size); + pi.offset = cpu_to_le16(offset + data_rcvd_size); + pi.count = cpu_to_le16(data_to_rd_size); + + mbox_cmd = (struct cxl_mbox_cmd) { + .opcode = CXL_MBOX_OP_GET_FEATURE, + .size_in = sizeof(pi), + .payload_in = &pi, + .size_out = size_out, + .payload_out = feat_out + data_rcvd_size, + .min_out = data_to_rd_size, + }; + rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd); + if (rc < 0 || !mbox_cmd.size_out) { + if (return_code) + *return_code = mbox_cmd.return_code; + return 0; + } + data_rcvd_size += mbox_cmd.size_out; + } while (data_rcvd_size < feat_out_size); + + if (return_code) + *return_code = CXL_MBOX_CMD_RC_SUCCESS; + 
+ return data_rcvd_size; +} + +/* + * FEAT_DATA_MIN_PAYLOAD_SIZE - min extra number of bytes should be + * available in the mailbox for storing the actual feature data so that + * the feature data transfer would work as expected. + */ +#define FEAT_DATA_MIN_PAYLOAD_SIZE 10 +int cxl_set_feature(struct cxl_mailbox *cxl_mbox, + const uuid_t *feat_uuid, u8 feat_version, + const void *feat_data, size_t feat_data_size, + u32 feat_flag, u16 offset, u16 *return_code) +{ + size_t data_in_size, data_sent_size = 0; + struct cxl_mbox_cmd mbox_cmd; + size_t hdr_size; + + if (return_code) + *return_code = CXL_MBOX_CMD_RC_INPUT; + + struct cxl_mbox_set_feat_in *pi __free(kfree) = + kzalloc(cxl_mbox->payload_size, GFP_KERNEL); + if (!pi) + return -ENOMEM; + + uuid_copy(&pi->uuid, feat_uuid); + pi->version = feat_version; + feat_flag &= ~CXL_SET_FEAT_FLAG_DATA_TRANSFER_MASK; + feat_flag |= CXL_SET_FEAT_FLAG_DATA_SAVED_ACROSS_RESET; + hdr_size = sizeof(pi->hdr); + /* + * Check minimum mbox payload size is available for + * the feature data transfer. + */ + if (hdr_size + FEAT_DATA_MIN_PAYLOAD_SIZE > cxl_mbox->payload_size) + return -ENOMEM; + + if (hdr_size + feat_data_size <= cxl_mbox->payload_size) { + pi->flags = cpu_to_le32(feat_flag | + CXL_SET_FEAT_FLAG_FULL_DATA_TRANSFER); + data_in_size = feat_data_size; + } else { + pi->flags = cpu_to_le32(feat_flag | + CXL_SET_FEAT_FLAG_INITIATE_DATA_TRANSFER); + data_in_size = cxl_mbox->payload_size - hdr_size; + } + + do { + int rc; + + pi->offset = cpu_to_le16(offset + data_sent_size); + memcpy(pi->feat_data, feat_data + data_sent_size, data_in_size); + mbox_cmd = (struct cxl_mbox_cmd) { + .opcode = CXL_MBOX_OP_SET_FEATURE, + .size_in = hdr_size + data_in_size, + .payload_in = pi, + }; + rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd); + if (rc < 0) { + if (return_code) + *return_code = mbox_cmd.return_code; + return rc; + } + + data_sent_size += data_in_size; + if (data_sent_size >= feat_data_size) { + if (return_code) + *return_code = CXL_MBOX_CMD_RC_SUCCESS; + return 0; + } + + if ((feat_data_size - data_sent_size) <= (cxl_mbox->payload_size - hdr_size)) { + data_in_size = feat_data_size - data_sent_size; + pi->flags = cpu_to_le32(feat_flag | + CXL_SET_FEAT_FLAG_FINISH_DATA_TRANSFER); + } else { + pi->flags = cpu_to_le32(feat_flag | + CXL_SET_FEAT_FLAG_CONTINUE_DATA_TRANSFER); + } + } while (true); +} + +/* FWCTL support */ + +static inline struct cxl_memdev *fwctl_to_memdev(struct fwctl_device *fwctl_dev) +{ + return to_cxl_memdev(fwctl_dev->dev.parent); +} + +static int cxlctl_open_uctx(struct fwctl_uctx *uctx) +{ + return 0; +} + +static void cxlctl_close_uctx(struct fwctl_uctx *uctx) +{ +} + +struct cxl_feat_entry * +cxl_feature_info(struct cxl_features_state *cxlfs, + const uuid_t *uuid) +{ + struct cxl_feat_entry *feat; + + if (!cxlfs || !cxlfs->entries) + return ERR_PTR(-EOPNOTSUPP); + + for (int i = 0; i < cxlfs->entries->num_features; i++) { + feat = &cxlfs->entries->ent[i]; + if (uuid_equal(uuid, &feat->uuid)) + return feat; + } + + return ERR_PTR(-EINVAL); +} + +static void *cxlctl_get_supported_features(struct cxl_features_state *cxlfs, + const struct fwctl_rpc_cxl *rpc_in, + size_t *out_len) +{ + const struct cxl_mbox_get_sup_feats_in *feat_in; + struct cxl_mbox_get_sup_feats_out *feat_out; + struct cxl_feat_entry *pos; + size_t out_size; + int requested; + u32 count; + u16 start; + int i; + + if (rpc_in->op_size != sizeof(*feat_in)) + return ERR_PTR(-EINVAL); + + feat_in = &rpc_in->get_sup_feats_in; + count = le32_to_cpu(feat_in->count); + 
start = le16_to_cpu(feat_in->start_idx); + requested = count / sizeof(*pos); + + /* + * Make sure that the total requested number of entries is not greater + * than the total number of supported features allowed for userspace. + */ + if (start >= cxlfs->entries->num_features) + return ERR_PTR(-EINVAL); + + requested = min_t(int, requested, cxlfs->entries->num_features - start); + + out_size = sizeof(struct fwctl_rpc_cxl_out) + + struct_size(feat_out, ents, requested); + + struct fwctl_rpc_cxl_out *rpc_out __free(kvfree) = + kvzalloc(out_size, GFP_KERNEL); + if (!rpc_out) + return ERR_PTR(-ENOMEM); + + rpc_out->size = struct_size(feat_out, ents, requested); + feat_out = &rpc_out->get_sup_feats_out; + + for (i = start, pos = &feat_out->ents[0]; + i < cxlfs->entries->num_features; i++, pos++) { + if (i - start == requested) + break; + + memcpy(pos, &cxlfs->entries->ent[i], sizeof(*pos)); + /* + * If the feature is exclusive, set the set_feat_size to 0 to + * indicate that the feature is not changeable. + */ + if (is_cxl_feature_exclusive(pos)) { + u32 flags; + + pos->set_feat_size = 0; + flags = le32_to_cpu(pos->flags); + flags &= ~CXL_FEATURE_F_CHANGEABLE; + pos->flags = cpu_to_le32(flags); + } + } + + feat_out->num_entries = cpu_to_le16(requested); + feat_out->supported_feats = cpu_to_le16(cxlfs->entries->num_features); + rpc_out->retval = CXL_MBOX_CMD_RC_SUCCESS; + *out_len = out_size; + + return no_free_ptr(rpc_out); +} + +static void *cxlctl_get_feature(struct cxl_features_state *cxlfs, + const struct fwctl_rpc_cxl *rpc_in, + size_t *out_len) +{ + struct cxl_mailbox *cxl_mbox = &cxlfs->cxlds->cxl_mbox; + const struct cxl_mbox_get_feat_in *feat_in; + u16 offset, count, return_code; + size_t out_size = *out_len; + + if (rpc_in->op_size != sizeof(*feat_in)) + return ERR_PTR(-EINVAL); + + feat_in = &rpc_in->get_feat_in; + offset = le16_to_cpu(feat_in->offset); + count = le16_to_cpu(feat_in->count); + + if (!count) + return ERR_PTR(-EINVAL); + + struct fwctl_rpc_cxl_out *rpc_out __free(kvfree) = + kvzalloc(out_size, GFP_KERNEL); + if (!rpc_out) + return ERR_PTR(-ENOMEM); + + out_size = cxl_get_feature(cxl_mbox, &feat_in->uuid, + feat_in->selection, rpc_out->payload, + count, offset, &return_code); + *out_len = sizeof(struct fwctl_rpc_cxl_out); + if (!out_size) { + rpc_out->size = 0; + rpc_out->retval = return_code; + return no_free_ptr(rpc_out); + } + + rpc_out->size = out_size; + rpc_out->retval = CXL_MBOX_CMD_RC_SUCCESS; + *out_len += out_size; + + return no_free_ptr(rpc_out); +} + +static void *cxlctl_set_feature(struct cxl_features_state *cxlfs, + const struct fwctl_rpc_cxl *rpc_in, + size_t *out_len) +{ + struct cxl_mailbox *cxl_mbox = &cxlfs->cxlds->cxl_mbox; + const struct cxl_mbox_set_feat_in *feat_in; + size_t out_size, data_size; + u16 offset, return_code; + u32 flags; + int rc; + + if (rpc_in->op_size <= sizeof(feat_in->hdr)) + return ERR_PTR(-EINVAL); + + feat_in = &rpc_in->set_feat_in; + + if (is_cxl_feature_exclusive_by_uuid(&feat_in->uuid)) + return ERR_PTR(-EPERM); + + offset = le16_to_cpu(feat_in->offset); + flags = le32_to_cpu(feat_in->flags); + out_size = *out_len; + + struct fwctl_rpc_cxl_out *rpc_out __free(kvfree) = + kvzalloc(out_size, GFP_KERNEL); + if (!rpc_out) + return ERR_PTR(-ENOMEM); + + rpc_out->size = 0; + + data_size = rpc_in->op_size - sizeof(feat_in->hdr); + rc = cxl_set_feature(cxl_mbox, &feat_in->uuid, + feat_in->version, feat_in->feat_data, + data_size, flags, offset, &return_code); + *out_len = sizeof(*rpc_out); + if (rc) { + rpc_out->retval = 
return_code; + return no_free_ptr(rpc_out); + } + + rpc_out->retval = CXL_MBOX_CMD_RC_SUCCESS; + + return no_free_ptr(rpc_out); +} + +static bool cxlctl_validate_set_features(struct cxl_features_state *cxlfs, + const struct fwctl_rpc_cxl *rpc_in, + enum fwctl_rpc_scope scope) +{ + u16 effects, imm_mask, reset_mask; + struct cxl_feat_entry *feat; + u32 flags; + + if (rpc_in->op_size < sizeof(uuid_t)) + return false; + + feat = cxl_feature_info(cxlfs, &rpc_in->set_feat_in.uuid); + if (IS_ERR(feat)) + return false; + + /* Ensure that the attribute is changeable */ + flags = le32_to_cpu(feat->flags); + if (!(flags & CXL_FEATURE_F_CHANGEABLE)) + return false; + + effects = le16_to_cpu(feat->effects); + + /* + * Reserved bits are set; reject since the effects are not + * comprehended by the driver. + */ + if (effects & CXL_CMD_EFFECTS_RESERVED) { + dev_warn_once(cxlfs->cxlds->dev, + "Reserved bits set in the Feature effects field!\n"); + return false; + } + + /* Currently no user background command support */ + if (effects & CXL_CMD_BACKGROUND) + return false; + + /* Effects cause immediate change, highest security scope is needed */ + imm_mask = CXL_CMD_CONFIG_CHANGE_IMMEDIATE | + CXL_CMD_DATA_CHANGE_IMMEDIATE | + CXL_CMD_POLICY_CHANGE_IMMEDIATE | + CXL_CMD_LOG_CHANGE_IMMEDIATE; + + reset_mask = CXL_CMD_CONFIG_CHANGE_COLD_RESET | + CXL_CMD_CONFIG_CHANGE_CONV_RESET | + CXL_CMD_CONFIG_CHANGE_CXL_RESET; + + /* If no immediate or reset effect is set, the hardware has a bug */ + if (!(effects & imm_mask) && !(effects & reset_mask)) + return false; + + /* + * If the Feature setting causes immediate configuration change + * then we need the full write permission policy. + */ + if (effects & imm_mask && scope >= FWCTL_RPC_DEBUG_WRITE_FULL) + return true; + + /* + * If the Feature setting only causes configuration change + * after a reset, then the lesser level of write permission + * policy is ok.
+ */ + if (!(effects & imm_mask) && scope >= FWCTL_RPC_DEBUG_WRITE) + return true; + + return false; +} + +static bool cxlctl_validate_hw_command(struct cxl_features_state *cxlfs, + const struct fwctl_rpc_cxl *rpc_in, + enum fwctl_rpc_scope scope, + u16 opcode) +{ + struct cxl_mailbox *cxl_mbox = &cxlfs->cxlds->cxl_mbox; + + switch (opcode) { + case CXL_MBOX_OP_GET_SUPPORTED_FEATURES: + case CXL_MBOX_OP_GET_FEATURE: + return cxl_mbox->feat_cap >= CXL_FEATURES_RO; + case CXL_MBOX_OP_SET_FEATURE: + if (cxl_mbox->feat_cap < CXL_FEATURES_RW) + return false; + return cxlctl_validate_set_features(cxlfs, rpc_in, scope); + default: + return false; + } +} + +static void *cxlctl_handle_commands(struct cxl_features_state *cxlfs, + const struct fwctl_rpc_cxl *rpc_in, + size_t *out_len, u16 opcode) +{ + switch (opcode) { + case CXL_MBOX_OP_GET_SUPPORTED_FEATURES: + return cxlctl_get_supported_features(cxlfs, rpc_in, out_len); + case CXL_MBOX_OP_GET_FEATURE: + return cxlctl_get_feature(cxlfs, rpc_in, out_len); + case CXL_MBOX_OP_SET_FEATURE: + return cxlctl_set_feature(cxlfs, rpc_in, out_len); + default: + return ERR_PTR(-EOPNOTSUPP); + } +} + +static void *cxlctl_fw_rpc(struct fwctl_uctx *uctx, enum fwctl_rpc_scope scope, + void *in, size_t in_len, size_t *out_len) +{ + struct fwctl_device *fwctl_dev = uctx->fwctl; + struct cxl_memdev *cxlmd = fwctl_to_memdev(fwctl_dev); + struct cxl_features_state *cxlfs = to_cxlfs(cxlmd->cxlds); + const struct fwctl_rpc_cxl *rpc_in = in; + u16 opcode = rpc_in->opcode; + + if (!cxlctl_validate_hw_command(cxlfs, rpc_in, scope, opcode)) + return ERR_PTR(-EINVAL); + + return cxlctl_handle_commands(cxlfs, rpc_in, out_len, opcode); +} + +static const struct fwctl_ops cxlctl_ops = { + .device_type = FWCTL_DEVICE_TYPE_CXL, + .uctx_size = sizeof(struct fwctl_uctx), + .open_uctx = cxlctl_open_uctx, + .close_uctx = cxlctl_close_uctx, + .fw_rpc = cxlctl_fw_rpc, +}; + +DEFINE_FREE(free_fwctl_dev, struct fwctl_device *, if (_T) fwctl_put(_T)) + +static void free_memdev_fwctl(void *_fwctl_dev) +{ + struct fwctl_device *fwctl_dev = _fwctl_dev; + + fwctl_unregister(fwctl_dev); + fwctl_put(fwctl_dev); +} + +int devm_cxl_setup_fwctl(struct device *host, struct cxl_memdev *cxlmd) +{ + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct cxl_features_state *cxlfs; + int rc; + + cxlfs = to_cxlfs(cxlds); + if (!cxlfs) + return -ENODEV; + + /* No need to setup FWCTL if there are no user allowed features found */ + if (!cxlfs->entries->num_user_features) + return -ENODEV; + + struct fwctl_device *fwctl_dev __free(free_fwctl_dev) = + _fwctl_alloc_device(&cxlmd->dev, &cxlctl_ops, sizeof(*fwctl_dev)); + if (!fwctl_dev) + return -ENOMEM; + + rc = fwctl_register(fwctl_dev); + if (rc) + return rc; + + return devm_add_action_or_reset(host, free_memdev_fwctl, + no_free_ptr(fwctl_dev)); +} +EXPORT_SYMBOL_NS_GPL(devm_cxl_setup_fwctl, "CXL"); + +MODULE_IMPORT_NS("FWCTL"); diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c new file mode 100644 index 000000000000..1c5d2022c87a --- /dev/null +++ b/drivers/cxl/core/hdm.c @@ -0,0 +1,1287 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2022 Intel Corporation. All rights reserved. */ +#include <linux/seq_file.h> +#include <linux/device.h> +#include <linux/delay.h> + +#include "cxlmem.h" +#include "core.h" + +/** + * DOC: cxl core hdm + * + * Compute Express Link Host Managed Device Memory, starting with the + * CXL 2.0 specification, is managed by an array of HDM Decoder register + * instances per CXL port and per CXL endpoint. 
Define common helpers + * for enumerating these registers and capabilities. + */ + +struct cxl_rwsem cxl_rwsem = { + .region = __RWSEM_INITIALIZER(cxl_rwsem.region), + .dpa = __RWSEM_INITIALIZER(cxl_rwsem.dpa), +}; + +static int add_hdm_decoder(struct cxl_port *port, struct cxl_decoder *cxld) +{ + int rc; + + rc = cxl_decoder_add_locked(cxld); + if (rc) { + put_device(&cxld->dev); + dev_err(&port->dev, "Failed to add decoder\n"); + return rc; + } + + rc = cxl_decoder_autoremove(&port->dev, cxld); + if (rc) + return rc; + + dev_dbg(port->uport_dev, "%s added to %s\n", + dev_name(&cxld->dev), dev_name(&port->dev)); + + return 0; +} + +/* + * Per the CXL specification (8.2.5.12 CXL HDM Decoder Capability Structure) + * single ported host-bridges need not publish a decoder capability when a + * passthrough decode can be assumed, i.e. all transactions that the uport sees + * are claimed and passed to the single dport. Disable the range until the first + * CXL region is enumerated / activated. + */ +static int devm_cxl_add_passthrough_decoder(struct cxl_port *port) +{ + struct cxl_switch_decoder *cxlsd; + struct cxl_hdm *cxlhdm = dev_get_drvdata(&port->dev); + + /* + * Capability checks are moot for passthrough decoders, support + * any and all possibilities. + */ + cxlhdm->interleave_mask = ~0U; + cxlhdm->iw_cap_mask = ~0UL; + + cxlsd = cxl_switch_decoder_alloc(port, 1); + if (IS_ERR(cxlsd)) + return PTR_ERR(cxlsd); + + device_lock_assert(&port->dev); + + return add_hdm_decoder(port, &cxlsd->cxld); +} + +static void parse_hdm_decoder_caps(struct cxl_hdm *cxlhdm) +{ + u32 hdm_cap; + + hdm_cap = readl(cxlhdm->regs.hdm_decoder + CXL_HDM_DECODER_CAP_OFFSET); + cxlhdm->decoder_count = cxl_hdm_decoder_count(hdm_cap); + cxlhdm->target_count = + FIELD_GET(CXL_HDM_DECODER_TARGET_COUNT_MASK, hdm_cap); + if (FIELD_GET(CXL_HDM_DECODER_INTERLEAVE_11_8, hdm_cap)) + cxlhdm->interleave_mask |= GENMASK(11, 8); + if (FIELD_GET(CXL_HDM_DECODER_INTERLEAVE_14_12, hdm_cap)) + cxlhdm->interleave_mask |= GENMASK(14, 12); + cxlhdm->iw_cap_mask = BIT(1) | BIT(2) | BIT(4) | BIT(8); + if (FIELD_GET(CXL_HDM_DECODER_INTERLEAVE_3_6_12_WAY, hdm_cap)) + cxlhdm->iw_cap_mask |= BIT(3) | BIT(6) | BIT(12); + if (FIELD_GET(CXL_HDM_DECODER_INTERLEAVE_16_WAY, hdm_cap)) + cxlhdm->iw_cap_mask |= BIT(16); +} + +static bool should_emulate_decoders(struct cxl_endpoint_dvsec_info *info) +{ + struct cxl_hdm *cxlhdm; + void __iomem *hdm; + u32 ctrl; + int i; + + if (!info) + return false; + + cxlhdm = dev_get_drvdata(&info->port->dev); + hdm = cxlhdm->regs.hdm_decoder; + + if (!hdm) + return true; + + /* + * If HDM decoders are present and the driver is in control of + * Mem_Enable skip DVSEC based emulation + */ + if (!info->mem_enabled) + return false; + + /* + * If any decoders are committed already, there should not be any + * emulated DVSEC decoders. 
+ */ + for (i = 0; i < cxlhdm->decoder_count; i++) { + ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(i)); + dev_dbg(&info->port->dev, + "decoder%d.%d: committed: %ld base: %#x_%.8x size: %#x_%.8x\n", + info->port->id, i, + FIELD_GET(CXL_HDM_DECODER0_CTRL_COMMITTED, ctrl), + readl(hdm + CXL_HDM_DECODER0_BASE_HIGH_OFFSET(i)), + readl(hdm + CXL_HDM_DECODER0_BASE_LOW_OFFSET(i)), + readl(hdm + CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(i)), + readl(hdm + CXL_HDM_DECODER0_SIZE_LOW_OFFSET(i))); + if (FIELD_GET(CXL_HDM_DECODER0_CTRL_COMMITTED, ctrl)) + return false; + } + + return true; +} + +/** + * devm_cxl_setup_hdm - map HDM decoder component registers + * @port: cxl_port to map + * @info: cached DVSEC range register info + */ +static struct cxl_hdm *devm_cxl_setup_hdm(struct cxl_port *port, + struct cxl_endpoint_dvsec_info *info) +{ + struct cxl_register_map *reg_map = &port->reg_map; + struct device *dev = &port->dev; + struct cxl_hdm *cxlhdm; + int rc; + + cxlhdm = devm_kzalloc(dev, sizeof(*cxlhdm), GFP_KERNEL); + if (!cxlhdm) + return ERR_PTR(-ENOMEM); + cxlhdm->port = port; + dev_set_drvdata(dev, cxlhdm); + + /* Memory devices can configure device HDM using DVSEC range regs. */ + if (reg_map->resource == CXL_RESOURCE_NONE) { + if (!info || !info->mem_enabled) { + dev_err(dev, "No component registers mapped\n"); + return ERR_PTR(-ENXIO); + } + + cxlhdm->decoder_count = info->ranges; + return cxlhdm; + } + + if (!reg_map->component_map.hdm_decoder.valid) { + dev_dbg(&port->dev, "HDM decoder registers not implemented\n"); + /* unique error code to indicate no HDM decoder capability */ + return ERR_PTR(-ENODEV); + } + + rc = cxl_map_component_regs(reg_map, &cxlhdm->regs, + BIT(CXL_CM_CAP_CAP_ID_HDM)); + if (rc) { + dev_err(dev, "Failed to map HDM capability.\n"); + return ERR_PTR(rc); + } + + parse_hdm_decoder_caps(cxlhdm); + if (cxlhdm->decoder_count == 0) { + dev_err(dev, "Spec violation. Caps invalid\n"); + return ERR_PTR(-ENXIO); + } + + /* + * Now that the hdm capability is parsed, decide if range + * register emulation is needed and fixup cxlhdm accordingly. 
+ */ + if (should_emulate_decoders(info)) { + dev_dbg(dev, "Fallback map %d range register%s\n", info->ranges, + str_plural(info->ranges)); + cxlhdm->decoder_count = info->ranges; + } + + return cxlhdm; +} + +static void __cxl_dpa_debug(struct seq_file *file, struct resource *r, int depth) +{ + unsigned long long start = r->start, end = r->end; + + seq_printf(file, "%*s%08llx-%08llx : %s\n", depth * 2, "", start, end, + r->name); +} + +void cxl_dpa_debug(struct seq_file *file, struct cxl_dev_state *cxlds) +{ + struct resource *p1, *p2; + + guard(rwsem_read)(&cxl_rwsem.dpa); + for (p1 = cxlds->dpa_res.child; p1; p1 = p1->sibling) { + __cxl_dpa_debug(file, p1, 0); + for (p2 = p1->child; p2; p2 = p2->sibling) + __cxl_dpa_debug(file, p2, 1); + } +} +EXPORT_SYMBOL_NS_GPL(cxl_dpa_debug, "CXL"); + +/* See request_skip() kernel-doc */ +static resource_size_t __adjust_skip(struct cxl_dev_state *cxlds, + const resource_size_t skip_base, + const resource_size_t skip_len, + const char *requester) +{ + const resource_size_t skip_end = skip_base + skip_len - 1; + + for (int i = 0; i < cxlds->nr_partitions; i++) { + const struct resource *part_res = &cxlds->part[i].res; + resource_size_t adjust_start, adjust_end, size; + + adjust_start = max(skip_base, part_res->start); + adjust_end = min(skip_end, part_res->end); + + if (adjust_end < adjust_start) + continue; + + size = adjust_end - adjust_start + 1; + + if (!requester) + __release_region(&cxlds->dpa_res, adjust_start, size); + else if (!__request_region(&cxlds->dpa_res, adjust_start, size, + requester, 0)) + return adjust_start - skip_base; + } + + return skip_len; +} +#define release_skip(c, b, l) __adjust_skip((c), (b), (l), NULL) + +/* + * Must be called in a context that synchronizes against this decoder's + * port ->remove() callback (like an endpoint decoder sysfs attribute) + */ +static void __cxl_dpa_release(struct cxl_endpoint_decoder *cxled) +{ + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_port *port = cxled_to_port(cxled); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct resource *res = cxled->dpa_res; + resource_size_t skip_start; + + lockdep_assert_held_write(&cxl_rwsem.dpa); + + /* save @skip_start, before @res is released */ + skip_start = res->start - cxled->skip; + __release_region(&cxlds->dpa_res, res->start, resource_size(res)); + if (cxled->skip) + release_skip(cxlds, skip_start, cxled->skip); + cxled->skip = 0; + cxled->dpa_res = NULL; + put_device(&cxled->cxld.dev); + port->hdm_end--; +} + +static void cxl_dpa_release(void *cxled) +{ + guard(rwsem_write)(&cxl_rwsem.dpa); + __cxl_dpa_release(cxled); +} + +/* + * Must be called from context that will not race port device + * unregistration, like decoder sysfs attribute methods + */ +static void devm_cxl_dpa_release(struct cxl_endpoint_decoder *cxled) +{ + struct cxl_port *port = cxled_to_port(cxled); + + lockdep_assert_held_write(&cxl_rwsem.dpa); + devm_remove_action(&port->dev, cxl_dpa_release, cxled); + __cxl_dpa_release(cxled); +} + +/** + * request_skip() - Track DPA 'skip' in @cxlds->dpa_res resource tree + * @cxlds: CXL.mem device context that parents @cxled + * @cxled: Endpoint decoder establishing new allocation that skips lower DPA + * @skip_base: DPA < start of new DPA allocation (DPAnew) + * @skip_len: @skip_base + @skip_len == DPAnew + * + * DPA 'skip' arises from out-of-sequence DPA allocation events relative + * to free capacity across multiple partitions. 
It is a wasteful event + * as usable DPA gets thrown away, but if a deployment has, for example, + * a dual RAM+PMEM device, wants to use PMEM, and has unallocated RAM + * DPA, the free RAM DPA must be sacrificed to start allocating PMEM. + * See third "Implementation Note" in CXL 3.1 8.2.4.19.13 "Decoder + * Protection" for more details. + * + * A 'skip' always covers the last allocated DPA in a previous partition + * to the start of the current partition to allocate. Allocations never + * start in the middle of a partition, and allocations are always + * de-allocated in reverse order (see cxl_dpa_free(), or natural devm + * unwind order from forced in-order allocation). + * + * If @cxlds->nr_partitions was guaranteed to be <= 2 then the 'skip' + * would always be contained to a single partition. Given + * @cxlds->nr_partitions may be > 2 it results in cases where the 'skip' + * might span "tail capacity of partition[0], all of partition[1], ..., + * all of partition[N-1]" to support allocating from partition[N]. That + * in turn interacts with the partition 'struct resource' boundaries + * within @cxlds->dpa_res whereby 'skip' requests need to be divided by + * partition. I.e. this is a quirk of using a 'struct resource' tree to + * detect range conflicts while also tracking partition boundaries in + * @cxlds->dpa_res. + */ +static int request_skip(struct cxl_dev_state *cxlds, + struct cxl_endpoint_decoder *cxled, + const resource_size_t skip_base, + const resource_size_t skip_len) +{ + resource_size_t skipped = __adjust_skip(cxlds, skip_base, skip_len, + dev_name(&cxled->cxld.dev)); + + if (skipped == skip_len) + return 0; + + dev_dbg(cxlds->dev, + "%s: failed to reserve skipped space (%pa %pa %pa)\n", + dev_name(&cxled->cxld.dev), &skip_base, &skip_len, &skipped); + + release_skip(cxlds, skip_base, skipped); + + return -EBUSY; +} + +static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled, + resource_size_t base, resource_size_t len, + resource_size_t skipped) +{ + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_port *port = cxled_to_port(cxled); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct device *dev = &port->dev; + struct resource *res; + int rc; + + lockdep_assert_held_write(&cxl_rwsem.dpa); + + if (!len) { + dev_warn(dev, "decoder%d.%d: empty reservation attempted\n", + port->id, cxled->cxld.id); + return -EINVAL; + } + + if (cxled->dpa_res) { + dev_dbg(dev, "decoder%d.%d: existing allocation %pr assigned\n", + port->id, cxled->cxld.id, cxled->dpa_res); + return -EBUSY; + } + + if (port->hdm_end + 1 != cxled->cxld.id) { + /* + * Assumes alloc and commit order is always in hardware instance + * order per expectations from 8.2.5.12.20 Committing Decoder + * Programming that enforce decoder[m] committed before + * decoder[m+1] commit start. + */ + dev_dbg(dev, "decoder%d.%d: expected decoder%d.%d\n", port->id, + cxled->cxld.id, port->id, port->hdm_end + 1); + return -EBUSY; + } + + if (skipped) { + rc = request_skip(cxlds, cxled, base - skipped, skipped); + if (rc) + return rc; + } + res = __request_region(&cxlds->dpa_res, base, len, + dev_name(&cxled->cxld.dev), 0); + if (!res) { + dev_dbg(dev, "decoder%d.%d: failed to reserve allocation\n", + port->id, cxled->cxld.id); + if (skipped) + release_skip(cxlds, base - skipped, skipped); + return -EBUSY; + } + cxled->dpa_res = res; + cxled->skip = skipped; + + /* + * When allocating new capacity, ->part is already set, when + * discovering decoder settings at initial enumeration, ->part + * is not set. 
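+ * In the enumeration case, derive ->part from whichever partition
+ * resource contains the just-reserved range, and warn if none does.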
+ */ + if (cxled->part < 0) + for (int i = 0; cxlds->nr_partitions; i++) + if (resource_contains(&cxlds->part[i].res, res)) { + cxled->part = i; + break; + } + + if (cxled->part < 0) + dev_warn(dev, "decoder%d.%d: %pr does not map any partition\n", + port->id, cxled->cxld.id, res); + + port->hdm_end++; + get_device(&cxled->cxld.dev); + return 0; +} + +static int add_dpa_res(struct device *dev, struct resource *parent, + struct resource *res, resource_size_t start, + resource_size_t size, const char *type) +{ + int rc; + + *res = (struct resource) { + .name = type, + .start = start, + .end = start + size - 1, + .flags = IORESOURCE_MEM, + }; + if (resource_size(res) == 0) { + dev_dbg(dev, "DPA(%s): no capacity\n", res->name); + return 0; + } + rc = request_resource(parent, res); + if (rc) { + dev_err(dev, "DPA(%s): failed to track %pr (%d)\n", res->name, + res, rc); + return rc; + } + + dev_dbg(dev, "DPA(%s): %pr\n", res->name, res); + + return 0; +} + +static const char *cxl_mode_name(enum cxl_partition_mode mode) +{ + switch (mode) { + case CXL_PARTMODE_RAM: + return "ram"; + case CXL_PARTMODE_PMEM: + return "pmem"; + default: + return ""; + }; +} + +/* if this fails the caller must destroy @cxlds, there is no recovery */ +int cxl_dpa_setup(struct cxl_dev_state *cxlds, const struct cxl_dpa_info *info) +{ + struct device *dev = cxlds->dev; + + guard(rwsem_write)(&cxl_rwsem.dpa); + + if (cxlds->nr_partitions) + return -EBUSY; + + if (!info->size || !info->nr_partitions) { + cxlds->dpa_res = DEFINE_RES_MEM(0, 0); + cxlds->nr_partitions = 0; + return 0; + } + + cxlds->dpa_res = DEFINE_RES_MEM(0, info->size); + + for (int i = 0; i < info->nr_partitions; i++) { + const struct cxl_dpa_part_info *part = &info->part[i]; + int rc; + + cxlds->part[i].perf.qos_class = CXL_QOS_CLASS_INVALID; + cxlds->part[i].mode = part->mode; + + /* Require ordered + contiguous partitions */ + if (i) { + const struct cxl_dpa_part_info *prev = &info->part[i - 1]; + + if (prev->range.end + 1 != part->range.start) + return -EINVAL; + } + rc = add_dpa_res(dev, &cxlds->dpa_res, &cxlds->part[i].res, + part->range.start, range_len(&part->range), + cxl_mode_name(part->mode)); + if (rc) + return rc; + cxlds->nr_partitions++; + } + + return 0; +} +EXPORT_SYMBOL_GPL(cxl_dpa_setup); + +int devm_cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled, + resource_size_t base, resource_size_t len, + resource_size_t skipped) +{ + struct cxl_port *port = cxled_to_port(cxled); + int rc; + + scoped_guard(rwsem_write, &cxl_rwsem.dpa) + rc = __cxl_dpa_reserve(cxled, base, len, skipped); + + if (rc) + return rc; + + return devm_add_action_or_reset(&port->dev, cxl_dpa_release, cxled); +} +EXPORT_SYMBOL_NS_GPL(devm_cxl_dpa_reserve, "CXL"); + +resource_size_t cxl_dpa_size(struct cxl_endpoint_decoder *cxled) +{ + guard(rwsem_read)(&cxl_rwsem.dpa); + if (cxled->dpa_res) + return resource_size(cxled->dpa_res); + + return 0; +} + +resource_size_t cxl_dpa_resource_start(struct cxl_endpoint_decoder *cxled) +{ + resource_size_t base = -1; + + lockdep_assert_held(&cxl_rwsem.dpa); + if (cxled->dpa_res) + base = cxled->dpa_res->start; + + return base; +} + +bool cxl_resource_contains_addr(const struct resource *res, const resource_size_t addr) +{ + struct resource _addr = DEFINE_RES_MEM(addr, 1); + + return resource_contains(res, &_addr); +} + +int cxl_dpa_free(struct cxl_endpoint_decoder *cxled) +{ + struct cxl_port *port = cxled_to_port(cxled); + struct device *dev = &cxled->cxld.dev; + + guard(rwsem_write)(&cxl_rwsem.dpa); + if (!cxled->dpa_res) + return 
0; + if (cxled->cxld.region) { + dev_dbg(dev, "decoder assigned to: %s\n", + dev_name(&cxled->cxld.region->dev)); + return -EBUSY; + } + if (cxled->cxld.flags & CXL_DECODER_F_ENABLE) { + dev_dbg(dev, "decoder enabled\n"); + return -EBUSY; + } + if (cxled->cxld.id != port->hdm_end) { + dev_dbg(dev, "expected decoder%d.%d\n", port->id, + port->hdm_end); + return -EBUSY; + } + + devm_cxl_dpa_release(cxled); + return 0; +} + +int cxl_dpa_set_part(struct cxl_endpoint_decoder *cxled, + enum cxl_partition_mode mode) +{ + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct device *dev = &cxled->cxld.dev; + int part; + + guard(rwsem_write)(&cxl_rwsem.dpa); + if (cxled->cxld.flags & CXL_DECODER_F_ENABLE) + return -EBUSY; + + for (part = 0; part < cxlds->nr_partitions; part++) + if (cxlds->part[part].mode == mode) + break; + + if (part >= cxlds->nr_partitions) { + dev_dbg(dev, "unsupported mode: %d\n", mode); + return -EINVAL; + } + + if (!resource_size(&cxlds->part[part].res)) { + dev_dbg(dev, "no available capacity for mode: %d\n", mode); + return -ENXIO; + } + + cxled->part = part; + return 0; +} + +static int __cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, u64 size) +{ + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct device *dev = &cxled->cxld.dev; + struct resource *res, *prev = NULL; + resource_size_t start, avail, skip, skip_start; + struct resource *p, *last; + int part; + + guard(rwsem_write)(&cxl_rwsem.dpa); + if (cxled->cxld.region) { + dev_dbg(dev, "decoder attached to %s\n", + dev_name(&cxled->cxld.region->dev)); + return -EBUSY; + } + + if (cxled->cxld.flags & CXL_DECODER_F_ENABLE) { + dev_dbg(dev, "decoder enabled\n"); + return -EBUSY; + } + + part = cxled->part; + if (part < 0) { + dev_dbg(dev, "partition not set\n"); + return -EBUSY; + } + + res = &cxlds->part[part].res; + for (p = res->child, last = NULL; p; p = p->sibling) + last = p; + if (last) + start = last->end + 1; + else + start = res->start; + + /* + * To allocate at partition N, a skip needs to be calculated for all + * unallocated space at lower partitions indices. + * + * If a partition has any allocations, the search can end because a + * previous cxl_dpa_alloc() invocation is assumed to have accounted for + * all previous partitions. + */ + skip_start = CXL_RESOURCE_NONE; + for (int i = part; i; i--) { + prev = &cxlds->part[i - 1].res; + for (p = prev->child, last = NULL; p; p = p->sibling) + last = p; + if (last) { + skip_start = last->end + 1; + break; + } + skip_start = prev->start; + } + + avail = res->end - start + 1; + if (skip_start == CXL_RESOURCE_NONE) + skip = 0; + else + skip = res->start - skip_start; + + if (size > avail) { + dev_dbg(dev, "%llu exceeds available %s capacity: %llu\n", size, + res->name, (u64)avail); + return -ENOSPC; + } + + return __cxl_dpa_reserve(cxled, start, size, skip); +} + +int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, u64 size) +{ + struct cxl_port *port = cxled_to_port(cxled); + int rc; + + rc = __cxl_dpa_alloc(cxled, size); + if (rc) + return rc; + + return devm_add_action_or_reset(&port->dev, cxl_dpa_release, cxled); +} + +static void cxld_set_interleave(struct cxl_decoder *cxld, u32 *ctrl) +{ + u16 eig; + u8 eiw; + + /* + * Input validation ensures these warns never fire, but otherwise + * suppress unititalized variable usage warnings. 
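+ * ways_to_eiw() and granularity_to_eig() convert the Linux-native
+ * interleave values into the encoded IW/IG fields written into the
+ * decoder control register.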
+ */ + if (WARN_ONCE(ways_to_eiw(cxld->interleave_ways, &eiw), + "invalid interleave_ways: %d\n", cxld->interleave_ways)) + return; + if (WARN_ONCE(granularity_to_eig(cxld->interleave_granularity, &eig), + "invalid interleave_granularity: %d\n", + cxld->interleave_granularity)) + return; + + u32p_replace_bits(ctrl, eig, CXL_HDM_DECODER0_CTRL_IG_MASK); + u32p_replace_bits(ctrl, eiw, CXL_HDM_DECODER0_CTRL_IW_MASK); + *ctrl |= CXL_HDM_DECODER0_CTRL_COMMIT; +} + +static void cxld_set_type(struct cxl_decoder *cxld, u32 *ctrl) +{ + u32p_replace_bits(ctrl, + !!(cxld->target_type == CXL_DECODER_HOSTONLYMEM), + CXL_HDM_DECODER0_CTRL_HOSTONLY); +} + +static void cxlsd_set_targets(struct cxl_switch_decoder *cxlsd, u64 *tgt) +{ + struct cxl_dport **t = &cxlsd->target[0]; + int ways = cxlsd->cxld.interleave_ways; + + *tgt = FIELD_PREP(GENMASK(7, 0), t[0]->port_id); + if (ways > 1) + *tgt |= FIELD_PREP(GENMASK(15, 8), t[1]->port_id); + if (ways > 2) + *tgt |= FIELD_PREP(GENMASK(23, 16), t[2]->port_id); + if (ways > 3) + *tgt |= FIELD_PREP(GENMASK(31, 24), t[3]->port_id); + if (ways > 4) + *tgt |= FIELD_PREP(GENMASK_ULL(39, 32), t[4]->port_id); + if (ways > 5) + *tgt |= FIELD_PREP(GENMASK_ULL(47, 40), t[5]->port_id); + if (ways > 6) + *tgt |= FIELD_PREP(GENMASK_ULL(55, 48), t[6]->port_id); + if (ways > 7) + *tgt |= FIELD_PREP(GENMASK_ULL(63, 56), t[7]->port_id); +} + +/* + * Per CXL 2.0 8.2.5.12.20 Committing Decoder Programming, hardware must set + * committed or error within 10ms, but just be generous with 20ms to account for + * clock skew and other marginal behavior + */ +#define COMMIT_TIMEOUT_MS 20 +static int cxld_await_commit(void __iomem *hdm, int id) +{ + u32 ctrl; + int i; + + for (i = 0; i < COMMIT_TIMEOUT_MS; i++) { + ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(id)); + if (FIELD_GET(CXL_HDM_DECODER0_CTRL_COMMIT_ERROR, ctrl)) { + ctrl &= ~CXL_HDM_DECODER0_CTRL_COMMIT; + writel(ctrl, hdm + CXL_HDM_DECODER0_CTRL_OFFSET(id)); + return -EIO; + } + if (FIELD_GET(CXL_HDM_DECODER0_CTRL_COMMITTED, ctrl)) + return 0; + fsleep(1000); + } + + return -ETIMEDOUT; +} + +static void setup_hw_decoder(struct cxl_decoder *cxld, void __iomem *hdm) +{ + int id = cxld->id; + u64 base, size; + u32 ctrl; + + /* common decoder settings */ + ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(cxld->id)); + cxld_set_interleave(cxld, &ctrl); + cxld_set_type(cxld, &ctrl); + base = cxld->hpa_range.start; + size = range_len(&cxld->hpa_range); + + writel(upper_32_bits(base), hdm + CXL_HDM_DECODER0_BASE_HIGH_OFFSET(id)); + writel(lower_32_bits(base), hdm + CXL_HDM_DECODER0_BASE_LOW_OFFSET(id)); + writel(upper_32_bits(size), hdm + CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(id)); + writel(lower_32_bits(size), hdm + CXL_HDM_DECODER0_SIZE_LOW_OFFSET(id)); + + if (is_switch_decoder(&cxld->dev)) { + struct cxl_switch_decoder *cxlsd = + to_cxl_switch_decoder(&cxld->dev); + void __iomem *tl_hi = hdm + CXL_HDM_DECODER0_TL_HIGH(id); + void __iomem *tl_lo = hdm + CXL_HDM_DECODER0_TL_LOW(id); + u64 targets; + + cxlsd_set_targets(cxlsd, &targets); + writel(upper_32_bits(targets), tl_hi); + writel(lower_32_bits(targets), tl_lo); + } else { + struct cxl_endpoint_decoder *cxled = + to_cxl_endpoint_decoder(&cxld->dev); + void __iomem *sk_hi = hdm + CXL_HDM_DECODER0_SKIP_HIGH(id); + void __iomem *sk_lo = hdm + CXL_HDM_DECODER0_SKIP_LOW(id); + + writel(upper_32_bits(cxled->skip), sk_hi); + writel(lower_32_bits(cxled->skip), sk_lo); + } + + writel(ctrl, hdm + CXL_HDM_DECODER0_CTRL_OFFSET(id)); +} + +static int cxl_decoder_commit(struct cxl_decoder 
*cxld) +{ + struct cxl_port *port = to_cxl_port(cxld->dev.parent); + struct cxl_hdm *cxlhdm = dev_get_drvdata(&port->dev); + void __iomem *hdm = cxlhdm->regs.hdm_decoder; + int id = cxld->id, rc; + + if (cxld->flags & CXL_DECODER_F_ENABLE) + return 0; + + if (cxl_num_decoders_committed(port) != id) { + dev_dbg(&port->dev, + "%s: out of order commit, expected decoder%d.%d\n", + dev_name(&cxld->dev), port->id, + cxl_num_decoders_committed(port)); + return -EBUSY; + } + + /* + * For endpoint decoders hosted on CXL memory devices that + * support the sanitize operation, make sure sanitize is not in-flight. + */ + if (is_endpoint_decoder(&cxld->dev)) { + struct cxl_endpoint_decoder *cxled = + to_cxl_endpoint_decoder(&cxld->dev); + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_memdev_state *mds = + to_cxl_memdev_state(cxlmd->cxlds); + + if (mds && mds->security.sanitize_active) { + dev_dbg(&cxlmd->dev, + "attempted to commit %s during sanitize\n", + dev_name(&cxld->dev)); + return -EBUSY; + } + } + + scoped_guard(rwsem_read, &cxl_rwsem.dpa) + setup_hw_decoder(cxld, hdm); + + port->commit_end++; + rc = cxld_await_commit(hdm, cxld->id); + if (rc) { + dev_dbg(&port->dev, "%s: error %d committing decoder\n", + dev_name(&cxld->dev), rc); + cxld->reset(cxld); + return rc; + } + cxld->flags |= CXL_DECODER_F_ENABLE; + + return 0; +} + +static int commit_reap(struct device *dev, void *data) +{ + struct cxl_port *port = to_cxl_port(dev->parent); + struct cxl_decoder *cxld; + + if (!is_switch_decoder(dev) && !is_endpoint_decoder(dev)) + return 0; + + cxld = to_cxl_decoder(dev); + if (port->commit_end == cxld->id && + ((cxld->flags & CXL_DECODER_F_ENABLE) == 0)) { + port->commit_end--; + dev_dbg(&port->dev, "reap: %s commit_end: %d\n", + dev_name(&cxld->dev), port->commit_end); + } + + return 0; +} + +void cxl_port_commit_reap(struct cxl_decoder *cxld) +{ + struct cxl_port *port = to_cxl_port(cxld->dev.parent); + + lockdep_assert_held_write(&cxl_rwsem.region); + + /* + * Once the highest committed decoder is disabled, free any other + * decoders that were pinned allocated by out-of-order release. 
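+ * After dropping commit_end for @cxld, walk the remaining child
+ * decoders in reverse and keep retiring commit_end while the trailing
+ * decoders are also disabled.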
+ */ + port->commit_end--; + dev_dbg(&port->dev, "reap: %s commit_end: %d\n", dev_name(&cxld->dev), + port->commit_end); + device_for_each_child_reverse_from(&port->dev, &cxld->dev, NULL, + commit_reap); +} +EXPORT_SYMBOL_NS_GPL(cxl_port_commit_reap, "CXL"); + +static void cxl_decoder_reset(struct cxl_decoder *cxld) +{ + struct cxl_port *port = to_cxl_port(cxld->dev.parent); + struct cxl_hdm *cxlhdm = dev_get_drvdata(&port->dev); + void __iomem *hdm = cxlhdm->regs.hdm_decoder; + int id = cxld->id; + u32 ctrl; + + if ((cxld->flags & CXL_DECODER_F_ENABLE) == 0) + return; + + if (test_bit(CXL_DECODER_F_LOCK, &cxld->flags)) + return; + + if (port->commit_end == id) + cxl_port_commit_reap(cxld); + else + dev_dbg(&port->dev, + "%s: out of order reset, expected decoder%d.%d\n", + dev_name(&cxld->dev), port->id, port->commit_end); + + ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(id)); + ctrl &= ~CXL_HDM_DECODER0_CTRL_COMMIT; + writel(ctrl, hdm + CXL_HDM_DECODER0_CTRL_OFFSET(id)); + + writel(0, hdm + CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(id)); + writel(0, hdm + CXL_HDM_DECODER0_SIZE_LOW_OFFSET(id)); + writel(0, hdm + CXL_HDM_DECODER0_BASE_HIGH_OFFSET(id)); + writel(0, hdm + CXL_HDM_DECODER0_BASE_LOW_OFFSET(id)); + + cxld->flags &= ~CXL_DECODER_F_ENABLE; + + /* Userspace is now responsible for reconfiguring this decoder */ + if (is_endpoint_decoder(&cxld->dev)) { + struct cxl_endpoint_decoder *cxled; + + cxled = to_cxl_endpoint_decoder(&cxld->dev); + cxled->state = CXL_DECODER_STATE_MANUAL; + } +} + +static int cxl_setup_hdm_decoder_from_dvsec( + struct cxl_port *port, struct cxl_decoder *cxld, u64 *dpa_base, + int which, struct cxl_endpoint_dvsec_info *info) +{ + struct cxl_endpoint_decoder *cxled; + u64 len; + int rc; + + if (!is_cxl_endpoint(port)) + return -EOPNOTSUPP; + + cxled = to_cxl_endpoint_decoder(&cxld->dev); + len = range_len(&info->dvsec_range[which]); + if (!len) + return -ENOENT; + + cxld->target_type = CXL_DECODER_HOSTONLYMEM; + cxld->commit = NULL; + cxld->reset = NULL; + cxld->hpa_range = info->dvsec_range[which]; + + /* + * Set the emulated decoder as locked pending additional support to + * change the range registers at run time. 
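+ * The decoder is also marked enabled and the port's commit_end cursor
+ * is advanced so the DVSEC-established range counts as committed
+ * capacity.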
+ */ + cxld->flags |= CXL_DECODER_F_ENABLE | CXL_DECODER_F_LOCK; + port->commit_end = cxld->id; + + rc = devm_cxl_dpa_reserve(cxled, *dpa_base, len, 0); + if (rc) { + dev_err(&port->dev, + "decoder%d.%d: Failed to reserve DPA range %#llx - %#llx\n (%d)", + port->id, cxld->id, *dpa_base, *dpa_base + len - 1, rc); + return rc; + } + *dpa_base += len; + cxled->state = CXL_DECODER_STATE_AUTO; + + return 0; +} + +static int init_hdm_decoder(struct cxl_port *port, struct cxl_decoder *cxld, + void __iomem *hdm, int which, + u64 *dpa_base, struct cxl_endpoint_dvsec_info *info) +{ + struct cxl_endpoint_decoder *cxled = NULL; + u64 size, base, skip, dpa_size, lo, hi; + bool committed; + u32 remainder; + int i, rc; + u32 ctrl; + union { + u64 value; + unsigned char target_id[8]; + } target_list; + + if (should_emulate_decoders(info)) + return cxl_setup_hdm_decoder_from_dvsec(port, cxld, dpa_base, + which, info); + + ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(which)); + lo = readl(hdm + CXL_HDM_DECODER0_BASE_LOW_OFFSET(which)); + hi = readl(hdm + CXL_HDM_DECODER0_BASE_HIGH_OFFSET(which)); + base = (hi << 32) + lo; + lo = readl(hdm + CXL_HDM_DECODER0_SIZE_LOW_OFFSET(which)); + hi = readl(hdm + CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(which)); + size = (hi << 32) + lo; + committed = !!(ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED); + cxld->commit = cxl_decoder_commit; + cxld->reset = cxl_decoder_reset; + + if (!committed) + size = 0; + if (base == U64_MAX || size == U64_MAX) { + dev_warn(&port->dev, "decoder%d.%d: Invalid resource range\n", + port->id, cxld->id); + return -ENXIO; + } + + if (info) + cxled = to_cxl_endpoint_decoder(&cxld->dev); + cxld->hpa_range = (struct range) { + .start = base, + .end = base + size - 1, + }; + + /* decoders are enabled if committed */ + if (committed) { + cxld->flags |= CXL_DECODER_F_ENABLE; + if (ctrl & CXL_HDM_DECODER0_CTRL_LOCK) + cxld->flags |= CXL_DECODER_F_LOCK; + if (FIELD_GET(CXL_HDM_DECODER0_CTRL_HOSTONLY, ctrl)) + cxld->target_type = CXL_DECODER_HOSTONLYMEM; + else + cxld->target_type = CXL_DECODER_DEVMEM; + + guard(rwsem_write)(&cxl_rwsem.region); + if (cxld->id != cxl_num_decoders_committed(port)) { + dev_warn(&port->dev, + "decoder%d.%d: Committed out of order\n", + port->id, cxld->id); + return -ENXIO; + } + + if (size == 0) { + dev_warn(&port->dev, + "decoder%d.%d: Committed with zero size\n", + port->id, cxld->id); + return -ENXIO; + } + port->commit_end = cxld->id; + } else { + if (cxled) { + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + + /* + * Default by devtype until a device arrives that needs + * more precision. 
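+ * CXL_DEVTYPE_CLASSMEM devices default to the host-only target type;
+ * any other device type defaults to device memory.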
+ */ + if (cxlds->type == CXL_DEVTYPE_CLASSMEM) + cxld->target_type = CXL_DECODER_HOSTONLYMEM; + else + cxld->target_type = CXL_DECODER_DEVMEM; + } else { + /* To be overridden by region type at commit time */ + cxld->target_type = CXL_DECODER_HOSTONLYMEM; + } + + if (!FIELD_GET(CXL_HDM_DECODER0_CTRL_HOSTONLY, ctrl) && + cxld->target_type == CXL_DECODER_HOSTONLYMEM) { + ctrl |= CXL_HDM_DECODER0_CTRL_HOSTONLY; + writel(ctrl, hdm + CXL_HDM_DECODER0_CTRL_OFFSET(which)); + } + } + rc = eiw_to_ways(FIELD_GET(CXL_HDM_DECODER0_CTRL_IW_MASK, ctrl), + &cxld->interleave_ways); + if (rc) { + dev_warn(&port->dev, + "decoder%d.%d: Invalid interleave ways (ctrl: %#x)\n", + port->id, cxld->id, ctrl); + return rc; + } + rc = eig_to_granularity(FIELD_GET(CXL_HDM_DECODER0_CTRL_IG_MASK, ctrl), + &cxld->interleave_granularity); + if (rc) { + dev_warn(&port->dev, + "decoder%d.%d: Invalid interleave granularity (ctrl: %#x)\n", + port->id, cxld->id, ctrl); + return rc; + } + + dev_dbg(&port->dev, "decoder%d.%d: range: %#llx-%#llx iw: %d ig: %d\n", + port->id, cxld->id, cxld->hpa_range.start, cxld->hpa_range.end, + cxld->interleave_ways, cxld->interleave_granularity); + + if (!cxled) { + lo = readl(hdm + CXL_HDM_DECODER0_TL_LOW(which)); + hi = readl(hdm + CXL_HDM_DECODER0_TL_HIGH(which)); + target_list.value = (hi << 32) + lo; + for (i = 0; i < cxld->interleave_ways; i++) + cxld->target_map[i] = target_list.target_id[i]; + + return 0; + } + + if (!committed) + return 0; + + dpa_size = div_u64_rem(size, cxld->interleave_ways, &remainder); + if (remainder) { + dev_err(&port->dev, + "decoder%d.%d: invalid committed configuration size: %#llx ways: %d\n", + port->id, cxld->id, size, cxld->interleave_ways); + return -ENXIO; + } + lo = readl(hdm + CXL_HDM_DECODER0_SKIP_LOW(which)); + hi = readl(hdm + CXL_HDM_DECODER0_SKIP_HIGH(which)); + skip = (hi << 32) + lo; + rc = devm_cxl_dpa_reserve(cxled, *dpa_base + skip, dpa_size, skip); + if (rc) { + dev_err(&port->dev, + "decoder%d.%d: Failed to reserve DPA range %#llx - %#llx\n (%d)", + port->id, cxld->id, *dpa_base, + *dpa_base + dpa_size + skip - 1, rc); + return rc; + } + *dpa_base += dpa_size + skip; + + cxled->state = CXL_DECODER_STATE_AUTO; + + return 0; +} + +static void cxl_settle_decoders(struct cxl_hdm *cxlhdm) +{ + void __iomem *hdm = cxlhdm->regs.hdm_decoder; + int committed, i; + u32 ctrl; + + if (!hdm) + return; + + /* + * Since the register resource was recently claimed via request_region() + * be careful about trusting the "not-committed" status until the commit + * timeout has elapsed. The commit timeout is 10ms (CXL 2.0 + * 8.2.5.12.20), but double it to be tolerant of any clock skew between + * host and target. 
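+ * If every decoder already reads back as committed there is nothing
+ * pending and the settle delay can be skipped.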
+ */ + for (i = 0, committed = 0; i < cxlhdm->decoder_count; i++) { + ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(i)); + if (ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED) + committed++; + } + + /* ensure that future checks of committed can be trusted */ + if (committed != cxlhdm->decoder_count) + msleep(20); +} + +/** + * devm_cxl_enumerate_decoders - add decoder objects per HDM register set + * @cxlhdm: Structure to populate with HDM capabilities + * @info: cached DVSEC range register info + */ +static int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm, + struct cxl_endpoint_dvsec_info *info) +{ + void __iomem *hdm = cxlhdm->regs.hdm_decoder; + struct cxl_port *port = cxlhdm->port; + int i; + u64 dpa_base = 0; + + cxl_settle_decoders(cxlhdm); + + for (i = 0; i < cxlhdm->decoder_count; i++) { + int rc, target_count = cxlhdm->target_count; + struct cxl_decoder *cxld; + + if (is_cxl_endpoint(port)) { + struct cxl_endpoint_decoder *cxled; + + cxled = cxl_endpoint_decoder_alloc(port); + if (IS_ERR(cxled)) { + dev_warn(&port->dev, + "Failed to allocate decoder%d.%d\n", + port->id, i); + return PTR_ERR(cxled); + } + cxld = &cxled->cxld; + } else { + struct cxl_switch_decoder *cxlsd; + + cxlsd = cxl_switch_decoder_alloc(port, target_count); + if (IS_ERR(cxlsd)) { + dev_warn(&port->dev, + "Failed to allocate decoder%d.%d\n", + port->id, i); + return PTR_ERR(cxlsd); + } + cxld = &cxlsd->cxld; + } + + rc = init_hdm_decoder(port, cxld, hdm, i, &dpa_base, info); + if (rc) { + dev_warn(&port->dev, + "Failed to initialize decoder%d.%d\n", + port->id, i); + put_device(&cxld->dev); + return rc; + } + rc = add_hdm_decoder(port, cxld); + if (rc) { + dev_warn(&port->dev, + "Failed to add decoder%d.%d\n", port->id, i); + return rc; + } + } + + return 0; +} + +/** + * __devm_cxl_switch_port_decoders_setup - allocate and setup switch decoders + * @port: CXL port context + * + * Return 0 or -errno on error + */ +int __devm_cxl_switch_port_decoders_setup(struct cxl_port *port) +{ + struct cxl_hdm *cxlhdm; + + if (is_cxl_root(port) || is_cxl_endpoint(port)) + return -EOPNOTSUPP; + + cxlhdm = devm_cxl_setup_hdm(port, NULL); + if (!IS_ERR(cxlhdm)) + return devm_cxl_enumerate_decoders(cxlhdm, NULL); + + if (PTR_ERR(cxlhdm) != -ENODEV) { + dev_err(&port->dev, "Failed to map HDM decoder capability\n"); + return PTR_ERR(cxlhdm); + } + + if (cxl_port_get_possible_dports(port) == 1) { + dev_dbg(&port->dev, "Fallback to passthrough decoder\n"); + return devm_cxl_add_passthrough_decoder(port); + } + + dev_err(&port->dev, "HDM decoder capability not found\n"); + return -ENXIO; +} +EXPORT_SYMBOL_NS_GPL(__devm_cxl_switch_port_decoders_setup, "CXL"); + +/** + * devm_cxl_endpoint_decoders_setup - allocate and setup endpoint decoders + * @port: CXL port context + * + * Return 0 or -errno on error + */ +int devm_cxl_endpoint_decoders_setup(struct cxl_port *port) +{ + struct cxl_memdev *cxlmd = to_cxl_memdev(port->uport_dev); + struct cxl_endpoint_dvsec_info info = { .port = port }; + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct cxl_hdm *cxlhdm; + int rc; + + if (!is_cxl_endpoint(port)) + return -EOPNOTSUPP; + + rc = cxl_dvsec_rr_decode(cxlds, &info); + if (rc < 0) + return rc; + + cxlhdm = devm_cxl_setup_hdm(port, &info); + if (IS_ERR(cxlhdm)) { + if (PTR_ERR(cxlhdm) == -ENODEV) + dev_err(&port->dev, "HDM decoder registers not found\n"); + return PTR_ERR(cxlhdm); + } + + rc = cxl_hdm_decode_init(cxlds, cxlhdm, &info); + if (rc) + return rc; + + return devm_cxl_enumerate_decoders(cxlhdm, &info); +} 
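+/*
+ * Minimal usage sketch ('port' and 'rc' below are illustrative locals,
+ * not part of this file): an endpoint port driver whose uport_dev is
+ * the cxl_memdev would call
+ *
+ *	rc = devm_cxl_endpoint_decoders_setup(port);
+ *	if (rc)
+ *		return rc;
+ *
+ * while switch ports take the __devm_cxl_switch_port_decoders_setup()
+ * path above.
+ */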
+EXPORT_SYMBOL_NS_GPL(devm_cxl_endpoint_decoders_setup, "CXL"); diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c new file mode 100644 index 000000000000..fa6dd0c94656 --- /dev/null +++ b/drivers/cxl/core/mbox.c @@ -0,0 +1,1552 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2020 Intel Corporation. All rights reserved. */ +#include <linux/security.h> +#include <linux/debugfs.h> +#include <linux/ktime.h> +#include <linux/mutex.h> +#include <linux/unaligned.h> +#include <cxlpci.h> +#include <cxlmem.h> +#include <cxl.h> + +#include "core.h" +#include "trace.h" +#include "mce.h" + +static bool cxl_raw_allow_all; + +/** + * DOC: cxl mbox + * + * Core implementation of the CXL 2.0 Type-3 Memory Device Mailbox. The + * implementation is used by the cxl_pci driver to initialize the device + * and implement the cxl_mem.h IOCTL UAPI. It also implements the + * backend of the cxl_pmem_ctl() transport for LIBNVDIMM. + */ + +#define cxl_for_each_cmd(cmd) \ + for ((cmd) = &cxl_mem_commands[0]; \ + ((cmd) - cxl_mem_commands) < ARRAY_SIZE(cxl_mem_commands); (cmd)++) + +#define CXL_CMD(_id, sin, sout, _flags) \ + [CXL_MEM_COMMAND_ID_##_id] = { \ + .info = { \ + .id = CXL_MEM_COMMAND_ID_##_id, \ + .size_in = sin, \ + .size_out = sout, \ + }, \ + .opcode = CXL_MBOX_OP_##_id, \ + .flags = _flags, \ + } + +#define CXL_VARIABLE_PAYLOAD ~0U +/* + * This table defines the supported mailbox commands for the driver. This table + * is made up of a UAPI structure. Non-negative values as parameters in the + * table will be validated against the user's input. For example, if size_in is + * 0, and the user passed in 1, it is an error. + */ +static struct cxl_mem_command cxl_mem_commands[CXL_MEM_COMMAND_ID_MAX] = { + CXL_CMD(IDENTIFY, 0, 0x43, CXL_CMD_FLAG_FORCE_ENABLE), +#ifdef CONFIG_CXL_MEM_RAW_COMMANDS + CXL_CMD(RAW, CXL_VARIABLE_PAYLOAD, CXL_VARIABLE_PAYLOAD, 0), +#endif + CXL_CMD(GET_SUPPORTED_LOGS, 0, CXL_VARIABLE_PAYLOAD, CXL_CMD_FLAG_FORCE_ENABLE), + CXL_CMD(GET_FW_INFO, 0, 0x50, 0), + CXL_CMD(GET_PARTITION_INFO, 0, 0x20, 0), + CXL_CMD(GET_LSA, 0x8, CXL_VARIABLE_PAYLOAD, 0), + CXL_CMD(GET_HEALTH_INFO, 0, 0x12, 0), + CXL_CMD(GET_LOG, 0x18, CXL_VARIABLE_PAYLOAD, CXL_CMD_FLAG_FORCE_ENABLE), + CXL_CMD(GET_LOG_CAPS, 0x10, 0x4, 0), + CXL_CMD(CLEAR_LOG, 0x10, 0, 0), + CXL_CMD(GET_SUP_LOG_SUBLIST, 0x2, CXL_VARIABLE_PAYLOAD, 0), + CXL_CMD(SET_PARTITION_INFO, 0x0a, 0, 0), + CXL_CMD(SET_LSA, CXL_VARIABLE_PAYLOAD, 0, 0), + CXL_CMD(GET_ALERT_CONFIG, 0, 0x10, 0), + CXL_CMD(SET_ALERT_CONFIG, 0xc, 0, 0), + CXL_CMD(GET_SHUTDOWN_STATE, 0, 0x1, 0), + CXL_CMD(SET_SHUTDOWN_STATE, 0x1, 0, 0), + CXL_CMD(GET_SCAN_MEDIA_CAPS, 0x10, 0x4, 0), + CXL_CMD(GET_TIMESTAMP, 0, 0x8, 0), +}; + +/* + * Commands that RAW doesn't permit. The rationale for each: + * + * CXL_MBOX_OP_ACTIVATE_FW: Firmware activation requires adjustment / + * coordination of transaction timeout values at the root bridge level. + * + * CXL_MBOX_OP_SET_PARTITION_INFO: The device memory map may change live + * and needs to be coordinated with HDM updates. + * + * CXL_MBOX_OP_SET_LSA: The label storage area may be cached by the + * driver and any writes from userspace invalidates those contents. + * + * CXL_MBOX_OP_SET_SHUTDOWN_STATE: Set shutdown state assumes no writes + * to the device after it is marked clean, userspace can not make that + * assertion. + * + * CXL_MBOX_OP_[GET_]SCAN_MEDIA: The kernel provides a native error list that + * is kept up to date with patrol notifications and error management. 
+ * + * CXL_MBOX_OP_[GET_,INJECT_,CLEAR_]POISON: These commands require kernel + * driver orchestration for safety. + */ +static u16 cxl_disabled_raw_commands[] = { + CXL_MBOX_OP_ACTIVATE_FW, + CXL_MBOX_OP_SET_PARTITION_INFO, + CXL_MBOX_OP_SET_LSA, + CXL_MBOX_OP_SET_SHUTDOWN_STATE, + CXL_MBOX_OP_SCAN_MEDIA, + CXL_MBOX_OP_GET_SCAN_MEDIA, + CXL_MBOX_OP_GET_POISON, + CXL_MBOX_OP_INJECT_POISON, + CXL_MBOX_OP_CLEAR_POISON, +}; + +/* + * Command sets that RAW doesn't permit. All opcodes in this set are + * disabled because they pass plain text security payloads over the + * user/kernel boundary. This functionality is intended to be wrapped + * behind the keys ABI which allows for encrypted payloads in the UAPI + */ +static u8 security_command_sets[] = { + 0x44, /* Sanitize */ + 0x45, /* Persistent Memory Data-at-rest Security */ + 0x46, /* Security Passthrough */ +}; + +static bool cxl_is_security_command(u16 opcode) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(security_command_sets); i++) + if (security_command_sets[i] == (opcode >> 8)) + return true; + return false; +} + +static void cxl_set_security_cmd_enabled(struct cxl_security_state *security, + u16 opcode) +{ + switch (opcode) { + case CXL_MBOX_OP_SANITIZE: + set_bit(CXL_SEC_ENABLED_SANITIZE, security->enabled_cmds); + break; + case CXL_MBOX_OP_SECURE_ERASE: + set_bit(CXL_SEC_ENABLED_SECURE_ERASE, + security->enabled_cmds); + break; + case CXL_MBOX_OP_GET_SECURITY_STATE: + set_bit(CXL_SEC_ENABLED_GET_SECURITY_STATE, + security->enabled_cmds); + break; + case CXL_MBOX_OP_SET_PASSPHRASE: + set_bit(CXL_SEC_ENABLED_SET_PASSPHRASE, + security->enabled_cmds); + break; + case CXL_MBOX_OP_DISABLE_PASSPHRASE: + set_bit(CXL_SEC_ENABLED_DISABLE_PASSPHRASE, + security->enabled_cmds); + break; + case CXL_MBOX_OP_UNLOCK: + set_bit(CXL_SEC_ENABLED_UNLOCK, security->enabled_cmds); + break; + case CXL_MBOX_OP_FREEZE_SECURITY: + set_bit(CXL_SEC_ENABLED_FREEZE_SECURITY, + security->enabled_cmds); + break; + case CXL_MBOX_OP_PASSPHRASE_SECURE_ERASE: + set_bit(CXL_SEC_ENABLED_PASSPHRASE_SECURE_ERASE, + security->enabled_cmds); + break; + default: + break; + } +} + +static bool cxl_is_poison_command(u16 opcode) +{ +#define CXL_MBOX_OP_POISON_CMDS 0x43 + + if ((opcode >> 8) == CXL_MBOX_OP_POISON_CMDS) + return true; + + return false; +} + +static void cxl_set_poison_cmd_enabled(struct cxl_poison_state *poison, + u16 opcode) +{ + switch (opcode) { + case CXL_MBOX_OP_GET_POISON: + set_bit(CXL_POISON_ENABLED_LIST, poison->enabled_cmds); + break; + case CXL_MBOX_OP_INJECT_POISON: + set_bit(CXL_POISON_ENABLED_INJECT, poison->enabled_cmds); + break; + case CXL_MBOX_OP_CLEAR_POISON: + set_bit(CXL_POISON_ENABLED_CLEAR, poison->enabled_cmds); + break; + case CXL_MBOX_OP_GET_SCAN_MEDIA_CAPS: + set_bit(CXL_POISON_ENABLED_SCAN_CAPS, poison->enabled_cmds); + break; + case CXL_MBOX_OP_SCAN_MEDIA: + set_bit(CXL_POISON_ENABLED_SCAN_MEDIA, poison->enabled_cmds); + break; + case CXL_MBOX_OP_GET_SCAN_MEDIA: + set_bit(CXL_POISON_ENABLED_SCAN_RESULTS, poison->enabled_cmds); + break; + default: + break; + } +} + +static struct cxl_mem_command *cxl_mem_find_command(u16 opcode) +{ + struct cxl_mem_command *c; + + cxl_for_each_cmd(c) + if (c->opcode == opcode) + return c; + + return NULL; +} + +static const char *cxl_mem_opcode_to_name(u16 opcode) +{ + struct cxl_mem_command *c; + + c = cxl_mem_find_command(opcode); + if (!c) + return NULL; + + return cxl_command_names[c->info.id].name; +} + +/** + * cxl_internal_send_cmd() - Kernel internal interface to send a mailbox command + * 
@cxl_mbox: CXL mailbox context + * @mbox_cmd: initialized command to execute + * + * Context: Any context. + * Return: + * * %>=0 - Number of bytes returned in @out. + * * %-E2BIG - Payload is too large for hardware. + * * %-EBUSY - Couldn't acquire exclusive mailbox access. + * * %-EFAULT - Hardware error occurred. + * * %-ENXIO - Command completed, but device reported an error. + * * %-EIO - Unexpected output size. + * + * Mailbox commands may execute successfully yet the device itself reported an + * error. While this distinction can be useful for commands from userspace, the + * kernel will only be able to use results when both are successful. + */ +int cxl_internal_send_cmd(struct cxl_mailbox *cxl_mbox, + struct cxl_mbox_cmd *mbox_cmd) +{ + size_t out_size, min_out; + int rc; + + if (mbox_cmd->size_in > cxl_mbox->payload_size || + mbox_cmd->size_out > cxl_mbox->payload_size) + return -E2BIG; + + out_size = mbox_cmd->size_out; + min_out = mbox_cmd->min_out; + rc = cxl_mbox->mbox_send(cxl_mbox, mbox_cmd); + /* + * EIO is reserved for a payload size mismatch and mbox_send() + * may not return this error. + */ + if (WARN_ONCE(rc == -EIO, "Bad return code: -EIO")) + return -ENXIO; + if (rc) + return rc; + + if (mbox_cmd->return_code != CXL_MBOX_CMD_RC_SUCCESS && + mbox_cmd->return_code != CXL_MBOX_CMD_RC_BACKGROUND) + return cxl_mbox_cmd_rc2errno(mbox_cmd); + + if (!out_size) + return 0; + + /* + * Variable sized output needs to at least satisfy the caller's + * minimum if not the fully requested size. + */ + if (min_out == 0) + min_out = out_size; + + if (mbox_cmd->size_out < min_out) + return -EIO; + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_internal_send_cmd, "CXL"); + +static bool cxl_mem_raw_command_allowed(u16 opcode) +{ + int i; + + if (!IS_ENABLED(CONFIG_CXL_MEM_RAW_COMMANDS)) + return false; + + if (security_locked_down(LOCKDOWN_PCI_ACCESS)) + return false; + + if (cxl_raw_allow_all) + return true; + + if (cxl_is_security_command(opcode)) + return false; + + for (i = 0; i < ARRAY_SIZE(cxl_disabled_raw_commands); i++) + if (cxl_disabled_raw_commands[i] == opcode) + return false; + + return true; +} + +/** + * cxl_payload_from_user_allowed() - Check contents of in_payload. + * @opcode: The mailbox command opcode. + * @payload_in: Pointer to the input payload passed in from user space. + * + * Return: + * * true - payload_in passes check for @opcode. + * * false - payload_in contains invalid or unsupported values. + * + * The driver may inspect payload contents before sending a mailbox + * command from user space to the device. The intent is to reject + * commands with input payloads that are known to be unsafe. This + * check is not intended to replace the users careful selection of + * mailbox command parameters and makes no guarantee that the user + * command will succeed, nor that it is appropriate. + * + * The specific checks are determined by the opcode. + */ +static bool cxl_payload_from_user_allowed(u16 opcode, void *payload_in) +{ + switch (opcode) { + case CXL_MBOX_OP_SET_PARTITION_INFO: { + struct cxl_mbox_set_partition_info *pi = payload_in; + + if (pi->flags & CXL_SET_PARTITION_IMMEDIATE_FLAG) + return false; + break; + } + case CXL_MBOX_OP_CLEAR_LOG: { + const uuid_t *uuid = (uuid_t *)payload_in; + + /* + * Restrict the ‘Clear log’ action to only apply to + * Vendor debug logs. 
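+ * Any other log UUID fails the uuid_equal() check below and the
+ * command is rejected before it reaches the device.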
+ */ + return uuid_equal(uuid, &DEFINE_CXL_VENDOR_DEBUG_UUID); + } + default: + break; + } + return true; +} + +static int cxl_mbox_cmd_ctor(struct cxl_mbox_cmd *mbox_cmd, + struct cxl_mailbox *cxl_mbox, u16 opcode, + size_t in_size, size_t out_size, u64 in_payload) +{ + *mbox_cmd = (struct cxl_mbox_cmd) { + .opcode = opcode, + .size_in = in_size, + }; + + if (in_size) { + mbox_cmd->payload_in = vmemdup_user(u64_to_user_ptr(in_payload), + in_size); + if (IS_ERR(mbox_cmd->payload_in)) + return PTR_ERR(mbox_cmd->payload_in); + + if (!cxl_payload_from_user_allowed(opcode, mbox_cmd->payload_in)) { + dev_dbg(cxl_mbox->host, "%s: input payload not allowed\n", + cxl_mem_opcode_to_name(opcode)); + kvfree(mbox_cmd->payload_in); + return -EBUSY; + } + } + + /* Prepare to handle a full payload for variable sized output */ + if (out_size == CXL_VARIABLE_PAYLOAD) + mbox_cmd->size_out = cxl_mbox->payload_size; + else + mbox_cmd->size_out = out_size; + + if (mbox_cmd->size_out) { + mbox_cmd->payload_out = kvzalloc(mbox_cmd->size_out, GFP_KERNEL); + if (!mbox_cmd->payload_out) { + kvfree(mbox_cmd->payload_in); + return -ENOMEM; + } + } + return 0; +} + +static void cxl_mbox_cmd_dtor(struct cxl_mbox_cmd *mbox) +{ + kvfree(mbox->payload_in); + kvfree(mbox->payload_out); +} + +static int cxl_to_mem_cmd_raw(struct cxl_mem_command *mem_cmd, + const struct cxl_send_command *send_cmd, + struct cxl_mailbox *cxl_mbox) +{ + if (send_cmd->raw.rsvd) + return -EINVAL; + + /* + * Unlike supported commands, the output size of RAW commands + * gets passed along without further checking, so it must be + * validated here. + */ + if (send_cmd->out.size > cxl_mbox->payload_size) + return -EINVAL; + + if (!cxl_mem_raw_command_allowed(send_cmd->raw.opcode)) + return -EPERM; + + dev_WARN_ONCE(cxl_mbox->host, true, "raw command path used\n"); + + *mem_cmd = (struct cxl_mem_command) { + .info = { + .id = CXL_MEM_COMMAND_ID_RAW, + .size_in = send_cmd->in.size, + .size_out = send_cmd->out.size, + }, + .opcode = send_cmd->raw.opcode + }; + + return 0; +} + +static int cxl_to_mem_cmd(struct cxl_mem_command *mem_cmd, + const struct cxl_send_command *send_cmd, + struct cxl_mailbox *cxl_mbox) +{ + struct cxl_mem_command *c = &cxl_mem_commands[send_cmd->id]; + const struct cxl_command_info *info = &c->info; + + if (send_cmd->flags & ~CXL_MEM_COMMAND_FLAG_MASK) + return -EINVAL; + + if (send_cmd->rsvd) + return -EINVAL; + + if (send_cmd->in.rsvd || send_cmd->out.rsvd) + return -EINVAL; + + /* Check that the command is enabled for hardware */ + if (!test_bit(info->id, cxl_mbox->enabled_cmds)) + return -ENOTTY; + + /* Check that the command is not claimed for exclusive kernel use */ + if (test_bit(info->id, cxl_mbox->exclusive_cmds)) + return -EBUSY; + + /* Check the input buffer is the expected size */ + if ((info->size_in != CXL_VARIABLE_PAYLOAD) && + (info->size_in != send_cmd->in.size)) + return -ENOMEM; + + /* Check the output buffer is at least large enough */ + if ((info->size_out != CXL_VARIABLE_PAYLOAD) && + (send_cmd->out.size < info->size_out)) + return -ENOMEM; + + *mem_cmd = (struct cxl_mem_command) { + .info = { + .id = info->id, + .flags = info->flags, + .size_in = send_cmd->in.size, + .size_out = send_cmd->out.size, + }, + .opcode = c->opcode + }; + + return 0; +} + +/** + * cxl_validate_cmd_from_user() - Check fields for CXL_MEM_SEND_COMMAND. + * @mbox_cmd: Sanitized and populated &struct cxl_mbox_cmd. + * @cxl_mbox: CXL mailbox context + * @send_cmd: &struct cxl_send_command copied in from userspace. 
+ * + * Return: + * * %0 - @out_cmd is ready to send. + * * %-ENOTTY - Invalid command specified. + * * %-EINVAL - Reserved fields or invalid values were used. + * * %-ENOMEM - Input or output buffer wasn't sized properly. + * * %-EPERM - Attempted to use a protected command. + * * %-EBUSY - Kernel has claimed exclusive access to this opcode + * + * The result of this command is a fully validated command in @mbox_cmd that is + * safe to send to the hardware. + */ +static int cxl_validate_cmd_from_user(struct cxl_mbox_cmd *mbox_cmd, + struct cxl_mailbox *cxl_mbox, + const struct cxl_send_command *send_cmd) +{ + struct cxl_mem_command mem_cmd; + int rc; + + if (send_cmd->id == 0 || send_cmd->id >= CXL_MEM_COMMAND_ID_MAX) + return -ENOTTY; + + /* + * The user can never specify an input payload larger than what hardware + * supports, but output can be arbitrarily large (simply write out as + * much data as the hardware provides). + */ + if (send_cmd->in.size > cxl_mbox->payload_size) + return -EINVAL; + + /* Sanitize and construct a cxl_mem_command */ + if (send_cmd->id == CXL_MEM_COMMAND_ID_RAW) + rc = cxl_to_mem_cmd_raw(&mem_cmd, send_cmd, cxl_mbox); + else + rc = cxl_to_mem_cmd(&mem_cmd, send_cmd, cxl_mbox); + + if (rc) + return rc; + + /* Sanitize and construct a cxl_mbox_cmd */ + return cxl_mbox_cmd_ctor(mbox_cmd, cxl_mbox, mem_cmd.opcode, + mem_cmd.info.size_in, mem_cmd.info.size_out, + send_cmd->in.payload); +} + +int cxl_query_cmd(struct cxl_mailbox *cxl_mbox, + struct cxl_mem_query_commands __user *q) +{ + struct device *dev = cxl_mbox->host; + struct cxl_mem_command *cmd; + u32 n_commands; + int j = 0; + + dev_dbg(dev, "Query IOCTL\n"); + + if (get_user(n_commands, &q->n_commands)) + return -EFAULT; + + /* returns the total number if 0 elements are requested. */ + if (n_commands == 0) + return put_user(ARRAY_SIZE(cxl_mem_commands), &q->n_commands); + + /* + * otherwise, return min(n_commands, total commands) cxl_command_info + * structures. + */ + cxl_for_each_cmd(cmd) { + struct cxl_command_info info = cmd->info; + + if (test_bit(info.id, cxl_mbox->enabled_cmds)) + info.flags |= CXL_MEM_COMMAND_FLAG_ENABLED; + if (test_bit(info.id, cxl_mbox->exclusive_cmds)) + info.flags |= CXL_MEM_COMMAND_FLAG_EXCLUSIVE; + + if (copy_to_user(&q->commands[j++], &info, sizeof(info))) + return -EFAULT; + + if (j == n_commands) + break; + } + + return 0; +} + +/** + * handle_mailbox_cmd_from_user() - Dispatch a mailbox command for userspace. + * @cxl_mbox: The mailbox context for the operation. + * @mbox_cmd: The validated mailbox command. + * @out_payload: Pointer to userspace's output payload. + * @size_out: (Input) Max payload size to copy out. + * (Output) Payload size hardware generated. + * @retval: Hardware generated return code from the operation. + * + * Return: + * * %0 - Mailbox transaction succeeded. This implies the mailbox + * protocol completed successfully not that the operation itself + * was successful. + * * %-ENOMEM - Couldn't allocate a bounce buffer. + * * %-EFAULT - Something happened with copy_to/from_user. + * * %-EINTR - Mailbox acquisition interrupted. + * * %-EXXX - Transaction level failures. + * + * Dispatches a mailbox command on behalf of a userspace request. + * The output payload is copied to userspace. + * + * See cxl_send_cmd(). 
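+ *
+ * A %0 return with a non-success @retval is a normal outcome; the
+ * caller is responsible for forwarding @retval to userspace.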
+ */ +static int handle_mailbox_cmd_from_user(struct cxl_mailbox *cxl_mbox, + struct cxl_mbox_cmd *mbox_cmd, + u64 out_payload, s32 *size_out, + u32 *retval) +{ + struct device *dev = cxl_mbox->host; + int rc; + + dev_dbg(dev, + "Submitting %s command for user\n" + "\topcode: %x\n" + "\tsize: %zx\n", + cxl_mem_opcode_to_name(mbox_cmd->opcode), + mbox_cmd->opcode, mbox_cmd->size_in); + + rc = cxl_mbox->mbox_send(cxl_mbox, mbox_cmd); + if (rc) + goto out; + + /* + * @size_out contains the max size that's allowed to be written back out + * to userspace. While the payload may have written more output than + * this it will have to be ignored. + */ + if (mbox_cmd->size_out) { + dev_WARN_ONCE(dev, mbox_cmd->size_out > *size_out, + "Invalid return size\n"); + if (copy_to_user(u64_to_user_ptr(out_payload), + mbox_cmd->payload_out, mbox_cmd->size_out)) { + rc = -EFAULT; + goto out; + } + } + + *size_out = mbox_cmd->size_out; + *retval = mbox_cmd->return_code; + +out: + cxl_mbox_cmd_dtor(mbox_cmd); + return rc; +} + +int cxl_send_cmd(struct cxl_mailbox *cxl_mbox, struct cxl_send_command __user *s) +{ + struct device *dev = cxl_mbox->host; + struct cxl_send_command send; + struct cxl_mbox_cmd mbox_cmd; + int rc; + + dev_dbg(dev, "Send IOCTL\n"); + + if (copy_from_user(&send, s, sizeof(send))) + return -EFAULT; + + rc = cxl_validate_cmd_from_user(&mbox_cmd, cxl_mbox, &send); + if (rc) + return rc; + + rc = handle_mailbox_cmd_from_user(cxl_mbox, &mbox_cmd, send.out.payload, + &send.out.size, &send.retval); + if (rc) + return rc; + + if (copy_to_user(s, &send, sizeof(send))) + return -EFAULT; + + return 0; +} + +static int cxl_xfer_log(struct cxl_memdev_state *mds, uuid_t *uuid, + u32 *size, u8 *out) +{ + struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox; + u32 remaining = *size; + u32 offset = 0; + + while (remaining) { + u32 xfer_size = min_t(u32, remaining, cxl_mbox->payload_size); + struct cxl_mbox_cmd mbox_cmd; + struct cxl_mbox_get_log log; + int rc; + + log = (struct cxl_mbox_get_log) { + .uuid = *uuid, + .offset = cpu_to_le32(offset), + .length = cpu_to_le32(xfer_size), + }; + + mbox_cmd = (struct cxl_mbox_cmd) { + .opcode = CXL_MBOX_OP_GET_LOG, + .size_in = sizeof(log), + .payload_in = &log, + .size_out = xfer_size, + .payload_out = out, + }; + + rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd); + + /* + * The output payload length that indicates the number + * of valid bytes can be smaller than the Log buffer + * size. + */ + if (rc == -EIO && mbox_cmd.size_out < xfer_size) { + offset += mbox_cmd.size_out; + break; + } + + if (rc < 0) + return rc; + + out += xfer_size; + remaining -= xfer_size; + offset += xfer_size; + } + + *size = offset; + + return 0; +} + +static int check_features_opcodes(u16 opcode, int *ro_cmds, int *wr_cmds) +{ + switch (opcode) { + case CXL_MBOX_OP_GET_SUPPORTED_FEATURES: + case CXL_MBOX_OP_GET_FEATURE: + (*ro_cmds)++; + return 1; + case CXL_MBOX_OP_SET_FEATURE: + (*wr_cmds)++; + return 1; + default: + return 0; + } +} + +/* 'Get Supported Features' and 'Get Feature' */ +#define MAX_FEATURES_READ_CMDS 2 +static void set_features_cap(struct cxl_mailbox *cxl_mbox, + int ro_cmds, int wr_cmds) +{ + /* Setting up Features capability while walking the CEL */ + if (ro_cmds == MAX_FEATURES_READ_CMDS) { + if (wr_cmds) + cxl_mbox->feat_cap = CXL_FEATURES_RW; + else + cxl_mbox->feat_cap = CXL_FEATURES_RO; + } +} + +/** + * cxl_walk_cel() - Walk through the Command Effects Log. + * @mds: The driver data for the operation + * @size: Length of the Command Effects Log. 
+ * @cel: CEL + * + * Iterate over each entry in the CEL and determine if the driver supports the + * command. If so, the command is enabled for the device and can be used later. + */ +static void cxl_walk_cel(struct cxl_memdev_state *mds, size_t size, u8 *cel) +{ + struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox; + struct cxl_cel_entry *cel_entry; + const int cel_entries = size / sizeof(*cel_entry); + struct device *dev = mds->cxlds.dev; + int i, ro_cmds = 0, wr_cmds = 0; + + cel_entry = (struct cxl_cel_entry *) cel; + + for (i = 0; i < cel_entries; i++) { + u16 opcode = le16_to_cpu(cel_entry[i].opcode); + struct cxl_mem_command *cmd = cxl_mem_find_command(opcode); + int enabled = 0; + + if (cmd) { + set_bit(cmd->info.id, cxl_mbox->enabled_cmds); + enabled++; + } + + enabled += check_features_opcodes(opcode, &ro_cmds, + &wr_cmds); + + if (cxl_is_poison_command(opcode)) { + cxl_set_poison_cmd_enabled(&mds->poison, opcode); + enabled++; + } + + if (cxl_is_security_command(opcode)) { + cxl_set_security_cmd_enabled(&mds->security, opcode); + enabled++; + } + + dev_dbg(dev, "Opcode 0x%04x %s\n", opcode, + enabled ? "enabled" : "unsupported by driver"); + } + + set_features_cap(cxl_mbox, ro_cmds, wr_cmds); +} + +static struct cxl_mbox_get_supported_logs *cxl_get_gsl(struct cxl_memdev_state *mds) +{ + struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox; + struct cxl_mbox_get_supported_logs *ret; + struct cxl_mbox_cmd mbox_cmd; + int rc; + + ret = kvmalloc(cxl_mbox->payload_size, GFP_KERNEL); + if (!ret) + return ERR_PTR(-ENOMEM); + + mbox_cmd = (struct cxl_mbox_cmd) { + .opcode = CXL_MBOX_OP_GET_SUPPORTED_LOGS, + .size_out = cxl_mbox->payload_size, + .payload_out = ret, + /* At least the record number field must be valid */ + .min_out = 2, + }; + rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd); + if (rc < 0) { + kvfree(ret); + return ERR_PTR(rc); + } + + + return ret; +} + +enum { + CEL_UUID, + VENDOR_DEBUG_UUID, +}; + +/* See CXL 2.0 Table 170. Get Log Input Payload */ +static const uuid_t log_uuid[] = { + [CEL_UUID] = DEFINE_CXL_CEL_UUID, + [VENDOR_DEBUG_UUID] = DEFINE_CXL_VENDOR_DEBUG_UUID, +}; + +/** + * cxl_enumerate_cmds() - Enumerate commands for a device. + * @mds: The driver data for the operation + * + * Returns 0 if enumerate completed successfully. + * + * CXL devices have optional support for certain commands. This function will + * determine the set of supported commands for the hardware and update the + * enabled_cmds bitmap in the @mds. + */ +int cxl_enumerate_cmds(struct cxl_memdev_state *mds) +{ + struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox; + struct cxl_mbox_get_supported_logs *gsl; + struct device *dev = mds->cxlds.dev; + struct cxl_mem_command *cmd; + int i, rc; + + gsl = cxl_get_gsl(mds); + if (IS_ERR(gsl)) + return PTR_ERR(gsl); + + rc = -ENOENT; + for (i = 0; i < le16_to_cpu(gsl->entries); i++) { + u32 size = le32_to_cpu(gsl->entry[i].size); + uuid_t uuid = gsl->entry[i].uuid; + u8 *log; + + dev_dbg(dev, "Found LOG type %pU of size %d", &uuid, size); + + if (!uuid_equal(&uuid, &log_uuid[CEL_UUID])) + continue; + + log = kvmalloc(size, GFP_KERNEL); + if (!log) { + rc = -ENOMEM; + goto out; + } + + rc = cxl_xfer_log(mds, &uuid, &size, log); + if (rc) { + kvfree(log); + goto out; + } + + cxl_walk_cel(mds, size, log); + kvfree(log); + + /* In case CEL was bogus, enable some default commands. 
*/ + cxl_for_each_cmd(cmd) + if (cmd->flags & CXL_CMD_FLAG_FORCE_ENABLE) + set_bit(cmd->info.id, cxl_mbox->enabled_cmds); + + /* Found the required CEL */ + rc = 0; + } +out: + kvfree(gsl); + return rc; +} +EXPORT_SYMBOL_NS_GPL(cxl_enumerate_cmds, "CXL"); + +void cxl_event_trace_record(const struct cxl_memdev *cxlmd, + enum cxl_event_log_type type, + enum cxl_event_type event_type, + const uuid_t *uuid, union cxl_event *evt) +{ + if (event_type == CXL_CPER_EVENT_MEM_MODULE) { + trace_cxl_memory_module(cxlmd, type, &evt->mem_module); + return; + } + if (event_type == CXL_CPER_EVENT_GENERIC) { + trace_cxl_generic_event(cxlmd, type, uuid, &evt->generic); + return; + } + if (event_type == CXL_CPER_EVENT_MEM_SPARING) { + trace_cxl_memory_sparing(cxlmd, type, &evt->mem_sparing); + return; + } + + if (trace_cxl_general_media_enabled() || trace_cxl_dram_enabled()) { + u64 dpa, hpa = ULLONG_MAX, hpa_alias = ULLONG_MAX; + struct cxl_region *cxlr; + + /* + * These trace points are annotated with HPA and region + * translations. Take topology mutation locks and lookup + * { HPA, REGION } from { DPA, MEMDEV } in the event record. + */ + guard(rwsem_read)(&cxl_rwsem.region); + guard(rwsem_read)(&cxl_rwsem.dpa); + + dpa = le64_to_cpu(evt->media_hdr.phys_addr) & CXL_DPA_MASK; + cxlr = cxl_dpa_to_region(cxlmd, dpa); + if (cxlr) { + u64 cache_size = cxlr->params.cache_size; + + hpa = cxl_dpa_to_hpa(cxlr, cxlmd, dpa); + if (cache_size) + hpa_alias = hpa - cache_size; + } + + if (event_type == CXL_CPER_EVENT_GEN_MEDIA) { + if (cxl_store_rec_gen_media((struct cxl_memdev *)cxlmd, evt)) + dev_dbg(&cxlmd->dev, "CXL store rec_gen_media failed\n"); + + if (evt->gen_media.media_hdr.descriptor & + CXL_GMER_EVT_DESC_THRESHOLD_EVENT) + WARN_ON_ONCE((evt->gen_media.media_hdr.type & + CXL_GMER_MEM_EVT_TYPE_AP_CME_COUNTER_EXPIRE) && + !get_unaligned_le24(evt->gen_media.cme_count)); + else + WARN_ON_ONCE(evt->gen_media.media_hdr.type & + CXL_GMER_MEM_EVT_TYPE_AP_CME_COUNTER_EXPIRE); + + trace_cxl_general_media(cxlmd, type, cxlr, hpa, + hpa_alias, &evt->gen_media); + } else if (event_type == CXL_CPER_EVENT_DRAM) { + if (cxl_store_rec_dram((struct cxl_memdev *)cxlmd, evt)) + dev_dbg(&cxlmd->dev, "CXL store rec_dram failed\n"); + + if (evt->dram.media_hdr.descriptor & + CXL_GMER_EVT_DESC_THRESHOLD_EVENT) + WARN_ON_ONCE((evt->dram.media_hdr.type & + CXL_DER_MEM_EVT_TYPE_AP_CME_COUNTER_EXPIRE) && + !get_unaligned_le24(evt->dram.cvme_count)); + else + WARN_ON_ONCE(evt->dram.media_hdr.type & + CXL_DER_MEM_EVT_TYPE_AP_CME_COUNTER_EXPIRE); + + trace_cxl_dram(cxlmd, type, cxlr, hpa, hpa_alias, + &evt->dram); + } + } +} +EXPORT_SYMBOL_NS_GPL(cxl_event_trace_record, "CXL"); + +static void __cxl_event_trace_record(const struct cxl_memdev *cxlmd, + enum cxl_event_log_type type, + struct cxl_event_record_raw *record) +{ + enum cxl_event_type ev_type = CXL_CPER_EVENT_GENERIC; + const uuid_t *uuid = &record->id; + + if (uuid_equal(uuid, &CXL_EVENT_GEN_MEDIA_UUID)) + ev_type = CXL_CPER_EVENT_GEN_MEDIA; + else if (uuid_equal(uuid, &CXL_EVENT_DRAM_UUID)) + ev_type = CXL_CPER_EVENT_DRAM; + else if (uuid_equal(uuid, &CXL_EVENT_MEM_MODULE_UUID)) + ev_type = CXL_CPER_EVENT_MEM_MODULE; + else if (uuid_equal(uuid, &CXL_EVENT_MEM_SPARING_UUID)) + ev_type = CXL_CPER_EVENT_MEM_SPARING; + + cxl_event_trace_record(cxlmd, type, ev_type, uuid, &record->event); +} + +static int cxl_clear_event_record(struct cxl_memdev_state *mds, + enum cxl_event_log_type log, + struct cxl_get_event_payload *get_pl) +{ + struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox; 
+ struct cxl_mbox_clear_event_payload *payload; + u16 total = le16_to_cpu(get_pl->record_count); + u8 max_handles = CXL_CLEAR_EVENT_MAX_HANDLES; + size_t pl_size = struct_size(payload, handles, max_handles); + struct cxl_mbox_cmd mbox_cmd; + u16 cnt; + int rc = 0; + int i; + + /* Payload size may limit the max handles */ + if (pl_size > cxl_mbox->payload_size) { + max_handles = (cxl_mbox->payload_size - sizeof(*payload)) / + sizeof(__le16); + pl_size = struct_size(payload, handles, max_handles); + } + + payload = kvzalloc(pl_size, GFP_KERNEL); + if (!payload) + return -ENOMEM; + + *payload = (struct cxl_mbox_clear_event_payload) { + .event_log = log, + }; + + mbox_cmd = (struct cxl_mbox_cmd) { + .opcode = CXL_MBOX_OP_CLEAR_EVENT_RECORD, + .payload_in = payload, + .size_in = pl_size, + }; + + /* + * Clear Event Records uses u8 for the handle cnt while Get Event + * Record can return up to 0xffff records. + */ + i = 0; + for (cnt = 0; cnt < total; cnt++) { + struct cxl_event_record_raw *raw = &get_pl->records[cnt]; + struct cxl_event_generic *gen = &raw->event.generic; + + payload->handles[i++] = gen->hdr.handle; + dev_dbg(mds->cxlds.dev, "Event log '%d': Clearing %u\n", log, + le16_to_cpu(payload->handles[i - 1])); + + if (i == max_handles) { + payload->nr_recs = i; + rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd); + if (rc) + goto free_pl; + i = 0; + } + } + + /* Clear what is left if any */ + if (i) { + payload->nr_recs = i; + mbox_cmd.size_in = struct_size(payload, handles, i); + rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd); + if (rc) + goto free_pl; + } + +free_pl: + kvfree(payload); + return rc; +} + +static void cxl_mem_get_records_log(struct cxl_memdev_state *mds, + enum cxl_event_log_type type) +{ + struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox; + struct cxl_memdev *cxlmd = mds->cxlds.cxlmd; + struct device *dev = mds->cxlds.dev; + struct cxl_get_event_payload *payload; + u8 log_type = type; + u16 nr_rec; + + mutex_lock(&mds->event.log_lock); + payload = mds->event.buf; + + do { + int rc, i; + struct cxl_mbox_cmd mbox_cmd = (struct cxl_mbox_cmd) { + .opcode = CXL_MBOX_OP_GET_EVENT_RECORD, + .payload_in = &log_type, + .size_in = sizeof(log_type), + .payload_out = payload, + .size_out = cxl_mbox->payload_size, + .min_out = struct_size(payload, records, 0), + }; + + rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd); + if (rc) { + dev_err_ratelimited(dev, + "Event log '%d': Failed to query event records : %d", + type, rc); + break; + } + + nr_rec = le16_to_cpu(payload->record_count); + if (!nr_rec) + break; + + for (i = 0; i < nr_rec; i++) + __cxl_event_trace_record(cxlmd, type, + &payload->records[i]); + + if (payload->flags & CXL_GET_EVENT_FLAG_OVERFLOW) + trace_cxl_overflow(cxlmd, type, payload); + + rc = cxl_clear_event_record(mds, type, payload); + if (rc) { + dev_err_ratelimited(dev, + "Event log '%d': Failed to clear events : %d", + type, rc); + break; + } + } while (nr_rec); + + mutex_unlock(&mds->event.log_lock); +} + +/** + * cxl_mem_get_event_records - Get Event Records from the device + * @mds: The driver data for the operation + * @status: Event Status register value identifying which events are available. + * + * Retrieve all event records available on the device, report them as trace + * events, and clear them. 
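+ *
+ * For illustration (a sketch of a caller, e.g. an event interrupt thread such
+ * as the one in cxl_pci; not something this helper does itself): read the
+ * Event Status register and pass the raw value along:
+ *
+ *	status = readl(cxlds->regs.status + CXLDEV_DEV_EVENT_STATUS_OFFSET);
+ *	if (status & CXLDEV_EVENT_STATUS_ALL)
+ *		cxl_mem_get_event_records(mds, status);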
+ *
+ * See CXL rev 3.0 @8.2.9.2.2 Get Event Records
+ * See CXL rev 3.0 @8.2.9.2.3 Clear Event Records
+ */
+void cxl_mem_get_event_records(struct cxl_memdev_state *mds, u32 status)
+{
+	dev_dbg(mds->cxlds.dev, "Reading event logs: %x\n", status);
+
+	if (status & CXLDEV_EVENT_STATUS_FATAL)
+		cxl_mem_get_records_log(mds, CXL_EVENT_TYPE_FATAL);
+	if (status & CXLDEV_EVENT_STATUS_FAIL)
+		cxl_mem_get_records_log(mds, CXL_EVENT_TYPE_FAIL);
+	if (status & CXLDEV_EVENT_STATUS_WARN)
+		cxl_mem_get_records_log(mds, CXL_EVENT_TYPE_WARN);
+	if (status & CXLDEV_EVENT_STATUS_INFO)
+		cxl_mem_get_records_log(mds, CXL_EVENT_TYPE_INFO);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_mem_get_event_records, "CXL");
+
+/**
+ * cxl_mem_get_partition_info - Get partition info
+ * @mds: The driver data for the operation
+ *
+ * Retrieve the current partition info for the device specified. The active
+ * values are the current capacity in bytes. If not 0, the 'next' values are
+ * the pending values, in bytes, which take effect on the next cold reset.
+ *
+ * Return: 0 if no error, or the result of the mailbox command.
+ *
+ * See CXL @8.2.9.5.2.1 Get Partition Info
+ */
+static int cxl_mem_get_partition_info(struct cxl_memdev_state *mds)
+{
+	struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
+	struct cxl_mbox_get_partition_info pi;
+	struct cxl_mbox_cmd mbox_cmd;
+	int rc;
+
+	mbox_cmd = (struct cxl_mbox_cmd) {
+		.opcode = CXL_MBOX_OP_GET_PARTITION_INFO,
+		.size_out = sizeof(pi),
+		.payload_out = &pi,
+	};
+	rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
+	if (rc)
+		return rc;
+
+	mds->active_volatile_bytes =
+		le64_to_cpu(pi.active_volatile_cap) * CXL_CAPACITY_MULTIPLIER;
+	mds->active_persistent_bytes =
+		le64_to_cpu(pi.active_persistent_cap) * CXL_CAPACITY_MULTIPLIER;
+
+	return 0;
+}
+
+/**
+ * cxl_dev_state_identify() - Send the IDENTIFY command to the device.
+ * @mds: The driver data for the operation
+ *
+ * Return: 0 if identify was executed successfully or media not ready.
+ *
+ * This will dispatch the identify command to the device and on success populate
+ * structures to be exported to sysfs.
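+ *
+ * The capacity fields in the IDENTIFY payload are expressed in multiples of
+ * CXL_CAPACITY_MULTIPLIER (256MB); for example, a reported total_capacity of
+ * 0x40 decodes to 64 * 256MB = 16GB of device media.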
+ */ +int cxl_dev_state_identify(struct cxl_memdev_state *mds) +{ + struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox; + /* See CXL 2.0 Table 175 Identify Memory Device Output Payload */ + struct cxl_mbox_identify id; + struct cxl_mbox_cmd mbox_cmd; + u32 val; + int rc; + + if (!mds->cxlds.media_ready) + return 0; + + mbox_cmd = (struct cxl_mbox_cmd) { + .opcode = CXL_MBOX_OP_IDENTIFY, + .size_out = sizeof(id), + .payload_out = &id, + }; + rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd); + if (rc < 0) + return rc; + + mds->total_bytes = + le64_to_cpu(id.total_capacity) * CXL_CAPACITY_MULTIPLIER; + mds->volatile_only_bytes = + le64_to_cpu(id.volatile_capacity) * CXL_CAPACITY_MULTIPLIER; + mds->persistent_only_bytes = + le64_to_cpu(id.persistent_capacity) * CXL_CAPACITY_MULTIPLIER; + mds->partition_align_bytes = + le64_to_cpu(id.partition_align) * CXL_CAPACITY_MULTIPLIER; + + mds->lsa_size = le32_to_cpu(id.lsa_size); + memcpy(mds->firmware_version, id.fw_revision, + sizeof(id.fw_revision)); + + if (test_bit(CXL_POISON_ENABLED_LIST, mds->poison.enabled_cmds)) { + val = get_unaligned_le24(id.poison_list_max_mer); + mds->poison.max_errors = min_t(u32, val, CXL_POISON_LIST_MAX); + } + + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_dev_state_identify, "CXL"); + +static int __cxl_mem_sanitize(struct cxl_memdev_state *mds, u16 cmd) +{ + struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox; + int rc; + u32 sec_out = 0; + struct cxl_get_security_output { + __le32 flags; + } out; + struct cxl_mbox_cmd sec_cmd = { + .opcode = CXL_MBOX_OP_GET_SECURITY_STATE, + .payload_out = &out, + .size_out = sizeof(out), + }; + struct cxl_mbox_cmd mbox_cmd = { .opcode = cmd }; + + if (cmd != CXL_MBOX_OP_SANITIZE && cmd != CXL_MBOX_OP_SECURE_ERASE) + return -EINVAL; + + rc = cxl_internal_send_cmd(cxl_mbox, &sec_cmd); + if (rc < 0) { + dev_err(cxl_mbox->host, "Failed to get security state : %d", rc); + return rc; + } + + /* + * Prior to using these commands, any security applied to + * the user data areas of the device shall be DISABLED (or + * UNLOCKED for secure erase case). + */ + sec_out = le32_to_cpu(out.flags); + if (sec_out & CXL_PMEM_SEC_STATE_USER_PASS_SET) + return -EINVAL; + + if (cmd == CXL_MBOX_OP_SECURE_ERASE && + sec_out & CXL_PMEM_SEC_STATE_LOCKED) + return -EINVAL; + + rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd); + if (rc < 0) { + dev_err(cxl_mbox->host, "Failed to sanitize device : %d", rc); + return rc; + } + + return 0; +} + + +/** + * cxl_mem_sanitize() - Send a sanitization command to the device. + * @cxlmd: The device for the operation + * @cmd: The specific sanitization command opcode + * + * Return: 0 if the command was executed successfully, regardless of + * whether or not the actual security operation is done in the background, + * such as for the Sanitize case. + * Error return values can be the result of the mailbox command, -EINVAL + * when security requirements are not met or invalid contexts, or -EBUSY + * if the sanitize operation is already in flight. + * + * See CXL 3.0 @8.2.9.8.5.1 Sanitize and @8.2.9.8.5.2 Secure Erase. + */ +int cxl_mem_sanitize(struct cxl_memdev *cxlmd, u16 cmd) +{ + struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds); + struct cxl_port *endpoint; + + /* synchronize with cxl_mem_probe() and decoder write operations */ + guard(device)(&cxlmd->dev); + endpoint = cxlmd->endpoint; + guard(rwsem_read)(&cxl_rwsem.region); + /* + * Require an endpoint to be safe otherwise the driver can not + * be sure that the device is unmapped. 
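+	 *
+	 * In practice that means a memdev with committed decoders (i.e. one
+	 * that is still part of an active region) has the request rejected
+	 * with -EBUSY; userspace is expected to tear down its regions before
+	 * sanitizing.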
+ */ + if (endpoint && cxl_num_decoders_committed(endpoint) == 0) + return __cxl_mem_sanitize(mds, cmd); + + return -EBUSY; +} + +static void add_part(struct cxl_dpa_info *info, u64 start, u64 size, enum cxl_partition_mode mode) +{ + int i = info->nr_partitions; + + if (size == 0) + return; + + info->part[i].range = (struct range) { + .start = start, + .end = start + size - 1, + }; + info->part[i].mode = mode; + info->nr_partitions++; +} + +int cxl_mem_dpa_fetch(struct cxl_memdev_state *mds, struct cxl_dpa_info *info) +{ + struct cxl_dev_state *cxlds = &mds->cxlds; + struct device *dev = cxlds->dev; + int rc; + + if (!cxlds->media_ready) { + info->size = 0; + return 0; + } + + info->size = mds->total_bytes; + + if (mds->partition_align_bytes == 0) { + add_part(info, 0, mds->volatile_only_bytes, CXL_PARTMODE_RAM); + add_part(info, mds->volatile_only_bytes, + mds->persistent_only_bytes, CXL_PARTMODE_PMEM); + return 0; + } + + rc = cxl_mem_get_partition_info(mds); + if (rc) { + dev_err(dev, "Failed to query partition information\n"); + return rc; + } + + add_part(info, 0, mds->active_volatile_bytes, CXL_PARTMODE_RAM); + add_part(info, mds->active_volatile_bytes, mds->active_persistent_bytes, + CXL_PARTMODE_PMEM); + + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_mem_dpa_fetch, "CXL"); + +int cxl_get_dirty_count(struct cxl_memdev_state *mds, u32 *count) +{ + struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox; + struct cxl_mbox_get_health_info_out hi; + struct cxl_mbox_cmd mbox_cmd; + int rc; + + mbox_cmd = (struct cxl_mbox_cmd) { + .opcode = CXL_MBOX_OP_GET_HEALTH_INFO, + .size_out = sizeof(hi), + .payload_out = &hi, + }; + + rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd); + if (!rc) + *count = le32_to_cpu(hi.dirty_shutdown_cnt); + + return rc; +} +EXPORT_SYMBOL_NS_GPL(cxl_get_dirty_count, "CXL"); + +int cxl_arm_dirty_shutdown(struct cxl_memdev_state *mds) +{ + struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox; + struct cxl_mbox_cmd mbox_cmd; + struct cxl_mbox_set_shutdown_state_in in = { + .state = 1 + }; + + mbox_cmd = (struct cxl_mbox_cmd) { + .opcode = CXL_MBOX_OP_SET_SHUTDOWN_STATE, + .size_in = sizeof(in), + .payload_in = &in, + }; + + return cxl_internal_send_cmd(cxl_mbox, &mbox_cmd); +} +EXPORT_SYMBOL_NS_GPL(cxl_arm_dirty_shutdown, "CXL"); + +int cxl_set_timestamp(struct cxl_memdev_state *mds) +{ + struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox; + struct cxl_mbox_cmd mbox_cmd; + struct cxl_mbox_set_timestamp_in pi; + int rc; + + pi.timestamp = cpu_to_le64(ktime_get_real_ns()); + mbox_cmd = (struct cxl_mbox_cmd) { + .opcode = CXL_MBOX_OP_SET_TIMESTAMP, + .size_in = sizeof(pi), + .payload_in = &pi, + }; + + rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd); + /* + * Command is optional. Devices may have another way of providing + * a timestamp, or may return all 0s in timestamp fields. 
+ * Don't report an error if this command isn't supported + */ + if (rc && (mbox_cmd.return_code != CXL_MBOX_CMD_RC_UNSUPPORTED)) + return rc; + + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_set_timestamp, "CXL"); + +int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len, + struct cxl_region *cxlr) +{ + struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds); + struct cxl_mailbox *cxl_mbox = &cxlmd->cxlds->cxl_mbox; + struct cxl_mbox_poison_out *po; + struct cxl_mbox_poison_in pi; + int nr_records = 0; + int rc; + + ACQUIRE(mutex_intr, lock)(&mds->poison.mutex); + if ((rc = ACQUIRE_ERR(mutex_intr, &lock))) + return rc; + + po = mds->poison.list_out; + pi.offset = cpu_to_le64(offset); + pi.length = cpu_to_le64(len / CXL_POISON_LEN_MULT); + + do { + struct cxl_mbox_cmd mbox_cmd = (struct cxl_mbox_cmd){ + .opcode = CXL_MBOX_OP_GET_POISON, + .size_in = sizeof(pi), + .payload_in = &pi, + .size_out = cxl_mbox->payload_size, + .payload_out = po, + .min_out = struct_size(po, record, 0), + }; + + rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd); + if (rc) + break; + + for (int i = 0; i < le16_to_cpu(po->count); i++) + trace_cxl_poison(cxlmd, cxlr, &po->record[i], + po->flags, po->overflow_ts, + CXL_POISON_TRACE_LIST); + + /* Protect against an uncleared _FLAG_MORE */ + nr_records = nr_records + le16_to_cpu(po->count); + if (nr_records >= mds->poison.max_errors) { + dev_dbg(&cxlmd->dev, "Max Error Records reached: %d\n", + nr_records); + break; + } + } while (po->flags & CXL_POISON_FLAG_MORE); + + return rc; +} +EXPORT_SYMBOL_NS_GPL(cxl_mem_get_poison, "CXL"); + +static void free_poison_buf(void *buf) +{ + kvfree(buf); +} + +/* Get Poison List output buffer is protected by mds->poison.lock */ +static int cxl_poison_alloc_buf(struct cxl_memdev_state *mds) +{ + struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox; + + mds->poison.list_out = kvmalloc(cxl_mbox->payload_size, GFP_KERNEL); + if (!mds->poison.list_out) + return -ENOMEM; + + return devm_add_action_or_reset(mds->cxlds.dev, free_poison_buf, + mds->poison.list_out); +} + +int cxl_poison_state_init(struct cxl_memdev_state *mds) +{ + int rc; + + if (!test_bit(CXL_POISON_ENABLED_LIST, mds->poison.enabled_cmds)) + return 0; + + rc = cxl_poison_alloc_buf(mds); + if (rc) { + clear_bit(CXL_POISON_ENABLED_LIST, mds->poison.enabled_cmds); + return rc; + } + + mutex_init(&mds->poison.mutex); + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_poison_state_init, "CXL"); + +int cxl_mailbox_init(struct cxl_mailbox *cxl_mbox, struct device *host) +{ + if (!cxl_mbox || !host) + return -EINVAL; + + cxl_mbox->host = host; + mutex_init(&cxl_mbox->mbox_mutex); + rcuwait_init(&cxl_mbox->mbox_wait); + + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_mailbox_init, "CXL"); + +struct cxl_memdev_state *cxl_memdev_state_create(struct device *dev) +{ + struct cxl_memdev_state *mds; + int rc; + + mds = devm_kzalloc(dev, sizeof(*mds), GFP_KERNEL); + if (!mds) { + dev_err(dev, "No memory available\n"); + return ERR_PTR(-ENOMEM); + } + + mutex_init(&mds->event.log_lock); + mds->cxlds.dev = dev; + mds->cxlds.reg_map.host = dev; + mds->cxlds.cxl_mbox.host = dev; + mds->cxlds.reg_map.resource = CXL_RESOURCE_NONE; + mds->cxlds.type = CXL_DEVTYPE_CLASSMEM; + + rc = devm_cxl_register_mce_notifier(dev, &mds->mce_notifier); + if (rc == -EOPNOTSUPP) + dev_warn(dev, "CXL MCE unsupported\n"); + else if (rc) + return ERR_PTR(rc); + + return mds; +} +EXPORT_SYMBOL_NS_GPL(cxl_memdev_state_create, "CXL"); + +void __init cxl_mbox_init(void) +{ + struct dentry *mbox_debugfs; + + mbox_debugfs = 
cxl_debugfs_create_dir("mbox"); + debugfs_create_bool("raw_allow_all", 0600, mbox_debugfs, + &cxl_raw_allow_all); +} diff --git a/drivers/cxl/core/mce.c b/drivers/cxl/core/mce.c new file mode 100644 index 000000000000..ff8d078c6ca1 --- /dev/null +++ b/drivers/cxl/core/mce.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2024 Intel Corporation. All rights reserved. */ +#include <linux/mm.h> +#include <linux/notifier.h> +#include <linux/set_memory.h> +#include <asm/mce.h> +#include <cxlmem.h> +#include "mce.h" + +static int cxl_handle_mce(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct cxl_memdev_state *mds = container_of(nb, struct cxl_memdev_state, + mce_notifier); + struct cxl_memdev *cxlmd = mds->cxlds.cxlmd; + struct cxl_port *endpoint = cxlmd->endpoint; + struct mce *mce = data; + u64 spa, spa_alias; + unsigned long pfn; + + if (!mce || !mce_usable_address(mce)) + return NOTIFY_DONE; + + if (!endpoint) + return NOTIFY_DONE; + + spa = mce->addr & MCI_ADDR_PHYSADDR; + + pfn = spa >> PAGE_SHIFT; + if (!pfn_valid(pfn)) + return NOTIFY_DONE; + + spa_alias = cxl_port_get_spa_cache_alias(endpoint, spa); + if (spa_alias == ~0ULL) + return NOTIFY_DONE; + + pfn = spa_alias >> PAGE_SHIFT; + + /* + * Take down the aliased memory page. The original memory page flagged + * by the MCE will be taken cared of by the standard MCE handler. + */ + dev_emerg(mds->cxlds.dev, "Offlining aliased SPA address0: %#llx\n", + spa_alias); + if (!memory_failure(pfn, 0)) + set_mce_nospec(pfn); + + return NOTIFY_OK; +} + +static void cxl_unregister_mce_notifier(void *mce_notifier) +{ + mce_unregister_decode_chain(mce_notifier); +} + +int devm_cxl_register_mce_notifier(struct device *dev, + struct notifier_block *mce_notifier) +{ + mce_notifier->notifier_call = cxl_handle_mce; + mce_notifier->priority = MCE_PRIO_UC; + mce_register_decode_chain(mce_notifier); + + return devm_add_action_or_reset(dev, cxl_unregister_mce_notifier, + mce_notifier); +} diff --git a/drivers/cxl/core/mce.h b/drivers/cxl/core/mce.h new file mode 100644 index 000000000000..ca272e8db6c7 --- /dev/null +++ b/drivers/cxl/core/mce.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright(c) 2024 Intel Corporation. All rights reserved. */ +#ifndef _CXL_CORE_MCE_H_ +#define _CXL_CORE_MCE_H_ + +#include <linux/notifier.h> + +#ifdef CONFIG_CXL_MCE +int devm_cxl_register_mce_notifier(struct device *dev, + struct notifier_block *mce_notifier); +#else +static inline int +devm_cxl_register_mce_notifier(struct device *dev, + struct notifier_block *mce_notifier) +{ + return -EOPNOTSUPP; +} +#endif + +#endif diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c new file mode 100644 index 000000000000..e370d733e440 --- /dev/null +++ b/drivers/cxl/core/memdev.c @@ -0,0 +1,1161 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2020 Intel Corporation. 
*/ + +#include <linux/io-64-nonatomic-lo-hi.h> +#include <linux/firmware.h> +#include <linux/device.h> +#include <linux/slab.h> +#include <linux/idr.h> +#include <linux/pci.h> +#include <cxlmem.h> +#include "trace.h" +#include "core.h" + +static DECLARE_RWSEM(cxl_memdev_rwsem); + +/* + * An entire PCI topology full of devices should be enough for any + * config + */ +#define CXL_MEM_MAX_DEVS 65536 + +static int cxl_mem_major; +static DEFINE_IDA(cxl_memdev_ida); + +static void cxl_memdev_release(struct device *dev) +{ + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); + + ida_free(&cxl_memdev_ida, cxlmd->id); + devm_cxl_memdev_edac_release(cxlmd); + kfree(cxlmd); +} + +static char *cxl_memdev_devnode(const struct device *dev, umode_t *mode, kuid_t *uid, + kgid_t *gid) +{ + return kasprintf(GFP_KERNEL, "cxl/%s", dev_name(dev)); +} + +static ssize_t firmware_version_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds); + + if (!mds) + return sysfs_emit(buf, "\n"); + return sysfs_emit(buf, "%.16s\n", mds->firmware_version); +} +static DEVICE_ATTR_RO(firmware_version); + +static ssize_t payload_max_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds); + + if (!mds) + return sysfs_emit(buf, "\n"); + return sysfs_emit(buf, "%zu\n", cxlds->cxl_mbox.payload_size); +} +static DEVICE_ATTR_RO(payload_max); + +static ssize_t label_storage_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds); + + if (!mds) + return sysfs_emit(buf, "\n"); + return sysfs_emit(buf, "%zu\n", mds->lsa_size); +} +static DEVICE_ATTR_RO(label_storage_size); + +static resource_size_t cxl_ram_size(struct cxl_dev_state *cxlds) +{ + /* Static RAM is only expected at partition 0. 
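+	 * For example, a device laid out as { RAM, PMEM } reports the size of
+	 * partition 0 here, while a PMEM-only device reports a ram size of 0.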
*/ + if (cxlds->part[0].mode != CXL_PARTMODE_RAM) + return 0; + return resource_size(&cxlds->part[0].res); +} + +static ssize_t ram_size_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + unsigned long long len = cxl_ram_size(cxlds); + + return sysfs_emit(buf, "%#llx\n", len); +} + +static struct device_attribute dev_attr_ram_size = + __ATTR(size, 0444, ram_size_show, NULL); + +static ssize_t pmem_size_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + unsigned long long len = cxl_pmem_size(cxlds); + + return sysfs_emit(buf, "%#llx\n", len); +} + +static struct device_attribute dev_attr_pmem_size = + __ATTR(size, 0444, pmem_size_show, NULL); + +static ssize_t serial_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + + return sysfs_emit(buf, "%#llx\n", cxlds->serial); +} +static DEVICE_ATTR_RO(serial); + +static ssize_t numa_node_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "%d\n", dev_to_node(dev)); +} +static DEVICE_ATTR_RO(numa_node); + +static ssize_t security_state_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct cxl_mailbox *cxl_mbox = &cxlds->cxl_mbox; + struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds); + unsigned long state = mds->security.state; + int rc = 0; + + /* sync with latest submission state */ + mutex_lock(&cxl_mbox->mbox_mutex); + if (mds->security.sanitize_active) + rc = sysfs_emit(buf, "sanitize\n"); + mutex_unlock(&cxl_mbox->mbox_mutex); + if (rc) + return rc; + + if (!(state & CXL_PMEM_SEC_STATE_USER_PASS_SET)) + return sysfs_emit(buf, "disabled\n"); + if (state & CXL_PMEM_SEC_STATE_FROZEN || + state & CXL_PMEM_SEC_STATE_MASTER_PLIMIT || + state & CXL_PMEM_SEC_STATE_USER_PLIMIT) + return sysfs_emit(buf, "frozen\n"); + if (state & CXL_PMEM_SEC_STATE_LOCKED) + return sysfs_emit(buf, "locked\n"); + + return sysfs_emit(buf, "unlocked\n"); +} +static struct device_attribute dev_attr_security_state = + __ATTR(state, 0444, security_state_show, NULL); + +static ssize_t security_sanitize_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); + bool sanitize; + ssize_t rc; + + if (kstrtobool(buf, &sanitize) || !sanitize) + return -EINVAL; + + rc = cxl_mem_sanitize(cxlmd, CXL_MBOX_OP_SANITIZE); + if (rc) + return rc; + + return len; +} +static struct device_attribute dev_attr_security_sanitize = + __ATTR(sanitize, 0200, NULL, security_sanitize_store); + +static ssize_t security_erase_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); + ssize_t rc; + bool erase; + + if (kstrtobool(buf, &erase) || !erase) + return -EINVAL; + + rc = cxl_mem_sanitize(cxlmd, CXL_MBOX_OP_SECURE_ERASE); + if (rc) + return rc; + + return len; +} +static struct device_attribute dev_attr_security_erase = + __ATTR(erase, 0200, NULL, security_erase_store); + +bool cxl_memdev_has_poison_cmd(struct cxl_memdev *cxlmd, + enum poison_cmd_enabled_bits cmd) +{ + struct cxl_memdev_state *mds = 
to_cxl_memdev_state(cxlmd->cxlds); + + return test_bit(cmd, mds->poison.enabled_cmds); +} + +static int cxl_get_poison_by_memdev(struct cxl_memdev *cxlmd) +{ + struct cxl_dev_state *cxlds = cxlmd->cxlds; + u64 offset, length; + int rc = 0; + + /* CXL 3.0 Spec 8.2.9.8.4.1 Separate pmem and ram poison requests */ + for (int i = 0; i < cxlds->nr_partitions; i++) { + const struct resource *res = &cxlds->part[i].res; + + offset = res->start; + length = resource_size(res); + rc = cxl_mem_get_poison(cxlmd, offset, length, NULL); + /* + * Invalid Physical Address is not an error for + * volatile addresses. Device support is optional. + */ + if (rc == -EFAULT && cxlds->part[i].mode == CXL_PARTMODE_RAM) + rc = 0; + } + return rc; +} + +int cxl_trigger_poison_list(struct cxl_memdev *cxlmd) +{ + struct cxl_port *port; + int rc; + + port = cxlmd->endpoint; + if (!port || !is_cxl_endpoint(port)) + return -EINVAL; + + ACQUIRE(rwsem_read_intr, region_rwsem)(&cxl_rwsem.region); + if ((rc = ACQUIRE_ERR(rwsem_read_intr, ®ion_rwsem))) + return rc; + + ACQUIRE(rwsem_read_intr, dpa_rwsem)(&cxl_rwsem.dpa); + if ((rc = ACQUIRE_ERR(rwsem_read_intr, &dpa_rwsem))) + return rc; + + if (cxl_num_decoders_committed(port) == 0) { + /* No regions mapped to this memdev */ + rc = cxl_get_poison_by_memdev(cxlmd); + } else { + /* Regions mapped, collect poison by endpoint */ + rc = cxl_get_poison_by_endpoint(port); + } + + return rc; +} +EXPORT_SYMBOL_NS_GPL(cxl_trigger_poison_list, "CXL"); + +static int cxl_validate_poison_dpa(struct cxl_memdev *cxlmd, u64 dpa) +{ + struct cxl_dev_state *cxlds = cxlmd->cxlds; + + if (!IS_ENABLED(CONFIG_DEBUG_FS)) + return 0; + + if (!resource_size(&cxlds->dpa_res)) { + dev_dbg(cxlds->dev, "device has no dpa resource\n"); + return -EINVAL; + } + if (!cxl_resource_contains_addr(&cxlds->dpa_res, dpa)) { + dev_dbg(cxlds->dev, "dpa:0x%llx not in resource:%pR\n", + dpa, &cxlds->dpa_res); + return -EINVAL; + } + if (!IS_ALIGNED(dpa, 64)) { + dev_dbg(cxlds->dev, "dpa:0x%llx is not 64-byte aligned\n", dpa); + return -EINVAL; + } + + return 0; +} + +int cxl_inject_poison_locked(struct cxl_memdev *cxlmd, u64 dpa) +{ + struct cxl_mailbox *cxl_mbox = &cxlmd->cxlds->cxl_mbox; + struct cxl_mbox_inject_poison inject; + struct cxl_poison_record record; + struct cxl_mbox_cmd mbox_cmd; + struct cxl_region *cxlr; + int rc; + + if (!IS_ENABLED(CONFIG_DEBUG_FS)) + return 0; + + lockdep_assert_held(&cxl_rwsem.dpa); + lockdep_assert_held(&cxl_rwsem.region); + + rc = cxl_validate_poison_dpa(cxlmd, dpa); + if (rc) + return rc; + + inject.address = cpu_to_le64(dpa); + mbox_cmd = (struct cxl_mbox_cmd) { + .opcode = CXL_MBOX_OP_INJECT_POISON, + .size_in = sizeof(inject), + .payload_in = &inject, + }; + rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd); + if (rc) + return rc; + + cxlr = cxl_dpa_to_region(cxlmd, dpa); + if (cxlr) + dev_warn_once(cxl_mbox->host, + "poison inject dpa:%#llx region: %s\n", dpa, + dev_name(&cxlr->dev)); + + record = (struct cxl_poison_record) { + .address = cpu_to_le64(dpa), + .length = cpu_to_le32(1), + }; + trace_cxl_poison(cxlmd, cxlr, &record, 0, 0, CXL_POISON_TRACE_INJECT); + + return 0; +} + +int cxl_inject_poison(struct cxl_memdev *cxlmd, u64 dpa) +{ + int rc; + + ACQUIRE(rwsem_read_intr, region_rwsem)(&cxl_rwsem.region); + if ((rc = ACQUIRE_ERR(rwsem_read_intr, ®ion_rwsem))) + return rc; + + ACQUIRE(rwsem_read_intr, dpa_rwsem)(&cxl_rwsem.dpa); + if ((rc = ACQUIRE_ERR(rwsem_read_intr, &dpa_rwsem))) + return rc; + + return cxl_inject_poison_locked(cxlmd, dpa); +} 
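+
+/*
+ * Example wiring (a sketch only, with hypothetical names; not part of this
+ * driver): a debugfs attribute in the endpoint driver could route a
+ * user-supplied DPA straight to the helper above:
+ *
+ *	static int poison_inject_set(void *data, u64 dpa)
+ *	{
+ *		return cxl_inject_poison(data, dpa);
+ *	}
+ *	DEFINE_DEBUGFS_ATTRIBUTE(poison_inject_fops, NULL, poison_inject_set,
+ *				 "%llx\n");
+ *
+ * where @data is the struct cxl_memdev registered with the debugfs file.
+ */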
+EXPORT_SYMBOL_NS_GPL(cxl_inject_poison, "CXL"); + +int cxl_clear_poison_locked(struct cxl_memdev *cxlmd, u64 dpa) +{ + struct cxl_mailbox *cxl_mbox = &cxlmd->cxlds->cxl_mbox; + struct cxl_mbox_clear_poison clear; + struct cxl_poison_record record; + struct cxl_mbox_cmd mbox_cmd; + struct cxl_region *cxlr; + int rc; + + if (!IS_ENABLED(CONFIG_DEBUG_FS)) + return 0; + + lockdep_assert_held(&cxl_rwsem.dpa); + lockdep_assert_held(&cxl_rwsem.region); + + rc = cxl_validate_poison_dpa(cxlmd, dpa); + if (rc) + return rc; + + /* + * In CXL 3.0 Spec 8.2.9.8.4.3, the Clear Poison mailbox command + * is defined to accept 64 bytes of write-data, along with the + * address to clear. This driver uses zeroes as write-data. + */ + clear = (struct cxl_mbox_clear_poison) { + .address = cpu_to_le64(dpa) + }; + + mbox_cmd = (struct cxl_mbox_cmd) { + .opcode = CXL_MBOX_OP_CLEAR_POISON, + .size_in = sizeof(clear), + .payload_in = &clear, + }; + + rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd); + if (rc) + return rc; + + cxlr = cxl_dpa_to_region(cxlmd, dpa); + if (cxlr) + dev_warn_once(cxl_mbox->host, + "poison clear dpa:%#llx region: %s\n", dpa, + dev_name(&cxlr->dev)); + + record = (struct cxl_poison_record) { + .address = cpu_to_le64(dpa), + .length = cpu_to_le32(1), + }; + trace_cxl_poison(cxlmd, cxlr, &record, 0, 0, CXL_POISON_TRACE_CLEAR); + + return 0; +} + +int cxl_clear_poison(struct cxl_memdev *cxlmd, u64 dpa) +{ + int rc; + + ACQUIRE(rwsem_read_intr, region_rwsem)(&cxl_rwsem.region); + if ((rc = ACQUIRE_ERR(rwsem_read_intr, ®ion_rwsem))) + return rc; + + ACQUIRE(rwsem_read_intr, dpa_rwsem)(&cxl_rwsem.dpa); + if ((rc = ACQUIRE_ERR(rwsem_read_intr, &dpa_rwsem))) + return rc; + + return cxl_clear_poison_locked(cxlmd, dpa); +} +EXPORT_SYMBOL_NS_GPL(cxl_clear_poison, "CXL"); + +static struct attribute *cxl_memdev_attributes[] = { + &dev_attr_serial.attr, + &dev_attr_firmware_version.attr, + &dev_attr_payload_max.attr, + &dev_attr_label_storage_size.attr, + &dev_attr_numa_node.attr, + NULL, +}; + +static struct cxl_dpa_perf *to_pmem_perf(struct cxl_dev_state *cxlds) +{ + for (int i = 0; i < cxlds->nr_partitions; i++) + if (cxlds->part[i].mode == CXL_PARTMODE_PMEM) + return &cxlds->part[i].perf; + return NULL; +} + +static ssize_t pmem_qos_class_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + + return sysfs_emit(buf, "%d\n", to_pmem_perf(cxlds)->qos_class); +} + +static struct device_attribute dev_attr_pmem_qos_class = + __ATTR(qos_class, 0444, pmem_qos_class_show, NULL); + +static struct attribute *cxl_memdev_pmem_attributes[] = { + &dev_attr_pmem_size.attr, + &dev_attr_pmem_qos_class.attr, + NULL, +}; + +static struct cxl_dpa_perf *to_ram_perf(struct cxl_dev_state *cxlds) +{ + if (cxlds->part[0].mode != CXL_PARTMODE_RAM) + return NULL; + return &cxlds->part[0].perf; +} + +static ssize_t ram_qos_class_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + + return sysfs_emit(buf, "%d\n", to_ram_perf(cxlds)->qos_class); +} + +static struct device_attribute dev_attr_ram_qos_class = + __ATTR(qos_class, 0444, ram_qos_class_show, NULL); + +static struct attribute *cxl_memdev_ram_attributes[] = { + &dev_attr_ram_size.attr, + &dev_attr_ram_qos_class.attr, + NULL, +}; + +static struct attribute *cxl_memdev_security_attributes[] = { + &dev_attr_security_state.attr, + 
&dev_attr_security_sanitize.attr, + &dev_attr_security_erase.attr, + NULL, +}; + +static umode_t cxl_memdev_visible(struct kobject *kobj, struct attribute *a, + int n) +{ + if (!IS_ENABLED(CONFIG_NUMA) && a == &dev_attr_numa_node.attr) + return 0; + return a->mode; +} + +static struct attribute_group cxl_memdev_attribute_group = { + .attrs = cxl_memdev_attributes, + .is_visible = cxl_memdev_visible, +}; + +static umode_t cxl_ram_visible(struct kobject *kobj, struct attribute *a, int n) +{ + struct device *dev = kobj_to_dev(kobj); + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); + struct cxl_dpa_perf *perf = to_ram_perf(cxlmd->cxlds); + + if (a == &dev_attr_ram_qos_class.attr && + (!perf || perf->qos_class == CXL_QOS_CLASS_INVALID)) + return 0; + + return a->mode; +} + +static struct attribute_group cxl_memdev_ram_attribute_group = { + .name = "ram", + .attrs = cxl_memdev_ram_attributes, + .is_visible = cxl_ram_visible, +}; + +static umode_t cxl_pmem_visible(struct kobject *kobj, struct attribute *a, int n) +{ + struct device *dev = kobj_to_dev(kobj); + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); + struct cxl_dpa_perf *perf = to_pmem_perf(cxlmd->cxlds); + + if (a == &dev_attr_pmem_qos_class.attr && + (!perf || perf->qos_class == CXL_QOS_CLASS_INVALID)) + return 0; + + return a->mode; +} + +static struct attribute_group cxl_memdev_pmem_attribute_group = { + .name = "pmem", + .attrs = cxl_memdev_pmem_attributes, + .is_visible = cxl_pmem_visible, +}; + +static umode_t cxl_memdev_security_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct device *dev = kobj_to_dev(kobj); + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); + struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds); + + if (a == &dev_attr_security_sanitize.attr && + !test_bit(CXL_SEC_ENABLED_SANITIZE, mds->security.enabled_cmds)) + return 0; + + if (a == &dev_attr_security_erase.attr && + !test_bit(CXL_SEC_ENABLED_SECURE_ERASE, mds->security.enabled_cmds)) + return 0; + + return a->mode; +} + +static struct attribute_group cxl_memdev_security_attribute_group = { + .name = "security", + .attrs = cxl_memdev_security_attributes, + .is_visible = cxl_memdev_security_visible, +}; + +static const struct attribute_group *cxl_memdev_attribute_groups[] = { + &cxl_memdev_attribute_group, + &cxl_memdev_ram_attribute_group, + &cxl_memdev_pmem_attribute_group, + &cxl_memdev_security_attribute_group, + NULL, +}; + +void cxl_memdev_update_perf(struct cxl_memdev *cxlmd) +{ + sysfs_update_group(&cxlmd->dev.kobj, &cxl_memdev_ram_attribute_group); + sysfs_update_group(&cxlmd->dev.kobj, &cxl_memdev_pmem_attribute_group); +} +EXPORT_SYMBOL_NS_GPL(cxl_memdev_update_perf, "CXL"); + +static const struct device_type cxl_memdev_type = { + .name = "cxl_memdev", + .release = cxl_memdev_release, + .devnode = cxl_memdev_devnode, + .groups = cxl_memdev_attribute_groups, +}; + +bool is_cxl_memdev(const struct device *dev) +{ + return dev->type == &cxl_memdev_type; +} +EXPORT_SYMBOL_NS_GPL(is_cxl_memdev, "CXL"); + +/** + * set_exclusive_cxl_commands() - atomically disable user cxl commands + * @mds: The device state to operate on + * @cmds: bitmap of commands to mark exclusive + * + * Grab the cxl_memdev_rwsem in write mode to flush in-flight + * invocations of the ioctl path and then disable future execution of + * commands with the command ids set in @cmds. 
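+ *
+ * Illustrative use (a sketch; the SET_LSA opcode is just an example): a
+ * kernel consumer that takes over the label storage area could block the
+ * corresponding ioctls for as long as it is bound:
+ *
+ *	DECLARE_BITMAP(cmds, CXL_MEM_COMMAND_ID_MAX) = { 0 };
+ *
+ *	set_bit(CXL_MEM_COMMAND_ID_SET_LSA, cmds);
+ *	set_exclusive_cxl_commands(mds, cmds);
+ *	...
+ *	clear_exclusive_cxl_commands(mds, cmds);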
+ */ +void set_exclusive_cxl_commands(struct cxl_memdev_state *mds, + unsigned long *cmds) +{ + struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox; + + guard(rwsem_write)(&cxl_memdev_rwsem); + bitmap_or(cxl_mbox->exclusive_cmds, cxl_mbox->exclusive_cmds, + cmds, CXL_MEM_COMMAND_ID_MAX); +} +EXPORT_SYMBOL_NS_GPL(set_exclusive_cxl_commands, "CXL"); + +/** + * clear_exclusive_cxl_commands() - atomically enable user cxl commands + * @mds: The device state to modify + * @cmds: bitmap of commands to mark available for userspace + */ +void clear_exclusive_cxl_commands(struct cxl_memdev_state *mds, + unsigned long *cmds) +{ + struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox; + + guard(rwsem_write)(&cxl_memdev_rwsem); + bitmap_andnot(cxl_mbox->exclusive_cmds, cxl_mbox->exclusive_cmds, + cmds, CXL_MEM_COMMAND_ID_MAX); +} +EXPORT_SYMBOL_NS_GPL(clear_exclusive_cxl_commands, "CXL"); + +static void cxl_memdev_shutdown(struct device *dev) +{ + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); + + guard(rwsem_write)(&cxl_memdev_rwsem); + cxlmd->cxlds = NULL; +} + +static void cxl_memdev_unregister(void *_cxlmd) +{ + struct cxl_memdev *cxlmd = _cxlmd; + struct device *dev = &cxlmd->dev; + + cdev_device_del(&cxlmd->cdev, dev); + cxl_memdev_shutdown(dev); + put_device(dev); +} + +static void detach_memdev(struct work_struct *work) +{ + struct cxl_memdev *cxlmd; + + cxlmd = container_of(work, typeof(*cxlmd), detach_work); + device_release_driver(&cxlmd->dev); + put_device(&cxlmd->dev); +} + +static struct lock_class_key cxl_memdev_key; + +static struct cxl_memdev *cxl_memdev_alloc(struct cxl_dev_state *cxlds, + const struct file_operations *fops) +{ + struct cxl_memdev *cxlmd; + struct device *dev; + struct cdev *cdev; + int rc; + + cxlmd = kzalloc(sizeof(*cxlmd), GFP_KERNEL); + if (!cxlmd) + return ERR_PTR(-ENOMEM); + + rc = ida_alloc_max(&cxl_memdev_ida, CXL_MEM_MAX_DEVS - 1, GFP_KERNEL); + if (rc < 0) + goto err; + cxlmd->id = rc; + cxlmd->depth = -1; + + dev = &cxlmd->dev; + device_initialize(dev); + lockdep_set_class(&dev->mutex, &cxl_memdev_key); + dev->parent = cxlds->dev; + dev->bus = &cxl_bus_type; + dev->devt = MKDEV(cxl_mem_major, cxlmd->id); + dev->type = &cxl_memdev_type; + device_set_pm_not_required(dev); + INIT_WORK(&cxlmd->detach_work, detach_memdev); + + cdev = &cxlmd->cdev; + cdev_init(cdev, fops); + return cxlmd; + +err: + kfree(cxlmd); + return ERR_PTR(rc); +} + +static long __cxl_memdev_ioctl(struct cxl_memdev *cxlmd, unsigned int cmd, + unsigned long arg) +{ + struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds); + struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox; + + switch (cmd) { + case CXL_MEM_QUERY_COMMANDS: + return cxl_query_cmd(cxl_mbox, (void __user *)arg); + case CXL_MEM_SEND_COMMAND: + return cxl_send_cmd(cxl_mbox, (void __user *)arg); + default: + return -ENOTTY; + } +} + +static long cxl_memdev_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct cxl_memdev *cxlmd = file->private_data; + struct cxl_dev_state *cxlds; + + guard(rwsem_read)(&cxl_memdev_rwsem); + cxlds = cxlmd->cxlds; + if (cxlds && cxlds->type == CXL_DEVTYPE_CLASSMEM) + return __cxl_memdev_ioctl(cxlmd, cmd, arg); + + return -ENXIO; +} + +static int cxl_memdev_open(struct inode *inode, struct file *file) +{ + struct cxl_memdev *cxlmd = + container_of(inode->i_cdev, typeof(*cxlmd), cdev); + + get_device(&cxlmd->dev); + file->private_data = cxlmd; + + return 0; +} + +static int cxl_memdev_release_file(struct inode *inode, struct file *file) +{ + struct cxl_memdev *cxlmd = + 
container_of(inode->i_cdev, typeof(*cxlmd), cdev); + + put_device(&cxlmd->dev); + + return 0; +} + +/** + * cxl_mem_get_fw_info - Get Firmware info + * @mds: The device data for the operation + * + * Retrieve firmware info for the device specified. + * + * Return: 0 if no error: or the result of the mailbox command. + * + * See CXL-3.0 8.2.9.3.1 Get FW Info + */ +static int cxl_mem_get_fw_info(struct cxl_memdev_state *mds) +{ + struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox; + struct cxl_mbox_get_fw_info info; + struct cxl_mbox_cmd mbox_cmd; + int rc; + + mbox_cmd = (struct cxl_mbox_cmd) { + .opcode = CXL_MBOX_OP_GET_FW_INFO, + .size_out = sizeof(info), + .payload_out = &info, + }; + + rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd); + if (rc < 0) + return rc; + + mds->fw.num_slots = info.num_slots; + mds->fw.cur_slot = FIELD_GET(CXL_FW_INFO_SLOT_INFO_CUR_MASK, + info.slot_info); + + return 0; +} + +/** + * cxl_mem_activate_fw - Activate Firmware + * @mds: The device data for the operation + * @slot: slot number to activate + * + * Activate firmware in a given slot for the device specified. + * + * Return: 0 if no error: or the result of the mailbox command. + * + * See CXL-3.0 8.2.9.3.3 Activate FW + */ +static int cxl_mem_activate_fw(struct cxl_memdev_state *mds, int slot) +{ + struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox; + struct cxl_mbox_activate_fw activate; + struct cxl_mbox_cmd mbox_cmd; + + if (slot == 0 || slot > mds->fw.num_slots) + return -EINVAL; + + mbox_cmd = (struct cxl_mbox_cmd) { + .opcode = CXL_MBOX_OP_ACTIVATE_FW, + .size_in = sizeof(activate), + .payload_in = &activate, + }; + + /* Only offline activation supported for now */ + activate.action = CXL_FW_ACTIVATE_OFFLINE; + activate.slot = slot; + + return cxl_internal_send_cmd(cxl_mbox, &mbox_cmd); +} + +/** + * cxl_mem_abort_fw_xfer - Abort an in-progress FW transfer + * @mds: The device data for the operation + * + * Abort an in-progress firmware transfer for the device specified. + * + * Return: 0 if no error: or the result of the mailbox command. 
+ * + * See CXL-3.0 8.2.9.3.2 Transfer FW + */ +static int cxl_mem_abort_fw_xfer(struct cxl_memdev_state *mds) +{ + struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox; + struct cxl_mbox_transfer_fw *transfer; + struct cxl_mbox_cmd mbox_cmd; + int rc; + + transfer = kzalloc(struct_size(transfer, data, 0), GFP_KERNEL); + if (!transfer) + return -ENOMEM; + + /* Set a 1s poll interval and a total wait time of 30s */ + mbox_cmd = (struct cxl_mbox_cmd) { + .opcode = CXL_MBOX_OP_TRANSFER_FW, + .size_in = sizeof(*transfer), + .payload_in = transfer, + .poll_interval_ms = 1000, + .poll_count = 30, + }; + + transfer->action = CXL_FW_TRANSFER_ACTION_ABORT; + + rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd); + kfree(transfer); + return rc; +} + +static void cxl_fw_cleanup(struct fw_upload *fwl) +{ + struct cxl_memdev_state *mds = fwl->dd_handle; + + mds->fw.next_slot = 0; +} + +static int cxl_fw_do_cancel(struct fw_upload *fwl) +{ + struct cxl_memdev_state *mds = fwl->dd_handle; + struct cxl_dev_state *cxlds = &mds->cxlds; + struct cxl_memdev *cxlmd = cxlds->cxlmd; + int rc; + + rc = cxl_mem_abort_fw_xfer(mds); + if (rc < 0) + dev_err(&cxlmd->dev, "Error aborting FW transfer: %d\n", rc); + + return FW_UPLOAD_ERR_CANCELED; +} + +static enum fw_upload_err cxl_fw_prepare(struct fw_upload *fwl, const u8 *data, + u32 size) +{ + struct cxl_memdev_state *mds = fwl->dd_handle; + struct cxl_mbox_transfer_fw *transfer; + struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox; + + if (!size) + return FW_UPLOAD_ERR_INVALID_SIZE; + + mds->fw.oneshot = struct_size(transfer, data, size) < + cxl_mbox->payload_size; + + if (cxl_mem_get_fw_info(mds)) + return FW_UPLOAD_ERR_HW_ERROR; + + /* + * So far no state has been changed, hence no other cleanup is + * necessary. Simply return the cancelled status. + */ + if (test_and_clear_bit(CXL_FW_CANCEL, mds->fw.state)) + return FW_UPLOAD_ERR_CANCELED; + + return FW_UPLOAD_ERR_NONE; +} + +static enum fw_upload_err cxl_fw_write(struct fw_upload *fwl, const u8 *data, + u32 offset, u32 size, u32 *written) +{ + struct cxl_memdev_state *mds = fwl->dd_handle; + struct cxl_dev_state *cxlds = &mds->cxlds; + struct cxl_mailbox *cxl_mbox = &cxlds->cxl_mbox; + struct cxl_memdev *cxlmd = cxlds->cxlmd; + struct cxl_mbox_transfer_fw *transfer; + struct cxl_mbox_cmd mbox_cmd; + u32 cur_size, remaining; + size_t size_in; + int rc; + + *written = 0; + + /* Offset has to be aligned to 128B (CXL-3.0 8.2.9.3.2 Table 8-57) */ + if (!IS_ALIGNED(offset, CXL_FW_TRANSFER_ALIGNMENT)) { + dev_err(&cxlmd->dev, + "misaligned offset for FW transfer slice (%u)\n", + offset); + return FW_UPLOAD_ERR_RW_ERROR; + } + + /* + * Pick transfer size based on mds->payload_size @size must bw 128-byte + * aligned, ->payload_size is a power of 2 starting at 256 bytes, and + * sizeof(*transfer) is 128. These constraints imply that @cur_size + * will always be 128b aligned. + */ + cur_size = min_t(size_t, size, cxl_mbox->payload_size - sizeof(*transfer)); + + remaining = size - cur_size; + size_in = struct_size(transfer, data, cur_size); + + if (test_and_clear_bit(CXL_FW_CANCEL, mds->fw.state)) + return cxl_fw_do_cancel(fwl); + + /* + * Slot numbers are 1-indexed + * cur_slot is the 0-indexed next_slot (i.e. 
'cur_slot - 1 + 1') + * Check for rollover using modulo, and 1-index it by adding 1 + */ + mds->fw.next_slot = (mds->fw.cur_slot % mds->fw.num_slots) + 1; + + /* Do the transfer via mailbox cmd */ + transfer = kzalloc(size_in, GFP_KERNEL); + if (!transfer) + return FW_UPLOAD_ERR_RW_ERROR; + + transfer->offset = cpu_to_le32(offset / CXL_FW_TRANSFER_ALIGNMENT); + memcpy(transfer->data, data + offset, cur_size); + if (mds->fw.oneshot) { + transfer->action = CXL_FW_TRANSFER_ACTION_FULL; + transfer->slot = mds->fw.next_slot; + } else { + if (offset == 0) { + transfer->action = CXL_FW_TRANSFER_ACTION_INITIATE; + } else if (remaining == 0) { + transfer->action = CXL_FW_TRANSFER_ACTION_END; + transfer->slot = mds->fw.next_slot; + } else { + transfer->action = CXL_FW_TRANSFER_ACTION_CONTINUE; + } + } + + mbox_cmd = (struct cxl_mbox_cmd) { + .opcode = CXL_MBOX_OP_TRANSFER_FW, + .size_in = size_in, + .payload_in = transfer, + .poll_interval_ms = 1000, + .poll_count = 30, + }; + + rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd); + if (rc < 0) { + rc = FW_UPLOAD_ERR_RW_ERROR; + goto out_free; + } + + *written = cur_size; + + /* Activate FW if oneshot or if the last slice was written */ + if (mds->fw.oneshot || remaining == 0) { + dev_dbg(&cxlmd->dev, "Activating firmware slot: %d\n", + mds->fw.next_slot); + rc = cxl_mem_activate_fw(mds, mds->fw.next_slot); + if (rc < 0) { + dev_err(&cxlmd->dev, "Error activating firmware: %d\n", + rc); + rc = FW_UPLOAD_ERR_HW_ERROR; + goto out_free; + } + } + + rc = FW_UPLOAD_ERR_NONE; + +out_free: + kfree(transfer); + return rc; +} + +static enum fw_upload_err cxl_fw_poll_complete(struct fw_upload *fwl) +{ + struct cxl_memdev_state *mds = fwl->dd_handle; + + /* + * cxl_internal_send_cmd() handles background operations synchronously. + * No need to wait for completions here - any errors would've been + * reported and handled during the ->write() call(s). + * Just check if a cancel request was received, and return success. 
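+	 * A cancel requested via cxl_fw_cancel() is only latched as a bit in
+	 * mds->fw.state; it takes effect at the next prepare/write/poll step,
+	 * which is why the flag is re-checked (and cleared) here.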
+ */ + if (test_and_clear_bit(CXL_FW_CANCEL, mds->fw.state)) + return cxl_fw_do_cancel(fwl); + + return FW_UPLOAD_ERR_NONE; +} + +static void cxl_fw_cancel(struct fw_upload *fwl) +{ + struct cxl_memdev_state *mds = fwl->dd_handle; + + set_bit(CXL_FW_CANCEL, mds->fw.state); +} + +static const struct fw_upload_ops cxl_memdev_fw_ops = { + .prepare = cxl_fw_prepare, + .write = cxl_fw_write, + .poll_complete = cxl_fw_poll_complete, + .cancel = cxl_fw_cancel, + .cleanup = cxl_fw_cleanup, +}; + +static void cxl_remove_fw_upload(void *fwl) +{ + firmware_upload_unregister(fwl); +} + +int devm_cxl_setup_fw_upload(struct device *host, struct cxl_memdev_state *mds) +{ + struct cxl_dev_state *cxlds = &mds->cxlds; + struct cxl_mailbox *cxl_mbox = &cxlds->cxl_mbox; + struct device *dev = &cxlds->cxlmd->dev; + struct fw_upload *fwl; + + if (!test_bit(CXL_MEM_COMMAND_ID_GET_FW_INFO, cxl_mbox->enabled_cmds)) + return 0; + + fwl = firmware_upload_register(THIS_MODULE, dev, dev_name(dev), + &cxl_memdev_fw_ops, mds); + if (IS_ERR(fwl)) + return PTR_ERR(fwl); + return devm_add_action_or_reset(host, cxl_remove_fw_upload, fwl); +} +EXPORT_SYMBOL_NS_GPL(devm_cxl_setup_fw_upload, "CXL"); + +static const struct file_operations cxl_memdev_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = cxl_memdev_ioctl, + .open = cxl_memdev_open, + .release = cxl_memdev_release_file, + .compat_ioctl = compat_ptr_ioctl, + .llseek = noop_llseek, +}; + +struct cxl_memdev *devm_cxl_add_memdev(struct device *host, + struct cxl_dev_state *cxlds) +{ + struct cxl_memdev *cxlmd; + struct device *dev; + struct cdev *cdev; + int rc; + + cxlmd = cxl_memdev_alloc(cxlds, &cxl_memdev_fops); + if (IS_ERR(cxlmd)) + return cxlmd; + + dev = &cxlmd->dev; + rc = dev_set_name(dev, "mem%d", cxlmd->id); + if (rc) + goto err; + + /* + * Activate ioctl operations, no cxl_memdev_rwsem manipulation + * needed as this is ordered with cdev_add() publishing the device. + */ + cxlmd->cxlds = cxlds; + cxlds->cxlmd = cxlmd; + + cdev = &cxlmd->cdev; + rc = cdev_device_add(cdev, dev); + if (rc) + goto err; + + rc = devm_add_action_or_reset(host, cxl_memdev_unregister, cxlmd); + if (rc) + return ERR_PTR(rc); + return cxlmd; + +err: + /* + * The cdev was briefly live, shutdown any ioctl operations that + * saw that state. + */ + cxl_memdev_shutdown(dev); + put_device(dev); + return ERR_PTR(rc); +} +EXPORT_SYMBOL_NS_GPL(devm_cxl_add_memdev, "CXL"); + +static void sanitize_teardown_notifier(void *data) +{ + struct cxl_memdev_state *mds = data; + struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox; + struct kernfs_node *state; + + /* + * Prevent new irq triggered invocations of the workqueue and + * flush inflight invocations. + */ + mutex_lock(&cxl_mbox->mbox_mutex); + state = mds->security.sanitize_node; + mds->security.sanitize_node = NULL; + mutex_unlock(&cxl_mbox->mbox_mutex); + + cancel_delayed_work_sync(&mds->security.poll_dwork); + sysfs_put(state); +} + +int devm_cxl_sanitize_setup_notifier(struct device *host, + struct cxl_memdev *cxlmd) +{ + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds); + struct kernfs_node *sec; + + if (!test_bit(CXL_SEC_ENABLED_SANITIZE, mds->security.enabled_cmds)) + return 0; + + /* + * Note, the expectation is that @cxlmd would have failed to be + * created if these sysfs_get_dirent calls fail. 
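+	 * The cached 'state' kernfs node lets the sanitize completion path
+	 * poke pollers of security/state via sysfs_notify_dirent() without
+	 * re-resolving the attribute; the reference taken here is dropped in
+	 * sanitize_teardown_notifier().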
+ */ + sec = sysfs_get_dirent(cxlmd->dev.kobj.sd, "security"); + if (!sec) + return -ENOENT; + mds->security.sanitize_node = sysfs_get_dirent(sec, "state"); + sysfs_put(sec); + if (!mds->security.sanitize_node) + return -ENOENT; + + return devm_add_action_or_reset(host, sanitize_teardown_notifier, mds); +} +EXPORT_SYMBOL_NS_GPL(devm_cxl_sanitize_setup_notifier, "CXL"); + +__init int cxl_memdev_init(void) +{ + dev_t devt; + int rc; + + rc = alloc_chrdev_region(&devt, 0, CXL_MEM_MAX_DEVS, "cxl"); + if (rc) + return rc; + + cxl_mem_major = MAJOR(devt); + + return 0; +} + +void cxl_memdev_exit(void) +{ + unregister_chrdev_region(MKDEV(cxl_mem_major, 0), CXL_MEM_MAX_DEVS); +} diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c new file mode 100644 index 000000000000..5b023a0178a4 --- /dev/null +++ b/drivers/cxl/core/pci.c @@ -0,0 +1,1189 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2021 Intel Corporation. All rights reserved. */ +#include <linux/units.h> +#include <linux/io-64-nonatomic-lo-hi.h> +#include <linux/device.h> +#include <linux/delay.h> +#include <linux/pci.h> +#include <linux/pci-doe.h> +#include <linux/aer.h> +#include <cxlpci.h> +#include <cxlmem.h> +#include <cxl.h> +#include "core.h" +#include "trace.h" + +/** + * DOC: cxl core pci + * + * Compute Express Link protocols are layered on top of PCIe. CXL core provides + * a set of helpers for CXL interactions which occur via PCIe. + */ + +static unsigned short media_ready_timeout = 60; +module_param(media_ready_timeout, ushort, 0644); +MODULE_PARM_DESC(media_ready_timeout, "seconds to wait for media ready"); + +static int pci_get_port_num(struct pci_dev *pdev) +{ + u32 lnkcap; + int type; + + type = pci_pcie_type(pdev); + if (type != PCI_EXP_TYPE_DOWNSTREAM && type != PCI_EXP_TYPE_ROOT_PORT) + return -EINVAL; + + if (pci_read_config_dword(pdev, pci_pcie_cap(pdev) + PCI_EXP_LNKCAP, + &lnkcap)) + return -ENXIO; + + return FIELD_GET(PCI_EXP_LNKCAP_PN, lnkcap); +} + +/** + * __devm_cxl_add_dport_by_dev - allocate a dport by dport device + * @port: cxl_port that hosts the dport + * @dport_dev: 'struct device' of the dport + * + * Returns the allocated dport on success or ERR_PTR() of -errno on error + */ +struct cxl_dport *__devm_cxl_add_dport_by_dev(struct cxl_port *port, + struct device *dport_dev) +{ + struct cxl_register_map map; + struct pci_dev *pdev; + int port_num, rc; + + if (!dev_is_pci(dport_dev)) + return ERR_PTR(-EINVAL); + + pdev = to_pci_dev(dport_dev); + port_num = pci_get_port_num(pdev); + if (port_num < 0) + return ERR_PTR(port_num); + + rc = cxl_find_regblock(pdev, CXL_REGLOC_RBI_COMPONENT, &map); + if (rc) + return ERR_PTR(rc); + + device_lock_assert(&port->dev); + return devm_cxl_add_dport(port, dport_dev, port_num, map.resource); +} +EXPORT_SYMBOL_NS_GPL(__devm_cxl_add_dport_by_dev, "CXL"); + +static int cxl_dvsec_mem_range_valid(struct cxl_dev_state *cxlds, int id) +{ + struct pci_dev *pdev = to_pci_dev(cxlds->dev); + int d = cxlds->cxl_dvsec; + bool valid = false; + int rc, i; + u32 temp; + + if (id > CXL_DVSEC_RANGE_MAX) + return -EINVAL; + + /* Check MEM INFO VALID bit first, give up after 1s */ + i = 1; + do { + rc = pci_read_config_dword(pdev, + d + CXL_DVSEC_RANGE_SIZE_LOW(id), + &temp); + if (rc) + return rc; + + valid = FIELD_GET(CXL_DVSEC_MEM_INFO_VALID, temp); + if (valid) + break; + msleep(1000); + } while (i--); + + if (!valid) { + dev_err(&pdev->dev, + "Timeout awaiting memory range %d valid after 1s.\n", + id); + return -ETIMEDOUT; + } + + return 0; +} + +static int 
cxl_dvsec_mem_range_active(struct cxl_dev_state *cxlds, int id) +{ + struct pci_dev *pdev = to_pci_dev(cxlds->dev); + int d = cxlds->cxl_dvsec; + bool active = false; + int rc, i; + u32 temp; + + if (id > CXL_DVSEC_RANGE_MAX) + return -EINVAL; + + /* Check MEM ACTIVE bit, up to 60s timeout by default */ + for (i = media_ready_timeout; i; i--) { + rc = pci_read_config_dword( + pdev, d + CXL_DVSEC_RANGE_SIZE_LOW(id), &temp); + if (rc) + return rc; + + active = FIELD_GET(CXL_DVSEC_MEM_ACTIVE, temp); + if (active) + break; + msleep(1000); + } + + if (!active) { + dev_err(&pdev->dev, + "timeout awaiting memory active after %d seconds\n", + media_ready_timeout); + return -ETIMEDOUT; + } + + return 0; +} + +/* + * Wait up to @media_ready_timeout for the device to report memory + * active. + */ +int cxl_await_media_ready(struct cxl_dev_state *cxlds) +{ + struct pci_dev *pdev = to_pci_dev(cxlds->dev); + int d = cxlds->cxl_dvsec; + int rc, i, hdm_count; + u64 md_status; + u16 cap; + + rc = pci_read_config_word(pdev, + d + CXL_DVSEC_CAP_OFFSET, &cap); + if (rc) + return rc; + + hdm_count = FIELD_GET(CXL_DVSEC_HDM_COUNT_MASK, cap); + for (i = 0; i < hdm_count; i++) { + rc = cxl_dvsec_mem_range_valid(cxlds, i); + if (rc) + return rc; + } + + for (i = 0; i < hdm_count; i++) { + rc = cxl_dvsec_mem_range_active(cxlds, i); + if (rc) + return rc; + } + + md_status = readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET); + if (!CXLMDEV_READY(md_status)) + return -EIO; + + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_await_media_ready, "CXL"); + +static int cxl_set_mem_enable(struct cxl_dev_state *cxlds, u16 val) +{ + struct pci_dev *pdev = to_pci_dev(cxlds->dev); + int d = cxlds->cxl_dvsec; + u16 ctrl; + int rc; + + rc = pci_read_config_word(pdev, d + CXL_DVSEC_CTRL_OFFSET, &ctrl); + if (rc < 0) + return rc; + + if ((ctrl & CXL_DVSEC_MEM_ENABLE) == val) + return 1; + ctrl &= ~CXL_DVSEC_MEM_ENABLE; + ctrl |= val; + + rc = pci_write_config_word(pdev, d + CXL_DVSEC_CTRL_OFFSET, ctrl); + if (rc < 0) + return rc; + + return 0; +} + +static void clear_mem_enable(void *cxlds) +{ + cxl_set_mem_enable(cxlds, 0); +} + +static int devm_cxl_enable_mem(struct device *host, struct cxl_dev_state *cxlds) +{ + int rc; + + rc = cxl_set_mem_enable(cxlds, CXL_DVSEC_MEM_ENABLE); + if (rc < 0) + return rc; + if (rc > 0) + return 0; + return devm_add_action_or_reset(host, clear_mem_enable, cxlds); +} + +/* require dvsec ranges to be covered by a locked platform window */ +static int dvsec_range_allowed(struct device *dev, const void *arg) +{ + const struct range *dev_range = arg; + struct cxl_decoder *cxld; + + if (!is_root_decoder(dev)) + return 0; + + cxld = to_cxl_decoder(dev); + + if (!(cxld->flags & CXL_DECODER_F_RAM)) + return 0; + + return range_contains(&cxld->hpa_range, dev_range); +} + +static void disable_hdm(void *_cxlhdm) +{ + u32 global_ctrl; + struct cxl_hdm *cxlhdm = _cxlhdm; + void __iomem *hdm = cxlhdm->regs.hdm_decoder; + + global_ctrl = readl(hdm + CXL_HDM_DECODER_CTRL_OFFSET); + writel(global_ctrl & ~CXL_HDM_DECODER_ENABLE, + hdm + CXL_HDM_DECODER_CTRL_OFFSET); +} + +static int devm_cxl_enable_hdm(struct device *host, struct cxl_hdm *cxlhdm) +{ + void __iomem *hdm = cxlhdm->regs.hdm_decoder; + u32 global_ctrl; + + global_ctrl = readl(hdm + CXL_HDM_DECODER_CTRL_OFFSET); + writel(global_ctrl | CXL_HDM_DECODER_ENABLE, + hdm + CXL_HDM_DECODER_CTRL_OFFSET); + + return devm_add_action_or_reset(host, disable_hdm, cxlhdm); +} + +int cxl_dvsec_rr_decode(struct cxl_dev_state *cxlds, + struct cxl_endpoint_dvsec_info *info) +{ + 
struct pci_dev *pdev = to_pci_dev(cxlds->dev); + struct device *dev = cxlds->dev; + int hdm_count, rc, i, ranges = 0; + int d = cxlds->cxl_dvsec; + u16 cap, ctrl; + + if (!d) { + dev_dbg(dev, "No DVSEC Capability\n"); + return -ENXIO; + } + + rc = pci_read_config_word(pdev, d + CXL_DVSEC_CAP_OFFSET, &cap); + if (rc) + return rc; + + if (!(cap & CXL_DVSEC_MEM_CAPABLE)) { + dev_dbg(dev, "Not MEM Capable\n"); + return -ENXIO; + } + + /* + * It is not allowed by spec for MEM.capable to be set and have 0 legacy + * HDM decoders (values > 2 are also undefined as of CXL 2.0). As this + * driver is for a spec defined class code which must be CXL.mem + * capable, there is no point in continuing to enable CXL.mem. + */ + hdm_count = FIELD_GET(CXL_DVSEC_HDM_COUNT_MASK, cap); + if (!hdm_count || hdm_count > 2) + return -EINVAL; + + /* + * The current DVSEC values are moot if the memory capability is + * disabled, and they will remain moot after the HDM Decoder + * capability is enabled. + */ + rc = pci_read_config_word(pdev, d + CXL_DVSEC_CTRL_OFFSET, &ctrl); + if (rc) + return rc; + + info->mem_enabled = FIELD_GET(CXL_DVSEC_MEM_ENABLE, ctrl); + if (!info->mem_enabled) + return 0; + + for (i = 0; i < hdm_count; i++) { + u64 base, size; + u32 temp; + + rc = cxl_dvsec_mem_range_valid(cxlds, i); + if (rc) + return rc; + + rc = pci_read_config_dword( + pdev, d + CXL_DVSEC_RANGE_SIZE_HIGH(i), &temp); + if (rc) + return rc; + + size = (u64)temp << 32; + + rc = pci_read_config_dword( + pdev, d + CXL_DVSEC_RANGE_SIZE_LOW(i), &temp); + if (rc) + return rc; + + size |= temp & CXL_DVSEC_MEM_SIZE_LOW_MASK; + if (!size) { + continue; + } + + rc = pci_read_config_dword( + pdev, d + CXL_DVSEC_RANGE_BASE_HIGH(i), &temp); + if (rc) + return rc; + + base = (u64)temp << 32; + + rc = pci_read_config_dword( + pdev, d + CXL_DVSEC_RANGE_BASE_LOW(i), &temp); + if (rc) + return rc; + + base |= temp & CXL_DVSEC_MEM_BASE_LOW_MASK; + + info->dvsec_range[ranges++] = (struct range) { + .start = base, + .end = base + size - 1 + }; + } + + info->ranges = ranges; + + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_dvsec_rr_decode, "CXL"); + +/** + * cxl_hdm_decode_init() - Setup HDM decoding for the endpoint + * @cxlds: Device state + * @cxlhdm: Mapped HDM decoder Capability + * @info: Cached DVSEC range registers info + * + * Try to enable the endpoint's HDM Decoder Capability + */ +int cxl_hdm_decode_init(struct cxl_dev_state *cxlds, struct cxl_hdm *cxlhdm, + struct cxl_endpoint_dvsec_info *info) +{ + void __iomem *hdm = cxlhdm->regs.hdm_decoder; + struct cxl_port *port = cxlhdm->port; + struct device *dev = cxlds->dev; + struct cxl_port *root; + int i, rc, allowed; + u32 global_ctrl = 0; + + if (hdm) + global_ctrl = readl(hdm + CXL_HDM_DECODER_CTRL_OFFSET); + + /* + * If the HDM Decoder Capability is already enabled then assume + * that some other agent like platform firmware set it up. + */ + if (global_ctrl & CXL_HDM_DECODER_ENABLE || (!hdm && info->mem_enabled)) + return devm_cxl_enable_mem(&port->dev, cxlds); + + /* + * If the HDM Decoder Capability does not exist and DVSEC was + * not setup, the DVSEC based emulation cannot be used. + */ + if (!hdm) + return -ENODEV; + + /* The HDM Decoder Capability exists but is globally disabled. */ + + /* + * If the DVSEC CXL Range registers are not enabled, just + * enable and use the HDM Decoder Capability registers. 
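+	 *
+	 * To summarize the cases this function handles: an HDM capability that
+	 * firmware already enabled is reused as-is; no HDM capability with the
+	 * DVSEC ranges enabled falls back to range-register emulation; no HDM
+	 * capability and no enabled ranges is -ENODEV; an HDM capability that
+	 * is present but disabled is either enabled directly (ranges off) or,
+	 * below, validated against the platform's locked CFMWS windows
+	 * (ranges on).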
+ */ + if (!info->mem_enabled) { + rc = devm_cxl_enable_hdm(&port->dev, cxlhdm); + if (rc) + return rc; + + return devm_cxl_enable_mem(&port->dev, cxlds); + } + + /* + * Per CXL 2.0 Section 8.1.3.8.3 and 8.1.3.8.4 DVSEC CXL Range 1 Base + * [High,Low] when HDM operation is enabled the range register values + * are ignored by the device, but the spec also recommends matching the + * DVSEC Range 1,2 to HDM Decoder Range 0,1. So, non-zero info->ranges + * are expected even though Linux does not require or maintain that + * match. Check if at least one DVSEC range is enabled and allowed by + * the platform. That is, the DVSEC range must be covered by a locked + * platform window (CFMWS). Fail otherwise as the endpoint's decoders + * cannot be used. + */ + + root = to_cxl_port(port->dev.parent); + while (!is_cxl_root(root) && is_cxl_port(root->dev.parent)) + root = to_cxl_port(root->dev.parent); + if (!is_cxl_root(root)) { + dev_err(dev, "Failed to acquire root port for HDM enable\n"); + return -ENODEV; + } + + for (i = 0, allowed = 0; i < info->ranges; i++) { + struct device *cxld_dev; + + cxld_dev = device_find_child(&root->dev, &info->dvsec_range[i], + dvsec_range_allowed); + if (!cxld_dev) { + dev_dbg(dev, "DVSEC Range%d denied by platform\n", i); + continue; + } + dev_dbg(dev, "DVSEC Range%d allowed by platform\n", i); + put_device(cxld_dev); + allowed++; + } + + if (!allowed) { + dev_err(dev, "Range register decodes outside platform defined CXL ranges.\n"); + return -ENXIO; + } + + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_hdm_decode_init, "CXL"); + +#define CXL_DOE_TABLE_ACCESS_REQ_CODE 0x000000ff +#define CXL_DOE_TABLE_ACCESS_REQ_CODE_READ 0 +#define CXL_DOE_TABLE_ACCESS_TABLE_TYPE 0x0000ff00 +#define CXL_DOE_TABLE_ACCESS_TABLE_TYPE_CDATA 0 +#define CXL_DOE_TABLE_ACCESS_ENTRY_HANDLE 0xffff0000 +#define CXL_DOE_TABLE_ACCESS_LAST_ENTRY 0xffff +#define CXL_DOE_PROTOCOL_TABLE_ACCESS 2 + +#define CDAT_DOE_REQ(entry_handle) cpu_to_le32 \ + (FIELD_PREP(CXL_DOE_TABLE_ACCESS_REQ_CODE, \ + CXL_DOE_TABLE_ACCESS_REQ_CODE_READ) | \ + FIELD_PREP(CXL_DOE_TABLE_ACCESS_TABLE_TYPE, \ + CXL_DOE_TABLE_ACCESS_TABLE_TYPE_CDATA) | \ + FIELD_PREP(CXL_DOE_TABLE_ACCESS_ENTRY_HANDLE, (entry_handle))) + +static int cxl_cdat_get_length(struct device *dev, + struct pci_doe_mb *doe_mb, + size_t *length) +{ + __le32 request = CDAT_DOE_REQ(0); + __le32 response[2]; + int rc; + + rc = pci_doe(doe_mb, PCI_VENDOR_ID_CXL, + CXL_DOE_PROTOCOL_TABLE_ACCESS, + &request, sizeof(request), + &response, sizeof(response)); + if (rc < 0) { + dev_err(dev, "DOE failed: %d", rc); + return rc; + } + if (rc < sizeof(response)) + return -EIO; + + *length = le32_to_cpu(response[1]); + dev_dbg(dev, "CDAT length %zu\n", *length); + + return 0; +} + +static int cxl_cdat_read_table(struct device *dev, + struct pci_doe_mb *doe_mb, + struct cdat_doe_rsp *rsp, size_t *length) +{ + size_t received, remaining = *length; + unsigned int entry_handle = 0; + union cdat_data *data; + __le32 saved_dw = 0; + + do { + __le32 request = CDAT_DOE_REQ(entry_handle); + int rc; + + rc = pci_doe(doe_mb, PCI_VENDOR_ID_CXL, + CXL_DOE_PROTOCOL_TABLE_ACCESS, + &request, sizeof(request), + rsp, sizeof(*rsp) + remaining); + if (rc < 0) { + dev_err(dev, "DOE failed: %d", rc); + return rc; + } + + if (rc < sizeof(*rsp)) + return -EIO; + + data = (union cdat_data *)rsp->data; + received = rc - sizeof(*rsp); + + if (entry_handle == 0) { + if (received != sizeof(data->header)) + return -EIO; + } else { + if (received < sizeof(data->entry) || + received != 
le16_to_cpu(data->entry.length)) + return -EIO; + } + + /* Get the CXL table access header entry handle */ + entry_handle = FIELD_GET(CXL_DOE_TABLE_ACCESS_ENTRY_HANDLE, + le32_to_cpu(rsp->doe_header)); + + /* + * Table Access Response Header overwrote the last DW of + * previous entry, so restore that DW + */ + rsp->doe_header = saved_dw; + remaining -= received; + rsp = (void *)rsp + received; + saved_dw = rsp->doe_header; + } while (entry_handle != CXL_DOE_TABLE_ACCESS_LAST_ENTRY); + + /* Length in CDAT header may exceed concatenation of CDAT entries */ + *length -= remaining; + + return 0; +} + +static unsigned char cdat_checksum(void *buf, size_t size) +{ + unsigned char sum, *data = buf; + size_t i; + + for (sum = 0, i = 0; i < size; i++) + sum += data[i]; + return sum; +} + +/** + * read_cdat_data - Read the CDAT data on this port + * @port: Port to read data from + * + * This call will sleep waiting for responses from the DOE mailbox. + */ +void read_cdat_data(struct cxl_port *port) +{ + struct device *uport = port->uport_dev; + struct device *dev = &port->dev; + struct pci_doe_mb *doe_mb; + struct pci_dev *pdev = NULL; + struct cxl_memdev *cxlmd; + struct cdat_doe_rsp *buf; + size_t table_length, length; + int rc; + + if (is_cxl_memdev(uport)) { + struct device *host; + + cxlmd = to_cxl_memdev(uport); + host = cxlmd->dev.parent; + if (dev_is_pci(host)) + pdev = to_pci_dev(host); + } else if (dev_is_pci(uport)) { + pdev = to_pci_dev(uport); + } + + if (!pdev) + return; + + doe_mb = pci_find_doe_mailbox(pdev, PCI_VENDOR_ID_CXL, + CXL_DOE_PROTOCOL_TABLE_ACCESS); + if (!doe_mb) { + dev_dbg(dev, "No CDAT mailbox\n"); + return; + } + + port->cdat_available = true; + + if (cxl_cdat_get_length(dev, doe_mb, &length)) { + dev_dbg(dev, "No CDAT length\n"); + return; + } + + /* + * The begin of the CDAT buffer needs space for additional 4 + * bytes for the DOE header. Table data starts afterwards. 
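+	 *
+	 * Illustrative layout of the allocated buffer (assuming struct
+	 * cdat_doe_rsp is the 4-byte DOE header followed by a flexible
+	 * data[] array, as consumed by cxl_cdat_read_table() above):
+	 *
+	 *   [ doe_header: 4 bytes ][ CDAT header | CDAT entries ... ]
+	 *                           ^-- buf->data, published below as
+	 *                               port->cdat.table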
+ */ + buf = devm_kzalloc(dev, sizeof(*buf) + length, GFP_KERNEL); + if (!buf) + goto err; + + table_length = length; + + rc = cxl_cdat_read_table(dev, doe_mb, buf, &length); + if (rc) + goto err; + + if (table_length != length) + dev_warn(dev, "Malformed CDAT table length (%zu:%zu), discarding trailing data\n", + table_length, length); + + if (cdat_checksum(buf->data, length)) + goto err; + + port->cdat.table = buf->data; + port->cdat.length = length; + + return; +err: + /* Don't leave table data allocated on error */ + devm_kfree(dev, buf); + dev_err(dev, "Failed to read/validate CDAT.\n"); +} +EXPORT_SYMBOL_NS_GPL(read_cdat_data, "CXL"); + +static void __cxl_handle_cor_ras(struct cxl_dev_state *cxlds, + void __iomem *ras_base) +{ + void __iomem *addr; + u32 status; + + if (!ras_base) + return; + + addr = ras_base + CXL_RAS_CORRECTABLE_STATUS_OFFSET; + status = readl(addr); + if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) { + writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr); + trace_cxl_aer_correctable_error(cxlds->cxlmd, status); + } +} + +static void cxl_handle_endpoint_cor_ras(struct cxl_dev_state *cxlds) +{ + return __cxl_handle_cor_ras(cxlds, cxlds->regs.ras); +} + +/* CXL spec rev3.0 8.2.4.16.1 */ +static void header_log_copy(void __iomem *ras_base, u32 *log) +{ + void __iomem *addr; + u32 *log_addr; + int i, log_u32_size = CXL_HEADERLOG_SIZE / sizeof(u32); + + addr = ras_base + CXL_RAS_HEADER_LOG_OFFSET; + log_addr = log; + + for (i = 0; i < log_u32_size; i++) { + *log_addr = readl(addr); + log_addr++; + addr += sizeof(u32); + } +} + +/* + * Log the state of the RAS status registers and prepare them to log the + * next error status. Return 1 if reset needed. + */ +static bool __cxl_handle_ras(struct cxl_dev_state *cxlds, + void __iomem *ras_base) +{ + u32 hl[CXL_HEADERLOG_SIZE_U32]; + void __iomem *addr; + u32 status; + u32 fe; + + if (!ras_base) + return false; + + addr = ras_base + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET; + status = readl(addr); + if (!(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK)) + return false; + + /* If multiple errors, log header points to first error from ctrl reg */ + if (hweight32(status) > 1) { + void __iomem *rcc_addr = + ras_base + CXL_RAS_CAP_CONTROL_OFFSET; + + fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK, + readl(rcc_addr))); + } else { + fe = status; + } + + header_log_copy(ras_base, hl); + trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe, hl); + writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr); + + return true; +} + +static bool cxl_handle_endpoint_ras(struct cxl_dev_state *cxlds) +{ + return __cxl_handle_ras(cxlds, cxlds->regs.ras); +} + +#ifdef CONFIG_PCIEAER_CXL + +static void cxl_dport_map_rch_aer(struct cxl_dport *dport) +{ + resource_size_t aer_phys; + struct device *host; + u16 aer_cap; + + aer_cap = cxl_rcrb_to_aer(dport->dport_dev, dport->rcrb.base); + if (aer_cap) { + host = dport->reg_map.host; + aer_phys = aer_cap + dport->rcrb.base; + dport->regs.dport_aer = devm_cxl_iomap_block(host, aer_phys, + sizeof(struct aer_capability_regs)); + } +} + +static void cxl_dport_map_ras(struct cxl_dport *dport) +{ + struct cxl_register_map *map = &dport->reg_map; + struct device *dev = dport->dport_dev; + + if (!map->component_map.ras.valid) + dev_dbg(dev, "RAS registers not found\n"); + else if (cxl_map_component_regs(map, &dport->regs.component, + BIT(CXL_CM_CAP_CAP_ID_RAS))) + dev_dbg(dev, "Failed to map RAS capability.\n"); +} + +static void cxl_disable_rch_root_ints(struct cxl_dport *dport) +{ + void __iomem *aer_base = 
dport->regs.dport_aer; + u32 aer_cmd_mask, aer_cmd; + + if (!aer_base) + return; + + /* + * Disable RCH root port command interrupts. + * CXL 3.0 12.2.1.1 - RCH Downstream Port-detected Errors + * + * This sequence may not be necessary. CXL spec states disabling + * the root cmd register's interrupts is required. But, PCI spec + * shows these are disabled by default on reset. + */ + aer_cmd_mask = (PCI_ERR_ROOT_CMD_COR_EN | + PCI_ERR_ROOT_CMD_NONFATAL_EN | + PCI_ERR_ROOT_CMD_FATAL_EN); + aer_cmd = readl(aer_base + PCI_ERR_ROOT_COMMAND); + aer_cmd &= ~aer_cmd_mask; + writel(aer_cmd, aer_base + PCI_ERR_ROOT_COMMAND); +} + +/** + * cxl_dport_init_ras_reporting - Setup CXL RAS report on this dport + * @dport: the cxl_dport that needs to be initialized + * @host: host device for devm operations + */ +void cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host) +{ + dport->reg_map.host = host; + cxl_dport_map_ras(dport); + + if (dport->rch) { + struct pci_host_bridge *host_bridge = to_pci_host_bridge(dport->dport_dev); + + if (!host_bridge->native_aer) + return; + + cxl_dport_map_rch_aer(dport); + cxl_disable_rch_root_ints(dport); + } +} +EXPORT_SYMBOL_NS_GPL(cxl_dport_init_ras_reporting, "CXL"); + +static void cxl_handle_rdport_cor_ras(struct cxl_dev_state *cxlds, + struct cxl_dport *dport) +{ + return __cxl_handle_cor_ras(cxlds, dport->regs.ras); +} + +static bool cxl_handle_rdport_ras(struct cxl_dev_state *cxlds, + struct cxl_dport *dport) +{ + return __cxl_handle_ras(cxlds, dport->regs.ras); +} + +/* + * Copy the AER capability registers using 32 bit read accesses. + * This is necessary because RCRB AER capability is MMIO mapped. Clear the + * status after copying. + * + * @aer_base: base address of AER capability block in RCRB + * @aer_regs: destination for copying AER capability + */ +static bool cxl_rch_get_aer_info(void __iomem *aer_base, + struct aer_capability_regs *aer_regs) +{ + int read_cnt = sizeof(struct aer_capability_regs) / sizeof(u32); + u32 *aer_regs_buf = (u32 *)aer_regs; + int n; + + if (!aer_base) + return false; + + /* Use readl() to guarantee 32-bit accesses */ + for (n = 0; n < read_cnt; n++) + aer_regs_buf[n] = readl(aer_base + n * sizeof(u32)); + + writel(aer_regs->uncor_status, aer_base + PCI_ERR_UNCOR_STATUS); + writel(aer_regs->cor_status, aer_base + PCI_ERR_COR_STATUS); + + return true; +} + +/* Get AER severity. Return false if there is no error. 
*/ +static bool cxl_rch_get_aer_severity(struct aer_capability_regs *aer_regs, + int *severity) +{ + if (aer_regs->uncor_status & ~aer_regs->uncor_mask) { + if (aer_regs->uncor_status & PCI_ERR_ROOT_FATAL_RCV) + *severity = AER_FATAL; + else + *severity = AER_NONFATAL; + return true; + } + + if (aer_regs->cor_status & ~aer_regs->cor_mask) { + *severity = AER_CORRECTABLE; + return true; + } + + return false; +} + +static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) +{ + struct pci_dev *pdev = to_pci_dev(cxlds->dev); + struct aer_capability_regs aer_regs; + struct cxl_dport *dport; + int severity; + + struct cxl_port *port __free(put_cxl_port) = + cxl_pci_find_port(pdev, &dport); + if (!port) + return; + + if (!cxl_rch_get_aer_info(dport->regs.dport_aer, &aer_regs)) + return; + + if (!cxl_rch_get_aer_severity(&aer_regs, &severity)) + return; + + pci_print_aer(pdev, severity, &aer_regs); + + if (severity == AER_CORRECTABLE) + cxl_handle_rdport_cor_ras(cxlds, dport); + else + cxl_handle_rdport_ras(cxlds, dport); +} + +#else +static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) { } +#endif + +void cxl_cor_error_detected(struct pci_dev *pdev) +{ + struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); + struct device *dev = &cxlds->cxlmd->dev; + + scoped_guard(device, dev) { + if (!dev->driver) { + dev_warn(&pdev->dev, + "%s: memdev disabled, abort error handling\n", + dev_name(dev)); + return; + } + + if (cxlds->rcd) + cxl_handle_rdport_errors(cxlds); + + cxl_handle_endpoint_cor_ras(cxlds); + } +} +EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, "CXL"); + +pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, + pci_channel_state_t state) +{ + struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); + struct cxl_memdev *cxlmd = cxlds->cxlmd; + struct device *dev = &cxlmd->dev; + bool ue; + + scoped_guard(device, dev) { + if (!dev->driver) { + dev_warn(&pdev->dev, + "%s: memdev disabled, abort error handling\n", + dev_name(dev)); + return PCI_ERS_RESULT_DISCONNECT; + } + + if (cxlds->rcd) + cxl_handle_rdport_errors(cxlds); + /* + * A frozen channel indicates an impending reset which is fatal to + * CXL.mem operation, and will likely crash the system. On the off + * chance the situation is recoverable dump the status of the RAS + * capability registers and bounce the active state of the memdev. 
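+		 *
+		 * Roughly, the switch statement below maps the channel state to:
+		 *   io_normal with uncorrectable RAS status: NEED_RESET (driver released)
+		 *   io_normal, no uncorrectable status:      CAN_RECOVER
+		 *   io_frozen:                               NEED_RESET (driver released)
+		 *   io_perm_failure:                         DISCONNECT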
+ */ + ue = cxl_handle_endpoint_ras(cxlds); + } + + + switch (state) { + case pci_channel_io_normal: + if (ue) { + device_release_driver(dev); + return PCI_ERS_RESULT_NEED_RESET; + } + return PCI_ERS_RESULT_CAN_RECOVER; + case pci_channel_io_frozen: + dev_warn(&pdev->dev, + "%s: frozen state error detected, disable CXL.mem\n", + dev_name(dev)); + device_release_driver(dev); + return PCI_ERS_RESULT_NEED_RESET; + case pci_channel_io_perm_failure: + dev_warn(&pdev->dev, + "failure state error detected, request disconnect\n"); + return PCI_ERS_RESULT_DISCONNECT; + } + return PCI_ERS_RESULT_NEED_RESET; +} +EXPORT_SYMBOL_NS_GPL(cxl_error_detected, "CXL"); + +static int cxl_flit_size(struct pci_dev *pdev) +{ + if (cxl_pci_flit_256(pdev)) + return 256; + + return 68; +} + +/** + * cxl_pci_get_latency - calculate the link latency for the PCIe link + * @pdev: PCI device + * + * return: calculated latency or 0 for no latency + * + * CXL Memory Device SW Guide v1.0 2.11.4 Link latency calculation + * Link latency = LinkPropagationLatency + FlitLatency + RetimerLatency + * LinkProgationLatency is negligible, so 0 will be used + * RetimerLatency is assumed to be negligible and 0 will be used + * FlitLatency = FlitSize / LinkBandwidth + * FlitSize is defined by spec. CXL rev3.0 4.2.1. + * 68B flit is used up to 32GT/s. >32GT/s, 256B flit size is used. + * The FlitLatency is converted to picoseconds. + */ +long cxl_pci_get_latency(struct pci_dev *pdev) +{ + long bw; + + bw = pcie_link_speed_mbps(pdev); + if (bw < 0) + return 0; + bw /= BITS_PER_BYTE; + + return cxl_flit_size(pdev) * MEGA / bw; +} + +static int __cxl_endpoint_decoder_reset_detected(struct device *dev, void *data) +{ + struct cxl_port *port = data; + struct cxl_decoder *cxld; + struct cxl_hdm *cxlhdm; + void __iomem *hdm; + u32 ctrl; + + if (!is_endpoint_decoder(dev)) + return 0; + + cxld = to_cxl_decoder(dev); + if ((cxld->flags & CXL_DECODER_F_ENABLE) == 0) + return 0; + + cxlhdm = dev_get_drvdata(&port->dev); + hdm = cxlhdm->regs.hdm_decoder; + ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(cxld->id)); + + return !FIELD_GET(CXL_HDM_DECODER0_CTRL_COMMITTED, ctrl); +} + +bool cxl_endpoint_decoder_reset_detected(struct cxl_port *port) +{ + return device_for_each_child(&port->dev, port, + __cxl_endpoint_decoder_reset_detected); +} +EXPORT_SYMBOL_NS_GPL(cxl_endpoint_decoder_reset_detected, "CXL"); + +int cxl_pci_get_bandwidth(struct pci_dev *pdev, struct access_coordinate *c) +{ + int speed, bw; + u16 lnksta; + u32 width; + + speed = pcie_link_speed_mbps(pdev); + if (speed < 0) + return speed; + speed /= BITS_PER_BYTE; + + pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &lnksta); + width = FIELD_GET(PCI_EXP_LNKSTA_NLW, lnksta); + bw = speed * width; + + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) { + c[i].read_bandwidth = bw; + c[i].write_bandwidth = bw; + } + + return 0; +} + +/* + * Set max timeout such that platforms will optimize GPF flow to avoid + * the implied worst-case scenario delays. On a sane platform, all + * devices should always complete GPF within the energy budget of + * the GPF flow. The kernel does not have enough information to pick + * anything better than "maximize timeouts and hope it works". + * + * A misbehaving device could block forward progress of GPF for all + * the other devices, exhausting the energy budget of the platform. + * However, the spec seems to assume that moving on from slow to respond + * devices is a virtue. 
It is not possible to know that, in actuality, + * the slow to respond device is *the* most critical device in the + * system to wait. + */ +#define GPF_TIMEOUT_BASE_MAX 2 +#define GPF_TIMEOUT_SCALE_MAX 7 /* 10 seconds */ + +u16 cxl_gpf_get_dvsec(struct device *dev) +{ + struct pci_dev *pdev; + bool is_port = true; + u16 dvsec; + + if (!dev_is_pci(dev)) + return 0; + + pdev = to_pci_dev(dev); + if (pci_pcie_type(pdev) == PCI_EXP_TYPE_ENDPOINT) + is_port = false; + + dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL, + is_port ? CXL_DVSEC_PORT_GPF : CXL_DVSEC_DEVICE_GPF); + if (!dvsec) + dev_warn(dev, "%s GPF DVSEC not present\n", + is_port ? "Port" : "Device"); + return dvsec; +} +EXPORT_SYMBOL_NS_GPL(cxl_gpf_get_dvsec, "CXL"); + +static int update_gpf_port_dvsec(struct pci_dev *pdev, int dvsec, int phase) +{ + u64 base, scale; + int rc, offset; + u16 ctrl; + + switch (phase) { + case 1: + offset = CXL_DVSEC_PORT_GPF_PHASE_1_CONTROL_OFFSET; + base = CXL_DVSEC_PORT_GPF_PHASE_1_TMO_BASE_MASK; + scale = CXL_DVSEC_PORT_GPF_PHASE_1_TMO_SCALE_MASK; + break; + case 2: + offset = CXL_DVSEC_PORT_GPF_PHASE_2_CONTROL_OFFSET; + base = CXL_DVSEC_PORT_GPF_PHASE_2_TMO_BASE_MASK; + scale = CXL_DVSEC_PORT_GPF_PHASE_2_TMO_SCALE_MASK; + break; + default: + return -EINVAL; + } + + rc = pci_read_config_word(pdev, dvsec + offset, &ctrl); + if (rc) + return rc; + + if (FIELD_GET(base, ctrl) == GPF_TIMEOUT_BASE_MAX && + FIELD_GET(scale, ctrl) == GPF_TIMEOUT_SCALE_MAX) + return 0; + + ctrl = FIELD_PREP(base, GPF_TIMEOUT_BASE_MAX); + ctrl |= FIELD_PREP(scale, GPF_TIMEOUT_SCALE_MAX); + + rc = pci_write_config_word(pdev, dvsec + offset, ctrl); + if (!rc) + pci_dbg(pdev, "Port GPF phase %d timeout: %d0 secs\n", + phase, GPF_TIMEOUT_BASE_MAX); + + return rc; +} + +int cxl_gpf_port_setup(struct cxl_dport *dport) +{ + if (!dport) + return -EINVAL; + + if (!dport->gpf_dvsec) { + struct pci_dev *pdev; + int dvsec; + + dvsec = cxl_gpf_get_dvsec(dport->dport_dev); + if (!dvsec) + return -EINVAL; + + dport->gpf_dvsec = dvsec; + pdev = to_pci_dev(dport->dport_dev); + update_gpf_port_dvsec(pdev, dport->gpf_dvsec, 1); + update_gpf_port_dvsec(pdev, dport->gpf_dvsec, 2); + } + + return 0; +} + +struct cxl_walk_context { + struct pci_bus *bus; + struct cxl_port *port; + int type; + int error; + int count; +}; + +static int count_dports(struct pci_dev *pdev, void *data) +{ + struct cxl_walk_context *ctx = data; + int type = pci_pcie_type(pdev); + + if (pdev->bus != ctx->bus) + return 0; + if (!pci_is_pcie(pdev)) + return 0; + if (type != ctx->type) + return 0; + + ctx->count++; + return 0; +} + +int cxl_port_get_possible_dports(struct cxl_port *port) +{ + struct pci_bus *bus = cxl_port_to_pci_bus(port); + struct cxl_walk_context ctx; + int type; + + if (!bus) { + dev_err(&port->dev, "No PCI bus found for port %s\n", + dev_name(&port->dev)); + return -ENXIO; + } + + if (pci_is_root_bus(bus)) + type = PCI_EXP_TYPE_ROOT_PORT; + else + type = PCI_EXP_TYPE_DOWNSTREAM; + + ctx = (struct cxl_walk_context) { + .bus = bus, + .type = type, + }; + pci_walk_bus(bus, count_dports, &ctx); + + return ctx.count; +} diff --git a/drivers/cxl/core/pmem.c b/drivers/cxl/core/pmem.c new file mode 100644 index 000000000000..8853415c106a --- /dev/null +++ b/drivers/cxl/core/pmem.c @@ -0,0 +1,287 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2020 Intel Corporation. 
*/ +#include <linux/device.h> +#include <linux/slab.h> +#include <linux/idr.h> +#include <cxlmem.h> +#include <cxl.h> +#include "core.h" + +/** + * DOC: cxl pmem + * + * The core CXL PMEM infrastructure supports persistent memory + * provisioning and serves as a bridge to the LIBNVDIMM subsystem. A CXL + * 'bridge' device is added at the root of a CXL device topology if + * platform firmware advertises at least one persistent memory capable + * CXL window. That root-level bridge corresponds to a LIBNVDIMM 'bus' + * device. Then for each cxl_memdev in the CXL device topology a bridge + * device is added to host a LIBNVDIMM dimm object. When these bridges + * are registered native LIBNVDIMM uapis are translated to CXL + * operations, for example, namespace label access commands. + */ + +static DEFINE_IDA(cxl_nvdimm_bridge_ida); + +static void cxl_nvdimm_bridge_release(struct device *dev) +{ + struct cxl_nvdimm_bridge *cxl_nvb = to_cxl_nvdimm_bridge(dev); + + ida_free(&cxl_nvdimm_bridge_ida, cxl_nvb->id); + kfree(cxl_nvb); +} + +static const struct attribute_group *cxl_nvdimm_bridge_attribute_groups[] = { + &cxl_base_attribute_group, + NULL, +}; + +const struct device_type cxl_nvdimm_bridge_type = { + .name = "cxl_nvdimm_bridge", + .release = cxl_nvdimm_bridge_release, + .groups = cxl_nvdimm_bridge_attribute_groups, +}; + +struct cxl_nvdimm_bridge *to_cxl_nvdimm_bridge(struct device *dev) +{ + if (dev_WARN_ONCE(dev, dev->type != &cxl_nvdimm_bridge_type, + "not a cxl_nvdimm_bridge device\n")) + return NULL; + return container_of(dev, struct cxl_nvdimm_bridge, dev); +} +EXPORT_SYMBOL_NS_GPL(to_cxl_nvdimm_bridge, "CXL"); + +/** + * cxl_find_nvdimm_bridge() - find a bridge device relative to a port + * @port: any descendant port of an nvdimm-bridge associated + * root-cxl-port + */ +struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct cxl_port *port) +{ + struct cxl_root *cxl_root __free(put_cxl_root) = find_cxl_root(port); + struct device *dev; + + if (!cxl_root) + return NULL; + + dev = device_find_child(&cxl_root->port.dev, + &cxl_nvdimm_bridge_type, + device_match_type); + + if (!dev) + return NULL; + + return to_cxl_nvdimm_bridge(dev); +} +EXPORT_SYMBOL_NS_GPL(cxl_find_nvdimm_bridge, "CXL"); + +static struct lock_class_key cxl_nvdimm_bridge_key; + +static struct cxl_nvdimm_bridge *cxl_nvdimm_bridge_alloc(struct cxl_port *port) +{ + struct cxl_nvdimm_bridge *cxl_nvb; + struct device *dev; + int rc; + + cxl_nvb = kzalloc(sizeof(*cxl_nvb), GFP_KERNEL); + if (!cxl_nvb) + return ERR_PTR(-ENOMEM); + + rc = ida_alloc(&cxl_nvdimm_bridge_ida, GFP_KERNEL); + if (rc < 0) + goto err; + cxl_nvb->id = rc; + + dev = &cxl_nvb->dev; + cxl_nvb->port = port; + device_initialize(dev); + lockdep_set_class(&dev->mutex, &cxl_nvdimm_bridge_key); + device_set_pm_not_required(dev); + dev->parent = &port->dev; + dev->bus = &cxl_bus_type; + dev->type = &cxl_nvdimm_bridge_type; + + return cxl_nvb; + +err: + kfree(cxl_nvb); + return ERR_PTR(rc); +} + +static void unregister_nvb(void *_cxl_nvb) +{ + struct cxl_nvdimm_bridge *cxl_nvb = _cxl_nvb; + + device_unregister(&cxl_nvb->dev); +} + +/** + * devm_cxl_add_nvdimm_bridge() - add the root of a LIBNVDIMM topology + * @host: platform firmware root device + * @port: CXL port at the root of a CXL topology + * + * Return: bridge device that can host cxl_nvdimm objects + */ +struct cxl_nvdimm_bridge *devm_cxl_add_nvdimm_bridge(struct device *host, + struct cxl_port *port) +{ + struct cxl_nvdimm_bridge *cxl_nvb; + struct device *dev; + int rc; + + if 
(!IS_ENABLED(CONFIG_CXL_PMEM)) + return ERR_PTR(-ENXIO); + + cxl_nvb = cxl_nvdimm_bridge_alloc(port); + if (IS_ERR(cxl_nvb)) + return cxl_nvb; + + dev = &cxl_nvb->dev; + rc = dev_set_name(dev, "nvdimm-bridge%d", cxl_nvb->id); + if (rc) + goto err; + + rc = device_add(dev); + if (rc) + goto err; + + rc = devm_add_action_or_reset(host, unregister_nvb, cxl_nvb); + if (rc) + return ERR_PTR(rc); + + return cxl_nvb; + +err: + put_device(dev); + return ERR_PTR(rc); +} +EXPORT_SYMBOL_NS_GPL(devm_cxl_add_nvdimm_bridge, "CXL"); + +static void cxl_nvdimm_release(struct device *dev) +{ + struct cxl_nvdimm *cxl_nvd = to_cxl_nvdimm(dev); + + kfree(cxl_nvd); +} + +static const struct attribute_group *cxl_nvdimm_attribute_groups[] = { + &cxl_base_attribute_group, + NULL, +}; + +const struct device_type cxl_nvdimm_type = { + .name = "cxl_nvdimm", + .release = cxl_nvdimm_release, + .groups = cxl_nvdimm_attribute_groups, +}; + +bool is_cxl_nvdimm(struct device *dev) +{ + return dev->type == &cxl_nvdimm_type; +} +EXPORT_SYMBOL_NS_GPL(is_cxl_nvdimm, "CXL"); + +struct cxl_nvdimm *to_cxl_nvdimm(struct device *dev) +{ + if (dev_WARN_ONCE(dev, !is_cxl_nvdimm(dev), + "not a cxl_nvdimm device\n")) + return NULL; + return container_of(dev, struct cxl_nvdimm, dev); +} +EXPORT_SYMBOL_NS_GPL(to_cxl_nvdimm, "CXL"); + +static struct lock_class_key cxl_nvdimm_key; + +static struct cxl_nvdimm *cxl_nvdimm_alloc(struct cxl_nvdimm_bridge *cxl_nvb, + struct cxl_memdev *cxlmd) +{ + struct cxl_nvdimm *cxl_nvd; + struct device *dev; + + cxl_nvd = kzalloc(sizeof(*cxl_nvd), GFP_KERNEL); + if (!cxl_nvd) + return ERR_PTR(-ENOMEM); + + dev = &cxl_nvd->dev; + cxl_nvd->cxlmd = cxlmd; + cxlmd->cxl_nvd = cxl_nvd; + device_initialize(dev); + lockdep_set_class(&dev->mutex, &cxl_nvdimm_key); + device_set_pm_not_required(dev); + dev->parent = &cxlmd->dev; + dev->bus = &cxl_bus_type; + dev->type = &cxl_nvdimm_type; + /* + * A "%llx" string is 17-bytes vs dimm_id that is max + * NVDIMM_KEY_DESC_LEN + */ + BUILD_BUG_ON(sizeof(cxl_nvd->dev_id) < 17 || + sizeof(cxl_nvd->dev_id) > NVDIMM_KEY_DESC_LEN); + sprintf(cxl_nvd->dev_id, "%llx", cxlmd->cxlds->serial); + + return cxl_nvd; +} + +static void cxlmd_release_nvdimm(void *_cxlmd) +{ + struct cxl_memdev *cxlmd = _cxlmd; + struct cxl_nvdimm *cxl_nvd = cxlmd->cxl_nvd; + struct cxl_nvdimm_bridge *cxl_nvb = cxlmd->cxl_nvb; + + cxl_nvd->cxlmd = NULL; + cxlmd->cxl_nvd = NULL; + cxlmd->cxl_nvb = NULL; + device_unregister(&cxl_nvd->dev); + put_device(&cxl_nvb->dev); +} + +/** + * devm_cxl_add_nvdimm() - add a bridge between a cxl_memdev and an nvdimm + * @parent_port: parent port for the (to be added) @cxlmd endpoint port + * @cxlmd: cxl_memdev instance that will perform LIBNVDIMM operations + * + * Return: 0 on success negative error code on failure. 
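+ *
+ * On success a "pmem%d" cxl_nvdimm device is registered under @cxlmd,
+ * and @cxlmd holds a reference on the nvdimm-bridge until
+ * cxlmd_release_nvdimm() runs from its devm release action.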
+ */ +int devm_cxl_add_nvdimm(struct cxl_port *parent_port, + struct cxl_memdev *cxlmd) +{ + struct cxl_nvdimm_bridge *cxl_nvb; + struct cxl_nvdimm *cxl_nvd; + struct device *dev; + int rc; + + cxl_nvb = cxl_find_nvdimm_bridge(parent_port); + if (!cxl_nvb) + return -ENODEV; + + cxl_nvd = cxl_nvdimm_alloc(cxl_nvb, cxlmd); + if (IS_ERR(cxl_nvd)) { + rc = PTR_ERR(cxl_nvd); + goto err_alloc; + } + cxlmd->cxl_nvb = cxl_nvb; + + dev = &cxl_nvd->dev; + rc = dev_set_name(dev, "pmem%d", cxlmd->id); + if (rc) + goto err; + + rc = device_add(dev); + if (rc) + goto err; + + dev_dbg(&cxlmd->dev, "register %s\n", dev_name(dev)); + + /* @cxlmd carries a reference on @cxl_nvb until cxlmd_release_nvdimm */ + return devm_add_action_or_reset(&cxlmd->dev, cxlmd_release_nvdimm, cxlmd); + +err: + put_device(dev); +err_alloc: + cxlmd->cxl_nvb = NULL; + cxlmd->cxl_nvd = NULL; + put_device(&cxl_nvb->dev); + + return rc; +} +EXPORT_SYMBOL_NS_GPL(devm_cxl_add_nvdimm, "CXL"); diff --git a/drivers/cxl/core/pmu.c b/drivers/cxl/core/pmu.c new file mode 100644 index 000000000000..b3136d7664ab --- /dev/null +++ b/drivers/cxl/core/pmu.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2023 Huawei. All rights reserved. */ + +#include <linux/device.h> +#include <linux/slab.h> +#include <linux/idr.h> +#include <cxlmem.h> +#include <pmu.h> +#include <cxl.h> +#include "core.h" + +static void cxl_pmu_release(struct device *dev) +{ + struct cxl_pmu *pmu = to_cxl_pmu(dev); + + kfree(pmu); +} + +const struct device_type cxl_pmu_type = { + .name = "cxl_pmu", + .release = cxl_pmu_release, +}; + +static void remove_dev(void *dev) +{ + device_unregister(dev); +} + +int devm_cxl_pmu_add(struct device *parent, struct cxl_pmu_regs *regs, + int assoc_id, int index, enum cxl_pmu_type type) +{ + struct cxl_pmu *pmu; + struct device *dev; + int rc; + + pmu = kzalloc(sizeof(*pmu), GFP_KERNEL); + if (!pmu) + return -ENOMEM; + + pmu->assoc_id = assoc_id; + pmu->index = index; + pmu->type = type; + pmu->base = regs->pmu; + dev = &pmu->dev; + device_initialize(dev); + device_set_pm_not_required(dev); + dev->parent = parent; + dev->bus = &cxl_bus_type; + dev->type = &cxl_pmu_type; + switch (pmu->type) { + case CXL_PMU_MEMDEV: + rc = dev_set_name(dev, "pmu_mem%d.%d", assoc_id, index); + break; + } + if (rc) + goto err; + + rc = device_add(dev); + if (rc) + goto err; + + return devm_add_action_or_reset(parent, remove_dev, dev); + +err: + put_device(&pmu->dev); + return rc; +} +EXPORT_SYMBOL_NS_GPL(devm_cxl_pmu_add, "CXL"); diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c new file mode 100644 index 000000000000..fef3aa0c6680 --- /dev/null +++ b/drivers/cxl/core/port.c @@ -0,0 +1,2532 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2020 Intel Corporation. All rights reserved. */ +#include <linux/platform_device.h> +#include <linux/memregion.h> +#include <linux/workqueue.h> +#include <linux/debugfs.h> +#include <linux/device.h> +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/slab.h> +#include <linux/idr.h> +#include <linux/node.h> +#include <cxl/einj.h> +#include <cxlmem.h> +#include <cxlpci.h> +#include <cxl.h> +#include "core.h" + +/** + * DOC: cxl core + * + * The CXL core provides a set of interfaces that can be consumed by CXL aware + * drivers. The interfaces allow for creation, modification, and destruction of + * regions, memory devices, ports, and decoders. 
CXL aware drivers must register + * with the CXL core via these interfaces in order to be able to participate in + * cross-device interleave coordination. The CXL core also establishes and + * maintains the bridge to the nvdimm subsystem. + * + * CXL core introduces sysfs hierarchy to control the devices that are + * instantiated by the core. + */ + +static DEFINE_IDA(cxl_port_ida); +static DEFINE_XARRAY(cxl_root_buses); + +/* + * The terminal device in PCI is NULL and @platform_bus + * for platform devices (for cxl_test) + */ +static bool is_cxl_host_bridge(struct device *dev) +{ + return (!dev || dev == &platform_bus); +} + +int cxl_num_decoders_committed(struct cxl_port *port) +{ + lockdep_assert_held(&cxl_rwsem.region); + + return port->commit_end + 1; +} + +static ssize_t devtype_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "%s\n", dev->type->name); +} +static DEVICE_ATTR_RO(devtype); + +static int cxl_device_id(const struct device *dev) +{ + if (dev->type == &cxl_nvdimm_bridge_type) + return CXL_DEVICE_NVDIMM_BRIDGE; + if (dev->type == &cxl_nvdimm_type) + return CXL_DEVICE_NVDIMM; + if (dev->type == CXL_PMEM_REGION_TYPE()) + return CXL_DEVICE_PMEM_REGION; + if (dev->type == CXL_DAX_REGION_TYPE()) + return CXL_DEVICE_DAX_REGION; + if (is_cxl_port(dev)) { + if (is_cxl_root(to_cxl_port(dev))) + return CXL_DEVICE_ROOT; + return CXL_DEVICE_PORT; + } + if (is_cxl_memdev(dev)) + return CXL_DEVICE_MEMORY_EXPANDER; + if (dev->type == CXL_REGION_TYPE()) + return CXL_DEVICE_REGION; + if (dev->type == &cxl_pmu_type) + return CXL_DEVICE_PMU; + return 0; +} + +static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, CXL_MODALIAS_FMT "\n", cxl_device_id(dev)); +} +static DEVICE_ATTR_RO(modalias); + +static struct attribute *cxl_base_attributes[] = { + &dev_attr_devtype.attr, + &dev_attr_modalias.attr, + NULL, +}; + +struct attribute_group cxl_base_attribute_group = { + .attrs = cxl_base_attributes, +}; + +static ssize_t start_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cxl_decoder *cxld = to_cxl_decoder(dev); + + return sysfs_emit(buf, "%#llx\n", cxld->hpa_range.start); +} +static DEVICE_ATTR_ADMIN_RO(start); + +static ssize_t size_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cxl_decoder *cxld = to_cxl_decoder(dev); + + return sysfs_emit(buf, "%#llx\n", range_len(&cxld->hpa_range)); +} +static DEVICE_ATTR_RO(size); + +#define CXL_DECODER_FLAG_ATTR(name, flag) \ +static ssize_t name##_show(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct cxl_decoder *cxld = to_cxl_decoder(dev); \ + \ + return sysfs_emit(buf, "%s\n", \ + (cxld->flags & (flag)) ? 
"1" : "0"); \ +} \ +static DEVICE_ATTR_RO(name) + +CXL_DECODER_FLAG_ATTR(cap_pmem, CXL_DECODER_F_PMEM); +CXL_DECODER_FLAG_ATTR(cap_ram, CXL_DECODER_F_RAM); +CXL_DECODER_FLAG_ATTR(cap_type2, CXL_DECODER_F_TYPE2); +CXL_DECODER_FLAG_ATTR(cap_type3, CXL_DECODER_F_TYPE3); +CXL_DECODER_FLAG_ATTR(locked, CXL_DECODER_F_LOCK); + +static ssize_t target_type_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cxl_decoder *cxld = to_cxl_decoder(dev); + + switch (cxld->target_type) { + case CXL_DECODER_DEVMEM: + return sysfs_emit(buf, "accelerator\n"); + case CXL_DECODER_HOSTONLYMEM: + return sysfs_emit(buf, "expander\n"); + } + return -ENXIO; +} +static DEVICE_ATTR_RO(target_type); + +static ssize_t emit_target_list(struct cxl_switch_decoder *cxlsd, char *buf) +{ + struct cxl_decoder *cxld = &cxlsd->cxld; + ssize_t offset = 0; + int i, rc = 0; + + for (i = 0; i < cxld->interleave_ways; i++) { + struct cxl_dport *dport = cxlsd->target[i]; + struct cxl_dport *next = NULL; + + if (!dport) + break; + + if (i + 1 < cxld->interleave_ways) + next = cxlsd->target[i + 1]; + rc = sysfs_emit_at(buf, offset, "%d%s", dport->port_id, + next ? "," : ""); + if (rc < 0) + return rc; + offset += rc; + } + + return offset; +} + +static ssize_t target_list_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cxl_switch_decoder *cxlsd = to_cxl_switch_decoder(dev); + ssize_t offset; + int rc; + + guard(rwsem_read)(&cxl_rwsem.region); + rc = emit_target_list(cxlsd, buf); + if (rc < 0) + return rc; + offset = rc; + + rc = sysfs_emit_at(buf, offset, "\n"); + if (rc < 0) + return rc; + + return offset + rc; +} +static DEVICE_ATTR_RO(target_list); + +static ssize_t mode_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev); + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + /* without @cxl_rwsem.dpa, make sure @part is not reloaded */ + int part = READ_ONCE(cxled->part); + const char *desc; + + if (part < 0) + desc = "none"; + else + desc = cxlds->part[part].res.name; + + return sysfs_emit(buf, "%s\n", desc); +} + +static ssize_t mode_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev); + enum cxl_partition_mode mode; + ssize_t rc; + + if (sysfs_streq(buf, "pmem")) + mode = CXL_PARTMODE_PMEM; + else if (sysfs_streq(buf, "ram")) + mode = CXL_PARTMODE_RAM; + else + return -EINVAL; + + rc = cxl_dpa_set_part(cxled, mode); + if (rc) + return rc; + + return len; +} +static DEVICE_ATTR_RW(mode); + +static ssize_t dpa_resource_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev); + + guard(rwsem_read)(&cxl_rwsem.dpa); + return sysfs_emit(buf, "%#llx\n", (u64)cxl_dpa_resource_start(cxled)); +} +static DEVICE_ATTR_RO(dpa_resource); + +static ssize_t dpa_size_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev); + resource_size_t size = cxl_dpa_size(cxled); + + return sysfs_emit(buf, "%pa\n", &size); +} + +static ssize_t dpa_size_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev); + unsigned long long size; + ssize_t rc; + + rc = kstrtoull(buf, 0, &size); + if (rc) + 
return rc; + + if (!IS_ALIGNED(size, SZ_256M)) + return -EINVAL; + + rc = cxl_dpa_free(cxled); + if (rc) + return rc; + + if (size == 0) + return len; + + rc = cxl_dpa_alloc(cxled, size); + if (rc) + return rc; + + return len; +} +static DEVICE_ATTR_RW(dpa_size); + +static ssize_t interleave_granularity_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct cxl_decoder *cxld = to_cxl_decoder(dev); + + return sysfs_emit(buf, "%d\n", cxld->interleave_granularity); +} + +static DEVICE_ATTR_RO(interleave_granularity); + +static ssize_t interleave_ways_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cxl_decoder *cxld = to_cxl_decoder(dev); + + return sysfs_emit(buf, "%d\n", cxld->interleave_ways); +} + +static DEVICE_ATTR_RO(interleave_ways); + +static ssize_t qos_class_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev); + + return sysfs_emit(buf, "%d\n", cxlrd->qos_class); +} +static DEVICE_ATTR_RO(qos_class); + +static struct attribute *cxl_decoder_base_attrs[] = { + &dev_attr_start.attr, + &dev_attr_size.attr, + &dev_attr_locked.attr, + &dev_attr_interleave_granularity.attr, + &dev_attr_interleave_ways.attr, + NULL, +}; + +static struct attribute_group cxl_decoder_base_attribute_group = { + .attrs = cxl_decoder_base_attrs, +}; + +static struct attribute *cxl_decoder_root_attrs[] = { + &dev_attr_cap_pmem.attr, + &dev_attr_cap_ram.attr, + &dev_attr_cap_type2.attr, + &dev_attr_cap_type3.attr, + &dev_attr_target_list.attr, + &dev_attr_qos_class.attr, + SET_CXL_REGION_ATTR(create_pmem_region) + SET_CXL_REGION_ATTR(create_ram_region) + SET_CXL_REGION_ATTR(delete_region) + NULL, +}; + +static bool can_create_pmem(struct cxl_root_decoder *cxlrd) +{ + unsigned long flags = CXL_DECODER_F_TYPE3 | CXL_DECODER_F_PMEM; + + return (cxlrd->cxlsd.cxld.flags & flags) == flags; +} + +static bool can_create_ram(struct cxl_root_decoder *cxlrd) +{ + unsigned long flags = CXL_DECODER_F_TYPE3 | CXL_DECODER_F_RAM; + + return (cxlrd->cxlsd.cxld.flags & flags) == flags; +} + +static umode_t cxl_root_decoder_visible(struct kobject *kobj, struct attribute *a, int n) +{ + struct device *dev = kobj_to_dev(kobj); + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev); + + if (a == CXL_REGION_ATTR(create_pmem_region) && !can_create_pmem(cxlrd)) + return 0; + + if (a == CXL_REGION_ATTR(create_ram_region) && !can_create_ram(cxlrd)) + return 0; + + if (a == CXL_REGION_ATTR(delete_region) && + !(can_create_pmem(cxlrd) || can_create_ram(cxlrd))) + return 0; + + return a->mode; +} + +static struct attribute_group cxl_decoder_root_attribute_group = { + .attrs = cxl_decoder_root_attrs, + .is_visible = cxl_root_decoder_visible, +}; + +static const struct attribute_group *cxl_decoder_root_attribute_groups[] = { + &cxl_decoder_root_attribute_group, + &cxl_decoder_base_attribute_group, + &cxl_base_attribute_group, + NULL, +}; + +static struct attribute *cxl_decoder_switch_attrs[] = { + &dev_attr_target_type.attr, + &dev_attr_target_list.attr, + SET_CXL_REGION_ATTR(region) + NULL, +}; + +static struct attribute_group cxl_decoder_switch_attribute_group = { + .attrs = cxl_decoder_switch_attrs, +}; + +static const struct attribute_group *cxl_decoder_switch_attribute_groups[] = { + &cxl_decoder_switch_attribute_group, + &cxl_decoder_base_attribute_group, + &cxl_base_attribute_group, + NULL, +}; + +static struct attribute *cxl_decoder_endpoint_attrs[] = { + &dev_attr_target_type.attr, + 
&dev_attr_mode.attr, + &dev_attr_dpa_size.attr, + &dev_attr_dpa_resource.attr, + SET_CXL_REGION_ATTR(region) + NULL, +}; + +static struct attribute_group cxl_decoder_endpoint_attribute_group = { + .attrs = cxl_decoder_endpoint_attrs, +}; + +static const struct attribute_group *cxl_decoder_endpoint_attribute_groups[] = { + &cxl_decoder_base_attribute_group, + &cxl_decoder_endpoint_attribute_group, + &cxl_base_attribute_group, + NULL, +}; + +static void __cxl_decoder_release(struct cxl_decoder *cxld) +{ + struct cxl_port *port = to_cxl_port(cxld->dev.parent); + + ida_free(&port->decoder_ida, cxld->id); + put_device(&port->dev); +} + +static void cxl_endpoint_decoder_release(struct device *dev) +{ + struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev); + + __cxl_decoder_release(&cxled->cxld); + kfree(cxled); +} + +static void cxl_switch_decoder_release(struct device *dev) +{ + struct cxl_switch_decoder *cxlsd = to_cxl_switch_decoder(dev); + + __cxl_decoder_release(&cxlsd->cxld); + kfree(cxlsd); +} + +struct cxl_root_decoder *to_cxl_root_decoder(struct device *dev) +{ + if (dev_WARN_ONCE(dev, !is_root_decoder(dev), + "not a cxl_root_decoder device\n")) + return NULL; + return container_of(dev, struct cxl_root_decoder, cxlsd.cxld.dev); +} +EXPORT_SYMBOL_NS_GPL(to_cxl_root_decoder, "CXL"); + +static void cxl_root_decoder_release(struct device *dev) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev); + + if (atomic_read(&cxlrd->region_id) >= 0) + memregion_free(atomic_read(&cxlrd->region_id)); + __cxl_decoder_release(&cxlrd->cxlsd.cxld); + kfree(cxlrd); +} + +static const struct device_type cxl_decoder_endpoint_type = { + .name = "cxl_decoder_endpoint", + .release = cxl_endpoint_decoder_release, + .groups = cxl_decoder_endpoint_attribute_groups, +}; + +static const struct device_type cxl_decoder_switch_type = { + .name = "cxl_decoder_switch", + .release = cxl_switch_decoder_release, + .groups = cxl_decoder_switch_attribute_groups, +}; + +static const struct device_type cxl_decoder_root_type = { + .name = "cxl_decoder_root", + .release = cxl_root_decoder_release, + .groups = cxl_decoder_root_attribute_groups, +}; + +bool is_endpoint_decoder(struct device *dev) +{ + return dev->type == &cxl_decoder_endpoint_type; +} +EXPORT_SYMBOL_NS_GPL(is_endpoint_decoder, "CXL"); + +bool is_root_decoder(struct device *dev) +{ + return dev->type == &cxl_decoder_root_type; +} +EXPORT_SYMBOL_NS_GPL(is_root_decoder, "CXL"); + +bool is_switch_decoder(struct device *dev) +{ + return is_root_decoder(dev) || dev->type == &cxl_decoder_switch_type; +} +EXPORT_SYMBOL_NS_GPL(is_switch_decoder, "CXL"); + +struct cxl_decoder *to_cxl_decoder(struct device *dev) +{ + if (dev_WARN_ONCE(dev, + !is_switch_decoder(dev) && !is_endpoint_decoder(dev), + "not a cxl_decoder device\n")) + return NULL; + return container_of(dev, struct cxl_decoder, dev); +} +EXPORT_SYMBOL_NS_GPL(to_cxl_decoder, "CXL"); + +struct cxl_endpoint_decoder *to_cxl_endpoint_decoder(struct device *dev) +{ + if (dev_WARN_ONCE(dev, !is_endpoint_decoder(dev), + "not a cxl_endpoint_decoder device\n")) + return NULL; + return container_of(dev, struct cxl_endpoint_decoder, cxld.dev); +} +EXPORT_SYMBOL_NS_GPL(to_cxl_endpoint_decoder, "CXL"); + +struct cxl_switch_decoder *to_cxl_switch_decoder(struct device *dev) +{ + if (dev_WARN_ONCE(dev, !is_switch_decoder(dev), + "not a cxl_switch_decoder device\n")) + return NULL; + return container_of(dev, struct cxl_switch_decoder, cxld.dev); +} +EXPORT_SYMBOL_NS_GPL(to_cxl_switch_decoder, "CXL"); + +static 
void cxl_ep_release(struct cxl_ep *ep) +{ + put_device(ep->ep); + kfree(ep); +} + +static void cxl_ep_remove(struct cxl_port *port, struct cxl_ep *ep) +{ + if (!ep) + return; + xa_erase(&port->endpoints, (unsigned long) ep->ep); + cxl_ep_release(ep); +} + +static void cxl_port_release(struct device *dev) +{ + struct cxl_port *port = to_cxl_port(dev); + unsigned long index; + struct cxl_ep *ep; + + xa_for_each(&port->endpoints, index, ep) + cxl_ep_remove(port, ep); + xa_destroy(&port->endpoints); + xa_destroy(&port->dports); + xa_destroy(&port->regions); + ida_free(&cxl_port_ida, port->id); + if (is_cxl_root(port)) + kfree(to_cxl_root(port)); + else + kfree(port); +} + +static ssize_t decoders_committed_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cxl_port *port = to_cxl_port(dev); + + guard(rwsem_read)(&cxl_rwsem.region); + return sysfs_emit(buf, "%d\n", cxl_num_decoders_committed(port)); +} + +static DEVICE_ATTR_RO(decoders_committed); + +static struct attribute *cxl_port_attrs[] = { + &dev_attr_decoders_committed.attr, + NULL, +}; + +static struct attribute_group cxl_port_attribute_group = { + .attrs = cxl_port_attrs, +}; + +static const struct attribute_group *cxl_port_attribute_groups[] = { + &cxl_base_attribute_group, + &cxl_port_attribute_group, + NULL, +}; + +static const struct device_type cxl_port_type = { + .name = "cxl_port", + .release = cxl_port_release, + .groups = cxl_port_attribute_groups, +}; + +bool is_cxl_port(const struct device *dev) +{ + return dev->type == &cxl_port_type; +} +EXPORT_SYMBOL_NS_GPL(is_cxl_port, "CXL"); + +struct cxl_port *to_cxl_port(const struct device *dev) +{ + if (dev_WARN_ONCE(dev, dev->type != &cxl_port_type, + "not a cxl_port device\n")) + return NULL; + return container_of(dev, struct cxl_port, dev); +} +EXPORT_SYMBOL_NS_GPL(to_cxl_port, "CXL"); + +struct cxl_port *parent_port_of(struct cxl_port *port) +{ + if (!port || !port->parent_dport) + return NULL; + return port->parent_dport->port; +} + +static void unregister_port(void *_port) +{ + struct cxl_port *port = _port; + struct cxl_port *parent = parent_port_of(port); + struct device *lock_dev; + + /* + * CXL root port's and the first level of ports are unregistered + * under the platform firmware device lock, all other ports are + * unregistered while holding their parent port lock. 
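+	 *
+	 * Roughly, the lock device chosen below is:
+	 *   no parent (the root port itself): port->uport_dev
+	 *   parent is the CXL root:           parent->uport_dev
+	 *   any deeper port:                  &parent->dev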
+ */ + if (!parent) + lock_dev = port->uport_dev; + else if (is_cxl_root(parent)) + lock_dev = parent->uport_dev; + else + lock_dev = &parent->dev; + + device_lock_assert(lock_dev); + port->dead = true; + device_unregister(&port->dev); +} + +static void cxl_unlink_uport(void *_port) +{ + struct cxl_port *port = _port; + + sysfs_remove_link(&port->dev.kobj, "uport"); +} + +static int devm_cxl_link_uport(struct device *host, struct cxl_port *port) +{ + int rc; + + rc = sysfs_create_link(&port->dev.kobj, &port->uport_dev->kobj, + "uport"); + if (rc) + return rc; + return devm_add_action_or_reset(host, cxl_unlink_uport, port); +} + +static void cxl_unlink_parent_dport(void *_port) +{ + struct cxl_port *port = _port; + + sysfs_remove_link(&port->dev.kobj, "parent_dport"); +} + +static int devm_cxl_link_parent_dport(struct device *host, + struct cxl_port *port, + struct cxl_dport *parent_dport) +{ + int rc; + + if (!parent_dport) + return 0; + + rc = sysfs_create_link(&port->dev.kobj, &parent_dport->dport_dev->kobj, + "parent_dport"); + if (rc) + return rc; + return devm_add_action_or_reset(host, cxl_unlink_parent_dport, port); +} + +static struct lock_class_key cxl_port_key; + +static struct cxl_port *cxl_port_alloc(struct device *uport_dev, + struct cxl_dport *parent_dport) +{ + struct cxl_root *cxl_root __free(kfree) = NULL; + struct cxl_port *port, *_port __free(kfree) = NULL; + struct device *dev; + int rc; + + /* No parent_dport, root cxl_port */ + if (!parent_dport) { + cxl_root = kzalloc(sizeof(*cxl_root), GFP_KERNEL); + if (!cxl_root) + return ERR_PTR(-ENOMEM); + } else { + _port = kzalloc(sizeof(*port), GFP_KERNEL); + if (!_port) + return ERR_PTR(-ENOMEM); + } + + rc = ida_alloc(&cxl_port_ida, GFP_KERNEL); + if (rc < 0) + return ERR_PTR(rc); + + if (cxl_root) + port = &no_free_ptr(cxl_root)->port; + else + port = no_free_ptr(_port); + + port->id = rc; + port->uport_dev = uport_dev; + + /* + * The top-level cxl_port "cxl_root" does not have a cxl_port as + * its parent and it does not have any corresponding component + * registers as its decode is described by a fixed platform + * description. 
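+	 *
+	 * Concretely, in the code below a root port is parented directly to
+	 * its uport device, while any other port is parented under its parent
+	 * port and inherits the host bridge association from its ancestry (or
+	 * from the dport device for RCH topologies).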
+ */ + dev = &port->dev; + if (parent_dport) { + struct cxl_port *parent_port = parent_dport->port; + struct cxl_port *iter; + + dev->parent = &parent_port->dev; + port->depth = parent_port->depth + 1; + port->parent_dport = parent_dport; + + /* + * walk to the host bridge, or the first ancestor that knows + * the host bridge + */ + iter = port; + while (!iter->host_bridge && + !is_cxl_root(to_cxl_port(iter->dev.parent))) + iter = to_cxl_port(iter->dev.parent); + if (iter->host_bridge) + port->host_bridge = iter->host_bridge; + else if (parent_dport->rch) + port->host_bridge = parent_dport->dport_dev; + else + port->host_bridge = iter->uport_dev; + dev_dbg(uport_dev, "host-bridge: %s\n", + dev_name(port->host_bridge)); + } else + dev->parent = uport_dev; + + ida_init(&port->decoder_ida); + port->hdm_end = -1; + port->commit_end = -1; + xa_init(&port->dports); + xa_init(&port->endpoints); + xa_init(&port->regions); + port->component_reg_phys = CXL_RESOURCE_NONE; + + device_initialize(dev); + lockdep_set_class_and_subclass(&dev->mutex, &cxl_port_key, port->depth); + device_set_pm_not_required(dev); + dev->bus = &cxl_bus_type; + dev->type = &cxl_port_type; + + return port; +} + +static int cxl_setup_comp_regs(struct device *host, struct cxl_register_map *map, + resource_size_t component_reg_phys) +{ + *map = (struct cxl_register_map) { + .host = host, + .reg_type = CXL_REGLOC_RBI_EMPTY, + .resource = component_reg_phys, + }; + + if (component_reg_phys == CXL_RESOURCE_NONE) + return 0; + + map->reg_type = CXL_REGLOC_RBI_COMPONENT; + map->max_size = CXL_COMPONENT_REG_BLOCK_SIZE; + + return cxl_setup_regs(map); +} + +static int cxl_port_setup_regs(struct cxl_port *port, + resource_size_t component_reg_phys) +{ + if (dev_is_platform(port->uport_dev)) + return 0; + return cxl_setup_comp_regs(&port->dev, &port->reg_map, + component_reg_phys); +} + +static int cxl_dport_setup_regs(struct device *host, struct cxl_dport *dport, + resource_size_t component_reg_phys) +{ + int rc; + + if (dev_is_platform(dport->dport_dev)) + return 0; + + /* + * use @dport->dport_dev for the context for error messages during + * register probing, and fixup @host after the fact, since @host may be + * NULL. + */ + rc = cxl_setup_comp_regs(dport->dport_dev, &dport->reg_map, + component_reg_phys); + dport->reg_map.host = host; + return rc; +} + +DEFINE_SHOW_ATTRIBUTE(einj_cxl_available_error_type); + +static int cxl_einj_inject(void *data, u64 type) +{ + struct cxl_dport *dport = data; + + if (dport->rch) + return einj_cxl_inject_rch_error(dport->rcrb.base, type); + + return einj_cxl_inject_error(to_pci_dev(dport->dport_dev), type); +} +DEFINE_DEBUGFS_ATTRIBUTE(cxl_einj_inject_fops, NULL, cxl_einj_inject, + "0x%llx\n"); + +static void cxl_debugfs_create_dport_dir(struct cxl_dport *dport) +{ + struct dentry *dir; + + if (!einj_cxl_is_initialized()) + return; + + /* + * dport_dev needs to be a PCIe port for CXL 2.0+ ports because + * EINJ expects a dport SBDF to be specified for 2.0 error injection. 
+ */ + if (!dport->rch && !dev_is_pci(dport->dport_dev)) + return; + + dir = cxl_debugfs_create_dir(dev_name(dport->dport_dev)); + + debugfs_create_file("einj_inject", 0200, dir, dport, + &cxl_einj_inject_fops); +} + +static int cxl_port_add(struct cxl_port *port, + resource_size_t component_reg_phys, + struct cxl_dport *parent_dport) +{ + struct device *dev __free(put_device) = &port->dev; + int rc; + + if (is_cxl_memdev(port->uport_dev)) { + struct cxl_memdev *cxlmd = to_cxl_memdev(port->uport_dev); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + + rc = dev_set_name(dev, "endpoint%d", port->id); + if (rc) + return rc; + + /* + * The endpoint driver already enumerated the component and RAS + * registers. Reuse that enumeration while prepping them to be + * mapped by the cxl_port driver. + */ + port->reg_map = cxlds->reg_map; + port->reg_map.host = &port->dev; + cxlmd->endpoint = port; + } else if (parent_dport) { + rc = dev_set_name(dev, "port%d", port->id); + if (rc) + return rc; + + port->component_reg_phys = component_reg_phys; + } else { + rc = dev_set_name(dev, "root%d", port->id); + if (rc) + return rc; + } + + rc = device_add(dev); + if (rc) + return rc; + + /* Inhibit the cleanup function invoked */ + dev = NULL; + return 0; +} + +static struct cxl_port *__devm_cxl_add_port(struct device *host, + struct device *uport_dev, + resource_size_t component_reg_phys, + struct cxl_dport *parent_dport) +{ + struct cxl_port *port; + int rc; + + port = cxl_port_alloc(uport_dev, parent_dport); + if (IS_ERR(port)) + return port; + + rc = cxl_port_add(port, component_reg_phys, parent_dport); + if (rc) + return ERR_PTR(rc); + + rc = devm_add_action_or_reset(host, unregister_port, port); + if (rc) + return ERR_PTR(rc); + + rc = devm_cxl_link_uport(host, port); + if (rc) + return ERR_PTR(rc); + + rc = devm_cxl_link_parent_dport(host, port, parent_dport); + if (rc) + return ERR_PTR(rc); + + if (parent_dport && dev_is_pci(uport_dev)) + port->pci_latency = cxl_pci_get_latency(to_pci_dev(uport_dev)); + + return port; +} + +/** + * devm_cxl_add_port - register a cxl_port in CXL memory decode hierarchy + * @host: host device for devm operations + * @uport_dev: "physical" device implementing this upstream port + * @component_reg_phys: (optional) for configurable cxl_port instances + * @parent_dport: next hop up in the CXL memory decode hierarchy + */ +struct cxl_port *devm_cxl_add_port(struct device *host, + struct device *uport_dev, + resource_size_t component_reg_phys, + struct cxl_dport *parent_dport) +{ + struct cxl_port *port, *parent_port; + + port = __devm_cxl_add_port(host, uport_dev, component_reg_phys, + parent_dport); + + parent_port = parent_dport ? parent_dport->port : NULL; + if (IS_ERR(port)) { + dev_dbg(uport_dev, "Failed to add%s%s%s: %ld\n", + parent_port ? " port to " : "", + parent_port ? dev_name(&parent_port->dev) : "", + parent_port ? "" : " root port", + PTR_ERR(port)); + } else { + dev_dbg(uport_dev, "%s added%s%s%s\n", + dev_name(&port->dev), + parent_port ? " to " : "", + parent_port ? dev_name(&parent_port->dev) : "", + parent_port ? 
"" : " (root port)"); + } + + return port; +} +EXPORT_SYMBOL_NS_GPL(devm_cxl_add_port, "CXL"); + +struct cxl_root *devm_cxl_add_root(struct device *host, + const struct cxl_root_ops *ops) +{ + struct cxl_root *cxl_root; + struct cxl_port *port; + + port = devm_cxl_add_port(host, host, CXL_RESOURCE_NONE, NULL); + if (IS_ERR(port)) + return ERR_CAST(port); + + cxl_root = to_cxl_root(port); + cxl_root->ops = ops; + return cxl_root; +} +EXPORT_SYMBOL_NS_GPL(devm_cxl_add_root, "CXL"); + +struct pci_bus *cxl_port_to_pci_bus(struct cxl_port *port) +{ + /* There is no pci_bus associated with a CXL platform-root port */ + if (is_cxl_root(port)) + return NULL; + + if (dev_is_pci(port->uport_dev)) { + struct pci_dev *pdev = to_pci_dev(port->uport_dev); + + return pdev->subordinate; + } + + return xa_load(&cxl_root_buses, (unsigned long)port->uport_dev); +} +EXPORT_SYMBOL_NS_GPL(cxl_port_to_pci_bus, "CXL"); + +static void unregister_pci_bus(void *uport_dev) +{ + xa_erase(&cxl_root_buses, (unsigned long)uport_dev); +} + +int devm_cxl_register_pci_bus(struct device *host, struct device *uport_dev, + struct pci_bus *bus) +{ + int rc; + + if (dev_is_pci(uport_dev)) + return -EINVAL; + + rc = xa_insert(&cxl_root_buses, (unsigned long)uport_dev, bus, + GFP_KERNEL); + if (rc) + return rc; + return devm_add_action_or_reset(host, unregister_pci_bus, uport_dev); +} +EXPORT_SYMBOL_NS_GPL(devm_cxl_register_pci_bus, "CXL"); + +static bool dev_is_cxl_root_child(struct device *dev) +{ + struct cxl_port *port, *parent; + + if (!is_cxl_port(dev)) + return false; + + port = to_cxl_port(dev); + if (is_cxl_root(port)) + return false; + + parent = to_cxl_port(port->dev.parent); + if (is_cxl_root(parent)) + return true; + + return false; +} + +struct cxl_root *find_cxl_root(struct cxl_port *port) +{ + struct cxl_port *iter = port; + + while (iter && !is_cxl_root(iter)) + iter = to_cxl_port(iter->dev.parent); + + if (!iter) + return NULL; + get_device(&iter->dev); + return to_cxl_root(iter); +} +EXPORT_SYMBOL_NS_GPL(find_cxl_root, "CXL"); + +static struct cxl_dport *find_dport(struct cxl_port *port, int id) +{ + struct cxl_dport *dport; + unsigned long index; + + device_lock_assert(&port->dev); + xa_for_each(&port->dports, index, dport) + if (dport->port_id == id) + return dport; + return NULL; +} + +static int add_dport(struct cxl_port *port, struct cxl_dport *dport) +{ + struct cxl_dport *dup; + int rc; + + device_lock_assert(&port->dev); + dup = find_dport(port, dport->port_id); + if (dup) { + dev_err(&port->dev, + "unable to add dport%d-%s non-unique port id (%s)\n", + dport->port_id, dev_name(dport->dport_dev), + dev_name(dup->dport_dev)); + return -EBUSY; + } + + rc = xa_insert(&port->dports, (unsigned long)dport->dport_dev, dport, + GFP_KERNEL); + if (rc) + return rc; + + port->nr_dports++; + return 0; +} + +/* + * Since root-level CXL dports cannot be enumerated by PCI they are not + * enumerated by the common port driver that acquires the port lock over + * dport add/remove. Instead, root dports are manually added by a + * platform driver and cond_cxl_root_lock() is used to take the missing + * port lock in that case. 
+ */
+static void cond_cxl_root_lock(struct cxl_port *port)
+{
+ if (is_cxl_root(port))
+ device_lock(&port->dev);
+}
+
+static void cond_cxl_root_unlock(struct cxl_port *port)
+{
+ if (is_cxl_root(port))
+ device_unlock(&port->dev);
+}
+
+static void cxl_dport_remove(void *data)
+{
+ struct cxl_dport *dport = data;
+ struct cxl_port *port = dport->port;
+
+ xa_erase(&port->dports, (unsigned long) dport->dport_dev);
+ put_device(dport->dport_dev);
+}
+
+static void cxl_dport_unlink(void *data)
+{
+ struct cxl_dport *dport = data;
+ struct cxl_port *port = dport->port;
+ char link_name[CXL_TARGET_STRLEN];
+
+ sprintf(link_name, "dport%d", dport->port_id);
+ sysfs_remove_link(&port->dev.kobj, link_name);
+}
+
+static struct cxl_dport *
+__devm_cxl_add_dport(struct cxl_port *port, struct device *dport_dev,
+ int port_id, resource_size_t component_reg_phys,
+ resource_size_t rcrb)
+{
+ char link_name[CXL_TARGET_STRLEN];
+ struct cxl_dport *dport;
+ struct device *host;
+ int rc;
+
+ if (is_cxl_root(port))
+ host = port->uport_dev;
+ else
+ host = &port->dev;
+
+ if (!host->driver) {
+ dev_WARN_ONCE(&port->dev, 1, "dport:%s bad devm context\n",
+ dev_name(dport_dev));
+ return ERR_PTR(-ENXIO);
+ }
+
+ if (snprintf(link_name, CXL_TARGET_STRLEN, "dport%d", port_id) >=
+ CXL_TARGET_STRLEN)
+ return ERR_PTR(-EINVAL);
+
+ dport = devm_kzalloc(host, sizeof(*dport), GFP_KERNEL);
+ if (!dport)
+ return ERR_PTR(-ENOMEM);
+
+ dport->dport_dev = dport_dev;
+ dport->port_id = port_id;
+ dport->port = port;
+
+ if (rcrb == CXL_RESOURCE_NONE) {
+ rc = cxl_dport_setup_regs(&port->dev, dport,
+ component_reg_phys);
+ if (rc)
+ return ERR_PTR(rc);
+ } else {
+ dport->rcrb.base = rcrb;
+ component_reg_phys = __rcrb_to_component(dport_dev, &dport->rcrb,
+ CXL_RCRB_DOWNSTREAM);
+ if (component_reg_phys == CXL_RESOURCE_NONE) {
+ dev_warn(dport_dev, "Invalid Component Registers in RCRB");
+ return ERR_PTR(-ENXIO);
+ }
+
+ /*
+ * RCH @dport is not ready to map until associated with its
+ * memdev
+ */
+ rc = cxl_dport_setup_regs(NULL, dport, component_reg_phys);
+ if (rc)
+ return ERR_PTR(rc);
+
+ dport->rch = true;
+ }
+
+ if (component_reg_phys != CXL_RESOURCE_NONE)
+ dev_dbg(dport_dev, "Component Registers found for dport: %pa\n",
+ &component_reg_phys);
+
+ cond_cxl_root_lock(port);
+ rc = add_dport(port, dport);
+ cond_cxl_root_unlock(port);
+ if (rc)
+ return ERR_PTR(rc);
+
+ /*
+ * Set up the port registers when the first dport shows up. Having a
+ * dport also means that there is at least one active link.
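+ * The stashed component_reg_phys is consumed and then cleared below, so
+ * this register setup only happens for the first dport added.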
+ */
+ if (port->nr_dports == 1 &&
+ port->component_reg_phys != CXL_RESOURCE_NONE) {
+ rc = cxl_port_setup_regs(port, port->component_reg_phys);
+ if (rc) {
+ xa_erase(&port->dports, (unsigned long)dport->dport_dev);
+ return ERR_PTR(rc);
+ }
+ port->component_reg_phys = CXL_RESOURCE_NONE;
+ }
+
+ get_device(dport_dev);
+ rc = devm_add_action_or_reset(host, cxl_dport_remove, dport);
+ if (rc)
+ return ERR_PTR(rc);
+
+ rc = sysfs_create_link(&port->dev.kobj, &dport_dev->kobj, link_name);
+ if (rc)
+ return ERR_PTR(rc);
+
+ rc = devm_add_action_or_reset(host, cxl_dport_unlink, dport);
+ if (rc)
+ return ERR_PTR(rc);
+
+ if (dev_is_pci(dport_dev))
+ dport->link_latency = cxl_pci_get_latency(to_pci_dev(dport_dev));
+
+ cxl_debugfs_create_dport_dir(dport);
+
+ return dport;
+}
+
+/**
+ * devm_cxl_add_dport - append VH downstream port data to a cxl_port
+ * @port: the cxl_port that references this dport
+ * @dport_dev: firmware or PCI device representing the dport
+ * @port_id: identifier for this dport in a decoder's target list
+ * @component_reg_phys: optional location of CXL component registers
+ *
+ * Note that dports are appended to the devm release actions of either
+ * the port's host (for root ports), or the port itself (for switch
+ * ports)
+ */
+struct cxl_dport *devm_cxl_add_dport(struct cxl_port *port,
+ struct device *dport_dev, int port_id,
+ resource_size_t component_reg_phys)
+{
+ struct cxl_dport *dport;
+
+ dport = __devm_cxl_add_dport(port, dport_dev, port_id,
+ component_reg_phys, CXL_RESOURCE_NONE);
+ if (IS_ERR(dport)) {
+ dev_dbg(dport_dev, "failed to add dport to %s: %ld\n",
+ dev_name(&port->dev), PTR_ERR(dport));
+ } else {
+ dev_dbg(dport_dev, "dport added to %s\n",
+ dev_name(&port->dev));
+ }
+
+ return dport;
+}
+EXPORT_SYMBOL_NS_GPL(devm_cxl_add_dport, "CXL");
+
+/**
+ * devm_cxl_add_rch_dport - append RCH downstream port data to a cxl_port
+ * @port: the cxl_port that references this dport
+ * @dport_dev: firmware or PCI device representing the dport
+ * @port_id: identifier for this dport in a decoder's target list
+ * @rcrb: mandatory location of a Root Complex Register Block
+ *
+ * See CXL 3.0 9.11.8 CXL Devices Attached to an RCH
+ */
+struct cxl_dport *devm_cxl_add_rch_dport(struct cxl_port *port,
+ struct device *dport_dev, int port_id,
+ resource_size_t rcrb)
+{
+ struct cxl_dport *dport;
+
+ if (rcrb == CXL_RESOURCE_NONE) {
+ dev_dbg(&port->dev, "failed to add RCH dport, missing RCRB\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ dport = __devm_cxl_add_dport(port, dport_dev, port_id,
+ CXL_RESOURCE_NONE, rcrb);
+ if (IS_ERR(dport)) {
+ dev_dbg(dport_dev, "failed to add RCH dport to %s: %ld\n",
+ dev_name(&port->dev), PTR_ERR(dport));
+ } else {
+ dev_dbg(dport_dev, "RCH dport added to %s\n",
+ dev_name(&port->dev));
+ }
+
+ return dport;
+}
+EXPORT_SYMBOL_NS_GPL(devm_cxl_add_rch_dport, "CXL");
+
+static int add_ep(struct cxl_ep *new)
+{
+ struct cxl_port *port = new->dport->port;
+
+ guard(device)(&port->dev);
+ if (port->dead)
+ return -ENXIO;
+
+ return xa_insert(&port->endpoints, (unsigned long)new->ep,
+ new, GFP_KERNEL);
+}
+
+/**
+ * cxl_add_ep - register an endpoint's interest in a port
+ * @dport: the dport that routes to @ep_dev
+ * @ep_dev: device representing the endpoint
+ *
+ * Intermediate CXL ports are scanned based on the arrival of endpoints.
+ * When those endpoints depart, the port can be destroyed once all
+ * endpoints that care about that port have been removed.
+ */
+static int cxl_add_ep(struct cxl_dport *dport, struct device *ep_dev)
+{
+ struct cxl_ep *ep;
+ int rc;
+
+ ep = kzalloc(sizeof(*ep), GFP_KERNEL);
+ if (!ep)
+ return -ENOMEM;
+
+ ep->ep = get_device(ep_dev);
+ ep->dport = dport;
+
+ rc = add_ep(ep);
+ if (rc)
+ cxl_ep_release(ep);
+ return rc;
+}
+
+struct cxl_find_port_ctx {
+ const struct device *dport_dev;
+ const struct cxl_port *parent_port;
+ struct cxl_dport **dport;
+};
+
+static int match_port_by_dport(struct device *dev, const void *data)
+{
+ const struct cxl_find_port_ctx *ctx = data;
+ struct cxl_dport *dport;
+ struct cxl_port *port;
+
+ if (!is_cxl_port(dev))
+ return 0;
+ if (ctx->parent_port && dev->parent != &ctx->parent_port->dev)
+ return 0;
+
+ port = to_cxl_port(dev);
+ dport = cxl_find_dport_by_dev(port, ctx->dport_dev);
+ if (ctx->dport)
+ *ctx->dport = dport;
+ return dport != NULL;
+}
+
+static struct cxl_port *__find_cxl_port(struct cxl_find_port_ctx *ctx)
+{
+ struct device *dev;
+
+ if (!ctx->dport_dev)
+ return NULL;
+
+ dev = bus_find_device(&cxl_bus_type, NULL, ctx, match_port_by_dport);
+ if (dev)
+ return to_cxl_port(dev);
+ return NULL;
+}
+
+static struct cxl_port *find_cxl_port(struct device *dport_dev,
+ struct cxl_dport **dport)
+{
+ struct cxl_find_port_ctx ctx = {
+ .dport_dev = dport_dev,
+ .dport = dport,
+ };
+ struct cxl_port *port;
+
+ port = __find_cxl_port(&ctx);
+ return port;
+}
+
+/*
+ * All users of grandparent() are using it to walk PCIe-like switch port
+ * hierarchy. A PCIe switch is composed of a bridge device representing the
+ * upstream switch port and N bridges representing downstream switch ports. When
+ * bridges stack, the grandparent of a downstream switch port is another
+ * downstream switch port in the immediate ancestor switch.
+ */
+static struct device *grandparent(struct device *dev)
+{
+ if (dev && dev->parent)
+ return dev->parent->parent;
+ return NULL;
+}
+
+static struct device *endpoint_host(struct cxl_port *endpoint)
+{
+ struct cxl_port *port = to_cxl_port(endpoint->dev.parent);
+
+ if (is_cxl_root(port))
+ return port->uport_dev;
+ return &port->dev;
+}
+
+static void delete_endpoint(void *data)
+{
+ struct cxl_memdev *cxlmd = data;
+ struct cxl_port *endpoint = cxlmd->endpoint;
+ struct device *host = endpoint_host(endpoint);
+
+ scoped_guard(device, host) {
+ if (host->driver && !endpoint->dead) {
+ devm_release_action(host, cxl_unlink_parent_dport, endpoint);
+ devm_release_action(host, cxl_unlink_uport, endpoint);
+ devm_release_action(host, unregister_port, endpoint);
+ }
+ cxlmd->endpoint = NULL;
+ }
+ put_device(&endpoint->dev);
+ put_device(host);
+}
+
+int cxl_endpoint_autoremove(struct cxl_memdev *cxlmd, struct cxl_port *endpoint)
+{
+ struct device *host = endpoint_host(endpoint);
+ struct device *dev = &cxlmd->dev;
+
+ get_device(host);
+ get_device(&endpoint->dev);
+ cxlmd->depth = endpoint->depth;
+ return devm_add_action_or_reset(dev, delete_endpoint, cxlmd);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_endpoint_autoremove, "CXL");
+
+/*
+ * The natural end of life of a non-root 'cxl_port' is when its parent port goes
+ * through a ->remove() event ("top-down" unregistration). The unnatural trigger
+ * for a port to be unregistered is when all memdevs beneath that port have gone
+ * through ->remove(). This "bottom-up" removal selectively removes individual
+ * child ports manually. This depends on devm_cxl_add_port() to not change its
+ * devm action registration order, and for dports to have already been
+ * destroyed by del_dports().
+ */ +static void delete_switch_port(struct cxl_port *port) +{ + devm_release_action(port->dev.parent, cxl_unlink_parent_dport, port); + devm_release_action(port->dev.parent, cxl_unlink_uport, port); + devm_release_action(port->dev.parent, unregister_port, port); +} + +static void del_dport(struct cxl_dport *dport) +{ + struct cxl_port *port = dport->port; + + devm_release_action(&port->dev, cxl_dport_unlink, dport); + devm_release_action(&port->dev, cxl_dport_remove, dport); + devm_kfree(&port->dev, dport); +} + +static void del_dports(struct cxl_port *port) +{ + struct cxl_dport *dport; + unsigned long index; + + device_lock_assert(&port->dev); + + xa_for_each(&port->dports, index, dport) + del_dport(dport); +} + +struct detach_ctx { + struct cxl_memdev *cxlmd; + int depth; +}; + +static int port_has_memdev(struct device *dev, const void *data) +{ + const struct detach_ctx *ctx = data; + struct cxl_port *port; + + if (!is_cxl_port(dev)) + return 0; + + port = to_cxl_port(dev); + if (port->depth != ctx->depth) + return 0; + + return !!cxl_ep_load(port, ctx->cxlmd); +} + +static void cxl_detach_ep(void *data) +{ + struct cxl_memdev *cxlmd = data; + + for (int i = cxlmd->depth - 1; i >= 1; i--) { + struct cxl_port *port, *parent_port; + struct detach_ctx ctx = { + .cxlmd = cxlmd, + .depth = i, + }; + struct cxl_ep *ep; + bool died = false; + + struct device *dev __free(put_device) = + bus_find_device(&cxl_bus_type, NULL, &ctx, port_has_memdev); + if (!dev) + continue; + port = to_cxl_port(dev); + + parent_port = to_cxl_port(port->dev.parent); + device_lock(&parent_port->dev); + device_lock(&port->dev); + ep = cxl_ep_load(port, cxlmd); + dev_dbg(&cxlmd->dev, "disconnect %s from %s\n", + ep ? dev_name(ep->ep) : "", dev_name(&port->dev)); + cxl_ep_remove(port, ep); + if (ep && !port->dead && xa_empty(&port->endpoints) && + !is_cxl_root(parent_port) && parent_port->dev.driver) { + /* + * This was the last ep attached to a dynamically + * enumerated port. Block new cxl_add_ep() and garbage + * collect the port. + */ + died = true; + port->dead = true; + del_dports(port); + } + device_unlock(&port->dev); + + if (died) { + dev_dbg(&cxlmd->dev, "delete %s\n", + dev_name(&port->dev)); + delete_switch_port(port); + } + device_unlock(&parent_port->dev); + } +} + +static resource_size_t find_component_registers(struct device *dev) +{ + struct cxl_register_map map; + struct pci_dev *pdev; + + /* + * Theoretically, CXL component registers can be hosted on a + * non-PCI device, in practice, only cxl_test hits this case. + */ + if (!dev_is_pci(dev)) + return CXL_RESOURCE_NONE; + + pdev = to_pci_dev(dev); + + cxl_find_regblock(pdev, CXL_REGLOC_RBI_COMPONENT, &map); + return map.resource; +} + +static int match_port_by_uport(struct device *dev, const void *data) +{ + const struct device *uport_dev = data; + struct cxl_port *port; + + if (!is_cxl_port(dev)) + return 0; + + port = to_cxl_port(dev); + return uport_dev == port->uport_dev; +} + +/* + * Function takes a device reference on the port device. Caller should do a + * put_device() when done. 
+ */
+static struct cxl_port *find_cxl_port_by_uport(struct device *uport_dev)
+{
+ struct device *dev;
+
+ dev = bus_find_device(&cxl_bus_type, NULL, uport_dev, match_port_by_uport);
+ if (dev)
+ return to_cxl_port(dev);
+ return NULL;
+}
+
+static int update_decoder_targets(struct device *dev, void *data)
+{
+ struct cxl_dport *dport = data;
+ struct cxl_switch_decoder *cxlsd;
+ struct cxl_decoder *cxld;
+ int i;
+
+ if (!is_switch_decoder(dev))
+ return 0;
+
+ cxlsd = to_cxl_switch_decoder(dev);
+ cxld = &cxlsd->cxld;
+ guard(rwsem_write)(&cxl_rwsem.region);
+
+ for (i = 0; i < cxld->interleave_ways; i++) {
+ if (cxld->target_map[i] == dport->port_id) {
+ cxlsd->target[i] = dport;
+ dev_dbg(dev, "dport%d found in target list, index %d\n",
+ dport->port_id, i);
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+DEFINE_FREE(del_cxl_dport, struct cxl_dport *, if (!IS_ERR_OR_NULL(_T)) del_dport(_T))
+static struct cxl_dport *cxl_port_add_dport(struct cxl_port *port,
+ struct device *dport_dev)
+{
+ struct cxl_dport *dport;
+ int rc;
+
+ device_lock_assert(&port->dev);
+ if (!port->dev.driver)
+ return ERR_PTR(-ENXIO);
+
+ dport = cxl_find_dport_by_dev(port, dport_dev);
+ if (dport) {
+ dev_dbg(&port->dev, "dport%d:%s already exists\n",
+ dport->port_id, dev_name(dport_dev));
+ return ERR_PTR(-EBUSY);
+ }
+
+ struct cxl_dport *new_dport __free(del_cxl_dport) =
+ devm_cxl_add_dport_by_dev(port, dport_dev);
+ if (IS_ERR(new_dport))
+ return new_dport;
+
+ cxl_switch_parse_cdat(new_dport);
+
+ if (ida_is_empty(&port->decoder_ida)) {
+ rc = devm_cxl_switch_port_decoders_setup(port);
+ if (rc)
+ return ERR_PTR(rc);
+ dev_dbg(&port->dev, "first dport%d:%s added with decoders\n",
+ new_dport->port_id, dev_name(dport_dev));
+ return no_free_ptr(new_dport);
+ }
+
+ /* New dport added, update the decoder targets */
+ device_for_each_child(&port->dev, new_dport, update_decoder_targets);
+
+ dev_dbg(&port->dev, "dport%d:%s added\n", new_dport->port_id,
+ dev_name(dport_dev));
+
+ return no_free_ptr(new_dport);
+}
+
+static struct cxl_dport *devm_cxl_create_port(struct device *ep_dev,
+ struct cxl_port *parent_port,
+ struct cxl_dport *parent_dport,
+ struct device *uport_dev,
+ struct device *dport_dev)
+{
+ resource_size_t component_reg_phys;
+
+ device_lock_assert(&parent_port->dev);
+ if (!parent_port->dev.driver) {
+ dev_warn(ep_dev,
+ "port %s:%s:%s disabled, failed to enumerate CXL.mem\n",
+ dev_name(&parent_port->dev), dev_name(uport_dev),
+ dev_name(dport_dev));
+ }
+
+ struct cxl_port *port __free(put_cxl_port) =
+ find_cxl_port_by_uport(uport_dev);
+ if (!port) {
+ component_reg_phys = find_component_registers(uport_dev);
+ port = devm_cxl_add_port(&parent_port->dev, uport_dev,
+ component_reg_phys, parent_dport);
+ if (IS_ERR(port))
+ return ERR_CAST(port);
+
+ /*
+ * Retry to make sure a port is found. A port device
+ * reference is taken.
+ */
+ port = find_cxl_port_by_uport(uport_dev);
+ if (!port)
+ return ERR_PTR(-ENODEV);
+
+ dev_dbg(ep_dev, "created port %s:%s\n",
+ dev_name(&port->dev), dev_name(port->uport_dev));
+ } else {
+ /*
+ * Port was created right before this function was
+ * called. Signal the caller to deal with it.
+ */ + return ERR_PTR(-EAGAIN); + } + + guard(device)(&port->dev); + return cxl_port_add_dport(port, dport_dev); +} + +static int add_port_attach_ep(struct cxl_memdev *cxlmd, + struct device *uport_dev, + struct device *dport_dev) +{ + struct device *dparent = grandparent(dport_dev); + struct cxl_dport *dport, *parent_dport; + int rc; + + if (is_cxl_host_bridge(dparent)) { + /* + * The iteration reached the topology root without finding the + * CXL-root 'cxl_port' on a previous iteration, fail for now to + * be re-probed after platform driver attaches. + */ + dev_dbg(&cxlmd->dev, "%s is a root dport\n", + dev_name(dport_dev)); + return -ENXIO; + } + + struct cxl_port *parent_port __free(put_cxl_port) = + find_cxl_port_by_uport(dparent->parent); + if (!parent_port) { + /* iterate to create this parent_port */ + return -EAGAIN; + } + + scoped_guard(device, &parent_port->dev) { + parent_dport = cxl_find_dport_by_dev(parent_port, dparent); + if (!parent_dport) { + parent_dport = cxl_port_add_dport(parent_port, dparent); + if (IS_ERR(parent_dport)) + return PTR_ERR(parent_dport); + } + + dport = devm_cxl_create_port(&cxlmd->dev, parent_port, + parent_dport, uport_dev, + dport_dev); + if (IS_ERR(dport)) { + /* Port already exists, restart iteration */ + if (PTR_ERR(dport) == -EAGAIN) + return 0; + return PTR_ERR(dport); + } + } + + rc = cxl_add_ep(dport, &cxlmd->dev); + if (rc == -EBUSY) { + /* + * "can't" happen, but this error code means + * something to the caller, so translate it. + */ + rc = -ENXIO; + } + + return rc; +} + +static struct cxl_dport *find_or_add_dport(struct cxl_port *port, + struct device *dport_dev) +{ + struct cxl_dport *dport; + + device_lock_assert(&port->dev); + dport = cxl_find_dport_by_dev(port, dport_dev); + if (!dport) { + dport = cxl_port_add_dport(port, dport_dev); + if (IS_ERR(dport)) + return dport; + + /* New dport added, restart iteration */ + return ERR_PTR(-EAGAIN); + } + + return dport; +} + +int devm_cxl_enumerate_ports(struct cxl_memdev *cxlmd) +{ + struct device *dev = &cxlmd->dev; + struct device *iter; + int rc; + + /* + * Skip intermediate port enumeration in the RCH case, there + * are no ports in between a host bridge and an endpoint. + */ + if (cxlmd->cxlds->rcd) + return 0; + + rc = devm_add_action_or_reset(&cxlmd->dev, cxl_detach_ep, cxlmd); + if (rc) + return rc; + + /* + * Scan for and add all cxl_ports in this device's ancestry. + * Repeat until no more ports are added. Abort if a port add + * attempt fails. + */ +retry: + for (iter = dev; iter; iter = grandparent(iter)) { + struct device *dport_dev = grandparent(iter); + struct device *uport_dev; + struct cxl_dport *dport; + + if (is_cxl_host_bridge(dport_dev)) + return 0; + + uport_dev = dport_dev->parent; + if (!uport_dev) { + dev_warn(dev, "at %s no parent for dport: %s\n", + dev_name(iter), dev_name(dport_dev)); + return -ENXIO; + } + + dev_dbg(dev, "scan: iter: %s dport_dev: %s parent: %s\n", + dev_name(iter), dev_name(dport_dev), + dev_name(uport_dev)); + struct cxl_port *port __free(put_cxl_port) = + find_cxl_port_by_uport(uport_dev); + if (port) { + dev_dbg(&cxlmd->dev, + "found already registered port %s:%s\n", + dev_name(&port->dev), + dev_name(port->uport_dev)); + + /* + * RP port enumerated by cxl_acpi without dport will + * have the dport added here. 
+ */ + scoped_guard(device, &port->dev) { + dport = find_or_add_dport(port, dport_dev); + if (IS_ERR(dport)) { + if (PTR_ERR(dport) == -EAGAIN) + goto retry; + return PTR_ERR(dport); + } + } + + rc = cxl_add_ep(dport, &cxlmd->dev); + + /* + * If the endpoint already exists in the port's list, + * that's ok, it was added on a previous pass. + * Otherwise, retry in add_port_attach_ep() after taking + * the parent_port lock as the current port may be being + * reaped. + */ + if (rc && rc != -EBUSY) + return rc; + + cxl_gpf_port_setup(dport); + + /* Any more ports to add between this one and the root? */ + if (!dev_is_cxl_root_child(&port->dev)) + continue; + + return 0; + } + + rc = add_port_attach_ep(cxlmd, uport_dev, dport_dev); + /* port missing, try to add parent */ + if (rc == -EAGAIN) + continue; + /* failed to add ep or port */ + if (rc) + return rc; + /* port added, new descendants possible, start over */ + goto retry; + } + + return 0; +} +EXPORT_SYMBOL_NS_GPL(devm_cxl_enumerate_ports, "CXL"); + +struct cxl_port *cxl_pci_find_port(struct pci_dev *pdev, + struct cxl_dport **dport) +{ + return find_cxl_port(pdev->dev.parent, dport); +} +EXPORT_SYMBOL_NS_GPL(cxl_pci_find_port, "CXL"); + +struct cxl_port *cxl_mem_find_port(struct cxl_memdev *cxlmd, + struct cxl_dport **dport) +{ + return find_cxl_port(grandparent(&cxlmd->dev), dport); +} +EXPORT_SYMBOL_NS_GPL(cxl_mem_find_port, "CXL"); + +static int decoder_populate_targets(struct cxl_switch_decoder *cxlsd, + struct cxl_port *port) +{ + struct cxl_decoder *cxld = &cxlsd->cxld; + int i; + + device_lock_assert(&port->dev); + + if (xa_empty(&port->dports)) + return 0; + + guard(rwsem_write)(&cxl_rwsem.region); + for (i = 0; i < cxlsd->cxld.interleave_ways; i++) { + struct cxl_dport *dport = find_dport(port, cxld->target_map[i]); + + if (!dport) { + /* dport may be activated later */ + continue; + } + cxlsd->target[i] = dport; + } + + return 0; +} + +static struct lock_class_key cxl_decoder_key; + +/** + * cxl_decoder_init - Common decoder setup / initialization + * @port: owning port of this decoder + * @cxld: common decoder properties to initialize + * + * A port may contain one or more decoders. Each of those decoders + * enable some address space for CXL.mem utilization. 
A decoder is + * expected to be configured by the caller before registering via + * cxl_decoder_add() + */ +static int cxl_decoder_init(struct cxl_port *port, struct cxl_decoder *cxld) +{ + struct device *dev; + int rc; + + rc = ida_alloc(&port->decoder_ida, GFP_KERNEL); + if (rc < 0) + return rc; + + /* need parent to stick around to release the id */ + get_device(&port->dev); + cxld->id = rc; + + dev = &cxld->dev; + device_initialize(dev); + lockdep_set_class(&dev->mutex, &cxl_decoder_key); + device_set_pm_not_required(dev); + dev->parent = &port->dev; + dev->bus = &cxl_bus_type; + + /* Pre initialize an "empty" decoder */ + cxld->interleave_ways = 1; + cxld->interleave_granularity = PAGE_SIZE; + cxld->target_type = CXL_DECODER_HOSTONLYMEM; + cxld->hpa_range = (struct range) { + .start = 0, + .end = -1, + }; + + return 0; +} + +static int cxl_switch_decoder_init(struct cxl_port *port, + struct cxl_switch_decoder *cxlsd, + int nr_targets) +{ + if (nr_targets > CXL_DECODER_MAX_INTERLEAVE) + return -EINVAL; + + cxlsd->nr_targets = nr_targets; + return cxl_decoder_init(port, &cxlsd->cxld); +} + +/** + * cxl_root_decoder_alloc - Allocate a root level decoder + * @port: owning CXL root of this decoder + * @nr_targets: static number of downstream targets + * + * Return: A new cxl decoder to be registered by cxl_decoder_add(). A + * 'CXL root' decoder is one that decodes from a top-level / static platform + * firmware description of CXL resources into a CXL standard decode + * topology. + */ +struct cxl_root_decoder *cxl_root_decoder_alloc(struct cxl_port *port, + unsigned int nr_targets) +{ + struct cxl_root_decoder *cxlrd; + struct cxl_switch_decoder *cxlsd; + struct cxl_decoder *cxld; + int rc; + + if (!is_cxl_root(port)) + return ERR_PTR(-EINVAL); + + cxlrd = kzalloc(struct_size(cxlrd, cxlsd.target, nr_targets), + GFP_KERNEL); + if (!cxlrd) + return ERR_PTR(-ENOMEM); + + cxlsd = &cxlrd->cxlsd; + rc = cxl_switch_decoder_init(port, cxlsd, nr_targets); + if (rc) { + kfree(cxlrd); + return ERR_PTR(rc); + } + + mutex_init(&cxlrd->range_lock); + + cxld = &cxlsd->cxld; + cxld->dev.type = &cxl_decoder_root_type; + /* + * cxl_root_decoder_release() special cases negative ids to + * detect memregion_alloc() failures. + */ + atomic_set(&cxlrd->region_id, -1); + rc = memregion_alloc(GFP_KERNEL); + if (rc < 0) { + put_device(&cxld->dev); + return ERR_PTR(rc); + } + + atomic_set(&cxlrd->region_id, rc); + cxlrd->qos_class = CXL_QOS_CLASS_INVALID; + return cxlrd; +} +EXPORT_SYMBOL_NS_GPL(cxl_root_decoder_alloc, "CXL"); + +/** + * cxl_switch_decoder_alloc - Allocate a switch level decoder + * @port: owning CXL switch port of this decoder + * @nr_targets: max number of dynamically addressable downstream targets + * + * Return: A new cxl decoder to be registered by cxl_decoder_add(). A + * 'switch' decoder is any decoder that can be enumerated by PCIe + * topology and the HDM Decoder Capability. This includes the decoders + * that sit between Switch Upstream Ports / Switch Downstream Ports and + * Host Bridges / Root Ports. 
+ */ +struct cxl_switch_decoder *cxl_switch_decoder_alloc(struct cxl_port *port, + unsigned int nr_targets) +{ + struct cxl_switch_decoder *cxlsd; + struct cxl_decoder *cxld; + int rc; + + if (is_cxl_root(port) || is_cxl_endpoint(port)) + return ERR_PTR(-EINVAL); + + cxlsd = kzalloc(struct_size(cxlsd, target, nr_targets), GFP_KERNEL); + if (!cxlsd) + return ERR_PTR(-ENOMEM); + + rc = cxl_switch_decoder_init(port, cxlsd, nr_targets); + if (rc) { + kfree(cxlsd); + return ERR_PTR(rc); + } + + cxld = &cxlsd->cxld; + cxld->dev.type = &cxl_decoder_switch_type; + return cxlsd; +} +EXPORT_SYMBOL_NS_GPL(cxl_switch_decoder_alloc, "CXL"); + +/** + * cxl_endpoint_decoder_alloc - Allocate an endpoint decoder + * @port: owning port of this decoder + * + * Return: A new cxl decoder to be registered by cxl_decoder_add() + */ +struct cxl_endpoint_decoder *cxl_endpoint_decoder_alloc(struct cxl_port *port) +{ + struct cxl_endpoint_decoder *cxled; + struct cxl_decoder *cxld; + int rc; + + if (!is_cxl_endpoint(port)) + return ERR_PTR(-EINVAL); + + cxled = kzalloc(sizeof(*cxled), GFP_KERNEL); + if (!cxled) + return ERR_PTR(-ENOMEM); + + cxled->pos = -1; + cxled->part = -1; + cxld = &cxled->cxld; + rc = cxl_decoder_init(port, cxld); + if (rc) { + kfree(cxled); + return ERR_PTR(rc); + } + + cxld->dev.type = &cxl_decoder_endpoint_type; + return cxled; +} +EXPORT_SYMBOL_NS_GPL(cxl_endpoint_decoder_alloc, "CXL"); + +/** + * cxl_decoder_add_locked - Add a decoder with targets + * @cxld: The cxl decoder allocated by cxl_<type>_decoder_alloc() + * + * Certain types of decoders may not have any targets. The main example of this + * is an endpoint device. A more awkward example is a hostbridge whose root + * ports get hot added (technically possible, though unlikely). + * + * This is the locked variant of cxl_decoder_add(). + * + * Context: Process context. Expects the device lock of the port that owns the + * @cxld to be held. + * + * Return: Negative error code if the decoder wasn't properly configured; else + * returns 0. + */ +int cxl_decoder_add_locked(struct cxl_decoder *cxld) +{ + struct cxl_port *port; + struct device *dev; + int rc; + + if (WARN_ON_ONCE(!cxld)) + return -EINVAL; + + if (WARN_ON_ONCE(IS_ERR(cxld))) + return PTR_ERR(cxld); + + if (cxld->interleave_ways < 1) + return -EINVAL; + + dev = &cxld->dev; + + port = to_cxl_port(cxld->dev.parent); + if (!is_endpoint_decoder(dev)) { + struct cxl_switch_decoder *cxlsd = to_cxl_switch_decoder(dev); + + rc = decoder_populate_targets(cxlsd, port); + if (rc && (cxld->flags & CXL_DECODER_F_ENABLE)) { + dev_err(&port->dev, + "Failed to populate active decoder targets\n"); + return rc; + } + } + + rc = dev_set_name(dev, "decoder%d.%d", port->id, cxld->id); + if (rc) + return rc; + + return device_add(dev); +} +EXPORT_SYMBOL_NS_GPL(cxl_decoder_add_locked, "CXL"); + +/** + * cxl_decoder_add - Add a decoder with targets + * @cxld: The cxl decoder allocated by cxl_<type>_decoder_alloc() + * + * This is the unlocked variant of cxl_decoder_add_locked(). + * See cxl_decoder_add_locked(). + * + * Context: Process context. Takes and releases the device lock of the port that + * owns the @cxld. 
+ */ +int cxl_decoder_add(struct cxl_decoder *cxld) +{ + struct cxl_port *port; + + if (WARN_ON_ONCE(!cxld)) + return -EINVAL; + + if (WARN_ON_ONCE(IS_ERR(cxld))) + return PTR_ERR(cxld); + + port = to_cxl_port(cxld->dev.parent); + + guard(device)(&port->dev); + return cxl_decoder_add_locked(cxld); +} +EXPORT_SYMBOL_NS_GPL(cxl_decoder_add, "CXL"); + +static void cxld_unregister(void *dev) +{ + if (is_endpoint_decoder(dev)) + cxl_decoder_detach(NULL, to_cxl_endpoint_decoder(dev), -1, + DETACH_INVALIDATE); + + device_unregister(dev); +} + +int cxl_decoder_autoremove(struct device *host, struct cxl_decoder *cxld) +{ + return devm_add_action_or_reset(host, cxld_unregister, &cxld->dev); +} +EXPORT_SYMBOL_NS_GPL(cxl_decoder_autoremove, "CXL"); + +/** + * __cxl_driver_register - register a driver for the cxl bus + * @cxl_drv: cxl driver structure to attach + * @owner: owning module/driver + * @modname: KBUILD_MODNAME for parent driver + */ +int __cxl_driver_register(struct cxl_driver *cxl_drv, struct module *owner, + const char *modname) +{ + if (!cxl_drv->probe) { + pr_debug("%s ->probe() must be specified\n", modname); + return -EINVAL; + } + + if (!cxl_drv->name) { + pr_debug("%s ->name must be specified\n", modname); + return -EINVAL; + } + + if (!cxl_drv->id) { + pr_debug("%s ->id must be specified\n", modname); + return -EINVAL; + } + + cxl_drv->drv.bus = &cxl_bus_type; + cxl_drv->drv.owner = owner; + cxl_drv->drv.mod_name = modname; + cxl_drv->drv.name = cxl_drv->name; + + return driver_register(&cxl_drv->drv); +} +EXPORT_SYMBOL_NS_GPL(__cxl_driver_register, "CXL"); + +void cxl_driver_unregister(struct cxl_driver *cxl_drv) +{ + driver_unregister(&cxl_drv->drv); +} +EXPORT_SYMBOL_NS_GPL(cxl_driver_unregister, "CXL"); + +static int cxl_bus_uevent(const struct device *dev, struct kobj_uevent_env *env) +{ + return add_uevent_var(env, "MODALIAS=" CXL_MODALIAS_FMT, + cxl_device_id(dev)); +} + +static int cxl_bus_match(struct device *dev, const struct device_driver *drv) +{ + return cxl_device_id(dev) == to_cxl_drv(drv)->id; +} + +static int cxl_bus_probe(struct device *dev) +{ + int rc; + + rc = to_cxl_drv(dev->driver)->probe(dev); + dev_dbg(dev, "probe: %d\n", rc); + return rc; +} + +static void cxl_bus_remove(struct device *dev) +{ + struct cxl_driver *cxl_drv = to_cxl_drv(dev->driver); + + if (cxl_drv->remove) + cxl_drv->remove(dev); +} + +static struct workqueue_struct *cxl_bus_wq; + +static int cxl_rescan_attach(struct device *dev, void *data) +{ + int rc = device_attach(dev); + + dev_vdbg(dev, "rescan: %s\n", rc ? 
"attach" : "detached"); + + return 0; +} + +static void cxl_bus_rescan_queue(struct work_struct *w) +{ + bus_for_each_dev(&cxl_bus_type, NULL, NULL, cxl_rescan_attach); +} + +void cxl_bus_rescan(void) +{ + static DECLARE_WORK(rescan_work, cxl_bus_rescan_queue); + + queue_work(cxl_bus_wq, &rescan_work); +} +EXPORT_SYMBOL_NS_GPL(cxl_bus_rescan, "CXL"); + +void cxl_bus_drain(void) +{ + drain_workqueue(cxl_bus_wq); +} +EXPORT_SYMBOL_NS_GPL(cxl_bus_drain, "CXL"); + +bool schedule_cxl_memdev_detach(struct cxl_memdev *cxlmd) +{ + return queue_work(cxl_bus_wq, &cxlmd->detach_work); +} +EXPORT_SYMBOL_NS_GPL(schedule_cxl_memdev_detach, "CXL"); + +static void add_latency(struct access_coordinate *c, long latency) +{ + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) { + c[i].write_latency += latency; + c[i].read_latency += latency; + } +} + +static bool coordinates_valid(struct access_coordinate *c) +{ + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) { + if (c[i].read_bandwidth && c[i].write_bandwidth && + c[i].read_latency && c[i].write_latency) + continue; + return false; + } + + return true; +} + +static void set_min_bandwidth(struct access_coordinate *c, unsigned int bw) +{ + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) { + c[i].write_bandwidth = min(c[i].write_bandwidth, bw); + c[i].read_bandwidth = min(c[i].read_bandwidth, bw); + } +} + +static void set_access_coordinates(struct access_coordinate *out, + struct access_coordinate *in) +{ + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) + out[i] = in[i]; +} + +static bool parent_port_is_cxl_root(struct cxl_port *port) +{ + return is_cxl_root(to_cxl_port(port->dev.parent)); +} + +/** + * cxl_endpoint_get_perf_coordinates - Retrieve performance numbers stored in dports + * of CXL path + * @port: endpoint cxl_port + * @coord: output performance data + * + * Return: errno on failure, 0 on success. + */ +int cxl_endpoint_get_perf_coordinates(struct cxl_port *port, + struct access_coordinate *coord) +{ + struct cxl_memdev *cxlmd = to_cxl_memdev(port->uport_dev); + struct access_coordinate c[] = { + { + .read_bandwidth = UINT_MAX, + .write_bandwidth = UINT_MAX, + }, + { + .read_bandwidth = UINT_MAX, + .write_bandwidth = UINT_MAX, + }, + }; + struct cxl_port *iter = port; + struct cxl_dport *dport; + struct pci_dev *pdev; + struct device *dev; + unsigned int bw; + bool is_cxl_root; + + if (!is_cxl_endpoint(port)) + return -EINVAL; + + /* + * Skip calculation for RCD. Expectation is HMAT already covers RCD case + * since RCH does not support hotplug. + */ + if (cxlmd->cxlds->rcd) + return 0; + + /* + * Exit the loop when the parent port of the current iter port is cxl + * root. The iterative loop starts at the endpoint and gathers the + * latency of the CXL link from the current device/port to the connected + * downstream port each iteration. + */ + do { + dport = iter->parent_dport; + iter = to_cxl_port(iter->dev.parent); + is_cxl_root = parent_port_is_cxl_root(iter); + + /* + * There's no valid access_coordinate for a root port since RPs do not + * have CDAT and therefore needs to be skipped. 
+ */ + if (!is_cxl_root) { + if (!coordinates_valid(dport->coord)) + return -EINVAL; + cxl_coordinates_combine(c, c, dport->coord); + } + add_latency(c, dport->link_latency); + } while (!is_cxl_root); + + dport = iter->parent_dport; + /* Retrieve HB coords */ + if (!coordinates_valid(dport->coord)) + return -EINVAL; + cxl_coordinates_combine(c, c, dport->coord); + + dev = port->uport_dev->parent; + if (!dev_is_pci(dev)) + return -ENODEV; + + /* Get the calculated PCI paths bandwidth */ + pdev = to_pci_dev(dev); + bw = pcie_bandwidth_available(pdev, NULL, NULL, NULL); + if (bw == 0) + return -ENXIO; + bw /= BITS_PER_BYTE; + + set_min_bandwidth(c, bw); + set_access_coordinates(coord, c); + + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_endpoint_get_perf_coordinates, "CXL"); + +int cxl_port_get_switch_dport_bandwidth(struct cxl_port *port, + struct access_coordinate *c) +{ + struct cxl_dport *dport = port->parent_dport; + + /* Check this port is connected to a switch DSP and not an RP */ + if (parent_port_is_cxl_root(to_cxl_port(port->dev.parent))) + return -ENODEV; + + if (!coordinates_valid(dport->coord)) + return -EINVAL; + + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) { + c[i].read_bandwidth = dport->coord[i].read_bandwidth; + c[i].write_bandwidth = dport->coord[i].write_bandwidth; + } + + return 0; +} + +/* for user tooling to ensure port disable work has completed */ +static ssize_t flush_store(const struct bus_type *bus, const char *buf, size_t count) +{ + if (sysfs_streq(buf, "1")) { + flush_workqueue(cxl_bus_wq); + return count; + } + + return -EINVAL; +} + +static BUS_ATTR_WO(flush); + +static struct attribute *cxl_bus_attributes[] = { + &bus_attr_flush.attr, + NULL, +}; + +static struct attribute_group cxl_bus_attribute_group = { + .attrs = cxl_bus_attributes, +}; + +static const struct attribute_group *cxl_bus_attribute_groups[] = { + &cxl_bus_attribute_group, + NULL, +}; + +const struct bus_type cxl_bus_type = { + .name = "cxl", + .uevent = cxl_bus_uevent, + .match = cxl_bus_match, + .probe = cxl_bus_probe, + .remove = cxl_bus_remove, + .bus_groups = cxl_bus_attribute_groups, +}; +EXPORT_SYMBOL_NS_GPL(cxl_bus_type, "CXL"); + +static struct dentry *cxl_debugfs; + +struct dentry *cxl_debugfs_create_dir(const char *dir) +{ + return debugfs_create_dir(dir, cxl_debugfs); +} +EXPORT_SYMBOL_NS_GPL(cxl_debugfs_create_dir, "CXL"); + +static __init int cxl_core_init(void) +{ + int rc; + + cxl_debugfs = debugfs_create_dir("cxl", NULL); + + if (einj_cxl_is_initialized()) + debugfs_create_file("einj_types", 0400, cxl_debugfs, NULL, + &einj_cxl_available_error_type_fops); + + cxl_mbox_init(); + + rc = cxl_memdev_init(); + if (rc) + return rc; + + cxl_bus_wq = alloc_ordered_workqueue("cxl_port", 0); + if (!cxl_bus_wq) { + rc = -ENOMEM; + goto err_wq; + } + + rc = bus_register(&cxl_bus_type); + if (rc) + goto err_bus; + + rc = cxl_region_init(); + if (rc) + goto err_region; + + rc = cxl_ras_init(); + if (rc) + goto err_ras; + + return 0; + +err_ras: + cxl_region_exit(); +err_region: + bus_unregister(&cxl_bus_type); +err_bus: + destroy_workqueue(cxl_bus_wq); +err_wq: + cxl_memdev_exit(); + return rc; +} + +static void cxl_core_exit(void) +{ + cxl_ras_exit(); + cxl_region_exit(); + bus_unregister(&cxl_bus_type); + destroy_workqueue(cxl_bus_wq); + cxl_memdev_exit(); + debugfs_remove_recursive(cxl_debugfs); +} + +subsys_initcall(cxl_core_init); +module_exit(cxl_core_exit); +MODULE_DESCRIPTION("CXL: Core Compute Express Link support"); +MODULE_LICENSE("GPL v2"); +MODULE_IMPORT_NS("CXL"); diff --git 
a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c new file mode 100644 index 000000000000..2731ba3a0799 --- /dev/null +++ b/drivers/cxl/core/ras.c @@ -0,0 +1,126 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2025 AMD Corporation. All rights reserved. */ + +#include <linux/pci.h> +#include <linux/aer.h> +#include <cxl/event.h> +#include <cxlmem.h> +#include "trace.h" + +static void cxl_cper_trace_corr_port_prot_err(struct pci_dev *pdev, + struct cxl_ras_capability_regs ras_cap) +{ + u32 status = ras_cap.cor_status & ~ras_cap.cor_mask; + + trace_cxl_port_aer_correctable_error(&pdev->dev, status); +} + +static void cxl_cper_trace_uncorr_port_prot_err(struct pci_dev *pdev, + struct cxl_ras_capability_regs ras_cap) +{ + u32 status = ras_cap.uncor_status & ~ras_cap.uncor_mask; + u32 fe; + + if (hweight32(status) > 1) + fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK, + ras_cap.cap_control)); + else + fe = status; + + trace_cxl_port_aer_uncorrectable_error(&pdev->dev, status, fe, + ras_cap.header_log); +} + +static void cxl_cper_trace_corr_prot_err(struct cxl_memdev *cxlmd, + struct cxl_ras_capability_regs ras_cap) +{ + u32 status = ras_cap.cor_status & ~ras_cap.cor_mask; + + trace_cxl_aer_correctable_error(cxlmd, status); +} + +static void +cxl_cper_trace_uncorr_prot_err(struct cxl_memdev *cxlmd, + struct cxl_ras_capability_regs ras_cap) +{ + u32 status = ras_cap.uncor_status & ~ras_cap.uncor_mask; + u32 fe; + + if (hweight32(status) > 1) + fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK, + ras_cap.cap_control)); + else + fe = status; + + trace_cxl_aer_uncorrectable_error(cxlmd, status, fe, + ras_cap.header_log); +} + +static int match_memdev_by_parent(struct device *dev, const void *uport) +{ + if (is_cxl_memdev(dev) && dev->parent == uport) + return 1; + return 0; +} + +static void cxl_cper_handle_prot_err(struct cxl_cper_prot_err_work_data *data) +{ + unsigned int devfn = PCI_DEVFN(data->prot_err.agent_addr.device, + data->prot_err.agent_addr.function); + struct pci_dev *pdev __free(pci_dev_put) = + pci_get_domain_bus_and_slot(data->prot_err.agent_addr.segment, + data->prot_err.agent_addr.bus, + devfn); + struct cxl_memdev *cxlmd; + int port_type; + + if (!pdev) + return; + + port_type = pci_pcie_type(pdev); + if (port_type == PCI_EXP_TYPE_ROOT_PORT || + port_type == PCI_EXP_TYPE_DOWNSTREAM || + port_type == PCI_EXP_TYPE_UPSTREAM) { + if (data->severity == AER_CORRECTABLE) + cxl_cper_trace_corr_port_prot_err(pdev, data->ras_cap); + else + cxl_cper_trace_uncorr_port_prot_err(pdev, data->ras_cap); + + return; + } + + guard(device)(&pdev->dev); + if (!pdev->dev.driver) + return; + + struct device *mem_dev __free(put_device) = bus_find_device( + &cxl_bus_type, NULL, pdev, match_memdev_by_parent); + if (!mem_dev) + return; + + cxlmd = to_cxl_memdev(mem_dev); + if (data->severity == AER_CORRECTABLE) + cxl_cper_trace_corr_prot_err(cxlmd, data->ras_cap); + else + cxl_cper_trace_uncorr_prot_err(cxlmd, data->ras_cap); +} + +static void cxl_cper_prot_err_work_fn(struct work_struct *work) +{ + struct cxl_cper_prot_err_work_data wd; + + while (cxl_cper_prot_err_kfifo_get(&wd)) + cxl_cper_handle_prot_err(&wd); +} +static DECLARE_WORK(cxl_cper_prot_err_work, cxl_cper_prot_err_work_fn); + +int cxl_ras_init(void) +{ + return cxl_cper_register_prot_err_work(&cxl_cper_prot_err_work); +} + +void cxl_ras_exit(void) +{ + cxl_cper_unregister_prot_err_work(&cxl_cper_prot_err_work); + cancel_work_sync(&cxl_cper_prot_err_work); +} diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c new 
file mode 100644 index 000000000000..ae899f68551f --- /dev/null +++ b/drivers/cxl/core/region.c @@ -0,0 +1,4003 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2022 Intel Corporation. All rights reserved. */ +#include <linux/memregion.h> +#include <linux/genalloc.h> +#include <linux/debugfs.h> +#include <linux/device.h> +#include <linux/module.h> +#include <linux/memory.h> +#include <linux/slab.h> +#include <linux/uuid.h> +#include <linux/sort.h> +#include <linux/idr.h> +#include <linux/memory-tiers.h> +#include <linux/string_choices.h> +#include <cxlmem.h> +#include <cxl.h> +#include "core.h" + +/** + * DOC: cxl core region + * + * CXL Regions represent mapped memory capacity in system physical address + * space. Whereas the CXL Root Decoders identify the bounds of potential CXL + * Memory ranges, Regions represent the active mapped capacity by the HDM + * Decoder Capability structures throughout the Host Bridges, Switches, and + * Endpoints in the topology. + * + * Region configuration has ordering constraints. UUID may be set at any time + * but is only visible for persistent regions. + * 1. Interleave granularity + * 2. Interleave size + * 3. Decoder targets + */ + +/* + * nodemask that sets per node when the access_coordinates for the node has + * been updated by the CXL memory hotplug notifier. + */ +static nodemask_t nodemask_region_seen = NODE_MASK_NONE; + +static struct cxl_region *to_cxl_region(struct device *dev); + +#define __ACCESS_ATTR_RO(_level, _name) { \ + .attr = { .name = __stringify(_name), .mode = 0444 }, \ + .show = _name##_access##_level##_show, \ +} + +#define ACCESS_DEVICE_ATTR_RO(level, name) \ + struct device_attribute dev_attr_access##level##_##name = __ACCESS_ATTR_RO(level, name) + +#define ACCESS_ATTR_RO(level, attrib) \ +static ssize_t attrib##_access##level##_show(struct device *dev, \ + struct device_attribute *attr, \ + char *buf) \ +{ \ + struct cxl_region *cxlr = to_cxl_region(dev); \ + \ + if (cxlr->coord[level].attrib == 0) \ + return -ENOENT; \ + \ + return sysfs_emit(buf, "%u\n", cxlr->coord[level].attrib); \ +} \ +static ACCESS_DEVICE_ATTR_RO(level, attrib) + +ACCESS_ATTR_RO(0, read_bandwidth); +ACCESS_ATTR_RO(0, read_latency); +ACCESS_ATTR_RO(0, write_bandwidth); +ACCESS_ATTR_RO(0, write_latency); + +#define ACCESS_ATTR_DECLARE(level, attrib) \ + (&dev_attr_access##level##_##attrib.attr) + +static struct attribute *access0_coordinate_attrs[] = { + ACCESS_ATTR_DECLARE(0, read_bandwidth), + ACCESS_ATTR_DECLARE(0, write_bandwidth), + ACCESS_ATTR_DECLARE(0, read_latency), + ACCESS_ATTR_DECLARE(0, write_latency), + NULL +}; + +ACCESS_ATTR_RO(1, read_bandwidth); +ACCESS_ATTR_RO(1, read_latency); +ACCESS_ATTR_RO(1, write_bandwidth); +ACCESS_ATTR_RO(1, write_latency); + +static struct attribute *access1_coordinate_attrs[] = { + ACCESS_ATTR_DECLARE(1, read_bandwidth), + ACCESS_ATTR_DECLARE(1, write_bandwidth), + ACCESS_ATTR_DECLARE(1, read_latency), + ACCESS_ATTR_DECLARE(1, write_latency), + NULL +}; + +#define ACCESS_VISIBLE(level) \ +static umode_t cxl_region_access##level##_coordinate_visible( \ + struct kobject *kobj, struct attribute *a, int n) \ +{ \ + struct device *dev = kobj_to_dev(kobj); \ + struct cxl_region *cxlr = to_cxl_region(dev); \ + \ + if (a == &dev_attr_access##level##_read_latency.attr && \ + cxlr->coord[level].read_latency == 0) \ + return 0; \ + \ + if (a == &dev_attr_access##level##_write_latency.attr && \ + cxlr->coord[level].write_latency == 0) \ + return 0; \ + \ + if (a == 
&dev_attr_access##level##_read_bandwidth.attr && \ + cxlr->coord[level].read_bandwidth == 0) \ + return 0; \ + \ + if (a == &dev_attr_access##level##_write_bandwidth.attr && \ + cxlr->coord[level].write_bandwidth == 0) \ + return 0; \ + \ + return a->mode; \ +} + +ACCESS_VISIBLE(0); +ACCESS_VISIBLE(1); + +static const struct attribute_group cxl_region_access0_coordinate_group = { + .name = "access0", + .attrs = access0_coordinate_attrs, + .is_visible = cxl_region_access0_coordinate_visible, +}; + +static const struct attribute_group *get_cxl_region_access0_group(void) +{ + return &cxl_region_access0_coordinate_group; +} + +static const struct attribute_group cxl_region_access1_coordinate_group = { + .name = "access1", + .attrs = access1_coordinate_attrs, + .is_visible = cxl_region_access1_coordinate_visible, +}; + +static const struct attribute_group *get_cxl_region_access1_group(void) +{ + return &cxl_region_access1_coordinate_group; +} + +static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + ssize_t rc; + + ACQUIRE(rwsem_read_intr, region_rwsem)(&cxl_rwsem.region); + if ((rc = ACQUIRE_ERR(rwsem_read_intr, ®ion_rwsem))) + return rc; + if (cxlr->mode != CXL_PARTMODE_PMEM) + return sysfs_emit(buf, "\n"); + return sysfs_emit(buf, "%pUb\n", &p->uuid); +} + +static int is_dup(struct device *match, void *data) +{ + struct cxl_region_params *p; + struct cxl_region *cxlr; + uuid_t *uuid = data; + + if (!is_cxl_region(match)) + return 0; + + lockdep_assert_held(&cxl_rwsem.region); + cxlr = to_cxl_region(match); + p = &cxlr->params; + + if (uuid_equal(&p->uuid, uuid)) { + dev_dbg(match, "already has uuid: %pUb\n", uuid); + return -EBUSY; + } + + return 0; +} + +static ssize_t uuid_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + uuid_t temp; + ssize_t rc; + + if (len != UUID_STRING_LEN + 1) + return -EINVAL; + + rc = uuid_parse(buf, &temp); + if (rc) + return rc; + + if (uuid_is_null(&temp)) + return -EINVAL; + + ACQUIRE(rwsem_write_kill, region_rwsem)(&cxl_rwsem.region); + if ((rc = ACQUIRE_ERR(rwsem_write_kill, ®ion_rwsem))) + return rc; + + if (uuid_equal(&p->uuid, &temp)) + return len; + + if (p->state >= CXL_CONFIG_ACTIVE) + return -EBUSY; + + rc = bus_for_each_dev(&cxl_bus_type, NULL, &temp, is_dup); + if (rc < 0) + return rc; + + uuid_copy(&p->uuid, &temp); + + return len; +} +static DEVICE_ATTR_RW(uuid); + +static struct cxl_region_ref *cxl_rr_load(struct cxl_port *port, + struct cxl_region *cxlr) +{ + return xa_load(&port->regions, (unsigned long)cxlr); +} + +static int cxl_region_invalidate_memregion(struct cxl_region *cxlr) +{ + if (!cpu_cache_has_invalidate_memregion()) { + if (IS_ENABLED(CONFIG_CXL_REGION_INVALIDATION_TEST)) { + dev_info_once( + &cxlr->dev, + "Bypassing cpu_cache_invalidate_memregion() for testing!\n"); + return 0; + } + dev_WARN(&cxlr->dev, + "Failed to synchronize CPU cache state\n"); + return -ENXIO; + } + + if (!cxlr->params.res) + return -ENXIO; + cpu_cache_invalidate_memregion(cxlr->params.res->start, + resource_size(cxlr->params.res)); + return 0; +} + +static void cxl_region_decode_reset(struct cxl_region *cxlr, int count) +{ + struct cxl_region_params *p = &cxlr->params; + int i; + + if (test_bit(CXL_REGION_F_LOCK, &cxlr->flags)) + return; + + /* + * Before region teardown attempt to flush, 
evict any data cached for + * this region, or scream loudly about missing arch / platform support + * for CXL teardown. + */ + cxl_region_invalidate_memregion(cxlr); + + for (i = count - 1; i >= 0; i--) { + struct cxl_endpoint_decoder *cxled = p->targets[i]; + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_port *iter = cxled_to_port(cxled); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct cxl_ep *ep; + + if (cxlds->rcd) + goto endpoint_reset; + + while (!is_cxl_root(to_cxl_port(iter->dev.parent))) + iter = to_cxl_port(iter->dev.parent); + + for (ep = cxl_ep_load(iter, cxlmd); iter; + iter = ep->next, ep = cxl_ep_load(iter, cxlmd)) { + struct cxl_region_ref *cxl_rr; + struct cxl_decoder *cxld; + + cxl_rr = cxl_rr_load(iter, cxlr); + cxld = cxl_rr->decoder; + if (cxld->reset) + cxld->reset(cxld); + set_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags); + } + +endpoint_reset: + cxled->cxld.reset(&cxled->cxld); + set_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags); + } + + /* all decoders associated with this region have been torn down */ + clear_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags); +} + +static int commit_decoder(struct cxl_decoder *cxld) +{ + struct cxl_switch_decoder *cxlsd = NULL; + + if (cxld->commit) + return cxld->commit(cxld); + + if (is_switch_decoder(&cxld->dev)) + cxlsd = to_cxl_switch_decoder(&cxld->dev); + + if (dev_WARN_ONCE(&cxld->dev, !cxlsd || cxlsd->nr_targets > 1, + "->commit() is required\n")) + return -ENXIO; + return 0; +} + +static int cxl_region_decode_commit(struct cxl_region *cxlr) +{ + struct cxl_region_params *p = &cxlr->params; + int i, rc = 0; + + for (i = 0; i < p->nr_targets; i++) { + struct cxl_endpoint_decoder *cxled = p->targets[i]; + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_region_ref *cxl_rr; + struct cxl_decoder *cxld; + struct cxl_port *iter; + struct cxl_ep *ep; + + /* commit bottom up */ + for (iter = cxled_to_port(cxled); !is_cxl_root(iter); + iter = to_cxl_port(iter->dev.parent)) { + cxl_rr = cxl_rr_load(iter, cxlr); + cxld = cxl_rr->decoder; + rc = commit_decoder(cxld); + if (rc) + break; + } + + if (rc) { + /* programming @iter failed, teardown */ + for (ep = cxl_ep_load(iter, cxlmd); ep && iter; + iter = ep->next, ep = cxl_ep_load(iter, cxlmd)) { + cxl_rr = cxl_rr_load(iter, cxlr); + cxld = cxl_rr->decoder; + if (cxld->reset) + cxld->reset(cxld); + } + + cxled->cxld.reset(&cxled->cxld); + goto err; + } + } + + return 0; + +err: + /* undo the targets that were successfully committed */ + cxl_region_decode_reset(cxlr, i); + return rc; +} + +static int queue_reset(struct cxl_region *cxlr) +{ + struct cxl_region_params *p = &cxlr->params; + int rc; + + ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region); + if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem))) + return rc; + + /* Already in the requested state? */ + if (p->state < CXL_CONFIG_COMMIT) + return 0; + + p->state = CXL_CONFIG_RESET_PENDING; + + return 0; +} + +static int __commit(struct cxl_region *cxlr) +{ + struct cxl_region_params *p = &cxlr->params; + int rc; + + ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region); + if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem))) + return rc; + + /* Already in the requested state? */ + if (p->state >= CXL_CONFIG_COMMIT) + return 0; + + /* Not ready to commit? 
*/ + if (p->state < CXL_CONFIG_ACTIVE) + return -ENXIO; + + /* + * Invalidate caches before region setup to drop any speculative + * consumption of this address space + */ + rc = cxl_region_invalidate_memregion(cxlr); + if (rc) + return rc; + + rc = cxl_region_decode_commit(cxlr); + if (rc) + return rc; + + p->state = CXL_CONFIG_COMMIT; + + return 0; +} + +static ssize_t commit_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + bool commit; + ssize_t rc; + + rc = kstrtobool(buf, &commit); + if (rc) + return rc; + + if (commit) { + rc = __commit(cxlr); + if (rc) + return rc; + return len; + } + + if (test_bit(CXL_REGION_F_LOCK, &cxlr->flags)) + return -EPERM; + + rc = queue_reset(cxlr); + if (rc) + return rc; + + /* + * Unmap the region and depend the reset-pending state to ensure + * it does not go active again until post reset + */ + device_release_driver(&cxlr->dev); + + /* + * With the reset pending take cxl_rwsem.region unconditionally + * to ensure the reset gets handled before returning. + */ + guard(rwsem_write)(&cxl_rwsem.region); + + /* + * Revalidate that the reset is still pending in case another + * thread already handled this reset. + */ + if (p->state == CXL_CONFIG_RESET_PENDING) { + cxl_region_decode_reset(cxlr, p->interleave_ways); + p->state = CXL_CONFIG_ACTIVE; + } + + return len; +} + +static ssize_t commit_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + ssize_t rc; + + ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region); + if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem))) + return rc; + return sysfs_emit(buf, "%d\n", p->state >= CXL_CONFIG_COMMIT); +} +static DEVICE_ATTR_RW(commit); + +static ssize_t interleave_ways_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + int rc; + + ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region); + if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem))) + return rc; + return sysfs_emit(buf, "%d\n", p->interleave_ways); +} + +static const struct attribute_group *get_cxl_region_target_group(void); + +static ssize_t interleave_ways_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev->parent); + struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld; + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + unsigned int val, save; + int rc; + u8 iw; + + rc = kstrtouint(buf, 0, &val); + if (rc) + return rc; + + rc = ways_to_eiw(val, &iw); + if (rc) + return rc; + + /* + * Even for x3, x6, and x12 interleaves the region interleave must be a + * power of 2 multiple of the host bridge interleave. 
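+ * For example, a x2 host bridge interleave supports x2, x4, x8, ...
+ * region interleave, and a x3 host bridge interleave supports x3, x6,
+ * x12, ... region interleave, per the is_power_of_2() check below.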
+ */ + if (!is_power_of_2(val / cxld->interleave_ways) || + (val % cxld->interleave_ways)) { + dev_dbg(&cxlr->dev, "invalid interleave: %d\n", val); + return -EINVAL; + } + + ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region); + if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem))) + return rc; + + if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE) + return -EBUSY; + + save = p->interleave_ways; + p->interleave_ways = val; + rc = sysfs_update_group(&cxlr->dev.kobj, get_cxl_region_target_group()); + if (rc) { + p->interleave_ways = save; + return rc; + } + + return len; +} +static DEVICE_ATTR_RW(interleave_ways); + +static ssize_t interleave_granularity_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + int rc; + + ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region); + if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem))) + return rc; + return sysfs_emit(buf, "%d\n", p->interleave_granularity); +} + +static ssize_t interleave_granularity_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev->parent); + struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld; + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + int rc, val; + u16 ig; + + rc = kstrtoint(buf, 0, &val); + if (rc) + return rc; + + rc = granularity_to_eig(val, &ig); + if (rc) + return rc; + + /* + * When the host-bridge is interleaved, disallow region granularity != + * root granularity. Regions with a granularity less than the root + * interleave result in needing multiple endpoints to support a single + * slot in the interleave (possible to support in the future). Regions + * with a granularity greater than the root interleave result in invalid + * DPA translations (invalid to support). + */ + if (cxld->interleave_ways > 1 && val != cxld->interleave_granularity) + return -EINVAL; + + ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region); + if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem))) + return rc; + + if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE) + return -EBUSY; + + p->interleave_granularity = val; + + return len; +} +static DEVICE_ATTR_RW(interleave_granularity); + +static ssize_t resource_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + u64 resource = -1ULL; + int rc; + + ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region); + if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem))) + return rc; + + if (p->res) + resource = p->res->start; + return sysfs_emit(buf, "%#llx\n", resource); +} +static DEVICE_ATTR_RO(resource); + +static ssize_t mode_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + const char *desc; + + if (cxlr->mode == CXL_PARTMODE_RAM) + desc = "ram"; + else if (cxlr->mode == CXL_PARTMODE_PMEM) + desc = "pmem"; + else + desc = ""; + + return sysfs_emit(buf, "%s\n", desc); +} +static DEVICE_ATTR_RO(mode); + +static int alloc_hpa(struct cxl_region *cxlr, resource_size_t size) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); + struct cxl_region_params *p = &cxlr->params; + struct resource *res; + u64 remainder = 0; + + lockdep_assert_held_write(&cxl_rwsem.region); + + /* Nothing to do... 
*/ + if (p->res && resource_size(p->res) == size) + return 0; + + /* To change size the old size must be freed first */ + if (p->res) + return -EBUSY; + + if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE) + return -EBUSY; + + /* ways, granularity and uuid (if PMEM) need to be set before HPA */ + if (!p->interleave_ways || !p->interleave_granularity || + (cxlr->mode == CXL_PARTMODE_PMEM && uuid_is_null(&p->uuid))) + return -ENXIO; + + div64_u64_rem(size, (u64)SZ_256M * p->interleave_ways, &remainder); + if (remainder) + return -EINVAL; + + res = alloc_free_mem_region(cxlrd->res, size, SZ_256M, + dev_name(&cxlr->dev)); + if (IS_ERR(res)) { + dev_dbg(&cxlr->dev, + "HPA allocation error (%ld) for size:%pap in %s %pr\n", + PTR_ERR(res), &size, cxlrd->res->name, cxlrd->res); + return PTR_ERR(res); + } + + p->res = res; + p->state = CXL_CONFIG_INTERLEAVE_ACTIVE; + + return 0; +} + +static void cxl_region_iomem_release(struct cxl_region *cxlr) +{ + struct cxl_region_params *p = &cxlr->params; + + if (device_is_registered(&cxlr->dev)) + lockdep_assert_held_write(&cxl_rwsem.region); + if (p->res) { + /* + * Autodiscovered regions may not have been able to insert their + * resource. + */ + if (p->res->parent) + remove_resource(p->res); + kfree(p->res); + p->res = NULL; + } +} + +static int free_hpa(struct cxl_region *cxlr) +{ + struct cxl_region_params *p = &cxlr->params; + + lockdep_assert_held_write(&cxl_rwsem.region); + + if (!p->res) + return 0; + + if (p->state >= CXL_CONFIG_ACTIVE) + return -EBUSY; + + cxl_region_iomem_release(cxlr); + p->state = CXL_CONFIG_IDLE; + return 0; +} + +static ssize_t size_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + u64 val; + int rc; + + rc = kstrtou64(buf, 0, &val); + if (rc) + return rc; + + ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region); + if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem))) + return rc; + + if (val) + rc = alloc_hpa(cxlr, val); + else + rc = free_hpa(cxlr); + + if (rc) + return rc; + + return len; +} + +static ssize_t size_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + u64 size = 0; + ssize_t rc; + + ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region); + if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem))) + return rc; + if (p->res) + size = resource_size(p->res); + return sysfs_emit(buf, "%#llx\n", size); +} +static DEVICE_ATTR_RW(size); + +static ssize_t extended_linear_cache_size_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + ssize_t rc; + + ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region); + if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem))) + return rc; + return sysfs_emit(buf, "%#llx\n", p->cache_size); +} +static DEVICE_ATTR_RO(extended_linear_cache_size); + +static struct attribute *cxl_region_attrs[] = { + &dev_attr_uuid.attr, + &dev_attr_commit.attr, + &dev_attr_interleave_ways.attr, + &dev_attr_interleave_granularity.attr, + &dev_attr_resource.attr, + &dev_attr_size.attr, + &dev_attr_mode.attr, + &dev_attr_extended_linear_cache_size.attr, + NULL, +}; + +static umode_t cxl_region_visible(struct kobject *kobj, struct attribute *a, + int n) +{ + struct device *dev = kobj_to_dev(kobj); + struct cxl_region *cxlr = to_cxl_region(dev); + + /* + * Support tooling that expects to find a 'uuid' attribute for all + 
* regions regardless of mode. + */ + if (a == &dev_attr_uuid.attr && cxlr->mode != CXL_PARTMODE_PMEM) + return 0444; + + /* + * Don't display extended linear cache attribute if there is no + * extended linear cache. + */ + if (a == &dev_attr_extended_linear_cache_size.attr && + cxlr->params.cache_size == 0) + return 0; + + return a->mode; +} + +static const struct attribute_group cxl_region_group = { + .attrs = cxl_region_attrs, + .is_visible = cxl_region_visible, +}; + +static size_t show_targetN(struct cxl_region *cxlr, char *buf, int pos) +{ + struct cxl_region_params *p = &cxlr->params; + struct cxl_endpoint_decoder *cxled; + int rc; + + ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region); + if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem))) + return rc; + + if (pos >= p->interleave_ways) { + dev_dbg(&cxlr->dev, "position %d out of range %d\n", pos, + p->interleave_ways); + return -ENXIO; + } + + cxled = p->targets[pos]; + if (!cxled) + return sysfs_emit(buf, "\n"); + return sysfs_emit(buf, "%s\n", dev_name(&cxled->cxld.dev)); +} + +static int check_commit_order(struct device *dev, void *data) +{ + struct cxl_decoder *cxld = to_cxl_decoder(dev); + + /* + * if port->commit_end is not the only free decoder, then out of + * order shutdown has occurred, block further allocations until + * that is resolved + */ + if (((cxld->flags & CXL_DECODER_F_ENABLE) == 0)) + return -EBUSY; + return 0; +} + +static int match_free_decoder(struct device *dev, const void *data) +{ + struct cxl_port *port = to_cxl_port(dev->parent); + struct cxl_decoder *cxld; + int rc; + + if (!is_switch_decoder(dev)) + return 0; + + cxld = to_cxl_decoder(dev); + + if (cxld->id != port->commit_end + 1) + return 0; + + if (cxld->region) { + dev_dbg(dev->parent, + "next decoder to commit (%s) is already reserved (%s)\n", + dev_name(dev), dev_name(&cxld->region->dev)); + return 0; + } + + rc = device_for_each_child_reverse_from(dev->parent, dev, NULL, + check_commit_order); + if (rc) { + dev_dbg(dev->parent, + "unable to allocate %s due to out of order shutdown\n", + dev_name(dev)); + return 0; + } + return 1; +} + +static bool spa_maps_hpa(const struct cxl_region_params *p, + const struct range *range) +{ + if (!p->res) + return false; + + /* + * The extended linear cache region is constructed by a 1:1 ratio + * where the SPA maps equal amounts of DRAM and CXL HPA capacity with + * CXL decoders at the high end of the SPA range. + */ + return p->res->start + p->cache_size == range->start && + p->res->end == range->end; +} + +static int match_auto_decoder(struct device *dev, const void *data) +{ + const struct cxl_region_params *p = data; + struct cxl_decoder *cxld; + struct range *r; + + if (!is_switch_decoder(dev)) + return 0; + + cxld = to_cxl_decoder(dev); + r = &cxld->hpa_range; + + if (spa_maps_hpa(p, r)) + return 1; + + return 0; +} + +/** + * cxl_port_pick_region_decoder() - assign or lookup a decoder for a region + * @port: a port in the ancestry of the endpoint implied by @cxled + * @cxled: endpoint decoder to be, or currently, mapped by @port + * @cxlr: region to establish, or validate, decode @port + * + * In the region creation path cxl_port_pick_region_decoder() is an + * allocator to find a free port. In the region assembly path, it is + * recalling the decoder that platform firmware picked for validation + * purposes. + * + * The result is recorded in a 'struct cxl_region_ref' in @port. 
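+ * (alloc_region_ref() creates that reference and cxl_rr_assign_decoder()
+ * records the chosen decoder in it.)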
+ */ +static struct cxl_decoder * +cxl_port_pick_region_decoder(struct cxl_port *port, + struct cxl_endpoint_decoder *cxled, + struct cxl_region *cxlr) +{ + struct device *dev; + + if (port == cxled_to_port(cxled)) + return &cxled->cxld; + + if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) + dev = device_find_child(&port->dev, &cxlr->params, + match_auto_decoder); + else + dev = device_find_child(&port->dev, NULL, match_free_decoder); + if (!dev) + return NULL; + /* + * This decoder is pinned registered as long as the endpoint decoder is + * registered, and endpoint decoder unregistration holds the + * cxl_rwsem.region over unregister events, so no need to hold on to + * this extra reference. + */ + put_device(dev); + return to_cxl_decoder(dev); +} + +static bool auto_order_ok(struct cxl_port *port, struct cxl_region *cxlr_iter, + struct cxl_decoder *cxld) +{ + struct cxl_region_ref *rr = cxl_rr_load(port, cxlr_iter); + struct cxl_decoder *cxld_iter = rr->decoder; + + /* + * Allow the out of order assembly of auto-discovered regions. + * Per CXL Spec 3.1 8.2.4.20.12 software must commit decoders + * in HPA order. Confirm that the decoder with the lesser HPA + * starting address has the lesser id. + */ + dev_dbg(&cxld->dev, "check for HPA violation %s:%d < %s:%d\n", + dev_name(&cxld->dev), cxld->id, + dev_name(&cxld_iter->dev), cxld_iter->id); + + if (cxld_iter->id > cxld->id) + return true; + + return false; +} + +static struct cxl_region_ref * +alloc_region_ref(struct cxl_port *port, struct cxl_region *cxlr, + struct cxl_endpoint_decoder *cxled, + struct cxl_decoder *cxld) +{ + struct cxl_region_params *p = &cxlr->params; + struct cxl_region_ref *cxl_rr, *iter; + unsigned long index; + int rc; + + xa_for_each(&port->regions, index, iter) { + struct cxl_region_params *ip = &iter->region->params; + + if (!ip->res || ip->res->start < p->res->start) + continue; + + if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) { + if (auto_order_ok(port, iter->region, cxld)) + continue; + } + dev_dbg(&cxlr->dev, "%s: HPA order violation %s:%pr vs %pr\n", + dev_name(&port->dev), + dev_name(&iter->region->dev), ip->res, p->res); + + return ERR_PTR(-EBUSY); + } + + cxl_rr = kzalloc(sizeof(*cxl_rr), GFP_KERNEL); + if (!cxl_rr) + return ERR_PTR(-ENOMEM); + cxl_rr->port = port; + cxl_rr->region = cxlr; + cxl_rr->nr_targets = 1; + xa_init(&cxl_rr->endpoints); + + rc = xa_insert(&port->regions, (unsigned long)cxlr, cxl_rr, GFP_KERNEL); + if (rc) { + dev_dbg(&cxlr->dev, + "%s: failed to track region reference: %d\n", + dev_name(&port->dev), rc); + kfree(cxl_rr); + return ERR_PTR(rc); + } + + return cxl_rr; +} + +static void cxl_rr_free_decoder(struct cxl_region_ref *cxl_rr) +{ + struct cxl_region *cxlr = cxl_rr->region; + struct cxl_decoder *cxld = cxl_rr->decoder; + + if (!cxld) + return; + + dev_WARN_ONCE(&cxlr->dev, cxld->region != cxlr, "region mismatch\n"); + if (cxld->region == cxlr) { + cxld->region = NULL; + put_device(&cxlr->dev); + } +} + +static void free_region_ref(struct cxl_region_ref *cxl_rr) +{ + struct cxl_port *port = cxl_rr->port; + struct cxl_region *cxlr = cxl_rr->region; + + cxl_rr_free_decoder(cxl_rr); + xa_erase(&port->regions, (unsigned long)cxlr); + xa_destroy(&cxl_rr->endpoints); + kfree(cxl_rr); +} + +static int cxl_rr_ep_add(struct cxl_region_ref *cxl_rr, + struct cxl_endpoint_decoder *cxled) +{ + int rc; + struct cxl_port *port = cxl_rr->port; + struct cxl_region *cxlr = cxl_rr->region; + struct cxl_decoder *cxld = cxl_rr->decoder; + struct cxl_ep *ep = cxl_ep_load(port, 
cxled_to_memdev(cxled)); + + if (ep) { + rc = xa_insert(&cxl_rr->endpoints, (unsigned long)cxled, ep, + GFP_KERNEL); + if (rc) + return rc; + } + cxl_rr->nr_eps++; + + if (!cxld->region) { + cxld->region = cxlr; + get_device(&cxlr->dev); + } + + return 0; +} + +static int cxl_rr_assign_decoder(struct cxl_port *port, struct cxl_region *cxlr, + struct cxl_endpoint_decoder *cxled, + struct cxl_region_ref *cxl_rr, + struct cxl_decoder *cxld) +{ + if (cxld->region) { + dev_dbg(&cxlr->dev, "%s: %s already attached to %s\n", + dev_name(&port->dev), dev_name(&cxld->dev), + dev_name(&cxld->region->dev)); + return -EBUSY; + } + + /* + * Endpoints should already match the region type, but backstop that + * assumption with an assertion. Switch-decoders change mapping-type + * based on what is mapped when they are assigned to a region. + */ + dev_WARN_ONCE(&cxlr->dev, + port == cxled_to_port(cxled) && + cxld->target_type != cxlr->type, + "%s:%s mismatch decoder type %d -> %d\n", + dev_name(&cxled_to_memdev(cxled)->dev), + dev_name(&cxld->dev), cxld->target_type, cxlr->type); + cxld->target_type = cxlr->type; + cxl_rr->decoder = cxld; + return 0; +} + +static void cxl_region_set_lock(struct cxl_region *cxlr, + struct cxl_decoder *cxld) +{ + if (!test_bit(CXL_DECODER_F_LOCK, &cxld->flags)) + return; + + set_bit(CXL_REGION_F_LOCK, &cxlr->flags); + clear_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags); +} + +/** + * cxl_port_attach_region() - track a region's interest in a port by endpoint + * @port: port to add a new region reference 'struct cxl_region_ref' + * @cxlr: region to attach to @port + * @cxled: endpoint decoder used to create or further pin a region reference + * @pos: interleave position of @cxled in @cxlr + * + * The attach event is an opportunity to validate CXL decode setup + * constraints and record metadata needed for programming HDM decoders, + * in particular decoder target lists. + * + * The steps are: + * + * - validate that there are no other regions with a higher HPA already + * associated with @port + * - establish a region reference if one is not already present + * + * - additionally allocate a decoder instance that will host @cxlr on + * @port + * + * - pin the region reference by the endpoint + * - account for how many entries in @port's target list are needed to + * cover all of the added endpoints. + */ +static int cxl_port_attach_region(struct cxl_port *port, + struct cxl_region *cxlr, + struct cxl_endpoint_decoder *cxled, int pos) +{ + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_ep *ep = cxl_ep_load(port, cxlmd); + struct cxl_region_ref *cxl_rr; + bool nr_targets_inc = false; + struct cxl_decoder *cxld; + unsigned long index; + int rc = -EBUSY; + + lockdep_assert_held_write(&cxl_rwsem.region); + + cxl_rr = cxl_rr_load(port, cxlr); + if (cxl_rr) { + struct cxl_ep *ep_iter; + int found = 0; + + /* + * Walk the existing endpoints that have been attached to + * @cxlr at @port and see if they share the same 'next' port + * in the downstream direction. I.e. endpoints that share common + * upstream switch. + */ + xa_for_each(&cxl_rr->endpoints, index, ep_iter) { + if (ep_iter == ep) + continue; + if (ep_iter->next == ep->next) { + found++; + break; + } + } + + /* + * New target port, or @port is an endpoint port that always + * accounts its own local decode as a target. 
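+		 * For example, a second endpoint that reaches @port through
+		 * the same downstream switch port as an already-attached
+		 * endpoint does not consume an additional target entry.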
+ */ + if (!found || !ep->next) { + cxl_rr->nr_targets++; + nr_targets_inc = true; + } + } else { + struct cxl_decoder *cxld; + + cxld = cxl_port_pick_region_decoder(port, cxled, cxlr); + if (!cxld) { + dev_dbg(&cxlr->dev, "%s: no decoder available\n", + dev_name(&port->dev)); + return -EBUSY; + } + + cxl_rr = alloc_region_ref(port, cxlr, cxled, cxld); + if (IS_ERR(cxl_rr)) { + dev_dbg(&cxlr->dev, + "%s: failed to allocate region reference\n", + dev_name(&port->dev)); + return PTR_ERR(cxl_rr); + } + nr_targets_inc = true; + + rc = cxl_rr_assign_decoder(port, cxlr, cxled, cxl_rr, cxld); + if (rc) + goto out_erase; + } + cxld = cxl_rr->decoder; + + /* + * the number of targets should not exceed the target_count + * of the decoder + */ + if (is_switch_decoder(&cxld->dev)) { + struct cxl_switch_decoder *cxlsd; + + cxlsd = to_cxl_switch_decoder(&cxld->dev); + if (cxl_rr->nr_targets > cxlsd->nr_targets) { + dev_dbg(&cxlr->dev, + "%s:%s %s add: %s:%s @ %d overflows targets: %d\n", + dev_name(port->uport_dev), dev_name(&port->dev), + dev_name(&cxld->dev), dev_name(&cxlmd->dev), + dev_name(&cxled->cxld.dev), pos, + cxlsd->nr_targets); + rc = -ENXIO; + goto out_erase; + } + } + + cxl_region_set_lock(cxlr, cxld); + + rc = cxl_rr_ep_add(cxl_rr, cxled); + if (rc) { + dev_dbg(&cxlr->dev, + "%s: failed to track endpoint %s:%s reference\n", + dev_name(&port->dev), dev_name(&cxlmd->dev), + dev_name(&cxld->dev)); + goto out_erase; + } + + dev_dbg(&cxlr->dev, + "%s:%s %s add: %s:%s @ %d next: %s nr_eps: %d nr_targets: %d\n", + dev_name(port->uport_dev), dev_name(&port->dev), + dev_name(&cxld->dev), dev_name(&cxlmd->dev), + dev_name(&cxled->cxld.dev), pos, + ep ? ep->next ? dev_name(ep->next->uport_dev) : + dev_name(&cxlmd->dev) : + "none", + cxl_rr->nr_eps, cxl_rr->nr_targets); + + return 0; +out_erase: + if (nr_targets_inc) + cxl_rr->nr_targets--; + if (cxl_rr->nr_eps == 0) + free_region_ref(cxl_rr); + return rc; +} + +static void cxl_port_detach_region(struct cxl_port *port, + struct cxl_region *cxlr, + struct cxl_endpoint_decoder *cxled) +{ + struct cxl_region_ref *cxl_rr; + struct cxl_ep *ep = NULL; + + lockdep_assert_held_write(&cxl_rwsem.region); + + cxl_rr = cxl_rr_load(port, cxlr); + if (!cxl_rr) + return; + + /* + * Endpoint ports do not carry cxl_ep references, and they + * never target more than one endpoint by definition + */ + if (cxl_rr->decoder == &cxled->cxld) + cxl_rr->nr_eps--; + else + ep = xa_erase(&cxl_rr->endpoints, (unsigned long)cxled); + if (ep) { + struct cxl_ep *ep_iter; + unsigned long index; + int found = 0; + + cxl_rr->nr_eps--; + xa_for_each(&cxl_rr->endpoints, index, ep_iter) { + if (ep_iter->next == ep->next) { + found++; + break; + } + } + if (!found) + cxl_rr->nr_targets--; + } + + if (cxl_rr->nr_eps == 0) + free_region_ref(cxl_rr); +} + +static int check_last_peer(struct cxl_endpoint_decoder *cxled, + struct cxl_ep *ep, struct cxl_region_ref *cxl_rr, + int distance) +{ + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_region *cxlr = cxl_rr->region; + struct cxl_region_params *p = &cxlr->params; + struct cxl_endpoint_decoder *cxled_peer; + struct cxl_port *port = cxl_rr->port; + struct cxl_memdev *cxlmd_peer; + struct cxl_ep *ep_peer; + int pos = cxled->pos; + + /* + * If this position wants to share a dport with the last endpoint mapped + * then that endpoint, at index 'position - distance', must also be + * mapped by this dport. 
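+	 * For example, with a distance of 2 the endpoint at position 3 must
+	 * map the same dport as the endpoint already recorded at position 1.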
+ */ + if (pos < distance) { + dev_dbg(&cxlr->dev, "%s:%s: cannot host %s:%s at %d\n", + dev_name(port->uport_dev), dev_name(&port->dev), + dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), pos); + return -ENXIO; + } + cxled_peer = p->targets[pos - distance]; + cxlmd_peer = cxled_to_memdev(cxled_peer); + ep_peer = cxl_ep_load(port, cxlmd_peer); + if (ep->dport != ep_peer->dport) { + dev_dbg(&cxlr->dev, + "%s:%s: %s:%s pos %d mismatched peer %s:%s\n", + dev_name(port->uport_dev), dev_name(&port->dev), + dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), pos, + dev_name(&cxlmd_peer->dev), + dev_name(&cxled_peer->cxld.dev)); + return -ENXIO; + } + + return 0; +} + +static int check_interleave_cap(struct cxl_decoder *cxld, int iw, int ig) +{ + struct cxl_port *port = to_cxl_port(cxld->dev.parent); + struct cxl_hdm *cxlhdm = dev_get_drvdata(&port->dev); + unsigned int interleave_mask; + u8 eiw; + u16 eig; + int high_pos, low_pos; + + if (!test_bit(iw, &cxlhdm->iw_cap_mask)) + return -ENXIO; + /* + * Per CXL specification r3.1(8.2.4.20.13 Decoder Protection), + * if eiw < 8: + * DPAOFFSET[51: eig + 8] = HPAOFFSET[51: eig + 8 + eiw] + * DPAOFFSET[eig + 7: 0] = HPAOFFSET[eig + 7: 0] + * + * when the eiw is 0, all the bits of HPAOFFSET[51: 0] are used, the + * interleave bits are none. + * + * if eiw >= 8: + * DPAOFFSET[51: eig + 8] = HPAOFFSET[51: eig + eiw] / 3 + * DPAOFFSET[eig + 7: 0] = HPAOFFSET[eig + 7: 0] + * + * when the eiw is 8, all the bits of HPAOFFSET[51: 0] are used, the + * interleave bits are none. + */ + ways_to_eiw(iw, &eiw); + if (eiw == 0 || eiw == 8) + return 0; + + granularity_to_eig(ig, &eig); + if (eiw > 8) + high_pos = eiw + eig - 1; + else + high_pos = eiw + eig + 7; + low_pos = eig + 8; + interleave_mask = GENMASK(high_pos, low_pos); + if (interleave_mask & ~cxlhdm->interleave_mask) + return -ENXIO; + + return 0; +} + +static int cxl_port_setup_targets(struct cxl_port *port, + struct cxl_region *cxlr, + struct cxl_endpoint_decoder *cxled) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); + int parent_iw, parent_ig, ig, iw, rc, pos = cxled->pos; + struct cxl_port *parent_port = to_cxl_port(port->dev.parent); + struct cxl_region_ref *cxl_rr = cxl_rr_load(port, cxlr); + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_ep *ep = cxl_ep_load(port, cxlmd); + struct cxl_region_params *p = &cxlr->params; + struct cxl_decoder *cxld = cxl_rr->decoder; + struct cxl_switch_decoder *cxlsd; + struct cxl_port *iter = port; + u16 eig, peig; + u8 eiw, peiw; + + /* + * While root level decoders support x3, x6, x12, switch level + * decoders only support powers of 2 up to x16. + */ + if (!is_power_of_2(cxl_rr->nr_targets)) { + dev_dbg(&cxlr->dev, "%s:%s: invalid target count %d\n", + dev_name(port->uport_dev), dev_name(&port->dev), + cxl_rr->nr_targets); + return -EINVAL; + } + + cxlsd = to_cxl_switch_decoder(&cxld->dev); + if (cxl_rr->nr_targets_set) { + int i, distance = 1; + struct cxl_region_ref *cxl_rr_iter; + + /* + * The "distance" between peer downstream ports represents which + * endpoint positions in the region interleave a given port can + * host. + * + * For example, at the root of a hierarchy the distance is + * always 1 as every index targets a different host-bridge. At + * each subsequent switch level those ports map every Nth region + * position where N is the width of the switch == distance. 
+ */ + do { + cxl_rr_iter = cxl_rr_load(iter, cxlr); + distance *= cxl_rr_iter->nr_targets; + iter = to_cxl_port(iter->dev.parent); + } while (!is_cxl_root(iter)); + distance *= cxlrd->cxlsd.cxld.interleave_ways; + + for (i = 0; i < cxl_rr->nr_targets_set; i++) + if (ep->dport == cxlsd->target[i]) { + rc = check_last_peer(cxled, ep, cxl_rr, + distance); + if (rc) + return rc; + goto out_target_set; + } + goto add_target; + } + + if (is_cxl_root(parent_port)) { + /* + * Root decoder IG is always set to value in CFMWS which + * may be different than this region's IG. We can use the + * region's IG here since interleave_granularity_store() + * does not allow interleaved host-bridges with + * root IG != region IG. + */ + parent_ig = p->interleave_granularity; + parent_iw = cxlrd->cxlsd.cxld.interleave_ways; + /* + * For purposes of address bit routing, use power-of-2 math for + * switch ports. + */ + if (!is_power_of_2(parent_iw)) + parent_iw /= 3; + } else { + struct cxl_region_ref *parent_rr; + struct cxl_decoder *parent_cxld; + + parent_rr = cxl_rr_load(parent_port, cxlr); + parent_cxld = parent_rr->decoder; + parent_ig = parent_cxld->interleave_granularity; + parent_iw = parent_cxld->interleave_ways; + } + + rc = granularity_to_eig(parent_ig, &peig); + if (rc) { + dev_dbg(&cxlr->dev, "%s:%s: invalid parent granularity: %d\n", + dev_name(parent_port->uport_dev), + dev_name(&parent_port->dev), parent_ig); + return rc; + } + + rc = ways_to_eiw(parent_iw, &peiw); + if (rc) { + dev_dbg(&cxlr->dev, "%s:%s: invalid parent interleave: %d\n", + dev_name(parent_port->uport_dev), + dev_name(&parent_port->dev), parent_iw); + return rc; + } + + iw = cxl_rr->nr_targets; + rc = ways_to_eiw(iw, &eiw); + if (rc) { + dev_dbg(&cxlr->dev, "%s:%s: invalid port interleave: %d\n", + dev_name(port->uport_dev), dev_name(&port->dev), iw); + return rc; + } + + /* + * Interleave granularity is a multiple of @parent_port granularity. + * Multiplier is the parent port interleave ways. 
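+	 * For example, a parent port interleaving 2 ways at 256 byte
+	 * granularity yields a 512 byte granularity for this port's decoder.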
+ */ + rc = granularity_to_eig(parent_ig * parent_iw, &eig); + if (rc) { + dev_dbg(&cxlr->dev, + "%s: invalid granularity calculation (%d * %d)\n", + dev_name(&parent_port->dev), parent_ig, parent_iw); + return rc; + } + + rc = eig_to_granularity(eig, &ig); + if (rc) { + dev_dbg(&cxlr->dev, "%s:%s: invalid interleave: %d\n", + dev_name(port->uport_dev), dev_name(&port->dev), + 256 << eig); + return rc; + } + + if (iw > 8 || iw > cxlsd->nr_targets) { + dev_dbg(&cxlr->dev, + "%s:%s:%s: ways: %d overflows targets: %d\n", + dev_name(port->uport_dev), dev_name(&port->dev), + dev_name(&cxld->dev), iw, cxlsd->nr_targets); + return -ENXIO; + } + + if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) { + if (cxld->interleave_ways != iw || + (iw > 1 && cxld->interleave_granularity != ig) || + !spa_maps_hpa(p, &cxld->hpa_range) || + ((cxld->flags & CXL_DECODER_F_ENABLE) == 0)) { + dev_err(&cxlr->dev, + "%s:%s %s expected iw: %d ig: %d %pr\n", + dev_name(port->uport_dev), dev_name(&port->dev), + __func__, iw, ig, p->res); + dev_err(&cxlr->dev, + "%s:%s %s got iw: %d ig: %d state: %s %#llx:%#llx\n", + dev_name(port->uport_dev), dev_name(&port->dev), + __func__, cxld->interleave_ways, + cxld->interleave_granularity, + str_enabled_disabled(cxld->flags & CXL_DECODER_F_ENABLE), + cxld->hpa_range.start, cxld->hpa_range.end); + return -ENXIO; + } + } else { + rc = check_interleave_cap(cxld, iw, ig); + if (rc) { + dev_dbg(&cxlr->dev, + "%s:%s iw: %d ig: %d is not supported\n", + dev_name(port->uport_dev), + dev_name(&port->dev), iw, ig); + return rc; + } + + cxld->interleave_ways = iw; + cxld->interleave_granularity = ig; + cxld->hpa_range = (struct range) { + .start = p->res->start, + .end = p->res->end, + }; + } + dev_dbg(&cxlr->dev, "%s:%s iw: %d ig: %d\n", dev_name(port->uport_dev), + dev_name(&port->dev), iw, ig); +add_target: + if (cxl_rr->nr_targets_set == cxl_rr->nr_targets) { + dev_dbg(&cxlr->dev, + "%s:%s: targets full trying to add %s:%s at %d\n", + dev_name(port->uport_dev), dev_name(&port->dev), + dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), pos); + return -ENXIO; + } + if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) { + if (cxlsd->target[cxl_rr->nr_targets_set] != ep->dport) { + dev_dbg(&cxlr->dev, "%s:%s: %s expected %s at %d\n", + dev_name(port->uport_dev), dev_name(&port->dev), + dev_name(&cxlsd->cxld.dev), + dev_name(ep->dport->dport_dev), + cxl_rr->nr_targets_set); + return -ENXIO; + } + } else { + cxlsd->target[cxl_rr->nr_targets_set] = ep->dport; + cxlsd->cxld.target_map[cxl_rr->nr_targets_set] = ep->dport->port_id; + } + cxl_rr->nr_targets_set++; +out_target_set: + dev_dbg(&cxlr->dev, "%s:%s target[%d] = %s for %s:%s @ %d\n", + dev_name(port->uport_dev), dev_name(&port->dev), + cxl_rr->nr_targets_set - 1, dev_name(ep->dport->dport_dev), + dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), pos); + + return 0; +} + +static void cxl_port_reset_targets(struct cxl_port *port, + struct cxl_region *cxlr) +{ + struct cxl_region_ref *cxl_rr = cxl_rr_load(port, cxlr); + struct cxl_decoder *cxld; + + /* + * After the last endpoint has been detached the entire cxl_rr may now + * be gone. 
+ */ + if (!cxl_rr) + return; + cxl_rr->nr_targets_set = 0; + + cxld = cxl_rr->decoder; + cxld->hpa_range = (struct range) { + .start = 0, + .end = -1, + }; +} + +static void cxl_region_teardown_targets(struct cxl_region *cxlr) +{ + struct cxl_region_params *p = &cxlr->params; + struct cxl_endpoint_decoder *cxled; + struct cxl_dev_state *cxlds; + struct cxl_memdev *cxlmd; + struct cxl_port *iter; + struct cxl_ep *ep; + int i; + + /* + * In the auto-discovery case skip automatic teardown since the + * address space is already active + */ + if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) + return; + + for (i = 0; i < p->nr_targets; i++) { + cxled = p->targets[i]; + cxlmd = cxled_to_memdev(cxled); + cxlds = cxlmd->cxlds; + + if (cxlds->rcd) + continue; + + iter = cxled_to_port(cxled); + while (!is_cxl_root(to_cxl_port(iter->dev.parent))) + iter = to_cxl_port(iter->dev.parent); + + for (ep = cxl_ep_load(iter, cxlmd); iter; + iter = ep->next, ep = cxl_ep_load(iter, cxlmd)) + cxl_port_reset_targets(iter, cxlr); + } +} + +static int cxl_region_setup_targets(struct cxl_region *cxlr) +{ + struct cxl_region_params *p = &cxlr->params; + struct cxl_endpoint_decoder *cxled; + struct cxl_dev_state *cxlds; + int i, rc, rch = 0, vh = 0; + struct cxl_memdev *cxlmd; + struct cxl_port *iter; + struct cxl_ep *ep; + + for (i = 0; i < p->nr_targets; i++) { + cxled = p->targets[i]; + cxlmd = cxled_to_memdev(cxled); + cxlds = cxlmd->cxlds; + + /* validate that all targets agree on topology */ + if (!cxlds->rcd) { + vh++; + } else { + rch++; + continue; + } + + iter = cxled_to_port(cxled); + while (!is_cxl_root(to_cxl_port(iter->dev.parent))) + iter = to_cxl_port(iter->dev.parent); + + /* + * Descend the topology tree programming / validating + * targets while looking for conflicts. 
+ */ + for (ep = cxl_ep_load(iter, cxlmd); iter; + iter = ep->next, ep = cxl_ep_load(iter, cxlmd)) { + rc = cxl_port_setup_targets(iter, cxlr, cxled); + if (rc) { + cxl_region_teardown_targets(cxlr); + return rc; + } + } + } + + if (rch && vh) { + dev_err(&cxlr->dev, "mismatched CXL topologies detected\n"); + cxl_region_teardown_targets(cxlr); + return -ENXIO; + } + + return 0; +} + +static int cxl_region_validate_position(struct cxl_region *cxlr, + struct cxl_endpoint_decoder *cxled, + int pos) +{ + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_region_params *p = &cxlr->params; + int i; + + if (pos < 0 || pos >= p->interleave_ways) { + dev_dbg(&cxlr->dev, "position %d out of range %d\n", pos, + p->interleave_ways); + return -ENXIO; + } + + if (p->targets[pos] == cxled) + return 0; + + if (p->targets[pos]) { + struct cxl_endpoint_decoder *cxled_target = p->targets[pos]; + struct cxl_memdev *cxlmd_target = cxled_to_memdev(cxled_target); + + dev_dbg(&cxlr->dev, "position %d already assigned to %s:%s\n", + pos, dev_name(&cxlmd_target->dev), + dev_name(&cxled_target->cxld.dev)); + return -EBUSY; + } + + for (i = 0; i < p->interleave_ways; i++) { + struct cxl_endpoint_decoder *cxled_target; + struct cxl_memdev *cxlmd_target; + + cxled_target = p->targets[i]; + if (!cxled_target) + continue; + + cxlmd_target = cxled_to_memdev(cxled_target); + if (cxlmd_target == cxlmd) { + dev_dbg(&cxlr->dev, + "%s already specified at position %d via: %s\n", + dev_name(&cxlmd->dev), pos, + dev_name(&cxled_target->cxld.dev)); + return -EBUSY; + } + } + + return 0; +} + +static int cxl_region_attach_position(struct cxl_region *cxlr, + struct cxl_root_decoder *cxlrd, + struct cxl_endpoint_decoder *cxled, + const struct cxl_dport *dport, int pos) +{ + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_switch_decoder *cxlsd = &cxlrd->cxlsd; + struct cxl_decoder *cxld = &cxlsd->cxld; + int iw = cxld->interleave_ways; + struct cxl_port *iter; + int rc; + + if (dport != cxlrd->cxlsd.target[pos % iw]) { + dev_dbg(&cxlr->dev, "%s:%s invalid target position for %s\n", + dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), + dev_name(&cxlrd->cxlsd.cxld.dev)); + return -ENXIO; + } + + for (iter = cxled_to_port(cxled); !is_cxl_root(iter); + iter = to_cxl_port(iter->dev.parent)) { + rc = cxl_port_attach_region(iter, cxlr, cxled, pos); + if (rc) + goto err; + } + + return 0; + +err: + for (iter = cxled_to_port(cxled); !is_cxl_root(iter); + iter = to_cxl_port(iter->dev.parent)) + cxl_port_detach_region(iter, cxlr, cxled); + return rc; +} + +static int cxl_region_attach_auto(struct cxl_region *cxlr, + struct cxl_endpoint_decoder *cxled, int pos) +{ + struct cxl_region_params *p = &cxlr->params; + + if (cxled->state != CXL_DECODER_STATE_AUTO) { + dev_err(&cxlr->dev, + "%s: unable to add decoder to autodetected region\n", + dev_name(&cxled->cxld.dev)); + return -EINVAL; + } + + if (pos >= 0) { + dev_dbg(&cxlr->dev, "%s: expected auto position, not %d\n", + dev_name(&cxled->cxld.dev), pos); + return -EINVAL; + } + + if (p->nr_targets >= p->interleave_ways) { + dev_err(&cxlr->dev, "%s: no more target slots available\n", + dev_name(&cxled->cxld.dev)); + return -ENXIO; + } + + /* + * Temporarily record the endpoint decoder into the target array. Yes, + * this means that userspace can view devices in the wrong position + * before the region activates, and must be careful to understand when + * it might be racing region autodiscovery. 
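+	 * (cxl_region_sort_targets() recomputes and re-orders ->pos once
+	 * all targets have arrived.)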
+ */ + pos = p->nr_targets; + p->targets[pos] = cxled; + cxled->pos = pos; + p->nr_targets++; + + return 0; +} + +static int cmp_interleave_pos(const void *a, const void *b) +{ + struct cxl_endpoint_decoder *cxled_a = *(typeof(cxled_a) *)a; + struct cxl_endpoint_decoder *cxled_b = *(typeof(cxled_b) *)b; + + return cxled_a->pos - cxled_b->pos; +} + +static int match_switch_decoder_by_range(struct device *dev, + const void *data) +{ + struct cxl_switch_decoder *cxlsd; + const struct range *r1, *r2 = data; + + + if (!is_switch_decoder(dev)) + return 0; + + cxlsd = to_cxl_switch_decoder(dev); + r1 = &cxlsd->cxld.hpa_range; + + if (is_root_decoder(dev)) + return range_contains(r1, r2); + return (r1->start == r2->start && r1->end == r2->end); +} + +static int find_pos_and_ways(struct cxl_port *port, struct range *range, + int *pos, int *ways) +{ + struct cxl_switch_decoder *cxlsd; + struct cxl_port *parent; + struct device *dev; + int rc = -ENXIO; + + parent = parent_port_of(port); + if (!parent) + return rc; + + dev = device_find_child(&parent->dev, range, + match_switch_decoder_by_range); + if (!dev) { + dev_err(port->uport_dev, + "failed to find decoder mapping %#llx-%#llx\n", + range->start, range->end); + return rc; + } + cxlsd = to_cxl_switch_decoder(dev); + *ways = cxlsd->cxld.interleave_ways; + + for (int i = 0; i < *ways; i++) { + if (cxlsd->target[i] == port->parent_dport) { + *pos = i; + rc = 0; + break; + } + } + put_device(dev); + + if (rc) + dev_err(port->uport_dev, + "failed to find %s:%s in target list of %s\n", + dev_name(&port->dev), + dev_name(port->parent_dport->dport_dev), + dev_name(&cxlsd->cxld.dev)); + + return rc; +} + +/** + * cxl_calc_interleave_pos() - calculate an endpoint position in a region + * @cxled: endpoint decoder member of given region + * + * The endpoint position is calculated by traversing the topology from + * the endpoint to the root decoder and iteratively applying this + * calculation: + * + * position = position * parent_ways + parent_pos; + * + * ...where @position is inferred from switch and root decoder target lists. + * + * Return: position >= 0 on success + * -ENXIO on failure + */ +static int cxl_calc_interleave_pos(struct cxl_endpoint_decoder *cxled) +{ + struct cxl_port *iter, *port = cxled_to_port(cxled); + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct range *range = &cxled->cxld.hpa_range; + int parent_ways = 0, parent_pos = 0, pos = 0; + int rc; + + /* + * Example: the expected interleave order of the 4-way region shown + * below is: mem0, mem2, mem1, mem3 + * + * root_port + * / \ + * host_bridge_0 host_bridge_1 + * | | | | + * mem0 mem1 mem2 mem3 + * + * In the example the calculator will iterate twice. The first iteration + * uses the mem position in the host-bridge and the ways of the host- + * bridge to generate the first, or local, position. The second + * iteration uses the host-bridge position in the root_port and the ways + * of the root_port to refine the position. + * + * A trace of the calculation per endpoint looks like this: + * mem0: pos = 0 * 2 + 0 mem2: pos = 0 * 2 + 0 + * pos = 0 * 2 + 0 pos = 0 * 2 + 1 + * pos: 0 pos: 1 + * + * mem1: pos = 0 * 2 + 1 mem3: pos = 0 * 2 + 1 + * pos = 1 * 2 + 0 pos = 1 * 2 + 1 + * pos: 2 pos = 3 + * + * Note that while this example is simple, the method applies to more + * complex topologies, including those with switches. 
+ */ + + /* Iterate from endpoint to root_port refining the position */ + for (iter = port; iter; iter = parent_port_of(iter)) { + if (is_cxl_root(iter)) + break; + + rc = find_pos_and_ways(iter, range, &parent_pos, &parent_ways); + if (rc) + return rc; + + pos = pos * parent_ways + parent_pos; + } + + dev_dbg(&cxlmd->dev, + "decoder:%s parent:%s port:%s range:%#llx-%#llx pos:%d\n", + dev_name(&cxled->cxld.dev), dev_name(cxlmd->dev.parent), + dev_name(&port->dev), range->start, range->end, pos); + + return pos; +} + +static int cxl_region_sort_targets(struct cxl_region *cxlr) +{ + struct cxl_region_params *p = &cxlr->params; + int i, rc = 0; + + for (i = 0; i < p->nr_targets; i++) { + struct cxl_endpoint_decoder *cxled = p->targets[i]; + + cxled->pos = cxl_calc_interleave_pos(cxled); + /* + * Record that sorting failed, but still continue to calc + * cxled->pos so that follow-on code paths can reliably + * do p->targets[cxled->pos] to self-reference their entry. + */ + if (cxled->pos < 0) + rc = -ENXIO; + } + /* Keep the cxlr target list in interleave position order */ + sort(p->targets, p->nr_targets, sizeof(p->targets[0]), + cmp_interleave_pos, NULL); + + dev_dbg(&cxlr->dev, "region sort %s\n", rc ? "failed" : "successful"); + return rc; +} + +static int cxl_region_attach(struct cxl_region *cxlr, + struct cxl_endpoint_decoder *cxled, int pos) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct cxl_region_params *p = &cxlr->params; + struct cxl_port *ep_port, *root_port; + struct cxl_dport *dport; + int rc = -ENXIO; + + rc = check_interleave_cap(&cxled->cxld, p->interleave_ways, + p->interleave_granularity); + if (rc) { + dev_dbg(&cxlr->dev, "%s iw: %d ig: %d is not supported\n", + dev_name(&cxled->cxld.dev), p->interleave_ways, + p->interleave_granularity); + return rc; + } + + if (cxled->part < 0) { + dev_dbg(&cxlr->dev, "%s dead\n", dev_name(&cxled->cxld.dev)); + return -ENODEV; + } + + if (cxlds->part[cxled->part].mode != cxlr->mode) { + dev_dbg(&cxlr->dev, "%s region mode: %d mismatch\n", + dev_name(&cxled->cxld.dev), cxlr->mode); + return -EINVAL; + } + + /* all full of members, or interleave config not established? 
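+	 * (a new target is only accepted while p->state is exactly
+	 * CXL_CONFIG_INTERLEAVE_ACTIVE)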
*/ + if (p->state > CXL_CONFIG_INTERLEAVE_ACTIVE) { + dev_dbg(&cxlr->dev, "region already active\n"); + return -EBUSY; + } + + if (p->state < CXL_CONFIG_INTERLEAVE_ACTIVE) { + dev_dbg(&cxlr->dev, "interleave config missing\n"); + return -ENXIO; + } + + if (p->nr_targets >= p->interleave_ways) { + dev_dbg(&cxlr->dev, "region already has %d endpoints\n", + p->nr_targets); + return -EINVAL; + } + + ep_port = cxled_to_port(cxled); + root_port = cxlrd_to_port(cxlrd); + dport = cxl_find_dport_by_dev(root_port, ep_port->host_bridge); + if (!dport) { + dev_dbg(&cxlr->dev, "%s:%s invalid target for %s\n", + dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), + dev_name(cxlr->dev.parent)); + return -ENXIO; + } + + if (cxled->cxld.target_type != cxlr->type) { + dev_dbg(&cxlr->dev, "%s:%s type mismatch: %d vs %d\n", + dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), + cxled->cxld.target_type, cxlr->type); + return -ENXIO; + } + + if (!cxled->dpa_res) { + dev_dbg(&cxlr->dev, "%s:%s: missing DPA allocation.\n", + dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev)); + return -ENXIO; + } + + if (resource_size(cxled->dpa_res) * p->interleave_ways + p->cache_size != + resource_size(p->res)) { + dev_dbg(&cxlr->dev, + "%s:%s-size-%#llx * ways-%d + cache-%#llx != region-size-%#llx\n", + dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), + (u64)resource_size(cxled->dpa_res), p->interleave_ways, + (u64)p->cache_size, (u64)resource_size(p->res)); + return -EINVAL; + } + + cxl_region_perf_data_calculate(cxlr, cxled); + + if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) { + int i; + + rc = cxl_region_attach_auto(cxlr, cxled, pos); + if (rc) + return rc; + + /* await more targets to arrive... */ + if (p->nr_targets < p->interleave_ways) + return 0; + + /* + * All targets are here, which implies all PCI enumeration that + * affects this region has been completed. Walk the topology to + * sort the devices into their relative region decode position. + */ + rc = cxl_region_sort_targets(cxlr); + if (rc) + return rc; + + for (i = 0; i < p->nr_targets; i++) { + cxled = p->targets[i]; + ep_port = cxled_to_port(cxled); + dport = cxl_find_dport_by_dev(root_port, + ep_port->host_bridge); + rc = cxl_region_attach_position(cxlr, cxlrd, cxled, + dport, i); + if (rc) + return rc; + } + + rc = cxl_region_setup_targets(cxlr); + if (rc) + return rc; + + /* + * If target setup succeeds in the autodiscovery case + * then the region is already committed. + */ + p->state = CXL_CONFIG_COMMIT; + cxl_region_shared_upstream_bandwidth_update(cxlr); + + return 0; + } + + rc = cxl_region_validate_position(cxlr, cxled, pos); + if (rc) + return rc; + + rc = cxl_region_attach_position(cxlr, cxlrd, cxled, dport, pos); + if (rc) + return rc; + + p->targets[pos] = cxled; + cxled->pos = pos; + p->nr_targets++; + + if (p->nr_targets == p->interleave_ways) { + rc = cxl_region_setup_targets(cxlr); + if (rc) + return rc; + p->state = CXL_CONFIG_ACTIVE; + cxl_region_shared_upstream_bandwidth_update(cxlr); + } + + cxled->cxld.interleave_ways = p->interleave_ways; + cxled->cxld.interleave_granularity = p->interleave_granularity; + cxled->cxld.hpa_range = (struct range) { + .start = p->res->start, + .end = p->res->end, + }; + + if (p->nr_targets != p->interleave_ways) + return 0; + + /* + * Test the auto-discovery position calculator function + * against this successfully created user-defined region. + * A fail message here means that this interleave config + * will fail when presented as CXL_REGION_F_AUTO. 
+ */ + for (int i = 0; i < p->nr_targets; i++) { + struct cxl_endpoint_decoder *cxled = p->targets[i]; + int test_pos; + + test_pos = cxl_calc_interleave_pos(cxled); + dev_dbg(&cxled->cxld.dev, + "Test cxl_calc_interleave_pos(): %s test_pos:%d cxled->pos:%d\n", + (test_pos == cxled->pos) ? "success" : "fail", + test_pos, cxled->pos); + } + + return 0; +} + +static struct cxl_region * +__cxl_decoder_detach(struct cxl_region *cxlr, + struct cxl_endpoint_decoder *cxled, int pos, + enum cxl_detach_mode mode) +{ + struct cxl_region_params *p; + + lockdep_assert_held_write(&cxl_rwsem.region); + + if (!cxled) { + p = &cxlr->params; + + if (pos >= p->interleave_ways) { + dev_dbg(&cxlr->dev, "position %d out of range %d\n", + pos, p->interleave_ways); + return NULL; + } + + if (!p->targets[pos]) + return NULL; + cxled = p->targets[pos]; + } else { + cxlr = cxled->cxld.region; + if (!cxlr) + return NULL; + p = &cxlr->params; + } + + if (mode == DETACH_INVALIDATE) + cxled->part = -1; + + if (p->state > CXL_CONFIG_ACTIVE) { + cxl_region_decode_reset(cxlr, p->interleave_ways); + p->state = CXL_CONFIG_ACTIVE; + } + + for (struct cxl_port *iter = cxled_to_port(cxled); !is_cxl_root(iter); + iter = to_cxl_port(iter->dev.parent)) + cxl_port_detach_region(iter, cxlr, cxled); + + if (cxled->pos < 0 || cxled->pos >= p->interleave_ways || + p->targets[cxled->pos] != cxled) { + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + + dev_WARN_ONCE(&cxlr->dev, 1, "expected %s:%s at position %d\n", + dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), + cxled->pos); + return NULL; + } + + if (p->state == CXL_CONFIG_ACTIVE) { + p->state = CXL_CONFIG_INTERLEAVE_ACTIVE; + cxl_region_teardown_targets(cxlr); + } + p->targets[cxled->pos] = NULL; + p->nr_targets--; + cxled->cxld.hpa_range = (struct range) { + .start = 0, + .end = -1, + }; + + get_device(&cxlr->dev); + return cxlr; +} + +/* + * Cleanup a decoder's interest in a region. There are 2 cases to + * handle, removing an unknown @cxled from a known position in a region + * (detach_target()) or removing a known @cxled from an unknown @cxlr + * (cxld_unregister()) + * + * When the detachment finds a region release the region driver. 
+ */ +int cxl_decoder_detach(struct cxl_region *cxlr, + struct cxl_endpoint_decoder *cxled, int pos, + enum cxl_detach_mode mode) +{ + struct cxl_region *detach; + + /* when the decoder is being destroyed lock unconditionally */ + if (mode == DETACH_INVALIDATE) { + guard(rwsem_write)(&cxl_rwsem.region); + detach = __cxl_decoder_detach(cxlr, cxled, pos, mode); + } else { + int rc; + + ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region); + if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem))) + return rc; + detach = __cxl_decoder_detach(cxlr, cxled, pos, mode); + } + + if (detach) { + device_release_driver(&detach->dev); + put_device(&detach->dev); + } + return 0; +} + +static int __attach_target(struct cxl_region *cxlr, + struct cxl_endpoint_decoder *cxled, int pos, + unsigned int state) +{ + int rc; + + if (state == TASK_INTERRUPTIBLE) { + ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region); + if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem))) + return rc; + guard(rwsem_read)(&cxl_rwsem.dpa); + return cxl_region_attach(cxlr, cxled, pos); + } + guard(rwsem_write)(&cxl_rwsem.region); + guard(rwsem_read)(&cxl_rwsem.dpa); + return cxl_region_attach(cxlr, cxled, pos); +} + +static int attach_target(struct cxl_region *cxlr, + struct cxl_endpoint_decoder *cxled, int pos, + unsigned int state) +{ + int rc = __attach_target(cxlr, cxled, pos, state); + + if (rc == 0) + return 0; + + dev_warn(cxled->cxld.dev.parent, "failed to attach %s to %s: %d\n", + dev_name(&cxled->cxld.dev), dev_name(&cxlr->dev), rc); + return rc; +} + +static int detach_target(struct cxl_region *cxlr, int pos) +{ + return cxl_decoder_detach(cxlr, NULL, pos, DETACH_ONLY); +} + +static size_t store_targetN(struct cxl_region *cxlr, const char *buf, int pos, + size_t len) +{ + int rc; + + if (sysfs_streq(buf, "\n")) + rc = detach_target(cxlr, pos); + else { + struct device *dev; + + dev = bus_find_device_by_name(&cxl_bus_type, NULL, buf); + if (!dev) + return -ENODEV; + + if (!is_endpoint_decoder(dev)) { + rc = -EINVAL; + goto out; + } + + rc = attach_target(cxlr, to_cxl_endpoint_decoder(dev), pos, + TASK_INTERRUPTIBLE); +out: + put_device(dev); + } + + if (rc < 0) + return rc; + return len; +} + +#define TARGET_ATTR_RW(n) \ +static ssize_t target##n##_show( \ + struct device *dev, struct device_attribute *attr, char *buf) \ +{ \ + return show_targetN(to_cxl_region(dev), buf, (n)); \ +} \ +static ssize_t target##n##_store(struct device *dev, \ + struct device_attribute *attr, \ + const char *buf, size_t len) \ +{ \ + return store_targetN(to_cxl_region(dev), buf, (n), len); \ +} \ +static DEVICE_ATTR_RW(target##n) + +TARGET_ATTR_RW(0); +TARGET_ATTR_RW(1); +TARGET_ATTR_RW(2); +TARGET_ATTR_RW(3); +TARGET_ATTR_RW(4); +TARGET_ATTR_RW(5); +TARGET_ATTR_RW(6); +TARGET_ATTR_RW(7); +TARGET_ATTR_RW(8); +TARGET_ATTR_RW(9); +TARGET_ATTR_RW(10); +TARGET_ATTR_RW(11); +TARGET_ATTR_RW(12); +TARGET_ATTR_RW(13); +TARGET_ATTR_RW(14); +TARGET_ATTR_RW(15); + +static struct attribute *target_attrs[] = { + &dev_attr_target0.attr, + &dev_attr_target1.attr, + &dev_attr_target2.attr, + &dev_attr_target3.attr, + &dev_attr_target4.attr, + &dev_attr_target5.attr, + &dev_attr_target6.attr, + &dev_attr_target7.attr, + &dev_attr_target8.attr, + &dev_attr_target9.attr, + &dev_attr_target10.attr, + &dev_attr_target11.attr, + &dev_attr_target12.attr, + &dev_attr_target13.attr, + &dev_attr_target14.attr, + &dev_attr_target15.attr, + NULL, +}; + +static umode_t cxl_region_target_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct device *dev = 
kobj_to_dev(kobj); + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + + if (n < p->interleave_ways) + return a->mode; + return 0; +} + +static const struct attribute_group cxl_region_target_group = { + .attrs = target_attrs, + .is_visible = cxl_region_target_visible, +}; + +static const struct attribute_group *get_cxl_region_target_group(void) +{ + return &cxl_region_target_group; +} + +static const struct attribute_group *region_groups[] = { + &cxl_base_attribute_group, + &cxl_region_group, + &cxl_region_target_group, + &cxl_region_access0_coordinate_group, + &cxl_region_access1_coordinate_group, + NULL, +}; + +static void cxl_region_release(struct device *dev) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev->parent); + struct cxl_region *cxlr = to_cxl_region(dev); + int id = atomic_read(&cxlrd->region_id); + + /* + * Try to reuse the recently idled id rather than the cached + * next id to prevent the region id space from increasing + * unnecessarily. + */ + if (cxlr->id < id) + if (atomic_try_cmpxchg(&cxlrd->region_id, &id, cxlr->id)) { + memregion_free(id); + goto out; + } + + memregion_free(cxlr->id); +out: + put_device(dev->parent); + kfree(cxlr); +} + +const struct device_type cxl_region_type = { + .name = "cxl_region", + .release = cxl_region_release, + .groups = region_groups +}; + +bool is_cxl_region(struct device *dev) +{ + return dev->type == &cxl_region_type; +} +EXPORT_SYMBOL_NS_GPL(is_cxl_region, "CXL"); + +static struct cxl_region *to_cxl_region(struct device *dev) +{ + if (dev_WARN_ONCE(dev, dev->type != &cxl_region_type, + "not a cxl_region device\n")) + return NULL; + + return container_of(dev, struct cxl_region, dev); +} + +static void unregister_region(void *_cxlr) +{ + struct cxl_region *cxlr = _cxlr; + struct cxl_region_params *p = &cxlr->params; + int i; + + device_del(&cxlr->dev); + + /* + * Now that region sysfs is shutdown, the parameter block is now + * read-only, so no need to hold the region rwsem to access the + * region parameters. 
+ */ + for (i = 0; i < p->interleave_ways; i++) + detach_target(cxlr, i); + + cxl_region_iomem_release(cxlr); + put_device(&cxlr->dev); +} + +static struct lock_class_key cxl_region_key; + +static struct cxl_region *cxl_region_alloc(struct cxl_root_decoder *cxlrd, int id) +{ + struct cxl_region *cxlr; + struct device *dev; + + cxlr = kzalloc(sizeof(*cxlr), GFP_KERNEL); + if (!cxlr) { + memregion_free(id); + return ERR_PTR(-ENOMEM); + } + + dev = &cxlr->dev; + device_initialize(dev); + lockdep_set_class(&dev->mutex, &cxl_region_key); + dev->parent = &cxlrd->cxlsd.cxld.dev; + /* + * Keep root decoder pinned through cxl_region_release to fixup + * region id allocations + */ + get_device(dev->parent); + device_set_pm_not_required(dev); + dev->bus = &cxl_bus_type; + dev->type = &cxl_region_type; + cxlr->id = id; + cxl_region_set_lock(cxlr, &cxlrd->cxlsd.cxld); + + return cxlr; +} + +static bool cxl_region_update_coordinates(struct cxl_region *cxlr, int nid) +{ + int cset = 0; + int rc; + + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) { + if (cxlr->coord[i].read_bandwidth) { + node_update_perf_attrs(nid, &cxlr->coord[i], i); + cset++; + } + } + + if (!cset) + return false; + + rc = sysfs_update_group(&cxlr->dev.kobj, get_cxl_region_access0_group()); + if (rc) + dev_dbg(&cxlr->dev, "Failed to update access0 group\n"); + + rc = sysfs_update_group(&cxlr->dev.kobj, get_cxl_region_access1_group()); + if (rc) + dev_dbg(&cxlr->dev, "Failed to update access1 group\n"); + + return true; +} + +static int cxl_region_perf_attrs_callback(struct notifier_block *nb, + unsigned long action, void *arg) +{ + struct cxl_region *cxlr = container_of(nb, struct cxl_region, + node_notifier); + struct node_notify *nn = arg; + int nid = nn->nid; + int region_nid; + + if (action != NODE_ADDED_FIRST_MEMORY) + return NOTIFY_DONE; + + /* + * No need to hold cxl_rwsem.region; region parameters are stable + * within the cxl_region driver. + */ + region_nid = phys_to_target_node(cxlr->params.res->start); + if (nid != region_nid) + return NOTIFY_DONE; + + /* No action needed if node bit already set */ + if (node_test_and_set(nid, nodemask_region_seen)) + return NOTIFY_DONE; + + if (!cxl_region_update_coordinates(cxlr, nid)) + return NOTIFY_DONE; + + return NOTIFY_OK; +} + +static int cxl_region_calculate_adistance(struct notifier_block *nb, + unsigned long nid, void *data) +{ + struct cxl_region *cxlr = container_of(nb, struct cxl_region, + adist_notifier); + struct access_coordinate *perf; + int *adist = data; + int region_nid; + + /* + * No need to hold cxl_rwsem.region; region parameters are stable + * within the cxl_region driver. + */ + region_nid = phys_to_target_node(cxlr->params.res->start); + if (nid != region_nid) + return NOTIFY_OK; + + perf = &cxlr->coord[ACCESS_COORDINATE_CPU]; + + if (mt_perf_to_adistance(perf, adist)) + return NOTIFY_OK; + + return NOTIFY_STOP; +} + +/** + * devm_cxl_add_region - Adds a region to a decoder + * @cxlrd: root decoder + * @id: memregion id to create, or memregion_free() on failure + * @mode: mode for the endpoint decoders of this region + * @type: select whether this is an expander or accelerator (type-2 or type-3) + * + * This is the second step of region initialization. Regions exist within an + * address space which is mapped by a @cxlrd. + * + * Return: 0 if the region was added to the @cxlrd, else returns negative error + * code. The region will be named "regionZ" where Z is the unique region number. 
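+ * (Note that the value actually returned is the new cxl_region pointer on
+ * success, or an ERR_PTR() encoded error on failure.)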
+ */ +static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd, + int id, + enum cxl_partition_mode mode, + enum cxl_decoder_type type) +{ + struct cxl_port *port = to_cxl_port(cxlrd->cxlsd.cxld.dev.parent); + struct cxl_region *cxlr; + struct device *dev; + int rc; + + cxlr = cxl_region_alloc(cxlrd, id); + if (IS_ERR(cxlr)) + return cxlr; + cxlr->mode = mode; + cxlr->type = type; + + dev = &cxlr->dev; + rc = dev_set_name(dev, "region%d", id); + if (rc) + goto err; + + rc = device_add(dev); + if (rc) + goto err; + + rc = devm_add_action_or_reset(port->uport_dev, unregister_region, cxlr); + if (rc) + return ERR_PTR(rc); + + dev_dbg(port->uport_dev, "%s: created %s\n", + dev_name(&cxlrd->cxlsd.cxld.dev), dev_name(dev)); + return cxlr; + +err: + put_device(dev); + return ERR_PTR(rc); +} + +static ssize_t __create_region_show(struct cxl_root_decoder *cxlrd, char *buf) +{ + return sysfs_emit(buf, "region%u\n", atomic_read(&cxlrd->region_id)); +} + +static ssize_t create_pmem_region_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return __create_region_show(to_cxl_root_decoder(dev), buf); +} + +static ssize_t create_ram_region_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return __create_region_show(to_cxl_root_decoder(dev), buf); +} + +static struct cxl_region *__create_region(struct cxl_root_decoder *cxlrd, + enum cxl_partition_mode mode, int id) +{ + int rc; + + switch (mode) { + case CXL_PARTMODE_RAM: + case CXL_PARTMODE_PMEM: + break; + default: + dev_err(&cxlrd->cxlsd.cxld.dev, "unsupported mode %d\n", mode); + return ERR_PTR(-EINVAL); + } + + rc = memregion_alloc(GFP_KERNEL); + if (rc < 0) + return ERR_PTR(rc); + + if (atomic_cmpxchg(&cxlrd->region_id, id, rc) != id) { + memregion_free(rc); + return ERR_PTR(-EBUSY); + } + + return devm_cxl_add_region(cxlrd, id, mode, CXL_DECODER_HOSTONLYMEM); +} + +static ssize_t create_region_store(struct device *dev, const char *buf, + size_t len, enum cxl_partition_mode mode) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev); + struct cxl_region *cxlr; + int rc, id; + + rc = sscanf(buf, "region%d\n", &id); + if (rc != 1) + return -EINVAL; + + cxlr = __create_region(cxlrd, mode, id); + if (IS_ERR(cxlr)) + return PTR_ERR(cxlr); + + return len; +} + +static ssize_t create_pmem_region_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + return create_region_store(dev, buf, len, CXL_PARTMODE_PMEM); +} +DEVICE_ATTR_RW(create_pmem_region); + +static ssize_t create_ram_region_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + return create_region_store(dev, buf, len, CXL_PARTMODE_RAM); +} +DEVICE_ATTR_RW(create_ram_region); + +static ssize_t region_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cxl_decoder *cxld = to_cxl_decoder(dev); + ssize_t rc; + + ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region); + if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem))) + return rc; + + if (cxld->region) + return sysfs_emit(buf, "%s\n", dev_name(&cxld->region->dev)); + return sysfs_emit(buf, "\n"); +} +DEVICE_ATTR_RO(region); + +static struct cxl_region * +cxl_find_region_by_name(struct cxl_root_decoder *cxlrd, const char *name) +{ + struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld; + struct device *region_dev; + + region_dev = device_find_child_by_name(&cxld->dev, name); + if (!region_dev) + return ERR_PTR(-ENODEV); + + return to_cxl_region(region_dev); +} + +static ssize_t 
delete_region_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev); + struct cxl_port *port = to_cxl_port(dev->parent); + struct cxl_region *cxlr; + + cxlr = cxl_find_region_by_name(cxlrd, buf); + if (IS_ERR(cxlr)) + return PTR_ERR(cxlr); + + devm_release_action(port->uport_dev, unregister_region, cxlr); + put_device(&cxlr->dev); + + return len; +} +DEVICE_ATTR_WO(delete_region); + +static void cxl_pmem_region_release(struct device *dev) +{ + struct cxl_pmem_region *cxlr_pmem = to_cxl_pmem_region(dev); + int i; + + for (i = 0; i < cxlr_pmem->nr_mappings; i++) { + struct cxl_memdev *cxlmd = cxlr_pmem->mapping[i].cxlmd; + + put_device(&cxlmd->dev); + } + + kfree(cxlr_pmem); +} + +static const struct attribute_group *cxl_pmem_region_attribute_groups[] = { + &cxl_base_attribute_group, + NULL, +}; + +const struct device_type cxl_pmem_region_type = { + .name = "cxl_pmem_region", + .release = cxl_pmem_region_release, + .groups = cxl_pmem_region_attribute_groups, +}; + +bool is_cxl_pmem_region(struct device *dev) +{ + return dev->type == &cxl_pmem_region_type; +} +EXPORT_SYMBOL_NS_GPL(is_cxl_pmem_region, "CXL"); + +struct cxl_pmem_region *to_cxl_pmem_region(struct device *dev) +{ + if (dev_WARN_ONCE(dev, !is_cxl_pmem_region(dev), + "not a cxl_pmem_region device\n")) + return NULL; + return container_of(dev, struct cxl_pmem_region, dev); +} +EXPORT_SYMBOL_NS_GPL(to_cxl_pmem_region, "CXL"); + +struct cxl_poison_context { + struct cxl_port *port; + int part; + u64 offset; +}; + +static int cxl_get_poison_unmapped(struct cxl_memdev *cxlmd, + struct cxl_poison_context *ctx) +{ + struct cxl_dev_state *cxlds = cxlmd->cxlds; + const struct resource *res; + struct resource *p, *last; + u64 offset, length; + int rc = 0; + + if (ctx->part < 0) + return 0; + + /* + * Collect poison for the remaining unmapped resources after + * poison is collected by committed endpoints decoders. 
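+	 * For example, if committed decoders cover only the start of a
+	 * partition, poison is read here for the remaining tail of that
+	 * partition.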
+ */ + for (int i = ctx->part; i < cxlds->nr_partitions; i++) { + res = &cxlds->part[i].res; + for (p = res->child, last = NULL; p; p = p->sibling) + last = p; + if (last) + offset = last->end + 1; + else + offset = res->start; + length = res->end - offset + 1; + if (!length) + break; + rc = cxl_mem_get_poison(cxlmd, offset, length, NULL); + if (rc == -EFAULT && cxlds->part[i].mode == CXL_PARTMODE_RAM) + continue; + if (rc) + break; + } + + return rc; +} + +static int poison_by_decoder(struct device *dev, void *arg) +{ + struct cxl_poison_context *ctx = arg; + struct cxl_endpoint_decoder *cxled; + enum cxl_partition_mode mode; + struct cxl_dev_state *cxlds; + struct cxl_memdev *cxlmd; + u64 offset, length; + int rc = 0; + + if (!is_endpoint_decoder(dev)) + return rc; + + cxled = to_cxl_endpoint_decoder(dev); + if (!cxled->dpa_res) + return rc; + + cxlmd = cxled_to_memdev(cxled); + cxlds = cxlmd->cxlds; + mode = cxlds->part[cxled->part].mode; + + if (cxled->skip) { + offset = cxled->dpa_res->start - cxled->skip; + length = cxled->skip; + rc = cxl_mem_get_poison(cxlmd, offset, length, NULL); + if (rc == -EFAULT && mode == CXL_PARTMODE_RAM) + rc = 0; + if (rc) + return rc; + } + + offset = cxled->dpa_res->start; + length = cxled->dpa_res->end - offset + 1; + rc = cxl_mem_get_poison(cxlmd, offset, length, cxled->cxld.region); + if (rc == -EFAULT && mode == CXL_PARTMODE_RAM) + rc = 0; + if (rc) + return rc; + + /* Iterate until commit_end is reached */ + if (cxled->cxld.id == ctx->port->commit_end) { + ctx->offset = cxled->dpa_res->end + 1; + ctx->part = cxled->part; + return 1; + } + + return 0; +} + +int cxl_get_poison_by_endpoint(struct cxl_port *port) +{ + struct cxl_poison_context ctx; + int rc = 0; + + ctx = (struct cxl_poison_context) { + .port = port, + .part = -1, + }; + + rc = device_for_each_child(&port->dev, &ctx, poison_by_decoder); + if (rc == 1) + rc = cxl_get_poison_unmapped(to_cxl_memdev(port->uport_dev), + &ctx); + + return rc; +} + +struct cxl_dpa_to_region_context { + struct cxl_region *cxlr; + u64 dpa; +}; + +static int __cxl_dpa_to_region(struct device *dev, void *arg) +{ + struct cxl_dpa_to_region_context *ctx = arg; + struct cxl_endpoint_decoder *cxled; + struct cxl_region *cxlr; + u64 dpa = ctx->dpa; + + if (!is_endpoint_decoder(dev)) + return 0; + + cxled = to_cxl_endpoint_decoder(dev); + if (!cxled || !cxled->dpa_res || !resource_size(cxled->dpa_res)) + return 0; + + if (!cxl_resource_contains_addr(cxled->dpa_res, dpa)) + return 0; + + /* + * Stop the region search (return 1) when an endpoint mapping is + * found. The region may not be fully constructed so offering + * the cxlr in the context structure is not guaranteed. 
+ */ + cxlr = cxled->cxld.region; + if (cxlr) + dev_dbg(dev, "dpa:0x%llx mapped in region:%s\n", dpa, + dev_name(&cxlr->dev)); + else + dev_dbg(dev, "dpa:0x%llx mapped in endpoint:%s\n", dpa, + dev_name(dev)); + + ctx->cxlr = cxlr; + + return 1; +} + +struct cxl_region *cxl_dpa_to_region(const struct cxl_memdev *cxlmd, u64 dpa) +{ + struct cxl_dpa_to_region_context ctx; + struct cxl_port *port; + + ctx = (struct cxl_dpa_to_region_context) { + .dpa = dpa, + }; + port = cxlmd->endpoint; + if (port && is_cxl_endpoint(port) && cxl_num_decoders_committed(port)) + device_for_each_child(&port->dev, &ctx, __cxl_dpa_to_region); + + return ctx.cxlr; +} + +static bool cxl_is_hpa_in_chunk(u64 hpa, struct cxl_region *cxlr, int pos) +{ + struct cxl_region_params *p = &cxlr->params; + int gran = p->interleave_granularity; + int ways = p->interleave_ways; + u64 offset; + + /* Is the hpa in an expected chunk for its pos(-ition) */ + offset = hpa - p->res->start; + offset = do_div(offset, gran * ways); + if ((offset >= pos * gran) && (offset < (pos + 1) * gran)) + return true; + + dev_dbg(&cxlr->dev, + "Addr trans fail: hpa 0x%llx not in expected chunk\n", hpa); + + return false; +} + +#define CXL_POS_ZERO 0 +/** + * cxl_validate_translation_params + * @eiw: encoded interleave ways + * @eig: encoded interleave granularity + * @pos: position in interleave + * + * Callers pass CXL_POS_ZERO when no position parameter needs validating. + * + * Returns: 0 on success, -EINVAL on first invalid parameter + */ +int cxl_validate_translation_params(u8 eiw, u16 eig, int pos) +{ + int ways, gran; + + if (eiw_to_ways(eiw, &ways)) { + pr_debug("%s: invalid eiw=%u\n", __func__, eiw); + return -EINVAL; + } + if (eig_to_granularity(eig, &gran)) { + pr_debug("%s: invalid eig=%u\n", __func__, eig); + return -EINVAL; + } + if (pos < 0 || pos >= ways) { + pr_debug("%s: invalid pos=%d for ways=%u\n", __func__, pos, + ways); + return -EINVAL; + } + + return 0; +} +EXPORT_SYMBOL_FOR_MODULES(cxl_validate_translation_params, "cxl_translate"); + +u64 cxl_calculate_dpa_offset(u64 hpa_offset, u8 eiw, u16 eig) +{ + u64 dpa_offset, bits_lower, bits_upper, temp; + int ret; + + ret = cxl_validate_translation_params(eiw, eig, CXL_POS_ZERO); + if (ret) + return ULLONG_MAX; + + /* + * DPA offset: CXL Spec 3.2 Section 8.2.4.20.13 + * Lower bits [IG+7:0] pass through unchanged + * (eiw < 8) + * Per spec: DPAOffset[51:IG+8] = (HPAOffset[51:IG+IW+8] >> IW) + * Clear the position bits to isolate upper section, then + * reverse the left shift by eiw that occurred during DPA->HPA + * (eiw >= 8) + * Per spec: DPAOffset[51:IG+8] = HPAOffset[51:IG+IW] / 3 + * Extract upper bits from the correct bit range and divide by 3 + * to recover the original DPA upper bits + */ + bits_lower = hpa_offset & GENMASK_ULL(eig + 7, 0); + if (eiw < 8) { + temp = hpa_offset &= ~GENMASK_ULL(eig + eiw + 8 - 1, 0); + dpa_offset = temp >> eiw; + } else { + bits_upper = div64_u64(hpa_offset >> (eig + eiw), 3); + dpa_offset = bits_upper << (eig + 8); + } + dpa_offset |= bits_lower; + + return dpa_offset; +} +EXPORT_SYMBOL_FOR_MODULES(cxl_calculate_dpa_offset, "cxl_translate"); + +int cxl_calculate_position(u64 hpa_offset, u8 eiw, u16 eig) +{ + unsigned int ways = 0; + u64 shifted, rem; + int pos, ret; + + ret = cxl_validate_translation_params(eiw, eig, CXL_POS_ZERO); + if (ret) + return ret; + + if (!eiw) + /* position is 0 if no interleaving */ + return 0; + + /* + * Interleave position: CXL Spec 3.2 Section 8.2.4.20.13 + * eiw < 8 + * Position is in the IW bits at 
HPA_OFFSET[IG+8+IW-1:IG+8]. + * Per spec "remove IW bits starting with bit position IG+8" + * eiw >= 8 + * Position is not explicitly stored in HPA_OFFSET bits. It is + * derived from the modulo operation of the upper bits using + * the total number of interleave ways. + */ + if (eiw < 8) { + pos = (hpa_offset >> (eig + 8)) & GENMASK(eiw - 1, 0); + } else { + shifted = hpa_offset >> (eig + 8); + eiw_to_ways(eiw, &ways); + div64_u64_rem(shifted, ways, &rem); + pos = rem; + } + + return pos; +} +EXPORT_SYMBOL_FOR_MODULES(cxl_calculate_position, "cxl_translate"); + +u64 cxl_calculate_hpa_offset(u64 dpa_offset, int pos, u8 eiw, u16 eig) +{ + u64 mask_upper, hpa_offset, bits_upper; + int ret; + + ret = cxl_validate_translation_params(eiw, eig, pos); + if (ret) + return ULLONG_MAX; + + /* + * The device position in the region interleave set was removed + * from the offset at HPA->DPA translation. To reconstruct the + * HPA, place the 'pos' in the offset. + * + * The placement of 'pos' in the HPA is determined by interleave + * ways and granularity and is defined in the CXL Spec 3.0 Section + * 8.2.4.19.13 Implementation Note: Device Decode Logic + */ + + mask_upper = GENMASK_ULL(51, eig + 8); + + if (eiw < 8) { + hpa_offset = (dpa_offset & mask_upper) << eiw; + hpa_offset |= pos << (eig + 8); + } else { + bits_upper = (dpa_offset & mask_upper) >> (eig + 8); + bits_upper = bits_upper * 3; + hpa_offset = ((bits_upper << (eiw - 8)) + pos) << (eig + 8); + } + + /* The lower bits remain unchanged */ + hpa_offset |= dpa_offset & GENMASK_ULL(eig + 7, 0); + + return hpa_offset; +} +EXPORT_SYMBOL_FOR_MODULES(cxl_calculate_hpa_offset, "cxl_translate"); + +u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, + u64 dpa) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); + struct cxl_region_params *p = &cxlr->params; + struct cxl_endpoint_decoder *cxled = NULL; + u64 dpa_offset, hpa_offset, hpa; + u16 eig = 0; + u8 eiw = 0; + int pos; + + for (int i = 0; i < p->nr_targets; i++) { + if (cxlmd == cxled_to_memdev(p->targets[i])) { + cxled = p->targets[i]; + break; + } + } + if (!cxled) + return ULLONG_MAX; + + pos = cxled->pos; + ways_to_eiw(p->interleave_ways, &eiw); + granularity_to_eig(p->interleave_granularity, &eig); + + dpa_offset = dpa - cxl_dpa_resource_start(cxled); + hpa_offset = cxl_calculate_hpa_offset(dpa_offset, pos, eiw, eig); + + /* Apply the hpa_offset to the region base address */ + hpa = hpa_offset + p->res->start + p->cache_size; + + /* Root decoder translation overrides typical modulo decode */ + if (cxlrd->ops.hpa_to_spa) + hpa = cxlrd->ops.hpa_to_spa(cxlrd, hpa); + + if (!cxl_resource_contains_addr(p->res, hpa)) { + dev_dbg(&cxlr->dev, + "Addr trans fail: hpa 0x%llx not in region\n", hpa); + return ULLONG_MAX; + } + + /* Simple chunk check, by pos & gran, only applies to modulo decodes */ + if (!cxlrd->ops.hpa_to_spa && !cxl_is_hpa_in_chunk(hpa, cxlr, pos)) + return ULLONG_MAX; + + return hpa; +} + +struct dpa_result { + struct cxl_memdev *cxlmd; + u64 dpa; +}; + +static int region_offset_to_dpa_result(struct cxl_region *cxlr, u64 offset, + struct dpa_result *result) +{ + struct cxl_region_params *p = &cxlr->params; + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); + struct cxl_endpoint_decoder *cxled; + u64 hpa, hpa_offset, dpa_offset; + u16 eig = 0; + u8 eiw = 0; + int pos; + + lockdep_assert_held(&cxl_rwsem.region); + lockdep_assert_held(&cxl_rwsem.dpa); + + /* Input validation ensures valid ways and gran */ + 
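+	/*
+	 * The steps below reverse the decode path: region offset -> CXL HPA
+	 * offset (via the root decoder's spa_to_hpa() callback when present)
+	 * -> interleave position and DPA offset -> matching endpoint decoder
+	 * -> (memdev, DPA) result.
+	 */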
granularity_to_eig(p->interleave_granularity, &eig); + ways_to_eiw(p->interleave_ways, &eiw); + + /* + * If the root decoder has SPA to CXL HPA callback, use it. Otherwise + * CXL HPA is assumed to equal SPA. + */ + if (cxlrd->ops.spa_to_hpa) { + hpa = cxlrd->ops.spa_to_hpa(cxlrd, p->res->start + offset); + hpa_offset = hpa - p->res->start; + } else { + hpa_offset = offset; + } + + pos = cxl_calculate_position(hpa_offset, eiw, eig); + if (pos < 0 || pos >= p->nr_targets) { + dev_dbg(&cxlr->dev, "Invalid position %d for %d targets\n", + pos, p->nr_targets); + return -ENXIO; + } + + dpa_offset = cxl_calculate_dpa_offset(hpa_offset, eiw, eig); + + /* Look-up and return the result: a memdev and a DPA */ + for (int i = 0; i < p->nr_targets; i++) { + cxled = p->targets[i]; + if (cxled->pos != pos) + continue; + result->cxlmd = cxled_to_memdev(cxled); + result->dpa = cxl_dpa_resource_start(cxled) + dpa_offset; + + return 0; + } + dev_err(&cxlr->dev, "No device found for position %d\n", pos); + + return -ENXIO; +} + +static struct lock_class_key cxl_pmem_region_key; + +static int cxl_pmem_region_alloc(struct cxl_region *cxlr) +{ + struct cxl_region_params *p = &cxlr->params; + struct cxl_nvdimm_bridge *cxl_nvb; + struct device *dev; + int i; + + guard(rwsem_read)(&cxl_rwsem.region); + if (p->state != CXL_CONFIG_COMMIT) + return -ENXIO; + + struct cxl_pmem_region *cxlr_pmem __free(kfree) = + kzalloc(struct_size(cxlr_pmem, mapping, p->nr_targets), GFP_KERNEL); + if (!cxlr_pmem) + return -ENOMEM; + + cxlr_pmem->hpa_range.start = p->res->start; + cxlr_pmem->hpa_range.end = p->res->end; + + /* Snapshot the region configuration underneath the cxl_rwsem.region */ + cxlr_pmem->nr_mappings = p->nr_targets; + for (i = 0; i < p->nr_targets; i++) { + struct cxl_endpoint_decoder *cxled = p->targets[i]; + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_pmem_region_mapping *m = &cxlr_pmem->mapping[i]; + + /* + * Regions never span CXL root devices, so by definition the + * bridge for one device is the same for all. 
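+		 * It is looked up once, on the first mapping, and cached in
+		 * cxlr->cxl_nvb for the rest of the function and for teardown.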
+ */ + if (i == 0) { + cxl_nvb = cxl_find_nvdimm_bridge(cxlmd->endpoint); + if (!cxl_nvb) + return -ENODEV; + cxlr->cxl_nvb = cxl_nvb; + } + m->cxlmd = cxlmd; + get_device(&cxlmd->dev); + m->start = cxled->dpa_res->start; + m->size = resource_size(cxled->dpa_res); + m->position = i; + } + + dev = &cxlr_pmem->dev; + device_initialize(dev); + lockdep_set_class(&dev->mutex, &cxl_pmem_region_key); + device_set_pm_not_required(dev); + dev->parent = &cxlr->dev; + dev->bus = &cxl_bus_type; + dev->type = &cxl_pmem_region_type; + cxlr_pmem->cxlr = cxlr; + cxlr->cxlr_pmem = no_free_ptr(cxlr_pmem); + + return 0; +} + +static void cxl_dax_region_release(struct device *dev) +{ + struct cxl_dax_region *cxlr_dax = to_cxl_dax_region(dev); + + kfree(cxlr_dax); +} + +static const struct attribute_group *cxl_dax_region_attribute_groups[] = { + &cxl_base_attribute_group, + NULL, +}; + +const struct device_type cxl_dax_region_type = { + .name = "cxl_dax_region", + .release = cxl_dax_region_release, + .groups = cxl_dax_region_attribute_groups, +}; + +static bool is_cxl_dax_region(struct device *dev) +{ + return dev->type == &cxl_dax_region_type; +} + +struct cxl_dax_region *to_cxl_dax_region(struct device *dev) +{ + if (dev_WARN_ONCE(dev, !is_cxl_dax_region(dev), + "not a cxl_dax_region device\n")) + return NULL; + return container_of(dev, struct cxl_dax_region, dev); +} +EXPORT_SYMBOL_NS_GPL(to_cxl_dax_region, "CXL"); + +static struct lock_class_key cxl_dax_region_key; + +static struct cxl_dax_region *cxl_dax_region_alloc(struct cxl_region *cxlr) +{ + struct cxl_region_params *p = &cxlr->params; + struct cxl_dax_region *cxlr_dax; + struct device *dev; + + guard(rwsem_read)(&cxl_rwsem.region); + if (p->state != CXL_CONFIG_COMMIT) + return ERR_PTR(-ENXIO); + + cxlr_dax = kzalloc(sizeof(*cxlr_dax), GFP_KERNEL); + if (!cxlr_dax) + return ERR_PTR(-ENOMEM); + + cxlr_dax->hpa_range.start = p->res->start; + cxlr_dax->hpa_range.end = p->res->end; + + dev = &cxlr_dax->dev; + cxlr_dax->cxlr = cxlr; + device_initialize(dev); + lockdep_set_class(&dev->mutex, &cxl_dax_region_key); + device_set_pm_not_required(dev); + dev->parent = &cxlr->dev; + dev->bus = &cxl_bus_type; + dev->type = &cxl_dax_region_type; + + return cxlr_dax; +} + +static void cxlr_pmem_unregister(void *_cxlr_pmem) +{ + struct cxl_pmem_region *cxlr_pmem = _cxlr_pmem; + struct cxl_region *cxlr = cxlr_pmem->cxlr; + struct cxl_nvdimm_bridge *cxl_nvb = cxlr->cxl_nvb; + + /* + * Either the bridge is in ->remove() context under the device_lock(), + * or cxlr_release_nvdimm() is cancelling the bridge's release action + * for @cxlr_pmem and doing it itself (while manually holding the bridge + * lock). + */ + device_lock_assert(&cxl_nvb->dev); + cxlr->cxlr_pmem = NULL; + cxlr_pmem->cxlr = NULL; + device_unregister(&cxlr_pmem->dev); +} + +static void cxlr_release_nvdimm(void *_cxlr) +{ + struct cxl_region *cxlr = _cxlr; + struct cxl_nvdimm_bridge *cxl_nvb = cxlr->cxl_nvb; + + scoped_guard(device, &cxl_nvb->dev) { + if (cxlr->cxlr_pmem) + devm_release_action(&cxl_nvb->dev, cxlr_pmem_unregister, + cxlr->cxlr_pmem); + } + cxlr->cxl_nvb = NULL; + put_device(&cxl_nvb->dev); +} + +/** + * devm_cxl_add_pmem_region() - add a cxl_region-to-nd_region bridge + * @cxlr: parent CXL region for this pmem region bridge device + * + * Return: 0 on success negative error code on failure. 
+ */ +static int devm_cxl_add_pmem_region(struct cxl_region *cxlr) +{ + struct cxl_pmem_region *cxlr_pmem; + struct cxl_nvdimm_bridge *cxl_nvb; + struct device *dev; + int rc; + + rc = cxl_pmem_region_alloc(cxlr); + if (rc) + return rc; + cxlr_pmem = cxlr->cxlr_pmem; + cxl_nvb = cxlr->cxl_nvb; + + dev = &cxlr_pmem->dev; + rc = dev_set_name(dev, "pmem_region%d", cxlr->id); + if (rc) + goto err; + + rc = device_add(dev); + if (rc) + goto err; + + dev_dbg(&cxlr->dev, "%s: register %s\n", dev_name(dev->parent), + dev_name(dev)); + + scoped_guard(device, &cxl_nvb->dev) { + if (cxl_nvb->dev.driver) + rc = devm_add_action_or_reset(&cxl_nvb->dev, + cxlr_pmem_unregister, + cxlr_pmem); + else + rc = -ENXIO; + } + + if (rc) + goto err_bridge; + + /* @cxlr carries a reference on @cxl_nvb until cxlr_release_nvdimm */ + return devm_add_action_or_reset(&cxlr->dev, cxlr_release_nvdimm, cxlr); + +err: + put_device(dev); +err_bridge: + put_device(&cxl_nvb->dev); + cxlr->cxl_nvb = NULL; + return rc; +} + +static void cxlr_dax_unregister(void *_cxlr_dax) +{ + struct cxl_dax_region *cxlr_dax = _cxlr_dax; + + device_unregister(&cxlr_dax->dev); +} + +static int devm_cxl_add_dax_region(struct cxl_region *cxlr) +{ + struct cxl_dax_region *cxlr_dax; + struct device *dev; + int rc; + + cxlr_dax = cxl_dax_region_alloc(cxlr); + if (IS_ERR(cxlr_dax)) + return PTR_ERR(cxlr_dax); + + dev = &cxlr_dax->dev; + rc = dev_set_name(dev, "dax_region%d", cxlr->id); + if (rc) + goto err; + + rc = device_add(dev); + if (rc) + goto err; + + dev_dbg(&cxlr->dev, "%s: register %s\n", dev_name(dev->parent), + dev_name(dev)); + + return devm_add_action_or_reset(&cxlr->dev, cxlr_dax_unregister, + cxlr_dax); +err: + put_device(dev); + return rc; +} + +static int match_decoder_by_range(struct device *dev, const void *data) +{ + const struct range *r1, *r2 = data; + struct cxl_decoder *cxld; + + if (!is_switch_decoder(dev)) + return 0; + + cxld = to_cxl_decoder(dev); + r1 = &cxld->hpa_range; + return range_contains(r1, r2); +} + +static struct cxl_decoder * +cxl_port_find_switch_decoder(struct cxl_port *port, struct range *hpa) +{ + struct device *cxld_dev = device_find_child(&port->dev, hpa, + match_decoder_by_range); + + return cxld_dev ? 
to_cxl_decoder(cxld_dev) : NULL; +} + +static struct cxl_root_decoder * +cxl_find_root_decoder(struct cxl_endpoint_decoder *cxled) +{ + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_port *port = cxled_to_port(cxled); + struct cxl_root *cxl_root __free(put_cxl_root) = find_cxl_root(port); + struct cxl_decoder *root, *cxld = &cxled->cxld; + struct range *hpa = &cxld->hpa_range; + + root = cxl_port_find_switch_decoder(&cxl_root->port, hpa); + if (!root) { + dev_err(cxlmd->dev.parent, + "%s:%s no CXL window for range %#llx:%#llx\n", + dev_name(&cxlmd->dev), dev_name(&cxld->dev), + cxld->hpa_range.start, cxld->hpa_range.end); + return NULL; + } + + return to_cxl_root_decoder(&root->dev); +} + +static int match_region_by_range(struct device *dev, const void *data) +{ + struct cxl_region_params *p; + struct cxl_region *cxlr; + const struct range *r = data; + + if (!is_cxl_region(dev)) + return 0; + + cxlr = to_cxl_region(dev); + p = &cxlr->params; + + guard(rwsem_read)(&cxl_rwsem.region); + return spa_maps_hpa(p, r); +} + +static int cxl_extended_linear_cache_resize(struct cxl_region *cxlr, + struct resource *res) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); + struct cxl_region_params *p = &cxlr->params; + resource_size_t size = resource_size(res); + resource_size_t cache_size, start; + + cache_size = cxlrd->cache_size; + if (!cache_size) + return 0; + + if (size != cache_size) { + dev_warn(&cxlr->dev, + "Extended Linear Cache size %pa != CXL size %pa. No Support!", + &cache_size, &size); + return -ENXIO; + } + + /* + * Move the start of the range to where the cache range starts. The + * implementation assumes that the cache range is in front of the + * CXL range. This is not dictated by the HMAT spec but is how the + * current known implementation is configured. + * + * The cache range is expected to be within the CFMWS. The adjusted + * res->start should not be less than cxlrd->res->start. + */ + start = res->start - cache_size; + if (start < cxlrd->res->start) + return -ENXIO; + + res->start = start; + p->cache_size = cache_size; + + return 0; +} + +static int __construct_region(struct cxl_region *cxlr, + struct cxl_root_decoder *cxlrd, + struct cxl_endpoint_decoder *cxled) +{ + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct range *hpa = &cxled->cxld.hpa_range; + struct cxl_region_params *p; + struct resource *res; + int rc; + + guard(rwsem_write)(&cxl_rwsem.region); + p = &cxlr->params; + if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE) { + dev_err(cxlmd->dev.parent, + "%s:%s: %s autodiscovery interrupted\n", + dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), + __func__); + return -EBUSY; + } + + set_bit(CXL_REGION_F_AUTO, &cxlr->flags); + + res = kmalloc(sizeof(*res), GFP_KERNEL); + if (!res) + return -ENOMEM; + + *res = DEFINE_RES_MEM_NAMED(hpa->start, range_len(hpa), + dev_name(&cxlr->dev)); + + rc = cxl_extended_linear_cache_resize(cxlr, res); + if (rc && rc != -EOPNOTSUPP) { + /* + * Failing to support extended linear cache region resize does not + * prevent the region from functioning. Only causes cxl list showing + * incorrect region size. 
+ */ + dev_warn(cxlmd->dev.parent, + "Extended linear cache calculation failed rc:%d\n", rc); + } + + rc = sysfs_update_group(&cxlr->dev.kobj, &cxl_region_group); + if (rc) + return rc; + + rc = insert_resource(cxlrd->res, res); + if (rc) { + /* + * Platform-firmware may not have split resources like "System + * RAM" on CXL window boundaries see cxl_region_iomem_release() + */ + dev_warn(cxlmd->dev.parent, + "%s:%s: %s %s cannot insert resource\n", + dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), + __func__, dev_name(&cxlr->dev)); + } + + p->res = res; + p->interleave_ways = cxled->cxld.interleave_ways; + p->interleave_granularity = cxled->cxld.interleave_granularity; + p->state = CXL_CONFIG_INTERLEAVE_ACTIVE; + + rc = sysfs_update_group(&cxlr->dev.kobj, get_cxl_region_target_group()); + if (rc) + return rc; + + dev_dbg(cxlmd->dev.parent, "%s:%s: %s %s res: %pr iw: %d ig: %d\n", + dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), __func__, + dev_name(&cxlr->dev), p->res, p->interleave_ways, + p->interleave_granularity); + + /* ...to match put_device() in cxl_add_to_region() */ + get_device(&cxlr->dev); + + return 0; +} + +/* Establish an empty region covering the given HPA range */ +static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd, + struct cxl_endpoint_decoder *cxled) +{ + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_port *port = cxlrd_to_port(cxlrd); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + int rc, part = READ_ONCE(cxled->part); + struct cxl_region *cxlr; + + do { + cxlr = __create_region(cxlrd, cxlds->part[part].mode, + atomic_read(&cxlrd->region_id)); + } while (IS_ERR(cxlr) && PTR_ERR(cxlr) == -EBUSY); + + if (IS_ERR(cxlr)) { + dev_err(cxlmd->dev.parent, + "%s:%s: %s failed assign region: %ld\n", + dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), + __func__, PTR_ERR(cxlr)); + return cxlr; + } + + rc = __construct_region(cxlr, cxlrd, cxled); + if (rc) { + devm_release_action(port->uport_dev, unregister_region, cxlr); + return ERR_PTR(rc); + } + + return cxlr; +} + +static struct cxl_region * +cxl_find_region_by_range(struct cxl_root_decoder *cxlrd, struct range *hpa) +{ + struct device *region_dev; + + region_dev = device_find_child(&cxlrd->cxlsd.cxld.dev, hpa, + match_region_by_range); + if (!region_dev) + return NULL; + + return to_cxl_region(region_dev); +} + +int cxl_add_to_region(struct cxl_endpoint_decoder *cxled) +{ + struct range *hpa = &cxled->cxld.hpa_range; + struct cxl_region_params *p; + bool attach = false; + int rc; + + struct cxl_root_decoder *cxlrd __free(put_cxl_root_decoder) = + cxl_find_root_decoder(cxled); + if (!cxlrd) + return -ENXIO; + + /* + * Ensure that if multiple threads race to construct_region() for @hpa + * one does the construction and the others add to that. + */ + mutex_lock(&cxlrd->range_lock); + struct cxl_region *cxlr __free(put_cxl_region) = + cxl_find_region_by_range(cxlrd, hpa); + if (!cxlr) + cxlr = construct_region(cxlrd, cxled); + mutex_unlock(&cxlrd->range_lock); + + rc = PTR_ERR_OR_ZERO(cxlr); + if (rc) + return rc; + + attach_target(cxlr, cxled, -1, TASK_UNINTERRUPTIBLE); + + scoped_guard(rwsem_read, &cxl_rwsem.region) { + p = &cxlr->params; + attach = p->state == CXL_CONFIG_COMMIT; + } + + if (attach) { + /* + * If device_attach() fails the range may still be active via + * the platform-firmware memory map, otherwise the driver for + * regions is local to this file, so driver matching can't fail. 
+ */ + if (device_attach(&cxlr->dev) < 0) + dev_err(&cxlr->dev, "failed to enable, range: %pr\n", + p->res); + } + + return rc; +} +EXPORT_SYMBOL_NS_GPL(cxl_add_to_region, "CXL"); + +u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint, u64 spa) +{ + struct cxl_region_ref *iter; + unsigned long index; + + if (!endpoint) + return ~0ULL; + + guard(rwsem_write)(&cxl_rwsem.region); + + xa_for_each(&endpoint->regions, index, iter) { + struct cxl_region_params *p = &iter->region->params; + + if (cxl_resource_contains_addr(p->res, spa)) { + if (!p->cache_size) + return ~0ULL; + + if (spa >= p->res->start + p->cache_size) + return spa - p->cache_size; + + return spa + p->cache_size; + } + } + + return ~0ULL; +} +EXPORT_SYMBOL_NS_GPL(cxl_port_get_spa_cache_alias, "CXL"); + +static int is_system_ram(struct resource *res, void *arg) +{ + struct cxl_region *cxlr = arg; + struct cxl_region_params *p = &cxlr->params; + + dev_dbg(&cxlr->dev, "%pr has System RAM: %pr\n", p->res, res); + return 1; +} + +static void shutdown_notifiers(void *_cxlr) +{ + struct cxl_region *cxlr = _cxlr; + + unregister_node_notifier(&cxlr->node_notifier); + unregister_mt_adistance_algorithm(&cxlr->adist_notifier); +} + +static void remove_debugfs(void *dentry) +{ + debugfs_remove_recursive(dentry); +} + +static int validate_region_offset(struct cxl_region *cxlr, u64 offset) +{ + struct cxl_region_params *p = &cxlr->params; + resource_size_t region_size; + u64 hpa; + + if (offset < p->cache_size) { + dev_err(&cxlr->dev, + "Offset %#llx is within extended linear cache %pa\n", + offset, &p->cache_size); + return -EINVAL; + } + + region_size = resource_size(p->res); + if (offset >= region_size) { + dev_err(&cxlr->dev, "Offset %#llx exceeds region size %pa\n", + offset, ®ion_size); + return -EINVAL; + } + + hpa = p->res->start + offset; + if (hpa < p->res->start || hpa > p->res->end) { + dev_err(&cxlr->dev, "HPA %#llx not in region %pr\n", hpa, + p->res); + return -EINVAL; + } + + return 0; +} + +static int cxl_region_debugfs_poison_inject(void *data, u64 offset) +{ + struct dpa_result result = { .dpa = ULLONG_MAX, .cxlmd = NULL }; + struct cxl_region *cxlr = data; + int rc; + + ACQUIRE(rwsem_read_intr, region_rwsem)(&cxl_rwsem.region); + if ((rc = ACQUIRE_ERR(rwsem_read_intr, ®ion_rwsem))) + return rc; + + ACQUIRE(rwsem_read_intr, dpa_rwsem)(&cxl_rwsem.dpa); + if ((rc = ACQUIRE_ERR(rwsem_read_intr, &dpa_rwsem))) + return rc; + + if (validate_region_offset(cxlr, offset)) + return -EINVAL; + + offset -= cxlr->params.cache_size; + rc = region_offset_to_dpa_result(cxlr, offset, &result); + if (rc || !result.cxlmd || result.dpa == ULLONG_MAX) { + dev_dbg(&cxlr->dev, + "Failed to resolve DPA for region offset %#llx rc %d\n", + offset, rc); + + return rc ? 
rc : -EINVAL; + } + + return cxl_inject_poison_locked(result.cxlmd, result.dpa); +} + +DEFINE_DEBUGFS_ATTRIBUTE(cxl_poison_inject_fops, NULL, + cxl_region_debugfs_poison_inject, "%llx\n"); + +static int cxl_region_debugfs_poison_clear(void *data, u64 offset) +{ + struct dpa_result result = { .dpa = ULLONG_MAX, .cxlmd = NULL }; + struct cxl_region *cxlr = data; + int rc; + + ACQUIRE(rwsem_read_intr, region_rwsem)(&cxl_rwsem.region); + if ((rc = ACQUIRE_ERR(rwsem_read_intr, ®ion_rwsem))) + return rc; + + ACQUIRE(rwsem_read_intr, dpa_rwsem)(&cxl_rwsem.dpa); + if ((rc = ACQUIRE_ERR(rwsem_read_intr, &dpa_rwsem))) + return rc; + + if (validate_region_offset(cxlr, offset)) + return -EINVAL; + + offset -= cxlr->params.cache_size; + rc = region_offset_to_dpa_result(cxlr, offset, &result); + if (rc || !result.cxlmd || result.dpa == ULLONG_MAX) { + dev_dbg(&cxlr->dev, + "Failed to resolve DPA for region offset %#llx rc %d\n", + offset, rc); + + return rc ? rc : -EINVAL; + } + + return cxl_clear_poison_locked(result.cxlmd, result.dpa); +} + +DEFINE_DEBUGFS_ATTRIBUTE(cxl_poison_clear_fops, NULL, + cxl_region_debugfs_poison_clear, "%llx\n"); + +static int cxl_region_can_probe(struct cxl_region *cxlr) +{ + struct cxl_region_params *p = &cxlr->params; + int rc; + + ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region); + if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem))) { + dev_dbg(&cxlr->dev, "probe interrupted\n"); + return rc; + } + + if (p->state < CXL_CONFIG_COMMIT) { + dev_dbg(&cxlr->dev, "config state: %d\n", p->state); + return -ENXIO; + } + + if (test_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags)) { + dev_err(&cxlr->dev, + "failed to activate, re-commit region and retry\n"); + return -ENXIO; + } + + return 0; +} + +static int cxl_region_probe(struct device *dev) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + bool poison_supported = true; + int rc; + + rc = cxl_region_can_probe(cxlr); + if (rc) + return rc; + + /* + * From this point on any path that changes the region's state away from + * CXL_CONFIG_COMMIT is also responsible for releasing the driver. 
+ */ + + cxlr->node_notifier.notifier_call = cxl_region_perf_attrs_callback; + cxlr->node_notifier.priority = CXL_CALLBACK_PRI; + register_node_notifier(&cxlr->node_notifier); + + cxlr->adist_notifier.notifier_call = cxl_region_calculate_adistance; + cxlr->adist_notifier.priority = 100; + register_mt_adistance_algorithm(&cxlr->adist_notifier); + + rc = devm_add_action_or_reset(&cxlr->dev, shutdown_notifiers, cxlr); + if (rc) + return rc; + + /* Create poison attributes if all memdevs support the capabilities */ + for (int i = 0; i < p->nr_targets; i++) { + struct cxl_endpoint_decoder *cxled = p->targets[i]; + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + + if (!cxl_memdev_has_poison_cmd(cxlmd, CXL_POISON_ENABLED_INJECT) || + !cxl_memdev_has_poison_cmd(cxlmd, CXL_POISON_ENABLED_CLEAR)) { + poison_supported = false; + break; + } + } + + if (poison_supported) { + struct dentry *dentry; + + dentry = cxl_debugfs_create_dir(dev_name(dev)); + debugfs_create_file("inject_poison", 0200, dentry, cxlr, + &cxl_poison_inject_fops); + debugfs_create_file("clear_poison", 0200, dentry, cxlr, + &cxl_poison_clear_fops); + rc = devm_add_action_or_reset(dev, remove_debugfs, dentry); + if (rc) + return rc; + } + + switch (cxlr->mode) { + case CXL_PARTMODE_PMEM: + rc = devm_cxl_region_edac_register(cxlr); + if (rc) + dev_dbg(&cxlr->dev, "CXL EDAC registration for region_id=%d failed\n", + cxlr->id); + + return devm_cxl_add_pmem_region(cxlr); + case CXL_PARTMODE_RAM: + rc = devm_cxl_region_edac_register(cxlr); + if (rc) + dev_dbg(&cxlr->dev, "CXL EDAC registration for region_id=%d failed\n", + cxlr->id); + + /* + * The region can not be manged by CXL if any portion of + * it is already online as 'System RAM' + */ + if (walk_iomem_res_desc(IORES_DESC_NONE, + IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY, + p->res->start, p->res->end, cxlr, + is_system_ram) > 0) + return 0; + return devm_cxl_add_dax_region(cxlr); + default: + dev_dbg(&cxlr->dev, "unsupported region mode: %d\n", + cxlr->mode); + return -ENXIO; + } +} + +static struct cxl_driver cxl_region_driver = { + .name = "cxl_region", + .probe = cxl_region_probe, + .id = CXL_DEVICE_REGION, +}; + +int cxl_region_init(void) +{ + return cxl_driver_register(&cxl_region_driver); +} + +void cxl_region_exit(void) +{ + cxl_driver_unregister(&cxl_region_driver); +} + +MODULE_IMPORT_NS("CXL"); +MODULE_IMPORT_NS("DEVMEM"); +MODULE_ALIAS_CXL(CXL_DEVICE_REGION); diff --git a/drivers/cxl/core/regs.c b/drivers/cxl/core/regs.c new file mode 100644 index 000000000000..5ca7b0eed568 --- /dev/null +++ b/drivers/cxl/core/regs.c @@ -0,0 +1,644 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2020 Intel Corporation. */ +#include <linux/io-64-nonatomic-lo-hi.h> +#include <linux/device.h> +#include <linux/slab.h> +#include <linux/pci.h> +#include <cxlmem.h> +#include <cxlpci.h> +#include <pmu.h> + +#include "core.h" + +/** + * DOC: cxl registers + * + * CXL device capabilities are enumerated by PCI DVSEC (Designated + * Vendor-specific) and / or descriptors provided by platform firmware. + * They can be defined as a set like the device and component registers + * mandated by CXL Section 8.1.12.2 Memory Device PCIe Capabilities and + * Extended Capabilities, or they can be individual capabilities + * appended to bridged and endpoint devices. + * + * Provide common infrastructure for enumerating and mapping these + * discrete capabilities. 
+ */ + +/** + * cxl_probe_component_regs() - Detect CXL Component register blocks + * @dev: Host device of the @base mapping + * @base: Mapping containing the HDM Decoder Capability Header + * @map: Map object describing the register block information found + * + * See CXL 2.0 8.2.4 Component Register Layout and Definition + * See CXL 2.0 8.2.5.5 CXL Device Register Interface + * + * Probe for component register information and return it in map object. + */ +void cxl_probe_component_regs(struct device *dev, void __iomem *base, + struct cxl_component_reg_map *map) +{ + int cap, cap_count; + u32 cap_array; + + *map = (struct cxl_component_reg_map) { 0 }; + + /* + * CXL.cache and CXL.mem registers are at offset 0x1000 as defined in + * CXL 2.0 8.2.4 Table 141. + */ + base += CXL_CM_OFFSET; + + cap_array = readl(base + CXL_CM_CAP_HDR_OFFSET); + + if (FIELD_GET(CXL_CM_CAP_HDR_ID_MASK, cap_array) != CM_CAP_HDR_CAP_ID) { + dev_dbg(dev, + "Couldn't locate the CXL.cache and CXL.mem capability array header.\n"); + return; + } + + /* It's assumed that future versions will be backward compatible */ + cap_count = FIELD_GET(CXL_CM_CAP_HDR_ARRAY_SIZE_MASK, cap_array); + + for (cap = 1; cap <= cap_count; cap++) { + void __iomem *register_block; + struct cxl_reg_map *rmap; + u16 cap_id, offset; + u32 length, hdr; + + hdr = readl(base + cap * 0x4); + + cap_id = FIELD_GET(CXL_CM_CAP_HDR_ID_MASK, hdr); + offset = FIELD_GET(CXL_CM_CAP_PTR_MASK, hdr); + register_block = base + offset; + hdr = readl(register_block); + + rmap = NULL; + switch (cap_id) { + case CXL_CM_CAP_CAP_ID_HDM: { + int decoder_cnt; + + dev_dbg(dev, "found HDM decoder capability (0x%x)\n", + offset); + + decoder_cnt = cxl_hdm_decoder_count(hdr); + length = 0x20 * decoder_cnt + 0x10; + rmap = &map->hdm_decoder; + break; + } + case CXL_CM_CAP_CAP_ID_RAS: + dev_dbg(dev, "found RAS capability (0x%x)\n", + offset); + length = CXL_RAS_CAPABILITY_LENGTH; + rmap = &map->ras; + break; + default: + dev_dbg(dev, "Unknown CM cap ID: %d (0x%x)\n", cap_id, + offset); + break; + } + + if (!rmap) + continue; + rmap->valid = true; + rmap->id = cap_id; + rmap->offset = CXL_CM_OFFSET + offset; + rmap->size = length; + } +} +EXPORT_SYMBOL_NS_GPL(cxl_probe_component_regs, "CXL"); + +/** + * cxl_probe_device_regs() - Detect CXL Device register blocks + * @dev: Host device of the @base mapping + * @base: Mapping of CXL 2.0 8.2.8 CXL Device Register Interface + * @map: Map object describing the register block information found + * + * Probe for device register information and return it in map object. 
+ */ +void cxl_probe_device_regs(struct device *dev, void __iomem *base, + struct cxl_device_reg_map *map) +{ + int cap, cap_count; + u64 cap_array; + + *map = (struct cxl_device_reg_map){ 0 }; + + cap_array = readq(base + CXLDEV_CAP_ARRAY_OFFSET); + if (FIELD_GET(CXLDEV_CAP_ARRAY_ID_MASK, cap_array) != + CXLDEV_CAP_ARRAY_CAP_ID) + return; + + cap_count = FIELD_GET(CXLDEV_CAP_ARRAY_COUNT_MASK, cap_array); + + for (cap = 1; cap <= cap_count; cap++) { + struct cxl_reg_map *rmap; + u32 offset, length; + u16 cap_id; + + cap_id = FIELD_GET(CXLDEV_CAP_HDR_CAP_ID_MASK, + readl(base + cap * 0x10)); + offset = readl(base + cap * 0x10 + 0x4); + length = readl(base + cap * 0x10 + 0x8); + + rmap = NULL; + switch (cap_id) { + case CXLDEV_CAP_CAP_ID_DEVICE_STATUS: + dev_dbg(dev, "found Status capability (0x%x)\n", offset); + rmap = &map->status; + break; + case CXLDEV_CAP_CAP_ID_PRIMARY_MAILBOX: + dev_dbg(dev, "found Mailbox capability (0x%x)\n", offset); + rmap = &map->mbox; + break; + case CXLDEV_CAP_CAP_ID_SECONDARY_MAILBOX: + dev_dbg(dev, "found Secondary Mailbox capability (0x%x)\n", offset); + break; + case CXLDEV_CAP_CAP_ID_MEMDEV: + dev_dbg(dev, "found Memory Device capability (0x%x)\n", offset); + rmap = &map->memdev; + break; + default: + if (cap_id >= 0x8000) + dev_dbg(dev, "Vendor cap ID: %#x offset: %#x\n", cap_id, offset); + else + dev_dbg(dev, "Unknown cap ID: %#x offset: %#x\n", cap_id, offset); + break; + } + + if (!rmap) + continue; + rmap->valid = true; + rmap->id = cap_id; + rmap->offset = offset; + rmap->size = length; + } +} +EXPORT_SYMBOL_NS_GPL(cxl_probe_device_regs, "CXL"); + +void __iomem *devm_cxl_iomap_block(struct device *dev, resource_size_t addr, + resource_size_t length) +{ + void __iomem *ret_val; + struct resource *res; + + if (WARN_ON_ONCE(addr == CXL_RESOURCE_NONE)) + return NULL; + + res = devm_request_mem_region(dev, addr, length, dev_name(dev)); + if (!res) { + resource_size_t end = addr + length - 1; + + dev_err(dev, "Failed to request region %pa-%pa\n", &addr, &end); + return NULL; + } + + ret_val = devm_ioremap(dev, addr, length); + if (!ret_val) + dev_err(dev, "Failed to map region %pr\n", res); + + return ret_val; +} + +int cxl_map_component_regs(const struct cxl_register_map *map, + struct cxl_component_regs *regs, + unsigned long map_mask) +{ + struct device *host = map->host; + struct mapinfo { + const struct cxl_reg_map *rmap; + void __iomem **addr; + } mapinfo[] = { + { &map->component_map.hdm_decoder, ®s->hdm_decoder }, + { &map->component_map.ras, ®s->ras }, + }; + int i; + + for (i = 0; i < ARRAY_SIZE(mapinfo); i++) { + struct mapinfo *mi = &mapinfo[i]; + resource_size_t addr; + resource_size_t length; + + if (!mi->rmap->valid) + continue; + if (!test_bit(mi->rmap->id, &map_mask)) + continue; + addr = map->resource + mi->rmap->offset; + length = mi->rmap->size; + *(mi->addr) = devm_cxl_iomap_block(host, addr, length); + if (!*(mi->addr)) + return -ENOMEM; + } + + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_map_component_regs, "CXL"); + +int cxl_map_device_regs(const struct cxl_register_map *map, + struct cxl_device_regs *regs) +{ + struct device *host = map->host; + resource_size_t phys_addr = map->resource; + struct mapinfo { + const struct cxl_reg_map *rmap; + void __iomem **addr; + } mapinfo[] = { + { &map->device_map.status, ®s->status, }, + { &map->device_map.mbox, ®s->mbox, }, + { &map->device_map.memdev, ®s->memdev, }, + }; + int i; + + for (i = 0; i < ARRAY_SIZE(mapinfo); i++) { + struct mapinfo *mi = &mapinfo[i]; + resource_size_t length; + 
resource_size_t addr; + + if (!mi->rmap->valid) + continue; + + addr = phys_addr + mi->rmap->offset; + length = mi->rmap->size; + *(mi->addr) = devm_cxl_iomap_block(host, addr, length); + if (!*(mi->addr)) + return -ENOMEM; + } + + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_map_device_regs, "CXL"); + +static bool cxl_decode_regblock(struct pci_dev *pdev, u32 reg_lo, u32 reg_hi, + struct cxl_register_map *map) +{ + u8 reg_type = FIELD_GET(CXL_DVSEC_REG_LOCATOR_BLOCK_ID_MASK, reg_lo); + int bar = FIELD_GET(CXL_DVSEC_REG_LOCATOR_BIR_MASK, reg_lo); + u64 offset = ((u64)reg_hi << 32) | + (reg_lo & CXL_DVSEC_REG_LOCATOR_BLOCK_OFF_LOW_MASK); + + if (offset > pci_resource_len(pdev, bar)) { + dev_warn(&pdev->dev, + "BAR%d: %pr: too small (offset: %pa, type: %d)\n", bar, + &pdev->resource[bar], &offset, reg_type); + return false; + } + + map->reg_type = reg_type; + map->resource = pci_resource_start(pdev, bar) + offset; + map->max_size = pci_resource_len(pdev, bar) - offset; + return true; +} + +/* + * __cxl_find_regblock_instance() - Locate a register block or count instances by type / index + * Use CXL_INSTANCES_COUNT for @index if counting instances. + * + * __cxl_find_regblock_instance() may return: + * 0 - if register block enumerated. + * >= 0 - if counting instances. + * < 0 - error code otherwise. + */ +static int __cxl_find_regblock_instance(struct pci_dev *pdev, enum cxl_regloc_type type, + struct cxl_register_map *map, int index) +{ + u32 regloc_size, regblocks; + int instance = 0; + int regloc, i; + + *map = (struct cxl_register_map) { + .host = &pdev->dev, + .resource = CXL_RESOURCE_NONE, + }; + + regloc = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL, + CXL_DVSEC_REG_LOCATOR); + if (!regloc) + return -ENXIO; + + pci_read_config_dword(pdev, regloc + PCI_DVSEC_HEADER1, ®loc_size); + regloc_size = FIELD_GET(PCI_DVSEC_HEADER1_LENGTH_MASK, regloc_size); + + regloc += CXL_DVSEC_REG_LOCATOR_BLOCK1_OFFSET; + regblocks = (regloc_size - CXL_DVSEC_REG_LOCATOR_BLOCK1_OFFSET) / 8; + + for (i = 0; i < regblocks; i++, regloc += 8) { + u32 reg_lo, reg_hi; + + pci_read_config_dword(pdev, regloc, ®_lo); + pci_read_config_dword(pdev, regloc + 4, ®_hi); + + if (!cxl_decode_regblock(pdev, reg_lo, reg_hi, map)) + continue; + + if (map->reg_type == type) { + if (index == instance) + return 0; + instance++; + } + } + + map->resource = CXL_RESOURCE_NONE; + if (index == CXL_INSTANCES_COUNT) + return instance; + + return -ENODEV; +} + +/** + * cxl_find_regblock_instance() - Locate a register block by type / index + * @pdev: The CXL PCI device to enumerate. + * @type: Register Block Indicator id + * @map: Enumeration output, clobbered on error + * @index: Index into which particular instance of a regblock wanted in the + * order found in register locator DVSEC. + * + * Return: 0 if register block enumerated, negative error code otherwise + * + * A CXL DVSEC may point to one or more register blocks, search for them + * by @type and @index. + */ +int cxl_find_regblock_instance(struct pci_dev *pdev, enum cxl_regloc_type type, + struct cxl_register_map *map, unsigned int index) +{ + return __cxl_find_regblock_instance(pdev, type, map, index); +} +EXPORT_SYMBOL_NS_GPL(cxl_find_regblock_instance, "CXL"); + +/** + * cxl_find_regblock() - Locate register blocks by type + * @pdev: The CXL PCI device to enumerate. 
+ * @type: Register Block Indicator id + * @map: Enumeration output, clobbered on error + * + * Return: 0 if register block enumerated, negative error code otherwise + * + * A CXL DVSEC may point to one or more register blocks, search for them + * by @type. + */ +int cxl_find_regblock(struct pci_dev *pdev, enum cxl_regloc_type type, + struct cxl_register_map *map) +{ + return __cxl_find_regblock_instance(pdev, type, map, 0); +} +EXPORT_SYMBOL_NS_GPL(cxl_find_regblock, "CXL"); + +/** + * cxl_count_regblock() - Count instances of a given regblock type. + * @pdev: The CXL PCI device to enumerate. + * @type: Register Block Indicator id + * + * Some regblocks may be repeated. Count how many instances. + * + * Return: non-negative count of matching regblocks, negative error code otherwise. + */ +int cxl_count_regblock(struct pci_dev *pdev, enum cxl_regloc_type type) +{ + struct cxl_register_map map; + + return __cxl_find_regblock_instance(pdev, type, &map, CXL_INSTANCES_COUNT); +} +EXPORT_SYMBOL_NS_GPL(cxl_count_regblock, "CXL"); + +int cxl_map_pmu_regs(struct cxl_register_map *map, struct cxl_pmu_regs *regs) +{ + struct device *dev = map->host; + resource_size_t phys_addr; + + phys_addr = map->resource; + regs->pmu = devm_cxl_iomap_block(dev, phys_addr, CXL_PMU_REGMAP_SIZE); + if (!regs->pmu) + return -ENOMEM; + + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_map_pmu_regs, "CXL"); + +static int cxl_map_regblock(struct cxl_register_map *map) +{ + struct device *host = map->host; + + map->base = ioremap(map->resource, map->max_size); + if (!map->base) { + dev_err(host, "failed to map registers\n"); + return -ENOMEM; + } + + dev_dbg(host, "Mapped CXL Memory Device resource %pa\n", &map->resource); + return 0; +} + +static void cxl_unmap_regblock(struct cxl_register_map *map) +{ + iounmap(map->base); + map->base = NULL; +} + +static int cxl_probe_regs(struct cxl_register_map *map) +{ + struct cxl_component_reg_map *comp_map; + struct cxl_device_reg_map *dev_map; + struct device *host = map->host; + void __iomem *base = map->base; + + switch (map->reg_type) { + case CXL_REGLOC_RBI_COMPONENT: + comp_map = &map->component_map; + cxl_probe_component_regs(host, base, comp_map); + dev_dbg(host, "Set up component registers\n"); + break; + case CXL_REGLOC_RBI_MEMDEV: + dev_map = &map->device_map; + cxl_probe_device_regs(host, base, dev_map); + if (!dev_map->status.valid || !dev_map->mbox.valid || + !dev_map->memdev.valid) { + dev_err(host, "registers not found: %s%s%s\n", + !dev_map->status.valid ? "status " : "", + !dev_map->mbox.valid ? "mbox " : "", + !dev_map->memdev.valid ? "memdev " : ""); + return -ENXIO; + } + + dev_dbg(host, "Probing device registers...\n"); + break; + default: + break; + } + + return 0; +} + +int cxl_setup_regs(struct cxl_register_map *map) +{ + int rc; + + rc = cxl_map_regblock(map); + if (rc) + return rc; + + rc = cxl_probe_regs(map); + cxl_unmap_regblock(map); + + return rc; +} +EXPORT_SYMBOL_NS_GPL(cxl_setup_regs, "CXL"); + +u16 cxl_rcrb_to_aer(struct device *dev, resource_size_t rcrb) +{ + void __iomem *addr; + u16 offset = 0; + u32 cap_hdr; + + if (WARN_ON_ONCE(rcrb == CXL_RESOURCE_NONE)) + return 0; + + if (!request_mem_region(rcrb, SZ_4K, dev_name(dev))) + return 0; + + addr = ioremap(rcrb, SZ_4K); + if (!addr) + goto out; + + cap_hdr = readl(addr + offset); + while (PCI_EXT_CAP_ID(cap_hdr) != PCI_EXT_CAP_ID_ERR) { + offset = PCI_EXT_CAP_NEXT(cap_hdr); + + /* Offset 0 terminates capability list. 
*/ + if (!offset) + break; + cap_hdr = readl(addr + offset); + } + + if (offset) + dev_dbg(dev, "found AER extended capability (0x%x)\n", offset); + + iounmap(addr); +out: + release_mem_region(rcrb, SZ_4K); + + return offset; +} + +static resource_size_t cxl_rcrb_to_linkcap(struct device *dev, struct cxl_dport *dport) +{ + resource_size_t rcrb = dport->rcrb.base; + void __iomem *addr; + u32 cap_hdr; + u16 offset; + + if (!request_mem_region(rcrb, SZ_4K, "CXL RCRB")) + return CXL_RESOURCE_NONE; + + addr = ioremap(rcrb, SZ_4K); + if (!addr) { + dev_err(dev, "Failed to map region %pr\n", addr); + release_mem_region(rcrb, SZ_4K); + return CXL_RESOURCE_NONE; + } + + offset = FIELD_GET(PCI_RCRB_CAP_LIST_ID_MASK, readw(addr + PCI_CAPABILITY_LIST)); + cap_hdr = readl(addr + offset); + while ((FIELD_GET(PCI_RCRB_CAP_HDR_ID_MASK, cap_hdr)) != PCI_CAP_ID_EXP) { + offset = FIELD_GET(PCI_RCRB_CAP_HDR_NEXT_MASK, cap_hdr); + if (offset == 0 || offset > SZ_4K) { + offset = 0; + break; + } + cap_hdr = readl(addr + offset); + } + + iounmap(addr); + release_mem_region(rcrb, SZ_4K); + if (!offset) + return CXL_RESOURCE_NONE; + + return offset; +} + +int cxl_dport_map_rcd_linkcap(struct pci_dev *pdev, struct cxl_dport *dport) +{ + void __iomem *dport_pcie_cap = NULL; + resource_size_t pos; + struct cxl_rcrb_info *ri; + + ri = &dport->rcrb; + pos = cxl_rcrb_to_linkcap(&pdev->dev, dport); + if (pos == CXL_RESOURCE_NONE) + return -ENXIO; + + dport_pcie_cap = devm_cxl_iomap_block(&pdev->dev, + ri->base + pos, + PCI_CAP_EXP_SIZEOF); + dport->regs.rcd_pcie_cap = dport_pcie_cap; + + return 0; +} +EXPORT_SYMBOL_NS_GPL(cxl_dport_map_rcd_linkcap, "CXL"); + +resource_size_t __rcrb_to_component(struct device *dev, struct cxl_rcrb_info *ri, + enum cxl_rcrb which) +{ + resource_size_t component_reg_phys; + resource_size_t rcrb = ri->base; + void __iomem *addr; + u32 bar0, bar1; + u32 id; + + if (which == CXL_RCRB_UPSTREAM) + rcrb += SZ_4K; + + /* + * RCRB's BAR[0..1] point to component block containing CXL + * subsystem component registers. MEMBAR extraction follows + * the PCI Base spec here, esp. 64 bit extraction and memory + * ranges alignment (6.0, 7.5.1.2.1). + */ + if (!request_mem_region(rcrb, SZ_4K, "CXL RCRB")) + return CXL_RESOURCE_NONE; + addr = ioremap(rcrb, SZ_4K); + if (!addr) { + dev_err(dev, "Failed to map region %pr\n", addr); + release_mem_region(rcrb, SZ_4K); + return CXL_RESOURCE_NONE; + } + + id = readl(addr + PCI_VENDOR_ID); + bar0 = readl(addr + PCI_BASE_ADDRESS_0); + bar1 = readl(addr + PCI_BASE_ADDRESS_1); + iounmap(addr); + release_mem_region(rcrb, SZ_4K); + + /* + * Sanity check, see CXL 3.0 Figure 9-8 CXL Device that Does Not + * Remap Upstream Port and Component Registers + */ + if (id == U32_MAX) { + if (which == CXL_RCRB_DOWNSTREAM) + dev_err(dev, "Failed to access Downstream Port RCRB\n"); + return CXL_RESOURCE_NONE; + } + /* The RCRB is a Memory Window, and the MEM_TYPE_1M bit is obsolete */ + if (bar0 & (PCI_BASE_ADDRESS_MEM_TYPE_1M | PCI_BASE_ADDRESS_SPACE_IO)) + return CXL_RESOURCE_NONE; + + component_reg_phys = bar0 & PCI_BASE_ADDRESS_MEM_MASK; + if (bar0 & PCI_BASE_ADDRESS_MEM_TYPE_64) + component_reg_phys |= ((u64)bar1) << 32; + + if (!component_reg_phys) + return CXL_RESOURCE_NONE; + + /* MEMBAR is block size (64k) aligned. 
*/ + if (!IS_ALIGNED(component_reg_phys, CXL_COMPONENT_REG_BLOCK_SIZE)) + return CXL_RESOURCE_NONE; + + return component_reg_phys; +} + +resource_size_t cxl_rcd_component_reg_phys(struct device *dev, + struct cxl_dport *dport) +{ + if (!dport->rch) + return CXL_RESOURCE_NONE; + return __rcrb_to_component(dev, &dport->rcrb, CXL_RCRB_UPSTREAM); +} +EXPORT_SYMBOL_NS_GPL(cxl_rcd_component_reg_phys, "CXL"); diff --git a/drivers/cxl/core/suspend.c b/drivers/cxl/core/suspend.c new file mode 100644 index 000000000000..29aa5cc5e565 --- /dev/null +++ b/drivers/cxl/core/suspend.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2022 Intel Corporation. All rights reserved. */ +#include <linux/atomic.h> +#include <linux/export.h> +#include "cxlmem.h" + +static atomic_t mem_active; + +bool cxl_mem_active(void) +{ + return atomic_read(&mem_active) != 0; +} + +void cxl_mem_active_inc(void) +{ + atomic_inc(&mem_active); +} +EXPORT_SYMBOL_NS_GPL(cxl_mem_active_inc, "CXL"); + +void cxl_mem_active_dec(void) +{ + atomic_dec(&mem_active); +} +EXPORT_SYMBOL_NS_GPL(cxl_mem_active_dec, "CXL"); diff --git a/drivers/cxl/core/trace.c b/drivers/cxl/core/trace.c new file mode 100644 index 000000000000..7f2a9dd0d0e3 --- /dev/null +++ b/drivers/cxl/core/trace.c @@ -0,0 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2022 Intel Corporation. All rights reserved. */ + +#include <cxl.h> +#include "core.h" + +#define CREATE_TRACE_POINTS +#include "trace.h" diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h new file mode 100644 index 000000000000..a972e4ef1936 --- /dev/null +++ b/drivers/cxl/core/trace.h @@ -0,0 +1,1105 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2022 Intel Corporation. All rights reserved. */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM cxl + +#if !defined(_CXL_EVENTS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _CXL_EVENTS_H + +#include <linux/tracepoint.h> +#include <linux/pci.h> +#include <linux/unaligned.h> + +#include <cxl.h> +#include <cxlmem.h> +#include "core.h" + +#define CXL_RAS_UC_CACHE_DATA_PARITY BIT(0) +#define CXL_RAS_UC_CACHE_ADDR_PARITY BIT(1) +#define CXL_RAS_UC_CACHE_BE_PARITY BIT(2) +#define CXL_RAS_UC_CACHE_DATA_ECC BIT(3) +#define CXL_RAS_UC_MEM_DATA_PARITY BIT(4) +#define CXL_RAS_UC_MEM_ADDR_PARITY BIT(5) +#define CXL_RAS_UC_MEM_BE_PARITY BIT(6) +#define CXL_RAS_UC_MEM_DATA_ECC BIT(7) +#define CXL_RAS_UC_REINIT_THRESH BIT(8) +#define CXL_RAS_UC_RSVD_ENCODE BIT(9) +#define CXL_RAS_UC_POISON BIT(10) +#define CXL_RAS_UC_RECV_OVERFLOW BIT(11) +#define CXL_RAS_UC_INTERNAL_ERR BIT(14) +#define CXL_RAS_UC_IDE_TX_ERR BIT(15) +#define CXL_RAS_UC_IDE_RX_ERR BIT(16) + +#define show_uc_errs(status) __print_flags(status, " | ", \ + { CXL_RAS_UC_CACHE_DATA_PARITY, "Cache Data Parity Error" }, \ + { CXL_RAS_UC_CACHE_ADDR_PARITY, "Cache Address Parity Error" }, \ + { CXL_RAS_UC_CACHE_BE_PARITY, "Cache Byte Enable Parity Error" }, \ + { CXL_RAS_UC_CACHE_DATA_ECC, "Cache Data ECC Error" }, \ + { CXL_RAS_UC_MEM_DATA_PARITY, "Memory Data Parity Error" }, \ + { CXL_RAS_UC_MEM_ADDR_PARITY, "Memory Address Parity Error" }, \ + { CXL_RAS_UC_MEM_BE_PARITY, "Memory Byte Enable Parity Error" }, \ + { CXL_RAS_UC_MEM_DATA_ECC, "Memory Data ECC Error" }, \ + { CXL_RAS_UC_REINIT_THRESH, "REINIT Threshold Hit" }, \ + { CXL_RAS_UC_RSVD_ENCODE, "Received Unrecognized Encoding" }, \ + { CXL_RAS_UC_POISON, "Received Poison From Peer" }, \ + { CXL_RAS_UC_RECV_OVERFLOW, "Receiver Overflow" }, \ + { CXL_RAS_UC_INTERNAL_ERR, "Component Specific Error" 
}, \ + { CXL_RAS_UC_IDE_TX_ERR, "IDE Tx Error" }, \ + { CXL_RAS_UC_IDE_RX_ERR, "IDE Rx Error" } \ +) + +TRACE_EVENT(cxl_port_aer_uncorrectable_error, + TP_PROTO(struct device *dev, u32 status, u32 fe, u32 *hl), + TP_ARGS(dev, status, fe, hl), + TP_STRUCT__entry( + __string(device, dev_name(dev)) + __string(host, dev_name(dev->parent)) + __field(u32, status) + __field(u32, first_error) + __array(u32, header_log, CXL_HEADERLOG_SIZE_U32) + ), + TP_fast_assign( + __assign_str(device); + __assign_str(host); + __entry->status = status; + __entry->first_error = fe; + /* + * Embed the 512B headerlog data for user app retrieval and + * parsing, but no need to print this in the trace buffer. + */ + memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE); + ), + TP_printk("device=%s host=%s status: '%s' first_error: '%s'", + __get_str(device), __get_str(host), + show_uc_errs(__entry->status), + show_uc_errs(__entry->first_error) + ) +); + +TRACE_EVENT(cxl_aer_uncorrectable_error, + TP_PROTO(const struct cxl_memdev *cxlmd, u32 status, u32 fe, u32 *hl), + TP_ARGS(cxlmd, status, fe, hl), + TP_STRUCT__entry( + __string(memdev, dev_name(&cxlmd->dev)) + __string(host, dev_name(cxlmd->dev.parent)) + __field(u64, serial) + __field(u32, status) + __field(u32, first_error) + __array(u32, header_log, CXL_HEADERLOG_SIZE_U32) + ), + TP_fast_assign( + __assign_str(memdev); + __assign_str(host); + __entry->serial = cxlmd->cxlds->serial; + __entry->status = status; + __entry->first_error = fe; + /* + * Embed the 512B headerlog data for user app retrieval and + * parsing, but no need to print this in the trace buffer. + */ + memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE); + ), + TP_printk("memdev=%s host=%s serial=%lld: status: '%s' first_error: '%s'", + __get_str(memdev), __get_str(host), __entry->serial, + show_uc_errs(__entry->status), + show_uc_errs(__entry->first_error) + ) +); + +#define CXL_RAS_CE_CACHE_DATA_ECC BIT(0) +#define CXL_RAS_CE_MEM_DATA_ECC BIT(1) +#define CXL_RAS_CE_CRC_THRESH BIT(2) +#define CLX_RAS_CE_RETRY_THRESH BIT(3) +#define CXL_RAS_CE_CACHE_POISON BIT(4) +#define CXL_RAS_CE_MEM_POISON BIT(5) +#define CXL_RAS_CE_PHYS_LAYER_ERR BIT(6) + +#define show_ce_errs(status) __print_flags(status, " | ", \ + { CXL_RAS_CE_CACHE_DATA_ECC, "Cache Data ECC Error" }, \ + { CXL_RAS_CE_MEM_DATA_ECC, "Memory Data ECC Error" }, \ + { CXL_RAS_CE_CRC_THRESH, "CRC Threshold Hit" }, \ + { CLX_RAS_CE_RETRY_THRESH, "Retry Threshold" }, \ + { CXL_RAS_CE_CACHE_POISON, "Received Cache Poison From Peer" }, \ + { CXL_RAS_CE_MEM_POISON, "Received Memory Poison From Peer" }, \ + { CXL_RAS_CE_PHYS_LAYER_ERR, "Received Error From Physical Layer" } \ +) + +TRACE_EVENT(cxl_port_aer_correctable_error, + TP_PROTO(struct device *dev, u32 status), + TP_ARGS(dev, status), + TP_STRUCT__entry( + __string(device, dev_name(dev)) + __string(host, dev_name(dev->parent)) + __field(u32, status) + ), + TP_fast_assign( + __assign_str(device); + __assign_str(host); + __entry->status = status; + ), + TP_printk("device=%s host=%s status='%s'", + __get_str(device), __get_str(host), + show_ce_errs(__entry->status) + ) +); + +TRACE_EVENT(cxl_aer_correctable_error, + TP_PROTO(const struct cxl_memdev *cxlmd, u32 status), + TP_ARGS(cxlmd, status), + TP_STRUCT__entry( + __string(memdev, dev_name(&cxlmd->dev)) + __string(host, dev_name(cxlmd->dev.parent)) + __field(u64, serial) + __field(u32, status) + ), + TP_fast_assign( + __assign_str(memdev); + __assign_str(host); + __entry->serial = cxlmd->cxlds->serial; + __entry->status = status; + ), + 
TP_printk("memdev=%s host=%s serial=%lld: status: '%s'", + __get_str(memdev), __get_str(host), __entry->serial, + show_ce_errs(__entry->status) + ) +); + +#define cxl_event_log_type_str(type) \ + __print_symbolic(type, \ + { CXL_EVENT_TYPE_INFO, "Informational" }, \ + { CXL_EVENT_TYPE_WARN, "Warning" }, \ + { CXL_EVENT_TYPE_FAIL, "Failure" }, \ + { CXL_EVENT_TYPE_FATAL, "Fatal" }) + +TRACE_EVENT(cxl_overflow, + + TP_PROTO(const struct cxl_memdev *cxlmd, enum cxl_event_log_type log, + struct cxl_get_event_payload *payload), + + TP_ARGS(cxlmd, log, payload), + + TP_STRUCT__entry( + __string(memdev, dev_name(&cxlmd->dev)) + __string(host, dev_name(cxlmd->dev.parent)) + __field(int, log) + __field(u64, serial) + __field(u64, first_ts) + __field(u64, last_ts) + __field(u16, count) + ), + + TP_fast_assign( + __assign_str(memdev); + __assign_str(host); + __entry->serial = cxlmd->cxlds->serial; + __entry->log = log; + __entry->count = le16_to_cpu(payload->overflow_err_count); + __entry->first_ts = le64_to_cpu(payload->first_overflow_timestamp); + __entry->last_ts = le64_to_cpu(payload->last_overflow_timestamp); + ), + + TP_printk("memdev=%s host=%s serial=%lld: log=%s : %u records from %llu to %llu", + __get_str(memdev), __get_str(host), __entry->serial, + cxl_event_log_type_str(__entry->log), __entry->count, + __entry->first_ts, __entry->last_ts) + +); + +/* + * Common Event Record Format + * CXL 3.0 section 8.2.9.2.1; Table 8-42 + */ +#define CXL_EVENT_RECORD_FLAG_PERMANENT BIT(2) +#define CXL_EVENT_RECORD_FLAG_MAINT_NEEDED BIT(3) +#define CXL_EVENT_RECORD_FLAG_PERF_DEGRADED BIT(4) +#define CXL_EVENT_RECORD_FLAG_HW_REPLACE BIT(5) +#define CXL_EVENT_RECORD_FLAG_MAINT_OP_SUB_CLASS_VALID BIT(6) +#define CXL_EVENT_RECORD_FLAG_LD_ID_VALID BIT(7) +#define CXL_EVENT_RECORD_FLAG_HEAD_ID_VALID BIT(8) +#define show_hdr_flags(flags) __print_flags(flags, " | ", \ + { CXL_EVENT_RECORD_FLAG_PERMANENT, "PERMANENT_CONDITION" }, \ + { CXL_EVENT_RECORD_FLAG_MAINT_NEEDED, "MAINTENANCE_NEEDED" }, \ + { CXL_EVENT_RECORD_FLAG_PERF_DEGRADED, "PERFORMANCE_DEGRADED" }, \ + { CXL_EVENT_RECORD_FLAG_HW_REPLACE, "HARDWARE_REPLACEMENT_NEEDED" }, \ + { CXL_EVENT_RECORD_FLAG_MAINT_OP_SUB_CLASS_VALID, "MAINT_OP_SUB_CLASS_VALID" }, \ + { CXL_EVENT_RECORD_FLAG_LD_ID_VALID, "LD_ID_VALID" }, \ + { CXL_EVENT_RECORD_FLAG_HEAD_ID_VALID, "HEAD_ID_VALID" } \ +) + +/* + * Define macros for the common header of each CXL event. + * + * Tracepoints using these macros must do 3 things: + * + * 1) Add CXL_EVT_TP_entry to TP_STRUCT__entry + * 2) Use CXL_EVT_TP_fast_assign within TP_fast_assign; + * pass the dev, log, and CXL event header + * NOTE: The uuid must be assigned by the specific trace event + * 3) Use CXL_EVT_TP_printk() instead of TP_printk() + * + * See the generic_event tracepoint as an example. 
+ */ +#define CXL_EVT_TP_entry \ + __string(memdev, dev_name(&cxlmd->dev)) \ + __string(host, dev_name(cxlmd->dev.parent)) \ + __field(int, log) \ + __field_struct(uuid_t, hdr_uuid) \ + __field(u64, serial) \ + __field(u32, hdr_flags) \ + __field(u16, hdr_handle) \ + __field(u16, hdr_related_handle) \ + __field(u64, hdr_timestamp) \ + __field(u8, hdr_length) \ + __field(u8, hdr_maint_op_class) \ + __field(u8, hdr_maint_op_sub_class) \ + __field(u16, hdr_ld_id) \ + __field(u8, hdr_head_id) + +#define CXL_EVT_TP_fast_assign(cxlmd, l, hdr) \ + __assign_str(memdev); \ + __assign_str(host); \ + __entry->log = (l); \ + __entry->serial = (cxlmd)->cxlds->serial; \ + __entry->hdr_length = (hdr).length; \ + __entry->hdr_flags = get_unaligned_le24((hdr).flags); \ + __entry->hdr_handle = le16_to_cpu((hdr).handle); \ + __entry->hdr_related_handle = le16_to_cpu((hdr).related_handle); \ + __entry->hdr_timestamp = le64_to_cpu((hdr).timestamp); \ + __entry->hdr_maint_op_class = (hdr).maint_op_class; \ + __entry->hdr_maint_op_sub_class = (hdr).maint_op_sub_class; \ + __entry->hdr_ld_id = le16_to_cpu((hdr).ld_id); \ + __entry->hdr_head_id = (hdr).head_id + +#define CXL_EVT_TP_printk(fmt, ...) \ + TP_printk("memdev=%s host=%s serial=%lld log=%s : time=%llu uuid=%pUb " \ + "len=%d flags='%s' handle=%x related_handle=%x " \ + "maint_op_class=%u maint_op_sub_class=%u " \ + "ld_id=%x head_id=%x : " fmt, \ + __get_str(memdev), __get_str(host), __entry->serial, \ + cxl_event_log_type_str(__entry->log), \ + __entry->hdr_timestamp, &__entry->hdr_uuid, __entry->hdr_length,\ + show_hdr_flags(__entry->hdr_flags), __entry->hdr_handle, \ + __entry->hdr_related_handle, __entry->hdr_maint_op_class, \ + __entry->hdr_maint_op_sub_class, \ + __entry->hdr_ld_id, __entry->hdr_head_id, \ + ##__VA_ARGS__) + +TRACE_EVENT(cxl_generic_event, + + TP_PROTO(const struct cxl_memdev *cxlmd, enum cxl_event_log_type log, + const uuid_t *uuid, struct cxl_event_generic *gen_rec), + + TP_ARGS(cxlmd, log, uuid, gen_rec), + + TP_STRUCT__entry( + CXL_EVT_TP_entry + __array(u8, data, CXL_EVENT_RECORD_DATA_LENGTH) + ), + + TP_fast_assign( + CXL_EVT_TP_fast_assign(cxlmd, log, gen_rec->hdr); + memcpy(&__entry->hdr_uuid, uuid, sizeof(uuid_t)); + memcpy(__entry->data, gen_rec->data, CXL_EVENT_RECORD_DATA_LENGTH); + ), + + CXL_EVT_TP_printk("%s", + __print_hex(__entry->data, CXL_EVENT_RECORD_DATA_LENGTH)) +); + +/* + * Physical Address field masks + * + * General Media Event Record + * CXL rev 3.0 Section 8.2.9.2.1.1; Table 8-43 + * + * DRAM Event Record + * CXL rev 3.0 section 8.2.9.2.1.2; Table 8-44 + */ +#define CXL_DPA_FLAGS_MASK GENMASK(1, 0) +#define CXL_DPA_MASK GENMASK_ULL(63, 6) + +#define CXL_DPA_VOLATILE BIT(0) +#define CXL_DPA_NOT_REPAIRABLE BIT(1) +#define show_dpa_flags(flags) __print_flags(flags, "|", \ + { CXL_DPA_VOLATILE, "VOLATILE" }, \ + { CXL_DPA_NOT_REPAIRABLE, "NOT_REPAIRABLE" } \ +) + +/* + * Component ID Format + * CXL 3.1 section 8.2.9.2.1; Table 8-44 + */ +#define CXL_PLDM_COMPONENT_ID_ENTITY_VALID BIT(0) +#define CXL_PLDM_COMPONENT_ID_RES_VALID BIT(1) + +#define show_comp_id_pldm_flags(flags) __print_flags(flags, " | ", \ + { CXL_PLDM_COMPONENT_ID_ENTITY_VALID, "PLDM Entity ID" }, \ + { CXL_PLDM_COMPONENT_ID_RES_VALID, "Resource ID" } \ +) + +#define show_pldm_entity_id(flags, valid_comp_id, valid_id_format, comp_id) \ + (flags & valid_comp_id && flags & valid_id_format) ? \ + (comp_id[0] & CXL_PLDM_COMPONENT_ID_ENTITY_VALID) ? 
\ + __print_hex(&comp_id[1], 6) : "0x00" : "0x00" + +#define show_pldm_resource_id(flags, valid_comp_id, valid_id_format, comp_id) \ + (flags & valid_comp_id && flags & valid_id_format) ? \ + (comp_id[0] & CXL_PLDM_COMPONENT_ID_RES_VALID) ? \ + __print_hex(&comp_id[7], 4) : "0x00" : "0x00" + +/* + * General Media Event Record - GMER + * CXL rev 3.1 Section 8.2.9.2.1.1; Table 8-45 + */ +#define CXL_GMER_EVT_DESC_UNCORECTABLE_EVENT BIT(0) +#define CXL_GMER_EVT_DESC_THRESHOLD_EVENT BIT(1) +#define CXL_GMER_EVT_DESC_POISON_LIST_OVERFLOW BIT(2) +#define show_event_desc_flags(flags) __print_flags(flags, "|", \ + { CXL_GMER_EVT_DESC_UNCORECTABLE_EVENT, "UNCORRECTABLE_EVENT" }, \ + { CXL_GMER_EVT_DESC_THRESHOLD_EVENT, "THRESHOLD_EVENT" }, \ + { CXL_GMER_EVT_DESC_POISON_LIST_OVERFLOW, "POISON_LIST_OVERFLOW" } \ +) + +#define CXL_GMER_MEM_EVT_TYPE_ECC_ERROR 0x00 +#define CXL_GMER_MEM_EVT_TYPE_INV_ADDR 0x01 +#define CXL_GMER_MEM_EVT_TYPE_DATA_PATH_ERROR 0x02 +#define CXL_GMER_MEM_EVT_TYPE_TE_STATE_VIOLATION 0x03 +#define CXL_GMER_MEM_EVT_TYPE_SCRUB_MEDIA_ECC_ERROR 0x04 +#define CXL_GMER_MEM_EVT_TYPE_AP_CME_COUNTER_EXPIRE 0x05 +#define CXL_GMER_MEM_EVT_TYPE_CKID_VIOLATION 0x06 +#define show_gmer_mem_event_type(type) __print_symbolic(type, \ + { CXL_GMER_MEM_EVT_TYPE_ECC_ERROR, "ECC Error" }, \ + { CXL_GMER_MEM_EVT_TYPE_INV_ADDR, "Invalid Address" }, \ + { CXL_GMER_MEM_EVT_TYPE_DATA_PATH_ERROR, "Data Path Error" }, \ + { CXL_GMER_MEM_EVT_TYPE_TE_STATE_VIOLATION, "TE State Violation" }, \ + { CXL_GMER_MEM_EVT_TYPE_SCRUB_MEDIA_ECC_ERROR, "Scrub Media ECC Error" }, \ + { CXL_GMER_MEM_EVT_TYPE_AP_CME_COUNTER_EXPIRE, "Adv Prog CME Counter Expiration" }, \ + { CXL_GMER_MEM_EVT_TYPE_CKID_VIOLATION, "CKID Violation" } \ +) + +#define CXL_GMER_TRANS_UNKNOWN 0x00 +#define CXL_GMER_TRANS_HOST_READ 0x01 +#define CXL_GMER_TRANS_HOST_WRITE 0x02 +#define CXL_GMER_TRANS_HOST_SCAN_MEDIA 0x03 +#define CXL_GMER_TRANS_HOST_INJECT_POISON 0x04 +#define CXL_GMER_TRANS_INTERNAL_MEDIA_SCRUB 0x05 +#define CXL_GMER_TRANS_INTERNAL_MEDIA_MANAGEMENT 0x06 +#define CXL_GMER_TRANS_INTERNAL_MEDIA_ECS 0x07 +#define CXL_GMER_TRANS_MEDIA_INITIALIZATION 0x08 +#define show_trans_type(type) __print_symbolic(type, \ + { CXL_GMER_TRANS_UNKNOWN, "Unknown" }, \ + { CXL_GMER_TRANS_HOST_READ, "Host Read" }, \ + { CXL_GMER_TRANS_HOST_WRITE, "Host Write" }, \ + { CXL_GMER_TRANS_HOST_SCAN_MEDIA, "Host Scan Media" }, \ + { CXL_GMER_TRANS_HOST_INJECT_POISON, "Host Inject Poison" }, \ + { CXL_GMER_TRANS_INTERNAL_MEDIA_SCRUB, "Internal Media Scrub" }, \ + { CXL_GMER_TRANS_INTERNAL_MEDIA_MANAGEMENT, "Internal Media Management" }, \ + { CXL_GMER_TRANS_INTERNAL_MEDIA_ECS, "Internal Media Error Check Scrub" }, \ + { CXL_GMER_TRANS_MEDIA_INITIALIZATION, "Media Initialization" } \ +) + +#define CXL_GMER_VALID_CHANNEL BIT(0) +#define CXL_GMER_VALID_RANK BIT(1) +#define CXL_GMER_VALID_DEVICE BIT(2) +#define CXL_GMER_VALID_COMPONENT BIT(3) +#define CXL_GMER_VALID_COMPONENT_ID_FORMAT BIT(4) +#define show_valid_flags(flags) __print_flags(flags, "|", \ + { CXL_GMER_VALID_CHANNEL, "CHANNEL" }, \ + { CXL_GMER_VALID_RANK, "RANK" }, \ + { CXL_GMER_VALID_DEVICE, "DEVICE" }, \ + { CXL_GMER_VALID_COMPONENT, "COMPONENT" }, \ + { CXL_GMER_VALID_COMPONENT_ID_FORMAT, "COMPONENT PLDM FORMAT" } \ +) + +#define CXL_GMER_CME_EV_FLAG_CME_MULTIPLE_MEDIA BIT(0) +#define CXL_GMER_CME_EV_FLAG_THRESHOLD_EXCEEDED BIT(1) +#define show_cme_threshold_ev_flags(flags) __print_flags(flags, "|", \ + { \ + CXL_GMER_CME_EV_FLAG_CME_MULTIPLE_MEDIA, \ + "Corrected Memory Errors in Multiple Media 
Components" \ + }, { \ + CXL_GMER_CME_EV_FLAG_THRESHOLD_EXCEEDED, \ + "Exceeded Programmable Threshold" \ + } \ +) + +#define CXL_GMER_MEM_EVT_SUB_TYPE_NOT_REPORTED 0x00 +#define CXL_GMER_MEM_EVT_SUB_TYPE_INTERNAL_DATAPATH_ERROR 0x01 +#define CXL_GMER_MEM_EVT_SUB_TYPE_MEDIA_LINK_COMMAND_TRAINING_ERROR 0x02 +#define CXL_GMER_MEM_EVT_SUB_TYPE_MEDIA_LINK_CONTROL_TRAINING_ERROR 0x03 +#define CXL_GMER_MEM_EVT_SUB_TYPE_MEDIA_LINK_DATA_TRAINING_ERROR 0x04 +#define CXL_GMER_MEM_EVT_SUB_TYPE_MEDIA_LINK_CRC_ERROR 0x05 +#define show_mem_event_sub_type(sub_type) __print_symbolic(sub_type, \ + { CXL_GMER_MEM_EVT_SUB_TYPE_NOT_REPORTED, "Not Reported" }, \ + { CXL_GMER_MEM_EVT_SUB_TYPE_INTERNAL_DATAPATH_ERROR, "Internal Datapath Error" }, \ + { \ + CXL_GMER_MEM_EVT_SUB_TYPE_MEDIA_LINK_COMMAND_TRAINING_ERROR, \ + "Media Link Command Training Error" \ + }, { \ + CXL_GMER_MEM_EVT_SUB_TYPE_MEDIA_LINK_CONTROL_TRAINING_ERROR, \ + "Media Link Control Training Error" \ + }, { \ + CXL_GMER_MEM_EVT_SUB_TYPE_MEDIA_LINK_DATA_TRAINING_ERROR, \ + "Media Link Data Training Error" \ + }, { \ + CXL_GMER_MEM_EVT_SUB_TYPE_MEDIA_LINK_CRC_ERROR, "Media Link CRC Error" \ + } \ +) + +TRACE_EVENT(cxl_general_media, + + TP_PROTO(const struct cxl_memdev *cxlmd, enum cxl_event_log_type log, + struct cxl_region *cxlr, u64 hpa, u64 hpa_alias0, + struct cxl_event_gen_media *rec), + + TP_ARGS(cxlmd, log, cxlr, hpa, hpa_alias0, rec), + + TP_STRUCT__entry( + CXL_EVT_TP_entry + /* General Media */ + __field(u64, dpa) + __field(u8, descriptor) + __field(u8, type) + __field(u8, transaction_type) + __field(u8, channel) + __field(u32, device) + __array(u8, comp_id, CXL_EVENT_GEN_MED_COMP_ID_SIZE) + /* Following are out of order to pack trace record */ + __field(u64, hpa) + __field(u64, hpa_alias0) + __field_struct(uuid_t, region_uuid) + __field(u16, validity_flags) + __field(u8, rank) + __field(u8, dpa_flags) + __field(u32, cme_count) + __field(u8, sub_type) + __field(u8, cme_threshold_ev_flags) + __string(region_name, cxlr ? 
dev_name(&cxlr->dev) : "") + ), + + TP_fast_assign( + CXL_EVT_TP_fast_assign(cxlmd, log, rec->media_hdr.hdr); + __entry->hdr_uuid = CXL_EVENT_GEN_MEDIA_UUID; + + /* General Media */ + __entry->dpa = le64_to_cpu(rec->media_hdr.phys_addr); + __entry->dpa_flags = __entry->dpa & CXL_DPA_FLAGS_MASK; + /* Mask after flags have been parsed */ + __entry->dpa &= CXL_DPA_MASK; + __entry->descriptor = rec->media_hdr.descriptor; + __entry->type = rec->media_hdr.type; + __entry->sub_type = rec->sub_type; + __entry->transaction_type = rec->media_hdr.transaction_type; + __entry->channel = rec->media_hdr.channel; + __entry->rank = rec->media_hdr.rank; + __entry->device = get_unaligned_le24(rec->device); + memcpy(__entry->comp_id, &rec->component_id, + CXL_EVENT_GEN_MED_COMP_ID_SIZE); + __entry->validity_flags = get_unaligned_le16(&rec->media_hdr.validity_flags); + __entry->hpa = hpa; + __entry->hpa_alias0 = hpa_alias0; + if (cxlr) { + __assign_str(region_name); + uuid_copy(&__entry->region_uuid, &cxlr->params.uuid); + } else { + __assign_str(region_name); + uuid_copy(&__entry->region_uuid, &uuid_null); + } + __entry->cme_threshold_ev_flags = rec->cme_threshold_ev_flags; + if (rec->media_hdr.descriptor & CXL_GMER_EVT_DESC_THRESHOLD_EVENT) + __entry->cme_count = get_unaligned_le24(rec->cme_count); + else + __entry->cme_count = 0; + ), + + CXL_EVT_TP_printk("dpa=%llx dpa_flags='%s' " \ + "descriptor='%s' type='%s' sub_type='%s' " \ + "transaction_type='%s' channel=%u rank=%u " \ + "device=%x validity_flags='%s' " \ + "comp_id=%s comp_id_pldm_valid_flags='%s' " \ + "pldm_entity_id=%s pldm_resource_id=%s " \ + "hpa=%llx hpa_alias0=%llx region=%s region_uuid=%pUb " \ + "cme_threshold_ev_flags='%s' cme_count=%u", + __entry->dpa, show_dpa_flags(__entry->dpa_flags), + show_event_desc_flags(__entry->descriptor), + show_gmer_mem_event_type(__entry->type), + show_mem_event_sub_type(__entry->sub_type), + show_trans_type(__entry->transaction_type), + __entry->channel, __entry->rank, __entry->device, + show_valid_flags(__entry->validity_flags), + __print_hex(__entry->comp_id, CXL_EVENT_GEN_MED_COMP_ID_SIZE), + show_comp_id_pldm_flags(__entry->comp_id[0]), + show_pldm_entity_id(__entry->validity_flags, CXL_GMER_VALID_COMPONENT, + CXL_GMER_VALID_COMPONENT_ID_FORMAT, __entry->comp_id), + show_pldm_resource_id(__entry->validity_flags, CXL_GMER_VALID_COMPONENT, + CXL_GMER_VALID_COMPONENT_ID_FORMAT, __entry->comp_id), + __entry->hpa, __entry->hpa_alias0, __get_str(region_name), &__entry->region_uuid, + show_cme_threshold_ev_flags(__entry->cme_threshold_ev_flags), __entry->cme_count + ) +); + +/* + * DRAM Event Record - DER + * + * CXL rev 3.1 section 8.2.9.2.1.2; Table 8-46 + */ +/* + * DRAM Event Record defines many fields the same as the General Media Event + * Record. Reuse those definitions as appropriate. 
+ */ +#define CXL_DER_MEM_EVT_TYPE_ECC_ERROR 0x00 +#define CXL_DER_MEM_EVT_TYPE_SCRUB_MEDIA_ECC_ERROR 0x01 +#define CXL_DER_MEM_EVT_TYPE_INV_ADDR 0x02 +#define CXL_DER_MEM_EVT_TYPE_DATA_PATH_ERROR 0x03 +#define CXL_DER_MEM_EVT_TYPE_TE_STATE_VIOLATION 0x04 +#define CXL_DER_MEM_EVT_TYPE_AP_CME_COUNTER_EXPIRE 0x05 +#define CXL_DER_MEM_EVT_TYPE_CKID_VIOLATION 0x06 +#define show_dram_mem_event_type(type) __print_symbolic(type, \ + { CXL_DER_MEM_EVT_TYPE_ECC_ERROR, "ECC Error" }, \ + { CXL_DER_MEM_EVT_TYPE_SCRUB_MEDIA_ECC_ERROR, "Scrub Media ECC Error" }, \ + { CXL_DER_MEM_EVT_TYPE_INV_ADDR, "Invalid Address" }, \ + { CXL_DER_MEM_EVT_TYPE_DATA_PATH_ERROR, "Data Path Error" }, \ + { CXL_DER_MEM_EVT_TYPE_TE_STATE_VIOLATION, "TE State Violation" }, \ + { CXL_DER_MEM_EVT_TYPE_AP_CME_COUNTER_EXPIRE, "Adv Prog CME Counter Expiration" }, \ + { CXL_DER_MEM_EVT_TYPE_CKID_VIOLATION, "CKID Violation" } \ +) + +#define CXL_DER_VALID_CHANNEL BIT(0) +#define CXL_DER_VALID_RANK BIT(1) +#define CXL_DER_VALID_NIBBLE BIT(2) +#define CXL_DER_VALID_BANK_GROUP BIT(3) +#define CXL_DER_VALID_BANK BIT(4) +#define CXL_DER_VALID_ROW BIT(5) +#define CXL_DER_VALID_COLUMN BIT(6) +#define CXL_DER_VALID_CORRECTION_MASK BIT(7) +#define CXL_DER_VALID_COMPONENT BIT(8) +#define CXL_DER_VALID_COMPONENT_ID_FORMAT BIT(9) +#define CXL_DER_VALID_SUB_CHANNEL BIT(10) +#define show_dram_valid_flags(flags) __print_flags(flags, "|", \ + { CXL_DER_VALID_CHANNEL, "CHANNEL" }, \ + { CXL_DER_VALID_RANK, "RANK" }, \ + { CXL_DER_VALID_NIBBLE, "NIBBLE" }, \ + { CXL_DER_VALID_BANK_GROUP, "BANK GROUP" }, \ + { CXL_DER_VALID_BANK, "BANK" }, \ + { CXL_DER_VALID_ROW, "ROW" }, \ + { CXL_DER_VALID_COLUMN, "COLUMN" }, \ + { CXL_DER_VALID_CORRECTION_MASK, "CORRECTION MASK" }, \ + { CXL_DER_VALID_COMPONENT, "COMPONENT" }, \ + { CXL_DER_VALID_COMPONENT_ID_FORMAT, "COMPONENT PLDM FORMAT" }, \ + { CXL_DER_VALID_SUB_CHANNEL, "SUB CHANNEL" } \ +) + +TRACE_EVENT(cxl_dram, + + TP_PROTO(const struct cxl_memdev *cxlmd, enum cxl_event_log_type log, + struct cxl_region *cxlr, u64 hpa, u64 hpa_alias0, + struct cxl_event_dram *rec), + + TP_ARGS(cxlmd, log, cxlr, hpa, hpa_alias0, rec), + + TP_STRUCT__entry( + CXL_EVT_TP_entry + /* DRAM */ + __field(u64, dpa) + __field(u8, descriptor) + __field(u8, type) + __field(u8, transaction_type) + __field(u8, channel) + __field(u16, validity_flags) + __field(u16, column) /* Out of order to pack trace record */ + __field(u32, nibble_mask) + __field(u32, row) + __array(u8, cor_mask, CXL_EVENT_DER_CORRECTION_MASK_SIZE) + __field(u64, hpa) + __field(u64, hpa_alias0) + __field_struct(uuid_t, region_uuid) + __field(u8, rank) /* Out of order to pack trace record */ + __field(u8, bank_group) /* Out of order to pack trace record */ + __field(u8, bank) /* Out of order to pack trace record */ + __field(u8, dpa_flags) /* Out of order to pack trace record */ + /* Following are out of order to pack trace record */ + __array(u8, comp_id, CXL_EVENT_GEN_MED_COMP_ID_SIZE) + __field(u32, cvme_count) + __field(u8, sub_type) + __field(u8, sub_channel) + __field(u8, cme_threshold_ev_flags) + __string(region_name, cxlr ? 
dev_name(&cxlr->dev) : "") + ), + + TP_fast_assign( + CXL_EVT_TP_fast_assign(cxlmd, log, rec->media_hdr.hdr); + __entry->hdr_uuid = CXL_EVENT_DRAM_UUID; + + /* DRAM */ + __entry->dpa = le64_to_cpu(rec->media_hdr.phys_addr); + __entry->dpa_flags = __entry->dpa & CXL_DPA_FLAGS_MASK; + __entry->dpa &= CXL_DPA_MASK; + __entry->descriptor = rec->media_hdr.descriptor; + __entry->type = rec->media_hdr.type; + __entry->sub_type = rec->sub_type; + __entry->transaction_type = rec->media_hdr.transaction_type; + __entry->validity_flags = get_unaligned_le16(rec->media_hdr.validity_flags); + __entry->channel = rec->media_hdr.channel; + __entry->rank = rec->media_hdr.rank; + __entry->nibble_mask = get_unaligned_le24(rec->nibble_mask); + __entry->bank_group = rec->bank_group; + __entry->bank = rec->bank; + __entry->row = get_unaligned_le24(rec->row); + __entry->column = get_unaligned_le16(rec->column); + memcpy(__entry->cor_mask, &rec->correction_mask, + CXL_EVENT_DER_CORRECTION_MASK_SIZE); + __entry->hpa = hpa; + __entry->hpa_alias0 = hpa_alias0; + if (cxlr) { + __assign_str(region_name); + uuid_copy(&__entry->region_uuid, &cxlr->params.uuid); + } else { + __assign_str(region_name); + uuid_copy(&__entry->region_uuid, &uuid_null); + } + memcpy(__entry->comp_id, &rec->component_id, + CXL_EVENT_GEN_MED_COMP_ID_SIZE); + __entry->sub_channel = rec->sub_channel; + __entry->cme_threshold_ev_flags = rec->cme_threshold_ev_flags; + if (rec->media_hdr.descriptor & CXL_GMER_EVT_DESC_THRESHOLD_EVENT) + __entry->cvme_count = get_unaligned_le24(rec->cvme_count); + else + __entry->cvme_count = 0; + ), + + CXL_EVT_TP_printk("dpa=%llx dpa_flags='%s' descriptor='%s' type='%s' sub_type='%s' " \ + "transaction_type='%s' channel=%u rank=%u nibble_mask=%x " \ + "bank_group=%u bank=%u row=%u column=%u cor_mask=%s " \ + "validity_flags='%s' " \ + "comp_id=%s comp_id_pldm_valid_flags='%s' " \ + "pldm_entity_id=%s pldm_resource_id=%s " \ + "hpa=%llx hpa_alias0=%llx region=%s region_uuid=%pUb " \ + "sub_channel=%u cme_threshold_ev_flags='%s' cvme_count=%u", + __entry->dpa, show_dpa_flags(__entry->dpa_flags), + show_event_desc_flags(__entry->descriptor), + show_dram_mem_event_type(__entry->type), + show_mem_event_sub_type(__entry->sub_type), + show_trans_type(__entry->transaction_type), + __entry->channel, __entry->rank, __entry->nibble_mask, + __entry->bank_group, __entry->bank, + __entry->row, __entry->column, + __print_hex(__entry->cor_mask, CXL_EVENT_DER_CORRECTION_MASK_SIZE), + show_dram_valid_flags(__entry->validity_flags), + __print_hex(__entry->comp_id, CXL_EVENT_GEN_MED_COMP_ID_SIZE), + show_comp_id_pldm_flags(__entry->comp_id[0]), + show_pldm_entity_id(__entry->validity_flags, CXL_DER_VALID_COMPONENT, + CXL_DER_VALID_COMPONENT_ID_FORMAT, __entry->comp_id), + show_pldm_resource_id(__entry->validity_flags, CXL_DER_VALID_COMPONENT, + CXL_DER_VALID_COMPONENT_ID_FORMAT, __entry->comp_id), + __entry->hpa, __entry->hpa_alias0, __get_str(region_name), &__entry->region_uuid, + __entry->sub_channel, show_cme_threshold_ev_flags(__entry->cme_threshold_ev_flags), + __entry->cvme_count + ) +); + +/* + * Memory Module Event Record - MMER + * + * CXL res 3.1 section 8.2.9.2.1.3; Table 8-47 + */ +#define CXL_MMER_HEALTH_STATUS_CHANGE 0x00 +#define CXL_MMER_MEDIA_STATUS_CHANGE 0x01 +#define CXL_MMER_LIFE_USED_CHANGE 0x02 +#define CXL_MMER_TEMP_CHANGE 0x03 +#define CXL_MMER_DATA_PATH_ERROR 0x04 +#define CXL_MMER_LSA_ERROR 0x05 +#define CXL_MMER_UNRECOV_SIDEBAND_BUS_ERROR 0x06 +#define CXL_MMER_MEMORY_MEDIA_FRU_ERROR 0x07 +#define 
CXL_MMER_POWER_MANAGEMENT_FAULT 0x08 +#define show_dev_evt_type(type) __print_symbolic(type, \ + { CXL_MMER_HEALTH_STATUS_CHANGE, "Health Status Change" }, \ + { CXL_MMER_MEDIA_STATUS_CHANGE, "Media Status Change" }, \ + { CXL_MMER_LIFE_USED_CHANGE, "Life Used Change" }, \ + { CXL_MMER_TEMP_CHANGE, "Temperature Change" }, \ + { CXL_MMER_DATA_PATH_ERROR, "Data Path Error" }, \ + { CXL_MMER_LSA_ERROR, "LSA Error" }, \ + { CXL_MMER_UNRECOV_SIDEBAND_BUS_ERROR, "Unrecoverable Internal Sideband Bus Error" }, \ + { CXL_MMER_MEMORY_MEDIA_FRU_ERROR, "Memory Media FRU Error" }, \ + { CXL_MMER_POWER_MANAGEMENT_FAULT, "Power Management Fault" } \ +) + +/* + * Device Health Information - DHI + * + * CXL res 3.1 section 8.2.9.9.3.1; Table 8-133 + */ +#define CXL_DHI_HS_MAINTENANCE_NEEDED BIT(0) +#define CXL_DHI_HS_PERFORMANCE_DEGRADED BIT(1) +#define CXL_DHI_HS_HW_REPLACEMENT_NEEDED BIT(2) +#define CXL_DHI_HS_MEM_CAPACITY_DEGRADED BIT(3) +#define show_health_status_flags(flags) __print_flags(flags, "|", \ + { CXL_DHI_HS_MAINTENANCE_NEEDED, "MAINTENANCE_NEEDED" }, \ + { CXL_DHI_HS_PERFORMANCE_DEGRADED, "PERFORMANCE_DEGRADED" }, \ + { CXL_DHI_HS_HW_REPLACEMENT_NEEDED, "REPLACEMENT_NEEDED" }, \ + { CXL_DHI_HS_MEM_CAPACITY_DEGRADED, "MEM_CAPACITY_DEGRADED" } \ +) + +#define CXL_DHI_MS_NORMAL 0x00 +#define CXL_DHI_MS_NOT_READY 0x01 +#define CXL_DHI_MS_WRITE_PERSISTENCY_LOST 0x02 +#define CXL_DHI_MS_ALL_DATA_LOST 0x03 +#define CXL_DHI_MS_WRITE_PERSISTENCY_LOSS_EVENT_POWER_LOSS 0x04 +#define CXL_DHI_MS_WRITE_PERSISTENCY_LOSS_EVENT_SHUTDOWN 0x05 +#define CXL_DHI_MS_WRITE_PERSISTENCY_LOSS_IMMINENT 0x06 +#define CXL_DHI_MS_WRITE_ALL_DATA_LOSS_EVENT_POWER_LOSS 0x07 +#define CXL_DHI_MS_WRITE_ALL_DATA_LOSS_EVENT_SHUTDOWN 0x08 +#define CXL_DHI_MS_WRITE_ALL_DATA_LOSS_IMMINENT 0x09 +#define show_media_status(ms) __print_symbolic(ms, \ + { CXL_DHI_MS_NORMAL, \ + "Normal" }, \ + { CXL_DHI_MS_NOT_READY, \ + "Not Ready" }, \ + { CXL_DHI_MS_WRITE_PERSISTENCY_LOST, \ + "Write Persistency Lost" }, \ + { CXL_DHI_MS_ALL_DATA_LOST, \ + "All Data Lost" }, \ + { CXL_DHI_MS_WRITE_PERSISTENCY_LOSS_EVENT_POWER_LOSS, \ + "Write Persistency Loss in the Event of Power Loss" }, \ + { CXL_DHI_MS_WRITE_PERSISTENCY_LOSS_EVENT_SHUTDOWN, \ + "Write Persistency Loss in Event of Shutdown" }, \ + { CXL_DHI_MS_WRITE_PERSISTENCY_LOSS_IMMINENT, \ + "Write Persistency Loss Imminent" }, \ + { CXL_DHI_MS_WRITE_ALL_DATA_LOSS_EVENT_POWER_LOSS, \ + "All Data Loss in Event of Power Loss" }, \ + { CXL_DHI_MS_WRITE_ALL_DATA_LOSS_EVENT_SHUTDOWN, \ + "All Data loss in the Event of Shutdown" }, \ + { CXL_DHI_MS_WRITE_ALL_DATA_LOSS_IMMINENT, \ + "All Data Loss Imminent" } \ +) + +#define CXL_DHI_AS_NORMAL 0x0 +#define CXL_DHI_AS_WARNING 0x1 +#define CXL_DHI_AS_CRITICAL 0x2 +#define show_two_bit_status(as) __print_symbolic(as, \ + { CXL_DHI_AS_NORMAL, "Normal" }, \ + { CXL_DHI_AS_WARNING, "Warning" }, \ + { CXL_DHI_AS_CRITICAL, "Critical" } \ +) +#define show_one_bit_status(as) __print_symbolic(as, \ + { CXL_DHI_AS_NORMAL, "Normal" }, \ + { CXL_DHI_AS_WARNING, "Warning" } \ +) + +#define CXL_DHI_AS_LIFE_USED(as) (as & 0x3) +#define CXL_DHI_AS_DEV_TEMP(as) ((as & 0xC) >> 2) +#define CXL_DHI_AS_COR_VOL_ERR_CNT(as) ((as & 0x10) >> 4) +#define CXL_DHI_AS_COR_PER_ERR_CNT(as) ((as & 0x20) >> 5) + +#define CXL_MMER_VALID_COMPONENT BIT(0) +#define CXL_MMER_VALID_COMPONENT_ID_FORMAT BIT(1) +#define show_mem_module_valid_flags(flags) __print_flags(flags, "|", \ + { CXL_MMER_VALID_COMPONENT, "COMPONENT" }, \ + { CXL_MMER_VALID_COMPONENT_ID_FORMAT, "COMPONENT PLDM FORMAT" } 
\ +) +#define CXL_MMER_DEV_EVT_SUB_TYPE_NOT_REPORTED 0x00 +#define CXL_MMER_DEV_EVT_SUB_TYPE_INVALID_CONFIG_DATA 0x01 +#define CXL_MMER_DEV_EVT_SUB_TYPE_UNSUPP_CONFIG_DATA 0x02 +#define CXL_MMER_DEV_EVT_SUB_TYPE_UNSUPP_MEM_MEDIA_FRU 0x03 +#define show_dev_event_sub_type(sub_type) __print_symbolic(sub_type, \ + { CXL_MMER_DEV_EVT_SUB_TYPE_NOT_REPORTED, "Not Reported" }, \ + { CXL_MMER_DEV_EVT_SUB_TYPE_INVALID_CONFIG_DATA, "Invalid Config Data" }, \ + { CXL_MMER_DEV_EVT_SUB_TYPE_UNSUPP_CONFIG_DATA, "Unsupported Config Data" }, \ + { \ + CXL_MMER_DEV_EVT_SUB_TYPE_UNSUPP_MEM_MEDIA_FRU, \ + "Unsupported Memory Media FRU" \ + } \ +) + +TRACE_EVENT(cxl_memory_module, + + TP_PROTO(const struct cxl_memdev *cxlmd, enum cxl_event_log_type log, + struct cxl_event_mem_module *rec), + + TP_ARGS(cxlmd, log, rec), + + TP_STRUCT__entry( + CXL_EVT_TP_entry + + /* Memory Module Event */ + __field(u8, event_type) + + /* Device Health Info */ + __field(u8, health_status) + __field(u8, media_status) + __field(u8, life_used) + __field(u32, dirty_shutdown_cnt) + __field(u32, cor_vol_err_cnt) + __field(u32, cor_per_err_cnt) + __field(s16, device_temp) + __field(u8, add_status) + __field(u8, event_sub_type) + __array(u8, comp_id, CXL_EVENT_GEN_MED_COMP_ID_SIZE) + __field(u16, validity_flags) + ), + + TP_fast_assign( + CXL_EVT_TP_fast_assign(cxlmd, log, rec->hdr); + __entry->hdr_uuid = CXL_EVENT_MEM_MODULE_UUID; + + /* Memory Module Event */ + __entry->event_type = rec->event_type; + __entry->event_sub_type = rec->event_sub_type; + + /* Device Health Info */ + __entry->health_status = rec->info.health_status; + __entry->media_status = rec->info.media_status; + __entry->life_used = rec->info.life_used; + __entry->dirty_shutdown_cnt = get_unaligned_le32(rec->info.dirty_shutdown_cnt); + __entry->cor_vol_err_cnt = get_unaligned_le32(rec->info.cor_vol_err_cnt); + __entry->cor_per_err_cnt = get_unaligned_le32(rec->info.cor_per_err_cnt); + __entry->device_temp = get_unaligned_le16(rec->info.device_temp); + __entry->add_status = rec->info.add_status; + __entry->validity_flags = get_unaligned_le16(rec->validity_flags); + memcpy(__entry->comp_id, &rec->component_id, + CXL_EVENT_GEN_MED_COMP_ID_SIZE); + ), + + CXL_EVT_TP_printk("event_type='%s' event_sub_type='%s' health_status='%s' " \ + "media_status='%s' as_life_used=%s as_dev_temp=%s as_cor_vol_err_cnt=%s " \ + "as_cor_per_err_cnt=%s life_used=%u device_temp=%d " \ + "dirty_shutdown_cnt=%u cor_vol_err_cnt=%u cor_per_err_cnt=%u " \ + "validity_flags='%s' " \ + "comp_id=%s comp_id_pldm_valid_flags='%s' " \ + "pldm_entity_id=%s pldm_resource_id=%s", + show_dev_evt_type(__entry->event_type), + show_dev_event_sub_type(__entry->event_sub_type), + show_health_status_flags(__entry->health_status), + show_media_status(__entry->media_status), + show_two_bit_status(CXL_DHI_AS_LIFE_USED(__entry->add_status)), + show_two_bit_status(CXL_DHI_AS_DEV_TEMP(__entry->add_status)), + show_one_bit_status(CXL_DHI_AS_COR_VOL_ERR_CNT(__entry->add_status)), + show_one_bit_status(CXL_DHI_AS_COR_PER_ERR_CNT(__entry->add_status)), + __entry->life_used, __entry->device_temp, + __entry->dirty_shutdown_cnt, __entry->cor_vol_err_cnt, + __entry->cor_per_err_cnt, + show_mem_module_valid_flags(__entry->validity_flags), + __print_hex(__entry->comp_id, CXL_EVENT_GEN_MED_COMP_ID_SIZE), + show_comp_id_pldm_flags(__entry->comp_id[0]), + show_pldm_entity_id(__entry->validity_flags, CXL_MMER_VALID_COMPONENT, + CXL_MMER_VALID_COMPONENT_ID_FORMAT, __entry->comp_id), + show_pldm_resource_id(__entry->validity_flags, 
CXL_MMER_VALID_COMPONENT, + CXL_MMER_VALID_COMPONENT_ID_FORMAT, __entry->comp_id) + ) +); + +/* + * Memory Sparing Event Record - MSER + * + * CXL rev 3.2 section 8.2.10.2.1.4; Table 8-60 + */ +#define CXL_MSER_QUERY_RESOURCE_FLAG BIT(0) +#define CXL_MSER_HARD_SPARING_FLAG BIT(1) +#define CXL_MSER_DEV_INITED_FLAG BIT(2) +#define show_mem_sparing_flags(flags) __print_flags(flags, "|", \ + { CXL_MSER_QUERY_RESOURCE_FLAG, "Query Resources" }, \ + { CXL_MSER_HARD_SPARING_FLAG, "Hard Sparing" }, \ + { CXL_MSER_DEV_INITED_FLAG, "Device Initiated Sparing" } \ +) + +#define CXL_MSER_VALID_CHANNEL BIT(0) +#define CXL_MSER_VALID_RANK BIT(1) +#define CXL_MSER_VALID_NIBBLE BIT(2) +#define CXL_MSER_VALID_BANK_GROUP BIT(3) +#define CXL_MSER_VALID_BANK BIT(4) +#define CXL_MSER_VALID_ROW BIT(5) +#define CXL_MSER_VALID_COLUMN BIT(6) +#define CXL_MSER_VALID_COMPONENT_ID BIT(7) +#define CXL_MSER_VALID_COMPONENT_ID_FORMAT BIT(8) +#define CXL_MSER_VALID_SUB_CHANNEL BIT(9) +#define show_mem_sparing_valid_flags(flags) __print_flags(flags, "|", \ + { CXL_MSER_VALID_CHANNEL, "CHANNEL" }, \ + { CXL_MSER_VALID_RANK, "RANK" }, \ + { CXL_MSER_VALID_NIBBLE, "NIBBLE" }, \ + { CXL_MSER_VALID_BANK_GROUP, "BANK GROUP" }, \ + { CXL_MSER_VALID_BANK, "BANK" }, \ + { CXL_MSER_VALID_ROW, "ROW" }, \ + { CXL_MSER_VALID_COLUMN, "COLUMN" }, \ + { CXL_MSER_VALID_COMPONENT_ID, "COMPONENT ID" }, \ + { CXL_MSER_VALID_COMPONENT_ID_FORMAT, "COMPONENT ID PLDM FORMAT" }, \ + { CXL_MSER_VALID_SUB_CHANNEL, "SUB CHANNEL" } \ +) + +TRACE_EVENT(cxl_memory_sparing, + + TP_PROTO(const struct cxl_memdev *cxlmd, enum cxl_event_log_type log, + struct cxl_event_mem_sparing *rec), + + TP_ARGS(cxlmd, log, rec), + + TP_STRUCT__entry( + CXL_EVT_TP_entry + + /* Memory Sparing Event */ + __field(u8, flags) + __field(u8, result) + __field(u16, validity_flags) + __field(u16, res_avail) + __field(u8, channel) + __field(u8, rank) + __field(u32, nibble_mask) + __field(u8, bank_group) + __field(u8, bank) + __field(u32, row) + __field(u16, column) + __field(u8, sub_channel) + __array(u8, comp_id, CXL_EVENT_GEN_MED_COMP_ID_SIZE) + ), + + TP_fast_assign( + CXL_EVT_TP_fast_assign(cxlmd, log, rec->hdr); + __entry->hdr_uuid = CXL_EVENT_MEM_SPARING_UUID; + + /* Memory Sparing Event */ + __entry->flags = rec->flags; + __entry->result = rec->result; + __entry->validity_flags = le16_to_cpu(rec->validity_flags); + __entry->res_avail = le16_to_cpu(rec->res_avail); + __entry->channel = rec->channel; + __entry->rank = rec->rank; + __entry->nibble_mask = get_unaligned_le24(rec->nibble_mask); + __entry->bank_group = rec->bank_group; + __entry->bank = rec->bank; + __entry->row = get_unaligned_le24(rec->row); + __entry->column = le16_to_cpu(rec->column); + __entry->sub_channel = rec->sub_channel; + memcpy(__entry->comp_id, &rec->component_id, + CXL_EVENT_GEN_MED_COMP_ID_SIZE); + ), + + CXL_EVT_TP_printk("flags='%s' result=%u validity_flags='%s' " \ + "spare resource avail=%u channel=%u rank=%u " \ + "nibble_mask=%x bank_group=%u bank=%u " \ + "row=%u column=%u sub_channel=%u " \ + "comp_id=%s comp_id_pldm_valid_flags='%s' " \ + "pldm_entity_id=%s pldm_resource_id=%s", + show_mem_sparing_flags(__entry->flags), + __entry->result, + show_mem_sparing_valid_flags(__entry->validity_flags), + __entry->res_avail, __entry->channel, __entry->rank, + __entry->nibble_mask, __entry->bank_group, __entry->bank, + __entry->row, __entry->column, __entry->sub_channel, + __print_hex(__entry->comp_id, CXL_EVENT_GEN_MED_COMP_ID_SIZE), + show_comp_id_pldm_flags(__entry->comp_id[0]), + 
show_pldm_entity_id(__entry->validity_flags, CXL_MSER_VALID_COMPONENT_ID, + CXL_MSER_VALID_COMPONENT_ID_FORMAT, __entry->comp_id), + show_pldm_resource_id(__entry->validity_flags, CXL_MSER_VALID_COMPONENT_ID, + CXL_MSER_VALID_COMPONENT_ID_FORMAT, __entry->comp_id) + ) +); + +#define show_poison_trace_type(type) \ + __print_symbolic(type, \ + { CXL_POISON_TRACE_LIST, "List" }, \ + { CXL_POISON_TRACE_INJECT, "Inject" }, \ + { CXL_POISON_TRACE_CLEAR, "Clear" }) + +#define __show_poison_source(source) \ + __print_symbolic(source, \ + { CXL_POISON_SOURCE_UNKNOWN, "Unknown" }, \ + { CXL_POISON_SOURCE_EXTERNAL, "External" }, \ + { CXL_POISON_SOURCE_INTERNAL, "Internal" }, \ + { CXL_POISON_SOURCE_INJECTED, "Injected" }, \ + { CXL_POISON_SOURCE_VENDOR, "Vendor" }) + +#define show_poison_source(source) \ + (((source > CXL_POISON_SOURCE_INJECTED) && \ + (source != CXL_POISON_SOURCE_VENDOR)) ? "Reserved" \ + : __show_poison_source(source)) + +#define show_poison_flags(flags) \ + __print_flags(flags, "|", \ + { CXL_POISON_FLAG_MORE, "More" }, \ + { CXL_POISON_FLAG_OVERFLOW, "Overflow" }, \ + { CXL_POISON_FLAG_SCANNING, "Scanning" }) + +#define __cxl_poison_addr(record) \ + (le64_to_cpu(record->address)) +#define cxl_poison_record_dpa(record) \ + (__cxl_poison_addr(record) & CXL_POISON_START_MASK) +#define cxl_poison_record_source(record) \ + (__cxl_poison_addr(record) & CXL_POISON_SOURCE_MASK) +#define cxl_poison_record_dpa_length(record) \ + (le32_to_cpu(record->length) * CXL_POISON_LEN_MULT) +#define cxl_poison_overflow(flags, time) \ + (flags & CXL_POISON_FLAG_OVERFLOW ? le64_to_cpu(time) : 0) + +TRACE_EVENT(cxl_poison, + + TP_PROTO(struct cxl_memdev *cxlmd, struct cxl_region *cxlr, + const struct cxl_poison_record *record, u8 flags, + __le64 overflow_ts, enum cxl_poison_trace_type trace_type), + + TP_ARGS(cxlmd, cxlr, record, flags, overflow_ts, trace_type), + + TP_STRUCT__entry( + __string(memdev, dev_name(&cxlmd->dev)) + __string(host, dev_name(cxlmd->dev.parent)) + __field(u64, serial) + __field(u8, trace_type) + __string(region, cxlr ? 
dev_name(&cxlr->dev) : "") + __field(u64, overflow_ts) + __field(u64, hpa) + __field(u64, hpa_alias0) + __field(u64, dpa) + __field(u32, dpa_length) + __array(char, uuid, 16) + __field(u8, source) + __field(u8, flags) + ), + + TP_fast_assign( + __assign_str(memdev); + __assign_str(host); + __entry->serial = cxlmd->cxlds->serial; + __entry->overflow_ts = cxl_poison_overflow(flags, overflow_ts); + __entry->dpa = cxl_poison_record_dpa(record); + __entry->dpa_length = cxl_poison_record_dpa_length(record); + __entry->source = cxl_poison_record_source(record); + __entry->trace_type = trace_type; + __entry->flags = flags; + if (cxlr) { + __assign_str(region); + memcpy(__entry->uuid, &cxlr->params.uuid, 16); + __entry->hpa = cxl_dpa_to_hpa(cxlr, cxlmd, + __entry->dpa); + if (__entry->hpa != ULLONG_MAX && cxlr->params.cache_size) + __entry->hpa_alias0 = __entry->hpa - + cxlr->params.cache_size; + else + __entry->hpa_alias0 = ULLONG_MAX; + } else { + __assign_str(region); + memset(__entry->uuid, 0, 16); + __entry->hpa = ULLONG_MAX; + __entry->hpa_alias0 = ULLONG_MAX; + } + ), + + TP_printk("memdev=%s host=%s serial=%lld trace_type=%s region=%s " \ + "region_uuid=%pU hpa=0x%llx hpa_alias0=0x%llx dpa=0x%llx " \ + "dpa_length=0x%x source=%s flags=%s overflow_time=%llu", + __get_str(memdev), + __get_str(host), + __entry->serial, + show_poison_trace_type(__entry->trace_type), + __get_str(region), + __entry->uuid, + __entry->hpa, + __entry->hpa_alias0, + __entry->dpa, + __entry->dpa_length, + show_poison_source(__entry->source), + show_poison_flags(__entry->flags), + __entry->overflow_ts + ) +); + +#endif /* _CXL_EVENTS_H */ + +#define TRACE_INCLUDE_FILE trace +#include <trace/define_trace.h>
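Illustrative only: each TRACE_EVENT() above expands into a trace_<name>() call that driver code can emit. For the correctable AER event, whose TP_PROTO takes the memdev and the raw Correctable Error Status value, a RAS handler that has already read that register might report it as in this sketch (the wrapper name is hypothetical; only the generated tracepoint call follows from the definition above):

static void example_report_cor_error(const struct cxl_memdev *cxlmd, u32 status)
{
	/* Emit the correctable-error tracepoint only when a status bit is set */
	if (status)
		trace_cxl_aer_correctable_error(cxlmd, status);
}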
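Illustrative only: the General Media and DRAM records pack flag bits into the low bits of the Physical Address field, which is why the TP_fast_assign() blocks above capture the flags before masking the address. A minimal sketch of that split, using the masks defined above (the helper name is hypothetical):

static inline u64 example_split_dpa(u64 phys_addr, u8 *dpa_flags)
{
	/* Bits 1:0 carry the VOLATILE / NOT_REPAIRABLE flags */
	*dpa_flags = phys_addr & CXL_DPA_FLAGS_MASK;
	/* Bits 63:6 carry the device physical address itself */
	return phys_addr & CXL_DPA_MASK;
}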
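Illustrative only: the component-id bytes logged by the General Media, DRAM, memory module, and memory sparing events follow the CXL 3.1 PLDM component-id format that show_pldm_entity_id()/show_pldm_resource_id() print: byte 0 holds the validity flags, bytes 1-6 the PLDM entity id, and bytes 7-10 the resource id. A decoder sketch under that assumption (struct and function names are hypothetical):

struct example_pldm_component_id {
	bool entity_valid;
	bool resource_valid;
	u8 entity_id[6];
	u8 resource_id[4];
};

static void example_decode_comp_id(const u8 *comp_id,
				   struct example_pldm_component_id *out)
{
	/* Byte 0: PLDM validity flags, as tested by the show_* helpers above */
	out->entity_valid = comp_id[0] & CXL_PLDM_COMPONENT_ID_ENTITY_VALID;
	out->resource_valid = comp_id[0] & CXL_PLDM_COMPONENT_ID_RES_VALID;
	/* Bytes 1-6: PLDM entity id; bytes 7-10: resource id */
	memcpy(out->entity_id, &comp_id[1], sizeof(out->entity_id));
	memcpy(out->resource_id, &comp_id[7], sizeof(out->resource_id));
}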
