summaryrefslogtreecommitdiff
path: root/drivers/cxl/core/region.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/cxl/core/region.c')
-rw-r--r--drivers/cxl/core/region.c2904
1 files changed, 2477 insertions, 427 deletions
diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index f9ae5ad284ff..ae899f68551f 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -2,11 +2,16 @@
/* Copyright(c) 2022 Intel Corporation. All rights reserved. */
#include <linux/memregion.h>
#include <linux/genalloc.h>
+#include <linux/debugfs.h>
#include <linux/device.h>
#include <linux/module.h>
+#include <linux/memory.h>
#include <linux/slab.h>
#include <linux/uuid.h>
+#include <linux/sort.h>
#include <linux/idr.h>
+#include <linux/memory-tiers.h>
+#include <linux/string_choices.h>
#include <cxlmem.h>
#include <cxl.h>
#include "core.h"
@@ -28,13 +33,115 @@
*/
/*
- * All changes to the interleave configuration occur with this lock held
- * for write.
+ * nodemask that sets per node when the access_coordinates for the node has
+ * been updated by the CXL memory hotplug notifier.
*/
-static DECLARE_RWSEM(cxl_region_rwsem);
+static nodemask_t nodemask_region_seen = NODE_MASK_NONE;
static struct cxl_region *to_cxl_region(struct device *dev);
+#define __ACCESS_ATTR_RO(_level, _name) { \
+ .attr = { .name = __stringify(_name), .mode = 0444 }, \
+ .show = _name##_access##_level##_show, \
+}
+
+#define ACCESS_DEVICE_ATTR_RO(level, name) \
+ struct device_attribute dev_attr_access##level##_##name = __ACCESS_ATTR_RO(level, name)
+
+#define ACCESS_ATTR_RO(level, attrib) \
+static ssize_t attrib##_access##level##_show(struct device *dev, \
+ struct device_attribute *attr, \
+ char *buf) \
+{ \
+ struct cxl_region *cxlr = to_cxl_region(dev); \
+ \
+ if (cxlr->coord[level].attrib == 0) \
+ return -ENOENT; \
+ \
+ return sysfs_emit(buf, "%u\n", cxlr->coord[level].attrib); \
+} \
+static ACCESS_DEVICE_ATTR_RO(level, attrib)
+
+ACCESS_ATTR_RO(0, read_bandwidth);
+ACCESS_ATTR_RO(0, read_latency);
+ACCESS_ATTR_RO(0, write_bandwidth);
+ACCESS_ATTR_RO(0, write_latency);
+
+#define ACCESS_ATTR_DECLARE(level, attrib) \
+ (&dev_attr_access##level##_##attrib.attr)
+
+static struct attribute *access0_coordinate_attrs[] = {
+ ACCESS_ATTR_DECLARE(0, read_bandwidth),
+ ACCESS_ATTR_DECLARE(0, write_bandwidth),
+ ACCESS_ATTR_DECLARE(0, read_latency),
+ ACCESS_ATTR_DECLARE(0, write_latency),
+ NULL
+};
+
+ACCESS_ATTR_RO(1, read_bandwidth);
+ACCESS_ATTR_RO(1, read_latency);
+ACCESS_ATTR_RO(1, write_bandwidth);
+ACCESS_ATTR_RO(1, write_latency);
+
+static struct attribute *access1_coordinate_attrs[] = {
+ ACCESS_ATTR_DECLARE(1, read_bandwidth),
+ ACCESS_ATTR_DECLARE(1, write_bandwidth),
+ ACCESS_ATTR_DECLARE(1, read_latency),
+ ACCESS_ATTR_DECLARE(1, write_latency),
+ NULL
+};
+
+#define ACCESS_VISIBLE(level) \
+static umode_t cxl_region_access##level##_coordinate_visible( \
+ struct kobject *kobj, struct attribute *a, int n) \
+{ \
+ struct device *dev = kobj_to_dev(kobj); \
+ struct cxl_region *cxlr = to_cxl_region(dev); \
+ \
+ if (a == &dev_attr_access##level##_read_latency.attr && \
+ cxlr->coord[level].read_latency == 0) \
+ return 0; \
+ \
+ if (a == &dev_attr_access##level##_write_latency.attr && \
+ cxlr->coord[level].write_latency == 0) \
+ return 0; \
+ \
+ if (a == &dev_attr_access##level##_read_bandwidth.attr && \
+ cxlr->coord[level].read_bandwidth == 0) \
+ return 0; \
+ \
+ if (a == &dev_attr_access##level##_write_bandwidth.attr && \
+ cxlr->coord[level].write_bandwidth == 0) \
+ return 0; \
+ \
+ return a->mode; \
+}
+
+ACCESS_VISIBLE(0);
+ACCESS_VISIBLE(1);
+
+static const struct attribute_group cxl_region_access0_coordinate_group = {
+ .name = "access0",
+ .attrs = access0_coordinate_attrs,
+ .is_visible = cxl_region_access0_coordinate_visible,
+};
+
+static const struct attribute_group *get_cxl_region_access0_group(void)
+{
+ return &cxl_region_access0_coordinate_group;
+}
+
+static const struct attribute_group cxl_region_access1_coordinate_group = {
+ .name = "access1",
+ .attrs = access1_coordinate_attrs,
+ .is_visible = cxl_region_access1_coordinate_visible,
+};
+
+static const struct attribute_group *get_cxl_region_access1_group(void)
+{
+ return &cxl_region_access1_coordinate_group;
+}
+
static ssize_t uuid_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
@@ -42,13 +149,12 @@ static ssize_t uuid_show(struct device *dev, struct device_attribute *attr,
struct cxl_region_params *p = &cxlr->params;
ssize_t rc;
- rc = down_read_interruptible(&cxl_region_rwsem);
- if (rc)
+ ACQUIRE(rwsem_read_intr, region_rwsem)(&cxl_rwsem.region);
+ if ((rc = ACQUIRE_ERR(rwsem_read_intr, &region_rwsem)))
return rc;
- rc = sysfs_emit(buf, "%pUb\n", &p->uuid);
- up_read(&cxl_region_rwsem);
-
- return rc;
+ if (cxlr->mode != CXL_PARTMODE_PMEM)
+ return sysfs_emit(buf, "\n");
+ return sysfs_emit(buf, "%pUb\n", &p->uuid);
}
static int is_dup(struct device *match, void *data)
@@ -60,7 +166,7 @@ static int is_dup(struct device *match, void *data)
if (!is_cxl_region(match))
return 0;
- lockdep_assert_held(&cxl_region_rwsem);
+ lockdep_assert_held(&cxl_rwsem.region);
cxlr = to_cxl_region(match);
p = &cxlr->params;
@@ -90,27 +196,22 @@ static ssize_t uuid_store(struct device *dev, struct device_attribute *attr,
if (uuid_is_null(&temp))
return -EINVAL;
- rc = down_write_killable(&cxl_region_rwsem);
- if (rc)
+ ACQUIRE(rwsem_write_kill, region_rwsem)(&cxl_rwsem.region);
+ if ((rc = ACQUIRE_ERR(rwsem_write_kill, &region_rwsem)))
return rc;
if (uuid_equal(&p->uuid, &temp))
- goto out;
+ return len;
- rc = -EBUSY;
if (p->state >= CXL_CONFIG_ACTIVE)
- goto out;
+ return -EBUSY;
rc = bus_for_each_dev(&cxl_bus_type, NULL, &temp, is_dup);
if (rc < 0)
- goto out;
+ return rc;
uuid_copy(&p->uuid, &temp);
-out:
- up_write(&cxl_region_rwsem);
- if (rc)
- return rc;
return len;
}
static DEVICE_ATTR_RW(uuid);
@@ -121,17 +222,51 @@ static struct cxl_region_ref *cxl_rr_load(struct cxl_port *port,
return xa_load(&port->regions, (unsigned long)cxlr);
}
-static int cxl_region_decode_reset(struct cxl_region *cxlr, int count)
+static int cxl_region_invalidate_memregion(struct cxl_region *cxlr)
+{
+ if (!cpu_cache_has_invalidate_memregion()) {
+ if (IS_ENABLED(CONFIG_CXL_REGION_INVALIDATION_TEST)) {
+ dev_info_once(
+ &cxlr->dev,
+ "Bypassing cpu_cache_invalidate_memregion() for testing!\n");
+ return 0;
+ }
+ dev_WARN(&cxlr->dev,
+ "Failed to synchronize CPU cache state\n");
+ return -ENXIO;
+ }
+
+ if (!cxlr->params.res)
+ return -ENXIO;
+ cpu_cache_invalidate_memregion(cxlr->params.res->start,
+ resource_size(cxlr->params.res));
+ return 0;
+}
+
+static void cxl_region_decode_reset(struct cxl_region *cxlr, int count)
{
struct cxl_region_params *p = &cxlr->params;
int i;
+ if (test_bit(CXL_REGION_F_LOCK, &cxlr->flags))
+ return;
+
+ /*
+ * Before region teardown attempt to flush, evict any data cached for
+ * this region, or scream loudly about missing arch / platform support
+ * for CXL teardown.
+ */
+ cxl_region_invalidate_memregion(cxlr);
+
for (i = count - 1; i >= 0; i--) {
struct cxl_endpoint_decoder *cxled = p->targets[i];
struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
struct cxl_port *iter = cxled_to_port(cxled);
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
struct cxl_ep *ep;
- int rc;
+
+ if (cxlds->rcd)
+ goto endpoint_reset;
while (!is_cxl_root(to_cxl_port(iter->dev.parent)))
iter = to_cxl_port(iter->dev.parent);
@@ -143,16 +278,33 @@ static int cxl_region_decode_reset(struct cxl_region *cxlr, int count)
cxl_rr = cxl_rr_load(iter, cxlr);
cxld = cxl_rr->decoder;
- rc = cxld->reset(cxld);
- if (rc)
- return rc;
+ if (cxld->reset)
+ cxld->reset(cxld);
+ set_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags);
}
- rc = cxled->cxld.reset(&cxled->cxld);
- if (rc)
- return rc;
+endpoint_reset:
+ cxled->cxld.reset(&cxled->cxld);
+ set_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags);
}
+ /* all decoders associated with this region have been torn down */
+ clear_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags);
+}
+
+static int commit_decoder(struct cxl_decoder *cxld)
+{
+ struct cxl_switch_decoder *cxlsd = NULL;
+
+ if (cxld->commit)
+ return cxld->commit(cxld);
+
+ if (is_switch_decoder(&cxld->dev))
+ cxlsd = to_cxl_switch_decoder(&cxld->dev);
+
+ if (dev_WARN_ONCE(&cxld->dev, !cxlsd || cxlsd->nr_targets > 1,
+ "->commit() is required\n"))
+ return -ENXIO;
return 0;
}
@@ -174,8 +326,7 @@ static int cxl_region_decode_commit(struct cxl_region *cxlr)
iter = to_cxl_port(iter->dev.parent)) {
cxl_rr = cxl_rr_load(iter, cxlr);
cxld = cxl_rr->decoder;
- if (cxld->commit)
- rc = cxld->commit(cxld);
+ rc = commit_decoder(cxld);
if (rc)
break;
}
@@ -186,7 +337,8 @@ static int cxl_region_decode_commit(struct cxl_region *cxlr)
iter = ep->next, ep = cxl_ep_load(iter, cxlmd)) {
cxl_rr = cxl_rr_load(iter, cxlr);
cxld = cxl_rr->decoder;
- cxld->reset(cxld);
+ if (cxld->reset)
+ cxld->reset(cxld);
}
cxled->cxld.reset(&cxled->cxld);
@@ -202,63 +354,105 @@ err:
return rc;
}
-static ssize_t commit_store(struct device *dev, struct device_attribute *attr,
- const char *buf, size_t len)
+static int queue_reset(struct cxl_region *cxlr)
{
- struct cxl_region *cxlr = to_cxl_region(dev);
struct cxl_region_params *p = &cxlr->params;
- bool commit;
- ssize_t rc;
+ int rc;
- rc = kstrtobool(buf, &commit);
- if (rc)
+ ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region);
+ if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem)))
return rc;
- rc = down_write_killable(&cxl_region_rwsem);
- if (rc)
+ /* Already in the requested state? */
+ if (p->state < CXL_CONFIG_COMMIT)
+ return 0;
+
+ p->state = CXL_CONFIG_RESET_PENDING;
+
+ return 0;
+}
+
+static int __commit(struct cxl_region *cxlr)
+{
+ struct cxl_region_params *p = &cxlr->params;
+ int rc;
+
+ ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region);
+ if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem)))
return rc;
/* Already in the requested state? */
- if (commit && p->state >= CXL_CONFIG_COMMIT)
- goto out;
- if (!commit && p->state < CXL_CONFIG_COMMIT)
- goto out;
+ if (p->state >= CXL_CONFIG_COMMIT)
+ return 0;
/* Not ready to commit? */
- if (commit && p->state < CXL_CONFIG_ACTIVE) {
- rc = -ENXIO;
- goto out;
- }
+ if (p->state < CXL_CONFIG_ACTIVE)
+ return -ENXIO;
- if (commit)
- rc = cxl_region_decode_commit(cxlr);
- else {
- p->state = CXL_CONFIG_RESET_PENDING;
- up_write(&cxl_region_rwsem);
- device_release_driver(&cxlr->dev);
- down_write(&cxl_region_rwsem);
+ /*
+ * Invalidate caches before region setup to drop any speculative
+ * consumption of this address space
+ */
+ rc = cxl_region_invalidate_memregion(cxlr);
+ if (rc)
+ return rc;
- /*
- * The lock was dropped, so need to revalidate that the reset is
- * still pending.
- */
- if (p->state == CXL_CONFIG_RESET_PENDING)
- rc = cxl_region_decode_reset(cxlr, p->interleave_ways);
- }
+ rc = cxl_region_decode_commit(cxlr);
+ if (rc)
+ return rc;
+
+ p->state = CXL_CONFIG_COMMIT;
+
+ return 0;
+}
+static ssize_t commit_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ struct cxl_region *cxlr = to_cxl_region(dev);
+ struct cxl_region_params *p = &cxlr->params;
+ bool commit;
+ ssize_t rc;
+
+ rc = kstrtobool(buf, &commit);
if (rc)
- goto out;
+ return rc;
- if (commit)
- p->state = CXL_CONFIG_COMMIT;
- else if (p->state == CXL_CONFIG_RESET_PENDING)
- p->state = CXL_CONFIG_ACTIVE;
+ if (commit) {
+ rc = __commit(cxlr);
+ if (rc)
+ return rc;
+ return len;
+ }
-out:
- up_write(&cxl_region_rwsem);
+ if (test_bit(CXL_REGION_F_LOCK, &cxlr->flags))
+ return -EPERM;
+ rc = queue_reset(cxlr);
if (rc)
return rc;
+
+ /*
+ * Unmap the region and depend the reset-pending state to ensure
+ * it does not go active again until post reset
+ */
+ device_release_driver(&cxlr->dev);
+
+ /*
+ * With the reset pending take cxl_rwsem.region unconditionally
+ * to ensure the reset gets handled before returning.
+ */
+ guard(rwsem_write)(&cxl_rwsem.region);
+
+ /*
+ * Revalidate that the reset is still pending in case another
+ * thread already handled this reset.
+ */
+ if (p->state == CXL_CONFIG_RESET_PENDING) {
+ cxl_region_decode_reset(cxlr, p->interleave_ways);
+ p->state = CXL_CONFIG_ACTIVE;
+ }
+
return len;
}
@@ -269,41 +463,24 @@ static ssize_t commit_show(struct device *dev, struct device_attribute *attr,
struct cxl_region_params *p = &cxlr->params;
ssize_t rc;
- rc = down_read_interruptible(&cxl_region_rwsem);
- if (rc)
+ ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
+ if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
return rc;
- rc = sysfs_emit(buf, "%d\n", p->state >= CXL_CONFIG_COMMIT);
- up_read(&cxl_region_rwsem);
-
- return rc;
+ return sysfs_emit(buf, "%d\n", p->state >= CXL_CONFIG_COMMIT);
}
static DEVICE_ATTR_RW(commit);
-static umode_t cxl_region_visible(struct kobject *kobj, struct attribute *a,
- int n)
-{
- struct device *dev = kobj_to_dev(kobj);
- struct cxl_region *cxlr = to_cxl_region(dev);
-
- if (a == &dev_attr_uuid.attr && cxlr->mode != CXL_DECODER_PMEM)
- return 0;
- return a->mode;
-}
-
static ssize_t interleave_ways_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct cxl_region *cxlr = to_cxl_region(dev);
struct cxl_region_params *p = &cxlr->params;
- ssize_t rc;
+ int rc;
- rc = down_read_interruptible(&cxl_region_rwsem);
- if (rc)
+ ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
+ if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
return rc;
- rc = sysfs_emit(buf, "%d\n", p->interleave_ways);
- up_read(&cxl_region_rwsem);
-
- return rc;
+ return sysfs_emit(buf, "%d\n", p->interleave_ways);
}
static const struct attribute_group *get_cxl_region_target_group(void);
@@ -324,12 +501,12 @@ static ssize_t interleave_ways_store(struct device *dev,
if (rc)
return rc;
- rc = ways_to_cxl(val, &iw);
+ rc = ways_to_eiw(val, &iw);
if (rc)
return rc;
/*
- * Even for x3, x9, and x12 interleaves the region interleave must be a
+ * Even for x3, x6, and x12 interleaves the region interleave must be a
* power of 2 multiple of the host bridge interleave.
*/
if (!is_power_of_2(val / cxld->interleave_ways) ||
@@ -338,23 +515,21 @@ static ssize_t interleave_ways_store(struct device *dev,
return -EINVAL;
}
- rc = down_write_killable(&cxl_region_rwsem);
- if (rc)
+ ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region);
+ if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem)))
return rc;
- if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE) {
- rc = -EBUSY;
- goto out;
- }
+
+ if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE)
+ return -EBUSY;
save = p->interleave_ways;
p->interleave_ways = val;
rc = sysfs_update_group(&cxlr->dev.kobj, get_cxl_region_target_group());
- if (rc)
+ if (rc) {
p->interleave_ways = save;
-out:
- up_write(&cxl_region_rwsem);
- if (rc)
return rc;
+ }
+
return len;
}
static DEVICE_ATTR_RW(interleave_ways);
@@ -365,15 +540,12 @@ static ssize_t interleave_granularity_show(struct device *dev,
{
struct cxl_region *cxlr = to_cxl_region(dev);
struct cxl_region_params *p = &cxlr->params;
- ssize_t rc;
+ int rc;
- rc = down_read_interruptible(&cxl_region_rwsem);
- if (rc)
+ ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
+ if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
return rc;
- rc = sysfs_emit(buf, "%d\n", p->interleave_granularity);
- up_read(&cxl_region_rwsem);
-
- return rc;
+ return sysfs_emit(buf, "%d\n", p->interleave_granularity);
}
static ssize_t interleave_granularity_store(struct device *dev,
@@ -391,7 +563,7 @@ static ssize_t interleave_granularity_store(struct device *dev,
if (rc)
return rc;
- rc = granularity_to_cxl(val, &ig);
+ rc = granularity_to_eig(val, &ig);
if (rc)
return rc;
@@ -399,26 +571,22 @@ static ssize_t interleave_granularity_store(struct device *dev,
* When the host-bridge is interleaved, disallow region granularity !=
* root granularity. Regions with a granularity less than the root
* interleave result in needing multiple endpoints to support a single
- * slot in the interleave (possible to suport in the future). Regions
+ * slot in the interleave (possible to support in the future). Regions
* with a granularity greater than the root interleave result in invalid
* DPA translations (invalid to support).
*/
if (cxld->interleave_ways > 1 && val != cxld->interleave_granularity)
return -EINVAL;
- rc = down_write_killable(&cxl_region_rwsem);
- if (rc)
+ ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region);
+ if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem)))
return rc;
- if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE) {
- rc = -EBUSY;
- goto out;
- }
+
+ if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE)
+ return -EBUSY;
p->interleave_granularity = val;
-out:
- up_write(&cxl_region_rwsem);
- if (rc)
- return rc;
+
return len;
}
static DEVICE_ATTR_RW(interleave_granularity);
@@ -429,28 +597,43 @@ static ssize_t resource_show(struct device *dev, struct device_attribute *attr,
struct cxl_region *cxlr = to_cxl_region(dev);
struct cxl_region_params *p = &cxlr->params;
u64 resource = -1ULL;
- ssize_t rc;
+ int rc;
- rc = down_read_interruptible(&cxl_region_rwsem);
- if (rc)
+ ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
+ if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
return rc;
+
if (p->res)
resource = p->res->start;
- rc = sysfs_emit(buf, "%#llx\n", resource);
- up_read(&cxl_region_rwsem);
-
- return rc;
+ return sysfs_emit(buf, "%#llx\n", resource);
}
static DEVICE_ATTR_RO(resource);
+static ssize_t mode_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct cxl_region *cxlr = to_cxl_region(dev);
+ const char *desc;
+
+ if (cxlr->mode == CXL_PARTMODE_RAM)
+ desc = "ram";
+ else if (cxlr->mode == CXL_PARTMODE_PMEM)
+ desc = "pmem";
+ else
+ desc = "";
+
+ return sysfs_emit(buf, "%s\n", desc);
+}
+static DEVICE_ATTR_RO(mode);
+
static int alloc_hpa(struct cxl_region *cxlr, resource_size_t size)
{
struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
struct cxl_region_params *p = &cxlr->params;
struct resource *res;
- u32 remainder = 0;
+ u64 remainder = 0;
- lockdep_assert_held_write(&cxl_region_rwsem);
+ lockdep_assert_held_write(&cxl_rwsem.region);
/* Nothing to do... */
if (p->res && resource_size(p->res) == size)
@@ -465,18 +648,19 @@ static int alloc_hpa(struct cxl_region *cxlr, resource_size_t size)
/* ways, granularity and uuid (if PMEM) need to be set before HPA */
if (!p->interleave_ways || !p->interleave_granularity ||
- (cxlr->mode == CXL_DECODER_PMEM && uuid_is_null(&p->uuid)))
+ (cxlr->mode == CXL_PARTMODE_PMEM && uuid_is_null(&p->uuid)))
return -ENXIO;
- div_u64_rem(size, SZ_256M * p->interleave_ways, &remainder);
+ div64_u64_rem(size, (u64)SZ_256M * p->interleave_ways, &remainder);
if (remainder)
return -EINVAL;
res = alloc_free_mem_region(cxlrd->res, size, SZ_256M,
dev_name(&cxlr->dev));
if (IS_ERR(res)) {
- dev_dbg(&cxlr->dev, "failed to allocate HPA: %ld\n",
- PTR_ERR(res));
+ dev_dbg(&cxlr->dev,
+ "HPA allocation error (%ld) for size:%pap in %s %pr\n",
+ PTR_ERR(res), &size, cxlrd->res->name, cxlrd->res);
return PTR_ERR(res);
}
@@ -491,9 +675,14 @@ static void cxl_region_iomem_release(struct cxl_region *cxlr)
struct cxl_region_params *p = &cxlr->params;
if (device_is_registered(&cxlr->dev))
- lockdep_assert_held_write(&cxl_region_rwsem);
+ lockdep_assert_held_write(&cxl_rwsem.region);
if (p->res) {
- remove_resource(p->res);
+ /*
+ * Autodiscovered regions may not have been able to insert their
+ * resource.
+ */
+ if (p->res->parent)
+ remove_resource(p->res);
kfree(p->res);
p->res = NULL;
}
@@ -503,7 +692,7 @@ static int free_hpa(struct cxl_region *cxlr)
{
struct cxl_region_params *p = &cxlr->params;
- lockdep_assert_held_write(&cxl_region_rwsem);
+ lockdep_assert_held_write(&cxl_rwsem.region);
if (!p->res)
return 0;
@@ -527,15 +716,14 @@ static ssize_t size_store(struct device *dev, struct device_attribute *attr,
if (rc)
return rc;
- rc = down_write_killable(&cxl_region_rwsem);
- if (rc)
+ ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region);
+ if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem)))
return rc;
if (val)
rc = alloc_hpa(cxlr, val);
else
rc = free_hpa(cxlr);
- up_write(&cxl_region_rwsem);
if (rc)
return rc;
@@ -551,18 +739,30 @@ static ssize_t size_show(struct device *dev, struct device_attribute *attr,
u64 size = 0;
ssize_t rc;
- rc = down_read_interruptible(&cxl_region_rwsem);
- if (rc)
+ ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
+ if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
return rc;
if (p->res)
size = resource_size(p->res);
- rc = sysfs_emit(buf, "%#llx\n", size);
- up_read(&cxl_region_rwsem);
-
- return rc;
+ return sysfs_emit(buf, "%#llx\n", size);
}
static DEVICE_ATTR_RW(size);
+static ssize_t extended_linear_cache_size_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct cxl_region *cxlr = to_cxl_region(dev);
+ struct cxl_region_params *p = &cxlr->params;
+ ssize_t rc;
+
+ ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
+ if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
+ return rc;
+ return sysfs_emit(buf, "%#llx\n", p->cache_size);
+}
+static DEVICE_ATTR_RO(extended_linear_cache_size);
+
static struct attribute *cxl_region_attrs[] = {
&dev_attr_uuid.attr,
&dev_attr_commit.attr,
@@ -570,9 +770,35 @@ static struct attribute *cxl_region_attrs[] = {
&dev_attr_interleave_granularity.attr,
&dev_attr_resource.attr,
&dev_attr_size.attr,
+ &dev_attr_mode.attr,
+ &dev_attr_extended_linear_cache_size.attr,
NULL,
};
+static umode_t cxl_region_visible(struct kobject *kobj, struct attribute *a,
+ int n)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct cxl_region *cxlr = to_cxl_region(dev);
+
+ /*
+ * Support tooling that expects to find a 'uuid' attribute for all
+ * regions regardless of mode.
+ */
+ if (a == &dev_attr_uuid.attr && cxlr->mode != CXL_PARTMODE_PMEM)
+ return 0444;
+
+ /*
+ * Don't display extended linear cache attribute if there is no
+ * extended linear cache.
+ */
+ if (a == &dev_attr_extended_linear_cache_size.attr &&
+ cxlr->params.cache_size == 0)
+ return 0;
+
+ return a->mode;
+}
+
static const struct attribute_group cxl_region_group = {
.attrs = cxl_region_attrs,
.is_visible = cxl_region_visible,
@@ -584,71 +810,167 @@ static size_t show_targetN(struct cxl_region *cxlr, char *buf, int pos)
struct cxl_endpoint_decoder *cxled;
int rc;
- rc = down_read_interruptible(&cxl_region_rwsem);
- if (rc)
+ ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
+ if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
return rc;
if (pos >= p->interleave_ways) {
dev_dbg(&cxlr->dev, "position %d out of range %d\n", pos,
p->interleave_ways);
- rc = -ENXIO;
- goto out;
+ return -ENXIO;
}
cxled = p->targets[pos];
if (!cxled)
- rc = sysfs_emit(buf, "\n");
- else
- rc = sysfs_emit(buf, "%s\n", dev_name(&cxled->cxld.dev));
-out:
- up_read(&cxl_region_rwsem);
+ return sysfs_emit(buf, "\n");
+ return sysfs_emit(buf, "%s\n", dev_name(&cxled->cxld.dev));
+}
- return rc;
+static int check_commit_order(struct device *dev, void *data)
+{
+ struct cxl_decoder *cxld = to_cxl_decoder(dev);
+
+ /*
+ * if port->commit_end is not the only free decoder, then out of
+ * order shutdown has occurred, block further allocations until
+ * that is resolved
+ */
+ if (((cxld->flags & CXL_DECODER_F_ENABLE) == 0))
+ return -EBUSY;
+ return 0;
}
-static int match_free_decoder(struct device *dev, void *data)
+static int match_free_decoder(struct device *dev, const void *data)
{
+ struct cxl_port *port = to_cxl_port(dev->parent);
struct cxl_decoder *cxld;
- int *id = data;
+ int rc;
if (!is_switch_decoder(dev))
return 0;
cxld = to_cxl_decoder(dev);
- /* enforce ordered allocation */
- if (cxld->id != *id)
+ if (cxld->id != port->commit_end + 1)
return 0;
- if (!cxld->region)
- return 1;
+ if (cxld->region) {
+ dev_dbg(dev->parent,
+ "next decoder to commit (%s) is already reserved (%s)\n",
+ dev_name(dev), dev_name(&cxld->region->dev));
+ return 0;
+ }
- (*id)++;
+ rc = device_for_each_child_reverse_from(dev->parent, dev, NULL,
+ check_commit_order);
+ if (rc) {
+ dev_dbg(dev->parent,
+ "unable to allocate %s due to out of order shutdown\n",
+ dev_name(dev));
+ return 0;
+ }
+ return 1;
+}
+
+static bool spa_maps_hpa(const struct cxl_region_params *p,
+ const struct range *range)
+{
+ if (!p->res)
+ return false;
+
+ /*
+ * The extended linear cache region is constructed by a 1:1 ratio
+ * where the SPA maps equal amounts of DRAM and CXL HPA capacity with
+ * CXL decoders at the high end of the SPA range.
+ */
+ return p->res->start + p->cache_size == range->start &&
+ p->res->end == range->end;
+}
+
+static int match_auto_decoder(struct device *dev, const void *data)
+{
+ const struct cxl_region_params *p = data;
+ struct cxl_decoder *cxld;
+ struct range *r;
+
+ if (!is_switch_decoder(dev))
+ return 0;
+
+ cxld = to_cxl_decoder(dev);
+ r = &cxld->hpa_range;
+
+ if (spa_maps_hpa(p, r))
+ return 1;
return 0;
}
-static struct cxl_decoder *cxl_region_find_decoder(struct cxl_port *port,
- struct cxl_region *cxlr)
+/**
+ * cxl_port_pick_region_decoder() - assign or lookup a decoder for a region
+ * @port: a port in the ancestry of the endpoint implied by @cxled
+ * @cxled: endpoint decoder to be, or currently, mapped by @port
+ * @cxlr: region to establish, or validate, decode @port
+ *
+ * In the region creation path cxl_port_pick_region_decoder() is an
+ * allocator to find a free port. In the region assembly path, it is
+ * recalling the decoder that platform firmware picked for validation
+ * purposes.
+ *
+ * The result is recorded in a 'struct cxl_region_ref' in @port.
+ */
+static struct cxl_decoder *
+cxl_port_pick_region_decoder(struct cxl_port *port,
+ struct cxl_endpoint_decoder *cxled,
+ struct cxl_region *cxlr)
{
struct device *dev;
- int id = 0;
- dev = device_find_child(&port->dev, &id, match_free_decoder);
+ if (port == cxled_to_port(cxled))
+ return &cxled->cxld;
+
+ if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags))
+ dev = device_find_child(&port->dev, &cxlr->params,
+ match_auto_decoder);
+ else
+ dev = device_find_child(&port->dev, NULL, match_free_decoder);
if (!dev)
return NULL;
/*
* This decoder is pinned registered as long as the endpoint decoder is
* registered, and endpoint decoder unregistration holds the
- * cxl_region_rwsem over unregister events, so no need to hold on to
+ * cxl_rwsem.region over unregister events, so no need to hold on to
* this extra reference.
*/
put_device(dev);
return to_cxl_decoder(dev);
}
-static struct cxl_region_ref *alloc_region_ref(struct cxl_port *port,
- struct cxl_region *cxlr)
+static bool auto_order_ok(struct cxl_port *port, struct cxl_region *cxlr_iter,
+ struct cxl_decoder *cxld)
+{
+ struct cxl_region_ref *rr = cxl_rr_load(port, cxlr_iter);
+ struct cxl_decoder *cxld_iter = rr->decoder;
+
+ /*
+ * Allow the out of order assembly of auto-discovered regions.
+ * Per CXL Spec 3.1 8.2.4.20.12 software must commit decoders
+ * in HPA order. Confirm that the decoder with the lesser HPA
+ * starting address has the lesser id.
+ */
+ dev_dbg(&cxld->dev, "check for HPA violation %s:%d < %s:%d\n",
+ dev_name(&cxld->dev), cxld->id,
+ dev_name(&cxld_iter->dev), cxld_iter->id);
+
+ if (cxld_iter->id > cxld->id)
+ return true;
+
+ return false;
+}
+
+static struct cxl_region_ref *
+alloc_region_ref(struct cxl_port *port, struct cxl_region *cxlr,
+ struct cxl_endpoint_decoder *cxled,
+ struct cxl_decoder *cxld)
{
struct cxl_region_params *p = &cxlr->params;
struct cxl_region_ref *cxl_rr, *iter;
@@ -658,16 +980,18 @@ static struct cxl_region_ref *alloc_region_ref(struct cxl_port *port,
xa_for_each(&port->regions, index, iter) {
struct cxl_region_params *ip = &iter->region->params;
- if (!ip->res)
+ if (!ip->res || ip->res->start < p->res->start)
continue;
- if (ip->res->start > p->res->start) {
- dev_dbg(&cxlr->dev,
- "%s: HPA order violation %s:%pr vs %pr\n",
- dev_name(&port->dev),
- dev_name(&iter->region->dev), ip->res, p->res);
- return ERR_PTR(-EBUSY);
+ if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) {
+ if (auto_order_ok(port, iter->region, cxld))
+ continue;
}
+ dev_dbg(&cxlr->dev, "%s: HPA order violation %s:%pr vs %pr\n",
+ dev_name(&port->dev),
+ dev_name(&iter->region->dev), ip->res, p->res);
+
+ return ERR_PTR(-EBUSY);
}
cxl_rr = kzalloc(sizeof(*cxl_rr), GFP_KERNEL);
@@ -741,22 +1065,11 @@ static int cxl_rr_ep_add(struct cxl_region_ref *cxl_rr,
return 0;
}
-static int cxl_rr_alloc_decoder(struct cxl_port *port, struct cxl_region *cxlr,
- struct cxl_endpoint_decoder *cxled,
- struct cxl_region_ref *cxl_rr)
+static int cxl_rr_assign_decoder(struct cxl_port *port, struct cxl_region *cxlr,
+ struct cxl_endpoint_decoder *cxled,
+ struct cxl_region_ref *cxl_rr,
+ struct cxl_decoder *cxld)
{
- struct cxl_decoder *cxld;
-
- if (port == cxled_to_port(cxled))
- cxld = &cxled->cxld;
- else
- cxld = cxl_region_find_decoder(port, cxlr);
- if (!cxld) {
- dev_dbg(&cxlr->dev, "%s: no decoder available\n",
- dev_name(&port->dev));
- return -EBUSY;
- }
-
if (cxld->region) {
dev_dbg(&cxlr->dev, "%s: %s already attached to %s\n",
dev_name(&port->dev), dev_name(&cxld->dev),
@@ -764,10 +1077,32 @@ static int cxl_rr_alloc_decoder(struct cxl_port *port, struct cxl_region *cxlr,
return -EBUSY;
}
+ /*
+ * Endpoints should already match the region type, but backstop that
+ * assumption with an assertion. Switch-decoders change mapping-type
+ * based on what is mapped when they are assigned to a region.
+ */
+ dev_WARN_ONCE(&cxlr->dev,
+ port == cxled_to_port(cxled) &&
+ cxld->target_type != cxlr->type,
+ "%s:%s mismatch decoder type %d -> %d\n",
+ dev_name(&cxled_to_memdev(cxled)->dev),
+ dev_name(&cxld->dev), cxld->target_type, cxlr->type);
+ cxld->target_type = cxlr->type;
cxl_rr->decoder = cxld;
return 0;
}
+static void cxl_region_set_lock(struct cxl_region *cxlr,
+ struct cxl_decoder *cxld)
+{
+ if (!test_bit(CXL_DECODER_F_LOCK, &cxld->flags))
+ return;
+
+ set_bit(CXL_REGION_F_LOCK, &cxlr->flags);
+ clear_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags);
+}
+
/**
* cxl_port_attach_region() - track a region's interest in a port by endpoint
* @port: port to add a new region reference 'struct cxl_region_ref'
@@ -804,7 +1139,7 @@ static int cxl_port_attach_region(struct cxl_port *port,
unsigned long index;
int rc = -EBUSY;
- lockdep_assert_held_write(&cxl_region_rwsem);
+ lockdep_assert_held_write(&cxl_rwsem.region);
cxl_rr = cxl_rr_load(port, cxlr);
if (cxl_rr) {
@@ -835,7 +1170,16 @@ static int cxl_port_attach_region(struct cxl_port *port,
nr_targets_inc = true;
}
} else {
- cxl_rr = alloc_region_ref(port, cxlr);
+ struct cxl_decoder *cxld;
+
+ cxld = cxl_port_pick_region_decoder(port, cxled, cxlr);
+ if (!cxld) {
+ dev_dbg(&cxlr->dev, "%s: no decoder available\n",
+ dev_name(&port->dev));
+ return -EBUSY;
+ }
+
+ cxl_rr = alloc_region_ref(port, cxlr, cxled, cxld);
if (IS_ERR(cxl_rr)) {
dev_dbg(&cxlr->dev,
"%s: failed to allocate region reference\n",
@@ -844,12 +1188,34 @@ static int cxl_port_attach_region(struct cxl_port *port,
}
nr_targets_inc = true;
- rc = cxl_rr_alloc_decoder(port, cxlr, cxled, cxl_rr);
+ rc = cxl_rr_assign_decoder(port, cxlr, cxled, cxl_rr, cxld);
if (rc)
goto out_erase;
}
cxld = cxl_rr->decoder;
+ /*
+ * the number of targets should not exceed the target_count
+ * of the decoder
+ */
+ if (is_switch_decoder(&cxld->dev)) {
+ struct cxl_switch_decoder *cxlsd;
+
+ cxlsd = to_cxl_switch_decoder(&cxld->dev);
+ if (cxl_rr->nr_targets > cxlsd->nr_targets) {
+ dev_dbg(&cxlr->dev,
+ "%s:%s %s add: %s:%s @ %d overflows targets: %d\n",
+ dev_name(port->uport_dev), dev_name(&port->dev),
+ dev_name(&cxld->dev), dev_name(&cxlmd->dev),
+ dev_name(&cxled->cxld.dev), pos,
+ cxlsd->nr_targets);
+ rc = -ENXIO;
+ goto out_erase;
+ }
+ }
+
+ cxl_region_set_lock(cxlr, cxld);
+
rc = cxl_rr_ep_add(cxl_rr, cxled);
if (rc) {
dev_dbg(&cxlr->dev,
@@ -861,10 +1227,10 @@ static int cxl_port_attach_region(struct cxl_port *port,
dev_dbg(&cxlr->dev,
"%s:%s %s add: %s:%s @ %d next: %s nr_eps: %d nr_targets: %d\n",
- dev_name(port->uport), dev_name(&port->dev),
+ dev_name(port->uport_dev), dev_name(&port->dev),
dev_name(&cxld->dev), dev_name(&cxlmd->dev),
dev_name(&cxled->cxld.dev), pos,
- ep ? ep->next ? dev_name(ep->next->uport) :
+ ep ? ep->next ? dev_name(ep->next->uport_dev) :
dev_name(&cxlmd->dev) :
"none",
cxl_rr->nr_eps, cxl_rr->nr_targets);
@@ -885,7 +1251,7 @@ static void cxl_port_detach_region(struct cxl_port *port,
struct cxl_region_ref *cxl_rr;
struct cxl_ep *ep = NULL;
- lockdep_assert_held_write(&cxl_region_rwsem);
+ lockdep_assert_held_write(&cxl_rwsem.region);
cxl_rr = cxl_rr_load(port, cxlr);
if (!cxl_rr)
@@ -939,7 +1305,7 @@ static int check_last_peer(struct cxl_endpoint_decoder *cxled,
*/
if (pos < distance) {
dev_dbg(&cxlr->dev, "%s:%s: cannot host %s:%s at %d\n",
- dev_name(port->uport), dev_name(&port->dev),
+ dev_name(port->uport_dev), dev_name(&port->dev),
dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), pos);
return -ENXIO;
}
@@ -949,7 +1315,7 @@ static int check_last_peer(struct cxl_endpoint_decoder *cxled,
if (ep->dport != ep_peer->dport) {
dev_dbg(&cxlr->dev,
"%s:%s: %s:%s pos %d mismatched peer %s:%s\n",
- dev_name(port->uport), dev_name(&port->dev),
+ dev_name(port->uport_dev), dev_name(&port->dev),
dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), pos,
dev_name(&cxlmd_peer->dev),
dev_name(&cxled_peer->cxld.dev));
@@ -959,12 +1325,56 @@ static int check_last_peer(struct cxl_endpoint_decoder *cxled,
return 0;
}
+static int check_interleave_cap(struct cxl_decoder *cxld, int iw, int ig)
+{
+ struct cxl_port *port = to_cxl_port(cxld->dev.parent);
+ struct cxl_hdm *cxlhdm = dev_get_drvdata(&port->dev);
+ unsigned int interleave_mask;
+ u8 eiw;
+ u16 eig;
+ int high_pos, low_pos;
+
+ if (!test_bit(iw, &cxlhdm->iw_cap_mask))
+ return -ENXIO;
+ /*
+ * Per CXL specification r3.1(8.2.4.20.13 Decoder Protection),
+ * if eiw < 8:
+ * DPAOFFSET[51: eig + 8] = HPAOFFSET[51: eig + 8 + eiw]
+ * DPAOFFSET[eig + 7: 0] = HPAOFFSET[eig + 7: 0]
+ *
+ * when the eiw is 0, all the bits of HPAOFFSET[51: 0] are used, the
+ * interleave bits are none.
+ *
+ * if eiw >= 8:
+ * DPAOFFSET[51: eig + 8] = HPAOFFSET[51: eig + eiw] / 3
+ * DPAOFFSET[eig + 7: 0] = HPAOFFSET[eig + 7: 0]
+ *
+ * when the eiw is 8, all the bits of HPAOFFSET[51: 0] are used, the
+ * interleave bits are none.
+ */
+ ways_to_eiw(iw, &eiw);
+ if (eiw == 0 || eiw == 8)
+ return 0;
+
+ granularity_to_eig(ig, &eig);
+ if (eiw > 8)
+ high_pos = eiw + eig - 1;
+ else
+ high_pos = eiw + eig + 7;
+ low_pos = eig + 8;
+ interleave_mask = GENMASK(high_pos, low_pos);
+ if (interleave_mask & ~cxlhdm->interleave_mask)
+ return -ENXIO;
+
+ return 0;
+}
+
static int cxl_port_setup_targets(struct cxl_port *port,
struct cxl_region *cxlr,
struct cxl_endpoint_decoder *cxled)
{
struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
- int parent_iw, parent_ig, ig, iw, rc, inc = 0, pos = cxled->pos;
+ int parent_iw, parent_ig, ig, iw, rc, pos = cxled->pos;
struct cxl_port *parent_port = to_cxl_port(port->dev.parent);
struct cxl_region_ref *cxl_rr = cxl_rr_load(port, cxlr);
struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
@@ -972,6 +1382,7 @@ static int cxl_port_setup_targets(struct cxl_port *port,
struct cxl_region_params *p = &cxlr->params;
struct cxl_decoder *cxld = cxl_rr->decoder;
struct cxl_switch_decoder *cxlsd;
+ struct cxl_port *iter = port;
u16 eig, peig;
u8 eiw, peiw;
@@ -981,23 +1392,33 @@ static int cxl_port_setup_targets(struct cxl_port *port,
*/
if (!is_power_of_2(cxl_rr->nr_targets)) {
dev_dbg(&cxlr->dev, "%s:%s: invalid target count %d\n",
- dev_name(port->uport), dev_name(&port->dev),
+ dev_name(port->uport_dev), dev_name(&port->dev),
cxl_rr->nr_targets);
return -EINVAL;
}
cxlsd = to_cxl_switch_decoder(&cxld->dev);
if (cxl_rr->nr_targets_set) {
- int i, distance;
+ int i, distance = 1;
+ struct cxl_region_ref *cxl_rr_iter;
/*
- * Passthrough ports impose no distance requirements between
- * peers
+ * The "distance" between peer downstream ports represents which
+ * endpoint positions in the region interleave a given port can
+ * host.
+ *
+ * For example, at the root of a hierarchy the distance is
+ * always 1 as every index targets a different host-bridge. At
+ * each subsequent switch level those ports map every Nth region
+ * position where N is the width of the switch == distance.
*/
- if (port->nr_dports == 1)
- distance = 0;
- else
- distance = p->nr_targets / cxl_rr->nr_targets;
+ do {
+ cxl_rr_iter = cxl_rr_load(iter, cxlr);
+ distance *= cxl_rr_iter->nr_targets;
+ iter = to_cxl_port(iter->dev.parent);
+ } while (!is_cxl_root(iter));
+ distance *= cxlrd->cxlsd.cxld.interleave_ways;
+
for (i = 0; i < cxl_rr->nr_targets_set; i++)
if (ep->dport == cxlsd->target[i]) {
rc = check_last_peer(cxled, ep, cxl_rr,
@@ -1010,7 +1431,14 @@ static int cxl_port_setup_targets(struct cxl_port *port,
}
if (is_cxl_root(parent_port)) {
- parent_ig = cxlrd->cxlsd.cxld.interleave_granularity;
+ /*
+ * Root decoder IG is always set to value in CFMWS which
+ * may be different than this region's IG. We can use the
+ * region's IG here since interleave_granularity_store()
+ * does not allow interleaved host-bridges with
+ * root IG != region IG.
+ */
+ parent_ig = p->interleave_granularity;
parent_iw = cxlrd->cxlsd.cxld.interleave_ways;
/*
* For purposes of address bit routing, use power-of-2 math for
@@ -1028,74 +1456,121 @@ static int cxl_port_setup_targets(struct cxl_port *port,
parent_iw = parent_cxld->interleave_ways;
}
- rc = granularity_to_cxl(parent_ig, &peig);
+ rc = granularity_to_eig(parent_ig, &peig);
if (rc) {
dev_dbg(&cxlr->dev, "%s:%s: invalid parent granularity: %d\n",
- dev_name(parent_port->uport),
+ dev_name(parent_port->uport_dev),
dev_name(&parent_port->dev), parent_ig);
return rc;
}
- rc = ways_to_cxl(parent_iw, &peiw);
+ rc = ways_to_eiw(parent_iw, &peiw);
if (rc) {
dev_dbg(&cxlr->dev, "%s:%s: invalid parent interleave: %d\n",
- dev_name(parent_port->uport),
+ dev_name(parent_port->uport_dev),
dev_name(&parent_port->dev), parent_iw);
return rc;
}
iw = cxl_rr->nr_targets;
- rc = ways_to_cxl(iw, &eiw);
+ rc = ways_to_eiw(iw, &eiw);
if (rc) {
dev_dbg(&cxlr->dev, "%s:%s: invalid port interleave: %d\n",
- dev_name(port->uport), dev_name(&port->dev), iw);
+ dev_name(port->uport_dev), dev_name(&port->dev), iw);
return rc;
}
/*
- * If @parent_port is masking address bits, pick the next unused address
- * bit to route @port's targets.
+ * Interleave granularity is a multiple of @parent_port granularity.
+ * Multiplier is the parent port interleave ways.
*/
- if (parent_iw > 1 && cxl_rr->nr_targets > 1) {
- u32 address_bit = max(peig + peiw, eiw + peig);
-
- eig = address_bit - eiw + 1;
- } else {
- eiw = peiw;
- eig = peig;
+ rc = granularity_to_eig(parent_ig * parent_iw, &eig);
+ if (rc) {
+ dev_dbg(&cxlr->dev,
+ "%s: invalid granularity calculation (%d * %d)\n",
+ dev_name(&parent_port->dev), parent_ig, parent_iw);
+ return rc;
}
- rc = cxl_to_granularity(eig, &ig);
+ rc = eig_to_granularity(eig, &ig);
if (rc) {
dev_dbg(&cxlr->dev, "%s:%s: invalid interleave: %d\n",
- dev_name(port->uport), dev_name(&port->dev),
+ dev_name(port->uport_dev), dev_name(&port->dev),
256 << eig);
return rc;
}
- cxld->interleave_ways = iw;
- cxld->interleave_granularity = ig;
- cxld->hpa_range = (struct range) {
- .start = p->res->start,
- .end = p->res->end,
- };
- dev_dbg(&cxlr->dev, "%s:%s iw: %d ig: %d\n", dev_name(port->uport),
+ if (iw > 8 || iw > cxlsd->nr_targets) {
+ dev_dbg(&cxlr->dev,
+ "%s:%s:%s: ways: %d overflows targets: %d\n",
+ dev_name(port->uport_dev), dev_name(&port->dev),
+ dev_name(&cxld->dev), iw, cxlsd->nr_targets);
+ return -ENXIO;
+ }
+
+ if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) {
+ if (cxld->interleave_ways != iw ||
+ (iw > 1 && cxld->interleave_granularity != ig) ||
+ !spa_maps_hpa(p, &cxld->hpa_range) ||
+ ((cxld->flags & CXL_DECODER_F_ENABLE) == 0)) {
+ dev_err(&cxlr->dev,
+ "%s:%s %s expected iw: %d ig: %d %pr\n",
+ dev_name(port->uport_dev), dev_name(&port->dev),
+ __func__, iw, ig, p->res);
+ dev_err(&cxlr->dev,
+ "%s:%s %s got iw: %d ig: %d state: %s %#llx:%#llx\n",
+ dev_name(port->uport_dev), dev_name(&port->dev),
+ __func__, cxld->interleave_ways,
+ cxld->interleave_granularity,
+ str_enabled_disabled(cxld->flags & CXL_DECODER_F_ENABLE),
+ cxld->hpa_range.start, cxld->hpa_range.end);
+ return -ENXIO;
+ }
+ } else {
+ rc = check_interleave_cap(cxld, iw, ig);
+ if (rc) {
+ dev_dbg(&cxlr->dev,
+ "%s:%s iw: %d ig: %d is not supported\n",
+ dev_name(port->uport_dev),
+ dev_name(&port->dev), iw, ig);
+ return rc;
+ }
+
+ cxld->interleave_ways = iw;
+ cxld->interleave_granularity = ig;
+ cxld->hpa_range = (struct range) {
+ .start = p->res->start,
+ .end = p->res->end,
+ };
+ }
+ dev_dbg(&cxlr->dev, "%s:%s iw: %d ig: %d\n", dev_name(port->uport_dev),
dev_name(&port->dev), iw, ig);
add_target:
if (cxl_rr->nr_targets_set == cxl_rr->nr_targets) {
dev_dbg(&cxlr->dev,
"%s:%s: targets full trying to add %s:%s at %d\n",
- dev_name(port->uport), dev_name(&port->dev),
+ dev_name(port->uport_dev), dev_name(&port->dev),
dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), pos);
return -ENXIO;
}
- cxlsd->target[cxl_rr->nr_targets_set] = ep->dport;
- inc = 1;
+ if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) {
+ if (cxlsd->target[cxl_rr->nr_targets_set] != ep->dport) {
+ dev_dbg(&cxlr->dev, "%s:%s: %s expected %s at %d\n",
+ dev_name(port->uport_dev), dev_name(&port->dev),
+ dev_name(&cxlsd->cxld.dev),
+ dev_name(ep->dport->dport_dev),
+ cxl_rr->nr_targets_set);
+ return -ENXIO;
+ }
+ } else {
+ cxlsd->target[cxl_rr->nr_targets_set] = ep->dport;
+ cxlsd->cxld.target_map[cxl_rr->nr_targets_set] = ep->dport->port_id;
+ }
+ cxl_rr->nr_targets_set++;
out_target_set:
- cxl_rr->nr_targets_set += inc;
dev_dbg(&cxlr->dev, "%s:%s target[%d] = %s for %s:%s @ %d\n",
- dev_name(port->uport), dev_name(&port->dev),
- cxl_rr->nr_targets_set - 1, dev_name(ep->dport->dport),
+ dev_name(port->uport_dev), dev_name(&port->dev),
+ cxl_rr->nr_targets_set - 1, dev_name(ep->dport->dport_dev),
dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), pos);
return 0;
@@ -1126,14 +1601,26 @@ static void cxl_region_teardown_targets(struct cxl_region *cxlr)
{
struct cxl_region_params *p = &cxlr->params;
struct cxl_endpoint_decoder *cxled;
+ struct cxl_dev_state *cxlds;
struct cxl_memdev *cxlmd;
struct cxl_port *iter;
struct cxl_ep *ep;
int i;
+ /*
+ * In the auto-discovery case skip automatic teardown since the
+ * address space is already active
+ */
+ if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags))
+ return;
+
for (i = 0; i < p->nr_targets; i++) {
cxled = p->targets[i];
cxlmd = cxled_to_memdev(cxled);
+ cxlds = cxlmd->cxlds;
+
+ if (cxlds->rcd)
+ continue;
iter = cxled_to_port(cxled);
while (!is_cxl_root(to_cxl_port(iter->dev.parent)))
@@ -1149,22 +1636,32 @@ static int cxl_region_setup_targets(struct cxl_region *cxlr)
{
struct cxl_region_params *p = &cxlr->params;
struct cxl_endpoint_decoder *cxled;
+ struct cxl_dev_state *cxlds;
+ int i, rc, rch = 0, vh = 0;
struct cxl_memdev *cxlmd;
struct cxl_port *iter;
struct cxl_ep *ep;
- int i, rc;
for (i = 0; i < p->nr_targets; i++) {
cxled = p->targets[i];
cxlmd = cxled_to_memdev(cxled);
+ cxlds = cxlmd->cxlds;
+
+ /* validate that all targets agree on topology */
+ if (!cxlds->rcd) {
+ vh++;
+ } else {
+ rch++;
+ continue;
+ }
iter = cxled_to_port(cxled);
while (!is_cxl_root(to_cxl_port(iter->dev.parent)))
iter = to_cxl_port(iter->dev.parent);
/*
- * Descend the topology tree programming targets while
- * looking for conflicts.
+ * Descend the topology tree programming / validating
+ * targets while looking for conflicts.
*/
for (ep = cxl_ep_load(iter, cxlmd); iter;
iter = ep->next, ep = cxl_ep_load(iter, cxlmd)) {
@@ -1176,32 +1673,22 @@ static int cxl_region_setup_targets(struct cxl_region *cxlr)
}
}
+ if (rch && vh) {
+ dev_err(&cxlr->dev, "mismatched CXL topologies detected\n");
+ cxl_region_teardown_targets(cxlr);
+ return -ENXIO;
+ }
+
return 0;
}
-static int cxl_region_attach(struct cxl_region *cxlr,
- struct cxl_endpoint_decoder *cxled, int pos)
+static int cxl_region_validate_position(struct cxl_region *cxlr,
+ struct cxl_endpoint_decoder *cxled,
+ int pos)
{
- struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
- struct cxl_port *ep_port, *root_port, *iter;
struct cxl_region_params *p = &cxlr->params;
- struct cxl_dport *dport;
- int i, rc = -ENXIO;
-
- if (cxled->mode == CXL_DECODER_DEAD) {
- dev_dbg(&cxlr->dev, "%s dead\n", dev_name(&cxled->cxld.dev));
- return -ENODEV;
- }
-
- /* all full of members, or interleave config not established? */
- if (p->state > CXL_CONFIG_INTERLEAVE_ACTIVE) {
- dev_dbg(&cxlr->dev, "region already active\n");
- return -EBUSY;
- } else if (p->state < CXL_CONFIG_INTERLEAVE_ACTIVE) {
- dev_dbg(&cxlr->dev, "interleave config missing\n");
- return -ENXIO;
- }
+ int i;
if (pos < 0 || pos >= p->interleave_ways) {
dev_dbg(&cxlr->dev, "position %d out of range %d\n", pos,
@@ -1226,7 +1713,7 @@ static int cxl_region_attach(struct cxl_region *cxlr,
struct cxl_endpoint_decoder *cxled_target;
struct cxl_memdev *cxlmd_target;
- cxled_target = p->targets[pos];
+ cxled_target = p->targets[i];
if (!cxled_target)
continue;
@@ -1240,6 +1727,295 @@ static int cxl_region_attach(struct cxl_region *cxlr,
}
}
+ return 0;
+}
+
+static int cxl_region_attach_position(struct cxl_region *cxlr,
+ struct cxl_root_decoder *cxlrd,
+ struct cxl_endpoint_decoder *cxled,
+ const struct cxl_dport *dport, int pos)
+{
+ struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+ struct cxl_switch_decoder *cxlsd = &cxlrd->cxlsd;
+ struct cxl_decoder *cxld = &cxlsd->cxld;
+ int iw = cxld->interleave_ways;
+ struct cxl_port *iter;
+ int rc;
+
+ if (dport != cxlrd->cxlsd.target[pos % iw]) {
+ dev_dbg(&cxlr->dev, "%s:%s invalid target position for %s\n",
+ dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
+ dev_name(&cxlrd->cxlsd.cxld.dev));
+ return -ENXIO;
+ }
+
+ for (iter = cxled_to_port(cxled); !is_cxl_root(iter);
+ iter = to_cxl_port(iter->dev.parent)) {
+ rc = cxl_port_attach_region(iter, cxlr, cxled, pos);
+ if (rc)
+ goto err;
+ }
+
+ return 0;
+
+err:
+ for (iter = cxled_to_port(cxled); !is_cxl_root(iter);
+ iter = to_cxl_port(iter->dev.parent))
+ cxl_port_detach_region(iter, cxlr, cxled);
+ return rc;
+}
+
+static int cxl_region_attach_auto(struct cxl_region *cxlr,
+ struct cxl_endpoint_decoder *cxled, int pos)
+{
+ struct cxl_region_params *p = &cxlr->params;
+
+ if (cxled->state != CXL_DECODER_STATE_AUTO) {
+ dev_err(&cxlr->dev,
+ "%s: unable to add decoder to autodetected region\n",
+ dev_name(&cxled->cxld.dev));
+ return -EINVAL;
+ }
+
+ if (pos >= 0) {
+ dev_dbg(&cxlr->dev, "%s: expected auto position, not %d\n",
+ dev_name(&cxled->cxld.dev), pos);
+ return -EINVAL;
+ }
+
+ if (p->nr_targets >= p->interleave_ways) {
+ dev_err(&cxlr->dev, "%s: no more target slots available\n",
+ dev_name(&cxled->cxld.dev));
+ return -ENXIO;
+ }
+
+ /*
+ * Temporarily record the endpoint decoder into the target array. Yes,
+ * this means that userspace can view devices in the wrong position
+ * before the region activates, and must be careful to understand when
+ * it might be racing region autodiscovery.
+ */
+ pos = p->nr_targets;
+ p->targets[pos] = cxled;
+ cxled->pos = pos;
+ p->nr_targets++;
+
+ return 0;
+}
+
+static int cmp_interleave_pos(const void *a, const void *b)
+{
+ struct cxl_endpoint_decoder *cxled_a = *(typeof(cxled_a) *)a;
+ struct cxl_endpoint_decoder *cxled_b = *(typeof(cxled_b) *)b;
+
+ return cxled_a->pos - cxled_b->pos;
+}
+
+static int match_switch_decoder_by_range(struct device *dev,
+ const void *data)
+{
+ struct cxl_switch_decoder *cxlsd;
+ const struct range *r1, *r2 = data;
+
+
+ if (!is_switch_decoder(dev))
+ return 0;
+
+ cxlsd = to_cxl_switch_decoder(dev);
+ r1 = &cxlsd->cxld.hpa_range;
+
+ if (is_root_decoder(dev))
+ return range_contains(r1, r2);
+ return (r1->start == r2->start && r1->end == r2->end);
+}
+
+static int find_pos_and_ways(struct cxl_port *port, struct range *range,
+ int *pos, int *ways)
+{
+ struct cxl_switch_decoder *cxlsd;
+ struct cxl_port *parent;
+ struct device *dev;
+ int rc = -ENXIO;
+
+ parent = parent_port_of(port);
+ if (!parent)
+ return rc;
+
+ dev = device_find_child(&parent->dev, range,
+ match_switch_decoder_by_range);
+ if (!dev) {
+ dev_err(port->uport_dev,
+ "failed to find decoder mapping %#llx-%#llx\n",
+ range->start, range->end);
+ return rc;
+ }
+ cxlsd = to_cxl_switch_decoder(dev);
+ *ways = cxlsd->cxld.interleave_ways;
+
+ for (int i = 0; i < *ways; i++) {
+ if (cxlsd->target[i] == port->parent_dport) {
+ *pos = i;
+ rc = 0;
+ break;
+ }
+ }
+ put_device(dev);
+
+ if (rc)
+ dev_err(port->uport_dev,
+ "failed to find %s:%s in target list of %s\n",
+ dev_name(&port->dev),
+ dev_name(port->parent_dport->dport_dev),
+ dev_name(&cxlsd->cxld.dev));
+
+ return rc;
+}
+
+/**
+ * cxl_calc_interleave_pos() - calculate an endpoint position in a region
+ * @cxled: endpoint decoder member of given region
+ *
+ * The endpoint position is calculated by traversing the topology from
+ * the endpoint to the root decoder and iteratively applying this
+ * calculation:
+ *
+ * position = position * parent_ways + parent_pos;
+ *
+ * ...where @position is inferred from switch and root decoder target lists.
+ *
+ * Return: position >= 0 on success
+ * -ENXIO on failure
+ */
+static int cxl_calc_interleave_pos(struct cxl_endpoint_decoder *cxled)
+{
+ struct cxl_port *iter, *port = cxled_to_port(cxled);
+ struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+ struct range *range = &cxled->cxld.hpa_range;
+ int parent_ways = 0, parent_pos = 0, pos = 0;
+ int rc;
+
+ /*
+ * Example: the expected interleave order of the 4-way region shown
+ * below is: mem0, mem2, mem1, mem3
+ *
+ * root_port
+ * / \
+ * host_bridge_0 host_bridge_1
+ * | | | |
+ * mem0 mem1 mem2 mem3
+ *
+ * In the example the calculator will iterate twice. The first iteration
+ * uses the mem position in the host-bridge and the ways of the host-
+ * bridge to generate the first, or local, position. The second
+ * iteration uses the host-bridge position in the root_port and the ways
+ * of the root_port to refine the position.
+ *
+ * A trace of the calculation per endpoint looks like this:
+ * mem0: pos = 0 * 2 + 0 mem2: pos = 0 * 2 + 0
+ * pos = 0 * 2 + 0 pos = 0 * 2 + 1
+ * pos: 0 pos: 1
+ *
+ * mem1: pos = 0 * 2 + 1 mem3: pos = 0 * 2 + 1
+ * pos = 1 * 2 + 0 pos = 1 * 2 + 1
+ * pos: 2 pos = 3
+ *
+ * Note that while this example is simple, the method applies to more
+ * complex topologies, including those with switches.
+ */
+
+ /* Iterate from endpoint to root_port refining the position */
+ for (iter = port; iter; iter = parent_port_of(iter)) {
+ if (is_cxl_root(iter))
+ break;
+
+ rc = find_pos_and_ways(iter, range, &parent_pos, &parent_ways);
+ if (rc)
+ return rc;
+
+ pos = pos * parent_ways + parent_pos;
+ }
+
+ dev_dbg(&cxlmd->dev,
+ "decoder:%s parent:%s port:%s range:%#llx-%#llx pos:%d\n",
+ dev_name(&cxled->cxld.dev), dev_name(cxlmd->dev.parent),
+ dev_name(&port->dev), range->start, range->end, pos);
+
+ return pos;
+}
+
+static int cxl_region_sort_targets(struct cxl_region *cxlr)
+{
+ struct cxl_region_params *p = &cxlr->params;
+ int i, rc = 0;
+
+ for (i = 0; i < p->nr_targets; i++) {
+ struct cxl_endpoint_decoder *cxled = p->targets[i];
+
+ cxled->pos = cxl_calc_interleave_pos(cxled);
+ /*
+ * Record that sorting failed, but still continue to calc
+ * cxled->pos so that follow-on code paths can reliably
+ * do p->targets[cxled->pos] to self-reference their entry.
+ */
+ if (cxled->pos < 0)
+ rc = -ENXIO;
+ }
+ /* Keep the cxlr target list in interleave position order */
+ sort(p->targets, p->nr_targets, sizeof(p->targets[0]),
+ cmp_interleave_pos, NULL);
+
+ dev_dbg(&cxlr->dev, "region sort %s\n", rc ? "failed" : "successful");
+ return rc;
+}
+
+static int cxl_region_attach(struct cxl_region *cxlr,
+ struct cxl_endpoint_decoder *cxled, int pos)
+{
+ struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
+ struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+ struct cxl_region_params *p = &cxlr->params;
+ struct cxl_port *ep_port, *root_port;
+ struct cxl_dport *dport;
+ int rc = -ENXIO;
+
+ rc = check_interleave_cap(&cxled->cxld, p->interleave_ways,
+ p->interleave_granularity);
+ if (rc) {
+ dev_dbg(&cxlr->dev, "%s iw: %d ig: %d is not supported\n",
+ dev_name(&cxled->cxld.dev), p->interleave_ways,
+ p->interleave_granularity);
+ return rc;
+ }
+
+ if (cxled->part < 0) {
+ dev_dbg(&cxlr->dev, "%s dead\n", dev_name(&cxled->cxld.dev));
+ return -ENODEV;
+ }
+
+ if (cxlds->part[cxled->part].mode != cxlr->mode) {
+ dev_dbg(&cxlr->dev, "%s region mode: %d mismatch\n",
+ dev_name(&cxled->cxld.dev), cxlr->mode);
+ return -EINVAL;
+ }
+
+ /* all full of members, or interleave config not established? */
+ if (p->state > CXL_CONFIG_INTERLEAVE_ACTIVE) {
+ dev_dbg(&cxlr->dev, "region already active\n");
+ return -EBUSY;
+ }
+
+ if (p->state < CXL_CONFIG_INTERLEAVE_ACTIVE) {
+ dev_dbg(&cxlr->dev, "interleave config missing\n");
+ return -ENXIO;
+ }
+
+ if (p->nr_targets >= p->interleave_ways) {
+ dev_dbg(&cxlr->dev, "region already has %d endpoints\n",
+ p->nr_targets);
+ return -EINVAL;
+ }
+
ep_port = cxled_to_port(cxled);
root_port = cxlrd_to_port(cxlrd);
dport = cxl_find_dport_by_dev(root_port, ep_port->host_bridge);
@@ -1250,13 +2026,6 @@ static int cxl_region_attach(struct cxl_region *cxlr,
return -ENXIO;
}
- if (cxlrd->calc_hb(cxlrd, pos) != dport) {
- dev_dbg(&cxlr->dev, "%s:%s invalid target position for %s\n",
- dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
- dev_name(&cxlrd->cxlsd.cxld.dev));
- return -ENXIO;
- }
-
if (cxled->cxld.target_type != cxlr->type) {
dev_dbg(&cxlr->dev, "%s:%s type mismatch: %d vs %d\n",
dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
@@ -1270,23 +2039,71 @@ static int cxl_region_attach(struct cxl_region *cxlr,
return -ENXIO;
}
- if (resource_size(cxled->dpa_res) * p->interleave_ways !=
+ if (resource_size(cxled->dpa_res) * p->interleave_ways + p->cache_size !=
resource_size(p->res)) {
dev_dbg(&cxlr->dev,
- "%s:%s: decoder-size-%#llx * ways-%d != region-size-%#llx\n",
+ "%s:%s-size-%#llx * ways-%d + cache-%#llx != region-size-%#llx\n",
dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
(u64)resource_size(cxled->dpa_res), p->interleave_ways,
- (u64)resource_size(p->res));
+ (u64)p->cache_size, (u64)resource_size(p->res));
return -EINVAL;
}
- for (iter = ep_port; !is_cxl_root(iter);
- iter = to_cxl_port(iter->dev.parent)) {
- rc = cxl_port_attach_region(iter, cxlr, cxled, pos);
+ cxl_region_perf_data_calculate(cxlr, cxled);
+
+ if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) {
+ int i;
+
+ rc = cxl_region_attach_auto(cxlr, cxled, pos);
if (rc)
- goto err;
+ return rc;
+
+ /* await more targets to arrive... */
+ if (p->nr_targets < p->interleave_ways)
+ return 0;
+
+ /*
+ * All targets are here, which implies all PCI enumeration that
+ * affects this region has been completed. Walk the topology to
+ * sort the devices into their relative region decode position.
+ */
+ rc = cxl_region_sort_targets(cxlr);
+ if (rc)
+ return rc;
+
+ for (i = 0; i < p->nr_targets; i++) {
+ cxled = p->targets[i];
+ ep_port = cxled_to_port(cxled);
+ dport = cxl_find_dport_by_dev(root_port,
+ ep_port->host_bridge);
+ rc = cxl_region_attach_position(cxlr, cxlrd, cxled,
+ dport, i);
+ if (rc)
+ return rc;
+ }
+
+ rc = cxl_region_setup_targets(cxlr);
+ if (rc)
+ return rc;
+
+ /*
+ * If target setup succeeds in the autodiscovery case
+ * then the region is already committed.
+ */
+ p->state = CXL_CONFIG_COMMIT;
+ cxl_region_shared_upstream_bandwidth_update(cxlr);
+
+ return 0;
}
+ rc = cxl_region_validate_position(cxlr, cxled, pos);
+ if (rc)
+ return rc;
+
+ rc = cxl_region_attach_position(cxlr, cxlrd, cxled, dport, pos);
+ if (rc)
+ return rc;
+
p->targets[pos] = cxled;
cxled->pos = pos;
p->nr_targets++;
@@ -1294,8 +2111,9 @@ static int cxl_region_attach(struct cxl_region *cxlr,
if (p->nr_targets == p->interleave_ways) {
rc = cxl_region_setup_targets(cxlr);
if (rc)
- goto err_decrement;
+ return rc;
p->state = CXL_CONFIG_ACTIVE;
+ cxl_region_shared_upstream_bandwidth_update(cxlr);
}
cxled->cxld.interleave_ways = p->interleave_ways;
@@ -1305,44 +2123,66 @@ static int cxl_region_attach(struct cxl_region *cxlr,
.end = p->res->end,
};
- return 0;
+ if (p->nr_targets != p->interleave_ways)
+ return 0;
-err_decrement:
- p->nr_targets--;
-err:
- for (iter = ep_port; !is_cxl_root(iter);
- iter = to_cxl_port(iter->dev.parent))
- cxl_port_detach_region(iter, cxlr, cxled);
- return rc;
+ /*
+ * Test the auto-discovery position calculator function
+ * against this successfully created user-defined region.
+ * A fail message here means that this interleave config
+ * will fail when presented as CXL_REGION_F_AUTO.
+ */
+ for (int i = 0; i < p->nr_targets; i++) {
+ struct cxl_endpoint_decoder *cxled = p->targets[i];
+ int test_pos;
+
+ test_pos = cxl_calc_interleave_pos(cxled);
+ dev_dbg(&cxled->cxld.dev,
+ "Test cxl_calc_interleave_pos(): %s test_pos:%d cxled->pos:%d\n",
+ (test_pos == cxled->pos) ? "success" : "fail",
+ test_pos, cxled->pos);
+ }
+
+ return 0;
}
-static int cxl_region_detach(struct cxl_endpoint_decoder *cxled)
+static struct cxl_region *
+__cxl_decoder_detach(struct cxl_region *cxlr,
+ struct cxl_endpoint_decoder *cxled, int pos,
+ enum cxl_detach_mode mode)
{
- struct cxl_port *iter, *ep_port = cxled_to_port(cxled);
- struct cxl_region *cxlr = cxled->cxld.region;
struct cxl_region_params *p;
- int rc = 0;
- lockdep_assert_held_write(&cxl_region_rwsem);
+ lockdep_assert_held_write(&cxl_rwsem.region);
- if (!cxlr)
- return 0;
+ if (!cxled) {
+ p = &cxlr->params;
- p = &cxlr->params;
- get_device(&cxlr->dev);
+ if (pos >= p->interleave_ways) {
+ dev_dbg(&cxlr->dev, "position %d out of range %d\n",
+ pos, p->interleave_ways);
+ return NULL;
+ }
+
+ if (!p->targets[pos])
+ return NULL;
+ cxled = p->targets[pos];
+ } else {
+ cxlr = cxled->cxld.region;
+ if (!cxlr)
+ return NULL;
+ p = &cxlr->params;
+ }
+
+ if (mode == DETACH_INVALIDATE)
+ cxled->part = -1;
if (p->state > CXL_CONFIG_ACTIVE) {
- /*
- * TODO: tear down all impacted regions if a device is
- * removed out of order
- */
- rc = cxl_region_decode_reset(cxlr, p->interleave_ways);
- if (rc)
- goto out;
+ cxl_region_decode_reset(cxlr, p->interleave_ways);
p->state = CXL_CONFIG_ACTIVE;
}
- for (iter = ep_port; !is_cxl_root(iter);
+ for (struct cxl_port *iter = cxled_to_port(cxled); !is_cxl_root(iter);
iter = to_cxl_port(iter->dev.parent))
cxl_port_detach_region(iter, cxlr, cxled);
@@ -1353,7 +2193,7 @@ static int cxl_region_detach(struct cxl_endpoint_decoder *cxled)
dev_WARN_ONCE(&cxlr->dev, 1, "expected %s:%s at position %d\n",
dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
cxled->pos);
- goto out;
+ return NULL;
}
if (p->state == CXL_CONFIG_ACTIVE) {
@@ -1367,76 +2207,81 @@ static int cxl_region_detach(struct cxl_endpoint_decoder *cxled)
.end = -1,
};
- /* notify the region driver that one of its targets has departed */
- up_write(&cxl_region_rwsem);
- device_release_driver(&cxlr->dev);
- down_write(&cxl_region_rwsem);
-out:
- put_device(&cxlr->dev);
- return rc;
-}
-
-void cxl_decoder_kill_region(struct cxl_endpoint_decoder *cxled)
-{
- down_write(&cxl_region_rwsem);
- cxled->mode = CXL_DECODER_DEAD;
- cxl_region_detach(cxled);
- up_write(&cxl_region_rwsem);
+ get_device(&cxlr->dev);
+ return cxlr;
}
-static int attach_target(struct cxl_region *cxlr, const char *decoder, int pos)
+/*
+ * Cleanup a decoder's interest in a region. There are 2 cases to
+ * handle, removing an unknown @cxled from a known position in a region
+ * (detach_target()) or removing a known @cxled from an unknown @cxlr
+ * (cxld_unregister())
+ *
+ * When the detachment finds a region release the region driver.
+ */
+int cxl_decoder_detach(struct cxl_region *cxlr,
+ struct cxl_endpoint_decoder *cxled, int pos,
+ enum cxl_detach_mode mode)
{
- struct device *dev;
- int rc;
+ struct cxl_region *detach;
- dev = bus_find_device_by_name(&cxl_bus_type, NULL, decoder);
- if (!dev)
- return -ENODEV;
+ /* when the decoder is being destroyed lock unconditionally */
+ if (mode == DETACH_INVALIDATE) {
+ guard(rwsem_write)(&cxl_rwsem.region);
+ detach = __cxl_decoder_detach(cxlr, cxled, pos, mode);
+ } else {
+ int rc;
- if (!is_endpoint_decoder(dev)) {
- put_device(dev);
- return -EINVAL;
+ ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region);
+ if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem)))
+ return rc;
+ detach = __cxl_decoder_detach(cxlr, cxled, pos, mode);
}
- rc = down_write_killable(&cxl_region_rwsem);
- if (rc)
- goto out;
- down_read(&cxl_dpa_rwsem);
- rc = cxl_region_attach(cxlr, to_cxl_endpoint_decoder(dev), pos);
- up_read(&cxl_dpa_rwsem);
- up_write(&cxl_region_rwsem);
-out:
- put_device(dev);
- return rc;
+ if (detach) {
+ device_release_driver(&detach->dev);
+ put_device(&detach->dev);
+ }
+ return 0;
}
-static int detach_target(struct cxl_region *cxlr, int pos)
+static int __attach_target(struct cxl_region *cxlr,
+ struct cxl_endpoint_decoder *cxled, int pos,
+ unsigned int state)
{
- struct cxl_region_params *p = &cxlr->params;
int rc;
- rc = down_write_killable(&cxl_region_rwsem);
- if (rc)
- return rc;
-
- if (pos >= p->interleave_ways) {
- dev_dbg(&cxlr->dev, "position %d out of range %d\n", pos,
- p->interleave_ways);
- rc = -ENXIO;
- goto out;
+ if (state == TASK_INTERRUPTIBLE) {
+ ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region);
+ if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem)))
+ return rc;
+ guard(rwsem_read)(&cxl_rwsem.dpa);
+ return cxl_region_attach(cxlr, cxled, pos);
}
+ guard(rwsem_write)(&cxl_rwsem.region);
+ guard(rwsem_read)(&cxl_rwsem.dpa);
+ return cxl_region_attach(cxlr, cxled, pos);
+}
- if (!p->targets[pos]) {
- rc = 0;
- goto out;
- }
+static int attach_target(struct cxl_region *cxlr,
+ struct cxl_endpoint_decoder *cxled, int pos,
+ unsigned int state)
+{
+ int rc = __attach_target(cxlr, cxled, pos, state);
- rc = cxl_region_detach(p->targets[pos]);
-out:
- up_write(&cxl_region_rwsem);
+ if (rc == 0)
+ return 0;
+
+ dev_warn(cxled->cxld.dev.parent, "failed to attach %s to %s: %d\n",
+ dev_name(&cxled->cxld.dev), dev_name(&cxlr->dev), rc);
return rc;
}
+static int detach_target(struct cxl_region *cxlr, int pos)
+{
+ return cxl_decoder_detach(cxlr, NULL, pos, DETACH_ONLY);
+}
+
static size_t store_targetN(struct cxl_region *cxlr, const char *buf, int pos,
size_t len)
{
@@ -1444,8 +2289,23 @@ static size_t store_targetN(struct cxl_region *cxlr, const char *buf, int pos,
if (sysfs_streq(buf, "\n"))
rc = detach_target(cxlr, pos);
- else
- rc = attach_target(cxlr, buf, pos);
+ else {
+ struct device *dev;
+
+ dev = bus_find_device_by_name(&cxl_bus_type, NULL, buf);
+ if (!dev)
+ return -ENODEV;
+
+ if (!is_endpoint_decoder(dev)) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ rc = attach_target(cxlr, to_cxl_endpoint_decoder(dev), pos,
+ TASK_INTERRUPTIBLE);
+out:
+ put_device(dev);
+ }
if (rc < 0)
return rc;
@@ -1529,6 +2389,8 @@ static const struct attribute_group *region_groups[] = {
&cxl_base_attribute_group,
&cxl_region_group,
&cxl_region_target_group,
+ &cxl_region_access0_coordinate_group,
+ &cxl_region_access1_coordinate_group,
NULL,
};
@@ -1565,7 +2427,7 @@ bool is_cxl_region(struct device *dev)
{
return dev->type == &cxl_region_type;
}
-EXPORT_SYMBOL_NS_GPL(is_cxl_region, CXL);
+EXPORT_SYMBOL_NS_GPL(is_cxl_region, "CXL");
static struct cxl_region *to_cxl_region(struct device *dev)
{
@@ -1576,13 +2438,13 @@ static struct cxl_region *to_cxl_region(struct device *dev)
return container_of(dev, struct cxl_region, dev);
}
-static void unregister_region(void *dev)
+static void unregister_region(void *_cxlr)
{
- struct cxl_region *cxlr = to_cxl_region(dev);
+ struct cxl_region *cxlr = _cxlr;
struct cxl_region_params *p = &cxlr->params;
int i;
- device_del(dev);
+ device_del(&cxlr->dev);
/*
* Now that region sysfs is shutdown, the parameter block is now
@@ -1593,7 +2455,7 @@ static void unregister_region(void *dev)
detach_target(cxlr, i);
cxl_region_iomem_release(cxlr);
- put_device(dev);
+ put_device(&cxlr->dev);
}
static struct lock_class_key cxl_region_key;
@@ -1622,10 +2484,92 @@ static struct cxl_region *cxl_region_alloc(struct cxl_root_decoder *cxlrd, int i
dev->bus = &cxl_bus_type;
dev->type = &cxl_region_type;
cxlr->id = id;
+ cxl_region_set_lock(cxlr, &cxlrd->cxlsd.cxld);
return cxlr;
}
+static bool cxl_region_update_coordinates(struct cxl_region *cxlr, int nid)
+{
+ int cset = 0;
+ int rc;
+
+ for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) {
+ if (cxlr->coord[i].read_bandwidth) {
+ node_update_perf_attrs(nid, &cxlr->coord[i], i);
+ cset++;
+ }
+ }
+
+ if (!cset)
+ return false;
+
+ rc = sysfs_update_group(&cxlr->dev.kobj, get_cxl_region_access0_group());
+ if (rc)
+ dev_dbg(&cxlr->dev, "Failed to update access0 group\n");
+
+ rc = sysfs_update_group(&cxlr->dev.kobj, get_cxl_region_access1_group());
+ if (rc)
+ dev_dbg(&cxlr->dev, "Failed to update access1 group\n");
+
+ return true;
+}
+
+static int cxl_region_perf_attrs_callback(struct notifier_block *nb,
+ unsigned long action, void *arg)
+{
+ struct cxl_region *cxlr = container_of(nb, struct cxl_region,
+ node_notifier);
+ struct node_notify *nn = arg;
+ int nid = nn->nid;
+ int region_nid;
+
+ if (action != NODE_ADDED_FIRST_MEMORY)
+ return NOTIFY_DONE;
+
+ /*
+ * No need to hold cxl_rwsem.region; region parameters are stable
+ * within the cxl_region driver.
+ */
+ region_nid = phys_to_target_node(cxlr->params.res->start);
+ if (nid != region_nid)
+ return NOTIFY_DONE;
+
+ /* No action needed if node bit already set */
+ if (node_test_and_set(nid, nodemask_region_seen))
+ return NOTIFY_DONE;
+
+ if (!cxl_region_update_coordinates(cxlr, nid))
+ return NOTIFY_DONE;
+
+ return NOTIFY_OK;
+}
+
+static int cxl_region_calculate_adistance(struct notifier_block *nb,
+ unsigned long nid, void *data)
+{
+ struct cxl_region *cxlr = container_of(nb, struct cxl_region,
+ adist_notifier);
+ struct access_coordinate *perf;
+ int *adist = data;
+ int region_nid;
+
+ /*
+ * No need to hold cxl_rwsem.region; region parameters are stable
+ * within the cxl_region driver.
+ */
+ region_nid = phys_to_target_node(cxlr->params.res->start);
+ if (nid != region_nid)
+ return NOTIFY_OK;
+
+ perf = &cxlr->coord[ACCESS_COORDINATE_CPU];
+
+ if (mt_perf_to_adistance(perf, adist))
+ return NOTIFY_OK;
+
+ return NOTIFY_STOP;
+}
+
/**
* devm_cxl_add_region - Adds a region to a decoder
* @cxlrd: root decoder
@@ -1641,7 +2585,7 @@ static struct cxl_region *cxl_region_alloc(struct cxl_root_decoder *cxlrd, int i
*/
static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd,
int id,
- enum cxl_decoder_mode mode,
+ enum cxl_partition_mode mode,
enum cxl_decoder_type type)
{
struct cxl_port *port = to_cxl_port(cxlrd->cxlsd.cxld.dev.parent);
@@ -1664,11 +2608,11 @@ static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd,
if (rc)
goto err;
- rc = devm_add_action_or_reset(port->uport, unregister_region, cxlr);
+ rc = devm_add_action_or_reset(port->uport_dev, unregister_region, cxlr);
if (rc)
return ERR_PTR(rc);
- dev_dbg(port->uport, "%s: created %s\n",
+ dev_dbg(port->uport_dev, "%s: created %s\n",
dev_name(&cxlrd->cxlsd.cxld.dev), dev_name(dev));
return cxlr;
@@ -1677,61 +2621,96 @@ err:
return ERR_PTR(rc);
}
+static ssize_t __create_region_show(struct cxl_root_decoder *cxlrd, char *buf)
+{
+ return sysfs_emit(buf, "region%u\n", atomic_read(&cxlrd->region_id));
+}
+
static ssize_t create_pmem_region_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev);
+ return __create_region_show(to_cxl_root_decoder(dev), buf);
+}
- return sysfs_emit(buf, "region%u\n", atomic_read(&cxlrd->region_id));
+static ssize_t create_ram_region_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return __create_region_show(to_cxl_root_decoder(dev), buf);
}
-static ssize_t create_pmem_region_store(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t len)
+static struct cxl_region *__create_region(struct cxl_root_decoder *cxlrd,
+ enum cxl_partition_mode mode, int id)
{
- struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev);
- struct cxl_region *cxlr;
- int id, rc;
+ int rc;
- rc = sscanf(buf, "region%d\n", &id);
- if (rc != 1)
- return -EINVAL;
+ switch (mode) {
+ case CXL_PARTMODE_RAM:
+ case CXL_PARTMODE_PMEM:
+ break;
+ default:
+ dev_err(&cxlrd->cxlsd.cxld.dev, "unsupported mode %d\n", mode);
+ return ERR_PTR(-EINVAL);
+ }
rc = memregion_alloc(GFP_KERNEL);
if (rc < 0)
- return rc;
+ return ERR_PTR(rc);
if (atomic_cmpxchg(&cxlrd->region_id, id, rc) != id) {
memregion_free(rc);
- return -EBUSY;
+ return ERR_PTR(-EBUSY);
}
- cxlr = devm_cxl_add_region(cxlrd, id, CXL_DECODER_PMEM,
- CXL_DECODER_EXPANDER);
+ return devm_cxl_add_region(cxlrd, id, mode, CXL_DECODER_HOSTONLYMEM);
+}
+
+static ssize_t create_region_store(struct device *dev, const char *buf,
+ size_t len, enum cxl_partition_mode mode)
+{
+ struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev);
+ struct cxl_region *cxlr;
+ int rc, id;
+
+ rc = sscanf(buf, "region%d\n", &id);
+ if (rc != 1)
+ return -EINVAL;
+
+ cxlr = __create_region(cxlrd, mode, id);
if (IS_ERR(cxlr))
return PTR_ERR(cxlr);
return len;
}
+
+static ssize_t create_pmem_region_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return create_region_store(dev, buf, len, CXL_PARTMODE_PMEM);
+}
DEVICE_ATTR_RW(create_pmem_region);
+static ssize_t create_ram_region_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return create_region_store(dev, buf, len, CXL_PARTMODE_RAM);
+}
+DEVICE_ATTR_RW(create_ram_region);
+
static ssize_t region_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct cxl_decoder *cxld = to_cxl_decoder(dev);
ssize_t rc;
- rc = down_read_interruptible(&cxl_region_rwsem);
- if (rc)
+ ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
+ if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
return rc;
if (cxld->region)
- rc = sysfs_emit(buf, "%s\n", dev_name(&cxld->region->dev));
- else
- rc = sysfs_emit(buf, "\n");
- up_read(&cxl_region_rwsem);
-
- return rc;
+ return sysfs_emit(buf, "%s\n", dev_name(&cxld->region->dev));
+ return sysfs_emit(buf, "\n");
}
DEVICE_ATTR_RO(region);
@@ -1760,7 +2739,7 @@ static ssize_t delete_region_store(struct device *dev,
if (IS_ERR(cxlr))
return PTR_ERR(cxlr);
- devm_release_action(port->uport, unregister_region, cxlr);
+ devm_release_action(port->uport_dev, unregister_region, cxlr);
put_device(&cxlr->dev);
return len;
@@ -1796,7 +2775,7 @@ bool is_cxl_pmem_region(struct device *dev)
{
return dev->type == &cxl_pmem_region_type;
}
-EXPORT_SYMBOL_NS_GPL(is_cxl_pmem_region, CXL);
+EXPORT_SYMBOL_NS_GPL(is_cxl_pmem_region, "CXL");
struct cxl_pmem_region *to_cxl_pmem_region(struct device *dev)
{
@@ -1805,40 +2784,477 @@ struct cxl_pmem_region *to_cxl_pmem_region(struct device *dev)
return NULL;
return container_of(dev, struct cxl_pmem_region, dev);
}
-EXPORT_SYMBOL_NS_GPL(to_cxl_pmem_region, CXL);
+EXPORT_SYMBOL_NS_GPL(to_cxl_pmem_region, "CXL");
+
+struct cxl_poison_context {
+ struct cxl_port *port;
+ int part;
+ u64 offset;
+};
+
+static int cxl_get_poison_unmapped(struct cxl_memdev *cxlmd,
+ struct cxl_poison_context *ctx)
+{
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+ const struct resource *res;
+ struct resource *p, *last;
+ u64 offset, length;
+ int rc = 0;
+
+ if (ctx->part < 0)
+ return 0;
+
+ /*
+ * Collect poison for the remaining unmapped resources after
+ * poison is collected by committed endpoints decoders.
+ */
+ for (int i = ctx->part; i < cxlds->nr_partitions; i++) {
+ res = &cxlds->part[i].res;
+ for (p = res->child, last = NULL; p; p = p->sibling)
+ last = p;
+ if (last)
+ offset = last->end + 1;
+ else
+ offset = res->start;
+ length = res->end - offset + 1;
+ if (!length)
+ break;
+ rc = cxl_mem_get_poison(cxlmd, offset, length, NULL);
+ if (rc == -EFAULT && cxlds->part[i].mode == CXL_PARTMODE_RAM)
+ continue;
+ if (rc)
+ break;
+ }
+
+ return rc;
+}
+
+static int poison_by_decoder(struct device *dev, void *arg)
+{
+ struct cxl_poison_context *ctx = arg;
+ struct cxl_endpoint_decoder *cxled;
+ enum cxl_partition_mode mode;
+ struct cxl_dev_state *cxlds;
+ struct cxl_memdev *cxlmd;
+ u64 offset, length;
+ int rc = 0;
+
+ if (!is_endpoint_decoder(dev))
+ return rc;
+
+ cxled = to_cxl_endpoint_decoder(dev);
+ if (!cxled->dpa_res)
+ return rc;
+
+ cxlmd = cxled_to_memdev(cxled);
+ cxlds = cxlmd->cxlds;
+ mode = cxlds->part[cxled->part].mode;
+
+ if (cxled->skip) {
+ offset = cxled->dpa_res->start - cxled->skip;
+ length = cxled->skip;
+ rc = cxl_mem_get_poison(cxlmd, offset, length, NULL);
+ if (rc == -EFAULT && mode == CXL_PARTMODE_RAM)
+ rc = 0;
+ if (rc)
+ return rc;
+ }
+
+ offset = cxled->dpa_res->start;
+ length = cxled->dpa_res->end - offset + 1;
+ rc = cxl_mem_get_poison(cxlmd, offset, length, cxled->cxld.region);
+ if (rc == -EFAULT && mode == CXL_PARTMODE_RAM)
+ rc = 0;
+ if (rc)
+ return rc;
+
+ /* Iterate until commit_end is reached */
+ if (cxled->cxld.id == ctx->port->commit_end) {
+ ctx->offset = cxled->dpa_res->end + 1;
+ ctx->part = cxled->part;
+ return 1;
+ }
+
+ return 0;
+}
+
+int cxl_get_poison_by_endpoint(struct cxl_port *port)
+{
+ struct cxl_poison_context ctx;
+ int rc = 0;
+
+ ctx = (struct cxl_poison_context) {
+ .port = port,
+ .part = -1,
+ };
+
+ rc = device_for_each_child(&port->dev, &ctx, poison_by_decoder);
+ if (rc == 1)
+ rc = cxl_get_poison_unmapped(to_cxl_memdev(port->uport_dev),
+ &ctx);
+
+ return rc;
+}
+
+struct cxl_dpa_to_region_context {
+ struct cxl_region *cxlr;
+ u64 dpa;
+};
+
+static int __cxl_dpa_to_region(struct device *dev, void *arg)
+{
+ struct cxl_dpa_to_region_context *ctx = arg;
+ struct cxl_endpoint_decoder *cxled;
+ struct cxl_region *cxlr;
+ u64 dpa = ctx->dpa;
+
+ if (!is_endpoint_decoder(dev))
+ return 0;
+
+ cxled = to_cxl_endpoint_decoder(dev);
+ if (!cxled || !cxled->dpa_res || !resource_size(cxled->dpa_res))
+ return 0;
+
+ if (!cxl_resource_contains_addr(cxled->dpa_res, dpa))
+ return 0;
+
+ /*
+ * Stop the region search (return 1) when an endpoint mapping is
+ * found. The region may not be fully constructed so offering
+ * the cxlr in the context structure is not guaranteed.
+ */
+ cxlr = cxled->cxld.region;
+ if (cxlr)
+ dev_dbg(dev, "dpa:0x%llx mapped in region:%s\n", dpa,
+ dev_name(&cxlr->dev));
+ else
+ dev_dbg(dev, "dpa:0x%llx mapped in endpoint:%s\n", dpa,
+ dev_name(dev));
+
+ ctx->cxlr = cxlr;
+
+ return 1;
+}
+
+struct cxl_region *cxl_dpa_to_region(const struct cxl_memdev *cxlmd, u64 dpa)
+{
+ struct cxl_dpa_to_region_context ctx;
+ struct cxl_port *port;
+
+ ctx = (struct cxl_dpa_to_region_context) {
+ .dpa = dpa,
+ };
+ port = cxlmd->endpoint;
+ if (port && is_cxl_endpoint(port) && cxl_num_decoders_committed(port))
+ device_for_each_child(&port->dev, &ctx, __cxl_dpa_to_region);
+
+ return ctx.cxlr;
+}
+
+static bool cxl_is_hpa_in_chunk(u64 hpa, struct cxl_region *cxlr, int pos)
+{
+ struct cxl_region_params *p = &cxlr->params;
+ int gran = p->interleave_granularity;
+ int ways = p->interleave_ways;
+ u64 offset;
+
+ /* Is the hpa in an expected chunk for its pos(-ition) */
+ offset = hpa - p->res->start;
+ offset = do_div(offset, gran * ways);
+ if ((offset >= pos * gran) && (offset < (pos + 1) * gran))
+ return true;
+
+ dev_dbg(&cxlr->dev,
+ "Addr trans fail: hpa 0x%llx not in expected chunk\n", hpa);
+
+ return false;
+}
+
+#define CXL_POS_ZERO 0
+/**
+ * cxl_validate_translation_params
+ * @eiw: encoded interleave ways
+ * @eig: encoded interleave granularity
+ * @pos: position in interleave
+ *
+ * Callers pass CXL_POS_ZERO when no position parameter needs validating.
+ *
+ * Returns: 0 on success, -EINVAL on first invalid parameter
+ */
+int cxl_validate_translation_params(u8 eiw, u16 eig, int pos)
+{
+ int ways, gran;
+
+ if (eiw_to_ways(eiw, &ways)) {
+ pr_debug("%s: invalid eiw=%u\n", __func__, eiw);
+ return -EINVAL;
+ }
+ if (eig_to_granularity(eig, &gran)) {
+ pr_debug("%s: invalid eig=%u\n", __func__, eig);
+ return -EINVAL;
+ }
+ if (pos < 0 || pos >= ways) {
+ pr_debug("%s: invalid pos=%d for ways=%u\n", __func__, pos,
+ ways);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_FOR_MODULES(cxl_validate_translation_params, "cxl_translate");
+
+u64 cxl_calculate_dpa_offset(u64 hpa_offset, u8 eiw, u16 eig)
+{
+ u64 dpa_offset, bits_lower, bits_upper, temp;
+ int ret;
+
+ ret = cxl_validate_translation_params(eiw, eig, CXL_POS_ZERO);
+ if (ret)
+ return ULLONG_MAX;
+
+ /*
+ * DPA offset: CXL Spec 3.2 Section 8.2.4.20.13
+ * Lower bits [IG+7:0] pass through unchanged
+ * (eiw < 8)
+ * Per spec: DPAOffset[51:IG+8] = (HPAOffset[51:IG+IW+8] >> IW)
+ * Clear the position bits to isolate upper section, then
+ * reverse the left shift by eiw that occurred during DPA->HPA
+ * (eiw >= 8)
+ * Per spec: DPAOffset[51:IG+8] = HPAOffset[51:IG+IW] / 3
+ * Extract upper bits from the correct bit range and divide by 3
+ * to recover the original DPA upper bits
+ */
+ bits_lower = hpa_offset & GENMASK_ULL(eig + 7, 0);
+ if (eiw < 8) {
+ temp = hpa_offset &= ~GENMASK_ULL(eig + eiw + 8 - 1, 0);
+ dpa_offset = temp >> eiw;
+ } else {
+ bits_upper = div64_u64(hpa_offset >> (eig + eiw), 3);
+ dpa_offset = bits_upper << (eig + 8);
+ }
+ dpa_offset |= bits_lower;
+
+ return dpa_offset;
+}
+EXPORT_SYMBOL_FOR_MODULES(cxl_calculate_dpa_offset, "cxl_translate");
+
+int cxl_calculate_position(u64 hpa_offset, u8 eiw, u16 eig)
+{
+ unsigned int ways = 0;
+ u64 shifted, rem;
+ int pos, ret;
+
+ ret = cxl_validate_translation_params(eiw, eig, CXL_POS_ZERO);
+ if (ret)
+ return ret;
+
+ if (!eiw)
+ /* position is 0 if no interleaving */
+ return 0;
+
+ /*
+ * Interleave position: CXL Spec 3.2 Section 8.2.4.20.13
+ * eiw < 8
+ * Position is in the IW bits at HPA_OFFSET[IG+8+IW-1:IG+8].
+ * Per spec "remove IW bits starting with bit position IG+8"
+ * eiw >= 8
+ * Position is not explicitly stored in HPA_OFFSET bits. It is
+ * derived from the modulo operation of the upper bits using
+ * the total number of interleave ways.
+ */
+ if (eiw < 8) {
+ pos = (hpa_offset >> (eig + 8)) & GENMASK(eiw - 1, 0);
+ } else {
+ shifted = hpa_offset >> (eig + 8);
+ eiw_to_ways(eiw, &ways);
+ div64_u64_rem(shifted, ways, &rem);
+ pos = rem;
+ }
+
+ return pos;
+}
+EXPORT_SYMBOL_FOR_MODULES(cxl_calculate_position, "cxl_translate");
+
+u64 cxl_calculate_hpa_offset(u64 dpa_offset, int pos, u8 eiw, u16 eig)
+{
+ u64 mask_upper, hpa_offset, bits_upper;
+ int ret;
+
+ ret = cxl_validate_translation_params(eiw, eig, pos);
+ if (ret)
+ return ULLONG_MAX;
+
+ /*
+ * The device position in the region interleave set was removed
+ * from the offset at HPA->DPA translation. To reconstruct the
+ * HPA, place the 'pos' in the offset.
+ *
+ * The placement of 'pos' in the HPA is determined by interleave
+ * ways and granularity and is defined in the CXL Spec 3.0 Section
+ * 8.2.4.19.13 Implementation Note: Device Decode Logic
+ */
+
+ mask_upper = GENMASK_ULL(51, eig + 8);
+
+ if (eiw < 8) {
+ hpa_offset = (dpa_offset & mask_upper) << eiw;
+ hpa_offset |= pos << (eig + 8);
+ } else {
+ bits_upper = (dpa_offset & mask_upper) >> (eig + 8);
+ bits_upper = bits_upper * 3;
+ hpa_offset = ((bits_upper << (eiw - 8)) + pos) << (eig + 8);
+ }
+
+ /* The lower bits remain unchanged */
+ hpa_offset |= dpa_offset & GENMASK_ULL(eig + 7, 0);
+
+ return hpa_offset;
+}
+EXPORT_SYMBOL_FOR_MODULES(cxl_calculate_hpa_offset, "cxl_translate");
+
+u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd,
+ u64 dpa)
+{
+ struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
+ struct cxl_region_params *p = &cxlr->params;
+ struct cxl_endpoint_decoder *cxled = NULL;
+ u64 dpa_offset, hpa_offset, hpa;
+ u16 eig = 0;
+ u8 eiw = 0;
+ int pos;
+
+ for (int i = 0; i < p->nr_targets; i++) {
+ if (cxlmd == cxled_to_memdev(p->targets[i])) {
+ cxled = p->targets[i];
+ break;
+ }
+ }
+ if (!cxled)
+ return ULLONG_MAX;
+
+ pos = cxled->pos;
+ ways_to_eiw(p->interleave_ways, &eiw);
+ granularity_to_eig(p->interleave_granularity, &eig);
+
+ dpa_offset = dpa - cxl_dpa_resource_start(cxled);
+ hpa_offset = cxl_calculate_hpa_offset(dpa_offset, pos, eiw, eig);
+
+ /* Apply the hpa_offset to the region base address */
+ hpa = hpa_offset + p->res->start + p->cache_size;
+
+ /* Root decoder translation overrides typical modulo decode */
+ if (cxlrd->ops.hpa_to_spa)
+ hpa = cxlrd->ops.hpa_to_spa(cxlrd, hpa);
+
+ if (!cxl_resource_contains_addr(p->res, hpa)) {
+ dev_dbg(&cxlr->dev,
+ "Addr trans fail: hpa 0x%llx not in region\n", hpa);
+ return ULLONG_MAX;
+ }
+
+ /* Simple chunk check, by pos & gran, only applies to modulo decodes */
+ if (!cxlrd->ops.hpa_to_spa && !cxl_is_hpa_in_chunk(hpa, cxlr, pos))
+ return ULLONG_MAX;
+
+ return hpa;
+}
+
+struct dpa_result {
+ struct cxl_memdev *cxlmd;
+ u64 dpa;
+};
+
+static int region_offset_to_dpa_result(struct cxl_region *cxlr, u64 offset,
+ struct dpa_result *result)
+{
+ struct cxl_region_params *p = &cxlr->params;
+ struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
+ struct cxl_endpoint_decoder *cxled;
+ u64 hpa, hpa_offset, dpa_offset;
+ u16 eig = 0;
+ u8 eiw = 0;
+ int pos;
+
+ lockdep_assert_held(&cxl_rwsem.region);
+ lockdep_assert_held(&cxl_rwsem.dpa);
+
+ /* Input validation ensures valid ways and gran */
+ granularity_to_eig(p->interleave_granularity, &eig);
+ ways_to_eiw(p->interleave_ways, &eiw);
+
+ /*
+ * If the root decoder has SPA to CXL HPA callback, use it. Otherwise
+ * CXL HPA is assumed to equal SPA.
+ */
+ if (cxlrd->ops.spa_to_hpa) {
+ hpa = cxlrd->ops.spa_to_hpa(cxlrd, p->res->start + offset);
+ hpa_offset = hpa - p->res->start;
+ } else {
+ hpa_offset = offset;
+ }
+
+ pos = cxl_calculate_position(hpa_offset, eiw, eig);
+ if (pos < 0 || pos >= p->nr_targets) {
+ dev_dbg(&cxlr->dev, "Invalid position %d for %d targets\n",
+ pos, p->nr_targets);
+ return -ENXIO;
+ }
+
+ dpa_offset = cxl_calculate_dpa_offset(hpa_offset, eiw, eig);
+
+ /* Look-up and return the result: a memdev and a DPA */
+ for (int i = 0; i < p->nr_targets; i++) {
+ cxled = p->targets[i];
+ if (cxled->pos != pos)
+ continue;
+ result->cxlmd = cxled_to_memdev(cxled);
+ result->dpa = cxl_dpa_resource_start(cxled) + dpa_offset;
+
+ return 0;
+ }
+ dev_err(&cxlr->dev, "No device found for position %d\n", pos);
+
+ return -ENXIO;
+}
static struct lock_class_key cxl_pmem_region_key;
-static struct cxl_pmem_region *cxl_pmem_region_alloc(struct cxl_region *cxlr)
+static int cxl_pmem_region_alloc(struct cxl_region *cxlr)
{
struct cxl_region_params *p = &cxlr->params;
- struct cxl_pmem_region *cxlr_pmem;
+ struct cxl_nvdimm_bridge *cxl_nvb;
struct device *dev;
int i;
- down_read(&cxl_region_rwsem);
- if (p->state != CXL_CONFIG_COMMIT) {
- cxlr_pmem = ERR_PTR(-ENXIO);
- goto out;
- }
+ guard(rwsem_read)(&cxl_rwsem.region);
+ if (p->state != CXL_CONFIG_COMMIT)
+ return -ENXIO;
- cxlr_pmem = kzalloc(struct_size(cxlr_pmem, mapping, p->nr_targets),
- GFP_KERNEL);
- if (!cxlr_pmem) {
- cxlr_pmem = ERR_PTR(-ENOMEM);
- goto out;
- }
+ struct cxl_pmem_region *cxlr_pmem __free(kfree) =
+ kzalloc(struct_size(cxlr_pmem, mapping, p->nr_targets), GFP_KERNEL);
+ if (!cxlr_pmem)
+ return -ENOMEM;
cxlr_pmem->hpa_range.start = p->res->start;
cxlr_pmem->hpa_range.end = p->res->end;
- /* Snapshot the region configuration underneath the cxl_region_rwsem */
+ /* Snapshot the region configuration underneath the cxl_rwsem.region */
cxlr_pmem->nr_mappings = p->nr_targets;
for (i = 0; i < p->nr_targets; i++) {
struct cxl_endpoint_decoder *cxled = p->targets[i];
struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
struct cxl_pmem_region_mapping *m = &cxlr_pmem->mapping[i];
+ /*
+ * Regions never span CXL root devices, so by definition the
+ * bridge for one device is the same for all.
+ */
+ if (i == 0) {
+ cxl_nvb = cxl_find_nvdimm_bridge(cxlmd->endpoint);
+ if (!cxl_nvb)
+ return -ENODEV;
+ cxlr->cxl_nvb = cxl_nvb;
+ }
m->cxlmd = cxlmd;
get_device(&cxlmd->dev);
m->start = cxled->dpa_res->start;
@@ -1847,22 +3263,111 @@ static struct cxl_pmem_region *cxl_pmem_region_alloc(struct cxl_region *cxlr)
}
dev = &cxlr_pmem->dev;
- cxlr_pmem->cxlr = cxlr;
device_initialize(dev);
lockdep_set_class(&dev->mutex, &cxl_pmem_region_key);
device_set_pm_not_required(dev);
dev->parent = &cxlr->dev;
dev->bus = &cxl_bus_type;
dev->type = &cxl_pmem_region_type;
-out:
- up_read(&cxl_region_rwsem);
+ cxlr_pmem->cxlr = cxlr;
+ cxlr->cxlr_pmem = no_free_ptr(cxlr_pmem);
+
+ return 0;
+}
+
+static void cxl_dax_region_release(struct device *dev)
+{
+ struct cxl_dax_region *cxlr_dax = to_cxl_dax_region(dev);
+
+ kfree(cxlr_dax);
+}
+
+static const struct attribute_group *cxl_dax_region_attribute_groups[] = {
+ &cxl_base_attribute_group,
+ NULL,
+};
+
+const struct device_type cxl_dax_region_type = {
+ .name = "cxl_dax_region",
+ .release = cxl_dax_region_release,
+ .groups = cxl_dax_region_attribute_groups,
+};
- return cxlr_pmem;
+static bool is_cxl_dax_region(struct device *dev)
+{
+ return dev->type == &cxl_dax_region_type;
+}
+
+struct cxl_dax_region *to_cxl_dax_region(struct device *dev)
+{
+ if (dev_WARN_ONCE(dev, !is_cxl_dax_region(dev),
+ "not a cxl_dax_region device\n"))
+ return NULL;
+ return container_of(dev, struct cxl_dax_region, dev);
}
+EXPORT_SYMBOL_NS_GPL(to_cxl_dax_region, "CXL");
+
+static struct lock_class_key cxl_dax_region_key;
-static void cxlr_pmem_unregister(void *dev)
+static struct cxl_dax_region *cxl_dax_region_alloc(struct cxl_region *cxlr)
{
- device_unregister(dev);
+ struct cxl_region_params *p = &cxlr->params;
+ struct cxl_dax_region *cxlr_dax;
+ struct device *dev;
+
+ guard(rwsem_read)(&cxl_rwsem.region);
+ if (p->state != CXL_CONFIG_COMMIT)
+ return ERR_PTR(-ENXIO);
+
+ cxlr_dax = kzalloc(sizeof(*cxlr_dax), GFP_KERNEL);
+ if (!cxlr_dax)
+ return ERR_PTR(-ENOMEM);
+
+ cxlr_dax->hpa_range.start = p->res->start;
+ cxlr_dax->hpa_range.end = p->res->end;
+
+ dev = &cxlr_dax->dev;
+ cxlr_dax->cxlr = cxlr;
+ device_initialize(dev);
+ lockdep_set_class(&dev->mutex, &cxl_dax_region_key);
+ device_set_pm_not_required(dev);
+ dev->parent = &cxlr->dev;
+ dev->bus = &cxl_bus_type;
+ dev->type = &cxl_dax_region_type;
+
+ return cxlr_dax;
+}
+
+static void cxlr_pmem_unregister(void *_cxlr_pmem)
+{
+ struct cxl_pmem_region *cxlr_pmem = _cxlr_pmem;
+ struct cxl_region *cxlr = cxlr_pmem->cxlr;
+ struct cxl_nvdimm_bridge *cxl_nvb = cxlr->cxl_nvb;
+
+ /*
+ * Either the bridge is in ->remove() context under the device_lock(),
+ * or cxlr_release_nvdimm() is cancelling the bridge's release action
+ * for @cxlr_pmem and doing it itself (while manually holding the bridge
+ * lock).
+ */
+ device_lock_assert(&cxl_nvb->dev);
+ cxlr->cxlr_pmem = NULL;
+ cxlr_pmem->cxlr = NULL;
+ device_unregister(&cxlr_pmem->dev);
+}
+
+static void cxlr_release_nvdimm(void *_cxlr)
+{
+ struct cxl_region *cxlr = _cxlr;
+ struct cxl_nvdimm_bridge *cxl_nvb = cxlr->cxl_nvb;
+
+ scoped_guard(device, &cxl_nvb->dev) {
+ if (cxlr->cxlr_pmem)
+ devm_release_action(&cxl_nvb->dev, cxlr_pmem_unregister,
+ cxlr->cxlr_pmem);
+ }
+ cxlr->cxl_nvb = NULL;
+ put_device(&cxl_nvb->dev);
}
/**
@@ -1874,12 +3379,15 @@ static void cxlr_pmem_unregister(void *dev)
static int devm_cxl_add_pmem_region(struct cxl_region *cxlr)
{
struct cxl_pmem_region *cxlr_pmem;
+ struct cxl_nvdimm_bridge *cxl_nvb;
struct device *dev;
int rc;
- cxlr_pmem = cxl_pmem_region_alloc(cxlr);
- if (IS_ERR(cxlr_pmem))
- return PTR_ERR(cxlr_pmem);
+ rc = cxl_pmem_region_alloc(cxlr);
+ if (rc)
+ return rc;
+ cxlr_pmem = cxlr->cxlr_pmem;
+ cxl_nvb = cxlr->cxl_nvb;
dev = &cxlr_pmem->dev;
rc = dev_set_name(dev, "pmem_region%d", cxlr->id);
@@ -1893,39 +3401,580 @@ static int devm_cxl_add_pmem_region(struct cxl_region *cxlr)
dev_dbg(&cxlr->dev, "%s: register %s\n", dev_name(dev->parent),
dev_name(dev));
- return devm_add_action_or_reset(&cxlr->dev, cxlr_pmem_unregister, dev);
+ scoped_guard(device, &cxl_nvb->dev) {
+ if (cxl_nvb->dev.driver)
+ rc = devm_add_action_or_reset(&cxl_nvb->dev,
+ cxlr_pmem_unregister,
+ cxlr_pmem);
+ else
+ rc = -ENXIO;
+ }
+
+ if (rc)
+ goto err_bridge;
+
+ /* @cxlr carries a reference on @cxl_nvb until cxlr_release_nvdimm */
+ return devm_add_action_or_reset(&cxlr->dev, cxlr_release_nvdimm, cxlr);
err:
put_device(dev);
+err_bridge:
+ put_device(&cxl_nvb->dev);
+ cxlr->cxl_nvb = NULL;
return rc;
}
-static int cxl_region_probe(struct device *dev)
+static void cxlr_dax_unregister(void *_cxlr_dax)
{
- struct cxl_region *cxlr = to_cxl_region(dev);
+ struct cxl_dax_region *cxlr_dax = _cxlr_dax;
+
+ device_unregister(&cxlr_dax->dev);
+}
+
+static int devm_cxl_add_dax_region(struct cxl_region *cxlr)
+{
+ struct cxl_dax_region *cxlr_dax;
+ struct device *dev;
+ int rc;
+
+ cxlr_dax = cxl_dax_region_alloc(cxlr);
+ if (IS_ERR(cxlr_dax))
+ return PTR_ERR(cxlr_dax);
+
+ dev = &cxlr_dax->dev;
+ rc = dev_set_name(dev, "dax_region%d", cxlr->id);
+ if (rc)
+ goto err;
+
+ rc = device_add(dev);
+ if (rc)
+ goto err;
+
+ dev_dbg(&cxlr->dev, "%s: register %s\n", dev_name(dev->parent),
+ dev_name(dev));
+
+ return devm_add_action_or_reset(&cxlr->dev, cxlr_dax_unregister,
+ cxlr_dax);
+err:
+ put_device(dev);
+ return rc;
+}
+
+static int match_decoder_by_range(struct device *dev, const void *data)
+{
+ const struct range *r1, *r2 = data;
+ struct cxl_decoder *cxld;
+
+ if (!is_switch_decoder(dev))
+ return 0;
+
+ cxld = to_cxl_decoder(dev);
+ r1 = &cxld->hpa_range;
+ return range_contains(r1, r2);
+}
+
+static struct cxl_decoder *
+cxl_port_find_switch_decoder(struct cxl_port *port, struct range *hpa)
+{
+ struct device *cxld_dev = device_find_child(&port->dev, hpa,
+ match_decoder_by_range);
+
+ return cxld_dev ? to_cxl_decoder(cxld_dev) : NULL;
+}
+
+static struct cxl_root_decoder *
+cxl_find_root_decoder(struct cxl_endpoint_decoder *cxled)
+{
+ struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+ struct cxl_port *port = cxled_to_port(cxled);
+ struct cxl_root *cxl_root __free(put_cxl_root) = find_cxl_root(port);
+ struct cxl_decoder *root, *cxld = &cxled->cxld;
+ struct range *hpa = &cxld->hpa_range;
+
+ root = cxl_port_find_switch_decoder(&cxl_root->port, hpa);
+ if (!root) {
+ dev_err(cxlmd->dev.parent,
+ "%s:%s no CXL window for range %#llx:%#llx\n",
+ dev_name(&cxlmd->dev), dev_name(&cxld->dev),
+ cxld->hpa_range.start, cxld->hpa_range.end);
+ return NULL;
+ }
+
+ return to_cxl_root_decoder(&root->dev);
+}
+
+static int match_region_by_range(struct device *dev, const void *data)
+{
+ struct cxl_region_params *p;
+ struct cxl_region *cxlr;
+ const struct range *r = data;
+
+ if (!is_cxl_region(dev))
+ return 0;
+
+ cxlr = to_cxl_region(dev);
+ p = &cxlr->params;
+
+ guard(rwsem_read)(&cxl_rwsem.region);
+ return spa_maps_hpa(p, r);
+}
+
+static int cxl_extended_linear_cache_resize(struct cxl_region *cxlr,
+ struct resource *res)
+{
+ struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
struct cxl_region_params *p = &cxlr->params;
+ resource_size_t size = resource_size(res);
+ resource_size_t cache_size, start;
+
+ cache_size = cxlrd->cache_size;
+ if (!cache_size)
+ return 0;
+
+ if (size != cache_size) {
+ dev_warn(&cxlr->dev,
+ "Extended Linear Cache size %pa != CXL size %pa. No Support!",
+ &cache_size, &size);
+ return -ENXIO;
+ }
+
+ /*
+ * Move the start of the range to where the cache range starts. The
+ * implementation assumes that the cache range is in front of the
+ * CXL range. This is not dictated by the HMAT spec but is how the
+ * current known implementation is configured.
+ *
+ * The cache range is expected to be within the CFMWS. The adjusted
+ * res->start should not be less than cxlrd->res->start.
+ */
+ start = res->start - cache_size;
+ if (start < cxlrd->res->start)
+ return -ENXIO;
+
+ res->start = start;
+ p->cache_size = cache_size;
+
+ return 0;
+}
+
+static int __construct_region(struct cxl_region *cxlr,
+ struct cxl_root_decoder *cxlrd,
+ struct cxl_endpoint_decoder *cxled)
+{
+ struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+ struct range *hpa = &cxled->cxld.hpa_range;
+ struct cxl_region_params *p;
+ struct resource *res;
int rc;
- rc = down_read_interruptible(&cxl_region_rwsem);
+ guard(rwsem_write)(&cxl_rwsem.region);
+ p = &cxlr->params;
+ if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE) {
+ dev_err(cxlmd->dev.parent,
+ "%s:%s: %s autodiscovery interrupted\n",
+ dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
+ __func__);
+ return -EBUSY;
+ }
+
+ set_bit(CXL_REGION_F_AUTO, &cxlr->flags);
+
+ res = kmalloc(sizeof(*res), GFP_KERNEL);
+ if (!res)
+ return -ENOMEM;
+
+ *res = DEFINE_RES_MEM_NAMED(hpa->start, range_len(hpa),
+ dev_name(&cxlr->dev));
+
+ rc = cxl_extended_linear_cache_resize(cxlr, res);
+ if (rc && rc != -EOPNOTSUPP) {
+ /*
+ * Failing to support extended linear cache region resize does not
+ * prevent the region from functioning. Only causes cxl list showing
+ * incorrect region size.
+ */
+ dev_warn(cxlmd->dev.parent,
+ "Extended linear cache calculation failed rc:%d\n", rc);
+ }
+
+ rc = sysfs_update_group(&cxlr->dev.kobj, &cxl_region_group);
+ if (rc)
+ return rc;
+
+ rc = insert_resource(cxlrd->res, res);
if (rc) {
+ /*
+ * Platform-firmware may not have split resources like "System
+ * RAM" on CXL window boundaries see cxl_region_iomem_release()
+ */
+ dev_warn(cxlmd->dev.parent,
+ "%s:%s: %s %s cannot insert resource\n",
+ dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
+ __func__, dev_name(&cxlr->dev));
+ }
+
+ p->res = res;
+ p->interleave_ways = cxled->cxld.interleave_ways;
+ p->interleave_granularity = cxled->cxld.interleave_granularity;
+ p->state = CXL_CONFIG_INTERLEAVE_ACTIVE;
+
+ rc = sysfs_update_group(&cxlr->dev.kobj, get_cxl_region_target_group());
+ if (rc)
+ return rc;
+
+ dev_dbg(cxlmd->dev.parent, "%s:%s: %s %s res: %pr iw: %d ig: %d\n",
+ dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), __func__,
+ dev_name(&cxlr->dev), p->res, p->interleave_ways,
+ p->interleave_granularity);
+
+ /* ...to match put_device() in cxl_add_to_region() */
+ get_device(&cxlr->dev);
+
+ return 0;
+}
+
+/* Establish an empty region covering the given HPA range */
+static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd,
+ struct cxl_endpoint_decoder *cxled)
+{
+ struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+ struct cxl_port *port = cxlrd_to_port(cxlrd);
+ struct cxl_dev_state *cxlds = cxlmd->cxlds;
+ int rc, part = READ_ONCE(cxled->part);
+ struct cxl_region *cxlr;
+
+ do {
+ cxlr = __create_region(cxlrd, cxlds->part[part].mode,
+ atomic_read(&cxlrd->region_id));
+ } while (IS_ERR(cxlr) && PTR_ERR(cxlr) == -EBUSY);
+
+ if (IS_ERR(cxlr)) {
+ dev_err(cxlmd->dev.parent,
+ "%s:%s: %s failed assign region: %ld\n",
+ dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
+ __func__, PTR_ERR(cxlr));
+ return cxlr;
+ }
+
+ rc = __construct_region(cxlr, cxlrd, cxled);
+ if (rc) {
+ devm_release_action(port->uport_dev, unregister_region, cxlr);
+ return ERR_PTR(rc);
+ }
+
+ return cxlr;
+}
+
+static struct cxl_region *
+cxl_find_region_by_range(struct cxl_root_decoder *cxlrd, struct range *hpa)
+{
+ struct device *region_dev;
+
+ region_dev = device_find_child(&cxlrd->cxlsd.cxld.dev, hpa,
+ match_region_by_range);
+ if (!region_dev)
+ return NULL;
+
+ return to_cxl_region(region_dev);
+}
+
+int cxl_add_to_region(struct cxl_endpoint_decoder *cxled)
+{
+ struct range *hpa = &cxled->cxld.hpa_range;
+ struct cxl_region_params *p;
+ bool attach = false;
+ int rc;
+
+ struct cxl_root_decoder *cxlrd __free(put_cxl_root_decoder) =
+ cxl_find_root_decoder(cxled);
+ if (!cxlrd)
+ return -ENXIO;
+
+ /*
+ * Ensure that if multiple threads race to construct_region() for @hpa
+ * one does the construction and the others add to that.
+ */
+ mutex_lock(&cxlrd->range_lock);
+ struct cxl_region *cxlr __free(put_cxl_region) =
+ cxl_find_region_by_range(cxlrd, hpa);
+ if (!cxlr)
+ cxlr = construct_region(cxlrd, cxled);
+ mutex_unlock(&cxlrd->range_lock);
+
+ rc = PTR_ERR_OR_ZERO(cxlr);
+ if (rc)
+ return rc;
+
+ attach_target(cxlr, cxled, -1, TASK_UNINTERRUPTIBLE);
+
+ scoped_guard(rwsem_read, &cxl_rwsem.region) {
+ p = &cxlr->params;
+ attach = p->state == CXL_CONFIG_COMMIT;
+ }
+
+ if (attach) {
+ /*
+ * If device_attach() fails the range may still be active via
+ * the platform-firmware memory map, otherwise the driver for
+ * regions is local to this file, so driver matching can't fail.
+ */
+ if (device_attach(&cxlr->dev) < 0)
+ dev_err(&cxlr->dev, "failed to enable, range: %pr\n",
+ p->res);
+ }
+
+ return rc;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_add_to_region, "CXL");
+
+u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint, u64 spa)
+{
+ struct cxl_region_ref *iter;
+ unsigned long index;
+
+ if (!endpoint)
+ return ~0ULL;
+
+ guard(rwsem_write)(&cxl_rwsem.region);
+
+ xa_for_each(&endpoint->regions, index, iter) {
+ struct cxl_region_params *p = &iter->region->params;
+
+ if (cxl_resource_contains_addr(p->res, spa)) {
+ if (!p->cache_size)
+ return ~0ULL;
+
+ if (spa >= p->res->start + p->cache_size)
+ return spa - p->cache_size;
+
+ return spa + p->cache_size;
+ }
+ }
+
+ return ~0ULL;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_port_get_spa_cache_alias, "CXL");
+
+static int is_system_ram(struct resource *res, void *arg)
+{
+ struct cxl_region *cxlr = arg;
+ struct cxl_region_params *p = &cxlr->params;
+
+ dev_dbg(&cxlr->dev, "%pr has System RAM: %pr\n", p->res, res);
+ return 1;
+}
+
+static void shutdown_notifiers(void *_cxlr)
+{
+ struct cxl_region *cxlr = _cxlr;
+
+ unregister_node_notifier(&cxlr->node_notifier);
+ unregister_mt_adistance_algorithm(&cxlr->adist_notifier);
+}
+
+static void remove_debugfs(void *dentry)
+{
+ debugfs_remove_recursive(dentry);
+}
+
+static int validate_region_offset(struct cxl_region *cxlr, u64 offset)
+{
+ struct cxl_region_params *p = &cxlr->params;
+ resource_size_t region_size;
+ u64 hpa;
+
+ if (offset < p->cache_size) {
+ dev_err(&cxlr->dev,
+ "Offset %#llx is within extended linear cache %pa\n",
+ offset, &p->cache_size);
+ return -EINVAL;
+ }
+
+ region_size = resource_size(p->res);
+ if (offset >= region_size) {
+ dev_err(&cxlr->dev, "Offset %#llx exceeds region size %pa\n",
+ offset, &region_size);
+ return -EINVAL;
+ }
+
+ hpa = p->res->start + offset;
+ if (hpa < p->res->start || hpa > p->res->end) {
+ dev_err(&cxlr->dev, "HPA %#llx not in region %pr\n", hpa,
+ p->res);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int cxl_region_debugfs_poison_inject(void *data, u64 offset)
+{
+ struct dpa_result result = { .dpa = ULLONG_MAX, .cxlmd = NULL };
+ struct cxl_region *cxlr = data;
+ int rc;
+
+ ACQUIRE(rwsem_read_intr, region_rwsem)(&cxl_rwsem.region);
+ if ((rc = ACQUIRE_ERR(rwsem_read_intr, &region_rwsem)))
+ return rc;
+
+ ACQUIRE(rwsem_read_intr, dpa_rwsem)(&cxl_rwsem.dpa);
+ if ((rc = ACQUIRE_ERR(rwsem_read_intr, &dpa_rwsem)))
+ return rc;
+
+ if (validate_region_offset(cxlr, offset))
+ return -EINVAL;
+
+ offset -= cxlr->params.cache_size;
+ rc = region_offset_to_dpa_result(cxlr, offset, &result);
+ if (rc || !result.cxlmd || result.dpa == ULLONG_MAX) {
+ dev_dbg(&cxlr->dev,
+ "Failed to resolve DPA for region offset %#llx rc %d\n",
+ offset, rc);
+
+ return rc ? rc : -EINVAL;
+ }
+
+ return cxl_inject_poison_locked(result.cxlmd, result.dpa);
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(cxl_poison_inject_fops, NULL,
+ cxl_region_debugfs_poison_inject, "%llx\n");
+
+static int cxl_region_debugfs_poison_clear(void *data, u64 offset)
+{
+ struct dpa_result result = { .dpa = ULLONG_MAX, .cxlmd = NULL };
+ struct cxl_region *cxlr = data;
+ int rc;
+
+ ACQUIRE(rwsem_read_intr, region_rwsem)(&cxl_rwsem.region);
+ if ((rc = ACQUIRE_ERR(rwsem_read_intr, &region_rwsem)))
+ return rc;
+
+ ACQUIRE(rwsem_read_intr, dpa_rwsem)(&cxl_rwsem.dpa);
+ if ((rc = ACQUIRE_ERR(rwsem_read_intr, &dpa_rwsem)))
+ return rc;
+
+ if (validate_region_offset(cxlr, offset))
+ return -EINVAL;
+
+ offset -= cxlr->params.cache_size;
+ rc = region_offset_to_dpa_result(cxlr, offset, &result);
+ if (rc || !result.cxlmd || result.dpa == ULLONG_MAX) {
+ dev_dbg(&cxlr->dev,
+ "Failed to resolve DPA for region offset %#llx rc %d\n",
+ offset, rc);
+
+ return rc ? rc : -EINVAL;
+ }
+
+ return cxl_clear_poison_locked(result.cxlmd, result.dpa);
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(cxl_poison_clear_fops, NULL,
+ cxl_region_debugfs_poison_clear, "%llx\n");
+
+static int cxl_region_can_probe(struct cxl_region *cxlr)
+{
+ struct cxl_region_params *p = &cxlr->params;
+ int rc;
+
+ ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
+ if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem))) {
dev_dbg(&cxlr->dev, "probe interrupted\n");
return rc;
}
if (p->state < CXL_CONFIG_COMMIT) {
dev_dbg(&cxlr->dev, "config state: %d\n", p->state);
- rc = -ENXIO;
+ return -ENXIO;
}
+ if (test_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags)) {
+ dev_err(&cxlr->dev,
+ "failed to activate, re-commit region and retry\n");
+ return -ENXIO;
+ }
+
+ return 0;
+}
+
+static int cxl_region_probe(struct device *dev)
+{
+ struct cxl_region *cxlr = to_cxl_region(dev);
+ struct cxl_region_params *p = &cxlr->params;
+ bool poison_supported = true;
+ int rc;
+
+ rc = cxl_region_can_probe(cxlr);
+ if (rc)
+ return rc;
+
/*
* From this point on any path that changes the region's state away from
* CXL_CONFIG_COMMIT is also responsible for releasing the driver.
*/
- up_read(&cxl_region_rwsem);
+
+ cxlr->node_notifier.notifier_call = cxl_region_perf_attrs_callback;
+ cxlr->node_notifier.priority = CXL_CALLBACK_PRI;
+ register_node_notifier(&cxlr->node_notifier);
+
+ cxlr->adist_notifier.notifier_call = cxl_region_calculate_adistance;
+ cxlr->adist_notifier.priority = 100;
+ register_mt_adistance_algorithm(&cxlr->adist_notifier);
+
+ rc = devm_add_action_or_reset(&cxlr->dev, shutdown_notifiers, cxlr);
+ if (rc)
+ return rc;
+
+ /* Create poison attributes if all memdevs support the capabilities */
+ for (int i = 0; i < p->nr_targets; i++) {
+ struct cxl_endpoint_decoder *cxled = p->targets[i];
+ struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+
+ if (!cxl_memdev_has_poison_cmd(cxlmd, CXL_POISON_ENABLED_INJECT) ||
+ !cxl_memdev_has_poison_cmd(cxlmd, CXL_POISON_ENABLED_CLEAR)) {
+ poison_supported = false;
+ break;
+ }
+ }
+
+ if (poison_supported) {
+ struct dentry *dentry;
+
+ dentry = cxl_debugfs_create_dir(dev_name(dev));
+ debugfs_create_file("inject_poison", 0200, dentry, cxlr,
+ &cxl_poison_inject_fops);
+ debugfs_create_file("clear_poison", 0200, dentry, cxlr,
+ &cxl_poison_clear_fops);
+ rc = devm_add_action_or_reset(dev, remove_debugfs, dentry);
+ if (rc)
+ return rc;
+ }
switch (cxlr->mode) {
- case CXL_DECODER_PMEM:
+ case CXL_PARTMODE_PMEM:
+ rc = devm_cxl_region_edac_register(cxlr);
+ if (rc)
+ dev_dbg(&cxlr->dev, "CXL EDAC registration for region_id=%d failed\n",
+ cxlr->id);
+
return devm_cxl_add_pmem_region(cxlr);
+ case CXL_PARTMODE_RAM:
+ rc = devm_cxl_region_edac_register(cxlr);
+ if (rc)
+ dev_dbg(&cxlr->dev, "CXL EDAC registration for region_id=%d failed\n",
+ cxlr->id);
+
+ /*
+ * The region can not be manged by CXL if any portion of
+ * it is already online as 'System RAM'
+ */
+ if (walk_iomem_res_desc(IORES_DESC_NONE,
+ IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
+ p->res->start, p->res->end, cxlr,
+ is_system_ram) > 0)
+ return 0;
+ return devm_cxl_add_dax_region(cxlr);
default:
dev_dbg(&cxlr->dev, "unsupported region mode: %d\n",
cxlr->mode);
@@ -1949,5 +3998,6 @@ void cxl_region_exit(void)
cxl_driver_unregister(&cxl_region_driver);
}
-MODULE_IMPORT_NS(CXL);
+MODULE_IMPORT_NS("CXL");
+MODULE_IMPORT_NS("DEVMEM");
MODULE_ALIAS_CXL(CXL_DEVICE_REGION);