diff options
Diffstat (limited to 'drivers/cxl/core/region.c')
-rw-r--r-- | drivers/cxl/core/region.c | 1056 |
1 files changed, 740 insertions, 316 deletions
diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 5c186e0a39b9..6e5e1460068d 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -9,6 +9,7 @@ #include <linux/uuid.h> #include <linux/sort.h> #include <linux/idr.h> +#include <linux/memory-tiers.h> #include <cxlmem.h> #include <cxl.h> #include "core.h" @@ -143,7 +144,7 @@ static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, rc = down_read_interruptible(&cxl_region_rwsem); if (rc) return rc; - if (cxlr->mode != CXL_DECODER_PMEM) + if (cxlr->mode != CXL_PARTMODE_PMEM) rc = sysfs_emit(buf, "\n"); else rc = sysfs_emit(buf, "%pUb\n", &p->uuid); @@ -230,30 +231,27 @@ static int cxl_region_invalidate_memregion(struct cxl_region *cxlr) &cxlr->dev, "Bypassing cpu_cache_invalidate_memregion() for testing!\n"); return 0; - } else { - dev_err(&cxlr->dev, - "Failed to synchronize CPU cache state\n"); - return -ENXIO; } + dev_WARN(&cxlr->dev, + "Failed to synchronize CPU cache state\n"); + return -ENXIO; } cpu_cache_invalidate_memregion(IORES_DESC_CXL); return 0; } -static int cxl_region_decode_reset(struct cxl_region *cxlr, int count) +static void cxl_region_decode_reset(struct cxl_region *cxlr, int count) { struct cxl_region_params *p = &cxlr->params; - int i, rc = 0; + int i; /* - * Before region teardown attempt to flush, and if the flush - * fails cancel the region teardown for data consistency - * concerns + * Before region teardown attempt to flush, evict any data cached for + * this region, or scream loudly about missing arch / platform support + * for CXL teardown. */ - rc = cxl_region_invalidate_memregion(cxlr); - if (rc) - return rc; + cxl_region_invalidate_memregion(cxlr); for (i = count - 1; i >= 0; i--) { struct cxl_endpoint_decoder *cxled = p->targets[i]; @@ -276,23 +274,17 @@ static int cxl_region_decode_reset(struct cxl_region *cxlr, int count) cxl_rr = cxl_rr_load(iter, cxlr); cxld = cxl_rr->decoder; if (cxld->reset) - rc = cxld->reset(cxld); - if (rc) - return rc; + cxld->reset(cxld); set_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags); } endpoint_reset: - rc = cxled->cxld.reset(&cxled->cxld); - if (rc) - return rc; + cxled->cxld.reset(&cxled->cxld); set_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags); } /* all decoders associated with this region have been torn down */ clear_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags); - - return 0; } static int commit_decoder(struct cxl_decoder *cxld) @@ -408,16 +400,8 @@ static ssize_t commit_store(struct device *dev, struct device_attribute *attr, * still pending. */ if (p->state == CXL_CONFIG_RESET_PENDING) { - rc = cxl_region_decode_reset(cxlr, p->interleave_ways); - /* - * Revert to committed since there may still be active - * decoders associated with this region, or move forward - * to active to mark the reset successful - */ - if (rc) - p->state = CXL_CONFIG_COMMIT; - else - p->state = CXL_CONFIG_ACTIVE; + cxl_region_decode_reset(cxlr, p->interleave_ways); + p->state = CXL_CONFIG_ACTIVE; } } @@ -456,7 +440,7 @@ static umode_t cxl_region_visible(struct kobject *kobj, struct attribute *a, * Support tooling that expects to find a 'uuid' attribute for all * regions regardless of mode. */ - if (a == &dev_attr_uuid.attr && cxlr->mode != CXL_DECODER_PMEM) + if (a == &dev_attr_uuid.attr && cxlr->mode != CXL_PARTMODE_PMEM) return 0444; return a->mode; } @@ -618,8 +602,16 @@ static ssize_t mode_show(struct device *dev, struct device_attribute *attr, char *buf) { struct cxl_region *cxlr = to_cxl_region(dev); + const char *desc; + + if (cxlr->mode == CXL_PARTMODE_RAM) + desc = "ram"; + else if (cxlr->mode == CXL_PARTMODE_PMEM) + desc = "pmem"; + else + desc = ""; - return sysfs_emit(buf, "%s\n", cxl_decoder_mode_name(cxlr->mode)); + return sysfs_emit(buf, "%s\n", desc); } static DEVICE_ATTR_RO(mode); @@ -645,7 +637,7 @@ static int alloc_hpa(struct cxl_region *cxlr, resource_size_t size) /* ways, granularity and uuid (if PMEM) need to be set before HPA */ if (!p->interleave_ways || !p->interleave_granularity || - (cxlr->mode == CXL_DECODER_PMEM && uuid_is_null(&p->uuid))) + (cxlr->mode == CXL_PARTMODE_PMEM && uuid_is_null(&p->uuid))) return -ENXIO; div64_u64_rem(size, (u64)SZ_256M * p->interleave_ways, &remainder); @@ -793,31 +785,70 @@ out: return rc; } -static int match_free_decoder(struct device *dev, void *data) +static int check_commit_order(struct device *dev, void *data) { + struct cxl_decoder *cxld = to_cxl_decoder(dev); + + /* + * if port->commit_end is not the only free decoder, then out of + * order shutdown has occurred, block further allocations until + * that is resolved + */ + if (((cxld->flags & CXL_DECODER_F_ENABLE) == 0)) + return -EBUSY; + return 0; +} + +static int match_free_decoder(struct device *dev, const void *data) +{ + struct cxl_port *port = to_cxl_port(dev->parent); struct cxl_decoder *cxld; - int *id = data; + int rc; if (!is_switch_decoder(dev)) return 0; cxld = to_cxl_decoder(dev); - /* enforce ordered allocation */ - if (cxld->id != *id) + if (cxld->id != port->commit_end + 1) return 0; - if (!cxld->region) - return 1; + if (cxld->region) { + dev_dbg(dev->parent, + "next decoder to commit (%s) is already reserved (%s)\n", + dev_name(dev), dev_name(&cxld->region->dev)); + return 0; + } - (*id)++; + rc = device_for_each_child_reverse_from(dev->parent, dev, NULL, + check_commit_order); + if (rc) { + dev_dbg(dev->parent, + "unable to allocate %s due to out of order shutdown\n", + dev_name(dev)); + return 0; + } + return 1; +} - return 0; +static bool region_res_match_cxl_range(const struct cxl_region_params *p, + struct range *range) +{ + if (!p->res) + return false; + + /* + * If an extended linear cache region then the CXL range is assumed + * to be fronted by the DRAM range in current known implementation. + * This assumption will be made until a variant implementation exists. + */ + return p->res->start + p->cache_size == range->start && + p->res->end == range->end; } -static int match_auto_decoder(struct device *dev, void *data) +static int match_auto_decoder(struct device *dev, const void *data) { - struct cxl_region_params *p = data; + const struct cxl_region_params *p = data; struct cxl_decoder *cxld; struct range *r; @@ -827,19 +858,31 @@ static int match_auto_decoder(struct device *dev, void *data) cxld = to_cxl_decoder(dev); r = &cxld->hpa_range; - if (p->res && p->res->start == r->start && p->res->end == r->end) + if (region_res_match_cxl_range(p, r)) return 1; return 0; } +/** + * cxl_port_pick_region_decoder() - assign or lookup a decoder for a region + * @port: a port in the ancestry of the endpoint implied by @cxled + * @cxled: endpoint decoder to be, or currently, mapped by @port + * @cxlr: region to establish, or validate, decode @port + * + * In the region creation path cxl_port_pick_region_decoder() is an + * allocator to find a free port. In the region assembly path, it is + * recalling the decoder that platform firmware picked for validation + * purposes. + * + * The result is recorded in a 'struct cxl_region_ref' in @port. + */ static struct cxl_decoder * -cxl_region_find_decoder(struct cxl_port *port, - struct cxl_endpoint_decoder *cxled, - struct cxl_region *cxlr) +cxl_port_pick_region_decoder(struct cxl_port *port, + struct cxl_endpoint_decoder *cxled, + struct cxl_region *cxlr) { struct device *dev; - int id = 0; if (port == cxled_to_port(cxled)) return &cxled->cxld; @@ -848,7 +891,7 @@ cxl_region_find_decoder(struct cxl_port *port, dev = device_find_child(&port->dev, &cxlr->params, match_auto_decoder); else - dev = device_find_child(&port->dev, &id, match_free_decoder); + dev = device_find_child(&port->dev, NULL, match_free_decoder); if (!dev) return NULL; /* @@ -885,7 +928,8 @@ static bool auto_order_ok(struct cxl_port *port, struct cxl_region *cxlr_iter, static struct cxl_region_ref * alloc_region_ref(struct cxl_port *port, struct cxl_region *cxlr, - struct cxl_endpoint_decoder *cxled) + struct cxl_endpoint_decoder *cxled, + struct cxl_decoder *cxld) { struct cxl_region_params *p = &cxlr->params; struct cxl_region_ref *cxl_rr, *iter; @@ -899,9 +943,6 @@ alloc_region_ref(struct cxl_port *port, struct cxl_region *cxlr, continue; if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) { - struct cxl_decoder *cxld; - - cxld = cxl_region_find_decoder(port, cxled, cxlr); if (auto_order_ok(port, iter->region, cxld)) continue; } @@ -983,19 +1024,11 @@ static int cxl_rr_ep_add(struct cxl_region_ref *cxl_rr, return 0; } -static int cxl_rr_alloc_decoder(struct cxl_port *port, struct cxl_region *cxlr, - struct cxl_endpoint_decoder *cxled, - struct cxl_region_ref *cxl_rr) +static int cxl_rr_assign_decoder(struct cxl_port *port, struct cxl_region *cxlr, + struct cxl_endpoint_decoder *cxled, + struct cxl_region_ref *cxl_rr, + struct cxl_decoder *cxld) { - struct cxl_decoder *cxld; - - cxld = cxl_region_find_decoder(port, cxled, cxlr); - if (!cxld) { - dev_dbg(&cxlr->dev, "%s: no decoder available\n", - dev_name(&port->dev)); - return -EBUSY; - } - if (cxld->region) { dev_dbg(&cxlr->dev, "%s: %s already attached to %s\n", dev_name(&port->dev), dev_name(&cxld->dev), @@ -1086,7 +1119,16 @@ static int cxl_port_attach_region(struct cxl_port *port, nr_targets_inc = true; } } else { - cxl_rr = alloc_region_ref(port, cxlr, cxled); + struct cxl_decoder *cxld; + + cxld = cxl_port_pick_region_decoder(port, cxled, cxlr); + if (!cxld) { + dev_dbg(&cxlr->dev, "%s: no decoder available\n", + dev_name(&port->dev)); + return -EBUSY; + } + + cxl_rr = alloc_region_ref(port, cxlr, cxled, cxld); if (IS_ERR(cxl_rr)) { dev_dbg(&cxlr->dev, "%s: failed to allocate region reference\n", @@ -1095,12 +1137,32 @@ static int cxl_port_attach_region(struct cxl_port *port, } nr_targets_inc = true; - rc = cxl_rr_alloc_decoder(port, cxlr, cxled, cxl_rr); + rc = cxl_rr_assign_decoder(port, cxlr, cxled, cxl_rr, cxld); if (rc) goto out_erase; } cxld = cxl_rr->decoder; + /* + * the number of targets should not exceed the target_count + * of the decoder + */ + if (is_switch_decoder(&cxld->dev)) { + struct cxl_switch_decoder *cxlsd; + + cxlsd = to_cxl_switch_decoder(&cxld->dev); + if (cxl_rr->nr_targets > cxlsd->nr_targets) { + dev_dbg(&cxlr->dev, + "%s:%s %s add: %s:%s @ %d overflows targets: %d\n", + dev_name(port->uport_dev), dev_name(&port->dev), + dev_name(&cxld->dev), dev_name(&cxlmd->dev), + dev_name(&cxled->cxld.dev), pos, + cxlsd->nr_targets); + rc = -ENXIO; + goto out_erase; + } + } + rc = cxl_rr_ep_add(cxl_rr, cxled); if (rc) { dev_dbg(&cxlr->dev, @@ -1210,6 +1272,50 @@ static int check_last_peer(struct cxl_endpoint_decoder *cxled, return 0; } +static int check_interleave_cap(struct cxl_decoder *cxld, int iw, int ig) +{ + struct cxl_port *port = to_cxl_port(cxld->dev.parent); + struct cxl_hdm *cxlhdm = dev_get_drvdata(&port->dev); + unsigned int interleave_mask; + u8 eiw; + u16 eig; + int high_pos, low_pos; + + if (!test_bit(iw, &cxlhdm->iw_cap_mask)) + return -ENXIO; + /* + * Per CXL specification r3.1(8.2.4.20.13 Decoder Protection), + * if eiw < 8: + * DPAOFFSET[51: eig + 8] = HPAOFFSET[51: eig + 8 + eiw] + * DPAOFFSET[eig + 7: 0] = HPAOFFSET[eig + 7: 0] + * + * when the eiw is 0, all the bits of HPAOFFSET[51: 0] are used, the + * interleave bits are none. + * + * if eiw >= 8: + * DPAOFFSET[51: eig + 8] = HPAOFFSET[51: eig + eiw] / 3 + * DPAOFFSET[eig + 7: 0] = HPAOFFSET[eig + 7: 0] + * + * when the eiw is 8, all the bits of HPAOFFSET[51: 0] are used, the + * interleave bits are none. + */ + ways_to_eiw(iw, &eiw); + if (eiw == 0 || eiw == 8) + return 0; + + granularity_to_eig(ig, &eig); + if (eiw > 8) + high_pos = eiw + eig - 1; + else + high_pos = eiw + eig + 7; + low_pos = eig + 8; + interleave_mask = GENMASK(high_pos, low_pos); + if (interleave_mask & ~cxlhdm->interleave_mask) + return -ENXIO; + + return 0; +} + static int cxl_port_setup_targets(struct cxl_port *port, struct cxl_region *cxlr, struct cxl_endpoint_decoder *cxled) @@ -1223,6 +1329,7 @@ static int cxl_port_setup_targets(struct cxl_port *port, struct cxl_region_params *p = &cxlr->params; struct cxl_decoder *cxld = cxl_rr->decoder; struct cxl_switch_decoder *cxlsd; + struct cxl_port *iter = port; u16 eig, peig; u8 eiw, peiw; @@ -1239,16 +1346,26 @@ static int cxl_port_setup_targets(struct cxl_port *port, cxlsd = to_cxl_switch_decoder(&cxld->dev); if (cxl_rr->nr_targets_set) { - int i, distance; + int i, distance = 1; + struct cxl_region_ref *cxl_rr_iter; /* - * Passthrough decoders impose no distance requirements between - * peers + * The "distance" between peer downstream ports represents which + * endpoint positions in the region interleave a given port can + * host. + * + * For example, at the root of a hierarchy the distance is + * always 1 as every index targets a different host-bridge. At + * each subsequent switch level those ports map every Nth region + * position where N is the width of the switch == distance. */ - if (cxl_rr->nr_targets == 1) - distance = 0; - else - distance = p->nr_targets / cxl_rr->nr_targets; + do { + cxl_rr_iter = cxl_rr_load(iter, cxlr); + distance *= cxl_rr_iter->nr_targets; + iter = to_cxl_port(iter->dev.parent); + } while (!is_cxl_root(iter)); + distance *= cxlrd->cxlsd.cxld.interleave_ways; + for (i = 0; i < cxl_rr->nr_targets_set; i++) if (ep->dport == cxlsd->target[i]) { rc = check_last_peer(cxled, ep, cxl_rr, @@ -1340,9 +1457,8 @@ static int cxl_port_setup_targets(struct cxl_port *port, if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) { if (cxld->interleave_ways != iw || - cxld->interleave_granularity != ig || - cxld->hpa_range.start != p->res->start || - cxld->hpa_range.end != p->res->end || + (iw > 1 && cxld->interleave_granularity != ig) || + !region_res_match_cxl_range(p, &cxld->hpa_range) || ((cxld->flags & CXL_DECODER_F_ENABLE) == 0)) { dev_err(&cxlr->dev, "%s:%s %s expected iw: %d ig: %d %pr\n", @@ -1360,6 +1476,15 @@ static int cxl_port_setup_targets(struct cxl_port *port, return -ENXIO; } } else { + rc = check_interleave_cap(cxld, iw, ig); + if (rc) { + dev_dbg(&cxlr->dev, + "%s:%s iw: %d ig: %d is not supported\n", + dev_name(port->uport_dev), + dev_name(&port->dev), iw, ig); + return rc; + } + cxld->interleave_ways = iw; cxld->interleave_granularity = ig; cxld->hpa_range = (struct range) { @@ -1559,10 +1684,13 @@ static int cxl_region_attach_position(struct cxl_region *cxlr, const struct cxl_dport *dport, int pos) { struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_switch_decoder *cxlsd = &cxlrd->cxlsd; + struct cxl_decoder *cxld = &cxlsd->cxld; + int iw = cxld->interleave_ways; struct cxl_port *iter; int rc; - if (cxlrd->calc_hb(cxlrd, pos) != dport) { + if (dport != cxlrd->cxlsd.target[pos % iw]) { dev_dbg(&cxlr->dev, "%s:%s invalid target position for %s\n", dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), dev_name(&cxlrd->cxlsd.cxld.dev)); @@ -1631,17 +1759,12 @@ static int cmp_interleave_pos(const void *a, const void *b) return cxled_a->pos - cxled_b->pos; } -static struct cxl_port *next_port(struct cxl_port *port) -{ - if (!port->parent_dport) - return NULL; - return port->parent_dport->port; -} - -static int match_switch_decoder_by_range(struct device *dev, void *data) +static int match_switch_decoder_by_range(struct device *dev, + const void *data) { struct cxl_switch_decoder *cxlsd; - struct range *r1, *r2 = data; + const struct range *r1, *r2 = data; + if (!is_switch_decoder(dev)) return 0; @@ -1662,7 +1785,7 @@ static int find_pos_and_ways(struct cxl_port *port, struct range *range, struct device *dev; int rc = -ENXIO; - parent = next_port(port); + parent = parent_port_of(port); if (!parent) return rc; @@ -1686,6 +1809,13 @@ static int find_pos_and_ways(struct cxl_port *port, struct range *range, } put_device(dev); + if (rc) + dev_err(port->uport_dev, + "failed to find %s:%s in target list of %s\n", + dev_name(&port->dev), + dev_name(port->parent_dport->dport_dev), + dev_name(&cxlsd->cxld.dev)); + return rc; } @@ -1742,7 +1872,7 @@ static int cxl_calc_interleave_pos(struct cxl_endpoint_decoder *cxled) */ /* Iterate from endpoint to root_port refining the position */ - for (iter = port; iter; iter = next_port(iter)) { + for (iter = port; iter; iter = parent_port_of(iter)) { if (is_cxl_root(iter)) break; @@ -1791,27 +1921,39 @@ static int cxl_region_attach(struct cxl_region *cxlr, { struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_dev_state *cxlds = cxlmd->cxlds; struct cxl_region_params *p = &cxlr->params; struct cxl_port *ep_port, *root_port; struct cxl_dport *dport; int rc = -ENXIO; - if (cxled->mode != cxlr->mode) { - dev_dbg(&cxlr->dev, "%s region mode: %d mismatch: %d\n", - dev_name(&cxled->cxld.dev), cxlr->mode, cxled->mode); - return -EINVAL; + rc = check_interleave_cap(&cxled->cxld, p->interleave_ways, + p->interleave_granularity); + if (rc) { + dev_dbg(&cxlr->dev, "%s iw: %d ig: %d is not supported\n", + dev_name(&cxled->cxld.dev), p->interleave_ways, + p->interleave_granularity); + return rc; } - if (cxled->mode == CXL_DECODER_DEAD) { + if (cxled->part < 0) { dev_dbg(&cxlr->dev, "%s dead\n", dev_name(&cxled->cxld.dev)); return -ENODEV; } + if (cxlds->part[cxled->part].mode != cxlr->mode) { + dev_dbg(&cxlr->dev, "%s region mode: %d mismatch\n", + dev_name(&cxled->cxld.dev), cxlr->mode); + return -EINVAL; + } + /* all full of members, or interleave config not established? */ if (p->state > CXL_CONFIG_INTERLEAVE_ACTIVE) { dev_dbg(&cxlr->dev, "region already active\n"); return -EBUSY; - } else if (p->state < CXL_CONFIG_INTERLEAVE_ACTIVE) { + } + + if (p->state < CXL_CONFIG_INTERLEAVE_ACTIVE) { dev_dbg(&cxlr->dev, "interleave config missing\n"); return -ENXIO; } @@ -1845,13 +1987,13 @@ static int cxl_region_attach(struct cxl_region *cxlr, return -ENXIO; } - if (resource_size(cxled->dpa_res) * p->interleave_ways != + if (resource_size(cxled->dpa_res) * p->interleave_ways + p->cache_size != resource_size(p->res)) { dev_dbg(&cxlr->dev, - "%s:%s: decoder-size-%#llx * ways-%d != region-size-%#llx\n", + "%s:%s-size-%#llx * ways-%d + cache-%#llx != region-size-%#llx\n", dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), (u64)resource_size(cxled->dpa_res), p->interleave_ways, - (u64)resource_size(p->res)); + (u64)p->cache_size, (u64)resource_size(p->res)); return -EINVAL; } @@ -1897,6 +2039,7 @@ static int cxl_region_attach(struct cxl_region *cxlr, * then the region is already committed. */ p->state = CXL_CONFIG_COMMIT; + cxl_region_shared_upstream_bandwidth_update(cxlr); return 0; } @@ -1918,6 +2061,7 @@ static int cxl_region_attach(struct cxl_region *cxlr, if (rc) return rc; p->state = CXL_CONFIG_ACTIVE; + cxl_region_shared_upstream_bandwidth_update(cxlr); } cxled->cxld.interleave_ways = p->interleave_ways; @@ -1966,13 +2110,7 @@ static int cxl_region_detach(struct cxl_endpoint_decoder *cxled) get_device(&cxlr->dev); if (p->state > CXL_CONFIG_ACTIVE) { - /* - * TODO: tear down all impacted regions if a device is - * removed out of order - */ - rc = cxl_region_decode_reset(cxlr, p->interleave_ways); - if (rc) - goto out; + cxl_region_decode_reset(cxlr, p->interleave_ways); p->state = CXL_CONFIG_ACTIVE; } @@ -2013,7 +2151,7 @@ out: void cxl_decoder_kill_region(struct cxl_endpoint_decoder *cxled) { down_write(&cxl_region_rwsem); - cxled->mode = CXL_DECODER_DEAD; + cxled->part = -1; cxl_region_detach(cxled); up_write(&cxl_region_rwsem); } @@ -2035,6 +2173,12 @@ static int attach_target(struct cxl_region *cxlr, rc = cxl_region_attach(cxlr, cxled, pos); up_read(&cxl_dpa_rwsem); up_write(&cxl_region_rwsem); + + if (rc) + dev_warn(cxled->cxld.dev.parent, + "failed to attach %s to %s: %d\n", + dev_name(&cxled->cxld.dev), dev_name(&cxlr->dev), rc); + return rc; } @@ -2210,7 +2354,7 @@ bool is_cxl_region(struct device *dev) { return dev->type == &cxl_region_type; } -EXPORT_SYMBOL_NS_GPL(is_cxl_region, CXL); +EXPORT_SYMBOL_NS_GPL(is_cxl_region, "CXL"); static struct cxl_region *to_cxl_region(struct device *dev) { @@ -2227,7 +2371,6 @@ static void unregister_region(void *_cxlr) struct cxl_region_params *p = &cxlr->params; int i; - unregister_memory_notifier(&cxlr->memory_notifier); device_del(&cxlr->dev); /* @@ -2309,9 +2452,6 @@ static int cxl_region_perf_attrs_callback(struct notifier_block *nb, { struct cxl_region *cxlr = container_of(nb, struct cxl_region, memory_notifier); - struct cxl_region_params *p = &cxlr->params; - struct cxl_endpoint_decoder *cxled = p->targets[0]; - struct cxl_decoder *cxld = &cxled->cxld; struct memory_notify *mnb = arg; int nid = mnb->status_change_nid; int region_nid; @@ -2319,7 +2459,11 @@ static int cxl_region_perf_attrs_callback(struct notifier_block *nb, if (nid == NUMA_NO_NODE || action != MEM_ONLINE) return NOTIFY_DONE; - region_nid = phys_to_target_node(cxld->hpa_range.start); + /* + * No need to hold cxl_region_rwsem; region parameters are stable + * within the cxl_region driver. + */ + region_nid = phys_to_target_node(cxlr->params.res->start); if (nid != region_nid) return NOTIFY_DONE; @@ -2329,6 +2473,31 @@ static int cxl_region_perf_attrs_callback(struct notifier_block *nb, return NOTIFY_OK; } +static int cxl_region_calculate_adistance(struct notifier_block *nb, + unsigned long nid, void *data) +{ + struct cxl_region *cxlr = container_of(nb, struct cxl_region, + adist_notifier); + struct access_coordinate *perf; + int *adist = data; + int region_nid; + + /* + * No need to hold cxl_region_rwsem; region parameters are stable + * within the cxl_region driver. + */ + region_nid = phys_to_target_node(cxlr->params.res->start); + if (nid != region_nid) + return NOTIFY_OK; + + perf = &cxlr->coord[ACCESS_COORDINATE_CPU]; + + if (mt_perf_to_adistance(perf, adist)) + return NOTIFY_OK; + + return NOTIFY_STOP; +} + /** * devm_cxl_add_region - Adds a region to a decoder * @cxlrd: root decoder @@ -2344,7 +2513,7 @@ static int cxl_region_perf_attrs_callback(struct notifier_block *nb, */ static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd, int id, - enum cxl_decoder_mode mode, + enum cxl_partition_mode mode, enum cxl_decoder_type type) { struct cxl_port *port = to_cxl_port(cxlrd->cxlsd.cxld.dev.parent); @@ -2352,15 +2521,6 @@ static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd, struct device *dev; int rc; - switch (mode) { - case CXL_DECODER_RAM: - case CXL_DECODER_PMEM: - break; - default: - dev_err(&cxlrd->cxlsd.cxld.dev, "unsupported mode %d\n", mode); - return ERR_PTR(-EINVAL); - } - cxlr = cxl_region_alloc(cxlrd, id); if (IS_ERR(cxlr)) return cxlr; @@ -2376,10 +2536,6 @@ static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd, if (rc) goto err; - cxlr->memory_notifier.notifier_call = cxl_region_perf_attrs_callback; - cxlr->memory_notifier.priority = CXL_CALLBACK_PRI; - register_memory_notifier(&cxlr->memory_notifier); - rc = devm_add_action_or_reset(port->uport_dev, unregister_region, cxlr); if (rc) return ERR_PTR(rc); @@ -2411,10 +2567,19 @@ static ssize_t create_ram_region_show(struct device *dev, } static struct cxl_region *__create_region(struct cxl_root_decoder *cxlrd, - enum cxl_decoder_mode mode, int id) + enum cxl_partition_mode mode, int id) { int rc; + switch (mode) { + case CXL_PARTMODE_RAM: + case CXL_PARTMODE_PMEM: + break; + default: + dev_err(&cxlrd->cxlsd.cxld.dev, "unsupported mode %d\n", mode); + return ERR_PTR(-EINVAL); + } + rc = memregion_alloc(GFP_KERNEL); if (rc < 0) return ERR_PTR(rc); @@ -2427,9 +2592,8 @@ static struct cxl_region *__create_region(struct cxl_root_decoder *cxlrd, return devm_cxl_add_region(cxlrd, id, mode, CXL_DECODER_HOSTONLYMEM); } -static ssize_t create_pmem_region_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t len) +static ssize_t create_region_store(struct device *dev, const char *buf, + size_t len, enum cxl_partition_mode mode) { struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev); struct cxl_region *cxlr; @@ -2439,31 +2603,26 @@ static ssize_t create_pmem_region_store(struct device *dev, if (rc != 1) return -EINVAL; - cxlr = __create_region(cxlrd, CXL_DECODER_PMEM, id); + cxlr = __create_region(cxlrd, mode, id); if (IS_ERR(cxlr)) return PTR_ERR(cxlr); return len; } + +static ssize_t create_pmem_region_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + return create_region_store(dev, buf, len, CXL_PARTMODE_PMEM); +} DEVICE_ATTR_RW(create_pmem_region); static ssize_t create_ram_region_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { - struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev); - struct cxl_region *cxlr; - int rc, id; - - rc = sscanf(buf, "region%d\n", &id); - if (rc != 1) - return -EINVAL; - - cxlr = __create_region(cxlrd, CXL_DECODER_RAM, id); - if (IS_ERR(cxlr)) - return PTR_ERR(cxlr); - - return len; + return create_region_store(dev, buf, len, CXL_PARTMODE_RAM); } DEVICE_ATTR_RW(create_ram_region); @@ -2548,7 +2707,7 @@ bool is_cxl_pmem_region(struct device *dev) { return dev->type == &cxl_pmem_region_type; } -EXPORT_SYMBOL_NS_GPL(is_cxl_pmem_region, CXL); +EXPORT_SYMBOL_NS_GPL(is_cxl_pmem_region, "CXL"); struct cxl_pmem_region *to_cxl_pmem_region(struct device *dev) { @@ -2557,11 +2716,11 @@ struct cxl_pmem_region *to_cxl_pmem_region(struct device *dev) return NULL; return container_of(dev, struct cxl_pmem_region, dev); } -EXPORT_SYMBOL_NS_GPL(to_cxl_pmem_region, CXL); +EXPORT_SYMBOL_NS_GPL(to_cxl_pmem_region, "CXL"); struct cxl_poison_context { struct cxl_port *port; - enum cxl_decoder_mode mode; + int part; u64 offset; }; @@ -2569,47 +2728,45 @@ static int cxl_get_poison_unmapped(struct cxl_memdev *cxlmd, struct cxl_poison_context *ctx) { struct cxl_dev_state *cxlds = cxlmd->cxlds; + const struct resource *res; + struct resource *p, *last; u64 offset, length; int rc = 0; + if (ctx->part < 0) + return 0; + /* - * Collect poison for the remaining unmapped resources - * after poison is collected by committed endpoints. - * - * Knowing that PMEM must always follow RAM, get poison - * for unmapped resources based on the last decoder's mode: - * ram: scan remains of ram range, then any pmem range - * pmem: scan remains of pmem range + * Collect poison for the remaining unmapped resources after + * poison is collected by committed endpoints decoders. */ - - if (ctx->mode == CXL_DECODER_RAM) { - offset = ctx->offset; - length = resource_size(&cxlds->ram_res) - offset; + for (int i = ctx->part; i < cxlds->nr_partitions; i++) { + res = &cxlds->part[i].res; + for (p = res->child, last = NULL; p; p = p->sibling) + last = p; + if (last) + offset = last->end + 1; + else + offset = res->start; + length = res->end - offset + 1; + if (!length) + break; rc = cxl_mem_get_poison(cxlmd, offset, length, NULL); - if (rc == -EFAULT) - rc = 0; + if (rc == -EFAULT && cxlds->part[i].mode == CXL_PARTMODE_RAM) + continue; if (rc) - return rc; - } - if (ctx->mode == CXL_DECODER_PMEM) { - offset = ctx->offset; - length = resource_size(&cxlds->dpa_res) - offset; - if (!length) - return 0; - } else if (resource_size(&cxlds->pmem_res)) { - offset = cxlds->pmem_res.start; - length = resource_size(&cxlds->pmem_res); - } else { - return 0; + break; } - return cxl_mem_get_poison(cxlmd, offset, length, NULL); + return rc; } static int poison_by_decoder(struct device *dev, void *arg) { struct cxl_poison_context *ctx = arg; struct cxl_endpoint_decoder *cxled; + enum cxl_partition_mode mode; + struct cxl_dev_state *cxlds; struct cxl_memdev *cxlmd; u64 offset, length; int rc = 0; @@ -2618,27 +2775,18 @@ static int poison_by_decoder(struct device *dev, void *arg) return rc; cxled = to_cxl_endpoint_decoder(dev); - if (!cxled->dpa_res || !resource_size(cxled->dpa_res)) - return rc; - - /* - * Regions are only created with single mode decoders: pmem or ram. - * Linux does not support mixed mode decoders. This means that - * reading poison per endpoint decoder adheres to the requirement - * that poison reads of pmem and ram must be separated. - * CXL 3.0 Spec 8.2.9.8.4.1 - */ - if (cxled->mode == CXL_DECODER_MIXED) { - dev_dbg(dev, "poison list read unsupported in mixed mode\n"); + if (!cxled->dpa_res) return rc; - } cxlmd = cxled_to_memdev(cxled); + cxlds = cxlmd->cxlds; + mode = cxlds->part[cxled->part].mode; + if (cxled->skip) { offset = cxled->dpa_res->start - cxled->skip; length = cxled->skip; rc = cxl_mem_get_poison(cxlmd, offset, length, NULL); - if (rc == -EFAULT && cxled->mode == CXL_DECODER_RAM) + if (rc == -EFAULT && mode == CXL_PARTMODE_RAM) rc = 0; if (rc) return rc; @@ -2647,7 +2795,7 @@ static int poison_by_decoder(struct device *dev, void *arg) offset = cxled->dpa_res->start; length = cxled->dpa_res->end - offset + 1; rc = cxl_mem_get_poison(cxlmd, offset, length, cxled->cxld.region); - if (rc == -EFAULT && cxled->mode == CXL_DECODER_RAM) + if (rc == -EFAULT && mode == CXL_PARTMODE_RAM) rc = 0; if (rc) return rc; @@ -2655,7 +2803,7 @@ static int poison_by_decoder(struct device *dev, void *arg) /* Iterate until commit_end is reached */ if (cxled->cxld.id == ctx->port->commit_end) { ctx->offset = cxled->dpa_res->end + 1; - ctx->mode = cxled->mode; + ctx->part = cxled->part; return 1; } @@ -2668,7 +2816,8 @@ int cxl_get_poison_by_endpoint(struct cxl_port *port) int rc = 0; ctx = (struct cxl_poison_context) { - .port = port + .port = port, + .part = -1, }; rc = device_for_each_child(&port->dev, &ctx, poison_by_decoder); @@ -2679,28 +2828,167 @@ int cxl_get_poison_by_endpoint(struct cxl_port *port) return rc; } +struct cxl_dpa_to_region_context { + struct cxl_region *cxlr; + u64 dpa; +}; + +static int __cxl_dpa_to_region(struct device *dev, void *arg) +{ + struct cxl_dpa_to_region_context *ctx = arg; + struct cxl_endpoint_decoder *cxled; + struct cxl_region *cxlr; + u64 dpa = ctx->dpa; + + if (!is_endpoint_decoder(dev)) + return 0; + + cxled = to_cxl_endpoint_decoder(dev); + if (!cxled || !cxled->dpa_res || !resource_size(cxled->dpa_res)) + return 0; + + if (dpa > cxled->dpa_res->end || dpa < cxled->dpa_res->start) + return 0; + + /* + * Stop the region search (return 1) when an endpoint mapping is + * found. The region may not be fully constructed so offering + * the cxlr in the context structure is not guaranteed. + */ + cxlr = cxled->cxld.region; + if (cxlr) + dev_dbg(dev, "dpa:0x%llx mapped in region:%s\n", dpa, + dev_name(&cxlr->dev)); + else + dev_dbg(dev, "dpa:0x%llx mapped in endpoint:%s\n", dpa, + dev_name(dev)); + + ctx->cxlr = cxlr; + + return 1; +} + +struct cxl_region *cxl_dpa_to_region(const struct cxl_memdev *cxlmd, u64 dpa) +{ + struct cxl_dpa_to_region_context ctx; + struct cxl_port *port; + + ctx = (struct cxl_dpa_to_region_context) { + .dpa = dpa, + }; + port = cxlmd->endpoint; + if (port && is_cxl_endpoint(port) && cxl_num_decoders_committed(port)) + device_for_each_child(&port->dev, &ctx, __cxl_dpa_to_region); + + return ctx.cxlr; +} + +static bool cxl_is_hpa_in_chunk(u64 hpa, struct cxl_region *cxlr, int pos) +{ + struct cxl_region_params *p = &cxlr->params; + int gran = p->interleave_granularity; + int ways = p->interleave_ways; + u64 offset; + + /* Is the hpa in an expected chunk for its pos(-ition) */ + offset = hpa - p->res->start; + offset = do_div(offset, gran * ways); + if ((offset >= pos * gran) && (offset < (pos + 1) * gran)) + return true; + + dev_dbg(&cxlr->dev, + "Addr trans fail: hpa 0x%llx not in expected chunk\n", hpa); + + return false; +} + +u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, + u64 dpa) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); + u64 dpa_offset, hpa_offset, bits_upper, mask_upper, hpa; + struct cxl_region_params *p = &cxlr->params; + struct cxl_endpoint_decoder *cxled = NULL; + u16 eig = 0; + u8 eiw = 0; + int pos; + + for (int i = 0; i < p->nr_targets; i++) { + cxled = p->targets[i]; + if (cxlmd == cxled_to_memdev(cxled)) + break; + } + if (!cxled || cxlmd != cxled_to_memdev(cxled)) + return ULLONG_MAX; + + pos = cxled->pos; + ways_to_eiw(p->interleave_ways, &eiw); + granularity_to_eig(p->interleave_granularity, &eig); + + /* + * The device position in the region interleave set was removed + * from the offset at HPA->DPA translation. To reconstruct the + * HPA, place the 'pos' in the offset. + * + * The placement of 'pos' in the HPA is determined by interleave + * ways and granularity and is defined in the CXL Spec 3.0 Section + * 8.2.4.19.13 Implementation Note: Device Decode Logic + */ + + /* Remove the dpa base */ + dpa_offset = dpa - cxl_dpa_resource_start(cxled); + + mask_upper = GENMASK_ULL(51, eig + 8); + + if (eiw < 8) { + hpa_offset = (dpa_offset & mask_upper) << eiw; + hpa_offset |= pos << (eig + 8); + } else { + bits_upper = (dpa_offset & mask_upper) >> (eig + 8); + bits_upper = bits_upper * 3; + hpa_offset = ((bits_upper << (eiw - 8)) + pos) << (eig + 8); + } + + /* The lower bits remain unchanged */ + hpa_offset |= dpa_offset & GENMASK_ULL(eig + 7, 0); + + /* Apply the hpa_offset to the region base address */ + hpa = hpa_offset + p->res->start + p->cache_size; + + /* Root decoder translation overrides typical modulo decode */ + if (cxlrd->hpa_to_spa) + hpa = cxlrd->hpa_to_spa(cxlrd, hpa); + + if (hpa < p->res->start || hpa > p->res->end) { + dev_dbg(&cxlr->dev, + "Addr trans fail: hpa 0x%llx not in region\n", hpa); + return ULLONG_MAX; + } + + /* Simple chunk check, by pos & gran, only applies to modulo decodes */ + if (!cxlrd->hpa_to_spa && (!cxl_is_hpa_in_chunk(hpa, cxlr, pos))) + return ULLONG_MAX; + + return hpa; +} + static struct lock_class_key cxl_pmem_region_key; -static struct cxl_pmem_region *cxl_pmem_region_alloc(struct cxl_region *cxlr) +static int cxl_pmem_region_alloc(struct cxl_region *cxlr) { struct cxl_region_params *p = &cxlr->params; struct cxl_nvdimm_bridge *cxl_nvb; - struct cxl_pmem_region *cxlr_pmem; struct device *dev; int i; - down_read(&cxl_region_rwsem); - if (p->state != CXL_CONFIG_COMMIT) { - cxlr_pmem = ERR_PTR(-ENXIO); - goto out; - } + guard(rwsem_read)(&cxl_region_rwsem); + if (p->state != CXL_CONFIG_COMMIT) + return -ENXIO; - cxlr_pmem = kzalloc(struct_size(cxlr_pmem, mapping, p->nr_targets), - GFP_KERNEL); - if (!cxlr_pmem) { - cxlr_pmem = ERR_PTR(-ENOMEM); - goto out; - } + struct cxl_pmem_region *cxlr_pmem __free(kfree) = + kzalloc(struct_size(cxlr_pmem, mapping, p->nr_targets), GFP_KERNEL); + if (!cxlr_pmem) + return -ENOMEM; cxlr_pmem->hpa_range.start = p->res->start; cxlr_pmem->hpa_range.end = p->res->end; @@ -2717,11 +3005,9 @@ static struct cxl_pmem_region *cxl_pmem_region_alloc(struct cxl_region *cxlr) * bridge for one device is the same for all. */ if (i == 0) { - cxl_nvb = cxl_find_nvdimm_bridge(cxlmd); - if (!cxl_nvb) { - cxlr_pmem = ERR_PTR(-ENODEV); - goto out; - } + cxl_nvb = cxl_find_nvdimm_bridge(cxlmd->endpoint); + if (!cxl_nvb) + return -ENODEV; cxlr->cxl_nvb = cxl_nvb; } m->cxlmd = cxlmd; @@ -2732,18 +3018,16 @@ static struct cxl_pmem_region *cxl_pmem_region_alloc(struct cxl_region *cxlr) } dev = &cxlr_pmem->dev; - cxlr_pmem->cxlr = cxlr; - cxlr->cxlr_pmem = cxlr_pmem; device_initialize(dev); lockdep_set_class(&dev->mutex, &cxl_pmem_region_key); device_set_pm_not_required(dev); dev->parent = &cxlr->dev; dev->bus = &cxl_bus_type; dev->type = &cxl_pmem_region_type; -out: - up_read(&cxl_region_rwsem); + cxlr_pmem->cxlr = cxlr; + cxlr->cxlr_pmem = no_free_ptr(cxlr_pmem); - return cxlr_pmem; + return 0; } static void cxl_dax_region_release(struct device *dev) @@ -2776,7 +3060,7 @@ struct cxl_dax_region *to_cxl_dax_region(struct device *dev) return NULL; return container_of(dev, struct cxl_dax_region, dev); } -EXPORT_SYMBOL_NS_GPL(to_cxl_dax_region, CXL); +EXPORT_SYMBOL_NS_GPL(to_cxl_dax_region, "CXL"); static struct lock_class_key cxl_dax_region_key; @@ -2786,17 +3070,13 @@ static struct cxl_dax_region *cxl_dax_region_alloc(struct cxl_region *cxlr) struct cxl_dax_region *cxlr_dax; struct device *dev; - down_read(&cxl_region_rwsem); - if (p->state != CXL_CONFIG_COMMIT) { - cxlr_dax = ERR_PTR(-ENXIO); - goto out; - } + guard(rwsem_read)(&cxl_region_rwsem); + if (p->state != CXL_CONFIG_COMMIT) + return ERR_PTR(-ENXIO); cxlr_dax = kzalloc(sizeof(*cxlr_dax), GFP_KERNEL); - if (!cxlr_dax) { - cxlr_dax = ERR_PTR(-ENOMEM); - goto out; - } + if (!cxlr_dax) + return ERR_PTR(-ENOMEM); cxlr_dax->hpa_range.start = p->res->start; cxlr_dax->hpa_range.end = p->res->end; @@ -2809,8 +3089,6 @@ static struct cxl_dax_region *cxl_dax_region_alloc(struct cxl_region *cxlr) dev->parent = &cxlr->dev; dev->bus = &cxl_bus_type; dev->type = &cxl_dax_region_type; -out: - up_read(&cxl_region_rwsem); return cxlr_dax; } @@ -2838,11 +3116,11 @@ static void cxlr_release_nvdimm(void *_cxlr) struct cxl_region *cxlr = _cxlr; struct cxl_nvdimm_bridge *cxl_nvb = cxlr->cxl_nvb; - device_lock(&cxl_nvb->dev); - if (cxlr->cxlr_pmem) - devm_release_action(&cxl_nvb->dev, cxlr_pmem_unregister, - cxlr->cxlr_pmem); - device_unlock(&cxl_nvb->dev); + scoped_guard(device, &cxl_nvb->dev) { + if (cxlr->cxlr_pmem) + devm_release_action(&cxl_nvb->dev, cxlr_pmem_unregister, + cxlr->cxlr_pmem); + } cxlr->cxl_nvb = NULL; put_device(&cxl_nvb->dev); } @@ -2860,9 +3138,10 @@ static int devm_cxl_add_pmem_region(struct cxl_region *cxlr) struct device *dev; int rc; - cxlr_pmem = cxl_pmem_region_alloc(cxlr); - if (IS_ERR(cxlr_pmem)) - return PTR_ERR(cxlr_pmem); + rc = cxl_pmem_region_alloc(cxlr); + if (rc) + return rc; + cxlr_pmem = cxlr->cxlr_pmem; cxl_nvb = cxlr->cxl_nvb; dev = &cxlr_pmem->dev; @@ -2877,13 +3156,14 @@ static int devm_cxl_add_pmem_region(struct cxl_region *cxlr) dev_dbg(&cxlr->dev, "%s: register %s\n", dev_name(dev->parent), dev_name(dev)); - device_lock(&cxl_nvb->dev); - if (cxl_nvb->dev.driver) - rc = devm_add_action_or_reset(&cxl_nvb->dev, - cxlr_pmem_unregister, cxlr_pmem); - else - rc = -ENXIO; - device_unlock(&cxl_nvb->dev); + scoped_guard(device, &cxl_nvb->dev) { + if (cxl_nvb->dev.driver) + rc = devm_add_action_or_reset(&cxl_nvb->dev, + cxlr_pmem_unregister, + cxlr_pmem); + else + rc = -ENXIO; + } if (rc) goto err_bridge; @@ -2935,25 +3215,54 @@ err: return rc; } -static int match_root_decoder_by_range(struct device *dev, void *data) +static int match_decoder_by_range(struct device *dev, const void *data) { - struct range *r1, *r2 = data; - struct cxl_root_decoder *cxlrd; + const struct range *r1, *r2 = data; + struct cxl_decoder *cxld; - if (!is_root_decoder(dev)) + if (!is_switch_decoder(dev)) return 0; - cxlrd = to_cxl_root_decoder(dev); - r1 = &cxlrd->cxlsd.cxld.hpa_range; + cxld = to_cxl_decoder(dev); + r1 = &cxld->hpa_range; return range_contains(r1, r2); } -static int match_region_by_range(struct device *dev, void *data) +static struct cxl_decoder * +cxl_port_find_switch_decoder(struct cxl_port *port, struct range *hpa) +{ + struct device *cxld_dev = device_find_child(&port->dev, hpa, + match_decoder_by_range); + + return cxld_dev ? to_cxl_decoder(cxld_dev) : NULL; +} + +static struct cxl_root_decoder * +cxl_find_root_decoder(struct cxl_endpoint_decoder *cxled) +{ + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_port *port = cxled_to_port(cxled); + struct cxl_root *cxl_root __free(put_cxl_root) = find_cxl_root(port); + struct cxl_decoder *root, *cxld = &cxled->cxld; + struct range *hpa = &cxld->hpa_range; + + root = cxl_port_find_switch_decoder(&cxl_root->port, hpa); + if (!root) { + dev_err(cxlmd->dev.parent, + "%s:%s no CXL window for range %#llx:%#llx\n", + dev_name(&cxlmd->dev), dev_name(&cxld->dev), + cxld->hpa_range.start, cxld->hpa_range.end); + return NULL; + } + + return to_cxl_root_decoder(&root->dev); +} + +static int match_region_by_range(struct device *dev, const void *data) { struct cxl_region_params *p; struct cxl_region *cxlr; - struct range *r = data; - int rc = 0; + const struct range *r = data; if (!is_cxl_region(dev)) return 0; @@ -2961,60 +3270,96 @@ static int match_region_by_range(struct device *dev, void *data) cxlr = to_cxl_region(dev); p = &cxlr->params; - down_read(&cxl_region_rwsem); + guard(rwsem_read)(&cxl_region_rwsem); if (p->res && p->res->start == r->start && p->res->end == r->end) - rc = 1; - up_read(&cxl_region_rwsem); + return 1; - return rc; + return 0; } -/* Establish an empty region covering the given HPA range */ -static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd, - struct cxl_endpoint_decoder *cxled) +static int cxl_extended_linear_cache_resize(struct cxl_region *cxlr, + struct resource *res) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); + struct cxl_region_params *p = &cxlr->params; + int nid = phys_to_target_node(res->start); + resource_size_t size = resource_size(res); + resource_size_t cache_size, start; + int rc; + + rc = cxl_acpi_get_extended_linear_cache_size(res, nid, &cache_size); + if (rc) + return rc; + + if (!cache_size) + return 0; + + if (size != cache_size) { + dev_warn(&cxlr->dev, + "Extended Linear Cache size %pa != CXL size %pa. No Support!", + &cache_size, &size); + return -ENXIO; + } + + /* + * Move the start of the range to where the cache range starts. The + * implementation assumes that the cache range is in front of the + * CXL range. This is not dictated by the HMAT spec but is how the + * current known implementation is configured. + * + * The cache range is expected to be within the CFMWS. The adjusted + * res->start should not be less than cxlrd->res->start. + */ + start = res->start - cache_size; + if (start < cxlrd->res->start) + return -ENXIO; + + res->start = start; + p->cache_size = cache_size; + + return 0; +} + +static int __construct_region(struct cxl_region *cxlr, + struct cxl_root_decoder *cxlrd, + struct cxl_endpoint_decoder *cxled) { struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); - struct cxl_port *port = cxlrd_to_port(cxlrd); struct range *hpa = &cxled->cxld.hpa_range; struct cxl_region_params *p; - struct cxl_region *cxlr; struct resource *res; int rc; - do { - cxlr = __create_region(cxlrd, cxled->mode, - atomic_read(&cxlrd->region_id)); - } while (IS_ERR(cxlr) && PTR_ERR(cxlr) == -EBUSY); - - if (IS_ERR(cxlr)) { - dev_err(cxlmd->dev.parent, - "%s:%s: %s failed assign region: %ld\n", - dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), - __func__, PTR_ERR(cxlr)); - return cxlr; - } - - down_write(&cxl_region_rwsem); + guard(rwsem_write)(&cxl_region_rwsem); p = &cxlr->params; if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE) { dev_err(cxlmd->dev.parent, "%s:%s: %s autodiscovery interrupted\n", dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), __func__); - rc = -EBUSY; - goto err; + return -EBUSY; } set_bit(CXL_REGION_F_AUTO, &cxlr->flags); res = kmalloc(sizeof(*res), GFP_KERNEL); - if (!res) { - rc = -ENOMEM; - goto err; - } + if (!res) + return -ENOMEM; *res = DEFINE_RES_MEM_NAMED(hpa->start, range_len(hpa), dev_name(&cxlr->dev)); + + rc = cxl_extended_linear_cache_resize(cxlr, res); + if (rc && rc != -EOPNOTSUPP) { + /* + * Failing to support extended linear cache region resize does not + * prevent the region from functioning. Only causes cxl list showing + * incorrect region size. + */ + dev_warn(cxlmd->dev.parent, + "Extended linear cache calculation failed rc:%d\n", rc); + } + rc = insert_resource(cxlrd->res, res); if (rc) { /* @@ -3034,7 +3379,7 @@ static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd, rc = sysfs_update_group(&cxlr->dev.kobj, get_cxl_region_target_group()); if (rc) - goto err; + return rc; dev_dbg(cxlmd->dev.parent, "%s:%s: %s %s res: %pr iw: %d ig: %d\n", dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), __func__, @@ -3043,57 +3388,81 @@ static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd, /* ...to match put_device() in cxl_add_to_region() */ get_device(&cxlr->dev); - up_write(&cxl_region_rwsem); + + return 0; +} + +/* Establish an empty region covering the given HPA range */ +static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd, + struct cxl_endpoint_decoder *cxled) +{ + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_port *port = cxlrd_to_port(cxlrd); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + int rc, part = READ_ONCE(cxled->part); + struct cxl_region *cxlr; + + do { + cxlr = __create_region(cxlrd, cxlds->part[part].mode, + atomic_read(&cxlrd->region_id)); + } while (IS_ERR(cxlr) && PTR_ERR(cxlr) == -EBUSY); + + if (IS_ERR(cxlr)) { + dev_err(cxlmd->dev.parent, + "%s:%s: %s failed assign region: %ld\n", + dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), + __func__, PTR_ERR(cxlr)); + return cxlr; + } + + rc = __construct_region(cxlr, cxlrd, cxled); + if (rc) { + devm_release_action(port->uport_dev, unregister_region, cxlr); + return ERR_PTR(rc); + } return cxlr; +} -err: - up_write(&cxl_region_rwsem); - devm_release_action(port->uport_dev, unregister_region, cxlr); - return ERR_PTR(rc); +static struct cxl_region * +cxl_find_region_by_range(struct cxl_root_decoder *cxlrd, struct range *hpa) +{ + struct device *region_dev; + + region_dev = device_find_child(&cxlrd->cxlsd.cxld.dev, hpa, + match_region_by_range); + if (!region_dev) + return NULL; + + return to_cxl_region(region_dev); } -int cxl_add_to_region(struct cxl_port *root, struct cxl_endpoint_decoder *cxled) +int cxl_add_to_region(struct cxl_endpoint_decoder *cxled) { - struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); struct range *hpa = &cxled->cxld.hpa_range; - struct cxl_decoder *cxld = &cxled->cxld; - struct device *cxlrd_dev, *region_dev; - struct cxl_root_decoder *cxlrd; struct cxl_region_params *p; - struct cxl_region *cxlr; bool attach = false; int rc; - cxlrd_dev = device_find_child(&root->dev, &cxld->hpa_range, - match_root_decoder_by_range); - if (!cxlrd_dev) { - dev_err(cxlmd->dev.parent, - "%s:%s no CXL window for range %#llx:%#llx\n", - dev_name(&cxlmd->dev), dev_name(&cxld->dev), - cxld->hpa_range.start, cxld->hpa_range.end); + struct cxl_root_decoder *cxlrd __free(put_cxl_root_decoder) = + cxl_find_root_decoder(cxled); + if (!cxlrd) return -ENXIO; - } - - cxlrd = to_cxl_root_decoder(cxlrd_dev); /* * Ensure that if multiple threads race to construct_region() for @hpa * one does the construction and the others add to that. */ mutex_lock(&cxlrd->range_lock); - region_dev = device_find_child(&cxlrd->cxlsd.cxld.dev, hpa, - match_region_by_range); - if (!region_dev) { + struct cxl_region *cxlr __free(put_cxl_region) = + cxl_find_region_by_range(cxlrd, hpa); + if (!cxlr) cxlr = construct_region(cxlrd, cxled); - region_dev = &cxlr->dev; - } else - cxlr = to_cxl_region(region_dev); mutex_unlock(&cxlrd->range_lock); rc = PTR_ERR_OR_ZERO(cxlr); if (rc) - goto out; + return rc; attach_target(cxlr, cxled, -1, TASK_UNINTERRUPTIBLE); @@ -3113,12 +3482,37 @@ int cxl_add_to_region(struct cxl_port *root, struct cxl_endpoint_decoder *cxled) p->res); } - put_device(region_dev); -out: - put_device(cxlrd_dev); return rc; } -EXPORT_SYMBOL_NS_GPL(cxl_add_to_region, CXL); +EXPORT_SYMBOL_NS_GPL(cxl_add_to_region, "CXL"); + +u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint, u64 spa) +{ + struct cxl_region_ref *iter; + unsigned long index; + + if (!endpoint) + return ~0ULL; + + guard(rwsem_write)(&cxl_region_rwsem); + + xa_for_each(&endpoint->regions, index, iter) { + struct cxl_region_params *p = &iter->region->params; + + if (p->res->start <= spa && spa <= p->res->end) { + if (!p->cache_size) + return ~0ULL; + + if (spa >= p->res->start + p->cache_size) + return spa - p->cache_size; + + return spa + p->cache_size; + } + } + + return ~0ULL; +} +EXPORT_SYMBOL_NS_GPL(cxl_port_get_spa_cache_alias, "CXL"); static int is_system_ram(struct resource *res, void *arg) { @@ -3129,6 +3523,14 @@ static int is_system_ram(struct resource *res, void *arg) return 1; } +static void shutdown_notifiers(void *_cxlr) +{ + struct cxl_region *cxlr = _cxlr; + + unregister_memory_notifier(&cxlr->memory_notifier); + unregister_mt_adistance_algorithm(&cxlr->adist_notifier); +} + static int cxl_region_probe(struct device *dev) { struct cxl_region *cxlr = to_cxl_region(dev); @@ -3164,10 +3566,32 @@ out: if (rc) return rc; + cxlr->memory_notifier.notifier_call = cxl_region_perf_attrs_callback; + cxlr->memory_notifier.priority = CXL_CALLBACK_PRI; + register_memory_notifier(&cxlr->memory_notifier); + + cxlr->adist_notifier.notifier_call = cxl_region_calculate_adistance; + cxlr->adist_notifier.priority = 100; + register_mt_adistance_algorithm(&cxlr->adist_notifier); + + rc = devm_add_action_or_reset(&cxlr->dev, shutdown_notifiers, cxlr); + if (rc) + return rc; + switch (cxlr->mode) { - case CXL_DECODER_PMEM: + case CXL_PARTMODE_PMEM: + rc = devm_cxl_region_edac_register(cxlr); + if (rc) + dev_dbg(&cxlr->dev, "CXL EDAC registration for region_id=%d failed\n", + cxlr->id); + return devm_cxl_add_pmem_region(cxlr); - case CXL_DECODER_RAM: + case CXL_PARTMODE_RAM: + rc = devm_cxl_region_edac_register(cxlr); + if (rc) + dev_dbg(&cxlr->dev, "CXL EDAC registration for region_id=%d failed\n", + cxlr->id); + /* * The region can not be manged by CXL if any portion of * it is already online as 'System RAM' @@ -3201,6 +3625,6 @@ void cxl_region_exit(void) cxl_driver_unregister(&cxl_region_driver); } -MODULE_IMPORT_NS(CXL); -MODULE_IMPORT_NS(DEVMEM); +MODULE_IMPORT_NS("CXL"); +MODULE_IMPORT_NS("DEVMEM"); MODULE_ALIAS_CXL(CXL_DEVICE_REGION); |