diff options
Diffstat (limited to 'arch/x86/kernel/cpu/resctrl/monitor.c')
-rw-r--r-- | arch/x86/kernel/cpu/resctrl/monitor.c | 782 |
1 files changed, 597 insertions, 185 deletions
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index f136ac046851..94a1d9780461 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -15,6 +15,9 @@ * Software Developer Manual June 2016, volume 3, section 17.17. */ +#define pr_fmt(fmt) "resctrl: " fmt + +#include <linux/cpu.h> #include <linux/module.h> #include <linux/sizes.h> #include <linux/slab.h> @@ -23,8 +26,22 @@ #include <asm/resctrl.h> #include "internal.h" - +#include "trace.h" + +/** + * struct rmid_entry - dirty tracking for all RMID. + * @closid: The CLOSID for this entry. + * @rmid: The RMID for this entry. + * @busy: The number of domains with cached data using this RMID. + * @list: Member of the rmid_free_lru list when busy == 0. + * + * Depending on the architecture the correct monitor is accessed using + * both @closid and @rmid, or @rmid only. + * + * Take the rdtgroup_mutex when accessing. + */ struct rmid_entry { + u32 closid; u32 rmid; int busy; struct list_head list; @@ -38,6 +55,13 @@ struct rmid_entry { static LIST_HEAD(rmid_free_lru); /* + * @closid_num_dirty_rmid The number of dirty RMID each CLOSID has. + * Only allocated when CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID is defined. + * Indexed by CLOSID. Protected by rdtgroup_mutex. + */ +static u32 *closid_num_dirty_rmid; + +/* * @rmid_limbo_count - count of currently unused but (potentially) * dirty RMIDs. * This counts RMIDs that no one is currently using but that @@ -75,6 +99,8 @@ unsigned int resctrl_rmid_realloc_limit; #define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5)) +static int snc_nodes_per_l3_cache = 1; + /* * The correction factor table is documented in Documentation/arch/x86/resctrl.rst. * If rmid > rmid threshold, MBM total and local values should be multiplied @@ -136,17 +162,70 @@ static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val) return val; } -static inline struct rmid_entry *__rmid_entry(u32 rmid) +/* + * x86 and arm64 differ in their handling of monitoring. + * x86's RMID are independent numbers, there is only one source of traffic + * with an RMID value of '1'. + * arm64's PMG extends the PARTID/CLOSID space, there are multiple sources of + * traffic with a PMG value of '1', one for each CLOSID, meaning the RMID + * value is no longer unique. + * To account for this, resctrl uses an index. On x86 this is just the RMID, + * on arm64 it encodes the CLOSID and RMID. This gives a unique number. + * + * The domain's rmid_busy_llc and rmid_ptrs[] are sized by index. The arch code + * must accept an attempt to read every index. + */ +static inline struct rmid_entry *__rmid_entry(u32 idx) { struct rmid_entry *entry; + u32 closid, rmid; + + entry = &rmid_ptrs[idx]; + resctrl_arch_rmid_idx_decode(idx, &closid, &rmid); - entry = &rmid_ptrs[rmid]; - WARN_ON(entry->rmid != rmid); + WARN_ON_ONCE(entry->closid != closid); + WARN_ON_ONCE(entry->rmid != rmid); return entry; } -static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val) +/* + * When Sub-NUMA Cluster (SNC) mode is not enabled (as indicated by + * "snc_nodes_per_l3_cache == 1") no translation of the RMID value is + * needed. The physical RMID is the same as the logical RMID. + * + * On a platform with SNC mode enabled, Linux enables RMID sharing mode + * via MSR 0xCA0 (see the "RMID Sharing Mode" section in the "Intel + * Resource Director Technology Architecture Specification" for a full + * description of RMID sharing mode). + * + * In RMID sharing mode there are fewer "logical RMID" values available + * to accumulate data ("physical RMIDs" are divided evenly between SNC + * nodes that share an L3 cache). Linux creates an rdt_mon_domain for + * each SNC node. + * + * The value loaded into IA32_PQR_ASSOC is the "logical RMID". + * + * Data is collected independently on each SNC node and can be retrieved + * using the "physical RMID" value computed by this function and loaded + * into IA32_QM_EVTSEL. @cpu can be any CPU in the SNC node. + * + * The scope of the IA32_QM_EVTSEL and IA32_QM_CTR MSRs is at the L3 + * cache. So a "physical RMID" may be read from any CPU that shares + * the L3 cache with the desired SNC node, not just from a CPU in + * the specific SNC node. + */ +static int logical_rmid_to_physical_rmid(int cpu, int lrmid) +{ + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + + if (snc_nodes_per_l3_cache == 1) + return lrmid; + + return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->num_rmid; +} + +static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val) { u64 msr_val; @@ -158,7 +237,7 @@ static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val) * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62) * are error bits. */ - wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid); + wrmsr(MSR_IA32_QM_EVTSEL, eventid, prmid); rdmsrl(MSR_IA32_QM_CTR, msr_val); if (msr_val & RMID_VAL_ERROR) @@ -170,7 +249,7 @@ static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val) return 0; } -static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_domain *hw_dom, +static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_dom, u32 rmid, enum resctrl_event_id eventid) { @@ -189,18 +268,22 @@ static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_domain *hw_dom, return NULL; } -void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d, - u32 rmid, enum resctrl_event_id eventid) +void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 unused, u32 rmid, + enum resctrl_event_id eventid) { - struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); + int cpu = cpumask_any(&d->hdr.cpu_mask); struct arch_mbm_state *am; + u32 prmid; am = get_arch_mbm_state(hw_dom, rmid, eventid); if (am) { memset(am, 0, sizeof(*am)); + prmid = logical_rmid_to_physical_rmid(cpu, rmid); /* Record any initial, non-zero count value. */ - __rmid_read(rmid, eventid, &am->prev_msr); + __rmid_read_phys(prmid, eventid, &am->prev_msr); } } @@ -208,9 +291,9 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d, * Assumes that hardware counters are also reset and thus that there is * no need to record initial non-zero counts. */ -void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_domain *d) +void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d) { - struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); if (is_mbm_total_enabled()) memset(hw_dom->arch_mbm_total, 0, @@ -229,19 +312,22 @@ static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) return chunks >> shift; } -int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, - u32 rmid, enum resctrl_event_id eventid, u64 *val) +int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 unused, u32 rmid, enum resctrl_event_id eventid, + u64 *val, void *ignored) { + struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); - struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + int cpu = cpumask_any(&d->hdr.cpu_mask); struct arch_mbm_state *am; u64 msr_val, chunks; + u32 prmid; int ret; - if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask)) - return -EINVAL; + resctrl_arch_rmid_read_context_check(); - ret = __rmid_read(rmid, eventid, &msr_val); + prmid = logical_rmid_to_physical_rmid(cpu, rmid); + ret = __rmid_read_phys(prmid, eventid, &msr_val); if (ret) return ret; @@ -260,20 +346,40 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, return 0; } +static void limbo_release_entry(struct rmid_entry *entry) +{ + lockdep_assert_held(&rdtgroup_mutex); + + rmid_limbo_count--; + list_add_tail(&entry->list, &rmid_free_lru); + + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) + closid_num_dirty_rmid[entry->closid]--; +} + /* * Check the RMIDs that are marked as busy for this domain. If the * reported LLC occupancy is below the threshold clear the busy bit and * decrement the count. If the busy count gets to zero on an RMID, we * free the RMID */ -void __check_limbo(struct rdt_domain *d, bool force_free) +void __check_limbo(struct rdt_mon_domain *d, bool force_free) { struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); struct rmid_entry *entry; - u32 crmid = 1, nrmid; + u32 idx, cur_idx = 1; + void *arch_mon_ctx; bool rmid_dirty; u64 val = 0; + arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, QOS_L3_OCCUP_EVENT_ID); + if (IS_ERR(arch_mon_ctx)) { + pr_warn_ratelimited("Failed to allocate monitor context: %ld", + PTR_ERR(arch_mon_ctx)); + return; + } + /* * Skip RMID 0 and start from RMID 1 and check all the RMIDs that * are marked as busy for occupancy < threshold. If the occupancy @@ -281,101 +387,187 @@ void __check_limbo(struct rdt_domain *d, bool force_free) * RMID and move it to the free list when the counter reaches 0. */ for (;;) { - nrmid = find_next_bit(d->rmid_busy_llc, r->num_rmid, crmid); - if (nrmid >= r->num_rmid) + idx = find_next_bit(d->rmid_busy_llc, idx_limit, cur_idx); + if (idx >= idx_limit) break; - entry = __rmid_entry(nrmid); - - if (resctrl_arch_rmid_read(r, d, entry->rmid, - QOS_L3_OCCUP_EVENT_ID, &val)) { + entry = __rmid_entry(idx); + if (resctrl_arch_rmid_read(r, d, entry->closid, entry->rmid, + QOS_L3_OCCUP_EVENT_ID, &val, + arch_mon_ctx)) { rmid_dirty = true; } else { rmid_dirty = (val >= resctrl_rmid_realloc_threshold); + + /* + * x86's CLOSID and RMID are independent numbers, so the entry's + * CLOSID is an empty CLOSID (X86_RESCTRL_EMPTY_CLOSID). On Arm the + * RMID (PMG) extends the CLOSID (PARTID) space with bits that aren't + * used to select the configuration. It is thus necessary to track both + * CLOSID and RMID because there may be dependencies between them + * on some architectures. + */ + trace_mon_llc_occupancy_limbo(entry->closid, entry->rmid, d->hdr.id, val); } if (force_free || !rmid_dirty) { - clear_bit(entry->rmid, d->rmid_busy_llc); - if (!--entry->busy) { - rmid_limbo_count--; - list_add_tail(&entry->list, &rmid_free_lru); - } + clear_bit(idx, d->rmid_busy_llc); + if (!--entry->busy) + limbo_release_entry(entry); } - crmid = nrmid + 1; + cur_idx = idx + 1; } + + resctrl_arch_mon_ctx_free(r, QOS_L3_OCCUP_EVENT_ID, arch_mon_ctx); } -bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d) +bool has_busy_rmid(struct rdt_mon_domain *d) { - return find_first_bit(d->rmid_busy_llc, r->num_rmid) != r->num_rmid; + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); + + return find_first_bit(d->rmid_busy_llc, idx_limit) != idx_limit; +} + +static struct rmid_entry *resctrl_find_free_rmid(u32 closid) +{ + struct rmid_entry *itr; + u32 itr_idx, cmp_idx; + + if (list_empty(&rmid_free_lru)) + return rmid_limbo_count ? ERR_PTR(-EBUSY) : ERR_PTR(-ENOSPC); + + list_for_each_entry(itr, &rmid_free_lru, list) { + /* + * Get the index of this free RMID, and the index it would need + * to be if it were used with this CLOSID. + * If the CLOSID is irrelevant on this architecture, the two + * index values are always the same on every entry and thus the + * very first entry will be returned. + */ + itr_idx = resctrl_arch_rmid_idx_encode(itr->closid, itr->rmid); + cmp_idx = resctrl_arch_rmid_idx_encode(closid, itr->rmid); + + if (itr_idx == cmp_idx) + return itr; + } + + return ERR_PTR(-ENOSPC); +} + +/** + * resctrl_find_cleanest_closid() - Find a CLOSID where all the associated + * RMID are clean, or the CLOSID that has + * the most clean RMID. + * + * MPAM's equivalent of RMID are per-CLOSID, meaning a freshly allocated CLOSID + * may not be able to allocate clean RMID. To avoid this the allocator will + * choose the CLOSID with the most clean RMID. + * + * When the CLOSID and RMID are independent numbers, the first free CLOSID will + * be returned. + */ +int resctrl_find_cleanest_closid(void) +{ + u32 cleanest_closid = ~0; + int i = 0; + + lockdep_assert_held(&rdtgroup_mutex); + + if (!IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) + return -EIO; + + for (i = 0; i < closids_supported(); i++) { + int num_dirty; + + if (closid_allocated(i)) + continue; + + num_dirty = closid_num_dirty_rmid[i]; + if (num_dirty == 0) + return i; + + if (cleanest_closid == ~0) + cleanest_closid = i; + + if (num_dirty < closid_num_dirty_rmid[cleanest_closid]) + cleanest_closid = i; + } + + if (cleanest_closid == ~0) + return -ENOSPC; + + return cleanest_closid; } /* - * As of now the RMIDs allocation is global. - * However we keep track of which packages the RMIDs - * are used to optimize the limbo list management. + * For MPAM the RMID value is not unique, and has to be considered with + * the CLOSID. The (CLOSID, RMID) pair is allocated on all domains, which + * allows all domains to be managed by a single free list. + * Each domain also has a rmid_busy_llc to reduce the work of the limbo handler. */ -int alloc_rmid(void) +int alloc_rmid(u32 closid) { struct rmid_entry *entry; lockdep_assert_held(&rdtgroup_mutex); - if (list_empty(&rmid_free_lru)) - return rmid_limbo_count ? -EBUSY : -ENOSPC; + entry = resctrl_find_free_rmid(closid); + if (IS_ERR(entry)) + return PTR_ERR(entry); - entry = list_first_entry(&rmid_free_lru, - struct rmid_entry, list); list_del(&entry->list); - return entry->rmid; } static void add_rmid_to_limbo(struct rmid_entry *entry) { struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; - struct rdt_domain *d; - int cpu, err; - u64 val = 0; + struct rdt_mon_domain *d; + u32 idx; - entry->busy = 0; - cpu = get_cpu(); - list_for_each_entry(d, &r->domains, list) { - if (cpumask_test_cpu(cpu, &d->cpu_mask)) { - err = resctrl_arch_rmid_read(r, d, entry->rmid, - QOS_L3_OCCUP_EVENT_ID, - &val); - if (err || val <= resctrl_rmid_realloc_threshold) - continue; - } + lockdep_assert_held(&rdtgroup_mutex); + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + + idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid); + + entry->busy = 0; + list_for_each_entry(d, &r->mon_domains, hdr.list) { /* * For the first limbo RMID in the domain, * setup up the limbo worker. */ - if (!has_busy_rmid(r, d)) - cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL); - set_bit(entry->rmid, d->rmid_busy_llc); + if (!has_busy_rmid(d)) + cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL, + RESCTRL_PICK_ANY_CPU); + set_bit(idx, d->rmid_busy_llc); entry->busy++; } - put_cpu(); - if (entry->busy) - rmid_limbo_count++; - else - list_add_tail(&entry->list, &rmid_free_lru); + rmid_limbo_count++; + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) + closid_num_dirty_rmid[entry->closid]++; } -void free_rmid(u32 rmid) +void free_rmid(u32 closid, u32 rmid) { + u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); struct rmid_entry *entry; - if (!rmid) - return; - lockdep_assert_held(&rdtgroup_mutex); - entry = __rmid_entry(rmid); + /* + * Do not allow the default rmid to be free'd. Comparing by index + * allows architectures that ignore the closid parameter to avoid an + * unnecessary check. + */ + if (!resctrl_arch_mon_capable() || + idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, + RESCTRL_RESERVED_RMID)) + return; + + entry = __rmid_entry(idx); if (is_llc_occupancy_enabled()) add_rmid_to_limbo(entry); @@ -383,44 +575,84 @@ void free_rmid(u32 rmid) list_add_tail(&entry->list, &rmid_free_lru); } -static struct mbm_state *get_mbm_state(struct rdt_domain *d, u32 rmid, - enum resctrl_event_id evtid) +static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid, + u32 rmid, enum resctrl_event_id evtid) { + u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); + switch (evtid) { case QOS_L3_MBM_TOTAL_EVENT_ID: - return &d->mbm_total[rmid]; + return &d->mbm_total[idx]; case QOS_L3_MBM_LOCAL_EVENT_ID: - return &d->mbm_local[rmid]; + return &d->mbm_local[idx]; default: return NULL; } } -static int __mon_event_count(u32 rmid, struct rmid_read *rr) +static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) { + int cpu = smp_processor_id(); + struct rdt_mon_domain *d; struct mbm_state *m; + int err, ret; u64 tval = 0; if (rr->first) { - resctrl_arch_reset_rmid(rr->r, rr->d, rmid, rr->evtid); - m = get_mbm_state(rr->d, rmid, rr->evtid); + resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid); + m = get_mbm_state(rr->d, closid, rmid, rr->evtid); if (m) memset(m, 0, sizeof(struct mbm_state)); return 0; } - rr->err = resctrl_arch_rmid_read(rr->r, rr->d, rmid, rr->evtid, &tval); - if (rr->err) - return rr->err; + if (rr->d) { + /* Reading a single domain, must be on a CPU in that domain. */ + if (!cpumask_test_cpu(cpu, &rr->d->hdr.cpu_mask)) + return -EINVAL; + rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, + rr->evtid, &tval, rr->arch_mon_ctx); + if (rr->err) + return rr->err; - rr->val += tval; + rr->val += tval; - return 0; + return 0; + } + + /* Summing domains that share a cache, must be on a CPU for that cache. */ + if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map)) + return -EINVAL; + + /* + * Legacy files must report the sum of an event across all + * domains that share the same L3 cache instance. + * Report success if a read from any domain succeeds, -EINVAL + * (translated to "Unavailable" for user space) if reading from + * all domains fail for any reason. + */ + ret = -EINVAL; + list_for_each_entry(d, &rr->r->mon_domains, hdr.list) { + if (d->ci->id != rr->ci->id) + continue; + err = resctrl_arch_rmid_read(rr->r, d, closid, rmid, + rr->evtid, &tval, rr->arch_mon_ctx); + if (!err) { + rr->val += tval; + ret = 0; + } + } + + if (ret) + rr->err = ret; + + return ret; } /* * mbm_bw_count() - Update bw count from values previously read by * __mon_event_count(). + * @closid: The closid used to identify the cached mbm_state. * @rmid: The rmid used to identify the cached mbm_state. * @rr: The struct rmid_read populated by __mon_event_count(). * @@ -429,10 +661,14 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr) * __mon_event_count() is compared with the chunks value from the previous * invocation. This must be called once per second to maintain values in MBps. */ -static void mbm_bw_count(u32 rmid, struct rmid_read *rr) +static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr) { - struct mbm_state *m = &rr->d->mbm_local[rmid]; u64 cur_bw, bytes, cur_bytes; + struct mbm_state *m; + + m = get_mbm_state(rr->d, closid, rmid, rr->evtid); + if (WARN_ON_ONCE(!m)) + return; cur_bytes = rr->val; bytes = cur_bytes - m->prev_bw_bytes; @@ -440,14 +676,11 @@ static void mbm_bw_count(u32 rmid, struct rmid_read *rr) cur_bw = bytes / SZ_1M; - if (m->delta_comp) - m->delta_bw = abs(cur_bw - m->prev_bw); - m->delta_comp = false; m->prev_bw = cur_bw; } /* - * This is called via IPI to read the CQM/MBM counters + * This is scheduled by mon_event_read() to read the CQM/MBM counters * on a domain. */ void mon_event_count(void *info) @@ -459,7 +692,7 @@ void mon_event_count(void *info) rdtgrp = rr->rgrp; - ret = __mon_event_count(rdtgrp->mon.rmid, rr); + ret = __mon_event_count(rdtgrp->closid, rdtgrp->mon.rmid, rr); /* * For Ctrl groups read data from child monitor groups and @@ -470,7 +703,8 @@ void mon_event_count(void *info) if (rdtgrp->type == RDTCTRL_GROUP) { list_for_each_entry(entry, head, mon.crdtgrp_list) { - if (__mon_event_count(entry->mon.rmid, rr) == 0) + if (__mon_event_count(entry->closid, entry->mon.rmid, + rr) == 0) ret = 0; } } @@ -516,26 +750,27 @@ void mon_event_count(void *info) * throttle MSRs already have low percentage values. To avoid * unnecessarily restricting such rdtgroups, we also increase the bandwidth. */ -static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) +static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm) { u32 closid, rmid, cur_msr_val, new_msr_val; struct mbm_state *pmbm_data, *cmbm_data; - u32 cur_bw, delta_bw, user_bw; + struct rdt_ctrl_domain *dom_mba; + enum resctrl_event_id evt_id; struct rdt_resource *r_mba; - struct rdt_domain *dom_mba; struct list_head *head; struct rdtgroup *entry; - - if (!is_mbm_local_enabled()) - return; + u32 cur_bw, user_bw; r_mba = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl; + evt_id = rgrp->mba_mbps_event; closid = rgrp->closid; rmid = rgrp->mon.rmid; - pmbm_data = &dom_mbm->mbm_local[rmid]; + pmbm_data = get_mbm_state(dom_mbm, closid, rmid, evt_id); + if (WARN_ON_ONCE(!pmbm_data)) + return; - dom_mba = get_domain_from_cpu(smp_processor_id(), r_mba); + dom_mba = get_ctrl_domain_from_cpu(smp_processor_id(), r_mba); if (!dom_mba) { pr_warn_once("Failure to get domain for MBA update\n"); return; @@ -543,7 +778,6 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) cur_bw = pmbm_data->prev_bw; user_bw = dom_mba->mbps_val[closid]; - delta_bw = pmbm_data->delta_bw; /* MBA resource doesn't support CDP */ cur_msr_val = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE); @@ -553,83 +787,76 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) */ head = &rgrp->mon.crdtgrp_list; list_for_each_entry(entry, head, mon.crdtgrp_list) { - cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid]; + cmbm_data = get_mbm_state(dom_mbm, entry->closid, entry->mon.rmid, evt_id); + if (WARN_ON_ONCE(!cmbm_data)) + return; cur_bw += cmbm_data->prev_bw; - delta_bw += cmbm_data->delta_bw; } /* * Scale up/down the bandwidth linearly for the ctrl group. The * bandwidth step is the bandwidth granularity specified by the * hardware. - * - * The delta_bw is used when increasing the bandwidth so that we - * dont alternately increase and decrease the control values - * continuously. - * - * For ex: consider cur_bw = 90MBps, user_bw = 100MBps and if - * bandwidth step is 20MBps(> user_bw - cur_bw), we would keep - * switching between 90 and 110 continuously if we only check - * cur_bw < user_bw. + * Always increase throttling if current bandwidth is above the + * target set by user. + * But avoid thrashing up and down on every poll by checking + * whether a decrease in throttling is likely to push the group + * back over target. E.g. if currently throttling to 30% of bandwidth + * on a system with 10% granularity steps, check whether moving to + * 40% would go past the limit by multiplying current bandwidth by + * "(30 + 10) / 30". */ if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) { new_msr_val = cur_msr_val - r_mba->membw.bw_gran; } else if (cur_msr_val < MAX_MBA_BW && - (user_bw > (cur_bw + delta_bw))) { + (user_bw > (cur_bw * (cur_msr_val + r_mba->membw.min_bw) / cur_msr_val))) { new_msr_val = cur_msr_val + r_mba->membw.bw_gran; } else { return; } resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val); - - /* - * Delta values are updated dynamically package wise for each - * rdtgrp every time the throttle MSR changes value. - * - * This is because (1)the increase in bandwidth is not perfectly - * linear and only "approximately" linear even when the hardware - * says it is linear.(2)Also since MBA is a core specific - * mechanism, the delta values vary based on number of cores used - * by the rdtgrp. - */ - pmbm_data->delta_comp = true; - list_for_each_entry(entry, head, mon.crdtgrp_list) { - cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid]; - cmbm_data->delta_comp = true; - } } -static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid) +static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 closid, u32 rmid, enum resctrl_event_id evtid) { - struct rmid_read rr; + struct rmid_read rr = {0}; - rr.first = false; rr.r = r; rr.d = d; + rr.evtid = evtid; + rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); + if (IS_ERR(rr.arch_mon_ctx)) { + pr_warn_ratelimited("Failed to allocate monitor context: %ld", + PTR_ERR(rr.arch_mon_ctx)); + return; + } + + __mon_event_count(closid, rmid, &rr); /* - * This is protected from concurrent reads from user - * as both the user and we hold the global mutex. + * If the software controller is enabled, compute the + * bandwidth for this event id. */ - if (is_mbm_total_enabled()) { - rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID; - rr.val = 0; - __mon_event_count(rmid, &rr); - } - if (is_mbm_local_enabled()) { - rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID; - rr.val = 0; - __mon_event_count(rmid, &rr); + if (is_mba_sc(NULL)) + mbm_bw_count(closid, rmid, &rr); - /* - * Call the MBA software controller only for the - * control groups and when user has enabled - * the software controller explicitly. - */ - if (is_mba_sc(NULL)) - mbm_bw_count(rmid, &rr); - } + resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); +} + +static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 closid, u32 rmid) +{ + /* + * This is protected from concurrent reads from user as both + * the user and overflow handler hold the global mutex. + */ + if (is_mbm_total_enabled()) + mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID); + + if (is_mbm_local_enabled()) + mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID); } /* @@ -639,106 +866,193 @@ static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid) void cqm_handle_limbo(struct work_struct *work) { unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL); - int cpu = smp_processor_id(); - struct rdt_resource *r; - struct rdt_domain *d; + struct rdt_mon_domain *d; + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); - r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; - d = container_of(work, struct rdt_domain, cqm_limbo.work); + d = container_of(work, struct rdt_mon_domain, cqm_limbo.work); __check_limbo(d, false); - if (has_busy_rmid(r, d)) - schedule_delayed_work_on(cpu, &d->cqm_limbo, delay); + if (has_busy_rmid(d)) { + d->cqm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask, + RESCTRL_PICK_ANY_CPU); + schedule_delayed_work_on(d->cqm_work_cpu, &d->cqm_limbo, + delay); + } mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); } -void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms) +/** + * cqm_setup_limbo_handler() - Schedule the limbo handler to run for this + * domain. + * @dom: The domain the limbo handler should run for. + * @delay_ms: How far in the future the handler should run. + * @exclude_cpu: Which CPU the handler should not run on, + * RESCTRL_PICK_ANY_CPU to pick any CPU. + */ +void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, + int exclude_cpu) { unsigned long delay = msecs_to_jiffies(delay_ms); int cpu; - cpu = cpumask_any(&dom->cpu_mask); + cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu); dom->cqm_work_cpu = cpu; - schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay); + if (cpu < nr_cpu_ids) + schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay); } void mbm_handle_overflow(struct work_struct *work) { unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL); struct rdtgroup *prgrp, *crgrp; - int cpu = smp_processor_id(); + struct rdt_mon_domain *d; struct list_head *head; struct rdt_resource *r; - struct rdt_domain *d; + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); - if (!static_branch_likely(&rdt_mon_enable_key)) + /* + * If the filesystem has been unmounted this work no longer needs to + * run. + */ + if (!resctrl_mounted || !resctrl_arch_mon_capable()) goto out_unlock; r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; - d = container_of(work, struct rdt_domain, mbm_over.work); + d = container_of(work, struct rdt_mon_domain, mbm_over.work); list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { - mbm_update(r, d, prgrp->mon.rmid); + mbm_update(r, d, prgrp->closid, prgrp->mon.rmid); head = &prgrp->mon.crdtgrp_list; list_for_each_entry(crgrp, head, mon.crdtgrp_list) - mbm_update(r, d, crgrp->mon.rmid); + mbm_update(r, d, crgrp->closid, crgrp->mon.rmid); if (is_mba_sc(NULL)) update_mba_bw(prgrp, d); } - schedule_delayed_work_on(cpu, &d->mbm_over, delay); + /* + * Re-check for housekeeping CPUs. This allows the overflow handler to + * move off a nohz_full CPU quickly. + */ + d->mbm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask, + RESCTRL_PICK_ANY_CPU); + schedule_delayed_work_on(d->mbm_work_cpu, &d->mbm_over, delay); out_unlock: mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); } -void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms) +/** + * mbm_setup_overflow_handler() - Schedule the overflow handler to run for this + * domain. + * @dom: The domain the overflow handler should run for. + * @delay_ms: How far in the future the handler should run. + * @exclude_cpu: Which CPU the handler should not run on, + * RESCTRL_PICK_ANY_CPU to pick any CPU. + */ +void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, + int exclude_cpu) { unsigned long delay = msecs_to_jiffies(delay_ms); int cpu; - if (!static_branch_likely(&rdt_mon_enable_key)) + /* + * When a domain comes online there is no guarantee the filesystem is + * mounted. If not, there is no need to catch counter overflow. + */ + if (!resctrl_mounted || !resctrl_arch_mon_capable()) return; - cpu = cpumask_any(&dom->cpu_mask); + cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu); dom->mbm_work_cpu = cpu; - schedule_delayed_work_on(cpu, &dom->mbm_over, delay); + + if (cpu < nr_cpu_ids) + schedule_delayed_work_on(cpu, &dom->mbm_over, delay); } static int dom_data_init(struct rdt_resource *r) { + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); + u32 num_closid = resctrl_arch_get_num_closid(r); struct rmid_entry *entry = NULL; - int i, nr_rmids; + int err = 0, i; + u32 idx; - nr_rmids = r->num_rmid; - rmid_ptrs = kcalloc(nr_rmids, sizeof(struct rmid_entry), GFP_KERNEL); - if (!rmid_ptrs) - return -ENOMEM; + mutex_lock(&rdtgroup_mutex); + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { + u32 *tmp; - for (i = 0; i < nr_rmids; i++) { + /* + * If the architecture hasn't provided a sanitised value here, + * this may result in larger arrays than necessary. Resctrl will + * use a smaller system wide value based on the resources in + * use. + */ + tmp = kcalloc(num_closid, sizeof(*tmp), GFP_KERNEL); + if (!tmp) { + err = -ENOMEM; + goto out_unlock; + } + + closid_num_dirty_rmid = tmp; + } + + rmid_ptrs = kcalloc(idx_limit, sizeof(struct rmid_entry), GFP_KERNEL); + if (!rmid_ptrs) { + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { + kfree(closid_num_dirty_rmid); + closid_num_dirty_rmid = NULL; + } + err = -ENOMEM; + goto out_unlock; + } + + for (i = 0; i < idx_limit; i++) { entry = &rmid_ptrs[i]; INIT_LIST_HEAD(&entry->list); - entry->rmid = i; + resctrl_arch_rmid_idx_decode(i, &entry->closid, &entry->rmid); list_add_tail(&entry->list, &rmid_free_lru); } /* - * RMID 0 is special and is always allocated. It's used for all - * tasks that are not monitored. + * RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID are special and + * are always allocated. These are used for the rdtgroup_default + * control group, which will be setup later in rdtgroup_init(). */ - entry = __rmid_entry(0); + idx = resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, + RESCTRL_RESERVED_RMID); + entry = __rmid_entry(idx); list_del(&entry->list); - return 0; +out_unlock: + mutex_unlock(&rdtgroup_mutex); + + return err; +} + +static void __exit dom_data_exit(void) +{ + mutex_lock(&rdtgroup_mutex); + + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { + kfree(closid_num_dirty_rmid); + closid_num_dirty_rmid = NULL; + } + + kfree(rmid_ptrs); + rmid_ptrs = NULL; + + mutex_unlock(&rdtgroup_mutex); } static struct mon_evt llc_occupancy_event = { @@ -775,6 +1089,89 @@ static void l3_mon_evt_init(struct rdt_resource *r) list_add_tail(&mbm_local_event.list, &r->evt_list); } +/* + * The power-on reset value of MSR_RMID_SNC_CONFIG is 0x1 + * which indicates that RMIDs are configured in legacy mode. + * This mode is incompatible with Linux resctrl semantics + * as RMIDs are partitioned between SNC nodes, which requires + * a user to know which RMID is allocated to a task. + * Clearing bit 0 reconfigures the RMID counters for use + * in RMID sharing mode. This mode is better for Linux. + * The RMID space is divided between all SNC nodes with the + * RMIDs renumbered to start from zero in each node when + * counting operations from tasks. Code to read the counters + * must adjust RMID counter numbers based on SNC node. See + * logical_rmid_to_physical_rmid() for code that does this. + */ +void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d) +{ + if (snc_nodes_per_l3_cache > 1) + msr_clear_bit(MSR_RMID_SNC_CONFIG, 0); +} + +/* CPU models that support MSR_RMID_SNC_CONFIG */ +static const struct x86_cpu_id snc_cpu_ids[] __initconst = { + X86_MATCH_VFM(INTEL_ICELAKE_X, 0), + X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, 0), + X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, 0), + X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, 0), + X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, 0), + {} +}; + +/* + * There isn't a simple hardware bit that indicates whether a CPU is running + * in Sub-NUMA Cluster (SNC) mode. Infer the state by comparing the + * number of CPUs sharing the L3 cache with CPU0 to the number of CPUs in + * the same NUMA node as CPU0. + * It is not possible to accurately determine SNC state if the system is + * booted with a maxcpus=N parameter. That distorts the ratio of SNC nodes + * to L3 caches. It will be OK if system is booted with hyperthreading + * disabled (since this doesn't affect the ratio). + */ +static __init int snc_get_config(void) +{ + struct cacheinfo *ci = get_cpu_cacheinfo_level(0, RESCTRL_L3_CACHE); + const cpumask_t *node0_cpumask; + int cpus_per_node, cpus_per_l3; + int ret; + + if (!x86_match_cpu(snc_cpu_ids) || !ci) + return 1; + + cpus_read_lock(); + if (num_online_cpus() != num_present_cpus()) + pr_warn("Some CPUs offline, SNC detection may be incorrect\n"); + cpus_read_unlock(); + + node0_cpumask = cpumask_of_node(cpu_to_node(0)); + + cpus_per_node = cpumask_weight(node0_cpumask); + cpus_per_l3 = cpumask_weight(&ci->shared_cpu_map); + + if (!cpus_per_node || !cpus_per_l3) + return 1; + + ret = cpus_per_l3 / cpus_per_node; + + /* sanity check: Only valid results are 1, 2, 3, 4, 6 */ + switch (ret) { + case 1: + break; + case 2 ... 4: + case 6: + pr_info("Sub-NUMA Cluster mode detected with %d nodes per L3 cache\n", ret); + rdt_resources_all[RDT_RESOURCE_L3].r_resctrl.mon_scope = RESCTRL_L3_NODE; + break; + default: + pr_warn("Ignore improbable SNC node count %d\n", ret); + ret = 1; + break; + } + + return ret; +} + int __init rdt_get_mon_l3_config(struct rdt_resource *r) { unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset; @@ -782,9 +1179,11 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) unsigned int threshold; int ret; + snc_nodes_per_l3_cache = snc_get_config(); + resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024; - hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale; - r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1; + hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale / snc_nodes_per_l3_cache; + r->num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache; hw_res->mbm_width = MBM_CNTR_WIDTH_BASE; if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX) @@ -813,13 +1212,21 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) return ret; if (rdt_cpu_has(X86_FEATURE_BMEC)) { + u32 eax, ebx, ecx, edx; + + /* Detect list of bandwidth sources that can be tracked */ + cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx); + hw_res->mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS; + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) { mbm_total_event.configurable = true; - mbm_config_rftype_init("mbm_total_bytes_config"); + resctrl_file_fflags_init("mbm_total_bytes_config", + RFTYPE_MON_INFO | RFTYPE_RES_CACHE); } if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) { mbm_local_event.configurable = true; - mbm_config_rftype_init("mbm_local_bytes_config"); + resctrl_file_fflags_init("mbm_local_bytes_config", + RFTYPE_MON_INFO | RFTYPE_RES_CACHE); } } @@ -830,6 +1237,11 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) return 0; } +void __exit rdt_put_mon_l3_config(void) +{ + dom_data_exit(); +} + void __init intel_rdt_mbm_apply_quirk(void) { int cf_index; |