arch/x86/kernel/cpu/resctrl/monitor.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403

// SPDX-License-Identifier: GPL-2.0-only
/*
 * Resource Director Technology(RDT)
 * - Monitoring code
 *
 * Copyright (C) 2017 Intel Corporation
 *
 * Author:
 *    Vikas Shivappa <vikas.shivappa@intel.com>
 *
 * This replaces the cqm.c based on perf but we reuse a lot of
 * code and datastructures originally from Peter Zijlstra and Matt Fleming.
 *
 * More information about RDT be found in the Intel (R) x86 Architecture
 * Software Developer Manual June 2016, volume 3, section 17.17.
 */

#define pr_fmt(fmt)	"resctrl: " fmt

#include <linux/cpu.h>
#include <linux/resctrl.h>

#include <asm/cpu_device_id.h>
#include <asm/msr.h>

#include "internal.h"

/*
 * Global boolean for rdt_monitor which is true if any
 * resource monitoring is enabled.
 */
bool rdt_mon_capable;

/*
 * Global to indicate which monitoring events are enabled.
 */
unsigned int rdt_mon_features;

#define CF(cf)	((unsigned long)(1048576 * (cf) + 0.5))

static int snc_nodes_per_l3_cache = 1;

/*
 * The correction factor table is documented in Documentation/filesystems/resctrl.rst.
 * If rmid > rmid threshold, MBM total and local values should be multiplied
 * by the correction factor.
 *
 * The original table is modified for better code:
 *
 * 1. The threshold 0 is changed to rmid count - 1 so don't do correction
 *    for the case.
 * 2. MBM total and local correction table indexed by core counter which is
 *    equal to (x86_cache_max_rmid + 1) / 8 - 1 and is from 0 up to 27.
 * 3. The correction factor is normalized to 2^20 (1048576) so it's faster
 *    to calculate corrected value by shifting:
 *    corrected_value = (original_value * correction_factor) >> 20
 */
static const struct mbm_correction_factor_table {
	u32 rmidthreshold;
	u64 cf;
} mbm_cf_table[] __initconst = {
	{7,	CF(1.000000)},
	{15,	CF(1.000000)},
	{15,	CF(0.969650)},
	{31,	CF(1.000000)},
	{31,	CF(1.066667)},
	{31,	CF(0.969650)},
	{47,	CF(1.142857)},
	{63,	CF(1.000000)},
	{63,	CF(1.185115)},
	{63,	CF(1.066553)},
	{79,	CF(1.454545)},
	{95,	CF(1.000000)},
	{95,	CF(1.230769)},
	{95,	CF(1.142857)},
	{95,	CF(1.066667)},
	{127,	CF(1.000000)},
	{127,	CF(1.254863)},
	{127,	CF(1.185255)},
	{151,	CF(1.000000)},
	{127,	CF(1.066667)},
	{167,	CF(1.000000)},
	{159,	CF(1.454334)},
	{183,	CF(1.000000)},
	{127,	CF(0.969744)},
	{191,	CF(1.280246)},
	{191,	CF(1.230921)},
	{215,	CF(1.000000)},
	{191,	CF(1.143118)},
};

static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX;

static u64 mbm_cf __read_mostly;

static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val)
{
	/* Correct MBM value. */
	if (rmid > mbm_cf_rmidthreshold)
		val = (val * mbm_cf) >> 20;

	return val;
}

/*
 * When Sub-NUMA Cluster (SNC) mode is not enabled (as indicated by
 * "snc_nodes_per_l3_cache == 1") no translation of the RMID value is
 * needed. The physical RMID is the same as the logical RMID.
 *
 * On a platform with SNC mode enabled, Linux enables RMID sharing mode
 * via MSR 0xCA0 (see the "RMID Sharing Mode" section in the "Intel
 * Resource Director Technology Architecture Specification" for a full
 * description of RMID sharing mode).
 *
 * In RMID sharing mode there are fewer "logical RMID" values available
 * to accumulate data ("physical RMIDs" are divided evenly between SNC
 * nodes that share an L3 cache). Linux creates an rdt_mon_domain for
 * each SNC node.
 *
 * The value loaded into IA32_PQR_ASSOC is the "logical RMID".
 *
 * Data is collected independently on each SNC node and can be retrieved
 * using the "physical RMID" value computed by this function and loaded
 * into IA32_QM_EVTSEL. @cpu can be any CPU in the SNC node.
 *
 * The scope of the IA32_QM_EVTSEL and IA32_QM_CTR MSRs is at the L3
 * cache.  So a "physical RMID" may be read from any CPU that shares
 * the L3 cache with the desired SNC node, not just from a CPU in
 * the specific SNC node.
 */
static int logical_rmid_to_physical_rmid(int cpu, int lrmid)
{
	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;

	if (snc_nodes_per_l3_cache == 1)
		return lrmid;

	return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->num_rmid;
}

static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val)
{
	u64 msr_val;

	/*
	 * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
	 * with a valid event code for supported resource type and the bits
	 * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID,
	 * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
	 * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
	 * are error bits.
	 */
	wrmsr(MSR_IA32_QM_EVTSEL, eventid, prmid);
	rdmsrq(MSR_IA32_QM_CTR, msr_val);

	if (msr_val & RMID_VAL_ERROR)
		return -EIO;
	if (msr_val & RMID_VAL_UNAVAIL)
		return -EINVAL;

	*val = msr_val;
	return 0;
}

static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_dom,
						 u32 rmid,
						 enum resctrl_event_id eventid)
{
	switch (eventid) {
	case QOS_L3_OCCUP_EVENT_ID:
		return NULL;
	case QOS_L3_MBM_TOTAL_EVENT_ID:
		return &hw_dom->arch_mbm_total[rmid];
	case QOS_L3_MBM_LOCAL_EVENT_ID:
		return &hw_dom->arch_mbm_local[rmid];
	default:
		/* Never expect to get here */
		WARN_ON_ONCE(1);
		return NULL;
	}
}

void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d,
			     u32 unused, u32 rmid,
			     enum resctrl_event_id eventid)
{
	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
	int cpu = cpumask_any(&d->hdr.cpu_mask);
	struct arch_mbm_state *am;
	u32 prmid;

	am = get_arch_mbm_state(hw_dom, rmid, eventid);
	if (am) {
		memset(am, 0, sizeof(*am));

		prmid = logical_rmid_to_physical_rmid(cpu, rmid);
		/* Record any initial, non-zero count value. */
		__rmid_read_phys(prmid, eventid, &am->prev_msr);
	}
}

/*
 * Assumes that hardware counters are also reset and thus that there is
 * no need to record initial non-zero counts.
 */
void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d)
{
	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);

	if (resctrl_arch_is_mbm_total_enabled())
		memset(hw_dom->arch_mbm_total, 0,
		       sizeof(*hw_dom->arch_mbm_total) * r->num_rmid);

	if (resctrl_arch_is_mbm_local_enabled())
		memset(hw_dom->arch_mbm_local, 0,
		       sizeof(*hw_dom->arch_mbm_local) * r->num_rmid);
}

static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
{
	u64 shift = 64 - width, chunks;

	chunks = (cur_msr << shift) - (prev_msr << shift);
	return chunks >> shift;
}

int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d,
			   u32 unused, u32 rmid, enum resctrl_event_id eventid,
			   u64 *val, void *ignored)
{
	struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
	int cpu = cpumask_any(&d->hdr.cpu_mask);
	struct arch_mbm_state *am;
	u64 msr_val, chunks;
	u32 prmid;
	int ret;

	resctrl_arch_rmid_read_context_check();

	prmid = logical_rmid_to_physical_rmid(cpu, rmid);
	ret = __rmid_read_phys(prmid, eventid, &msr_val);
	if (ret)
		return ret;

	am = get_arch_mbm_state(hw_dom, rmid, eventid);
	if (am) {
		am->chunks += mbm_overflow_count(am->prev_msr, msr_val,
						 hw_res->mbm_width);
		chunks = get_corrected_mbm_count(rmid, am->chunks);
		am->prev_msr = msr_val;
	} else {
		chunks = msr_val;
	}

	*val = chunks * hw_res->mon_scale;

	return 0;
}

/*
 * The power-on reset value of MSR_RMID_SNC_CONFIG is 0x1
 * which indicates that RMIDs are configured in legacy mode.
 * This mode is incompatible with Linux resctrl semantics
 * as RMIDs are partitioned between SNC nodes, which requires
 * a user to know which RMID is allocated to a task.
 * Clearing bit 0 reconfigures the RMID counters for use
 * in RMID sharing mode. This mode is better for Linux.
 * The RMID space is divided between all SNC nodes with the
 * RMIDs renumbered to start from zero in each node when
 * counting operations from tasks. Code to read the counters
 * must adjust RMID counter numbers based on SNC node. See
 * logical_rmid_to_physical_rmid() for code that does this.
 */
void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d)
{
	if (snc_nodes_per_l3_cache > 1)
		msr_clear_bit(MSR_RMID_SNC_CONFIG, 0);
}

/* CPU models that support MSR_RMID_SNC_CONFIG */
static const struct x86_cpu_id snc_cpu_ids[] __initconst = {
	X86_MATCH_VFM(INTEL_ICELAKE_X, 0),
	X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, 0),
	X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, 0),
	X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, 0),
	X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, 0),
	{}
};

/*
 * There isn't a simple hardware bit that indicates whether a CPU is running
 * in Sub-NUMA Cluster (SNC) mode. Infer the state by comparing the
 * number of CPUs sharing the L3 cache with CPU0 to the number of CPUs in
 * the same NUMA node as CPU0.
 * It is not possible to accurately determine SNC state if the system is
 * booted with a maxcpus=N parameter. That distorts the ratio of SNC nodes
 * to L3 caches. It will be OK if system is booted with hyperthreading
 * disabled (since this doesn't affect the ratio).
 */
static __init int snc_get_config(void)
{
	struct cacheinfo *ci = get_cpu_cacheinfo_level(0, RESCTRL_L3_CACHE);
	const cpumask_t *node0_cpumask;
	int cpus_per_node, cpus_per_l3;
	int ret;

	if (!x86_match_cpu(snc_cpu_ids) || !ci)
		return 1;

	cpus_read_lock();
	if (num_online_cpus() != num_present_cpus())
		pr_warn("Some CPUs offline, SNC detection may be incorrect\n");
	cpus_read_unlock();

	node0_cpumask = cpumask_of_node(cpu_to_node(0));

	cpus_per_node = cpumask_weight(node0_cpumask);
	cpus_per_l3 = cpumask_weight(&ci->shared_cpu_map);

	if (!cpus_per_node || !cpus_per_l3)
		return 1;

	ret = cpus_per_l3 / cpus_per_node;

	/* sanity check: Only valid results are 1, 2, 3, 4, 6 */
	switch (ret) {
	case 1:
		break;
	case 2 ... 4:
	case 6:
		pr_info("Sub-NUMA Cluster mode detected with %d nodes per L3 cache\n", ret);
		rdt_resources_all[RDT_RESOURCE_L3].r_resctrl.mon_scope = RESCTRL_L3_NODE;
		break;
	default:
		pr_warn("Ignore improbable SNC node count %d\n", ret);
		ret = 1;
		break;
	}

	return ret;
}

int __init rdt_get_mon_l3_config(struct rdt_resource *r)
{
	unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset;
	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
	unsigned int threshold;

	snc_nodes_per_l3_cache = snc_get_config();

	resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024;
	hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale / snc_nodes_per_l3_cache;
	r->num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache;
	hw_res->mbm_width = MBM_CNTR_WIDTH_BASE;

	if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX)
		hw_res->mbm_width += mbm_offset;
	else if (mbm_offset > MBM_CNTR_WIDTH_OFFSET_MAX)
		pr_warn("Ignoring impossible MBM counter offset\n");

	/*
	 * A reasonable upper limit on the max threshold is the number
	 * of lines tagged per RMID if all RMIDs have the same number of
	 * lines tagged in the LLC.
	 *
	 * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
	 */
	threshold = resctrl_rmid_realloc_limit / r->num_rmid;

	/*
	 * Because num_rmid may not be a power of two, round the value
	 * to the nearest multiple of hw_res->mon_scale so it matches a
	 * value the hardware will measure. mon_scale may not be a power of 2.
	 */
	resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold);

	if (rdt_cpu_has(X86_FEATURE_BMEC)) {
		u32 eax, ebx, ecx, edx;

		/* Detect list of bandwidth sources that can be tracked */
		cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx);
		r->mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS;
	}

	r->mon_capable = true;

	return 0;
}

void __init intel_rdt_mbm_apply_quirk(void)
{
	int cf_index;

	cf_index = (boot_cpu_data.x86_cache_max_rmid + 1) / 8 - 1;
	if (cf_index >= ARRAY_SIZE(mbm_cf_table)) {
		pr_info("No MBM correction factor available\n");
		return;
	}

	mbm_cf_rmidthreshold = mbm_cf_table[cf_index].rmidthreshold;
	mbm_cf = mbm_cf_table[cf_index].cf;
}