Diffstat (limited to 'arch/x86/kernel/cpu/resctrl')
-rw-r--r--  arch/x86/kernel/cpu/resctrl/Makefile                 7
-rw-r--r--  arch/x86/kernel/cpu/resctrl/core.c                1079
-rw-r--r--  arch/x86/kernel/cpu/resctrl/ctrlmondata.c          133
-rw-r--r--  arch/x86/kernel/cpu/resctrl/internal.h             225
-rw-r--r--  arch/x86/kernel/cpu/resctrl/monitor.c              583
-rw-r--r--  arch/x86/kernel/cpu/resctrl/pseudo_lock.c          517
-rw-r--r--  arch/x86/kernel/cpu/resctrl/pseudo_lock_trace.h     45
-rw-r--r--  arch/x86/kernel/cpu/resctrl/rdtgroup.c             262
8 files changed, 2851 insertions, 0 deletions
diff --git a/arch/x86/kernel/cpu/resctrl/Makefile b/arch/x86/kernel/cpu/resctrl/Makefile
new file mode 100644
index 000000000000..d8a04b195da2
--- /dev/null
+++ b/arch/x86/kernel/cpu/resctrl/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_X86_CPU_RESCTRL) += core.o rdtgroup.o monitor.o
+obj-$(CONFIG_X86_CPU_RESCTRL) += ctrlmondata.o
+obj-$(CONFIG_RESCTRL_FS_PSEUDO_LOCK) += pseudo_lock.o
+
+# To allow define_trace.h's recursive include:
+CFLAGS_pseudo_lock.o = -I$(src)
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
new file mode 100644
index 000000000000..3792ab4819dc
--- /dev/null
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -0,0 +1,1079 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Resource Director Technology(RDT)
+ * - Cache Allocation code.
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Authors:
+ * Fenghua Yu <fenghua.yu@intel.com>
+ * Tony Luck <tony.luck@intel.com>
+ * Vikas Shivappa <vikas.shivappa@intel.com>
+ *
+ * More information about RDT can be found in the Intel (R) x86 Architecture
+ * Software Developer Manual June 2016, volume 3, section 17.17.
+ */
+
+#define pr_fmt(fmt) "resctrl: " fmt
+
+#include <linux/cpu.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/cpuhotplug.h>
+
+#include <asm/cpu_device_id.h>
+#include <asm/msr.h>
+#include <asm/resctrl.h>
+#include "internal.h"
+
+/*
+ * rdt_domain structures are kfree()d when their last CPU goes offline,
+ * and allocated when the first CPU in a new domain comes online.
+ * The rdt_resource's domain list is updated when this happens. Readers of
+ * the domain list must either take cpus_read_lock(), or rely on an RCU
+ * read-side critical section, to avoid observing concurrent modification.
+ * All writers take this mutex:
+ */
+static DEFINE_MUTEX(domain_list_lock);
+
+/*
+ * The cached resctrl_pqr_state is strictly per CPU and can never be
+ * updated from a remote CPU. Functions which modify the state
+ * are called with interrupts disabled and no preemption, which
+ * is sufficient for the protection.
+ */
+DEFINE_PER_CPU(struct resctrl_pqr_state, pqr_state);
+
+/*
+ * Global boolean for rdt_alloc which is true if any
+ * resource allocation is enabled.
+ */
+bool rdt_alloc_capable;
+
+static void mba_wrmsr_intel(struct msr_param *m);
+static void cat_wrmsr(struct msr_param *m);
+static void mba_wrmsr_amd(struct msr_param *m);
+
+#define ctrl_domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.ctrl_domains)
+#define mon_domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.mon_domains)
+
+struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = {
+ [RDT_RESOURCE_L3] =
+ {
+ .r_resctrl = {
+ .name = "L3",
+ .ctrl_scope = RESCTRL_L3_CACHE,
+ .mon_scope = RESCTRL_L3_CACHE,
+ .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_L3),
+ .mon_domains = mon_domain_init(RDT_RESOURCE_L3),
+ .schema_fmt = RESCTRL_SCHEMA_BITMAP,
+ },
+ .msr_base = MSR_IA32_L3_CBM_BASE,
+ .msr_update = cat_wrmsr,
+ },
+ [RDT_RESOURCE_L2] =
+ {
+ .r_resctrl = {
+ .name = "L2",
+ .ctrl_scope = RESCTRL_L2_CACHE,
+ .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_L2),
+ .schema_fmt = RESCTRL_SCHEMA_BITMAP,
+ },
+ .msr_base = MSR_IA32_L2_CBM_BASE,
+ .msr_update = cat_wrmsr,
+ },
+ [RDT_RESOURCE_MBA] =
+ {
+ .r_resctrl = {
+ .name = "MB",
+ .ctrl_scope = RESCTRL_L3_CACHE,
+ .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_MBA),
+ .schema_fmt = RESCTRL_SCHEMA_RANGE,
+ },
+ },
+ [RDT_RESOURCE_SMBA] =
+ {
+ .r_resctrl = {
+ .name = "SMBA",
+ .ctrl_scope = RESCTRL_L3_CACHE,
+ .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_SMBA),
+ .schema_fmt = RESCTRL_SCHEMA_RANGE,
+ },
+ },
+};
+
+u32 resctrl_arch_system_num_rmid_idx(void)
+{
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+
+	/* RMIDs are independent numbers on x86. num_rmid_idx == num_rmid */
+ return r->mon.num_rmid;
+}
+
+struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l)
+{
+ if (l >= RDT_NUM_RESOURCES)
+ return NULL;
+
+ return &rdt_resources_all[l].r_resctrl;
+}
+
+/*
+ * cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs
+ * as they do not have CPUID enumeration support for Cache allocation.
+ * The check for Vendor/Family/Model is not enough to guarantee that
+ * the MSRs won't #GP fault because only the following SKUs support
+ * CAT:
+ * Intel(R) Xeon(R) CPU E5-2658 v3 @ 2.20GHz
+ * Intel(R) Xeon(R) CPU E5-2648L v3 @ 1.80GHz
+ * Intel(R) Xeon(R) CPU E5-2628L v3 @ 2.00GHz
+ * Intel(R) Xeon(R) CPU E5-2618L v3 @ 2.30GHz
+ * Intel(R) Xeon(R) CPU E5-2608L v3 @ 2.00GHz
+ * Intel(R) Xeon(R) CPU E5-2658A v3 @ 2.20GHz
+ *
+ * Probe by trying to write the first of the L3 cache mask registers
+ * and checking that the bits stick. Max CLOSids is always 4 and max cbm length
+ * is always 20 on hsw server parts. The minimum cache bitmask length
+ * allowed for HSW server is always 2 bits. Hardcode all of them.
+ */
+static inline void cache_alloc_hsw_probe(void)
+{
+ struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_L3];
+ struct rdt_resource *r = &hw_res->r_resctrl;
+ u64 max_cbm = BIT_ULL_MASK(20) - 1, l3_cbm_0;
+
+ if (wrmsrq_safe(MSR_IA32_L3_CBM_BASE, max_cbm))
+ return;
+
+ rdmsrq(MSR_IA32_L3_CBM_BASE, l3_cbm_0);
+
+ /* If all the bits were set in MSR, return success */
+ if (l3_cbm_0 != max_cbm)
+ return;
+
+ hw_res->num_closid = 4;
+ r->cache.cbm_len = 20;
+ r->cache.shareable_bits = 0xc0000;
+ r->cache.min_cbm_bits = 2;
+ r->cache.arch_has_sparse_bitmasks = false;
+ r->alloc_capable = true;
+
+ rdt_alloc_capable = true;
+}
+
+/*
+ * rdt_get_mb_table() - get a mapping between the bandwidth (b/w) percentage
+ * values exposed to the user interface and the h/w understandable delay values.
+ *
+ * The non-linear delay values have a power-of-two granularity, and the h/w
+ * does not guarantee a particular curve of configured delay values vs.
+ * actual enforced b/w.
+ * Hence we need a pre-calibrated mapping so the user can express the
+ * memory b/w as a percentage value.
+ */
+static inline bool rdt_get_mb_table(struct rdt_resource *r)
+{
+ /*
+	 * There are currently no Intel SKUs that support non-linear delay.
+ */
+	pr_info("MBA b/w map not implemented for cpu:%d, model:%d\n",
+ boot_cpu_data.x86, boot_cpu_data.x86_model);
+
+ return false;
+}
+
+static __init bool __get_mem_config_intel(struct rdt_resource *r)
+{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+ union cpuid_0x10_3_eax eax;
+ union cpuid_0x10_x_edx edx;
+ u32 ebx, ecx, max_delay;
+
+ cpuid_count(0x00000010, 3, &eax.full, &ebx, &ecx, &edx.full);
+ hw_res->num_closid = edx.split.cos_max + 1;
+ max_delay = eax.split.max_delay + 1;
+ r->membw.max_bw = MAX_MBA_BW;
+ r->membw.arch_needs_linear = true;
+ if (ecx & MBA_IS_LINEAR) {
+ r->membw.delay_linear = true;
+ r->membw.min_bw = MAX_MBA_BW - max_delay;
+ r->membw.bw_gran = MAX_MBA_BW - max_delay;
+ } else {
+ if (!rdt_get_mb_table(r))
+ return false;
+ r->membw.arch_needs_linear = false;
+ }
+
+ if (boot_cpu_has(X86_FEATURE_PER_THREAD_MBA))
+ r->membw.throttle_mode = THREAD_THROTTLE_PER_THREAD;
+ else
+ r->membw.throttle_mode = THREAD_THROTTLE_MAX;
+
+ r->alloc_capable = true;
+
+ return true;
+}
+
+static __init bool __rdt_get_mem_config_amd(struct rdt_resource *r)
+{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+ u32 eax, ebx, ecx, edx, subleaf;
+
+ /*
+ * Query CPUID_Fn80000020_EDX_x01 for MBA and
+ * CPUID_Fn80000020_EDX_x02 for SMBA
+ */
+ subleaf = (r->rid == RDT_RESOURCE_SMBA) ? 2 : 1;
+
+ cpuid_count(0x80000020, subleaf, &eax, &ebx, &ecx, &edx);
+ hw_res->num_closid = edx + 1;
+ r->membw.max_bw = 1 << eax;
+
+ /* AMD does not use delay */
+ r->membw.delay_linear = false;
+ r->membw.arch_needs_linear = false;
+
+ /*
+ * AMD does not use memory delay throttle model to control
+ * the allocation like Intel does.
+ */
+ r->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED;
+ r->membw.min_bw = 0;
+ r->membw.bw_gran = 1;
+
+ r->alloc_capable = true;
+
+ return true;
+}
+
+static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r)
+{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+ union cpuid_0x10_1_eax eax;
+ union cpuid_0x10_x_ecx ecx;
+ union cpuid_0x10_x_edx edx;
+ u32 ebx, default_ctrl;
+
+ cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx.full, &edx.full);
+ hw_res->num_closid = edx.split.cos_max + 1;
+ r->cache.cbm_len = eax.split.cbm_len + 1;
+ default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1;
+ r->cache.shareable_bits = ebx & default_ctrl;
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ r->cache.arch_has_sparse_bitmasks = ecx.split.noncont;
+ r->alloc_capable = true;
+}
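+
+/*
+ * Worked example for rdt_get_cache_alloc_cfg() above (CPUID values are
+ * illustrative): if CPUID.0x10.1 returns EAX.cbm_len = 19 and
+ * EDX.cos_max = 15, then cbm_len = 20, num_closid = 16, the default control
+ * is the fully-set 20-bit mask 0xfffff, and shareable_bits is EBX masked by
+ * that value.
+ */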
+
+static void rdt_get_cdp_config(int level)
+{
+ /*
+ * By default, CDP is disabled. CDP can be enabled by mount parameter
+ * "cdp" during resctrl file system mount time.
+ */
+ rdt_resources_all[level].cdp_enabled = false;
+ rdt_resources_all[level].r_resctrl.cdp_capable = true;
+}
+
+static void rdt_set_io_alloc_capable(struct rdt_resource *r)
+{
+ r->cache.io_alloc_capable = true;
+}
+
+static void rdt_get_cdp_l3_config(void)
+{
+ rdt_get_cdp_config(RDT_RESOURCE_L3);
+}
+
+static void rdt_get_cdp_l2_config(void)
+{
+ rdt_get_cdp_config(RDT_RESOURCE_L2);
+}
+
+static void mba_wrmsr_amd(struct msr_param *m)
+{
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(m->dom);
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
+ unsigned int i;
+
+ for (i = m->low; i < m->high; i++)
+ wrmsrq(hw_res->msr_base + i, hw_dom->ctrl_val[i]);
+}
+
+/*
+ * Map the memory b/w percentage value to delay values
+ * that can be written to QOS_MSRs.
+ * There are currently no SKUs which support non-linear delay values.
+ */
+static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r)
+{
+ if (r->membw.delay_linear)
+ return MAX_MBA_BW - bw;
+
+ pr_warn_once("Non Linear delay-bw map not supported but queried\n");
+ return MAX_MBA_BW;
+}
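+
+/*
+ * Illustrative example (assuming MAX_MBA_BW is 100, i.e. percent): on a
+ * linear delay scale a schemata value of 70 (70% bandwidth) is converted
+ * to a delay value of 100 - 70 = 30 before mba_wrmsr_intel() writes it to
+ * the throttle MSR.
+ */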
+
+static void mba_wrmsr_intel(struct msr_param *m)
+{
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(m->dom);
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
+ unsigned int i;
+
+ /* Write the delay values for mba. */
+ for (i = m->low; i < m->high; i++)
+ wrmsrq(hw_res->msr_base + i, delay_bw_map(hw_dom->ctrl_val[i], m->res));
+}
+
+static void cat_wrmsr(struct msr_param *m)
+{
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(m->dom);
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
+ unsigned int i;
+
+ for (i = m->low; i < m->high; i++)
+ wrmsrq(hw_res->msr_base + i, hw_dom->ctrl_val[i]);
+}
+
+u32 resctrl_arch_get_num_closid(struct rdt_resource *r)
+{
+ return resctrl_to_arch_res(r)->num_closid;
+}
+
+void rdt_ctrl_update(void *arg)
+{
+ struct rdt_hw_resource *hw_res;
+ struct msr_param *m = arg;
+
+ hw_res = resctrl_to_arch_res(m->res);
+ hw_res->msr_update(m);
+}
+
+static void setup_default_ctrlval(struct rdt_resource *r, u32 *dc)
+{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+ int i;
+
+ /*
+ * Initialize the Control MSRs to having no control.
+ * For Cache Allocation: Set all bits in cbm
+ * For Memory Allocation: Set b/w requested to 100%
+ */
+ for (i = 0; i < hw_res->num_closid; i++, dc++)
+ *dc = resctrl_get_default_ctrl(r);
+}
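+
+/*
+ * Illustrative values (what resctrl_get_default_ctrl() returns depends on
+ * the schema format): an L3 CAT resource with a 20-bit CBM has every CLOSID
+ * initialized to 0xfffff, while an MBA resource has every CLOSID initialized
+ * to 100% bandwidth, i.e. no throttling.
+ */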
+
+static void ctrl_domain_free(struct rdt_hw_ctrl_domain *hw_dom)
+{
+ kfree(hw_dom->ctrl_val);
+ kfree(hw_dom);
+}
+
+static void mon_domain_free(struct rdt_hw_mon_domain *hw_dom)
+{
+ int idx;
+
+ for_each_mbm_idx(idx)
+ kfree(hw_dom->arch_mbm_states[idx]);
+ kfree(hw_dom);
+}
+
+static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_ctrl_domain *d)
+{
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d);
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+ struct msr_param m;
+ u32 *dc;
+
+ dc = kmalloc_array(hw_res->num_closid, sizeof(*hw_dom->ctrl_val),
+ GFP_KERNEL);
+ if (!dc)
+ return -ENOMEM;
+
+ hw_dom->ctrl_val = dc;
+ setup_default_ctrlval(r, dc);
+
+ m.res = r;
+ m.dom = d;
+ m.low = 0;
+ m.high = hw_res->num_closid;
+ hw_res->msr_update(&m);
+ return 0;
+}
+
+/**
+ * arch_domain_mbm_alloc() - Allocate arch private storage for the MBM counters
+ * @num_rmid: The size of the MBM counter array
+ * @hw_dom: The domain that owns the allocated arrays
+ */
+static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_mon_domain *hw_dom)
+{
+ size_t tsize = sizeof(*hw_dom->arch_mbm_states[0]);
+ enum resctrl_event_id eventid;
+ int idx;
+
+ for_each_mbm_event_id(eventid) {
+ if (!resctrl_is_mon_event_enabled(eventid))
+ continue;
+ idx = MBM_STATE_IDX(eventid);
+ hw_dom->arch_mbm_states[idx] = kcalloc(num_rmid, tsize, GFP_KERNEL);
+ if (!hw_dom->arch_mbm_states[idx])
+ goto cleanup;
+ }
+
+ return 0;
+cleanup:
+ for_each_mbm_idx(idx) {
+ kfree(hw_dom->arch_mbm_states[idx]);
+ hw_dom->arch_mbm_states[idx] = NULL;
+ }
+
+ return -ENOMEM;
+}
+
+static int get_domain_id_from_scope(int cpu, enum resctrl_scope scope)
+{
+ switch (scope) {
+ case RESCTRL_L2_CACHE:
+ case RESCTRL_L3_CACHE:
+ return get_cpu_cacheinfo_id(cpu, scope);
+ case RESCTRL_L3_NODE:
+ return cpu_to_node(cpu);
+ default:
+ break;
+ }
+
+ return -EINVAL;
+}
+
+static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r)
+{
+ int id = get_domain_id_from_scope(cpu, r->ctrl_scope);
+ struct rdt_hw_ctrl_domain *hw_dom;
+ struct list_head *add_pos = NULL;
+ struct rdt_domain_hdr *hdr;
+ struct rdt_ctrl_domain *d;
+ int err;
+
+ lockdep_assert_held(&domain_list_lock);
+
+ if (id < 0) {
+ pr_warn_once("Can't find control domain id for CPU:%d scope:%d for resource %s\n",
+ cpu, r->ctrl_scope, r->name);
+ return;
+ }
+
+ hdr = resctrl_find_domain(&r->ctrl_domains, id, &add_pos);
+ if (hdr) {
+ if (WARN_ON_ONCE(hdr->type != RESCTRL_CTRL_DOMAIN))
+ return;
+ d = container_of(hdr, struct rdt_ctrl_domain, hdr);
+
+ cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
+ if (r->cache.arch_has_per_cpu_cfg)
+ rdt_domain_reconfigure_cdp(r);
+ return;
+ }
+
+ hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu));
+ if (!hw_dom)
+ return;
+
+ d = &hw_dom->d_resctrl;
+ d->hdr.id = id;
+ d->hdr.type = RESCTRL_CTRL_DOMAIN;
+ cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
+
+ rdt_domain_reconfigure_cdp(r);
+
+ if (domain_setup_ctrlval(r, d)) {
+ ctrl_domain_free(hw_dom);
+ return;
+ }
+
+ list_add_tail_rcu(&d->hdr.list, add_pos);
+
+ err = resctrl_online_ctrl_domain(r, d);
+ if (err) {
+ list_del_rcu(&d->hdr.list);
+ synchronize_rcu();
+ ctrl_domain_free(hw_dom);
+ }
+}
+
+static void domain_add_cpu_mon(int cpu, struct rdt_resource *r)
+{
+ int id = get_domain_id_from_scope(cpu, r->mon_scope);
+ struct list_head *add_pos = NULL;
+ struct rdt_hw_mon_domain *hw_dom;
+ struct rdt_domain_hdr *hdr;
+ struct rdt_mon_domain *d;
+ struct cacheinfo *ci;
+ int err;
+
+ lockdep_assert_held(&domain_list_lock);
+
+ if (id < 0) {
+ pr_warn_once("Can't find monitor domain id for CPU:%d scope:%d for resource %s\n",
+ cpu, r->mon_scope, r->name);
+ return;
+ }
+
+ hdr = resctrl_find_domain(&r->mon_domains, id, &add_pos);
+ if (hdr) {
+ if (WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN))
+ return;
+ d = container_of(hdr, struct rdt_mon_domain, hdr);
+
+ cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
+ /* Update the mbm_assign_mode state for the CPU if supported */
+ if (r->mon.mbm_cntr_assignable)
+ resctrl_arch_mbm_cntr_assign_set_one(r);
+ return;
+ }
+
+ hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu));
+ if (!hw_dom)
+ return;
+
+ d = &hw_dom->d_resctrl;
+ d->hdr.id = id;
+ d->hdr.type = RESCTRL_MON_DOMAIN;
+ ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
+ if (!ci) {
+ pr_warn_once("Can't find L3 cache for CPU:%d resource %s\n", cpu, r->name);
+ mon_domain_free(hw_dom);
+ return;
+ }
+ d->ci_id = ci->id;
+ cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
+
+ /* Update the mbm_assign_mode state for the CPU if supported */
+ if (r->mon.mbm_cntr_assignable)
+ resctrl_arch_mbm_cntr_assign_set_one(r);
+
+ arch_mon_domain_online(r, d);
+
+ if (arch_domain_mbm_alloc(r->mon.num_rmid, hw_dom)) {
+ mon_domain_free(hw_dom);
+ return;
+ }
+
+ list_add_tail_rcu(&d->hdr.list, add_pos);
+
+ err = resctrl_online_mon_domain(r, d);
+ if (err) {
+ list_del_rcu(&d->hdr.list);
+ synchronize_rcu();
+ mon_domain_free(hw_dom);
+ }
+}
+
+static void domain_add_cpu(int cpu, struct rdt_resource *r)
+{
+ if (r->alloc_capable)
+ domain_add_cpu_ctrl(cpu, r);
+ if (r->mon_capable)
+ domain_add_cpu_mon(cpu, r);
+}
+
+static void domain_remove_cpu_ctrl(int cpu, struct rdt_resource *r)
+{
+ int id = get_domain_id_from_scope(cpu, r->ctrl_scope);
+ struct rdt_hw_ctrl_domain *hw_dom;
+ struct rdt_domain_hdr *hdr;
+ struct rdt_ctrl_domain *d;
+
+ lockdep_assert_held(&domain_list_lock);
+
+ if (id < 0) {
+ pr_warn_once("Can't find control domain id for CPU:%d scope:%d for resource %s\n",
+ cpu, r->ctrl_scope, r->name);
+ return;
+ }
+
+ hdr = resctrl_find_domain(&r->ctrl_domains, id, NULL);
+ if (!hdr) {
+ pr_warn("Can't find control domain for id=%d for CPU %d for resource %s\n",
+ id, cpu, r->name);
+ return;
+ }
+
+ if (WARN_ON_ONCE(hdr->type != RESCTRL_CTRL_DOMAIN))
+ return;
+
+ d = container_of(hdr, struct rdt_ctrl_domain, hdr);
+ hw_dom = resctrl_to_arch_ctrl_dom(d);
+
+ cpumask_clear_cpu(cpu, &d->hdr.cpu_mask);
+ if (cpumask_empty(&d->hdr.cpu_mask)) {
+ resctrl_offline_ctrl_domain(r, d);
+ list_del_rcu(&d->hdr.list);
+ synchronize_rcu();
+
+ /*
+ * rdt_ctrl_domain "d" is going to be freed below, so clear
+ * its pointer from pseudo_lock_region struct.
+ */
+ if (d->plr)
+ d->plr->d = NULL;
+ ctrl_domain_free(hw_dom);
+
+ return;
+ }
+}
+
+static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r)
+{
+ int id = get_domain_id_from_scope(cpu, r->mon_scope);
+ struct rdt_hw_mon_domain *hw_dom;
+ struct rdt_domain_hdr *hdr;
+ struct rdt_mon_domain *d;
+
+ lockdep_assert_held(&domain_list_lock);
+
+ if (id < 0) {
+ pr_warn_once("Can't find monitor domain id for CPU:%d scope:%d for resource %s\n",
+ cpu, r->mon_scope, r->name);
+ return;
+ }
+
+ hdr = resctrl_find_domain(&r->mon_domains, id, NULL);
+ if (!hdr) {
+ pr_warn("Can't find monitor domain for id=%d for CPU %d for resource %s\n",
+ id, cpu, r->name);
+ return;
+ }
+
+ if (WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN))
+ return;
+
+ d = container_of(hdr, struct rdt_mon_domain, hdr);
+ hw_dom = resctrl_to_arch_mon_dom(d);
+
+ cpumask_clear_cpu(cpu, &d->hdr.cpu_mask);
+ if (cpumask_empty(&d->hdr.cpu_mask)) {
+ resctrl_offline_mon_domain(r, d);
+ list_del_rcu(&d->hdr.list);
+ synchronize_rcu();
+ mon_domain_free(hw_dom);
+
+ return;
+ }
+}
+
+static void domain_remove_cpu(int cpu, struct rdt_resource *r)
+{
+ if (r->alloc_capable)
+ domain_remove_cpu_ctrl(cpu, r);
+ if (r->mon_capable)
+ domain_remove_cpu_mon(cpu, r);
+}
+
+static void clear_closid_rmid(int cpu)
+{
+ struct resctrl_pqr_state *state = this_cpu_ptr(&pqr_state);
+
+ state->default_closid = RESCTRL_RESERVED_CLOSID;
+ state->default_rmid = RESCTRL_RESERVED_RMID;
+ state->cur_closid = RESCTRL_RESERVED_CLOSID;
+ state->cur_rmid = RESCTRL_RESERVED_RMID;
+ wrmsr(MSR_IA32_PQR_ASSOC, RESCTRL_RESERVED_RMID,
+ RESCTRL_RESERVED_CLOSID);
+}
+
+static int resctrl_arch_online_cpu(unsigned int cpu)
+{
+ struct rdt_resource *r;
+
+ mutex_lock(&domain_list_lock);
+ for_each_capable_rdt_resource(r)
+ domain_add_cpu(cpu, r);
+ mutex_unlock(&domain_list_lock);
+
+ clear_closid_rmid(cpu);
+ resctrl_online_cpu(cpu);
+
+ return 0;
+}
+
+static int resctrl_arch_offline_cpu(unsigned int cpu)
+{
+ struct rdt_resource *r;
+
+ resctrl_offline_cpu(cpu);
+
+ mutex_lock(&domain_list_lock);
+ for_each_capable_rdt_resource(r)
+ domain_remove_cpu(cpu, r);
+ mutex_unlock(&domain_list_lock);
+
+ clear_closid_rmid(cpu);
+
+ return 0;
+}
+
+enum {
+ RDT_FLAG_CMT,
+ RDT_FLAG_MBM_TOTAL,
+ RDT_FLAG_MBM_LOCAL,
+ RDT_FLAG_L3_CAT,
+ RDT_FLAG_L3_CDP,
+ RDT_FLAG_L2_CAT,
+ RDT_FLAG_L2_CDP,
+ RDT_FLAG_MBA,
+ RDT_FLAG_SMBA,
+ RDT_FLAG_BMEC,
+ RDT_FLAG_ABMC,
+ RDT_FLAG_SDCIAE,
+};
+
+#define RDT_OPT(idx, n, f) \
+[idx] = { \
+ .name = n, \
+ .flag = f \
+}
+
+struct rdt_options {
+ char *name;
+ int flag;
+ bool force_off, force_on;
+};
+
+static struct rdt_options rdt_options[] __ro_after_init = {
+ RDT_OPT(RDT_FLAG_CMT, "cmt", X86_FEATURE_CQM_OCCUP_LLC),
+ RDT_OPT(RDT_FLAG_MBM_TOTAL, "mbmtotal", X86_FEATURE_CQM_MBM_TOTAL),
+ RDT_OPT(RDT_FLAG_MBM_LOCAL, "mbmlocal", X86_FEATURE_CQM_MBM_LOCAL),
+ RDT_OPT(RDT_FLAG_L3_CAT, "l3cat", X86_FEATURE_CAT_L3),
+ RDT_OPT(RDT_FLAG_L3_CDP, "l3cdp", X86_FEATURE_CDP_L3),
+ RDT_OPT(RDT_FLAG_L2_CAT, "l2cat", X86_FEATURE_CAT_L2),
+ RDT_OPT(RDT_FLAG_L2_CDP, "l2cdp", X86_FEATURE_CDP_L2),
+ RDT_OPT(RDT_FLAG_MBA, "mba", X86_FEATURE_MBA),
+ RDT_OPT(RDT_FLAG_SMBA, "smba", X86_FEATURE_SMBA),
+ RDT_OPT(RDT_FLAG_BMEC, "bmec", X86_FEATURE_BMEC),
+ RDT_OPT(RDT_FLAG_ABMC, "abmc", X86_FEATURE_ABMC),
+ RDT_OPT(RDT_FLAG_SDCIAE, "sdciae", X86_FEATURE_SDCIAE),
+};
+#define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options)
+
+static int __init set_rdt_options(char *str)
+{
+ struct rdt_options *o;
+ bool force_off;
+ char *tok;
+
+ if (*str == '=')
+ str++;
+ while ((tok = strsep(&str, ",")) != NULL) {
+ force_off = *tok == '!';
+ if (force_off)
+ tok++;
+ for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) {
+ if (strcmp(tok, o->name) == 0) {
+ if (force_off)
+ o->force_off = true;
+ else
+ o->force_on = true;
+ break;
+ }
+ }
+ }
+ return 1;
+}
+__setup("rdt", set_rdt_options);
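+
+/*
+ * Example use of the "rdt=" boot parameter parsed above (illustrative):
+ * booting with "rdt=!l3cat,!mbmlocal" forces L3 CAT and local MBM off even
+ * when CPUID enumerates them, while a name without the '!' prefix (e.g.
+ * "rdt=cmt") sets force_on, which rdt_cpu_has() only honors for features
+ * the CPU actually reports.
+ */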
+
+bool rdt_cpu_has(int flag)
+{
+ bool ret = boot_cpu_has(flag);
+ struct rdt_options *o;
+
+ if (!ret)
+ return ret;
+
+ for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) {
+ if (flag == o->flag) {
+ if (o->force_off)
+ ret = false;
+ if (o->force_on)
+ ret = true;
+ break;
+ }
+ }
+ return ret;
+}
+
+bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt)
+{
+ if (!rdt_cpu_has(X86_FEATURE_BMEC))
+ return false;
+
+ switch (evt) {
+ case QOS_L3_MBM_TOTAL_EVENT_ID:
+ return rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL);
+ case QOS_L3_MBM_LOCAL_EVENT_ID:
+ return rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL);
+ default:
+ return false;
+ }
+}
+
+static __init bool get_mem_config(void)
+{
+ struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_MBA];
+
+ if (!rdt_cpu_has(X86_FEATURE_MBA))
+ return false;
+
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ return __get_mem_config_intel(&hw_res->r_resctrl);
+ else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ return __rdt_get_mem_config_amd(&hw_res->r_resctrl);
+
+ return false;
+}
+
+static __init bool get_slow_mem_config(void)
+{
+ struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_SMBA];
+
+ if (!rdt_cpu_has(X86_FEATURE_SMBA))
+ return false;
+
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ return __rdt_get_mem_config_amd(&hw_res->r_resctrl);
+
+ return false;
+}
+
+static __init bool get_rdt_alloc_resources(void)
+{
+ struct rdt_resource *r;
+ bool ret = false;
+
+ if (rdt_alloc_capable)
+ return true;
+
+ if (!boot_cpu_has(X86_FEATURE_RDT_A))
+ return false;
+
+ if (rdt_cpu_has(X86_FEATURE_CAT_L3)) {
+ r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+ rdt_get_cache_alloc_cfg(1, r);
+ if (rdt_cpu_has(X86_FEATURE_CDP_L3))
+ rdt_get_cdp_l3_config();
+ if (rdt_cpu_has(X86_FEATURE_SDCIAE))
+ rdt_set_io_alloc_capable(r);
+ ret = true;
+ }
+ if (rdt_cpu_has(X86_FEATURE_CAT_L2)) {
+		/* CPUID 0x10.2 fields are the same format as 0x10.1 */
+ r = &rdt_resources_all[RDT_RESOURCE_L2].r_resctrl;
+ rdt_get_cache_alloc_cfg(2, r);
+ if (rdt_cpu_has(X86_FEATURE_CDP_L2))
+ rdt_get_cdp_l2_config();
+ ret = true;
+ }
+
+ if (get_mem_config())
+ ret = true;
+
+ if (get_slow_mem_config())
+ ret = true;
+
+ return ret;
+}
+
+static __init bool get_rdt_mon_resources(void)
+{
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+ bool ret = false;
+
+ if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) {
+ resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID);
+ ret = true;
+ }
+ if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) {
+ resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID);
+ ret = true;
+ }
+ if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) {
+ resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID);
+ ret = true;
+ }
+ if (rdt_cpu_has(X86_FEATURE_ABMC))
+ ret = true;
+
+ if (!ret)
+ return false;
+
+ return !rdt_get_mon_l3_config(r);
+}
+
+static __init void __check_quirks_intel(void)
+{
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_HASWELL_X:
+ if (!rdt_options[RDT_FLAG_L3_CAT].force_off)
+ cache_alloc_hsw_probe();
+ break;
+ case INTEL_SKYLAKE_X:
+ if (boot_cpu_data.x86_stepping <= 4)
+ set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat");
+ else
+ set_rdt_options("!l3cat");
+ fallthrough;
+ case INTEL_BROADWELL_X:
+ intel_rdt_mbm_apply_quirk();
+ break;
+ }
+}
+
+static __init void check_quirks(void)
+{
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ __check_quirks_intel();
+}
+
+static __init bool get_rdt_resources(void)
+{
+ rdt_alloc_capable = get_rdt_alloc_resources();
+ rdt_mon_capable = get_rdt_mon_resources();
+
+ return (rdt_mon_capable || rdt_alloc_capable);
+}
+
+static __init void rdt_init_res_defs_intel(void)
+{
+ struct rdt_hw_resource *hw_res;
+ struct rdt_resource *r;
+
+ for_each_rdt_resource(r) {
+ hw_res = resctrl_to_arch_res(r);
+
+ if (r->rid == RDT_RESOURCE_L3 ||
+ r->rid == RDT_RESOURCE_L2) {
+ r->cache.arch_has_per_cpu_cfg = false;
+ r->cache.min_cbm_bits = 1;
+ } else if (r->rid == RDT_RESOURCE_MBA) {
+ hw_res->msr_base = MSR_IA32_MBA_THRTL_BASE;
+ hw_res->msr_update = mba_wrmsr_intel;
+ }
+ }
+}
+
+static __init void rdt_init_res_defs_amd(void)
+{
+ struct rdt_hw_resource *hw_res;
+ struct rdt_resource *r;
+
+ for_each_rdt_resource(r) {
+ hw_res = resctrl_to_arch_res(r);
+
+ if (r->rid == RDT_RESOURCE_L3 ||
+ r->rid == RDT_RESOURCE_L2) {
+ r->cache.arch_has_sparse_bitmasks = true;
+ r->cache.arch_has_per_cpu_cfg = true;
+ r->cache.min_cbm_bits = 0;
+ } else if (r->rid == RDT_RESOURCE_MBA) {
+ hw_res->msr_base = MSR_IA32_MBA_BW_BASE;
+ hw_res->msr_update = mba_wrmsr_amd;
+ } else if (r->rid == RDT_RESOURCE_SMBA) {
+ hw_res->msr_base = MSR_IA32_SMBA_BW_BASE;
+ hw_res->msr_update = mba_wrmsr_amd;
+ }
+ }
+}
+
+static __init void rdt_init_res_defs(void)
+{
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ rdt_init_res_defs_intel();
+ else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ rdt_init_res_defs_amd();
+}
+
+static enum cpuhp_state rdt_online;
+
+/* Runs once on the BSP during boot. */
+void resctrl_cpu_detect(struct cpuinfo_x86 *c)
+{
+ if (!cpu_has(c, X86_FEATURE_CQM_LLC) && !cpu_has(c, X86_FEATURE_ABMC)) {
+ c->x86_cache_max_rmid = -1;
+ c->x86_cache_occ_scale = -1;
+ c->x86_cache_mbm_width_offset = -1;
+ return;
+ }
+
+ /* will be overridden if occupancy monitoring exists */
+ c->x86_cache_max_rmid = cpuid_ebx(0xf);
+
+ if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) ||
+ cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) ||
+ cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL) ||
+ cpu_has(c, X86_FEATURE_ABMC)) {
+ u32 eax, ebx, ecx, edx;
+
+ /* QoS sub-leaf, EAX=0Fh, ECX=1 */
+ cpuid_count(0xf, 1, &eax, &ebx, &ecx, &edx);
+
+ c->x86_cache_max_rmid = ecx;
+ c->x86_cache_occ_scale = ebx;
+ c->x86_cache_mbm_width_offset = eax & 0xff;
+
+ if (c->x86_vendor == X86_VENDOR_AMD && !c->x86_cache_mbm_width_offset)
+ c->x86_cache_mbm_width_offset = MBM_CNTR_WIDTH_OFFSET_AMD;
+ }
+}
+
+static int __init resctrl_arch_late_init(void)
+{
+ struct rdt_resource *r;
+ int state, ret, i;
+
+ /* for_each_rdt_resource() requires all rid to be initialised. */
+ for (i = 0; i < RDT_NUM_RESOURCES; i++)
+ rdt_resources_all[i].r_resctrl.rid = i;
+
+ /*
+	 * Initialize functions (or definitions) that differ
+ * between vendors here.
+ */
+ rdt_init_res_defs();
+
+ check_quirks();
+
+ if (!get_rdt_resources())
+ return -ENODEV;
+
+ state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
+ "x86/resctrl/cat:online:",
+ resctrl_arch_online_cpu,
+ resctrl_arch_offline_cpu);
+ if (state < 0)
+ return state;
+
+ ret = resctrl_init();
+ if (ret) {
+ cpuhp_remove_state(state);
+ return ret;
+ }
+ rdt_online = state;
+
+ for_each_alloc_capable_rdt_resource(r)
+ pr_info("%s allocation detected\n", r->name);
+
+ for_each_mon_capable_rdt_resource(r)
+ pr_info("%s monitoring detected\n", r->name);
+
+ return 0;
+}
+
+late_initcall(resctrl_arch_late_init);
+
+static void __exit resctrl_arch_exit(void)
+{
+ cpuhp_remove_state(rdt_online);
+
+ resctrl_exit();
+}
+
+__exitcall(resctrl_arch_exit);
diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
new file mode 100644
index 000000000000..b20e705606b8
--- /dev/null
+++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
@@ -0,0 +1,133 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Resource Director Technology(RDT)
+ * - Cache Allocation code.
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Authors:
+ * Fenghua Yu <fenghua.yu@intel.com>
+ * Tony Luck <tony.luck@intel.com>
+ *
+ * More information about RDT can be found in the Intel (R) x86 Architecture
+ * Software Developer Manual June 2016, volume 3, section 17.17.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cpu.h>
+
+#include "internal.h"
+
+int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d,
+ u32 closid, enum resctrl_conf_type t, u32 cfg_val)
+{
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d);
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+ u32 idx = resctrl_get_config_index(closid, t);
+ struct msr_param msr_param;
+
+ if (!cpumask_test_cpu(smp_processor_id(), &d->hdr.cpu_mask))
+ return -EINVAL;
+
+ hw_dom->ctrl_val[idx] = cfg_val;
+
+ msr_param.res = r;
+ msr_param.dom = d;
+ msr_param.low = idx;
+ msr_param.high = idx + 1;
+ hw_res->msr_update(&msr_param);
+
+ return 0;
+}
+
+int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid)
+{
+ struct resctrl_staged_config *cfg;
+ struct rdt_hw_ctrl_domain *hw_dom;
+ struct msr_param msr_param;
+ struct rdt_ctrl_domain *d;
+ enum resctrl_conf_type t;
+ u32 idx;
+
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
+ hw_dom = resctrl_to_arch_ctrl_dom(d);
+ msr_param.res = NULL;
+ for (t = 0; t < CDP_NUM_TYPES; t++) {
+ cfg = &hw_dom->d_resctrl.staged_config[t];
+ if (!cfg->have_new_ctrl)
+ continue;
+
+ idx = resctrl_get_config_index(closid, t);
+ if (cfg->new_ctrl == hw_dom->ctrl_val[idx])
+ continue;
+ hw_dom->ctrl_val[idx] = cfg->new_ctrl;
+
+ if (!msr_param.res) {
+ msr_param.low = idx;
+ msr_param.high = msr_param.low + 1;
+ msr_param.res = r;
+ msr_param.dom = d;
+ } else {
+ msr_param.low = min(msr_param.low, idx);
+ msr_param.high = max(msr_param.high, idx + 1);
+ }
+ }
+ if (msr_param.res)
+ smp_call_function_any(&d->hdr.cpu_mask, rdt_ctrl_update, &msr_param, 1);
+ }
+
+ return 0;
+}
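+
+/*
+ * Note on the msr_param range built above: staged updates for one domain are
+ * coalesced into a single [low, high) index range. For example, if updates
+ * land on indices 3 and 7, the IPI rewrites MSRs for indices 3..7 even though
+ * some are unchanged, trading a few redundant MSR writes for a single
+ * smp_call_function_any() per domain.
+ */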
+
+u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d,
+ u32 closid, enum resctrl_conf_type type)
+{
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d);
+ u32 idx = resctrl_get_config_index(closid, type);
+
+ return hw_dom->ctrl_val[idx];
+}
+
+bool resctrl_arch_get_io_alloc_enabled(struct rdt_resource *r)
+{
+ return resctrl_to_arch_res(r)->sdciae_enabled;
+}
+
+static void resctrl_sdciae_set_one_amd(void *arg)
+{
+ bool *enable = arg;
+
+ if (*enable)
+ msr_set_bit(MSR_IA32_L3_QOS_EXT_CFG, SDCIAE_ENABLE_BIT);
+ else
+ msr_clear_bit(MSR_IA32_L3_QOS_EXT_CFG, SDCIAE_ENABLE_BIT);
+}
+
+static void _resctrl_sdciae_enable(struct rdt_resource *r, bool enable)
+{
+ struct rdt_ctrl_domain *d;
+
+ /* Walking r->ctrl_domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
+ /* Update MSR_IA32_L3_QOS_EXT_CFG MSR on all the CPUs in all domains */
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list)
+ on_each_cpu_mask(&d->hdr.cpu_mask, resctrl_sdciae_set_one_amd, &enable, 1);
+}
+
+int resctrl_arch_io_alloc_enable(struct rdt_resource *r, bool enable)
+{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+
+ if (hw_res->r_resctrl.cache.io_alloc_capable &&
+ hw_res->sdciae_enabled != enable) {
+ _resctrl_sdciae_enable(r, enable);
+ hw_res->sdciae_enabled = enable;
+ }
+
+ return 0;
+}
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
new file mode 100644
index 000000000000..4a916c84a322
--- /dev/null
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -0,0 +1,225 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_RESCTRL_INTERNAL_H
+#define _ASM_X86_RESCTRL_INTERNAL_H
+
+#include <linux/resctrl.h>
+
+#define L3_QOS_CDP_ENABLE 0x01ULL
+
+#define L2_QOS_CDP_ENABLE 0x01ULL
+
+#define MBM_CNTR_WIDTH_BASE 24
+
+#define MBA_IS_LINEAR 0x4
+
+#define MBM_CNTR_WIDTH_OFFSET_AMD 20
+
+#define RMID_VAL_ERROR BIT_ULL(63)
+
+#define RMID_VAL_UNAVAIL BIT_ULL(62)
+
+/*
+ * With the above fields in use 62 bits remain in MSR_IA32_QM_CTR for
+ * data to be returned. The counter width is discovered from the hardware
+ * as an offset from MBM_CNTR_WIDTH_BASE.
+ */
+#define MBM_CNTR_WIDTH_OFFSET_MAX (62 - MBM_CNTR_WIDTH_BASE)
+
+/**
+ * struct arch_mbm_state - values used to compute resctrl_arch_rmid_read()s
+ * return value.
+ * @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes)
+ * @prev_msr: Value of IA32_QM_CTR last time it was read for the RMID used to
+ * find this struct.
+ */
+struct arch_mbm_state {
+ u64 chunks;
+ u64 prev_msr;
+};
+
+/* Setting bit 0 in L3_QOS_EXT_CFG enables the ABMC feature. */
+#define ABMC_ENABLE_BIT 0
+
+/*
+ * QoS event identifiers.
+ */
+#define ABMC_EXTENDED_EVT_ID BIT(31)
+#define ABMC_EVT_ID BIT(0)
+
+/* Setting bit 1 in MSR_IA32_L3_QOS_EXT_CFG enables the SDCIAE feature. */
+#define SDCIAE_ENABLE_BIT 1
+
+/**
+ * struct rdt_hw_ctrl_domain - Arch private attributes of a set of CPUs that share
+ * a resource for a control function
+ * @d_resctrl: Properties exposed to the resctrl file system
+ * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID)
+ *
+ * Members of this structure are accessed via helpers that provide abstraction.
+ */
+struct rdt_hw_ctrl_domain {
+ struct rdt_ctrl_domain d_resctrl;
+ u32 *ctrl_val;
+};
+
+/**
+ * struct rdt_hw_mon_domain - Arch private attributes of a set of CPUs that share
+ * a resource for a monitor function
+ * @d_resctrl: Properties exposed to the resctrl file system
+ * @arch_mbm_states: Per-event pointer to the MBM event's saved state.
+ * An MBM event's state is an array of struct arch_mbm_state
+ * indexed by RMID on x86.
+ *
+ * Members of this structure are accessed via helpers that provide abstraction.
+ */
+struct rdt_hw_mon_domain {
+ struct rdt_mon_domain d_resctrl;
+ struct arch_mbm_state *arch_mbm_states[QOS_NUM_L3_MBM_EVENTS];
+};
+
+static inline struct rdt_hw_ctrl_domain *resctrl_to_arch_ctrl_dom(struct rdt_ctrl_domain *r)
+{
+ return container_of(r, struct rdt_hw_ctrl_domain, d_resctrl);
+}
+
+static inline struct rdt_hw_mon_domain *resctrl_to_arch_mon_dom(struct rdt_mon_domain *r)
+{
+ return container_of(r, struct rdt_hw_mon_domain, d_resctrl);
+}
+
+/**
+ * struct msr_param - set a range of MSRs from a domain
+ * @res: The resource to use
+ * @dom: The domain to update
+ * @low: Beginning index from base MSR
+ * @high: End index
+ */
+struct msr_param {
+ struct rdt_resource *res;
+ struct rdt_ctrl_domain *dom;
+ u32 low;
+ u32 high;
+};
+
+/**
+ * struct rdt_hw_resource - arch private attributes of a resctrl resource
+ * @r_resctrl: Attributes of the resource used directly by resctrl.
+ * @num_closid: Maximum number of closid this hardware can support,
+ * regardless of CDP. This is exposed via
+ * resctrl_arch_get_num_closid() to avoid confusion
+ * with struct resctrl_schema's property of the same name,
+ * which has been corrected for features like CDP.
+ * @msr_base: Base MSR address for CBMs
+ * @msr_update: Function pointer to update QOS MSRs
+ * @mon_scale: cqm counter * mon_scale = occupancy in bytes
+ * @mbm_width: Monitor width, to detect and correct for overflow.
+ * @cdp_enabled: CDP state of this resource
+ * @mbm_cntr_assign_enabled: ABMC feature is enabled
+ * @sdciae_enabled: SDCIAE feature (backing "io_alloc") is enabled.
+ *
+ * Members of this structure are either private to the architecture
+ * e.g. mbm_width, or accessed via helpers that provide abstraction. e.g.
+ * msr_update and msr_base.
+ */
+struct rdt_hw_resource {
+ struct rdt_resource r_resctrl;
+ u32 num_closid;
+ unsigned int msr_base;
+ void (*msr_update)(struct msr_param *m);
+ unsigned int mon_scale;
+ unsigned int mbm_width;
+ bool cdp_enabled;
+ bool mbm_cntr_assign_enabled;
+ bool sdciae_enabled;
+};
+
+static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r)
+{
+ return container_of(r, struct rdt_hw_resource, r_resctrl);
+}
+
+extern struct rdt_hw_resource rdt_resources_all[];
+
+void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d);
+
+/* CPUID.(EAX=10H, ECX=ResID=1).EAX */
+union cpuid_0x10_1_eax {
+ struct {
+ unsigned int cbm_len:5;
+ } split;
+ unsigned int full;
+};
+
+/* CPUID.(EAX=10H, ECX=ResID=3).EAX */
+union cpuid_0x10_3_eax {
+ struct {
+ unsigned int max_delay:12;
+ } split;
+ unsigned int full;
+};
+
+/* CPUID.(EAX=10H, ECX=ResID).ECX */
+union cpuid_0x10_x_ecx {
+ struct {
+ unsigned int reserved:3;
+ unsigned int noncont:1;
+ } split;
+ unsigned int full;
+};
+
+/* CPUID.(EAX=10H, ECX=ResID).EDX */
+union cpuid_0x10_x_edx {
+ struct {
+ unsigned int cos_max:16;
+ } split;
+ unsigned int full;
+};
+
+/*
+ * ABMC counters are configured by writing to MSR_IA32_L3_QOS_ABMC_CFG.
+ *
+ * @bw_type : Event configuration that represents the memory
+ * transactions being tracked by the @cntr_id.
+ * @bw_src : Bandwidth source (RMID or CLOSID).
+ * @reserved1 : Reserved.
+ * @is_clos : @bw_src field is a CLOSID (not an RMID).
+ * @cntr_id : Counter identifier.
+ * @reserved : Reserved.
+ * @cntr_en : Counting enable bit.
+ * @cfg_en : Configuration enable bit.
+ *
+ * Configuration and counting:
+ * Counter can be configured across multiple writes to MSR. Configuration
+ * is applied only when @cfg_en = 1. Counter @cntr_id is reset when the
+ * configuration is applied.
+ * @cfg_en = 1, @cntr_en = 0 : Apply @cntr_id configuration but do not
+ * count events.
+ * @cfg_en = 1, @cntr_en = 1 : Apply @cntr_id configuration and start
+ * counting events.
+ */
+union l3_qos_abmc_cfg {
+ struct {
+ unsigned long bw_type :32,
+ bw_src :12,
+ reserved1: 3,
+ is_clos : 1,
+ cntr_id : 5,
+ reserved : 9,
+ cntr_en : 1,
+ cfg_en : 1;
+ } split;
+ unsigned long full;
+};
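+
+/*
+ * Illustrative sketch of how this union is used (see
+ * resctrl_arch_config_cntr() in monitor.c; the counter and RMID numbers are
+ * made up):
+ *
+ *	union l3_qos_abmc_cfg cfg = { 0 };
+ *
+ *	cfg.split.cfg_en  = 1;			(apply this configuration)
+ *	cfg.split.cntr_en = 1;			(and start counting)
+ *	cfg.split.cntr_id = 3;			(counter to (re)configure)
+ *	cfg.split.bw_src  = 10;			(RMID whose traffic is counted)
+ *	cfg.split.bw_type = resctrl_get_mon_evt_cfg(evtid);
+ *	wrmsrl(MSR_IA32_L3_QOS_ABMC_CFG, cfg.full);
+ */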
+
+void rdt_ctrl_update(void *arg);
+
+int rdt_get_mon_l3_config(struct rdt_resource *r);
+
+bool rdt_cpu_has(int flag);
+
+void __init intel_rdt_mbm_apply_quirk(void);
+
+void rdt_domain_reconfigure_cdp(struct rdt_resource *r);
+void resctrl_arch_mbm_cntr_assign_set_one(struct rdt_resource *r);
+
+#endif /* _ASM_X86_RESCTRL_INTERNAL_H */
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
new file mode 100644
index 000000000000..dffcc8307500
--- /dev/null
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -0,0 +1,583 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Resource Director Technology(RDT)
+ * - Monitoring code
+ *
+ * Copyright (C) 2017 Intel Corporation
+ *
+ * Author:
+ * Vikas Shivappa <vikas.shivappa@intel.com>
+ *
+ * This replaces the perf-based cqm.c, but we reuse a lot of
+ * code and data structures originally from Peter Zijlstra and Matt Fleming.
+ *
+ * More information about RDT can be found in the Intel (R) x86 Architecture
+ * Software Developer Manual June 2016, volume 3, section 17.17.
+ */
+
+#define pr_fmt(fmt) "resctrl: " fmt
+
+#include <linux/cpu.h>
+#include <linux/resctrl.h>
+
+#include <asm/cpu_device_id.h>
+#include <asm/msr.h>
+
+#include "internal.h"
+
+/*
+ * Global boolean for rdt_monitor which is true if any
+ * resource monitoring is enabled.
+ */
+bool rdt_mon_capable;
+
+#define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5))
+
+static int snc_nodes_per_l3_cache = 1;
+
+/*
+ * The correction factor table is documented in Documentation/filesystems/resctrl.rst.
+ * If rmid > rmid threshold, MBM total and local values should be multiplied
+ * by the correction factor.
+ *
+ * The original table has been modified to make the code simpler:
+ *
+ * 1. The threshold 0 is changed to rmid count - 1 so that no correction
+ *    is done for that case.
+ * 2. The MBM total and local correction table is indexed by a core count
+ *    equal to (x86_cache_max_rmid + 1) / 8 - 1, which ranges from 0 up to 27.
+ * 3. The correction factor is normalized to 2^20 (1048576) so it's faster
+ * to calculate corrected value by shifting:
+ * corrected_value = (original_value * correction_factor) >> 20
+ */
+static const struct mbm_correction_factor_table {
+ u32 rmidthreshold;
+ u64 cf;
+} mbm_cf_table[] __initconst = {
+ {7, CF(1.000000)},
+ {15, CF(1.000000)},
+ {15, CF(0.969650)},
+ {31, CF(1.000000)},
+ {31, CF(1.066667)},
+ {31, CF(0.969650)},
+ {47, CF(1.142857)},
+ {63, CF(1.000000)},
+ {63, CF(1.185115)},
+ {63, CF(1.066553)},
+ {79, CF(1.454545)},
+ {95, CF(1.000000)},
+ {95, CF(1.230769)},
+ {95, CF(1.142857)},
+ {95, CF(1.066667)},
+ {127, CF(1.000000)},
+ {127, CF(1.254863)},
+ {127, CF(1.185255)},
+ {151, CF(1.000000)},
+ {127, CF(1.066667)},
+ {167, CF(1.000000)},
+ {159, CF(1.454334)},
+ {183, CF(1.000000)},
+ {127, CF(0.969744)},
+ {191, CF(1.280246)},
+ {191, CF(1.230921)},
+ {215, CF(1.000000)},
+ {191, CF(1.143118)},
+};
+
+static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX;
+
+static u64 mbm_cf __read_mostly;
+
+static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val)
+{
+ /* Correct MBM value. */
+ if (rmid > mbm_cf_rmidthreshold)
+ val = (val * mbm_cf) >> 20;
+
+ return val;
+}
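+
+/*
+ * For example (factor chosen for illustration): CF(1.142857) stores roughly
+ * 1.142857 * 2^20, so a raw chunk count c above the RMID threshold is
+ * corrected to (c * cf) >> 20, i.e. about 1.142857 * c.
+ */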
+
+/*
+ * When Sub-NUMA Cluster (SNC) mode is not enabled (as indicated by
+ * "snc_nodes_per_l3_cache == 1") no translation of the RMID value is
+ * needed. The physical RMID is the same as the logical RMID.
+ *
+ * On a platform with SNC mode enabled, Linux enables RMID sharing mode
+ * via MSR 0xCA0 (see the "RMID Sharing Mode" section in the "Intel
+ * Resource Director Technology Architecture Specification" for a full
+ * description of RMID sharing mode).
+ *
+ * In RMID sharing mode there are fewer "logical RMID" values available
+ * to accumulate data ("physical RMIDs" are divided evenly between SNC
+ * nodes that share an L3 cache). Linux creates an rdt_mon_domain for
+ * each SNC node.
+ *
+ * The value loaded into IA32_PQR_ASSOC is the "logical RMID".
+ *
+ * Data is collected independently on each SNC node and can be retrieved
+ * using the "physical RMID" value computed by this function and loaded
+ * into IA32_QM_EVTSEL. @cpu can be any CPU in the SNC node.
+ *
+ * The scope of the IA32_QM_EVTSEL and IA32_QM_CTR MSRs is at the L3
+ * cache. So a "physical RMID" may be read from any CPU that shares
+ * the L3 cache with the desired SNC node, not just from a CPU in
+ * the specific SNC node.
+ */
+static int logical_rmid_to_physical_rmid(int cpu, int lrmid)
+{
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+
+ if (snc_nodes_per_l3_cache == 1)
+ return lrmid;
+
+ return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->mon.num_rmid;
+}
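+
+/*
+ * Worked example (hypothetical SNC-2 system with num_rmid = 128 per node):
+ * logical RMID 5 read from a CPU whose NUMA node number is odd maps to
+ * physical RMID 5 + 1 * 128 = 133, while the same logical RMID read from a
+ * CPU on an even-numbered node stays at 5.
+ */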
+
+static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val)
+{
+ u64 msr_val;
+
+ /*
+ * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
+ * with a valid event code for supported resource type and the bits
+ * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID,
+ * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
+ * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
+ * are error bits.
+ */
+ wrmsr(MSR_IA32_QM_EVTSEL, eventid, prmid);
+ rdmsrq(MSR_IA32_QM_CTR, msr_val);
+
+ if (msr_val & RMID_VAL_ERROR)
+ return -EIO;
+ if (msr_val & RMID_VAL_UNAVAIL)
+ return -EINVAL;
+
+ *val = msr_val;
+ return 0;
+}
+
+static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_dom,
+ u32 rmid,
+ enum resctrl_event_id eventid)
+{
+ struct arch_mbm_state *state;
+
+ if (!resctrl_is_mbm_event(eventid))
+ return NULL;
+
+ state = hw_dom->arch_mbm_states[MBM_STATE_IDX(eventid)];
+
+ return state ? &state[rmid] : NULL;
+}
+
+void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d,
+ u32 unused, u32 rmid,
+ enum resctrl_event_id eventid)
+{
+ struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
+ int cpu = cpumask_any(&d->hdr.cpu_mask);
+ struct arch_mbm_state *am;
+ u32 prmid;
+
+ am = get_arch_mbm_state(hw_dom, rmid, eventid);
+ if (am) {
+ memset(am, 0, sizeof(*am));
+
+ prmid = logical_rmid_to_physical_rmid(cpu, rmid);
+ /* Record any initial, non-zero count value. */
+ __rmid_read_phys(prmid, eventid, &am->prev_msr);
+ }
+}
+
+/*
+ * Assumes that hardware counters are also reset and thus that there is
+ * no need to record initial non-zero counts.
+ */
+void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d)
+{
+ struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
+ enum resctrl_event_id eventid;
+ int idx;
+
+ for_each_mbm_event_id(eventid) {
+ if (!resctrl_is_mon_event_enabled(eventid))
+ continue;
+ idx = MBM_STATE_IDX(eventid);
+ memset(hw_dom->arch_mbm_states[idx], 0,
+ sizeof(*hw_dom->arch_mbm_states[0]) * r->mon.num_rmid);
+ }
+}
+
+static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
+{
+ u64 shift = 64 - width, chunks;
+
+ chunks = (cur_msr << shift) - (prev_msr << shift);
+ return chunks >> shift;
+}
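+
+/*
+ * Example of the wrap-around handling above for a 24-bit wide counter
+ * (width = 24, shift = 40): prev_msr = 0xfffff0 and cur_msr = 0x000010
+ * yields chunks = (0x10 - 0xfffff0) mod 2^24 = 0x20, i.e. 32 chunks moved
+ * across the counter wrap.
+ */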
+
+static u64 get_corrected_val(struct rdt_resource *r, struct rdt_mon_domain *d,
+ u32 rmid, enum resctrl_event_id eventid, u64 msr_val)
+{
+ struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+ struct arch_mbm_state *am;
+ u64 chunks;
+
+ am = get_arch_mbm_state(hw_dom, rmid, eventid);
+ if (am) {
+ am->chunks += mbm_overflow_count(am->prev_msr, msr_val,
+ hw_res->mbm_width);
+ chunks = get_corrected_mbm_count(rmid, am->chunks);
+ am->prev_msr = msr_val;
+ } else {
+ chunks = msr_val;
+ }
+
+ return chunks * hw_res->mon_scale;
+}
+
+int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d,
+ u32 unused, u32 rmid, enum resctrl_event_id eventid,
+ u64 *val, void *ignored)
+{
+ struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
+ int cpu = cpumask_any(&d->hdr.cpu_mask);
+ struct arch_mbm_state *am;
+ u64 msr_val;
+ u32 prmid;
+ int ret;
+
+ resctrl_arch_rmid_read_context_check();
+
+ prmid = logical_rmid_to_physical_rmid(cpu, rmid);
+ ret = __rmid_read_phys(prmid, eventid, &msr_val);
+
+ if (!ret) {
+ *val = get_corrected_val(r, d, rmid, eventid, msr_val);
+ } else if (ret == -EINVAL) {
+ am = get_arch_mbm_state(hw_dom, rmid, eventid);
+ if (am)
+ am->prev_msr = 0;
+ }
+
+ return ret;
+}
+
+static int __cntr_id_read(u32 cntr_id, u64 *val)
+{
+ u64 msr_val;
+
+ /*
+ * QM_EVTSEL Register definition:
+ * =======================================================
+ * Bits Mnemonic Description
+ * =======================================================
+ * 63:44 -- Reserved
+ * 43:32 RMID RMID or counter ID in ABMC mode
+ * when reading an MBM event
+ * 31 ExtendedEvtID Extended Event Identifier
+ * 30:8 -- Reserved
+ * 7:0 EvtID Event Identifier
+ * =======================================================
+ * The contents of a specific counter can be read by setting the
+ * following fields in QM_EVTSEL.ExtendedEvtID(=1) and
+ * QM_EVTSEL.EvtID = L3CacheABMC (=1) and setting QM_EVTSEL.RMID
+ * to the desired counter ID. Reading the QM_CTR then returns the
+ * contents of the specified counter. The RMID_VAL_ERROR bit is set
+ * if the counter configuration is invalid, or if an invalid counter
+ * ID is set in the QM_EVTSEL.RMID field. The RMID_VAL_UNAVAIL bit
+ * is set if the counter data is unavailable.
+ */
+ wrmsr(MSR_IA32_QM_EVTSEL, ABMC_EXTENDED_EVT_ID | ABMC_EVT_ID, cntr_id);
+ rdmsrl(MSR_IA32_QM_CTR, msr_val);
+
+ if (msr_val & RMID_VAL_ERROR)
+ return -EIO;
+ if (msr_val & RMID_VAL_UNAVAIL)
+ return -EINVAL;
+
+ *val = msr_val;
+ return 0;
+}
+
+void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d,
+ u32 unused, u32 rmid, int cntr_id,
+ enum resctrl_event_id eventid)
+{
+ struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
+ struct arch_mbm_state *am;
+
+ am = get_arch_mbm_state(hw_dom, rmid, eventid);
+ if (am) {
+ memset(am, 0, sizeof(*am));
+
+ /* Record any initial, non-zero count value. */
+ __cntr_id_read(cntr_id, &am->prev_msr);
+ }
+}
+
+int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d,
+ u32 unused, u32 rmid, int cntr_id,
+ enum resctrl_event_id eventid, u64 *val)
+{
+ u64 msr_val;
+ int ret;
+
+ ret = __cntr_id_read(cntr_id, &msr_val);
+ if (ret)
+ return ret;
+
+ *val = get_corrected_val(r, d, rmid, eventid, msr_val);
+
+ return 0;
+}
+
+/*
+ * The power-on reset value of MSR_RMID_SNC_CONFIG is 0x1
+ * which indicates that RMIDs are configured in legacy mode.
+ * This mode is incompatible with Linux resctrl semantics
+ * as RMIDs are partitioned between SNC nodes, which requires
+ * a user to know which RMID is allocated to a task.
+ * Clearing bit 0 reconfigures the RMID counters for use
+ * in RMID sharing mode. This mode is better for Linux.
+ * The RMID space is divided between all SNC nodes with the
+ * RMIDs renumbered to start from zero in each node when
+ * counting operations from tasks. Code to read the counters
+ * must adjust RMID counter numbers based on SNC node. See
+ * logical_rmid_to_physical_rmid() for code that does this.
+ */
+void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d)
+{
+ if (snc_nodes_per_l3_cache > 1)
+ msr_clear_bit(MSR_RMID_SNC_CONFIG, 0);
+}
+
+/* CPU models that support MSR_RMID_SNC_CONFIG */
+static const struct x86_cpu_id snc_cpu_ids[] __initconst = {
+ X86_MATCH_VFM(INTEL_ICELAKE_X, 0),
+ X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, 0),
+ X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, 0),
+ X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, 0),
+ X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, 0),
+ X86_MATCH_VFM(INTEL_ATOM_DARKMONT_X, 0),
+ {}
+};
+
+/*
+ * There isn't a simple hardware bit that indicates whether a CPU is running
+ * in Sub-NUMA Cluster (SNC) mode. Infer the state by comparing the
+ * number of CPUs sharing the L3 cache with CPU0 to the number of CPUs in
+ * the same NUMA node as CPU0.
+ * It is not possible to accurately determine SNC state if the system is
+ * booted with a maxcpus=N parameter. That distorts the ratio of SNC nodes
+ * to L3 caches. It will be OK if the system is booted with hyperthreading
+ * disabled (since this doesn't affect the ratio).
+ */
+static __init int snc_get_config(void)
+{
+ struct cacheinfo *ci = get_cpu_cacheinfo_level(0, RESCTRL_L3_CACHE);
+ const cpumask_t *node0_cpumask;
+ int cpus_per_node, cpus_per_l3;
+ int ret;
+
+ if (!x86_match_cpu(snc_cpu_ids) || !ci)
+ return 1;
+
+ cpus_read_lock();
+ if (num_online_cpus() != num_present_cpus())
+ pr_warn("Some CPUs offline, SNC detection may be incorrect\n");
+ cpus_read_unlock();
+
+ node0_cpumask = cpumask_of_node(cpu_to_node(0));
+
+ cpus_per_node = cpumask_weight(node0_cpumask);
+ cpus_per_l3 = cpumask_weight(&ci->shared_cpu_map);
+
+ if (!cpus_per_node || !cpus_per_l3)
+ return 1;
+
+ ret = cpus_per_l3 / cpus_per_node;
+
+ /* sanity check: Only valid results are 1, 2, 3, 4, 6 */
+ switch (ret) {
+ case 1:
+ break;
+ case 2 ... 4:
+ case 6:
+ pr_info("Sub-NUMA Cluster mode detected with %d nodes per L3 cache\n", ret);
+ rdt_resources_all[RDT_RESOURCE_L3].r_resctrl.mon_scope = RESCTRL_L3_NODE;
+ break;
+ default:
+ pr_warn("Ignore improbable SNC node count %d\n", ret);
+ ret = 1;
+ break;
+ }
+
+ return ret;
+}
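+
+/*
+ * Example of the inference above (hypothetical topology): if CPU0's L3 cache
+ * is shared by 96 CPUs but CPU0's NUMA node contains only 48 of them, then
+ * cpus_per_l3 / cpus_per_node = 2 and the system is treated as SNC-2, with
+ * the L3 monitoring scope switched to RESCTRL_L3_NODE.
+ */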
+
+int __init rdt_get_mon_l3_config(struct rdt_resource *r)
+{
+ unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset;
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+ unsigned int threshold;
+ u32 eax, ebx, ecx, edx;
+
+ snc_nodes_per_l3_cache = snc_get_config();
+
+ resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024;
+ hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale / snc_nodes_per_l3_cache;
+ r->mon.num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache;
+ hw_res->mbm_width = MBM_CNTR_WIDTH_BASE;
+
+ if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX)
+ hw_res->mbm_width += mbm_offset;
+ else if (mbm_offset > MBM_CNTR_WIDTH_OFFSET_MAX)
+ pr_warn("Ignoring impossible MBM counter offset\n");
+
+ /*
+ * A reasonable upper limit on the max threshold is the number
+ * of lines tagged per RMID if all RMIDs have the same number of
+ * lines tagged in the LLC.
+ *
+ * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
+ */
+ threshold = resctrl_rmid_realloc_limit / r->mon.num_rmid;
+
+ /*
+ * Because num_rmid may not be a power of two, round the value
+ * to the nearest multiple of hw_res->mon_scale so it matches a
+ * value the hardware will measure. mon_scale may not be a power of 2.
+ */
+ resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold);
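+
+	/*
+	 * Continuing the 35MB/56-RMID example above: 35 * 1024 * 1024 / 56 is
+	 * 655360 bytes per RMID, which the rounding turns into a multiple of
+	 * mon_scale.
+	 */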
+
+ if (rdt_cpu_has(X86_FEATURE_BMEC) || rdt_cpu_has(X86_FEATURE_ABMC)) {
+ /* Detect list of bandwidth sources that can be tracked */
+ cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx);
+ r->mon.mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS;
+ }
+
+ /*
+ * resctrl assumes a system that supports assignable counters can
+ * switch to "default" mode. Ensure that there is a "default" mode
+ * to switch to. This enforces a dependency between the independent
+ * X86_FEATURE_ABMC and X86_FEATURE_CQM_MBM_TOTAL/X86_FEATURE_CQM_MBM_LOCAL
+ * hardware features.
+ */
+ if (rdt_cpu_has(X86_FEATURE_ABMC) &&
+ (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL) ||
+ rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL))) {
+ r->mon.mbm_cntr_assignable = true;
+ cpuid_count(0x80000020, 5, &eax, &ebx, &ecx, &edx);
+ r->mon.num_mbm_cntrs = (ebx & GENMASK(15, 0)) + 1;
+ hw_res->mbm_cntr_assign_enabled = true;
+ }
+
+ r->mon_capable = true;
+
+ return 0;
+}
+
+void __init intel_rdt_mbm_apply_quirk(void)
+{
+ int cf_index;
+
+ cf_index = (boot_cpu_data.x86_cache_max_rmid + 1) / 8 - 1;
+ if (cf_index >= ARRAY_SIZE(mbm_cf_table)) {
+ pr_info("No MBM correction factor available\n");
+ return;
+ }
+
+ mbm_cf_rmidthreshold = mbm_cf_table[cf_index].rmidthreshold;
+ mbm_cf = mbm_cf_table[cf_index].cf;
+}
+
+static void resctrl_abmc_set_one_amd(void *arg)
+{
+ bool *enable = arg;
+
+ if (*enable)
+ msr_set_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT);
+ else
+ msr_clear_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT);
+}
+
+/*
+ * ABMC enable/disable requires update of L3_QOS_EXT_CFG MSR on all the CPUs
+ * associated with all monitor domains.
+ */
+static void _resctrl_abmc_enable(struct rdt_resource *r, bool enable)
+{
+ struct rdt_mon_domain *d;
+
+ lockdep_assert_cpus_held();
+
+ list_for_each_entry(d, &r->mon_domains, hdr.list) {
+ on_each_cpu_mask(&d->hdr.cpu_mask, resctrl_abmc_set_one_amd,
+ &enable, 1);
+ resctrl_arch_reset_rmid_all(r, d);
+ }
+}
+
+int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable)
+{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+
+ if (r->mon.mbm_cntr_assignable &&
+ hw_res->mbm_cntr_assign_enabled != enable) {
+ _resctrl_abmc_enable(r, enable);
+ hw_res->mbm_cntr_assign_enabled = enable;
+ }
+
+ return 0;
+}
+
+bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r)
+{
+ return resctrl_to_arch_res(r)->mbm_cntr_assign_enabled;
+}
+
+static void resctrl_abmc_config_one_amd(void *info)
+{
+ union l3_qos_abmc_cfg *abmc_cfg = info;
+
+ wrmsrl(MSR_IA32_L3_QOS_ABMC_CFG, abmc_cfg->full);
+}
+
+/*
+ * Send an IPI to the domain to assign the counter to RMID, event pair.
+ */
+void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d,
+ enum resctrl_event_id evtid, u32 rmid, u32 closid,
+ u32 cntr_id, bool assign)
+{
+ struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
+ union l3_qos_abmc_cfg abmc_cfg = { 0 };
+ struct arch_mbm_state *am;
+
+ abmc_cfg.split.cfg_en = 1;
+ abmc_cfg.split.cntr_en = assign ? 1 : 0;
+ abmc_cfg.split.cntr_id = cntr_id;
+ abmc_cfg.split.bw_src = rmid;
+ if (assign)
+ abmc_cfg.split.bw_type = resctrl_get_mon_evt_cfg(evtid);
+
+ smp_call_function_any(&d->hdr.cpu_mask, resctrl_abmc_config_one_amd, &abmc_cfg, 1);
+
+ /*
+ * The hardware counter is reset (because cfg_en == 1) so there is no
+ * need to record initial non-zero counts.
+ */
+ am = get_arch_mbm_state(hw_dom, rmid, evtid);
+ if (am)
+ memset(am, 0, sizeof(*am));
+}
+
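+/* Apply the resource's cached counter-assignment state to the local CPU. */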
+void resctrl_arch_mbm_cntr_assign_set_one(struct rdt_resource *r)
+{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+
+ resctrl_abmc_set_one_amd(&hw_res->mbm_cntr_assign_enabled);
+}
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
new file mode 100644
index 000000000000..de580eca3363
--- /dev/null
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
@@ -0,0 +1,517 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Resource Director Technology (RDT)
+ *
+ * Pseudo-locking support built on top of Cache Allocation Technology (CAT)
+ *
+ * Copyright (C) 2018 Intel Corporation
+ *
+ * Author: Reinette Chatre <reinette.chatre@intel.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cacheflush.h>
+#include <linux/cpu.h>
+#include <linux/perf_event.h>
+#include <linux/pm_qos.h>
+#include <linux/resctrl.h>
+
+#include <asm/cpu_device_id.h>
+#include <asm/perf_event.h>
+#include <asm/msr.h>
+
+#include "../../events/perf_event.h" /* For X86_CONFIG() */
+#include "internal.h"
+
+#define CREATE_TRACE_POINTS
+
+#include "pseudo_lock_trace.h"
+
+/*
+ * The bits needed to disable hardware prefetching vary based on the
+ * platform. During initialization we will discover which bits to use.
+ */
+static u64 prefetch_disable_bits;
+
+/**
+ * resctrl_arch_get_prefetch_disable_bits - prefetch disable bits of supported
+ * platforms
+ * @void: It takes no parameters.
+ *
+ * Capture the list of platforms that have been validated to support
+ * pseudo-locking. This includes testing to ensure pseudo-locked regions
+ * with low cache miss rates can be created under a variety of load conditions,
+ * and that these pseudo-locked regions can maintain their low cache miss
+ * rates under a variety of load conditions for significant lengths of time.
+ *
+ * After a platform has been validated to support pseudo-locking its
+ * hardware prefetch disable bits are included here as they are documented
+ * in the SDM.
+ *
+ * When adding a platform here also add support for its cache events to
+ * resctrl_arch_measure_l*_residency()
+ *
+ * Return:
+ * If the platform is supported, the bits to disable hardware prefetchers;
+ * 0 if the platform is not supported.
+ */
+u64 resctrl_arch_get_prefetch_disable_bits(void)
+{
+ prefetch_disable_bits = 0;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+ boot_cpu_data.x86 != 6)
+ return 0;
+
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_BROADWELL_X:
+ /*
+ * SDM defines bits of MSR_MISC_FEATURE_CONTROL register
+ * as:
+ * 0 L2 Hardware Prefetcher Disable (R/W)
+ * 1 L2 Adjacent Cache Line Prefetcher Disable (R/W)
+ * 2 DCU Hardware Prefetcher Disable (R/W)
+ * 3 DCU IP Prefetcher Disable (R/W)
+ * 63:4 Reserved
+ */
+ prefetch_disable_bits = 0xF;
+ break;
+ case INTEL_ATOM_GOLDMONT:
+ case INTEL_ATOM_GOLDMONT_PLUS:
+ /*
+ * SDM defines bits of MSR_MISC_FEATURE_CONTROL register
+ * as:
+ * 0 L2 Hardware Prefetcher Disable (R/W)
+ * 1 Reserved
+ * 2 DCU Hardware Prefetcher Disable (R/W)
+ * 63:3 Reserved
+ */
+ prefetch_disable_bits = 0x5;
+ break;
+ }
+
+ return prefetch_disable_bits;
+}
+
+/**
+ * resctrl_arch_pseudo_lock_fn - Load kernel memory into cache
+ * @_plr: the pseudo-lock region descriptor
+ *
+ * This is the core pseudo-locking flow.
+ *
+ * First we ensure that the kernel memory cannot be found in the cache.
+ * Then, while taking care that there will be as little interference as
+ * possible, the memory to be loaded is accessed while the core is running
+ * with its class of service set to the bitmask of the pseudo-locked region.
+ * After this is complete no future CAT allocations will be allowed to
+ * overlap with this bitmask.
+ *
+ * Local register variables are utilized to ensure that the memory region
+ * to be locked is the only memory access made during the critical locking
+ * loop.
+ *
+ * Return: 0. Waiter on waitqueue will be woken on completion.
+ */
+int resctrl_arch_pseudo_lock_fn(void *_plr)
+{
+ struct pseudo_lock_region *plr = _plr;
+ u32 rmid_p, closid_p;
+ unsigned long i;
+ u64 saved_msr;
+#ifdef CONFIG_KASAN
+ /*
+ * The registers used for local register variables are also used
+ * when KASAN is active. When KASAN is active we use a regular
+ * variable to ensure we always use a valid pointer, but the cost
+ * is that this variable will enter the cache through evicting the
+ * memory we are trying to lock into the cache. Thus expect lower
+ * pseudo-locking success rate when KASAN is active.
+ */
+ unsigned int line_size;
+ unsigned int size;
+ void *mem_r;
+#else
+ register unsigned int line_size asm("esi");
+ register unsigned int size asm("edi");
+ register void *mem_r asm(_ASM_BX);
+#endif /* CONFIG_KASAN */
+
+ /*
+	 * Make sure none of the allocated memory is cached. If it is, we
+	 * will get a cache hit in the loop below from outside of the
+	 * pseudo-locked region.
+	 * wbinvd (as opposed to clflush/clflushopt) is required to
+	 * increase the likelihood that the allocated cache portion will be
+	 * filled with the associated memory.
+ */
+ wbinvd();
+
+ /*
+	 * Always called with interrupts enabled. By disabling interrupts we
+	 * ensure that we will not be preempted during this critical section.
+ */
+ local_irq_disable();
+
+ /*
+ * Call wrmsr and rdmsr as directly as possible to avoid tracing
+ * clobbering local register variables or affecting cache accesses.
+ *
+ * Disable the hardware prefetcher so that when the end of the memory
+ * being pseudo-locked is reached the hardware will not read beyond
+ * the buffer and evict pseudo-locked memory read earlier from the
+ * cache.
+ */
+ saved_msr = native_rdmsrq(MSR_MISC_FEATURE_CONTROL);
+ native_wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits);
+ closid_p = this_cpu_read(pqr_state.cur_closid);
+ rmid_p = this_cpu_read(pqr_state.cur_rmid);
+ mem_r = plr->kmem;
+ size = plr->size;
+ line_size = plr->line_size;
+ /*
+ * Critical section begin: start by writing the closid associated
+ * with the capacity bitmask of the cache region being
+ * pseudo-locked followed by reading of kernel memory to load it
+ * into the cache.
+ */
+ native_wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, plr->closid);
+
+ /*
+ * Cache was flushed earlier. Now access kernel memory to read it
+ * into cache region associated with just activated plr->closid.
+ * Loop over data twice:
+ * - In first loop the cache region is shared with the page walker
+ * as it populates the paging structure caches (including TLB).
+ * - In the second loop the paging structure caches are used and
+ * cache region is populated with the memory being referenced.
+ */
+ for (i = 0; i < size; i += PAGE_SIZE) {
+ /*
+ * Add a barrier to prevent speculative execution of this
+ * loop reading beyond the end of the buffer.
+ */
+ rmb();
+ asm volatile("mov (%0,%1,1), %%eax\n\t"
+ :
+ : "r" (mem_r), "r" (i)
+ : "%eax", "memory");
+ }
+ for (i = 0; i < size; i += line_size) {
+ /*
+ * Add a barrier to prevent speculative execution of this
+ * loop reading beyond the end of the buffer.
+ */
+ rmb();
+ asm volatile("mov (%0,%1,1), %%eax\n\t"
+ :
+ : "r" (mem_r), "r" (i)
+ : "%eax", "memory");
+ }
+ /*
+ * Critical section end: restore closid with capacity bitmask that
+ * does not overlap with pseudo-locked region.
+ */
+ native_wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, closid_p);
+
+ /* Re-enable the hardware prefetcher(s) */
+ wrmsrq(MSR_MISC_FEATURE_CONTROL, saved_msr);
+ local_irq_enable();
+
+ plr->thread_done = 1;
+ wake_up_interruptible(&plr->lock_thread_wq);
+ return 0;
+}
+
+/**
+ * resctrl_arch_measure_cycles_lat_fn - Measure cycle latency to read
+ * pseudo-locked memory
+ * @_plr: pseudo-lock region to measure
+ *
+ * There is no deterministic way to test if a memory region is cached. One
+ * way is to measure how long it takes to read the memory: the access
+ * latency is a good indication of how close to the CPU the data was. Moreover,
+ * if the prefetcher is disabled and the memory is read at a stride
+ * of half the cache line, then a cache miss will be easy to spot since the
+ * read of the first half would be significantly slower than the read of
+ * the second half.
+ *
+ * Return: 0. Waiter on waitqueue will be woken on completion.
+ */
+int resctrl_arch_measure_cycles_lat_fn(void *_plr)
+{
+ struct pseudo_lock_region *plr = _plr;
+ u32 saved_low, saved_high;
+ unsigned long i;
+ u64 start, end;
+ void *mem_r;
+
+ local_irq_disable();
+ /*
+ * Disable hardware prefetchers.
+ */
+ rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
+ wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits);
+ mem_r = READ_ONCE(plr->kmem);
+ /*
+	 * Do a dummy run of the time measurement to load the needed
+ * instructions into the L1 instruction cache.
+ */
+ start = rdtsc_ordered();
+ for (i = 0; i < plr->size; i += 32) {
+ start = rdtsc_ordered();
+ asm volatile("mov (%0,%1,1), %%eax\n\t"
+ :
+ : "r" (mem_r), "r" (i)
+ : "%eax", "memory");
+ end = rdtsc_ordered();
+ trace_pseudo_lock_mem_latency((u32)(end - start));
+ }
+ wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
+ local_irq_enable();
+ plr->thread_done = 1;
+ wake_up_interruptible(&plr->lock_thread_wq);
+ return 0;
+}
+
+/*
+ * Create a perf_event_attr for the hit and miss perf events that will
+ * be used during the performance measurement. A perf_event maintains
+ * a pointer to its perf_event_attr so a unique attribute structure is
+ * created for each perf_event.
+ *
+ * The actual configuration of the event is set right before use in order
+ * to use the X86_CONFIG macro.
+ */
+static struct perf_event_attr perf_miss_attr = {
+ .type = PERF_TYPE_RAW,
+ .size = sizeof(struct perf_event_attr),
+ .pinned = 1,
+ .disabled = 0,
+ .exclude_user = 1,
+};
+
+static struct perf_event_attr perf_hit_attr = {
+ .type = PERF_TYPE_RAW,
+ .size = sizeof(struct perf_event_attr),
+ .pinned = 1,
+ .disabled = 0,
+ .exclude_user = 1,
+};
+
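+/*
+ * Counter values sampled before and after reading through the
+ * pseudo-locked region.
+ */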
+struct residency_counts {
+ u64 miss_before, hits_before;
+ u64 miss_after, hits_after;
+};
+
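+/*
+ * Measure cache hit and miss counts with the supplied perf events while
+ * reading through the pseudo-locked region. On any failure all counts
+ * remain zero.
+ */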
+static int measure_residency_fn(struct perf_event_attr *miss_attr,
+ struct perf_event_attr *hit_attr,
+ struct pseudo_lock_region *plr,
+ struct residency_counts *counts)
+{
+ u64 hits_before = 0, hits_after = 0, miss_before = 0, miss_after = 0;
+ struct perf_event *miss_event, *hit_event;
+ int hit_pmcnum, miss_pmcnum;
+ u32 saved_low, saved_high;
+ unsigned int line_size;
+ unsigned int size;
+ unsigned long i;
+ void *mem_r;
+ u64 tmp;
+
+ miss_event = perf_event_create_kernel_counter(miss_attr, plr->cpu,
+ NULL, NULL, NULL);
+ if (IS_ERR(miss_event))
+ goto out;
+
+ hit_event = perf_event_create_kernel_counter(hit_attr, plr->cpu,
+ NULL, NULL, NULL);
+ if (IS_ERR(hit_event))
+ goto out_miss;
+
+ local_irq_disable();
+ /*
+	 * Check for any error state of the events used by performing
+	 * one local read of each.
+ */
+ if (perf_event_read_local(miss_event, &tmp, NULL, NULL)) {
+ local_irq_enable();
+ goto out_hit;
+ }
+ if (perf_event_read_local(hit_event, &tmp, NULL, NULL)) {
+ local_irq_enable();
+ goto out_hit;
+ }
+
+ /*
+ * Disable hardware prefetchers.
+ */
+ rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
+ wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits);
+
+	/* Initialize the rest of the local variables */
+ /*
+ * Performance event has been validated right before this with
+ * interrupts disabled - it is thus safe to read the counter index.
+ */
+ miss_pmcnum = x86_perf_rdpmc_index(miss_event);
+ hit_pmcnum = x86_perf_rdpmc_index(hit_event);
+ line_size = READ_ONCE(plr->line_size);
+ mem_r = READ_ONCE(plr->kmem);
+ size = READ_ONCE(plr->size);
+
+ /*
+	 * Read the counters twice - the first read loads the instructions
+	 * used into the L1 cache, the second captures an accurate value that
+	 * does not include cache misses incurred because of the instruction
+	 * loads.
+ */
+ hits_before = rdpmc(hit_pmcnum);
+ miss_before = rdpmc(miss_pmcnum);
+ /*
+	 * From the SDM: back-to-back fast reads are not guaranteed
+	 * to be monotonic.
+ * Use LFENCE to ensure all previous instructions are retired
+ * before proceeding.
+ */
+ rmb();
+ hits_before = rdpmc(hit_pmcnum);
+ miss_before = rdpmc(miss_pmcnum);
+ /*
+ * Use LFENCE to ensure all previous instructions are retired
+ * before proceeding.
+ */
+ rmb();
+ for (i = 0; i < size; i += line_size) {
+ /*
+ * Add a barrier to prevent speculative execution of this
+ * loop reading beyond the end of the buffer.
+ */
+ rmb();
+ asm volatile("mov (%0,%1,1), %%eax\n\t"
+ :
+ : "r" (mem_r), "r" (i)
+ : "%eax", "memory");
+ }
+ /*
+ * Use LFENCE to ensure all previous instructions are retired
+ * before proceeding.
+ */
+ rmb();
+ hits_after = rdpmc(hit_pmcnum);
+ miss_after = rdpmc(miss_pmcnum);
+ /*
+ * Use LFENCE to ensure all previous instructions are retired
+ * before proceeding.
+ */
+ rmb();
+ /* Re-enable hardware prefetchers */
+ wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
+ local_irq_enable();
+out_hit:
+ perf_event_release_kernel(hit_event);
+out_miss:
+ perf_event_release_kernel(miss_event);
+out:
+ /*
+ * All counts will be zero on failure.
+ */
+ counts->miss_before = miss_before;
+ counts->hits_before = hits_before;
+ counts->miss_after = miss_after;
+ counts->hits_after = hits_after;
+ return 0;
+}
+
+int resctrl_arch_measure_l2_residency(void *_plr)
+{
+ struct pseudo_lock_region *plr = _plr;
+ struct residency_counts counts = {0};
+
+ /*
+ * Non-architectural event for the Goldmont Microarchitecture
+ * from Intel x86 Architecture Software Developer Manual (SDM):
+ * MEM_LOAD_UOPS_RETIRED D1H (event number)
+ * Umask values:
+ * L2_HIT 02H
+ * L2_MISS 10H
+ */
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_ATOM_GOLDMONT:
+ case INTEL_ATOM_GOLDMONT_PLUS:
+ perf_miss_attr.config = X86_CONFIG(.event = 0xd1,
+ .umask = 0x10);
+ perf_hit_attr.config = X86_CONFIG(.event = 0xd1,
+ .umask = 0x2);
+ break;
+ default:
+ goto out;
+ }
+
+ measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts);
+ /*
+	 * If a failure prevented the measurements from succeeding, the
+	 * tracepoints will still be written and all counts will be zero.
+ */
+ trace_pseudo_lock_l2(counts.hits_after - counts.hits_before,
+ counts.miss_after - counts.miss_before);
+out:
+ plr->thread_done = 1;
+ wake_up_interruptible(&plr->lock_thread_wq);
+ return 0;
+}
+
+int resctrl_arch_measure_l3_residency(void *_plr)
+{
+ struct pseudo_lock_region *plr = _plr;
+ struct residency_counts counts = {0};
+
+ /*
+ * On Broadwell Microarchitecture the MEM_LOAD_UOPS_RETIRED event
+ * has two "no fix" errata associated with it: BDM35 and BDM100. On
+ * this platform the following events are used instead:
+ * LONGEST_LAT_CACHE 2EH (Documented in SDM)
+ * REFERENCE 4FH
+ * MISS 41H
+ */
+
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_BROADWELL_X:
+ /* On BDW the hit event counts references, not hits */
+ perf_hit_attr.config = X86_CONFIG(.event = 0x2e,
+ .umask = 0x4f);
+ perf_miss_attr.config = X86_CONFIG(.event = 0x2e,
+ .umask = 0x41);
+ break;
+ default:
+ goto out;
+ }
+
+ measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts);
+ /*
+	 * If a failure prevented the measurements from succeeding, the
+	 * tracepoints will still be written and all counts will be zero.
+ */
+
+ counts.miss_after -= counts.miss_before;
+ if (boot_cpu_data.x86_vfm == INTEL_BROADWELL_X) {
+ /*
+		 * On BDW references and misses are counted; this needs adjustment.
+		 * Sometimes the "hits" counter is a bit more than the
+		 * references, for example, x references but x + 1 hits.
+		 * To avoid reporting invalid hit values in this case, treat
+		 * it as if the misses equal the references.
+ */
+ /* First compute the number of cache references measured */
+ counts.hits_after -= counts.hits_before;
+ /* Next convert references to cache hits */
+ counts.hits_after -= min(counts.miss_after, counts.hits_after);
+ } else {
+ counts.hits_after -= counts.hits_before;
+ }
+
+ trace_pseudo_lock_l3(counts.hits_after, counts.miss_after);
+out:
+ plr->thread_done = 1;
+ wake_up_interruptible(&plr->lock_thread_wq);
+ return 0;
+}
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock_trace.h b/arch/x86/kernel/cpu/resctrl/pseudo_lock_trace.h
new file mode 100644
index 000000000000..7c8aef08010f
--- /dev/null
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock_trace.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM resctrl
+
+#if !defined(_X86_RESCTRL_PSEUDO_LOCK_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _X86_RESCTRL_PSEUDO_LOCK_TRACE_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(pseudo_lock_mem_latency,
+ TP_PROTO(u32 latency),
+ TP_ARGS(latency),
+ TP_STRUCT__entry(__field(u32, latency)),
+ TP_fast_assign(__entry->latency = latency),
+ TP_printk("latency=%u", __entry->latency)
+ );
+
+TRACE_EVENT(pseudo_lock_l2,
+ TP_PROTO(u64 l2_hits, u64 l2_miss),
+ TP_ARGS(l2_hits, l2_miss),
+ TP_STRUCT__entry(__field(u64, l2_hits)
+ __field(u64, l2_miss)),
+ TP_fast_assign(__entry->l2_hits = l2_hits;
+ __entry->l2_miss = l2_miss;),
+ TP_printk("hits=%llu miss=%llu",
+ __entry->l2_hits, __entry->l2_miss));
+
+TRACE_EVENT(pseudo_lock_l3,
+ TP_PROTO(u64 l3_hits, u64 l3_miss),
+ TP_ARGS(l3_hits, l3_miss),
+ TP_STRUCT__entry(__field(u64, l3_hits)
+ __field(u64, l3_miss)),
+ TP_fast_assign(__entry->l3_hits = l3_hits;
+ __entry->l3_miss = l3_miss;),
+ TP_printk("hits=%llu miss=%llu",
+ __entry->l3_hits, __entry->l3_miss));
+
+#endif /* _X86_RESCTRL_PSEUDO_LOCK_TRACE_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+
+#define TRACE_INCLUDE_FILE pseudo_lock_trace
+
+#include <trace/define_trace.h>
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
new file mode 100644
index 000000000000..885026468440
--- /dev/null
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -0,0 +1,262 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * User interface for Resource Allocation in Resource Director Technology(RDT)
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Author: Fenghua Yu <fenghua.yu@intel.com>
+ *
+ * More information about RDT can be found in the Intel (R) x86 Architecture
+ * Software Developer Manual.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cpu.h>
+#include <linux/debugfs.h>
+#include <linux/fs.h>
+#include <linux/fs_parser.h>
+#include <linux/sysfs.h>
+#include <linux/kernfs.h>
+#include <linux/resctrl.h>
+#include <linux/seq_buf.h>
+#include <linux/seq_file.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
+#include <linux/slab.h>
+#include <linux/task_work.h>
+#include <linux/user_namespace.h>
+
+#include <uapi/linux/magic.h>
+
+#include <asm/msr.h>
+#include "internal.h"
+
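+/*
+ * Static keys used to keep the overhead of the resctrl context switch
+ * path to a minimum when resctrl is not in use.
+ */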
+DEFINE_STATIC_KEY_FALSE(rdt_enable_key);
+
+DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key);
+
+DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
+
+/*
+ * This is safe against resctrl_arch_sched_in() called from __switch_to()
+ * because __switch_to() is executed with interrupts disabled. A local call
+ * from update_closid_rmid() is protected against __switch_to() because
+ * preemption is disabled.
+ */
+void resctrl_arch_sync_cpu_closid_rmid(void *info)
+{
+ struct resctrl_cpu_defaults *r = info;
+
+ if (r) {
+ this_cpu_write(pqr_state.default_closid, r->closid);
+ this_cpu_write(pqr_state.default_rmid, r->rmid);
+ }
+
+ /*
+	 * We cannot unconditionally write the MSR because the currently
+	 * executing task might have its own closid selected. Just reuse
+ * the context switch code.
+ */
+ resctrl_arch_sched_in(current);
+}
+
+#define INVALID_CONFIG_INDEX UINT_MAX
+
+/**
+ * mon_event_config_index_get - get the hardware index for the
+ * configurable event
+ * @evtid: event id.
+ *
+ * Return: 0 for evtid == QOS_L3_MBM_TOTAL_EVENT_ID
+ * 1 for evtid == QOS_L3_MBM_LOCAL_EVENT_ID
+ * INVALID_CONFIG_INDEX for invalid evtid
+ */
+static inline unsigned int mon_event_config_index_get(u32 evtid)
+{
+ switch (evtid) {
+ case QOS_L3_MBM_TOTAL_EVENT_ID:
+ return 0;
+ case QOS_L3_MBM_LOCAL_EVENT_ID:
+ return 1;
+ default:
+ /* Should never reach here */
+ return INVALID_CONFIG_INDEX;
+ }
+}
+
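+/* Read the local CPU's configuration MSR for @_config_info->evtid. */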
+void resctrl_arch_mon_event_config_read(void *_config_info)
+{
+ struct resctrl_mon_config_info *config_info = _config_info;
+ unsigned int index;
+ u64 msrval;
+
+ index = mon_event_config_index_get(config_info->evtid);
+ if (index == INVALID_CONFIG_INDEX) {
+ pr_warn_once("Invalid event id %d\n", config_info->evtid);
+ return;
+ }
+ rdmsrq(MSR_IA32_EVT_CFG_BASE + index, msrval);
+
+ /* Report only the valid event configuration bits */
+ config_info->mon_config = msrval & MAX_EVT_CONFIG_BITS;
+}
+
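+/* Write the local CPU's configuration MSR for @_config_info->evtid. */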
+void resctrl_arch_mon_event_config_write(void *_config_info)
+{
+ struct resctrl_mon_config_info *config_info = _config_info;
+ unsigned int index;
+
+ index = mon_event_config_index_get(config_info->evtid);
+ if (index == INVALID_CONFIG_INDEX) {
+ pr_warn_once("Invalid event id %d\n", config_info->evtid);
+ return;
+ }
+ wrmsrq(MSR_IA32_EVT_CFG_BASE + index, config_info->mon_config);
+}
+
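+/* Enable or disable CDP in the L3 QOS_CFG MSR on the local CPU. */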
+static void l3_qos_cfg_update(void *arg)
+{
+ bool *enable = arg;
+
+ wrmsrq(MSR_IA32_L3_QOS_CFG, *enable ? L3_QOS_CDP_ENABLE : 0ULL);
+}
+
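+/* Enable or disable CDP in the L2 QOS_CFG MSR on the local CPU. */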
+static void l2_qos_cfg_update(void *arg)
+{
+ bool *enable = arg;
+
+ wrmsrq(MSR_IA32_L2_QOS_CFG, *enable ? L2_QOS_CDP_ENABLE : 0ULL);
+}
+
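+/*
+ * Propagate the CDP enable/disable setting to all ctrl domains of the
+ * resource at @level.
+ */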
+static int set_cache_qos_cfg(int level, bool enable)
+{
+ void (*update)(void *arg);
+ struct rdt_ctrl_domain *d;
+ struct rdt_resource *r_l;
+ cpumask_var_t cpu_mask;
+ int cpu;
+
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
+ if (level == RDT_RESOURCE_L3)
+ update = l3_qos_cfg_update;
+ else if (level == RDT_RESOURCE_L2)
+ update = l2_qos_cfg_update;
+ else
+ return -EINVAL;
+
+ if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ r_l = &rdt_resources_all[level].r_resctrl;
+ list_for_each_entry(d, &r_l->ctrl_domains, hdr.list) {
+ if (r_l->cache.arch_has_per_cpu_cfg)
+ /* Pick all the CPUs in the domain instance */
+ for_each_cpu(cpu, &d->hdr.cpu_mask)
+ cpumask_set_cpu(cpu, cpu_mask);
+ else
+ /* Pick one CPU from each domain instance to update MSR */
+ cpumask_set_cpu(cpumask_any(&d->hdr.cpu_mask), cpu_mask);
+ }
+
+ /* Update QOS_CFG MSR on all the CPUs in cpu_mask */
+ on_each_cpu_mask(cpu_mask, update, &enable, 1);
+
+ free_cpumask_var(cpu_mask);
+
+ return 0;
+}
+
+/* Restore the qos cfg state when a domain comes online */
+void rdt_domain_reconfigure_cdp(struct rdt_resource *r)
+{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+
+ if (!r->cdp_capable)
+ return;
+
+ if (r->rid == RDT_RESOURCE_L2)
+ l2_qos_cfg_update(&hw_res->cdp_enabled);
+
+ if (r->rid == RDT_RESOURCE_L3)
+ l3_qos_cfg_update(&hw_res->cdp_enabled);
+}
+
+static int cdp_enable(int level)
+{
+ struct rdt_resource *r_l = &rdt_resources_all[level].r_resctrl;
+ int ret;
+
+ if (!r_l->alloc_capable)
+ return -EINVAL;
+
+ ret = set_cache_qos_cfg(level, true);
+ if (!ret)
+ rdt_resources_all[level].cdp_enabled = true;
+
+ return ret;
+}
+
+static void cdp_disable(int level)
+{
+ struct rdt_hw_resource *r_hw = &rdt_resources_all[level];
+
+ if (r_hw->cdp_enabled) {
+ set_cache_qos_cfg(level, false);
+ r_hw->cdp_enabled = false;
+ }
+}
+
+int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable)
+{
+ struct rdt_hw_resource *hw_res = &rdt_resources_all[l];
+
+ if (!hw_res->r_resctrl.cdp_capable)
+ return -EINVAL;
+
+ if (enable)
+ return cdp_enable(l);
+
+ cdp_disable(l);
+
+ return 0;
+}
+
+bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l)
+{
+ return rdt_resources_all[l].cdp_enabled;
+}
+
+void resctrl_arch_reset_all_ctrls(struct rdt_resource *r)
+{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+ struct rdt_hw_ctrl_domain *hw_dom;
+ struct msr_param msr_param;
+ struct rdt_ctrl_domain *d;
+ int i;
+
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
+ msr_param.res = r;
+ msr_param.low = 0;
+ msr_param.high = hw_res->num_closid;
+
+ /*
+ * Disable resource control for this resource by setting all
+ * CBMs in all ctrl_domains to the maximum mask value. Pick one CPU
+ * from each domain to update the MSRs below.
+ */
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
+ hw_dom = resctrl_to_arch_ctrl_dom(d);
+
+ for (i = 0; i < hw_res->num_closid; i++)
+ hw_dom->ctrl_val[i] = resctrl_get_default_ctrl(r);
+ msr_param.dom = d;
+ smp_call_function_any(&d->hdr.cpu_mask, rdt_ctrl_update, &msr_param, 1);
+ }
+}