summaryrefslogtreecommitdiff
path: root/arch/x86/kernel/cpu
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel/cpu')
-rw-r--r--arch/x86/kernel/cpu/.gitignore1
-rw-r--r--arch/x86/kernel/cpu/Makefile73
-rw-r--r--arch/x86/kernel/cpu/acrn.c81
-rw-r--r--arch/x86/kernel/cpu/amd.c1376
-rw-r--r--arch/x86/kernel/cpu/amd_cache_disable.c301
-rw-r--r--arch/x86/kernel/cpu/aperfmperf.c552
-rw-r--r--arch/x86/kernel/cpu/bhyve.c66
-rw-r--r--arch/x86/kernel/cpu/bugs.c3750
-rw-r--r--arch/x86/kernel/cpu/bugs_64.c33
-rw-r--r--arch/x86/kernel/cpu/bus_lock.c433
-rw-r--r--arch/x86/kernel/cpu/cacheinfo.c820
-rw-r--r--arch/x86/kernel/cpu/centaur.c365
-rw-r--r--arch/x86/kernel/cpu/common.c2615
-rw-r--r--arch/x86/kernel/cpu/cpu.h80
-rw-r--r--arch/x86/kernel/cpu/cpuid-deps.c192
-rw-r--r--arch/x86/kernel/cpu/cpuid_0x2_table.c128
-rw-r--r--arch/x86/kernel/cpu/cyrix.c81
-rw-r--r--arch/x86/kernel/cpu/debugfs.c101
-rw-r--r--arch/x86/kernel/cpu/feat_ctl.c215
-rw-r--r--arch/x86/kernel/cpu/hygon.c279
-rw-r--r--arch/x86/kernel/cpu/hypervisor.c84
-rw-r--r--arch/x86/kernel/cpu/intel.c829
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c1248
-rw-r--r--arch/x86/kernel/cpu/intel_epb.c244
-rw-r--r--arch/x86/kernel/cpu/match.c97
-rw-r--r--arch/x86/kernel/cpu/mce/Makefile14
-rw-r--r--arch/x86/kernel/cpu/mce/amd.c1270
-rw-r--r--arch/x86/kernel/cpu/mce/apei.c265
-rw-r--r--arch/x86/kernel/cpu/mce/core.c2970
-rw-r--r--arch/x86/kernel/cpu/mce/dev-mcelog.c365
-rw-r--r--arch/x86/kernel/cpu/mce/genpool.c156
-rw-r--r--arch/x86/kernel/cpu/mce/inject.c805
-rw-r--r--arch/x86/kernel/cpu/mce/intel.c537
-rw-r--r--arch/x86/kernel/cpu/mce/internal.h352
-rw-r--r--arch/x86/kernel/cpu/mce/p5.c (renamed from arch/x86/kernel/cpu/mcheck/p5.c)35
-rw-r--r--arch/x86/kernel/cpu/mce/severity.c489
-rw-r--r--arch/x86/kernel/cpu/mce/threshold.c163
-rw-r--r--arch/x86/kernel/cpu/mce/winchip.c (renamed from arch/x86/kernel/cpu/mcheck/winchip.c)22
-rw-r--r--arch/x86/kernel/cpu/mcheck/Makefile11
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-apei.c150
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-inject.c256
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-internal.h63
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-severity.c283
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c2485
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c791
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c353
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c584
-rw-r--r--arch/x86/kernel/cpu/mcheck/threshold.c41
-rw-r--r--arch/x86/kernel/cpu/microcode/Makefile5
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c1306
-rw-r--r--arch/x86/kernel/cpu/microcode/amd_shas.c556
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c926
-rw-r--r--arch/x86/kernel/cpu/microcode/intel-ucode-defs.h160
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c1016
-rw-r--r--arch/x86/kernel/cpu/microcode/internal.h136
-rw-r--r--arch/x86/kernel/cpu/mkcapflags.sh68
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c770
-rw-r--r--arch/x86/kernel/cpu/mtrr/Makefile5
-rw-r--r--arch/x86/kernel/cpu/mtrr/amd.c11
-rw-r--r--arch/x86/kernel/cpu/mtrr/centaur.c22
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c159
-rw-r--r--arch/x86/kernel/cpu/mtrr/cyrix.c56
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c792
-rw-r--r--arch/x86/kernel/cpu/mtrr/if.c130
-rw-r--r--arch/x86/kernel/cpu/mtrr/legacy.c94
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.c (renamed from arch/x86/kernel/cpu/mtrr/main.c)415
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.h54
-rw-r--r--arch/x86/kernel/cpu/perf_event.c2132
-rw-r--r--arch/x86/kernel/cpu/perf_event.h717
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd.c729
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd_ibs.c908
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd_iommu.c502
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd_iommu.h40
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd_uncore.c547
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c2349
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c1034
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_lbr.c761
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore.c3658
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore.h681
-rw-r--r--arch/x86/kernel/cpu/perf_event_knc.c319
-rw-r--r--arch/x86/kernel/cpu/perf_event_p4.c1344
-rw-r--r--arch/x86/kernel/cpu/perf_event_p6.c261
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c24
-rw-r--r--arch/x86/kernel/cpu/powerflags.c3
-rw-r--r--arch/x86/kernel/cpu/proc.c106
-rw-r--r--arch/x86/kernel/cpu/rdrand.c79
-rw-r--r--arch/x86/kernel/cpu/resctrl/Makefile7
-rw-r--r--arch/x86/kernel/cpu/resctrl/core.c1079
-rw-r--r--arch/x86/kernel/cpu/resctrl/ctrlmondata.c133
-rw-r--r--arch/x86/kernel/cpu/resctrl/internal.h225
-rw-r--r--arch/x86/kernel/cpu/resctrl/monitor.c583
-rw-r--r--arch/x86/kernel/cpu/resctrl/pseudo_lock.c517
-rw-r--r--arch/x86/kernel/cpu/resctrl/pseudo_lock_trace.h45
-rw-r--r--arch/x86/kernel/cpu/resctrl/rdtgroup.c262
-rw-r--r--arch/x86/kernel/cpu/scattered.c93
-rw-r--r--arch/x86/kernel/cpu/sgx/Makefile6
-rw-r--r--arch/x86/kernel/cpu/sgx/driver.c201
-rw-r--r--arch/x86/kernel/cpu/sgx/driver.h28
-rw-r--r--arch/x86/kernel/cpu/sgx/encl.c1326
-rw-r--r--arch/x86/kernel/cpu/sgx/encl.h129
-rw-r--r--arch/x86/kernel/cpu/sgx/encls.h241
-rw-r--r--arch/x86/kernel/cpu/sgx/ioctl.c1244
-rw-r--r--arch/x86/kernel/cpu/sgx/main.c1072
-rw-r--r--arch/x86/kernel/cpu/sgx/sgx.h110
-rw-r--r--arch/x86/kernel/cpu/sgx/virt.c454
-rw-r--r--arch/x86/kernel/cpu/topology.c608
-rw-r--r--arch/x86/kernel/cpu/topology.h67
-rw-r--r--arch/x86/kernel/cpu/topology_amd.c227
-rw-r--r--arch/x86/kernel/cpu/topology_common.c258
-rw-r--r--arch/x86/kernel/cpu/topology_ext.c145
-rw-r--r--arch/x86/kernel/cpu/transmeta.c24
-rw-r--r--arch/x86/kernel/cpu/tsx.c267
-rw-r--r--arch/x86/kernel/cpu/umc.c8
-rw-r--r--arch/x86/kernel/cpu/umwait.c246
-rw-r--r--arch/x86/kernel/cpu/vmware.c588
-rw-r--r--arch/x86/kernel/cpu/vortex.c39
-rw-r--r--arch/x86/kernel/cpu/zhaoxin.c116
117 files changed, 34185 insertions, 25542 deletions
diff --git a/arch/x86/kernel/cpu/.gitignore b/arch/x86/kernel/cpu/.gitignore
index 667df55a4399..0bca7ef7426a 100644
--- a/arch/x86/kernel/cpu/.gitignore
+++ b/arch/x86/kernel/cpu/.gitignore
@@ -1 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
capflags.c
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 47b56a7e99cb..2f8a58ef690e 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
#
# Makefile for x86-compatible CPU details, features and quirks
#
@@ -8,50 +9,68 @@ CFLAGS_REMOVE_common.o = -pg
CFLAGS_REMOVE_perf_event.o = -pg
endif
-# Make sure load_percpu_segment has no stackprotector
-nostackp := $(call cc-option, -fno-stack-protector)
-CFLAGS_common.o := $(nostackp)
+# If these files are instrumented, boot hangs during the first second.
+KCOV_INSTRUMENT_common.o := n
+KCOV_INSTRUMENT_perf_event.o := n
+KMSAN_SANITIZE_common.o := n
-obj-y := intel_cacheinfo.o scattered.o topology.o
-obj-y += proc.o capflags.o powerflags.o common.o
+# As above, instrumenting secondary CPU boot code causes boot hangs.
+KCSAN_SANITIZE_common.o := n
+
+obj-y := cacheinfo.o scattered.o
+obj-y += topology_common.o topology_ext.o topology_amd.o
+obj-y += common.o
obj-y += rdrand.o
obj-y += match.o
+obj-y += bugs.o
+obj-y += aperfmperf.o
+obj-y += cpuid-deps.o cpuid_0x2_table.o
+obj-y += umwait.o
+obj-y += capflags.o powerflags.o
+
+obj-$(CONFIG_X86_LOCAL_APIC) += topology.o
-obj-$(CONFIG_X86_32) += bugs.o
-obj-$(CONFIG_X86_64) += bugs_64.o
+obj-$(CONFIG_PROC_FS) += proc.o
-obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
+obj-$(CONFIG_IA32_FEAT_CTL) += feat_ctl.o
+ifdef CONFIG_CPU_SUP_INTEL
+obj-y += intel.o tsx.o
+obj-$(CONFIG_PM) += intel_epb.o
+endif
obj-$(CONFIG_CPU_SUP_AMD) += amd.o
+ifeq ($(CONFIG_AMD_NB)$(CONFIG_SYSFS),yy)
+obj-y += amd_cache_disable.o
+endif
+obj-$(CONFIG_CPU_SUP_HYGON) += hygon.o
obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
+obj-$(CONFIG_CPU_SUP_ZHAOXIN) += zhaoxin.o
+obj-$(CONFIG_CPU_SUP_VORTEX_32) += vortex.o
-obj-$(CONFIG_PERF_EVENTS) += perf_event.o
-
-ifdef CONFIG_PERF_EVENTS
-obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd.o perf_event_amd_uncore.o
-ifdef CONFIG_AMD_IOMMU
-obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd_iommu.o
-endif
-obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o
-obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
-obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore.o
-endif
-
-
-obj-$(CONFIG_X86_MCE) += mcheck/
+obj-$(CONFIG_X86_MCE) += mce/
obj-$(CONFIG_MTRR) += mtrr/
+obj-$(CONFIG_MICROCODE) += microcode/
+obj-$(CONFIG_X86_CPU_RESCTRL) += resctrl/
+obj-$(CONFIG_X86_SGX) += sgx/
-obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o perf_event_amd_ibs.o
+obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o
+obj-$(CONFIG_BHYVE_GUEST) += bhyve.o
+obj-$(CONFIG_ACRN_GUEST) += acrn.o
+
+obj-$(CONFIG_DEBUG_FS) += debugfs.o
+
+obj-$(CONFIG_X86_BUS_LOCK_DETECT) += bus_lock.o
quiet_cmd_mkcapflags = MKCAP $@
- cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $< $@
+ cmd_mkcapflags = $(CONFIG_SHELL) $(src)/mkcapflags.sh $@ $^
-cpufeature = $(src)/../../include/asm/cpufeature.h
+cpufeature = $(src)/../../include/asm/cpufeatures.h
+vmxfeature = $(src)/../../include/asm/vmxfeatures.h
-targets += capflags.c
-$(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.sh FORCE
+$(obj)/capflags.c: $(cpufeature) $(vmxfeature) $(src)/mkcapflags.sh FORCE
$(call if_changed,mkcapflags)
+targets += capflags.c
diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c
new file mode 100644
index 000000000000..2c5b51aad91a
--- /dev/null
+++ b/arch/x86/kernel/cpu/acrn.c
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ACRN detection support
+ *
+ * Copyright (C) 2019 Intel Corporation. All rights reserved.
+ *
+ * Jason Chen CJ <jason.cj.chen@intel.com>
+ * Zhao Yakui <yakui.zhao@intel.com>
+ *
+ */
+
+#include <linux/interrupt.h>
+
+#include <asm/acrn.h>
+#include <asm/apic.h>
+#include <asm/cpufeatures.h>
+#include <asm/desc.h>
+#include <asm/hypervisor.h>
+#include <asm/idtentry.h>
+#include <asm/irq_regs.h>
+
+static u32 __init acrn_detect(void)
+{
+ return acrn_cpuid_base();
+}
+
+static void __init acrn_init_platform(void)
+{
+ /* Install system interrupt handler for ACRN hypervisor callback */
+ sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_acrn_hv_callback);
+
+ x86_platform.calibrate_tsc = acrn_get_tsc_khz;
+ x86_platform.calibrate_cpu = acrn_get_tsc_khz;
+}
+
+static bool acrn_x2apic_available(void)
+{
+ return boot_cpu_has(X86_FEATURE_X2APIC);
+}
+
+static void (*acrn_intr_handler)(void);
+
+DEFINE_IDTENTRY_SYSVEC(sysvec_acrn_hv_callback)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ /*
+ * The hypervisor requires that the APIC EOI should be acked.
+ * If the APIC EOI is not acked, the APIC ISR bit for the
+ * HYPERVISOR_CALLBACK_VECTOR will not be cleared and then it
+ * will block the interrupt whose vector is lower than
+ * HYPERVISOR_CALLBACK_VECTOR.
+ */
+ apic_eoi();
+ inc_irq_stat(irq_hv_callback_count);
+
+ if (acrn_intr_handler)
+ acrn_intr_handler();
+
+ set_irq_regs(old_regs);
+}
+
+void acrn_setup_intr_handler(void (*handler)(void))
+{
+ acrn_intr_handler = handler;
+}
+EXPORT_SYMBOL_GPL(acrn_setup_intr_handler);
+
+void acrn_remove_intr_handler(void)
+{
+ acrn_intr_handler = NULL;
+}
+EXPORT_SYMBOL_GPL(acrn_remove_intr_handler);
+
+const __initconst struct hypervisor_x86 x86_hyper_acrn = {
+ .name = "ACRN",
+ .detect = acrn_detect,
+ .type = X86_HYPER_ACRN,
+ .init.init_platform = acrn_init_platform,
+ .init.x2apic_available = acrn_x2apic_available,
+};
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index c587a8757227..bc94ff1e250a 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1,24 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/export.h>
-#include <linux/init.h>
#include <linux/bitops.h>
#include <linux/elf.h>
#include <linux/mm.h>
-
+#include <linux/kvm_types.h>
#include <linux/io.h>
#include <linux/sched.h>
+#include <linux/sched/clock.h>
+#include <linux/random.h>
+#include <linux/topology.h>
+#include <linux/platform_data/x86/amd-fch.h>
#include <asm/processor.h>
#include <asm/apic.h>
+#include <asm/cacheinfo.h>
#include <asm/cpu.h>
+#include <asm/cpu_device_id.h>
+#include <asm/spec-ctrl.h>
+#include <asm/smp.h>
+#include <asm/numa.h>
#include <asm/pci-direct.h>
+#include <asm/delay.h>
+#include <asm/debugreg.h>
+#include <asm/resctrl.h>
+#include <asm/msr.h>
+#include <asm/sev.h>
#ifdef CONFIG_X86_64
# include <asm/mmconfig.h>
-# include <asm/cacheflush.h>
#endif
#include "cpu.h"
-static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
+u16 invlpgb_count_max __ro_after_init = 1;
+
+static inline int rdmsrq_amd_safe(unsigned msr, u64 *p)
{
u32 gprs[8] = { 0 };
int err;
@@ -36,7 +51,7 @@ static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
return err;
}
-static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val)
+static inline int wrmsrq_amd_safe(unsigned msr, u64 val)
{
u32 gprs[8] = { 0 };
@@ -51,7 +66,6 @@ static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val)
return wrmsr_safe_regs(gprs);
}
-#ifdef CONFIG_X86_32
/*
* B step AMD K6 before B 9730xxxx have hardware bugs that can cause
* misexecution of code under Linux. Owners of such processors should
@@ -66,14 +80,21 @@ static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val)
* performance at the same time..
*/
-extern void vide(void);
-__asm__(".align 4\nvide: ret");
+#ifdef CONFIG_X86_32
+extern __visible void vide(void);
+__asm__(".text\n"
+ ".globl vide\n"
+ ".type vide, @function\n"
+ ".align 4\n"
+ "vide: ret\n");
+#endif
-static void __cpuinit init_amd_k5(struct cpuinfo_x86 *c)
+static void init_amd_k5(struct cpuinfo_x86 *c)
{
+#ifdef CONFIG_X86_32
/*
* General Systems BIOSen alias the cpu frequency registers
- * of the Elan at 0x000df000. Unfortuantly, one of the Linux
+ * of the Elan at 0x000df000. Unfortunately, one of the Linux
* drivers subsequently pokes it, and changes the CPU speed.
* Workaround : Remove the unneeded alias.
*/
@@ -84,11 +105,12 @@ static void __cpuinit init_amd_k5(struct cpuinfo_x86 *c)
if (inl(CBAR) & CBAR_ENB)
outl(0 | CBAR_KEY, CBAR);
}
+#endif
}
-
-static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
+static void init_amd_k6(struct cpuinfo_x86 *c)
{
+#ifdef CONFIG_X86_32
u32 l, h;
int mbytes = get_num_physpages() >> (20-PAGE_SHIFT);
@@ -101,13 +123,13 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
return;
}
- if (c->x86_model == 6 && c->x86_mask == 1) {
+ if (c->x86_model == 6 && c->x86_stepping == 1) {
const int K6_BUG_LOOP = 1000000;
int n;
void (*f_vide)(void);
- unsigned long d, d2;
+ u64 d, d2;
- printk(KERN_INFO "AMD K6 stepping B detected - ");
+ pr_info("AMD K6 stepping B detected - ");
/*
* It looks like AMD fixed the 2.6.2 bug and improved indirect
@@ -116,22 +138,22 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
n = K6_BUG_LOOP;
f_vide = vide;
- rdtscl(d);
+ OPTIMIZER_HIDE_VAR(f_vide);
+ d = rdtsc();
while (n--)
f_vide();
- rdtscl(d2);
+ d2 = rdtsc();
d = d2-d;
if (d > 20*K6_BUG_LOOP)
- printk(KERN_CONT
- "system stability may be impaired when more than 32 MB are used.\n");
+ pr_cont("system stability may be impaired when more than 32 MB are used.\n");
else
- printk(KERN_CONT "probably OK (after B9730xxxx).\n");
+ pr_cont("probably OK (after B9730xxxx).\n");
}
/* K6 with old style WHCR */
if (c->x86_model < 8 ||
- (c->x86_model == 8 && c->x86_mask < 8)) {
+ (c->x86_model == 8 && c->x86_stepping < 8)) {
/* We can only write allocate on the low 508Mb */
if (mbytes > 508)
mbytes = 508;
@@ -144,13 +166,13 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
wbinvd();
wrmsr(MSR_K6_WHCR, l, h);
local_irq_restore(flags);
- printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n",
+ pr_info("Enabling old style K6 write allocation for %d Mb\n",
mbytes);
}
return;
}
- if ((c->x86_model == 8 && c->x86_mask > 7) ||
+ if ((c->x86_model == 8 && c->x86_stepping > 7) ||
c->x86_model == 9 || c->x86_model == 13) {
/* The more serious chips .. */
@@ -165,7 +187,7 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
wbinvd();
wrmsr(MSR_K6_WHCR, l, h);
local_irq_restore(flags);
- printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n",
+ pr_info("Enabling new style K6 write allocation for %d Mb\n",
mbytes);
}
@@ -177,10 +199,41 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
/* placeholder for any needed mods */
return;
}
+#endif
}
-static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
+static void init_amd_k7(struct cpuinfo_x86 *c)
{
+#ifdef CONFIG_X86_32
+ u32 l, h;
+
+ /*
+ * Bit 15 of Athlon specific MSR 15, needs to be 0
+ * to enable SSE on Palomino/Morgan/Barton CPU's.
+ * If the BIOS didn't enable it already, enable it here.
+ */
+ if (c->x86_model >= 6 && c->x86_model <= 10) {
+ if (!cpu_has(c, X86_FEATURE_XMM)) {
+ pr_info("Enabling disabled K7/SSE Support.\n");
+ msr_clear_bit(MSR_K7_HWCR, 15);
+ set_cpu_cap(c, X86_FEATURE_XMM);
+ }
+ }
+
+ /*
+ * It's been determined by AMD that Athlons since model 8 stepping 1
+ * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx
+ * As per AMD technical note 27212 0.2
+ */
+ if ((c->x86_model == 8 && c->x86_stepping >= 1) || (c->x86_model > 8)) {
+ rdmsr(MSR_K7_CLK_CTL, l, h);
+ if ((l & 0xfff00000) != 0x20000000) {
+ pr_info("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n",
+ l, ((l & 0x000fffff)|0x20000000));
+ wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
+ }
+ }
+
/* calling is from identify_secondary_cpu() ? */
if (!c->cpu_index)
return;
@@ -190,12 +243,12 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
* but they are not certified as MP capable.
*/
/* Athlon 660/661 is valid. */
- if ((c->x86_model == 6) && ((c->x86_mask == 0) ||
- (c->x86_mask == 1)))
+ if ((c->x86_model == 6) && ((c->x86_stepping == 0) ||
+ (c->x86_stepping == 1)))
return;
/* Duron 670 is valid */
- if ((c->x86_model == 7) && (c->x86_mask == 0))
+ if ((c->x86_model == 7) && (c->x86_stepping == 0))
return;
/*
@@ -205,10 +258,10 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
* See http://www.heise.de/newsticker/data/jow-18.10.01-000 for
* more.
*/
- if (((c->x86_model == 6) && (c->x86_mask >= 2)) ||
- ((c->x86_model == 7) && (c->x86_mask >= 1)) ||
+ if (((c->x86_model == 6) && (c->x86_stepping >= 2)) ||
+ ((c->x86_model == 7) && (c->x86_stepping >= 1)) ||
(c->x86_model > 7))
- if (cpu_has_mp)
+ if (cpu_has(c, X86_FEATURE_MP))
return;
/* If we get here, not a certified SMP capable AMD system. */
@@ -219,55 +272,16 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
*/
WARN_ONCE(1, "WARNING: This combination of AMD"
" processors is not suitable for SMP.\n");
- add_taint(TAINT_UNSAFE_SMP, LOCKDEP_NOW_UNRELIABLE);
-}
-
-static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
-{
- u32 l, h;
-
- /*
- * Bit 15 of Athlon specific MSR 15, needs to be 0
- * to enable SSE on Palomino/Morgan/Barton CPU's.
- * If the BIOS didn't enable it already, enable it here.
- */
- if (c->x86_model >= 6 && c->x86_model <= 10) {
- if (!cpu_has(c, X86_FEATURE_XMM)) {
- printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
- rdmsr(MSR_K7_HWCR, l, h);
- l &= ~0x00008000;
- wrmsr(MSR_K7_HWCR, l, h);
- set_cpu_cap(c, X86_FEATURE_XMM);
- }
- }
-
- /*
- * It's been determined by AMD that Athlons since model 8 stepping 1
- * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx
- * As per AMD technical note 27212 0.2
- */
- if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
- rdmsr(MSR_K7_CLK_CTL, l, h);
- if ((l & 0xfff00000) != 0x20000000) {
- printk(KERN_INFO
- "CPU: CLK_CTL MSR was %x. Reprogramming to %x\n",
- l, ((l & 0x000fffff)|0x20000000));
- wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
- }
- }
-
- set_cpu_cap(c, X86_FEATURE_K7);
-
- amd_k7_smp_check(c);
-}
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE);
#endif
+}
#ifdef CONFIG_NUMA
/*
* To workaround broken NUMA config. Read the comment in
* srat_detect_node().
*/
-static int __cpuinit nearby_node(int apicid)
+static int nearby_node(int apicid)
{
int i, node;
@@ -285,100 +299,16 @@ static int __cpuinit nearby_node(int apicid)
}
#endif
-/*
- * Fixup core topology information for
- * (1) AMD multi-node processors
- * Assumption: Number of cores in each internal node is the same.
- * (2) AMD processors supporting compute units
- */
-#ifdef CONFIG_X86_HT
-static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c)
-{
- u32 nodes, cores_per_cu = 1;
- u8 node_id;
- int cpu = smp_processor_id();
-
- /* get information required for multi-node processors */
- if (cpu_has_topoext) {
- u32 eax, ebx, ecx, edx;
-
- cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
- nodes = ((ecx >> 8) & 7) + 1;
- node_id = ecx & 7;
-
- /* get compute unit information */
- smp_num_siblings = ((ebx >> 8) & 3) + 1;
- c->compute_unit_id = ebx & 0xff;
- cores_per_cu += ((ebx >> 8) & 3);
- } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
- u64 value;
-
- rdmsrl(MSR_FAM10H_NODE_ID, value);
- nodes = ((value >> 3) & 7) + 1;
- node_id = value & 7;
- } else
- return;
-
- /* fixup multi-node processor information */
- if (nodes > 1) {
- u32 cores_per_node;
- u32 cus_per_node;
-
- set_cpu_cap(c, X86_FEATURE_AMD_DCM);
- cores_per_node = c->x86_max_cores / nodes;
- cus_per_node = cores_per_node / cores_per_cu;
-
- /* store NodeID, use llc_shared_map to store sibling info */
- per_cpu(cpu_llc_id, cpu) = node_id;
-
- /* core id has to be in the [0 .. cores_per_node - 1] range */
- c->cpu_core_id %= cores_per_node;
- c->compute_unit_id %= cus_per_node;
- }
-}
-#endif
-
-/*
- * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
- * Assumes number of cores is a power of two.
- */
-static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_X86_HT
- unsigned bits;
- int cpu = smp_processor_id();
-
- bits = c->x86_coreid_bits;
- /* Low order bits define the core id (index of core in socket) */
- c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
- /* Convert the initial APIC ID into the socket ID */
- c->phys_proc_id = c->initial_apicid >> bits;
- /* use socket ID also for last level cache */
- per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
- amd_get_topology(c);
-#endif
-}
-
-u16 amd_get_nb_id(int cpu)
-{
- u16 id = 0;
-#ifdef CONFIG_SMP
- id = per_cpu(cpu_llc_id, cpu);
-#endif
- return id;
-}
-EXPORT_SYMBOL_GPL(amd_get_nb_id);
-
-static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
+static void srat_detect_node(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_NUMA
int cpu = smp_processor_id();
int node;
- unsigned apicid = c->apicid;
+ unsigned apicid = c->topo.apicid;
node = numa_cpu_node(cpu);
if (node == NUMA_NO_NODE)
- node = per_cpu(cpu_llc_id, cpu);
+ node = per_cpu_llc_id(cpu);
/*
* On multi-fabric platform (e.g. Numascale NumaChip) a
@@ -408,10 +338,9 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
* through CPU mapping may alter the outcome, directly
* access __apicid_to_node[].
*/
- int ht_nodeid = c->initial_apicid;
+ int ht_nodeid = c->topo.initial_apicid;
- if (ht_nodeid >= 0 &&
- __apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
+ if (__apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
node = __apicid_to_node[ht_nodeid];
/* Pick a nearby node */
if (!node_online(node))
@@ -421,33 +350,75 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
#endif
}
-static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
+static void bsp_determine_snp(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_X86_HT
- unsigned bits, ecx;
-
- /* Multi core CPU? */
- if (c->extended_cpuid_level < 0x80000008)
- return;
+#ifdef CONFIG_ARCH_HAS_CC_PLATFORM
+ cc_vendor = CC_VENDOR_AMD;
- ecx = cpuid_ecx(0x80000008);
+ if (cpu_has(c, X86_FEATURE_SEV_SNP)) {
+ /*
+ * RMP table entry format is not architectural and is defined by the
+ * per-processor PPR. Restrict SNP support on the known CPU models
+ * for which the RMP table entry format is currently defined or for
+ * processors which support the architecturally defined RMPREAD
+ * instruction.
+ */
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR) &&
+ (cpu_feature_enabled(X86_FEATURE_ZEN3) ||
+ cpu_feature_enabled(X86_FEATURE_ZEN4) ||
+ cpu_feature_enabled(X86_FEATURE_RMPREAD)) &&
+ snp_probe_rmptable_info()) {
+ cc_platform_set(CC_ATTR_HOST_SEV_SNP);
+ } else {
+ setup_clear_cpu_cap(X86_FEATURE_SEV_SNP);
+ cc_platform_clear(CC_ATTR_HOST_SEV_SNP);
+ }
+ }
+#endif
+}
- c->x86_max_cores = (ecx & 0xff) + 1;
+#define ZEN_MODEL_STEP_UCODE(fam, model, step, ucode) \
+ X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, fam, model), \
+ step, step, ucode)
+
+static const struct x86_cpu_id amd_tsa_microcode[] = {
+ ZEN_MODEL_STEP_UCODE(0x19, 0x01, 0x1, 0x0a0011d7),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x01, 0x2, 0x0a00123b),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x08, 0x2, 0x0a00820d),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x11, 0x1, 0x0a10114c),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x11, 0x2, 0x0a10124c),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x18, 0x1, 0x0a108109),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x21, 0x0, 0x0a20102e),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x21, 0x2, 0x0a201211),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x44, 0x1, 0x0a404108),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x50, 0x0, 0x0a500012),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x61, 0x2, 0x0a60120a),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x74, 0x1, 0x0a704108),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x75, 0x2, 0x0a705208),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x78, 0x0, 0x0a708008),
+ ZEN_MODEL_STEP_UCODE(0x19, 0x7c, 0x0, 0x0a70c008),
+ ZEN_MODEL_STEP_UCODE(0x19, 0xa0, 0x2, 0x0aa00216),
+ {},
+};
- /* CPU telling us the core id bits shift? */
- bits = (ecx >> 12) & 0xF;
+static void tsa_init(struct cpuinfo_x86 *c)
+{
+ if (cpu_has(c, X86_FEATURE_HYPERVISOR))
+ return;
- /* Otherwise recompute */
- if (bits == 0) {
- while ((1 << bits) < c->x86_max_cores)
- bits++;
+ if (cpu_has(c, X86_FEATURE_ZEN3) ||
+ cpu_has(c, X86_FEATURE_ZEN4)) {
+ if (x86_match_min_microcode_rev(amd_tsa_microcode))
+ setup_force_cpu_cap(X86_FEATURE_VERW_CLEAR);
+ else
+ pr_debug("%s: current revision: 0x%x\n", __func__, c->microcode);
+ } else {
+ setup_force_cpu_cap(X86_FEATURE_TSA_SQ_NO);
+ setup_force_cpu_cap(X86_FEATURE_TSA_L1_NO);
}
-
- c->x86_coreid_bits = bits;
-#endif
}
-static void __cpuinit bsp_init_amd(struct cpuinfo_x86 *c)
+static void bsp_init_amd(struct cpuinfo_x86 *c)
{
if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) {
@@ -455,10 +426,9 @@ static void __cpuinit bsp_init_amd(struct cpuinfo_x86 *c)
(c->x86 == 0x10 && c->x86_model >= 0x2)) {
u64 val;
- rdmsrl(MSR_K7_HWCR, val);
+ rdmsrq(MSR_K7_HWCR, val);
if (!(val & BIT(24)))
- printk(KERN_WARNING FW_BUG "TSC doesn't count "
- "with P0 frequency!\n");
+ pr_warn(FW_BUG "TSC doesn't count with P0 frequency!\n");
}
}
@@ -472,12 +442,182 @@ static void __cpuinit bsp_init_amd(struct cpuinfo_x86 *c)
va_align.mask = (upperbit - 1) & PAGE_MASK;
va_align.flags = ALIGN_VA_32 | ALIGN_VA_64;
+
+ /* A random value per boot for bit slice [12:upper_bit) */
+ va_align.bits = get_random_u32() & va_align.mask;
+ }
+
+ if (cpu_has(c, X86_FEATURE_MWAITX))
+ use_mwaitx_delay();
+
+ if (!boot_cpu_has(X86_FEATURE_AMD_SSBD) &&
+ !boot_cpu_has(X86_FEATURE_VIRT_SSBD) &&
+ c->x86 >= 0x15 && c->x86 <= 0x17) {
+ unsigned int bit;
+
+ switch (c->x86) {
+ case 0x15: bit = 54; break;
+ case 0x16: bit = 33; break;
+ case 0x17: bit = 10; break;
+ default: return;
+ }
+ /*
+ * Try to cache the base value so further operations can
+ * avoid RMW. If that faults, do not enable SSBD.
+ */
+ if (!rdmsrq_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) {
+ setup_force_cpu_cap(X86_FEATURE_LS_CFG_SSBD);
+ setup_force_cpu_cap(X86_FEATURE_SSBD);
+ x86_amd_ls_cfg_ssbd_mask = 1ULL << bit;
+ }
+ }
+
+ resctrl_cpu_detect(c);
+
+ /* Figure out Zen generations: */
+ switch (c->x86) {
+ case 0x17:
+ switch (c->x86_model) {
+ case 0x00 ... 0x2f:
+ case 0x50 ... 0x5f:
+ setup_force_cpu_cap(X86_FEATURE_ZEN1);
+ break;
+ case 0x30 ... 0x4f:
+ case 0x60 ... 0x7f:
+ case 0x90 ... 0x91:
+ case 0xa0 ... 0xaf:
+ setup_force_cpu_cap(X86_FEATURE_ZEN2);
+ break;
+ default:
+ goto warn;
+ }
+ break;
+
+ case 0x19:
+ switch (c->x86_model) {
+ case 0x00 ... 0x0f:
+ case 0x20 ... 0x5f:
+ setup_force_cpu_cap(X86_FEATURE_ZEN3);
+ break;
+ case 0x10 ... 0x1f:
+ case 0x60 ... 0xaf:
+ setup_force_cpu_cap(X86_FEATURE_ZEN4);
+ break;
+ default:
+ goto warn;
+ }
+ break;
+
+ case 0x1a:
+ switch (c->x86_model) {
+ case 0x00 ... 0x2f:
+ case 0x40 ... 0x4f:
+ case 0x60 ... 0x7f:
+ setup_force_cpu_cap(X86_FEATURE_ZEN5);
+ break;
+ case 0x50 ... 0x5f:
+ case 0x80 ... 0xaf:
+ case 0xc0 ... 0xcf:
+ setup_force_cpu_cap(X86_FEATURE_ZEN6);
+ break;
+ default:
+ goto warn;
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ bsp_determine_snp(c);
+ tsa_init(c);
+
+ if (cpu_has(c, X86_FEATURE_GP_ON_USER_CPUID))
+ setup_force_cpu_cap(X86_FEATURE_CPUID_FAULT);
+
+ return;
+
+warn:
+ WARN_ONCE(1, "Family 0x%x, model: 0x%x??\n", c->x86, c->x86_model);
+}
+
+static void early_detect_mem_encrypt(struct cpuinfo_x86 *c)
+{
+ u64 msr;
+
+ /*
+ * Mark using WBINVD is needed during kexec on processors that
+ * support SME. This provides support for performing a successful
+ * kexec when going from SME inactive to SME active (or vice-versa).
+ *
+ * The cache must be cleared so that if there are entries with the
+ * same physical address, both with and without the encryption bit,
+ * they don't race each other when flushed and potentially end up
+ * with the wrong entry being committed to memory.
+ *
+ * Test the CPUID bit directly because with mem_encrypt=off the
+ * BSP will clear the X86_FEATURE_SME bit and the APs will not
+ * see it set after that.
+ */
+ if (c->extended_cpuid_level >= 0x8000001f && (cpuid_eax(0x8000001f) & BIT(0)))
+ __this_cpu_write(cache_state_incoherent, true);
+
+ /*
+ * BIOS support is required for SME and SEV.
+ * For SME: If BIOS has enabled SME then adjust x86_phys_bits by
+ * the SME physical address space reduction value.
+ * If BIOS has not enabled SME then don't advertise the
+ * SME feature (set in scattered.c).
+ * If the kernel has not enabled SME via any means then
+ * don't advertise the SME feature.
+ * For SEV: If BIOS has not enabled SEV then don't advertise SEV and
+ * any additional functionality based on it.
+ *
+ * In all cases, since support for SME and SEV requires long mode,
+ * don't advertise the feature under CONFIG_X86_32.
+ */
+ if (cpu_has(c, X86_FEATURE_SME) || cpu_has(c, X86_FEATURE_SEV)) {
+ /* Check if memory encryption is enabled */
+ rdmsrq(MSR_AMD64_SYSCFG, msr);
+ if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
+ goto clear_all;
+
+ /*
+ * Always adjust physical address bits. Even though this
+ * will be a value above 32-bits this is still done for
+ * CONFIG_X86_32 so that accurate values are reported.
+ */
+ c->x86_phys_bits -= (cpuid_ebx(0x8000001f) >> 6) & 0x3f;
+
+ if (IS_ENABLED(CONFIG_X86_32))
+ goto clear_all;
+
+ if (!sme_me_mask)
+ setup_clear_cpu_cap(X86_FEATURE_SME);
+
+ rdmsrq(MSR_K7_HWCR, msr);
+ if (!(msr & MSR_K7_HWCR_SMMLOCK))
+ goto clear_sev;
+
+ return;
+
+clear_all:
+ setup_clear_cpu_cap(X86_FEATURE_SME);
+clear_sev:
+ setup_clear_cpu_cap(X86_FEATURE_SEV);
+ setup_clear_cpu_cap(X86_FEATURE_SEV_ES);
+ setup_clear_cpu_cap(X86_FEATURE_SEV_SNP);
}
}
-static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
+static void early_init_amd(struct cpuinfo_x86 *c)
{
- early_init_amd_mc(c);
+ u32 dummy;
+
+ if (c->x86 >= 0xf)
+ set_cpu_cap(c, X86_FEATURE_K8);
+
+ rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
/*
* c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
@@ -486,38 +626,94 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
if (c->x86_power & (1 << 8)) {
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
- if (!check_tsc_unstable())
- sched_clock_stable = 1;
}
+ /* Bit 12 of 8000_0007 edx is accumulated power mechanism. */
+ if (c->x86_power & BIT(12))
+ set_cpu_cap(c, X86_FEATURE_ACC_POWER);
+
+ /* Bit 14 indicates the Runtime Average Power Limit interface. */
+ if (c->x86_power & BIT(14))
+ set_cpu_cap(c, X86_FEATURE_RAPL);
+
#ifdef CONFIG_X86_64
set_cpu_cap(c, X86_FEATURE_SYSCALL32);
#else
/* Set MTRR capability flag if appropriate */
if (c->x86 == 5)
if (c->x86_model == 13 || c->x86_model == 9 ||
- (c->x86_model == 8 && c->x86_mask >= 8))
+ (c->x86_model == 8 && c->x86_stepping >= 8))
set_cpu_cap(c, X86_FEATURE_K6_MTRR);
#endif
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI)
- /* check CPU config space for extended APIC ID */
- if (cpu_has_apic && c->x86 >= 0xf) {
- unsigned int val;
- val = read_pci_config(0, 24, 0, 0x68);
- if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18)))
+ /*
+ * ApicID can always be treated as an 8-bit value for AMD APIC versions
+ * >= 0x10, but even old K8s came out of reset with version 0x10. So, we
+ * can safely set X86_FEATURE_EXTD_APICID unconditionally for families
+ * after 16h.
+ */
+ if (boot_cpu_has(X86_FEATURE_APIC)) {
+ if (c->x86 > 0x16)
set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
+ else if (c->x86 >= 0xf) {
+ /* check CPU config space for extended APIC ID */
+ unsigned int val;
+
+ val = read_pci_config(0, 24, 0, 0x68);
+ if ((val >> 17 & 0x3) == 0x3)
+ set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
+ }
}
#endif
-}
-static const int amd_erratum_383[];
-static const int amd_erratum_400[];
-static bool cpu_has_amd_erratum(const int *erratum);
+ /*
+ * This is only needed to tell the kernel whether to use VMCALL
+ * and VMMCALL. VMMCALL is never executed except under virt, so
+ * we can set it unconditionally.
+ */
+ set_cpu_cap(c, X86_FEATURE_VMMCALL);
+
+ /* F16h erratum 793, CVE-2013-6885 */
+ if (c->x86 == 0x16 && c->x86_model <= 0xf)
+ msr_set_bit(MSR_AMD64_LS_CFG, 15);
+
+ early_detect_mem_encrypt(c);
+
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && !cpu_has(c, X86_FEATURE_IBPB_BRTYPE)) {
+ if (c->x86 == 0x17 && boot_cpu_has(X86_FEATURE_AMD_IBPB))
+ setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE);
+ else if (c->x86 >= 0x19 && !wrmsrq_safe(MSR_IA32_PRED_CMD, PRED_CMD_SBPB)) {
+ setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE);
+ setup_force_cpu_cap(X86_FEATURE_SBPB);
+ }
+ }
+}
-static void __cpuinit init_amd(struct cpuinfo_x86 *c)
+static void init_amd_k8(struct cpuinfo_x86 *c)
{
- u32 dummy;
- unsigned long long value;
+ u32 level;
+ u64 value;
+
+ /* On C+ stepping K8 rep microcode works well for copy/memset */
+ level = cpuid_eax(1);
+ if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+
+ /*
+ * Some BIOSes incorrectly force this feature, but only K8 revision D
+ * (model = 0x14) and later actually support it.
+ * (AMD Erratum #110, docId: 25759).
+ */
+ if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM) && !cpu_has(c, X86_FEATURE_HYPERVISOR)) {
+ clear_cpu_cap(c, X86_FEATURE_LAHF_LM);
+ if (!rdmsrq_amd_safe(0xc001100d, &value)) {
+ value &= ~BIT_64(32);
+ wrmsrq_amd_safe(0xc001100d, value);
+ }
+ }
+
+ if (!c->x86_model_id[0])
+ strscpy(c->x86_model_id, "Hammer");
#ifdef CONFIG_SMP
/*
@@ -527,166 +723,426 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
* Errata 63 for SH-B3 steppings
* Errata 122 for all steppings (F+ have it disabled by default)
*/
- if (c->x86 == 0xf) {
- rdmsrl(MSR_K7_HWCR, value);
- value |= 1 << 6;
- wrmsrl(MSR_K7_HWCR, value);
- }
+ msr_set_bit(MSR_K7_HWCR, 6);
#endif
-
- early_init_amd(c);
+ set_cpu_bug(c, X86_BUG_SWAPGS_FENCE);
/*
- * Bit 31 in normal CPUID used for nonstandard 3DNow ID;
- * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
+ * Check models and steppings affected by erratum 400. This is
+ * used to select the proper idle routine and to enable the
+ * check whether the machine is affected in arch_post_acpi_subsys_init()
+ * which sets the X86_BUG_AMD_APIC_C1E bug depending on the MSR check.
*/
- clear_cpu_cap(c, 0*32+31);
+ if (c->x86_model > 0x41 ||
+ (c->x86_model == 0x41 && c->x86_stepping >= 0x2))
+ setup_force_cpu_bug(X86_BUG_AMD_E400);
+}
-#ifdef CONFIG_X86_64
- /* On C+ stepping K8 rep microcode works well for copy/memset */
- if (c->x86 == 0xf) {
- u32 level;
+static void init_amd_gh(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_MMCONF_FAM10H
+ /* do this for boot cpu */
+ if (c == &boot_cpu_data)
+ check_enable_amd_mmconf_dmi();
- level = cpuid_eax(1);
- if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+ fam10h_check_enable_mmcfg();
+#endif
- /*
- * Some BIOSes incorrectly force this feature, but only K8
- * revision D (model = 0x14) and later actually support it.
- * (AMD Erratum #110, docId: 25759).
- */
- if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) {
- clear_cpu_cap(c, X86_FEATURE_LAHF_LM);
- if (!rdmsrl_amd_safe(0xc001100d, &value)) {
- value &= ~(1ULL << 32);
- wrmsrl_amd_safe(0xc001100d, value);
- }
- }
+ /*
+ * Disable GART TLB Walk Errors on Fam10h. We do this here because this
+ * is always needed when GART is enabled, even in a kernel which has no
+ * MCE support built in. BIOS should disable GartTlbWlk Errors already.
+ * If it doesn't, we do it here as suggested by the BKDG.
+ *
+ * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012
+ */
+ msr_set_bit(MSR_AMD64_MCx_MASK(4), 10);
- }
- if (c->x86 >= 0x10)
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+ /*
+ * On family 10h BIOS may not have properly enabled WC+ support, causing
+ * it to be converted to CD memtype. This may result in performance
+ * degradation for certain nested-paging guests. Prevent this conversion
+ * by clearing bit 24 in MSR_AMD64_BU_CFG2.
+ *
+ * NOTE: we want to use the _safe accessors so as not to #GP kvm
+ * guests on older kvm hosts.
+ */
+ msr_clear_bit(MSR_AMD64_BU_CFG2, 24);
- /* get apicid instead of initial apic id from cpuid */
- c->apicid = hard_smp_processor_id();
-#else
+ set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH);
/*
- * FIXME: We should handle the K5 here. Set up the write
- * range and also turn on MSR 83 bits 4 and 31 (write alloc,
- * no bus pipeline)
+ * Check models and steppings affected by erratum 400. This is
+ * used to select the proper idle routine and to enable the
+ * check whether the machine is affected in arch_post_acpi_subsys_init()
+ * which sets the X86_BUG_AMD_APIC_C1E bug depending on the MSR check.
*/
+ if (c->x86_model > 0x2 ||
+ (c->x86_model == 0x2 && c->x86_stepping >= 0x1))
+ setup_force_cpu_bug(X86_BUG_AMD_E400);
+}
- switch (c->x86) {
- case 4:
- init_amd_k5(c);
- break;
- case 5:
- init_amd_k6(c);
- break;
- case 6: /* An Athlon/Duron */
- init_amd_k7(c);
- break;
- }
+static void init_amd_ln(struct cpuinfo_x86 *c)
+{
+ /*
+ * Apply erratum 665 fix unconditionally so machines without a BIOS
+ * fix work.
+ */
+ msr_set_bit(MSR_AMD64_DE_CFG, 31);
+}
- /* K6s reports MCEs but don't actually have all the MSRs */
- if (c->x86 < 6)
- clear_cpu_cap(c, X86_FEATURE_MCE);
-#endif
+static bool rdrand_force;
- /* Enable workaround for FXSAVE leak */
- if (c->x86 >= 6)
- set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
+static int __init rdrand_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
- if (!c->x86_model_id[0]) {
- switch (c->x86) {
- case 0xf:
- /* Should distinguish Models here, but this is only
- a fallback anyways. */
- strcpy(c->x86_model_id, "Hammer");
- break;
- }
- }
+ if (!strcmp(str, "force"))
+ rdrand_force = true;
+ else
+ return -EINVAL;
- /* re-enable TopologyExtensions if switched off by BIOS */
- if ((c->x86 == 0x15) &&
- (c->x86_model >= 0x10) && (c->x86_model <= 0x1f) &&
- !cpu_has(c, X86_FEATURE_TOPOEXT)) {
-
- if (!rdmsrl_safe(0xc0011005, &value)) {
- value |= 1ULL << 54;
- wrmsrl_safe(0xc0011005, value);
- rdmsrl(0xc0011005, value);
- if (value & (1ULL << 54)) {
- set_cpu_cap(c, X86_FEATURE_TOPOEXT);
- printk(KERN_INFO FW_INFO "CPU: Re-enabling "
- "disabled Topology Extensions Support\n");
- }
- }
+ return 0;
+}
+early_param("rdrand", rdrand_cmdline);
+
+static void clear_rdrand_cpuid_bit(struct cpuinfo_x86 *c)
+{
+ /*
+ * Saving of the MSR used to hide the RDRAND support during
+ * suspend/resume is done by arch/x86/power/cpu.c, which is
+ * dependent on CONFIG_PM_SLEEP.
+ */
+ if (!IS_ENABLED(CONFIG_PM_SLEEP))
+ return;
+
+ /*
+ * The self-test can clear X86_FEATURE_RDRAND, so check for
+ * RDRAND support using the CPUID function directly.
+ */
+ if (!(cpuid_ecx(1) & BIT(30)) || rdrand_force)
+ return;
+
+ msr_clear_bit(MSR_AMD64_CPUID_FN_1, 62);
+
+ /*
+ * Verify that the CPUID change has occurred in case the kernel is
+ * running virtualized and the hypervisor doesn't support the MSR.
+ */
+ if (cpuid_ecx(1) & BIT(30)) {
+ pr_info_once("BIOS may not properly restore RDRAND after suspend, but hypervisor does not support hiding RDRAND via CPUID.\n");
+ return;
}
+ clear_cpu_cap(c, X86_FEATURE_RDRAND);
+ pr_info_once("BIOS may not properly restore RDRAND after suspend, hiding RDRAND via CPUID. Use rdrand=force to reenable.\n");
+}
+
+static void init_amd_jg(struct cpuinfo_x86 *c)
+{
+ /*
+ * Some BIOS implementations do not restore proper RDRAND support
+ * across suspend and resume. Check on whether to hide the RDRAND
+ * instruction support via CPUID.
+ */
+ clear_rdrand_cpuid_bit(c);
+}
+
+static void init_amd_bd(struct cpuinfo_x86 *c)
+{
+ u64 value;
+
/*
* The way access filter has a performance penalty on some workloads.
* Disable it on the affected CPUs.
*/
- if ((c->x86 == 0x15) &&
- (c->x86_model >= 0x02) && (c->x86_model < 0x20)) {
-
- if (!rdmsrl_safe(0xc0011021, &value) && !(value & 0x1E)) {
+ if ((c->x86_model >= 0x02) && (c->x86_model < 0x20)) {
+ if (!rdmsrq_safe(MSR_F15H_IC_CFG, &value) && !(value & 0x1E)) {
value |= 0x1E;
- wrmsrl_safe(0xc0011021, value);
+ wrmsrq_safe(MSR_F15H_IC_CFG, value);
}
}
- cpu_detect_cache_sizes(c);
+ /*
+ * Some BIOS implementations do not restore proper RDRAND support
+ * across suspend and resume. Check on whether to hide the RDRAND
+ * instruction support via CPUID.
+ */
+ clear_rdrand_cpuid_bit(c);
+}
- /* Multi core CPU? */
- if (c->extended_cpuid_level >= 0x80000008) {
- amd_detect_cmp(c);
- srat_detect_node(c);
+static const struct x86_cpu_id erratum_1386_microcode[] = {
+ X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, 0x17, 0x01), 0x2, 0x2, 0x0800126e),
+ X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, 0x17, 0x31), 0x0, 0x0, 0x08301052),
+ {}
+};
+
+static void fix_erratum_1386(struct cpuinfo_x86 *c)
+{
+ /*
+ * Work around Erratum 1386. The XSAVES instruction malfunctions in
+ * certain circumstances on Zen1/2 uarch, and not all parts have had
+ * updated microcode at the time of writing (March 2023).
+ *
+ * Affected parts all have no supervisor XSAVE states, meaning that
+ * the XSAVEC instruction (which works fine) is equivalent.
+ *
+ * Clear the feature flag only on microcode revisions which
+ * don't have the fix.
+ */
+ if (x86_match_min_microcode_rev(erratum_1386_microcode))
+ return;
+
+ clear_cpu_cap(c, X86_FEATURE_XSAVES);
+}
+
+void init_spectral_chicken(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_MITIGATION_UNRET_ENTRY
+ u64 value;
+
+ /*
+ * On Zen2 we offer this chicken (bit) on the altar of Speculation.
+ *
+ * This suppresses speculation from the middle of a basic block, i.e. it
+ * suppresses non-branch predictions.
+ */
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) {
+ if (!rdmsrq_safe(MSR_ZEN2_SPECTRAL_CHICKEN, &value)) {
+ value |= MSR_ZEN2_SPECTRAL_CHICKEN_BIT;
+ wrmsrq_safe(MSR_ZEN2_SPECTRAL_CHICKEN, value);
+ }
}
+#endif
+}
-#ifdef CONFIG_X86_32
- detect_ht(c);
+static void init_amd_zen_common(void)
+{
+ setup_force_cpu_cap(X86_FEATURE_ZEN);
+#ifdef CONFIG_NUMA
+ node_reclaim_distance = 32;
#endif
+}
- init_amd_cacheinfo(c);
+static void init_amd_zen1(struct cpuinfo_x86 *c)
+{
+ fix_erratum_1386(c);
- if (c->x86 >= 0xf)
- set_cpu_cap(c, X86_FEATURE_K8);
+ /* Fix up CPUID bits, but only if not virtualised. */
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) {
- if (cpu_has_xmm2) {
- /* MFENCE stops RDTSC speculation */
- set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
+ /* Erratum 1076: CPB feature bit not being set in CPUID. */
+ if (!cpu_has(c, X86_FEATURE_CPB))
+ set_cpu_cap(c, X86_FEATURE_CPB);
}
-#ifdef CONFIG_X86_64
- if (c->x86 == 0x10) {
- /* do this for boot cpu */
- if (c == &boot_cpu_data)
- check_enable_amd_mmconf_dmi();
+ pr_notice_once("AMD Zen1 DIV0 bug detected. Disable SMT for full protection.\n");
+ setup_force_cpu_bug(X86_BUG_DIV0);
- fam10h_check_enable_mmcfg();
+ /*
+ * Turn off the Instructions Retired free counter on machines that are
+ * susceptible to erratum #1054 "Instructions Retired Performance
+ * Counter May Be Inaccurate".
+ */
+ if (c->x86_model < 0x30) {
+ msr_clear_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT);
+ clear_cpu_cap(c, X86_FEATURE_IRPERF);
}
+}
- if (c == &boot_cpu_data && c->x86 >= 0xf) {
- unsigned long long tseg;
+static bool cpu_has_zenbleed_microcode(void)
+{
+ u32 good_rev = 0;
+
+ switch (boot_cpu_data.x86_model) {
+ case 0x30 ... 0x3f: good_rev = 0x0830107b; break;
+ case 0x60 ... 0x67: good_rev = 0x0860010c; break;
+ case 0x68 ... 0x6f: good_rev = 0x08608107; break;
+ case 0x70 ... 0x7f: good_rev = 0x08701033; break;
+ case 0xa0 ... 0xaf: good_rev = 0x08a00009; break;
+
+ default:
+ return false;
+ }
+
+ if (boot_cpu_data.microcode < good_rev)
+ return false;
+
+ return true;
+}
+
+static void zen2_zenbleed_check(struct cpuinfo_x86 *c)
+{
+ if (cpu_has(c, X86_FEATURE_HYPERVISOR))
+ return;
+
+ if (!cpu_has(c, X86_FEATURE_AVX))
+ return;
+ if (!cpu_has_zenbleed_microcode()) {
+ pr_notice_once("Zenbleed: please update your microcode for the most optimal fix\n");
+ msr_set_bit(MSR_AMD64_DE_CFG, MSR_AMD64_DE_CFG_ZEN2_FP_BACKUP_FIX_BIT);
+ } else {
+ msr_clear_bit(MSR_AMD64_DE_CFG, MSR_AMD64_DE_CFG_ZEN2_FP_BACKUP_FIX_BIT);
+ }
+}
+
+static void init_amd_zen2(struct cpuinfo_x86 *c)
+{
+ init_spectral_chicken(c);
+ fix_erratum_1386(c);
+ zen2_zenbleed_check(c);
+
+ /* Disable RDSEED on AMD Cyan Skillfish because of an error. */
+ if (c->x86_model == 0x47 && c->x86_stepping == 0x0) {
+ clear_cpu_cap(c, X86_FEATURE_RDSEED);
+ msr_clear_bit(MSR_AMD64_CPUID_FN_7, 18);
+ pr_emerg("RDSEED is not reliable on this platform; disabling.\n");
+ }
+
+ /* Correct misconfigured CPUID on some clients. */
+ clear_cpu_cap(c, X86_FEATURE_INVLPGB);
+}
+
+static void init_amd_zen3(struct cpuinfo_x86 *c)
+{
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) {
/*
- * Split up direct mapping around the TSEG SMM area.
- * Don't do it for gbpages because there seems very little
- * benefit in doing so.
+ * Zen3 (Fam19 model < 0x10) parts are not susceptible to
+ * Branch Type Confusion, but predate the allocation of the
+ * BTC_NO bit.
*/
- if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
- unsigned long pfn = tseg >> PAGE_SHIFT;
+ if (!cpu_has(c, X86_FEATURE_BTC_NO))
+ set_cpu_cap(c, X86_FEATURE_BTC_NO);
+ }
+}
- printk(KERN_DEBUG "tseg: %010llx\n", tseg);
- if (pfn_range_is_mapped(pfn, pfn + 1))
- set_memory_4k((unsigned long)__va(tseg), 1);
+static void init_amd_zen4(struct cpuinfo_x86 *c)
+{
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR))
+ msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT);
+
+ /*
+ * These Zen4 SoCs advertise support for virtualized VMLOAD/VMSAVE
+ * in some BIOS versions but they can lead to random host reboots.
+ */
+ switch (c->x86_model) {
+ case 0x18 ... 0x1f:
+ case 0x60 ... 0x7f:
+ clear_cpu_cap(c, X86_FEATURE_V_VMSAVE_VMLOAD);
+ break;
+ }
+}
+
+static const struct x86_cpu_id zen5_rdseed_microcode[] = {
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x02, 0x1, 0x0b00215a),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x08, 0x1, 0x0b008121),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x11, 0x0, 0x0b101054),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x24, 0x0, 0x0b204037),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x44, 0x0, 0x0b404035),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x44, 0x1, 0x0b404108),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x60, 0x0, 0x0b600037),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x68, 0x0, 0x0b608038),
+ ZEN_MODEL_STEP_UCODE(0x1a, 0x70, 0x0, 0x0b700037),
+ {},
+};
+
+static void init_amd_zen5(struct cpuinfo_x86 *c)
+{
+ if (!x86_match_min_microcode_rev(zen5_rdseed_microcode)) {
+ clear_cpu_cap(c, X86_FEATURE_RDSEED);
+ msr_clear_bit(MSR_AMD64_CPUID_FN_7, 18);
+ pr_emerg_once("RDSEED32 is broken. Disabling the corresponding CPUID bit.\n");
+ }
+}
+
+static void init_amd(struct cpuinfo_x86 *c)
+{
+ u64 vm_cr;
+
+ early_init_amd(c);
+
+ /*
+ * Bit 31 in normal CPUID used for nonstandard 3DNow ID;
+ * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
+ */
+ clear_cpu_cap(c, 0*32+31);
+
+ if (c->x86 >= 0x10)
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+
+ /* AMD FSRM also implies FSRS */
+ if (cpu_has(c, X86_FEATURE_FSRM))
+ set_cpu_cap(c, X86_FEATURE_FSRS);
+
+ /* K6s reports MCEs but don't actually have all the MSRs */
+ if (c->x86 < 6)
+ clear_cpu_cap(c, X86_FEATURE_MCE);
+
+ switch (c->x86) {
+ case 4: init_amd_k5(c); break;
+ case 5: init_amd_k6(c); break;
+ case 6: init_amd_k7(c); break;
+ case 0xf: init_amd_k8(c); break;
+ case 0x10: init_amd_gh(c); break;
+ case 0x12: init_amd_ln(c); break;
+ case 0x15: init_amd_bd(c); break;
+ case 0x16: init_amd_jg(c); break;
+ }
+
+ /*
+ * Save up on some future enablement work and do common Zen
+ * settings.
+ */
+ if (c->x86 >= 0x17)
+ init_amd_zen_common();
+
+ if (boot_cpu_has(X86_FEATURE_ZEN1))
+ init_amd_zen1(c);
+ else if (boot_cpu_has(X86_FEATURE_ZEN2))
+ init_amd_zen2(c);
+ else if (boot_cpu_has(X86_FEATURE_ZEN3))
+ init_amd_zen3(c);
+ else if (boot_cpu_has(X86_FEATURE_ZEN4))
+ init_amd_zen4(c);
+ else if (boot_cpu_has(X86_FEATURE_ZEN5))
+ init_amd_zen5(c);
+
+ /*
+ * Enable workaround for FXSAVE leak on CPUs
+ * without a XSaveErPtr feature
+ */
+ if ((c->x86 >= 6) && (!cpu_has(c, X86_FEATURE_XSAVEERPTR)))
+ set_cpu_bug(c, X86_BUG_FXSAVE_LEAK);
+
+ cpu_detect_cache_sizes(c);
+
+ srat_detect_node(c);
+
+ init_amd_cacheinfo(c);
+
+ if (cpu_has(c, X86_FEATURE_SVM)) {
+ rdmsrq(MSR_VM_CR, vm_cr);
+ if (vm_cr & SVM_VM_CR_SVM_DIS_MASK) {
+ pr_notice_once("SVM disabled (by BIOS) in MSR_VM_CR\n");
+ clear_cpu_cap(c, X86_FEATURE_SVM);
}
}
-#endif
+
+ if (!cpu_has(c, X86_FEATURE_LFENCE_RDTSC) && cpu_has(c, X86_FEATURE_XMM2)) {
+ /*
+ * Use LFENCE for execution serialization. On families which
+ * don't have that MSR, LFENCE is already serializing.
+ * msr_set_bit() uses the safe accessors, too, even if the MSR
+ * is not present.
+ */
+ msr_set_bit(MSR_AMD64_DE_CFG,
+ MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT);
+
+ /* A serializing LFENCE stops RDTSC speculation */
+ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
+ }
/*
* Family 0x12 and above processors have APIC timer
@@ -695,77 +1151,58 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
if (c->x86 > 0x11)
set_cpu_cap(c, X86_FEATURE_ARAT);
- if (c->x86 == 0x10) {
- /*
- * Disable GART TLB Walk Errors on Fam10h. We do this here
- * because this is always needed when GART is enabled, even in a
- * kernel which has no MCE support built in.
- * BIOS should disable GartTlbWlk Errors themself. If
- * it doesn't do it here as suggested by the BKDG.
- *
- * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012
- */
- u64 mask;
- int err;
+ /* 3DNow or LM implies PREFETCHW */
+ if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH))
+ if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM))
+ set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH);
- err = rdmsrl_safe(MSR_AMD64_MCx_MASK(4), &mask);
- if (err == 0) {
- mask |= (1 << 10);
- wrmsrl_safe(MSR_AMD64_MCx_MASK(4), mask);
- }
+ /* AMD CPUs don't reset SS attributes on SYSRET, Xen does. */
+ if (!cpu_feature_enabled(X86_FEATURE_XENPV))
+ set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
- /*
- * On family 10h BIOS may not have properly enabled WC+ support,
- * causing it to be converted to CD memtype. This may result in
- * performance degradation for certain nested-paging guests.
- * Prevent this conversion by clearing bit 24 in
- * MSR_AMD64_BU_CFG2.
- *
- * NOTE: we want to use the _safe accessors so as not to #GP kvm
- * guests on older kvm hosts.
- */
+ /* Enable the Instructions Retired free counter */
+ if (cpu_has(c, X86_FEATURE_IRPERF))
+ msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT);
- rdmsrl_safe(MSR_AMD64_BU_CFG2, &value);
- value &= ~(1ULL << 24);
- wrmsrl_safe(MSR_AMD64_BU_CFG2, value);
+ check_null_seg_clears_base(c);
- if (cpu_has_amd_erratum(amd_erratum_383))
- set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH);
- }
+ /*
+ * Make sure EFER[AIBRSE - Automatic IBRS Enable] is set. The APs are brought up
+ * using the trampoline code and as part of it, MSR_EFER gets prepared there in
+ * order to be replicated onto them. Regardless, set it here again, if not set,
+ * to protect against any future refactoring/code reorganization which might
+ * miss setting this important bit.
+ */
+ if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) &&
+ cpu_has(c, X86_FEATURE_AUTOIBRS))
+ WARN_ON_ONCE(msr_set_bit(MSR_EFER, _EFER_AUTOIBRS) < 0);
- if (cpu_has_amd_erratum(amd_erratum_400))
- set_cpu_bug(c, X86_BUG_AMD_APIC_C1E);
+ /* AMD CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */
+ clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
- rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
+ /* Enable Translation Cache Extension */
+ if (cpu_has(c, X86_FEATURE_TCE))
+ msr_set_bit(MSR_EFER, _EFER_TCE);
}
#ifdef CONFIG_X86_32
-static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c,
- unsigned int size)
+static unsigned int amd_size_cache(struct cpuinfo_x86 *c, unsigned int size)
{
/* AMD errata T13 (order #21922) */
- if ((c->x86 == 6)) {
+ if (c->x86 == 6) {
/* Duron Rev A0 */
- if (c->x86_model == 3 && c->x86_mask == 0)
+ if (c->x86_model == 3 && c->x86_stepping == 0)
size = 64;
/* Tbird rev A1/A2 */
if (c->x86_model == 4 &&
- (c->x86_mask == 0 || c->x86_mask == 1))
+ (c->x86_stepping == 0 || c->x86_stepping == 1))
size = 256;
}
return size;
}
#endif
-static void __cpuinit cpu_set_tlb_flushall_shift(struct cpuinfo_x86 *c)
-{
- tlb_flushall_shift = 5;
-
- if (c->x86 <= 0x11)
- tlb_flushall_shift = 4;
-}
-
-static void __cpuinit cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
+static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
{
u32 ebx, eax, ecx, edx;
u16 mask = 0xfff;
@@ -778,8 +1215,8 @@ static void __cpuinit cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
- tlb_lld_4k[ENTRIES] = (ebx >> 16) & mask;
- tlb_lli_4k[ENTRIES] = ebx & mask;
+ tlb_lld_4k = (ebx >> 16) & mask;
+ tlb_lli_4k = ebx & mask;
/*
* K8 doesn't have 2M/4M entries in the L2 TLB so read out the L1 TLB
@@ -791,41 +1228,39 @@ static void __cpuinit cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
}
/* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
- if (!((eax >> 16) & mask)) {
- u32 a, b, c, d;
-
- cpuid(0x80000005, &a, &b, &c, &d);
- tlb_lld_2m[ENTRIES] = (a >> 16) & 0xff;
- } else {
- tlb_lld_2m[ENTRIES] = (eax >> 16) & mask;
- }
+ if (!((eax >> 16) & mask))
+ tlb_lld_2m = (cpuid_eax(0x80000005) >> 16) & 0xff;
+ else
+ tlb_lld_2m = (eax >> 16) & mask;
/* a 4M entry uses two 2M entries */
- tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1;
+ tlb_lld_4m = tlb_lld_2m >> 1;
/* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
if (!(eax & mask)) {
/* Erratum 658 */
if (c->x86 == 0x15 && c->x86_model <= 0x1f) {
- tlb_lli_2m[ENTRIES] = 1024;
+ tlb_lli_2m = 1024;
} else {
cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
- tlb_lli_2m[ENTRIES] = eax & 0xff;
+ tlb_lli_2m = eax & 0xff;
}
} else
- tlb_lli_2m[ENTRIES] = eax & mask;
+ tlb_lli_2m = eax & mask;
- tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1;
+ tlb_lli_4m = tlb_lli_2m >> 1;
- cpu_set_tlb_flushall_shift(c);
+ /* Max number of pages INVLPGB can invalidate in one shot */
+ if (cpu_has(c, X86_FEATURE_INVLPGB))
+ invlpgb_count_max = (cpuid_edx(0x80000008) & 0xffff) + 1;
}
-static const struct cpu_dev __cpuinitconst amd_cpu_dev = {
+static const struct cpu_dev amd_cpu_dev = {
.c_vendor = "AMD",
.c_ident = { "AuthenticAMD" },
#ifdef CONFIG_X86_32
- .c_models = {
- { .vendor = X86_VENDOR_AMD, .family = 4, .model_names =
+ .legacy_models = {
+ { .family = 4, .model_names =
{
[3] = "486 DX/2",
[7] = "486 DX/2-WB",
@@ -836,7 +1271,7 @@ static const struct cpu_dev __cpuinitconst amd_cpu_dev = {
}
},
},
- .c_size_cache = amd_size_cache,
+ .legacy_cache_size = amd_size_cache,
#endif
.c_early_init = early_init_amd,
.c_detect_tlb = cpu_detect_tlb_amd,
@@ -847,76 +1282,125 @@ static const struct cpu_dev __cpuinitconst amd_cpu_dev = {
cpu_dev_register(amd_cpu_dev);
-/*
- * AMD errata checking
- *
- * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or
- * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that
- * have an OSVW id assigned, which it takes as first argument. Both take a
- * variable number of family-specific model-stepping ranges created by
- * AMD_MODEL_RANGE().
- *
- * Example:
- *
- * const int amd_erratum_319[] =
- * AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2),
- * AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0),
- * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0));
- */
+static DEFINE_PER_CPU_READ_MOSTLY(unsigned long[4], amd_dr_addr_mask);
+
+static unsigned int amd_msr_dr_addr_masks[] = {
+ MSR_F16H_DR0_ADDR_MASK,
+ MSR_F16H_DR1_ADDR_MASK,
+ MSR_F16H_DR1_ADDR_MASK + 1,
+ MSR_F16H_DR1_ADDR_MASK + 2
+};
-#define AMD_LEGACY_ERRATUM(...) { -1, __VA_ARGS__, 0 }
-#define AMD_OSVW_ERRATUM(osvw_id, ...) { osvw_id, __VA_ARGS__, 0 }
-#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \
- ((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end))
-#define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff)
-#define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff)
-#define AMD_MODEL_RANGE_END(range) ((range) & 0xfff)
+void amd_set_dr_addr_mask(unsigned long mask, unsigned int dr)
+{
+ int cpu = smp_processor_id();
-static const int amd_erratum_400[] =
- AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf),
- AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf));
+ if (!cpu_feature_enabled(X86_FEATURE_BPEXT))
+ return;
-static const int amd_erratum_383[] =
- AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf));
+ if (WARN_ON_ONCE(dr >= ARRAY_SIZE(amd_msr_dr_addr_masks)))
+ return;
-static bool cpu_has_amd_erratum(const int *erratum)
+ if (per_cpu(amd_dr_addr_mask, cpu)[dr] == mask)
+ return;
+
+ wrmsrq(amd_msr_dr_addr_masks[dr], mask);
+ per_cpu(amd_dr_addr_mask, cpu)[dr] = mask;
+}
+
+unsigned long amd_get_dr_addr_mask(unsigned int dr)
{
- struct cpuinfo_x86 *cpu = __this_cpu_ptr(&cpu_info);
- int osvw_id = *erratum++;
- u32 range;
- u32 ms;
+ if (!cpu_feature_enabled(X86_FEATURE_BPEXT))
+ return 0;
- /*
- * If called early enough that current_cpu_data hasn't been initialized
- * yet, fall back to boot_cpu_data.
- */
- if (cpu->x86 == 0)
- cpu = &boot_cpu_data;
+ if (WARN_ON_ONCE(dr >= ARRAY_SIZE(amd_msr_dr_addr_masks)))
+ return 0;
- if (cpu->x86_vendor != X86_VENDOR_AMD)
- return false;
+ return per_cpu(amd_dr_addr_mask[dr], smp_processor_id());
+}
+EXPORT_SYMBOL_FOR_KVM(amd_get_dr_addr_mask);
+
+static void zenbleed_check_cpu(void *unused)
+{
+ struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
- if (osvw_id >= 0 && osvw_id < 65536 &&
- cpu_has(cpu, X86_FEATURE_OSVW)) {
- u64 osvw_len;
+ zen2_zenbleed_check(c);
+}
- rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len);
- if (osvw_id < osvw_len) {
- u64 osvw_bits;
+void amd_check_microcode(void)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+ return;
- rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6),
- osvw_bits);
- return osvw_bits & (1ULL << (osvw_id & 0x3f));
- }
+ if (cpu_feature_enabled(X86_FEATURE_ZEN2))
+ on_each_cpu(zenbleed_check_cpu, NULL, 1);
+}
+
+static const char * const s5_reset_reason_txt[] = {
+ [0] = "thermal pin BP_THERMTRIP_L was tripped",
+ [1] = "power button was pressed for 4 seconds",
+ [2] = "shutdown pin was tripped",
+ [4] = "remote ASF power off command was received",
+ [9] = "internal CPU thermal limit was tripped",
+ [16] = "system reset pin BP_SYS_RST_L was tripped",
+ [17] = "software issued PCI reset",
+ [18] = "software wrote 0x4 to reset control register 0xCF9",
+ [19] = "software wrote 0x6 to reset control register 0xCF9",
+ [20] = "software wrote 0xE to reset control register 0xCF9",
+ [21] = "ACPI power state transition occurred",
+ [22] = "keyboard reset pin KB_RST_L was tripped",
+ [23] = "internal CPU shutdown event occurred",
+ [24] = "system failed to boot before failed boot timer expired",
+ [25] = "hardware watchdog timer expired",
+ [26] = "remote ASF reset command was received",
+ [27] = "an uncorrected error caused a data fabric sync flood event",
+ [29] = "FCH and MP1 failed warm reset handshake",
+ [30] = "a parity error occurred",
+ [31] = "a software sync flood event occurred",
+};
+
+static __init int print_s5_reset_status_mmio(void)
+{
+ void __iomem *addr;
+ u32 value;
+ int i;
+
+ if (!cpu_feature_enabled(X86_FEATURE_ZEN))
+ return 0;
+
+ addr = ioremap(FCH_PM_BASE + FCH_PM_S5_RESET_STATUS, sizeof(value));
+ if (!addr)
+ return 0;
+
+ value = ioread32(addr);
+
+ /* Value with "all bits set" is an error response and should be ignored. */
+ if (value == U32_MAX) {
+ iounmap(addr);
+ return 0;
}
- /* OSVW unavailable or ID unknown, match family-model-stepping range */
- ms = (cpu->x86_model << 4) | cpu->x86_mask;
- while ((range = *erratum++))
- if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) &&
- (ms >= AMD_MODEL_RANGE_START(range)) &&
- (ms <= AMD_MODEL_RANGE_END(range)))
- return true;
+ /*
+ * Clear all reason bits so they won't be retained if the next reset
+ * does not update the register. Besides, some bits are never cleared by
+ * hardware so it's software's responsibility to clear them.
+ *
+ * Writing the value back effectively clears all reason bits as they are
+ * write-1-to-clear.
+ */
+ iowrite32(value, addr);
+ iounmap(addr);
+
+ for (i = 0; i < ARRAY_SIZE(s5_reset_reason_txt); i++) {
+ if (!(value & BIT(i)))
+ continue;
+
+ if (s5_reset_reason_txt[i]) {
+ pr_info("x86/amd: Previous system reset reason [0x%08x]: %s\n",
+ value, s5_reset_reason_txt[i]);
+ }
+ }
- return false;
+ return 0;
}
+late_initcall(print_s5_reset_status_mmio);
diff --git a/arch/x86/kernel/cpu/amd_cache_disable.c b/arch/x86/kernel/cpu/amd_cache_disable.c
new file mode 100644
index 000000000000..8843b9557aea
--- /dev/null
+++ b/arch/x86/kernel/cpu/amd_cache_disable.c
@@ -0,0 +1,301 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * AMD L3 cache_disable_{0,1} sysfs handling
+ * Documentation/ABI/testing/sysfs-devices-system-cpu
+ */
+
+#include <linux/cacheinfo.h>
+#include <linux/capability.h>
+#include <linux/pci.h>
+#include <linux/sysfs.h>
+
+#include <asm/amd/nb.h>
+
+#include "cpu.h"
+
+/*
+ * L3 cache descriptors
+ */
+static void amd_calc_l3_indices(struct amd_northbridge *nb)
+{
+ struct amd_l3_cache *l3 = &nb->l3_cache;
+ unsigned int sc0, sc1, sc2, sc3;
+ u32 val = 0;
+
+ pci_read_config_dword(nb->misc, 0x1C4, &val);
+
+ /* calculate subcache sizes */
+ l3->subcaches[0] = sc0 = !(val & BIT(0));
+ l3->subcaches[1] = sc1 = !(val & BIT(4));
+
+ if (boot_cpu_data.x86 == 0x15) {
+ l3->subcaches[0] = sc0 += !(val & BIT(1));
+ l3->subcaches[1] = sc1 += !(val & BIT(5));
+ }
+
+ l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9));
+ l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13));
+
+ l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;
+}
+
+/*
+ * check whether a slot used for disabling an L3 index is occupied.
+ * @l3: L3 cache descriptor
+ * @slot: slot number (0..1)
+ *
+ * @returns: the disabled index if used or negative value if slot free.
+ */
+static int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned int slot)
+{
+ unsigned int reg = 0;
+
+ pci_read_config_dword(nb->misc, 0x1BC + slot * 4, &reg);
+
+ /* check whether this slot is activated already */
+ if (reg & (3UL << 30))
+ return reg & 0xfff;
+
+ return -1;
+}
+
+static ssize_t show_cache_disable(struct cacheinfo *ci, char *buf, unsigned int slot)
+{
+ int index;
+ struct amd_northbridge *nb = ci->priv;
+
+ index = amd_get_l3_disable_slot(nb, slot);
+ if (index >= 0)
+ return sysfs_emit(buf, "%d\n", index);
+
+ return sysfs_emit(buf, "FREE\n");
+}
+
+#define SHOW_CACHE_DISABLE(slot) \
+static ssize_t \
+cache_disable_##slot##_show(struct device *dev, \
+ struct device_attribute *attr, char *buf) \
+{ \
+ struct cacheinfo *ci = dev_get_drvdata(dev); \
+ return show_cache_disable(ci, buf, slot); \
+}
+
+SHOW_CACHE_DISABLE(0)
+SHOW_CACHE_DISABLE(1)
+
+static void amd_l3_disable_index(struct amd_northbridge *nb, int cpu,
+ unsigned int slot, unsigned long idx)
+{
+ int i;
+
+ idx |= BIT(30);
+
+ /*
+ * disable index in all 4 subcaches
+ */
+ for (i = 0; i < 4; i++) {
+ u32 reg = idx | (i << 20);
+
+ if (!nb->l3_cache.subcaches[i])
+ continue;
+
+ pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg);
+
+ /*
+ * We need to WBINVD on a core on the node containing the L3
+ * cache which indices we disable therefore a simple wbinvd()
+ * is not sufficient.
+ */
+ wbinvd_on_cpu(cpu);
+
+ reg |= BIT(31);
+ pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg);
+ }
+}
+
+/*
+ * disable a L3 cache index by using a disable-slot
+ *
+ * @l3: L3 cache descriptor
+ * @cpu: A CPU on the node containing the L3 cache
+ * @slot: slot number (0..1)
+ * @index: index to disable
+ *
+ * @return: 0 on success, error status on failure
+ */
+static int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu,
+ unsigned int slot, unsigned long index)
+{
+ int ret = 0;
+
+ /* check if @slot is already used or the index is already disabled */
+ ret = amd_get_l3_disable_slot(nb, slot);
+ if (ret >= 0)
+ return -EEXIST;
+
+ if (index > nb->l3_cache.indices)
+ return -EINVAL;
+
+ /* check whether the other slot has disabled the same index already */
+ if (index == amd_get_l3_disable_slot(nb, !slot))
+ return -EEXIST;
+
+ amd_l3_disable_index(nb, cpu, slot, index);
+
+ return 0;
+}
+
+static ssize_t store_cache_disable(struct cacheinfo *ci, const char *buf,
+ size_t count, unsigned int slot)
+{
+ struct amd_northbridge *nb = ci->priv;
+ unsigned long val = 0;
+ int cpu, err = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ cpu = cpumask_first(&ci->shared_cpu_map);
+
+ if (kstrtoul(buf, 10, &val) < 0)
+ return -EINVAL;
+
+ err = amd_set_l3_disable_slot(nb, cpu, slot, val);
+ if (err) {
+ if (err == -EEXIST)
+ pr_warn("L3 slot %d in use/index already disabled!\n",
+ slot);
+ return err;
+ }
+ return count;
+}
+
+#define STORE_CACHE_DISABLE(slot) \
+static ssize_t \
+cache_disable_##slot##_store(struct device *dev, \
+ struct device_attribute *attr, \
+ const char *buf, size_t count) \
+{ \
+ struct cacheinfo *ci = dev_get_drvdata(dev); \
+ return store_cache_disable(ci, buf, count, slot); \
+}
+
+STORE_CACHE_DISABLE(0)
+STORE_CACHE_DISABLE(1)
+
+static ssize_t subcaches_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct cacheinfo *ci = dev_get_drvdata(dev);
+ int cpu = cpumask_first(&ci->shared_cpu_map);
+
+ return sysfs_emit(buf, "%x\n", amd_get_subcaches(cpu));
+}
+
+static ssize_t subcaches_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct cacheinfo *ci = dev_get_drvdata(dev);
+ int cpu = cpumask_first(&ci->shared_cpu_map);
+ unsigned long val;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (kstrtoul(buf, 16, &val) < 0)
+ return -EINVAL;
+
+ if (amd_set_subcaches(cpu, val))
+ return -EINVAL;
+
+ return count;
+}
+
+static DEVICE_ATTR_RW(cache_disable_0);
+static DEVICE_ATTR_RW(cache_disable_1);
+static DEVICE_ATTR_RW(subcaches);
+
+static umode_t cache_private_attrs_is_visible(struct kobject *kobj,
+ struct attribute *attr, int unused)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct cacheinfo *ci = dev_get_drvdata(dev);
+ umode_t mode = attr->mode;
+
+ if (!ci->priv)
+ return 0;
+
+ if ((attr == &dev_attr_subcaches.attr) &&
+ amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
+ return mode;
+
+ if ((attr == &dev_attr_cache_disable_0.attr ||
+ attr == &dev_attr_cache_disable_1.attr) &&
+ amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
+ return mode;
+
+ return 0;
+}
+
+static struct attribute_group cache_private_group = {
+ .is_visible = cache_private_attrs_is_visible,
+};
+
+static void init_amd_l3_attrs(void)
+{
+ static struct attribute **amd_l3_attrs;
+ int n = 1;
+
+ if (amd_l3_attrs) /* already initialized */
+ return;
+
+ if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
+ n += 2;
+ if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
+ n += 1;
+
+ amd_l3_attrs = kcalloc(n, sizeof(*amd_l3_attrs), GFP_KERNEL);
+ if (!amd_l3_attrs)
+ return;
+
+ n = 0;
+ if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) {
+ amd_l3_attrs[n++] = &dev_attr_cache_disable_0.attr;
+ amd_l3_attrs[n++] = &dev_attr_cache_disable_1.attr;
+ }
+ if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
+ amd_l3_attrs[n++] = &dev_attr_subcaches.attr;
+
+ cache_private_group.attrs = amd_l3_attrs;
+}
+
+const struct attribute_group *cache_get_priv_group(struct cacheinfo *ci)
+{
+ struct amd_northbridge *nb = ci->priv;
+
+ if (ci->level < 3 || !nb)
+ return NULL;
+
+ if (nb && nb->l3_cache.indices)
+ init_amd_l3_attrs();
+
+ return &cache_private_group;
+}
+
+struct amd_northbridge *amd_init_l3_cache(int index)
+{
+ struct amd_northbridge *nb;
+ int node;
+
+ /* only for L3, and not in virtualized environments */
+ if (index < 3)
+ return NULL;
+
+ node = topology_amd_node_id(smp_processor_id());
+ nb = node_to_amd_nb(node);
+ if (nb && !nb->l3_cache.indices)
+ amd_calc_l3_indices(nb);
+
+ return nb;
+}
diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
new file mode 100644
index 000000000000..7ffc78d5ebf2
--- /dev/null
+++ b/arch/x86/kernel/cpu/aperfmperf.c
@@ -0,0 +1,552 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * x86 APERF/MPERF KHz calculation for
+ * /sys/.../cpufreq/scaling_cur_freq
+ *
+ * Copyright (C) 2017 Intel Corp.
+ * Author: Len Brown <len.brown@intel.com>
+ */
+#include <linux/cpufreq.h>
+#include <linux/delay.h>
+#include <linux/ktime.h>
+#include <linux/math64.h>
+#include <linux/percpu.h>
+#include <linux/rcupdate.h>
+#include <linux/sched/isolation.h>
+#include <linux/sched/topology.h>
+#include <linux/smp.h>
+#include <linux/syscore_ops.h>
+
+#include <asm/cpu.h>
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
+#include <asm/msr.h>
+
+#include "cpu.h"
+
+struct aperfmperf {
+ seqcount_t seq;
+ unsigned long last_update;
+ u64 acnt;
+ u64 mcnt;
+ u64 aperf;
+ u64 mperf;
+};
+
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct aperfmperf, cpu_samples) = {
+ .seq = SEQCNT_ZERO(cpu_samples.seq)
+};
+
+static void init_counter_refs(void *data)
+{
+ u64 aperf, mperf;
+
+ rdmsrq(MSR_IA32_APERF, aperf);
+ rdmsrq(MSR_IA32_MPERF, mperf);
+
+ this_cpu_write(cpu_samples.aperf, aperf);
+ this_cpu_write(cpu_samples.mperf, mperf);
+}
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
+/*
+ * APERF/MPERF frequency ratio computation.
+ *
+ * The scheduler wants to do frequency invariant accounting and needs a <1
+ * ratio to account for the 'current' frequency, corresponding to
+ * freq_curr / freq_max.
+ *
+ * Since the frequency freq_curr on x86 is controlled by micro-controller and
+ * our P-state setting is little more than a request/hint, we need to observe
+ * the effective frequency 'BusyMHz', i.e. the average frequency over a time
+ * interval after discarding idle time. This is given by:
+ *
+ * BusyMHz = delta_APERF / delta_MPERF * freq_base
+ *
+ * where freq_base is the max non-turbo P-state.
+ *
+ * The freq_max term has to be set to a somewhat arbitrary value, because we
+ * can't know which turbo states will be available at a given point in time:
+ * it all depends on the thermal headroom of the entire package. We set it to
+ * the turbo level with 4 cores active.
+ *
+ * Benchmarks show that's a good compromise between the 1C turbo ratio
+ * (freq_curr/freq_max would rarely reach 1) and something close to freq_base,
+ * which would ignore the entire turbo range (a conspicuous part, making
+ * freq_curr/freq_max always maxed out).
+ *
+ * An exception to the heuristic above is the Atom uarch, where we choose the
+ * highest turbo level for freq_max since Atom's are generally oriented towards
+ * power efficiency.
+ *
+ * Setting freq_max to anything less than the 1C turbo ratio makes the ratio
+ * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1.
+ */
+
+DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key);
+
+static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE;
+static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE;
+
+void arch_set_max_freq_ratio(bool turbo_disabled)
+{
+ arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE :
+ arch_turbo_freq_ratio;
+}
+EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio);
+
+static bool __init turbo_disabled(void)
+{
+ u64 misc_en;
+ int err;
+
+ err = rdmsrq_safe(MSR_IA32_MISC_ENABLE, &misc_en);
+ if (err)
+ return false;
+
+ return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
+}
+
+static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
+{
+ int err;
+
+ err = rdmsrq_safe(MSR_ATOM_CORE_RATIOS, base_freq);
+ if (err)
+ return false;
+
+ err = rdmsrq_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq);
+ if (err)
+ return false;
+
+ *base_freq = (*base_freq >> 16) & 0x3F; /* max P state */
+ *turbo_freq = *turbo_freq & 0x3F; /* 1C turbo */
+
+ return true;
+}
+
+#define X86_MATCH(vfm) \
+ X86_MATCH_VFM_FEATURE(vfm, X86_FEATURE_APERFMPERF, NULL)
+
+static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = {
+ X86_MATCH(INTEL_XEON_PHI_KNL),
+ X86_MATCH(INTEL_XEON_PHI_KNM),
+ {}
+};
+
+static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = {
+ X86_MATCH(INTEL_SKYLAKE_X),
+ {}
+};
+
+static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = {
+ X86_MATCH(INTEL_ATOM_GOLDMONT),
+ X86_MATCH(INTEL_ATOM_GOLDMONT_D),
+ X86_MATCH(INTEL_ATOM_GOLDMONT_PLUS),
+ {}
+};
+
+static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
+ int num_delta_fratio)
+{
+ int fratio, delta_fratio, found;
+ int err, i;
+ u64 msr;
+
+ err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq);
+ if (err)
+ return false;
+
+ *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
+
+ err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &msr);
+ if (err)
+ return false;
+
+ fratio = (msr >> 8) & 0xFF;
+ i = 16;
+ found = 0;
+ do {
+ if (found >= num_delta_fratio) {
+ *turbo_freq = fratio;
+ return true;
+ }
+
+ delta_fratio = (msr >> (i + 5)) & 0x7;
+
+ if (delta_fratio) {
+ found += 1;
+ fratio -= delta_fratio;
+ }
+
+ i += 8;
+ } while (i < 64);
+
+ return true;
+}
+
+static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size)
+{
+ u64 ratios, counts;
+ u32 group_size;
+ int err, i;
+
+ err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq);
+ if (err)
+ return false;
+
+ *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
+
+ err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &ratios);
+ if (err)
+ return false;
+
+ err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT1, &counts);
+ if (err)
+ return false;
+
+ for (i = 0; i < 64; i += 8) {
+ group_size = (counts >> i) & 0xFF;
+ if (group_size >= size) {
+ *turbo_freq = (ratios >> i) & 0xFF;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
+{
+ u64 msr;
+ int err;
+
+ err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq);
+ if (err)
+ return false;
+
+ err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &msr);
+ if (err)
+ return false;
+
+ *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
+ *turbo_freq = (msr >> 24) & 0xFF; /* 4C turbo */
+
+ /* The CPU may have less than 4 cores */
+ if (!*turbo_freq)
+ *turbo_freq = msr & 0xFF; /* 1C turbo */
+
+ return true;
+}
+
+static bool __init intel_set_max_freq_ratio(void)
+{
+ u64 base_freq, turbo_freq;
+ u64 turbo_ratio;
+
+ if (slv_set_max_freq_ratio(&base_freq, &turbo_freq))
+ goto out;
+
+ if (x86_match_cpu(has_glm_turbo_ratio_limits) &&
+ skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
+ goto out;
+
+ if (x86_match_cpu(has_knl_turbo_ratio_limits) &&
+ knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
+ goto out;
+
+ if (x86_match_cpu(has_skx_turbo_ratio_limits) &&
+ skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4))
+ goto out;
+
+ if (core_set_max_freq_ratio(&base_freq, &turbo_freq))
+ goto out;
+
+ return false;
+
+out:
+ /*
+ * Some hypervisors advertise X86_FEATURE_APERFMPERF
+ * but then fill all MSR's with zeroes.
+ * Some CPUs have turbo boost but don't declare any turbo ratio
+ * in MSR_TURBO_RATIO_LIMIT.
+ */
+ if (!base_freq || !turbo_freq) {
+ pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n");
+ return false;
+ }
+
+ turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq);
+ if (!turbo_ratio) {
+ pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n");
+ return false;
+ }
+
+ arch_turbo_freq_ratio = turbo_ratio;
+ arch_set_max_freq_ratio(turbo_disabled());
+
+ return true;
+}
+
+#ifdef CONFIG_PM_SLEEP
+static const struct syscore_ops freq_invariance_syscore_ops = {
+ .resume = init_counter_refs,
+};
+
+static struct syscore freq_invariance_syscore = {
+ .ops = &freq_invariance_syscore_ops,
+};
+
+static void register_freq_invariance_syscore(void)
+{
+ register_syscore(&freq_invariance_syscore);
+}
+#else
+static inline void register_freq_invariance_syscore(void) {}
+#endif
+
+static void freq_invariance_enable(void)
+{
+ if (static_branch_unlikely(&arch_scale_freq_key)) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+ static_branch_enable_cpuslocked(&arch_scale_freq_key);
+ register_freq_invariance_syscore();
+ pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
+}
+
+void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled)
+{
+ arch_turbo_freq_ratio = ratio;
+ arch_set_max_freq_ratio(turbo_disabled);
+ freq_invariance_enable();
+}
+
+static void __init bp_init_freq_invariance(void)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return;
+
+ if (intel_set_max_freq_ratio()) {
+ guard(cpus_read_lock)();
+ freq_invariance_enable();
+ }
+}
+
+static void disable_freq_invariance_workfn(struct work_struct *work)
+{
+ int cpu;
+
+ static_branch_disable(&arch_scale_freq_key);
+
+ /*
+ * Set arch_freq_scale to a default value on all cpus
+ * This negates the effect of scaling
+ */
+ for_each_possible_cpu(cpu)
+ per_cpu(arch_freq_scale, cpu) = SCHED_CAPACITY_SCALE;
+}
+
+static DECLARE_WORK(disable_freq_invariance_work,
+ disable_freq_invariance_workfn);
+
+DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
+EXPORT_PER_CPU_SYMBOL_GPL(arch_freq_scale);
+
+static DEFINE_STATIC_KEY_FALSE(arch_hybrid_cap_scale_key);
+
+struct arch_hybrid_cpu_scale {
+ unsigned long capacity;
+ unsigned long freq_ratio;
+};
+
+static struct arch_hybrid_cpu_scale __percpu *arch_cpu_scale;
+
+/**
+ * arch_enable_hybrid_capacity_scale() - Enable hybrid CPU capacity scaling
+ *
+ * Allocate memory for per-CPU data used by hybrid CPU capacity scaling,
+ * initialize it and set the static key controlling its code paths.
+ *
+ * Must be called before arch_set_cpu_capacity().
+ */
+bool arch_enable_hybrid_capacity_scale(void)
+{
+ int cpu;
+
+ if (static_branch_unlikely(&arch_hybrid_cap_scale_key)) {
+ WARN_ONCE(1, "Hybrid CPU capacity scaling already enabled");
+ return true;
+ }
+
+ arch_cpu_scale = alloc_percpu(struct arch_hybrid_cpu_scale);
+ if (!arch_cpu_scale)
+ return false;
+
+ for_each_possible_cpu(cpu) {
+ per_cpu_ptr(arch_cpu_scale, cpu)->capacity = SCHED_CAPACITY_SCALE;
+ per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio = arch_max_freq_ratio;
+ }
+
+ static_branch_enable(&arch_hybrid_cap_scale_key);
+
+ pr_info("Hybrid CPU capacity scaling enabled\n");
+
+ return true;
+}
+
+/**
+ * arch_set_cpu_capacity() - Set scale-invariance parameters for a CPU
+ * @cpu: Target CPU.
+ * @cap: Capacity of @cpu at its maximum frequency, relative to @max_cap.
+ * @max_cap: System-wide maximum CPU capacity.
+ * @cap_freq: Frequency of @cpu corresponding to @cap.
+ * @base_freq: Frequency of @cpu at which MPERF counts.
+ *
+ * The units in which @cap and @max_cap are expressed do not matter, so long
+ * as they are consistent, because the former is effectively divided by the
+ * latter. Analogously for @cap_freq and @base_freq.
+ *
+ * After calling this function for all CPUs, call arch_rebuild_sched_domains()
+ * to let the scheduler know that capacity-aware scheduling can be used going
+ * forward.
+ */
+void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap,
+ unsigned long cap_freq, unsigned long base_freq)
+{
+ if (static_branch_likely(&arch_hybrid_cap_scale_key)) {
+ WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity,
+ div_u64(cap << SCHED_CAPACITY_SHIFT, max_cap));
+ WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio,
+ div_u64(cap_freq << SCHED_CAPACITY_SHIFT, base_freq));
+ } else {
+ WARN_ONCE(1, "Hybrid CPU capacity scaling not enabled");
+ }
+}
+
+unsigned long arch_scale_cpu_capacity(int cpu)
+{
+ if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
+ return READ_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity);
+
+ return SCHED_CAPACITY_SCALE;
+}
+EXPORT_SYMBOL_GPL(arch_scale_cpu_capacity);
+
+static void scale_freq_tick(u64 acnt, u64 mcnt)
+{
+ u64 freq_scale, freq_ratio;
+
+ if (!arch_scale_freq_invariant())
+ return;
+
+ if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
+ goto error;
+
+ if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
+ freq_ratio = READ_ONCE(this_cpu_ptr(arch_cpu_scale)->freq_ratio);
+ else
+ freq_ratio = arch_max_freq_ratio;
+
+ if (check_mul_overflow(mcnt, freq_ratio, &mcnt) || !mcnt)
+ goto error;
+
+ freq_scale = div64_u64(acnt, mcnt);
+ if (!freq_scale)
+ goto error;
+
+ if (freq_scale > SCHED_CAPACITY_SCALE)
+ freq_scale = SCHED_CAPACITY_SCALE;
+
+ this_cpu_write(arch_freq_scale, freq_scale);
+ return;
+
+error:
+ pr_warn("Scheduler frequency invariance went wobbly, disabling!\n");
+ schedule_work(&disable_freq_invariance_work);
+}
+#else
+static inline void bp_init_freq_invariance(void) { }
+static inline void scale_freq_tick(u64 acnt, u64 mcnt) { }
+#endif /* CONFIG_X86_64 && CONFIG_SMP */
+
+void arch_scale_freq_tick(void)
+{
+ struct aperfmperf *s = this_cpu_ptr(&cpu_samples);
+ u64 acnt, mcnt, aperf, mperf;
+
+ if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
+ return;
+
+ rdmsrq(MSR_IA32_APERF, aperf);
+ rdmsrq(MSR_IA32_MPERF, mperf);
+ acnt = aperf - s->aperf;
+ mcnt = mperf - s->mperf;
+
+ s->aperf = aperf;
+ s->mperf = mperf;
+
+ raw_write_seqcount_begin(&s->seq);
+ s->last_update = jiffies;
+ s->acnt = acnt;
+ s->mcnt = mcnt;
+ raw_write_seqcount_end(&s->seq);
+
+ scale_freq_tick(acnt, mcnt);
+}
+
+/*
+ * Discard samples older than the define maximum sample age of 20ms. There
+ * is no point in sending IPIs in such a case. If the scheduler tick was
+ * not running then the CPU is either idle or isolated.
+ */
+#define MAX_SAMPLE_AGE ((unsigned long)HZ / 50)
+
+int arch_freq_get_on_cpu(int cpu)
+{
+ struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu);
+ unsigned int seq, freq;
+ unsigned long last;
+ u64 acnt, mcnt;
+
+ if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
+ goto fallback;
+
+ do {
+ seq = raw_read_seqcount_begin(&s->seq);
+ last = s->last_update;
+ acnt = s->acnt;
+ mcnt = s->mcnt;
+ } while (read_seqcount_retry(&s->seq, seq));
+
+ /*
+ * Bail on invalid count and when the last update was too long ago,
+ * which covers idle and NOHZ full CPUs.
+ */
+ if (!mcnt || (jiffies - last) > MAX_SAMPLE_AGE)
+ goto fallback;
+
+ return div64_u64((cpu_khz * acnt), mcnt);
+
+fallback:
+ freq = cpufreq_quick_get(cpu);
+ return freq ? freq : cpu_khz;
+}
+
+static int __init bp_init_aperfmperf(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF))
+ return 0;
+
+ init_counter_refs(NULL);
+ bp_init_freq_invariance();
+ return 0;
+}
+early_initcall(bp_init_aperfmperf);
+
+void ap_init_aperfmperf(void)
+{
+ if (cpu_feature_enabled(X86_FEATURE_APERFMPERF))
+ init_counter_refs(NULL);
+}
diff --git a/arch/x86/kernel/cpu/bhyve.c b/arch/x86/kernel/cpu/bhyve.c
new file mode 100644
index 000000000000..f1a8ca3dd1ed
--- /dev/null
+++ b/arch/x86/kernel/cpu/bhyve.c
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * FreeBSD Bhyve guest enlightenments
+ *
+ * Copyright © 2025 Amazon.com, Inc. or its affiliates.
+ *
+ * Author: David Woodhouse <dwmw2@infradead.org>
+ */
+
+#include <linux/init.h>
+#include <linux/export.h>
+#include <asm/processor.h>
+#include <asm/hypervisor.h>
+
+static uint32_t bhyve_cpuid_base;
+static uint32_t bhyve_cpuid_max;
+
+#define BHYVE_SIGNATURE "bhyve bhyve "
+
+#define CPUID_BHYVE_FEATURES 0x40000001
+
+/* Features advertised in CPUID_BHYVE_FEATURES %eax */
+
+/* MSI Extended Dest ID */
+#define CPUID_BHYVE_FEAT_EXT_DEST_ID (1UL << 0)
+
+static uint32_t __init bhyve_detect(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR))
+ return 0;
+
+ bhyve_cpuid_base = cpuid_base_hypervisor(BHYVE_SIGNATURE, 0);
+ if (!bhyve_cpuid_base)
+ return 0;
+
+ bhyve_cpuid_max = cpuid_eax(bhyve_cpuid_base);
+ return bhyve_cpuid_max;
+}
+
+static uint32_t bhyve_features(void)
+{
+ unsigned int cpuid_leaf = bhyve_cpuid_base | CPUID_BHYVE_FEATURES;
+
+ if (bhyve_cpuid_max < cpuid_leaf)
+ return 0;
+
+ return cpuid_eax(cpuid_leaf);
+}
+
+static bool __init bhyve_ext_dest_id(void)
+{
+ return !!(bhyve_features() & CPUID_BHYVE_FEAT_EXT_DEST_ID);
+}
+
+static bool __init bhyve_x2apic_available(void)
+{
+ return true;
+}
+
+const struct hypervisor_x86 x86_hyper_bhyve __refconst = {
+ .name = "Bhyve",
+ .detect = bhyve_detect,
+ .init.init_platform = x86_init_noop,
+ .init.x2apic_available = bhyve_x2apic_available,
+ .init.msi_ext_dest_id = bhyve_ext_dest_id,
+};
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 03445346ee0a..d0a2847a4bb0 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 1994 Linus Torvalds
*
@@ -8,87 +9,3730 @@
* - Andrew D. Balsa (code cleanup).
*/
#include <linux/init.h>
-#include <linux/utsname.h>
+#include <linux/cpu.h>
+#include <linux/module.h>
+#include <linux/nospec.h>
+#include <linux/prctl.h>
+#include <linux/sched/smt.h>
+#include <linux/pgtable.h>
+#include <linux/bpf.h>
+#include <linux/kvm_types.h>
+
+#include <asm/spec-ctrl.h>
+#include <asm/cmdline.h>
#include <asm/bugs.h>
#include <asm/processor.h>
#include <asm/processor-flags.h>
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
#include <asm/msr.h>
+#include <asm/vmx.h>
#include <asm/paravirt.h>
-#include <asm/alternative.h>
+#include <asm/cpu_device_id.h>
+#include <asm/e820/api.h>
+#include <asm/hypervisor.h>
+#include <asm/tlbflush.h>
+#include <asm/cpu.h>
-static double __initdata x = 4195835.0;
-static double __initdata y = 3145727.0;
+#include "cpu.h"
/*
- * This used to check for exceptions..
- * However, it turns out that to support that,
- * the XMM trap handlers basically had to
- * be buggy. So let's have a correct XMM trap
- * handler, and forget about printing out
- * some status at boot.
+ * Speculation Vulnerability Handling
+ *
+ * Each vulnerability is handled with the following functions:
+ * <vuln>_select_mitigation() -- Selects a mitigation to use. This should
+ * take into account all relevant command line
+ * options.
+ * <vuln>_update_mitigation() -- This is called after all vulnerabilities have
+ * selected a mitigation, in case the selection
+ * may want to change based on other choices
+ * made. This function is optional.
+ * <vuln>_apply_mitigation() -- Enable the selected mitigation.
*
- * We should really only care about bugs here
- * anyway. Not features.
+ * The compile-time mitigation in all cases should be AUTO. An explicit
+ * command-line option can override AUTO. If no such option is
+ * provided, <vuln>_select_mitigation() will override AUTO to the best
+ * mitigation option.
+ */
+
+/* The base value of the SPEC_CTRL MSR without task-specific bits set */
+u64 x86_spec_ctrl_base;
+
+/* The current value of the SPEC_CTRL MSR with task-specific bits set */
+DEFINE_PER_CPU(u64, x86_spec_ctrl_current);
+EXPORT_PER_CPU_SYMBOL_GPL(x86_spec_ctrl_current);
+
+/*
+ * Set when the CPU has run a potentially malicious guest. An IBPB will
+ * be needed to before running userspace. That IBPB will flush the branch
+ * predictor content.
+ */
+DEFINE_PER_CPU(bool, x86_ibpb_exit_to_user);
+EXPORT_PER_CPU_SYMBOL_GPL(x86_ibpb_exit_to_user);
+
+u64 x86_pred_cmd __ro_after_init = PRED_CMD_IBPB;
+
+static u64 __ro_after_init x86_arch_cap_msr;
+
+static DEFINE_MUTEX(spec_ctrl_mutex);
+
+void (*x86_return_thunk)(void) __ro_after_init = __x86_return_thunk;
+
+static void __init set_return_thunk(void *thunk)
+{
+ x86_return_thunk = thunk;
+
+ pr_info("active return thunk: %ps\n", thunk);
+}
+
+/* Update SPEC_CTRL MSR and its cached copy unconditionally */
+static void update_spec_ctrl(u64 val)
+{
+ this_cpu_write(x86_spec_ctrl_current, val);
+ wrmsrq(MSR_IA32_SPEC_CTRL, val);
+}
+
+/*
+ * Keep track of the SPEC_CTRL MSR value for the current task, which may differ
+ * from x86_spec_ctrl_base due to STIBP/SSB in __speculation_ctrl_update().
*/
-static void __init check_fpu(void)
+void update_spec_ctrl_cond(u64 val)
{
- s32 fdiv_bug;
+ if (this_cpu_read(x86_spec_ctrl_current) == val)
+ return;
- kernel_fpu_begin();
+ this_cpu_write(x86_spec_ctrl_current, val);
/*
- * trap_init() enabled FXSR and company _before_ testing for FP
- * problems here.
- *
- * Test for the divl bug: http://en.wikipedia.org/wiki/Fdiv_bug
+ * When KERNEL_IBRS this MSR is written on return-to-user, unless
+ * forced the update can be delayed until that time.
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS))
+ wrmsrq(MSR_IA32_SPEC_CTRL, val);
+}
+
+noinstr u64 spec_ctrl_current(void)
+{
+ return this_cpu_read(x86_spec_ctrl_current);
+}
+EXPORT_SYMBOL_GPL(spec_ctrl_current);
+
+/*
+ * AMD specific MSR info for Speculative Store Bypass control.
+ * x86_amd_ls_cfg_ssbd_mask is initialized in identify_boot_cpu().
+ */
+u64 __ro_after_init x86_amd_ls_cfg_base;
+u64 __ro_after_init x86_amd_ls_cfg_ssbd_mask;
+
+/* Control conditional STIBP in switch_to() */
+DEFINE_STATIC_KEY_FALSE(switch_to_cond_stibp);
+/* Control conditional IBPB in switch_mm() */
+DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
+/* Control unconditional IBPB in switch_mm() */
+DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
+
+/* Control IBPB on vCPU load */
+DEFINE_STATIC_KEY_FALSE(switch_vcpu_ibpb);
+EXPORT_SYMBOL_FOR_KVM(switch_vcpu_ibpb);
+
+/* Control CPU buffer clear before idling (halt, mwait) */
+DEFINE_STATIC_KEY_FALSE(cpu_buf_idle_clear);
+EXPORT_SYMBOL_GPL(cpu_buf_idle_clear);
+
+/*
+ * Controls whether l1d flush based mitigations are enabled,
+ * based on hw features and admin setting via boot parameter
+ * defaults to false
+ */
+DEFINE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush);
+
+#undef pr_fmt
+#define pr_fmt(fmt) "mitigations: " fmt
+
+static void __init cpu_print_attack_vectors(void)
+{
+ pr_info("Enabled attack vectors: ");
+
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL))
+ pr_cont("user_kernel, ");
+
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER))
+ pr_cont("user_user, ");
+
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST))
+ pr_cont("guest_host, ");
+
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST))
+ pr_cont("guest_guest, ");
+
+ pr_cont("SMT mitigations: ");
+
+ switch (smt_mitigations) {
+ case SMT_MITIGATIONS_OFF:
+ pr_cont("off\n");
+ break;
+ case SMT_MITIGATIONS_AUTO:
+ pr_cont("auto\n");
+ break;
+ case SMT_MITIGATIONS_ON:
+ pr_cont("on\n");
+ }
+}
+
+/*
+ * NOTE: This function is *only* called for SVM, since Intel uses
+ * MSR_IA32_SPEC_CTRL for SSBD.
+ */
+void
+x86_virt_spec_ctrl(u64 guest_virt_spec_ctrl, bool setguest)
+{
+ u64 guestval, hostval;
+ struct thread_info *ti = current_thread_info();
+
+ /*
+ * If SSBD is not handled in MSR_SPEC_CTRL on AMD, update
+ * MSR_AMD64_L2_CFG or MSR_VIRT_SPEC_CTRL if supported.
+ */
+ if (!static_cpu_has(X86_FEATURE_LS_CFG_SSBD) &&
+ !static_cpu_has(X86_FEATURE_VIRT_SSBD))
+ return;
+
+ /*
+ * If the host has SSBD mitigation enabled, force it in the host's
+ * virtual MSR value. If its not permanently enabled, evaluate
+ * current's TIF_SSBD thread flag.
+ */
+ if (static_cpu_has(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE))
+ hostval = SPEC_CTRL_SSBD;
+ else
+ hostval = ssbd_tif_to_spec_ctrl(ti->flags);
+
+ /* Sanitize the guest value */
+ guestval = guest_virt_spec_ctrl & SPEC_CTRL_SSBD;
+
+ if (hostval != guestval) {
+ unsigned long tif;
+
+ tif = setguest ? ssbd_spec_ctrl_to_tif(guestval) :
+ ssbd_spec_ctrl_to_tif(hostval);
+
+ speculation_ctrl_update(tif);
+ }
+}
+EXPORT_SYMBOL_FOR_KVM(x86_virt_spec_ctrl);
+
+static void x86_amd_ssb_disable(void)
+{
+ u64 msrval = x86_amd_ls_cfg_base | x86_amd_ls_cfg_ssbd_mask;
+
+ if (boot_cpu_has(X86_FEATURE_VIRT_SSBD))
+ wrmsrq(MSR_AMD64_VIRT_SPEC_CTRL, SPEC_CTRL_SSBD);
+ else if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD))
+ wrmsrq(MSR_AMD64_LS_CFG, msrval);
+}
+
+#undef pr_fmt
+#define pr_fmt(fmt) "MDS: " fmt
+
+/*
+ * Returns true if vulnerability should be mitigated based on the
+ * selected attack vector controls.
+ *
+ * See Documentation/admin-guide/hw-vuln/attack_vector_controls.rst
+ */
+static bool __init should_mitigate_vuln(unsigned int bug)
+{
+ switch (bug) {
+ /*
+ * The only runtime-selected spectre_v1 mitigations in the kernel are
+ * related to SWAPGS protection on kernel entry. Therefore, protection
+ * is only required for the user->kernel attack vector.
+ */
+ case X86_BUG_SPECTRE_V1:
+ return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL);
+
+ case X86_BUG_SPECTRE_V2:
+ case X86_BUG_RETBLEED:
+ case X86_BUG_L1TF:
+ case X86_BUG_ITS:
+ return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST);
+
+ case X86_BUG_SPECTRE_V2_USER:
+ return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST);
+
+ /*
+ * All the vulnerabilities below allow potentially leaking data
+ * across address spaces. Therefore, mitigation is required for
+ * any of these 4 attack vectors.
+ */
+ case X86_BUG_MDS:
+ case X86_BUG_TAA:
+ case X86_BUG_MMIO_STALE_DATA:
+ case X86_BUG_RFDS:
+ case X86_BUG_SRBDS:
+ return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST);
+
+ case X86_BUG_GDS:
+ return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST) ||
+ (smt_mitigations != SMT_MITIGATIONS_OFF);
+
+ case X86_BUG_SPEC_STORE_BYPASS:
+ return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER);
+
+ case X86_BUG_VMSCAPE:
+ return cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST);
+
+ default:
+ WARN(1, "Unknown bug %x\n", bug);
+ return false;
+ }
+}
+
+/* Default mitigation for MDS-affected CPUs */
+static enum mds_mitigations mds_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_MDS) ? MDS_MITIGATION_AUTO : MDS_MITIGATION_OFF;
+static bool mds_nosmt __ro_after_init = false;
+
+static const char * const mds_strings[] = {
+ [MDS_MITIGATION_OFF] = "Vulnerable",
+ [MDS_MITIGATION_FULL] = "Mitigation: Clear CPU buffers",
+ [MDS_MITIGATION_VMWERV] = "Vulnerable: Clear CPU buffers attempted, no microcode",
+};
+
+enum taa_mitigations {
+ TAA_MITIGATION_OFF,
+ TAA_MITIGATION_AUTO,
+ TAA_MITIGATION_UCODE_NEEDED,
+ TAA_MITIGATION_VERW,
+ TAA_MITIGATION_TSX_DISABLED,
+};
+
+/* Default mitigation for TAA-affected CPUs */
+static enum taa_mitigations taa_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_TAA) ? TAA_MITIGATION_AUTO : TAA_MITIGATION_OFF;
+
+enum mmio_mitigations {
+ MMIO_MITIGATION_OFF,
+ MMIO_MITIGATION_AUTO,
+ MMIO_MITIGATION_UCODE_NEEDED,
+ MMIO_MITIGATION_VERW,
+};
+
+/* Default mitigation for Processor MMIO Stale Data vulnerabilities */
+static enum mmio_mitigations mmio_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_MMIO_STALE_DATA) ? MMIO_MITIGATION_AUTO : MMIO_MITIGATION_OFF;
+
+enum rfds_mitigations {
+ RFDS_MITIGATION_OFF,
+ RFDS_MITIGATION_AUTO,
+ RFDS_MITIGATION_VERW,
+ RFDS_MITIGATION_UCODE_NEEDED,
+};
+
+/* Default mitigation for Register File Data Sampling */
+static enum rfds_mitigations rfds_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_RFDS) ? RFDS_MITIGATION_AUTO : RFDS_MITIGATION_OFF;
+
+/*
+ * Set if any of MDS/TAA/MMIO/RFDS are going to enable VERW clearing on exit to
+ * userspace *and* on entry to KVM guests.
+ */
+static bool verw_clear_cpu_buf_mitigation_selected __ro_after_init;
+
+static void __init mds_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_MDS)) {
+ mds_mitigation = MDS_MITIGATION_OFF;
+ return;
+ }
+
+ if (mds_mitigation == MDS_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_MDS))
+ mds_mitigation = MDS_MITIGATION_FULL;
+ else
+ mds_mitigation = MDS_MITIGATION_OFF;
+ }
+
+ if (mds_mitigation == MDS_MITIGATION_OFF)
+ return;
+
+ verw_clear_cpu_buf_mitigation_selected = true;
+}
+
+static void __init mds_update_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_MDS))
+ return;
+
+ /* If TAA, MMIO, or RFDS are being mitigated, MDS gets mitigated too. */
+ if (verw_clear_cpu_buf_mitigation_selected)
+ mds_mitigation = MDS_MITIGATION_FULL;
+
+ if (mds_mitigation == MDS_MITIGATION_FULL) {
+ if (!boot_cpu_has(X86_FEATURE_MD_CLEAR))
+ mds_mitigation = MDS_MITIGATION_VMWERV;
+ }
+
+ pr_info("%s\n", mds_strings[mds_mitigation]);
+}
+
+static void __init mds_apply_mitigation(void)
+{
+ if (mds_mitigation == MDS_MITIGATION_FULL ||
+ mds_mitigation == MDS_MITIGATION_VMWERV) {
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM);
+ if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) &&
+ (mds_nosmt || smt_mitigations == SMT_MITIGATIONS_ON))
+ cpu_smt_disable(false);
+ }
+}
+
+static int __init mds_cmdline(char *str)
+{
+ if (!boot_cpu_has_bug(X86_BUG_MDS))
+ return 0;
+
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off"))
+ mds_mitigation = MDS_MITIGATION_OFF;
+ else if (!strcmp(str, "full"))
+ mds_mitigation = MDS_MITIGATION_FULL;
+ else if (!strcmp(str, "full,nosmt")) {
+ mds_mitigation = MDS_MITIGATION_FULL;
+ mds_nosmt = true;
+ }
+
+ return 0;
+}
+early_param("mds", mds_cmdline);
+
+#undef pr_fmt
+#define pr_fmt(fmt) "TAA: " fmt
+
+static bool taa_nosmt __ro_after_init;
+
+static const char * const taa_strings[] = {
+ [TAA_MITIGATION_OFF] = "Vulnerable",
+ [TAA_MITIGATION_UCODE_NEEDED] = "Vulnerable: Clear CPU buffers attempted, no microcode",
+ [TAA_MITIGATION_VERW] = "Mitigation: Clear CPU buffers",
+ [TAA_MITIGATION_TSX_DISABLED] = "Mitigation: TSX disabled",
+};
+
+static bool __init taa_vulnerable(void)
+{
+ return boot_cpu_has_bug(X86_BUG_TAA) && boot_cpu_has(X86_FEATURE_RTM);
+}
+
+static void __init taa_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_TAA)) {
+ taa_mitigation = TAA_MITIGATION_OFF;
+ return;
+ }
+
+ /* TSX previously disabled by tsx=off */
+ if (!boot_cpu_has(X86_FEATURE_RTM)) {
+ taa_mitigation = TAA_MITIGATION_TSX_DISABLED;
+ return;
+ }
+
+ /* Microcode will be checked in taa_update_mitigation(). */
+ if (taa_mitigation == TAA_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_TAA))
+ taa_mitigation = TAA_MITIGATION_VERW;
+ else
+ taa_mitigation = TAA_MITIGATION_OFF;
+ }
+
+ if (taa_mitigation != TAA_MITIGATION_OFF)
+ verw_clear_cpu_buf_mitigation_selected = true;
+}
+
+static void __init taa_update_mitigation(void)
+{
+ if (!taa_vulnerable())
+ return;
+
+ if (verw_clear_cpu_buf_mitigation_selected)
+ taa_mitigation = TAA_MITIGATION_VERW;
+
+ if (taa_mitigation == TAA_MITIGATION_VERW) {
+ /* Check if the requisite ucode is available. */
+ if (!boot_cpu_has(X86_FEATURE_MD_CLEAR))
+ taa_mitigation = TAA_MITIGATION_UCODE_NEEDED;
+
+ /*
+ * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1.
+ * A microcode update fixes this behavior to clear CPU buffers. It also
+ * adds support for MSR_IA32_TSX_CTRL which is enumerated by the
+ * ARCH_CAP_TSX_CTRL_MSR bit.
+ *
+ * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode
+ * update is required.
+ */
+ if ((x86_arch_cap_msr & ARCH_CAP_MDS_NO) &&
+ !(x86_arch_cap_msr & ARCH_CAP_TSX_CTRL_MSR))
+ taa_mitigation = TAA_MITIGATION_UCODE_NEEDED;
+ }
+
+ pr_info("%s\n", taa_strings[taa_mitigation]);
+}
+
+static void __init taa_apply_mitigation(void)
+{
+ if (taa_mitigation == TAA_MITIGATION_VERW ||
+ taa_mitigation == TAA_MITIGATION_UCODE_NEEDED) {
+ /*
+ * TSX is enabled, select alternate mitigation for TAA which is
+ * the same as MDS. Enable MDS static branch to clear CPU buffers.
+ *
+ * For guests that can't determine whether the correct microcode is
+ * present on host, enable the mitigation for UCODE_NEEDED as well.
+ */
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM);
+
+ if (taa_nosmt || smt_mitigations == SMT_MITIGATIONS_ON)
+ cpu_smt_disable(false);
+ }
+}
+
+static int __init tsx_async_abort_parse_cmdline(char *str)
+{
+ if (!boot_cpu_has_bug(X86_BUG_TAA))
+ return 0;
+
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off")) {
+ taa_mitigation = TAA_MITIGATION_OFF;
+ } else if (!strcmp(str, "full")) {
+ taa_mitigation = TAA_MITIGATION_VERW;
+ } else if (!strcmp(str, "full,nosmt")) {
+ taa_mitigation = TAA_MITIGATION_VERW;
+ taa_nosmt = true;
+ }
+
+ return 0;
+}
+early_param("tsx_async_abort", tsx_async_abort_parse_cmdline);
+
+#undef pr_fmt
+#define pr_fmt(fmt) "MMIO Stale Data: " fmt
+
+static bool mmio_nosmt __ro_after_init = false;
+
+static const char * const mmio_strings[] = {
+ [MMIO_MITIGATION_OFF] = "Vulnerable",
+ [MMIO_MITIGATION_UCODE_NEEDED] = "Vulnerable: Clear CPU buffers attempted, no microcode",
+ [MMIO_MITIGATION_VERW] = "Mitigation: Clear CPU buffers",
+};
+
+static void __init mmio_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) {
+ mmio_mitigation = MMIO_MITIGATION_OFF;
+ return;
+ }
+
+ /* Microcode will be checked in mmio_update_mitigation(). */
+ if (mmio_mitigation == MMIO_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_MMIO_STALE_DATA))
+ mmio_mitigation = MMIO_MITIGATION_VERW;
+ else
+ mmio_mitigation = MMIO_MITIGATION_OFF;
+ }
+
+ if (mmio_mitigation == MMIO_MITIGATION_OFF)
+ return;
+
+ /*
+ * Enable CPU buffer clear mitigation for host and VMM, if also affected
+ * by MDS or TAA.
*/
- __asm__("fninit\n\t"
- "fldl %1\n\t"
- "fdivl %2\n\t"
- "fmull %2\n\t"
- "fldl %1\n\t"
- "fsubp %%st,%%st(1)\n\t"
- "fistpl %0\n\t"
- "fwait\n\t"
- "fninit"
- : "=m" (*&fdiv_bug)
- : "m" (*&x), "m" (*&y));
+ if (boot_cpu_has_bug(X86_BUG_MDS) || taa_vulnerable())
+ verw_clear_cpu_buf_mitigation_selected = true;
+}
+
+static void __init mmio_update_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA))
+ return;
- kernel_fpu_end();
+ if (verw_clear_cpu_buf_mitigation_selected)
+ mmio_mitigation = MMIO_MITIGATION_VERW;
- if (fdiv_bug) {
- set_cpu_bug(&boot_cpu_data, X86_BUG_FDIV);
- pr_warn("Hmm, FPU with FDIV bug\n");
+ if (mmio_mitigation == MMIO_MITIGATION_VERW) {
+ /*
+ * Check if the system has the right microcode.
+ *
+ * CPU Fill buffer clear mitigation is enumerated by either an explicit
+ * FB_CLEAR or by the presence of both MD_CLEAR and L1D_FLUSH on MDS
+ * affected systems.
+ */
+ if (!((x86_arch_cap_msr & ARCH_CAP_FB_CLEAR) ||
+ (boot_cpu_has(X86_FEATURE_MD_CLEAR) &&
+ boot_cpu_has(X86_FEATURE_FLUSH_L1D) &&
+ !(x86_arch_cap_msr & ARCH_CAP_MDS_NO))))
+ mmio_mitigation = MMIO_MITIGATION_UCODE_NEEDED;
}
+
+ pr_info("%s\n", mmio_strings[mmio_mitigation]);
+}
+
+static void __init mmio_apply_mitigation(void)
+{
+ if (mmio_mitigation == MMIO_MITIGATION_OFF)
+ return;
+
+ /*
+ * Only enable the VMM mitigation if the CPU buffer clear mitigation is
+ * not being used.
+ */
+ if (verw_clear_cpu_buf_mitigation_selected) {
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM);
+ } else {
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO);
+ }
+
+ /*
+ * If Processor-MMIO-Stale-Data bug is present and Fill Buffer data can
+ * be propagated to uncore buffers, clearing the Fill buffers on idle
+ * is required irrespective of SMT state.
+ */
+ if (!(x86_arch_cap_msr & ARCH_CAP_FBSDP_NO))
+ static_branch_enable(&cpu_buf_idle_clear);
+
+ if (mmio_nosmt || smt_mitigations == SMT_MITIGATIONS_ON)
+ cpu_smt_disable(false);
}
-void __init check_bugs(void)
+static int __init mmio_stale_data_parse_cmdline(char *str)
{
- identify_boot_cpu();
-#ifndef CONFIG_SMP
- pr_info("CPU: ");
- print_cpu_info(&boot_cpu_data);
+ if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA))
+ return 0;
+
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off")) {
+ mmio_mitigation = MMIO_MITIGATION_OFF;
+ } else if (!strcmp(str, "full")) {
+ mmio_mitigation = MMIO_MITIGATION_VERW;
+ } else if (!strcmp(str, "full,nosmt")) {
+ mmio_mitigation = MMIO_MITIGATION_VERW;
+ mmio_nosmt = true;
+ }
+
+ return 0;
+}
+early_param("mmio_stale_data", mmio_stale_data_parse_cmdline);
+
+#undef pr_fmt
+#define pr_fmt(fmt) "Register File Data Sampling: " fmt
+
+static const char * const rfds_strings[] = {
+ [RFDS_MITIGATION_OFF] = "Vulnerable",
+ [RFDS_MITIGATION_VERW] = "Mitigation: Clear Register File",
+ [RFDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode",
+};
+
+static inline bool __init verw_clears_cpu_reg_file(void)
+{
+ return (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR);
+}
+
+static void __init rfds_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_RFDS)) {
+ rfds_mitigation = RFDS_MITIGATION_OFF;
+ return;
+ }
+
+ if (rfds_mitigation == RFDS_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_RFDS))
+ rfds_mitigation = RFDS_MITIGATION_VERW;
+ else
+ rfds_mitigation = RFDS_MITIGATION_OFF;
+ }
+
+ if (rfds_mitigation == RFDS_MITIGATION_OFF)
+ return;
+
+ if (verw_clears_cpu_reg_file())
+ verw_clear_cpu_buf_mitigation_selected = true;
+}
+
+static void __init rfds_update_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_RFDS))
+ return;
+
+ if (verw_clear_cpu_buf_mitigation_selected)
+ rfds_mitigation = RFDS_MITIGATION_VERW;
+
+ if (rfds_mitigation == RFDS_MITIGATION_VERW) {
+ if (!verw_clears_cpu_reg_file())
+ rfds_mitigation = RFDS_MITIGATION_UCODE_NEEDED;
+ }
+
+ pr_info("%s\n", rfds_strings[rfds_mitigation]);
+}
+
+static void __init rfds_apply_mitigation(void)
+{
+ if (rfds_mitigation == RFDS_MITIGATION_VERW) {
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM);
+ }
+}
+
+static __init int rfds_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!boot_cpu_has_bug(X86_BUG_RFDS))
+ return 0;
+
+ if (!strcmp(str, "off"))
+ rfds_mitigation = RFDS_MITIGATION_OFF;
+ else if (!strcmp(str, "on"))
+ rfds_mitigation = RFDS_MITIGATION_VERW;
+
+ return 0;
+}
+early_param("reg_file_data_sampling", rfds_parse_cmdline);
+
+#undef pr_fmt
+#define pr_fmt(fmt) "SRBDS: " fmt
+
+enum srbds_mitigations {
+ SRBDS_MITIGATION_OFF,
+ SRBDS_MITIGATION_AUTO,
+ SRBDS_MITIGATION_UCODE_NEEDED,
+ SRBDS_MITIGATION_FULL,
+ SRBDS_MITIGATION_TSX_OFF,
+ SRBDS_MITIGATION_HYPERVISOR,
+};
+
+static enum srbds_mitigations srbds_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_SRBDS) ? SRBDS_MITIGATION_AUTO : SRBDS_MITIGATION_OFF;
+
+static const char * const srbds_strings[] = {
+ [SRBDS_MITIGATION_OFF] = "Vulnerable",
+ [SRBDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode",
+ [SRBDS_MITIGATION_FULL] = "Mitigation: Microcode",
+ [SRBDS_MITIGATION_TSX_OFF] = "Mitigation: TSX disabled",
+ [SRBDS_MITIGATION_HYPERVISOR] = "Unknown: Dependent on hypervisor status",
+};
+
+static bool srbds_off;
+
+void update_srbds_msr(void)
+{
+ u64 mcu_ctrl;
+
+ if (!boot_cpu_has_bug(X86_BUG_SRBDS))
+ return;
+
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return;
+
+ if (srbds_mitigation == SRBDS_MITIGATION_UCODE_NEEDED)
+ return;
+
+ /*
+ * A MDS_NO CPU for which SRBDS mitigation is not needed due to TSX
+ * being disabled and it hasn't received the SRBDS MSR microcode.
+ */
+ if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL))
+ return;
+
+ rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+
+ switch (srbds_mitigation) {
+ case SRBDS_MITIGATION_OFF:
+ case SRBDS_MITIGATION_TSX_OFF:
+ mcu_ctrl |= RNGDS_MITG_DIS;
+ break;
+ case SRBDS_MITIGATION_FULL:
+ mcu_ctrl &= ~RNGDS_MITG_DIS;
+ break;
+ default:
+ break;
+ }
+
+ wrmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+}
+
+static void __init srbds_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_SRBDS)) {
+ srbds_mitigation = SRBDS_MITIGATION_OFF;
+ return;
+ }
+
+ if (srbds_mitigation == SRBDS_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_SRBDS))
+ srbds_mitigation = SRBDS_MITIGATION_FULL;
+ else {
+ srbds_mitigation = SRBDS_MITIGATION_OFF;
+ return;
+ }
+ }
+
+ /*
+ * Check to see if this is one of the MDS_NO systems supporting TSX that
+ * are only exposed to SRBDS when TSX is enabled or when CPU is affected
+ * by Processor MMIO Stale Data vulnerability.
+ */
+ if ((x86_arch_cap_msr & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM) &&
+ !boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA))
+ srbds_mitigation = SRBDS_MITIGATION_TSX_OFF;
+ else if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ srbds_mitigation = SRBDS_MITIGATION_HYPERVISOR;
+ else if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL))
+ srbds_mitigation = SRBDS_MITIGATION_UCODE_NEEDED;
+ else if (srbds_off)
+ srbds_mitigation = SRBDS_MITIGATION_OFF;
+
+ pr_info("%s\n", srbds_strings[srbds_mitigation]);
+}
+
+static void __init srbds_apply_mitigation(void)
+{
+ update_srbds_msr();
+}
+
+static int __init srbds_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!boot_cpu_has_bug(X86_BUG_SRBDS))
+ return 0;
+
+ srbds_off = !strcmp(str, "off");
+ return 0;
+}
+early_param("srbds", srbds_parse_cmdline);
+
+#undef pr_fmt
+#define pr_fmt(fmt) "L1D Flush : " fmt
+
+enum l1d_flush_mitigations {
+ L1D_FLUSH_OFF = 0,
+ L1D_FLUSH_ON,
+};
+
+static enum l1d_flush_mitigations l1d_flush_mitigation __initdata = L1D_FLUSH_OFF;
+
+static void __init l1d_flush_select_mitigation(void)
+{
+ if (!l1d_flush_mitigation || !boot_cpu_has(X86_FEATURE_FLUSH_L1D))
+ return;
+
+ static_branch_enable(&switch_mm_cond_l1d_flush);
+ pr_info("Conditional flush on switch_mm() enabled\n");
+}
+
+static int __init l1d_flush_parse_cmdline(char *str)
+{
+ if (!strcmp(str, "on"))
+ l1d_flush_mitigation = L1D_FLUSH_ON;
+
+ return 0;
+}
+early_param("l1d_flush", l1d_flush_parse_cmdline);
+
+#undef pr_fmt
+#define pr_fmt(fmt) "GDS: " fmt
+
+enum gds_mitigations {
+ GDS_MITIGATION_OFF,
+ GDS_MITIGATION_AUTO,
+ GDS_MITIGATION_UCODE_NEEDED,
+ GDS_MITIGATION_FORCE,
+ GDS_MITIGATION_FULL,
+ GDS_MITIGATION_FULL_LOCKED,
+ GDS_MITIGATION_HYPERVISOR,
+};
+
+static enum gds_mitigations gds_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_GDS) ? GDS_MITIGATION_AUTO : GDS_MITIGATION_OFF;
+
+static const char * const gds_strings[] = {
+ [GDS_MITIGATION_OFF] = "Vulnerable",
+ [GDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode",
+ [GDS_MITIGATION_FORCE] = "Mitigation: AVX disabled, no microcode",
+ [GDS_MITIGATION_FULL] = "Mitigation: Microcode",
+ [GDS_MITIGATION_FULL_LOCKED] = "Mitigation: Microcode (locked)",
+ [GDS_MITIGATION_HYPERVISOR] = "Unknown: Dependent on hypervisor status",
+};
+
+bool gds_ucode_mitigated(void)
+{
+ return (gds_mitigation == GDS_MITIGATION_FULL ||
+ gds_mitigation == GDS_MITIGATION_FULL_LOCKED);
+}
+EXPORT_SYMBOL_FOR_KVM(gds_ucode_mitigated);
+
+void update_gds_msr(void)
+{
+ u64 mcu_ctrl_after;
+ u64 mcu_ctrl;
+
+ switch (gds_mitigation) {
+ case GDS_MITIGATION_OFF:
+ rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+ mcu_ctrl |= GDS_MITG_DIS;
+ break;
+ case GDS_MITIGATION_FULL_LOCKED:
+ /*
+ * The LOCKED state comes from the boot CPU. APs might not have
+ * the same state. Make sure the mitigation is enabled on all
+ * CPUs.
+ */
+ case GDS_MITIGATION_FULL:
+ rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+ mcu_ctrl &= ~GDS_MITG_DIS;
+ break;
+ case GDS_MITIGATION_FORCE:
+ case GDS_MITIGATION_UCODE_NEEDED:
+ case GDS_MITIGATION_HYPERVISOR:
+ case GDS_MITIGATION_AUTO:
+ return;
+ }
+
+ wrmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+
+ /*
+ * Check to make sure that the WRMSR value was not ignored. Writes to
+ * GDS_MITG_DIS will be ignored if this processor is locked but the boot
+ * processor was not.
+ */
+ rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl_after);
+ WARN_ON_ONCE(mcu_ctrl != mcu_ctrl_after);
+}
+
+static void __init gds_select_mitigation(void)
+{
+ u64 mcu_ctrl;
+
+ if (!boot_cpu_has_bug(X86_BUG_GDS))
+ return;
+
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
+ gds_mitigation = GDS_MITIGATION_HYPERVISOR;
+ return;
+ }
+
+ /* Will verify below that mitigation _can_ be disabled */
+ if (gds_mitigation == GDS_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_GDS))
+ gds_mitigation = GDS_MITIGATION_FULL;
+ else
+ gds_mitigation = GDS_MITIGATION_OFF;
+ }
+
+ /* No microcode */
+ if (!(x86_arch_cap_msr & ARCH_CAP_GDS_CTRL)) {
+ if (gds_mitigation != GDS_MITIGATION_FORCE)
+ gds_mitigation = GDS_MITIGATION_UCODE_NEEDED;
+ return;
+ }
+
+ /* Microcode has mitigation, use it */
+ if (gds_mitigation == GDS_MITIGATION_FORCE)
+ gds_mitigation = GDS_MITIGATION_FULL;
+
+ rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
+ if (mcu_ctrl & GDS_MITG_LOCKED) {
+ if (gds_mitigation == GDS_MITIGATION_OFF)
+ pr_warn("Mitigation locked. Disable failed.\n");
+
+ /*
+ * The mitigation is selected from the boot CPU. All other CPUs
+ * _should_ have the same state. If the boot CPU isn't locked
+ * but others are then update_gds_msr() will WARN() of the state
+ * mismatch. If the boot CPU is locked update_gds_msr() will
+ * ensure the other CPUs have the mitigation enabled.
+ */
+ gds_mitigation = GDS_MITIGATION_FULL_LOCKED;
+ }
+}
+
+static void __init gds_apply_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_GDS))
+ return;
+
+ /* Microcode is present */
+ if (x86_arch_cap_msr & ARCH_CAP_GDS_CTRL)
+ update_gds_msr();
+ else if (gds_mitigation == GDS_MITIGATION_FORCE) {
+ /*
+ * This only needs to be done on the boot CPU so do it
+ * here rather than in update_gds_msr()
+ */
+ setup_clear_cpu_cap(X86_FEATURE_AVX);
+ pr_warn("Microcode update needed! Disabling AVX as mitigation.\n");
+ }
+
+ pr_info("%s\n", gds_strings[gds_mitigation]);
+}
+
+static int __init gds_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!boot_cpu_has_bug(X86_BUG_GDS))
+ return 0;
+
+ if (!strcmp(str, "off"))
+ gds_mitigation = GDS_MITIGATION_OFF;
+ else if (!strcmp(str, "force"))
+ gds_mitigation = GDS_MITIGATION_FORCE;
+
+ return 0;
+}
+early_param("gather_data_sampling", gds_parse_cmdline);
+
+#undef pr_fmt
+#define pr_fmt(fmt) "Spectre V1 : " fmt
+
+enum spectre_v1_mitigation {
+ SPECTRE_V1_MITIGATION_NONE,
+ SPECTRE_V1_MITIGATION_AUTO,
+};
+
+static enum spectre_v1_mitigation spectre_v1_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V1) ?
+ SPECTRE_V1_MITIGATION_AUTO : SPECTRE_V1_MITIGATION_NONE;
+
+static const char * const spectre_v1_strings[] = {
+ [SPECTRE_V1_MITIGATION_NONE] = "Vulnerable: __user pointer sanitization and usercopy barriers only; no swapgs barriers",
+ [SPECTRE_V1_MITIGATION_AUTO] = "Mitigation: usercopy/swapgs barriers and __user pointer sanitization",
+};
+
+/*
+ * Does SMAP provide full mitigation against speculative kernel access to
+ * userspace?
+ */
+static bool smap_works_speculatively(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_SMAP))
+ return false;
+
+ /*
+ * On CPUs which are vulnerable to Meltdown, SMAP does not
+ * prevent speculative access to user data in the L1 cache.
+ * Consider SMAP to be non-functional as a mitigation on these
+ * CPUs.
+ */
+ if (boot_cpu_has(X86_BUG_CPU_MELTDOWN))
+ return false;
+
+ return true;
+}
+
+static void __init spectre_v1_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1))
+ spectre_v1_mitigation = SPECTRE_V1_MITIGATION_NONE;
+
+ if (!should_mitigate_vuln(X86_BUG_SPECTRE_V1))
+ spectre_v1_mitigation = SPECTRE_V1_MITIGATION_NONE;
+}
+
+static void __init spectre_v1_apply_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1))
+ return;
+
+ if (spectre_v1_mitigation == SPECTRE_V1_MITIGATION_AUTO) {
+ /*
+ * With Spectre v1, a user can speculatively control either
+ * path of a conditional swapgs with a user-controlled GS
+ * value. The mitigation is to add lfences to both code paths.
+ *
+ * If FSGSBASE is enabled, the user can put a kernel address in
+ * GS, in which case SMAP provides no protection.
+ *
+ * If FSGSBASE is disabled, the user can only put a user space
+ * address in GS. That makes an attack harder, but still
+ * possible if there's no SMAP protection.
+ */
+ if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
+ !smap_works_speculatively()) {
+ /*
+ * Mitigation can be provided from SWAPGS itself or
+ * PTI as the CR3 write in the Meltdown mitigation
+ * is serializing.
+ *
+ * If neither is there, mitigate with an LFENCE to
+ * stop speculation through swapgs.
+ */
+ if (boot_cpu_has_bug(X86_BUG_SWAPGS) &&
+ !boot_cpu_has(X86_FEATURE_PTI))
+ setup_force_cpu_cap(X86_FEATURE_FENCE_SWAPGS_USER);
+
+ /*
+ * Enable lfences in the kernel entry (non-swapgs)
+ * paths, to prevent user entry from speculatively
+ * skipping swapgs.
+ */
+ setup_force_cpu_cap(X86_FEATURE_FENCE_SWAPGS_KERNEL);
+ }
+ }
+
+ pr_info("%s\n", spectre_v1_strings[spectre_v1_mitigation]);
+}
+
+static int __init nospectre_v1_cmdline(char *str)
+{
+ spectre_v1_mitigation = SPECTRE_V1_MITIGATION_NONE;
+ return 0;
+}
+early_param("nospectre_v1", nospectre_v1_cmdline);
+
+enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = SPECTRE_V2_NONE;
+
+/* Depends on spectre_v2 mitigation selected already */
+static inline bool cdt_possible(enum spectre_v2_mitigation mode)
+{
+ if (!IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING) ||
+ !IS_ENABLED(CONFIG_MITIGATION_RETPOLINE))
+ return false;
+
+ if (mode == SPECTRE_V2_RETPOLINE ||
+ mode == SPECTRE_V2_EIBRS_RETPOLINE)
+ return true;
+
+ return false;
+}
+
+#undef pr_fmt
+#define pr_fmt(fmt) "RETBleed: " fmt
+
+enum its_mitigation {
+ ITS_MITIGATION_OFF,
+ ITS_MITIGATION_AUTO,
+ ITS_MITIGATION_VMEXIT_ONLY,
+ ITS_MITIGATION_ALIGNED_THUNKS,
+ ITS_MITIGATION_RETPOLINE_STUFF,
+};
+
+static enum its_mitigation its_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_ITS) ? ITS_MITIGATION_AUTO : ITS_MITIGATION_OFF;
+
+enum retbleed_mitigation {
+ RETBLEED_MITIGATION_NONE,
+ RETBLEED_MITIGATION_AUTO,
+ RETBLEED_MITIGATION_UNRET,
+ RETBLEED_MITIGATION_IBPB,
+ RETBLEED_MITIGATION_IBRS,
+ RETBLEED_MITIGATION_EIBRS,
+ RETBLEED_MITIGATION_STUFF,
+};
+
+static const char * const retbleed_strings[] = {
+ [RETBLEED_MITIGATION_NONE] = "Vulnerable",
+ [RETBLEED_MITIGATION_UNRET] = "Mitigation: untrained return thunk",
+ [RETBLEED_MITIGATION_IBPB] = "Mitigation: IBPB",
+ [RETBLEED_MITIGATION_IBRS] = "Mitigation: IBRS",
+ [RETBLEED_MITIGATION_EIBRS] = "Mitigation: Enhanced IBRS",
+ [RETBLEED_MITIGATION_STUFF] = "Mitigation: Stuffing",
+};
+
+static enum retbleed_mitigation retbleed_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_RETBLEED) ? RETBLEED_MITIGATION_AUTO : RETBLEED_MITIGATION_NONE;
+
+static int __ro_after_init retbleed_nosmt = false;
+
+enum srso_mitigation {
+ SRSO_MITIGATION_NONE,
+ SRSO_MITIGATION_AUTO,
+ SRSO_MITIGATION_UCODE_NEEDED,
+ SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED,
+ SRSO_MITIGATION_MICROCODE,
+ SRSO_MITIGATION_NOSMT,
+ SRSO_MITIGATION_SAFE_RET,
+ SRSO_MITIGATION_IBPB,
+ SRSO_MITIGATION_IBPB_ON_VMEXIT,
+ SRSO_MITIGATION_BP_SPEC_REDUCE,
+};
+
+static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_AUTO;
+
+static int __init retbleed_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ while (str) {
+ char *next = strchr(str, ',');
+ if (next) {
+ *next = 0;
+ next++;
+ }
+
+ if (!strcmp(str, "off")) {
+ retbleed_mitigation = RETBLEED_MITIGATION_NONE;
+ } else if (!strcmp(str, "auto")) {
+ retbleed_mitigation = RETBLEED_MITIGATION_AUTO;
+ } else if (!strcmp(str, "unret")) {
+ retbleed_mitigation = RETBLEED_MITIGATION_UNRET;
+ } else if (!strcmp(str, "ibpb")) {
+ retbleed_mitigation = RETBLEED_MITIGATION_IBPB;
+ } else if (!strcmp(str, "stuff")) {
+ retbleed_mitigation = RETBLEED_MITIGATION_STUFF;
+ } else if (!strcmp(str, "nosmt")) {
+ retbleed_nosmt = true;
+ } else if (!strcmp(str, "force")) {
+ setup_force_cpu_bug(X86_BUG_RETBLEED);
+ } else {
+ pr_err("Ignoring unknown retbleed option (%s).", str);
+ }
+
+ str = next;
+ }
+
+ return 0;
+}
+early_param("retbleed", retbleed_parse_cmdline);
+
+#define RETBLEED_UNTRAIN_MSG "WARNING: BTB untrained return thunk mitigation is only effective on AMD/Hygon!\n"
+#define RETBLEED_INTEL_MSG "WARNING: Spectre v2 mitigation leaves CPU vulnerable to RETBleed attacks, data leaks possible!\n"
+
+static void __init retbleed_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_RETBLEED)) {
+ retbleed_mitigation = RETBLEED_MITIGATION_NONE;
+ return;
+ }
+
+ switch (retbleed_mitigation) {
+ case RETBLEED_MITIGATION_UNRET:
+ if (!IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY)) {
+ retbleed_mitigation = RETBLEED_MITIGATION_AUTO;
+ pr_err("WARNING: kernel not compiled with MITIGATION_UNRET_ENTRY.\n");
+ }
+ break;
+ case RETBLEED_MITIGATION_IBPB:
+ if (!boot_cpu_has(X86_FEATURE_IBPB)) {
+ pr_err("WARNING: CPU does not support IBPB.\n");
+ retbleed_mitigation = RETBLEED_MITIGATION_AUTO;
+ } else if (!IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) {
+ pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n");
+ retbleed_mitigation = RETBLEED_MITIGATION_AUTO;
+ }
+ break;
+ case RETBLEED_MITIGATION_STUFF:
+ if (!IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING)) {
+ pr_err("WARNING: kernel not compiled with MITIGATION_CALL_DEPTH_TRACKING.\n");
+ retbleed_mitigation = RETBLEED_MITIGATION_AUTO;
+ } else if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
+ pr_err("WARNING: retbleed=stuff only supported for Intel CPUs.\n");
+ retbleed_mitigation = RETBLEED_MITIGATION_AUTO;
+ }
+ break;
+ default:
+ break;
+ }
+
+ if (retbleed_mitigation != RETBLEED_MITIGATION_AUTO)
+ return;
+
+ if (!should_mitigate_vuln(X86_BUG_RETBLEED)) {
+ retbleed_mitigation = RETBLEED_MITIGATION_NONE;
+ return;
+ }
+
+ /* Intel mitigation selected in retbleed_update_mitigation() */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
+ boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) {
+ if (IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY))
+ retbleed_mitigation = RETBLEED_MITIGATION_UNRET;
+ else if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY) &&
+ boot_cpu_has(X86_FEATURE_IBPB))
+ retbleed_mitigation = RETBLEED_MITIGATION_IBPB;
+ else
+ retbleed_mitigation = RETBLEED_MITIGATION_NONE;
+ } else if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
+ /* Final mitigation depends on spectre-v2 selection */
+ if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED))
+ retbleed_mitigation = RETBLEED_MITIGATION_EIBRS;
+ else if (boot_cpu_has(X86_FEATURE_IBRS))
+ retbleed_mitigation = RETBLEED_MITIGATION_IBRS;
+ else
+ retbleed_mitigation = RETBLEED_MITIGATION_NONE;
+ }
+}
+
+static void __init retbleed_update_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_RETBLEED))
+ return;
+
+ /* ITS can also enable stuffing */
+ if (its_mitigation == ITS_MITIGATION_RETPOLINE_STUFF)
+ retbleed_mitigation = RETBLEED_MITIGATION_STUFF;
+
+ /* If SRSO is using IBPB, that works for retbleed too */
+ if (srso_mitigation == SRSO_MITIGATION_IBPB)
+ retbleed_mitigation = RETBLEED_MITIGATION_IBPB;
+
+ if (retbleed_mitigation == RETBLEED_MITIGATION_STUFF &&
+ !cdt_possible(spectre_v2_enabled)) {
+ pr_err("WARNING: retbleed=stuff depends on retpoline\n");
+ retbleed_mitigation = RETBLEED_MITIGATION_NONE;
+ }
+
+ /*
+ * Let IBRS trump all on Intel without affecting the effects of the
+ * retbleed= cmdline option except for call depth based stuffing
+ */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
+ switch (spectre_v2_enabled) {
+ case SPECTRE_V2_IBRS:
+ retbleed_mitigation = RETBLEED_MITIGATION_IBRS;
+ break;
+ case SPECTRE_V2_EIBRS:
+ case SPECTRE_V2_EIBRS_RETPOLINE:
+ case SPECTRE_V2_EIBRS_LFENCE:
+ retbleed_mitigation = RETBLEED_MITIGATION_EIBRS;
+ break;
+ default:
+ if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) {
+ if (retbleed_mitigation != RETBLEED_MITIGATION_NONE)
+ pr_err(RETBLEED_INTEL_MSG);
+
+ retbleed_mitigation = RETBLEED_MITIGATION_NONE;
+ }
+ }
+ }
+
+ pr_info("%s\n", retbleed_strings[retbleed_mitigation]);
+}
+
+static void __init retbleed_apply_mitigation(void)
+{
+ bool mitigate_smt = false;
+
+ switch (retbleed_mitigation) {
+ case RETBLEED_MITIGATION_NONE:
+ return;
+
+ case RETBLEED_MITIGATION_UNRET:
+ setup_force_cpu_cap(X86_FEATURE_RETHUNK);
+ setup_force_cpu_cap(X86_FEATURE_UNRET);
+
+ set_return_thunk(retbleed_return_thunk);
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
+ pr_err(RETBLEED_UNTRAIN_MSG);
+
+ mitigate_smt = true;
+ break;
+
+ case RETBLEED_MITIGATION_IBPB:
+ setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB);
+ setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT);
+ mitigate_smt = true;
+
+ /*
+ * IBPB on entry already obviates the need for
+ * software-based untraining so clear those in case some
+ * other mitigation like SRSO has selected them.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_UNRET);
+ setup_clear_cpu_cap(X86_FEATURE_RETHUNK);
+
+ /*
+ * There is no need for RSB filling: write_ibpb() ensures
+ * all predictions, including the RSB, are invalidated,
+ * regardless of IBPB implementation.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT);
+
+ break;
+
+ case RETBLEED_MITIGATION_STUFF:
+ setup_force_cpu_cap(X86_FEATURE_RETHUNK);
+ setup_force_cpu_cap(X86_FEATURE_CALL_DEPTH);
+
+ set_return_thunk(call_depth_return_thunk);
+ break;
+
+ default:
+ break;
+ }
+
+ if (mitigate_smt && !boot_cpu_has(X86_FEATURE_STIBP) &&
+ (retbleed_nosmt || smt_mitigations == SMT_MITIGATIONS_ON))
+ cpu_smt_disable(false);
+}
+
+#undef pr_fmt
+#define pr_fmt(fmt) "ITS: " fmt
+
+static const char * const its_strings[] = {
+ [ITS_MITIGATION_OFF] = "Vulnerable",
+ [ITS_MITIGATION_VMEXIT_ONLY] = "Mitigation: Vulnerable, KVM: Not affected",
+ [ITS_MITIGATION_ALIGNED_THUNKS] = "Mitigation: Aligned branch/return thunks",
+ [ITS_MITIGATION_RETPOLINE_STUFF] = "Mitigation: Retpolines, Stuffing RSB",
+};
+
+static int __init its_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!IS_ENABLED(CONFIG_MITIGATION_ITS)) {
+ pr_err("Mitigation disabled at compile time, ignoring option (%s)", str);
+ return 0;
+ }
+
+ if (!strcmp(str, "off")) {
+ its_mitigation = ITS_MITIGATION_OFF;
+ } else if (!strcmp(str, "on")) {
+ its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS;
+ } else if (!strcmp(str, "force")) {
+ its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS;
+ setup_force_cpu_bug(X86_BUG_ITS);
+ } else if (!strcmp(str, "vmexit")) {
+ its_mitigation = ITS_MITIGATION_VMEXIT_ONLY;
+ } else if (!strcmp(str, "stuff")) {
+ its_mitigation = ITS_MITIGATION_RETPOLINE_STUFF;
+ } else {
+ pr_err("Ignoring unknown indirect_target_selection option (%s).", str);
+ }
+
+ return 0;
+}
+early_param("indirect_target_selection", its_parse_cmdline);
+
+static void __init its_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_ITS)) {
+ its_mitigation = ITS_MITIGATION_OFF;
+ return;
+ }
+
+ if (its_mitigation == ITS_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_ITS))
+ its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS;
+ else
+ its_mitigation = ITS_MITIGATION_OFF;
+ }
+
+ if (its_mitigation == ITS_MITIGATION_OFF)
+ return;
+
+ if (!IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) ||
+ !IS_ENABLED(CONFIG_MITIGATION_RETHUNK)) {
+ pr_err("WARNING: ITS mitigation depends on retpoline and rethunk support\n");
+ its_mitigation = ITS_MITIGATION_OFF;
+ return;
+ }
+
+ if (IS_ENABLED(CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B)) {
+ pr_err("WARNING: ITS mitigation is not compatible with CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B\n");
+ its_mitigation = ITS_MITIGATION_OFF;
+ return;
+ }
+
+ if (its_mitigation == ITS_MITIGATION_RETPOLINE_STUFF &&
+ !IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING)) {
+ pr_err("RSB stuff mitigation not supported, using default\n");
+ its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS;
+ }
+
+ if (its_mitigation == ITS_MITIGATION_VMEXIT_ONLY &&
+ !boot_cpu_has_bug(X86_BUG_ITS_NATIVE_ONLY))
+ its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS;
+}
+
+static void __init its_update_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_ITS))
+ return;
+
+ switch (spectre_v2_enabled) {
+ case SPECTRE_V2_NONE:
+ if (its_mitigation != ITS_MITIGATION_OFF)
+ pr_err("WARNING: Spectre-v2 mitigation is off, disabling ITS\n");
+ its_mitigation = ITS_MITIGATION_OFF;
+ break;
+ case SPECTRE_V2_RETPOLINE:
+ case SPECTRE_V2_EIBRS_RETPOLINE:
+ /* Retpoline+CDT mitigates ITS */
+ if (retbleed_mitigation == RETBLEED_MITIGATION_STUFF)
+ its_mitigation = ITS_MITIGATION_RETPOLINE_STUFF;
+ break;
+ case SPECTRE_V2_LFENCE:
+ case SPECTRE_V2_EIBRS_LFENCE:
+ pr_err("WARNING: ITS mitigation is not compatible with lfence mitigation\n");
+ its_mitigation = ITS_MITIGATION_OFF;
+ break;
+ default:
+ break;
+ }
+
+ if (its_mitigation == ITS_MITIGATION_RETPOLINE_STUFF &&
+ !cdt_possible(spectre_v2_enabled))
+ its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS;
+
+ pr_info("%s\n", its_strings[its_mitigation]);
+}
+
+static void __init its_apply_mitigation(void)
+{
+ switch (its_mitigation) {
+ case ITS_MITIGATION_OFF:
+ case ITS_MITIGATION_AUTO:
+ case ITS_MITIGATION_VMEXIT_ONLY:
+ break;
+ case ITS_MITIGATION_ALIGNED_THUNKS:
+ if (!boot_cpu_has(X86_FEATURE_RETPOLINE))
+ setup_force_cpu_cap(X86_FEATURE_INDIRECT_THUNK_ITS);
+
+ setup_force_cpu_cap(X86_FEATURE_RETHUNK);
+ set_return_thunk(its_return_thunk);
+ break;
+ case ITS_MITIGATION_RETPOLINE_STUFF:
+ setup_force_cpu_cap(X86_FEATURE_RETHUNK);
+ setup_force_cpu_cap(X86_FEATURE_CALL_DEPTH);
+ set_return_thunk(call_depth_return_thunk);
+ break;
+ }
+}
+
+#undef pr_fmt
+#define pr_fmt(fmt) "Transient Scheduler Attacks: " fmt
+
+enum tsa_mitigations {
+ TSA_MITIGATION_NONE,
+ TSA_MITIGATION_AUTO,
+ TSA_MITIGATION_UCODE_NEEDED,
+ TSA_MITIGATION_USER_KERNEL,
+ TSA_MITIGATION_VM,
+ TSA_MITIGATION_FULL,
+};
+
+static const char * const tsa_strings[] = {
+ [TSA_MITIGATION_NONE] = "Vulnerable",
+ [TSA_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode",
+ [TSA_MITIGATION_USER_KERNEL] = "Mitigation: Clear CPU buffers: user/kernel boundary",
+ [TSA_MITIGATION_VM] = "Mitigation: Clear CPU buffers: VM",
+ [TSA_MITIGATION_FULL] = "Mitigation: Clear CPU buffers",
+};
+
+static enum tsa_mitigations tsa_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_TSA) ? TSA_MITIGATION_AUTO : TSA_MITIGATION_NONE;
+
+static int __init tsa_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off"))
+ tsa_mitigation = TSA_MITIGATION_NONE;
+ else if (!strcmp(str, "on"))
+ tsa_mitigation = TSA_MITIGATION_FULL;
+ else if (!strcmp(str, "user"))
+ tsa_mitigation = TSA_MITIGATION_USER_KERNEL;
+ else if (!strcmp(str, "vm"))
+ tsa_mitigation = TSA_MITIGATION_VM;
+ else
+ pr_err("Ignoring unknown tsa=%s option.\n", str);
+
+ return 0;
+}
+early_param("tsa", tsa_parse_cmdline);
+
+static void __init tsa_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_TSA)) {
+ tsa_mitigation = TSA_MITIGATION_NONE;
+ return;
+ }
+
+ if (tsa_mitigation == TSA_MITIGATION_AUTO) {
+ bool vm = false, uk = false;
+
+ tsa_mitigation = TSA_MITIGATION_NONE;
+
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER)) {
+ tsa_mitigation = TSA_MITIGATION_USER_KERNEL;
+ uk = true;
+ }
+
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST)) {
+ tsa_mitigation = TSA_MITIGATION_VM;
+ vm = true;
+ }
+
+ if (uk && vm)
+ tsa_mitigation = TSA_MITIGATION_FULL;
+ }
+
+ if (tsa_mitigation == TSA_MITIGATION_NONE)
+ return;
+
+ if (!boot_cpu_has(X86_FEATURE_VERW_CLEAR))
+ tsa_mitigation = TSA_MITIGATION_UCODE_NEEDED;
+
+ /*
+ * No need to set verw_clear_cpu_buf_mitigation_selected - it
+ * doesn't fit all cases here and it is not needed because this
+ * is the only VERW-based mitigation on AMD.
+ */
+ pr_info("%s\n", tsa_strings[tsa_mitigation]);
+}
+
+static void __init tsa_apply_mitigation(void)
+{
+ switch (tsa_mitigation) {
+ case TSA_MITIGATION_USER_KERNEL:
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+ break;
+ case TSA_MITIGATION_VM:
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM);
+ break;
+ case TSA_MITIGATION_FULL:
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM);
+ break;
+ default:
+ break;
+ }
+}
+
+#undef pr_fmt
+#define pr_fmt(fmt) "Spectre V2 : " fmt
+
+static enum spectre_v2_user_mitigation spectre_v2_user_stibp __ro_after_init =
+ SPECTRE_V2_USER_NONE;
+static enum spectre_v2_user_mitigation spectre_v2_user_ibpb __ro_after_init =
+ SPECTRE_V2_USER_NONE;
+
+#ifdef CONFIG_MITIGATION_RETPOLINE
+static bool spectre_v2_bad_module;
+
+bool retpoline_module_ok(bool has_retpoline)
+{
+ if (spectre_v2_enabled == SPECTRE_V2_NONE || has_retpoline)
+ return true;
+
+ pr_err("System may be vulnerable to spectre v2\n");
+ spectre_v2_bad_module = true;
+ return false;
+}
+
+static inline const char *spectre_v2_module_string(void)
+{
+ return spectre_v2_bad_module ? " - vulnerable module loaded" : "";
+}
+#else
+static inline const char *spectre_v2_module_string(void) { return ""; }
+#endif
+
+#define SPECTRE_V2_LFENCE_MSG "WARNING: LFENCE mitigation is not recommended for this CPU, data leaks possible!\n"
+#define SPECTRE_V2_EIBRS_EBPF_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS on, data leaks possible via Spectre v2 BHB attacks!\n"
+#define SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS+LFENCE mitigation and SMT, data leaks possible via Spectre v2 BHB attacks!\n"
+#define SPECTRE_V2_IBRS_PERF_MSG "WARNING: IBRS mitigation selected on Enhanced IBRS CPU, this may cause unnecessary performance loss\n"
+
+#ifdef CONFIG_BPF_SYSCALL
+void unpriv_ebpf_notify(int new_state)
+{
+ if (new_state)
+ return;
+
+ /* Unprivileged eBPF is enabled */
+
+ switch (spectre_v2_enabled) {
+ case SPECTRE_V2_EIBRS:
+ pr_err(SPECTRE_V2_EIBRS_EBPF_MSG);
+ break;
+ case SPECTRE_V2_EIBRS_LFENCE:
+ if (sched_smt_active())
+ pr_err(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG);
+ break;
+ default:
+ break;
+ }
+}
#endif
+/* The kernel command line selection for spectre v2 */
+enum spectre_v2_mitigation_cmd {
+ SPECTRE_V2_CMD_NONE,
+ SPECTRE_V2_CMD_AUTO,
+ SPECTRE_V2_CMD_FORCE,
+ SPECTRE_V2_CMD_RETPOLINE,
+ SPECTRE_V2_CMD_RETPOLINE_GENERIC,
+ SPECTRE_V2_CMD_RETPOLINE_LFENCE,
+ SPECTRE_V2_CMD_EIBRS,
+ SPECTRE_V2_CMD_EIBRS_RETPOLINE,
+ SPECTRE_V2_CMD_EIBRS_LFENCE,
+ SPECTRE_V2_CMD_IBRS,
+};
+
+static enum spectre_v2_mitigation_cmd spectre_v2_cmd __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? SPECTRE_V2_CMD_AUTO : SPECTRE_V2_CMD_NONE;
+
+enum spectre_v2_user_mitigation_cmd {
+ SPECTRE_V2_USER_CMD_NONE,
+ SPECTRE_V2_USER_CMD_AUTO,
+ SPECTRE_V2_USER_CMD_FORCE,
+ SPECTRE_V2_USER_CMD_PRCTL,
+ SPECTRE_V2_USER_CMD_PRCTL_IBPB,
+ SPECTRE_V2_USER_CMD_SECCOMP,
+ SPECTRE_V2_USER_CMD_SECCOMP_IBPB,
+};
+
+static enum spectre_v2_user_mitigation_cmd spectre_v2_user_cmd __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? SPECTRE_V2_USER_CMD_AUTO : SPECTRE_V2_USER_CMD_NONE;
+
+static const char * const spectre_v2_user_strings[] = {
+ [SPECTRE_V2_USER_NONE] = "User space: Vulnerable",
+ [SPECTRE_V2_USER_STRICT] = "User space: Mitigation: STIBP protection",
+ [SPECTRE_V2_USER_STRICT_PREFERRED] = "User space: Mitigation: STIBP always-on protection",
+ [SPECTRE_V2_USER_PRCTL] = "User space: Mitigation: STIBP via prctl",
+ [SPECTRE_V2_USER_SECCOMP] = "User space: Mitigation: STIBP via seccomp and prctl",
+};
+
+static int __init spectre_v2_user_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "auto"))
+ spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_AUTO;
+ else if (!strcmp(str, "off"))
+ spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_NONE;
+ else if (!strcmp(str, "on"))
+ spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_FORCE;
+ else if (!strcmp(str, "prctl"))
+ spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_PRCTL;
+ else if (!strcmp(str, "prctl,ibpb"))
+ spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_PRCTL_IBPB;
+ else if (!strcmp(str, "seccomp"))
+ spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_SECCOMP;
+ else if (!strcmp(str, "seccomp,ibpb"))
+ spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_SECCOMP_IBPB;
+ else
+ pr_err("Ignoring unknown spectre_v2_user option (%s).", str);
+
+ return 0;
+}
+early_param("spectre_v2_user", spectre_v2_user_parse_cmdline);
+
+static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode)
+{
+ return spectre_v2_in_eibrs_mode(mode) || mode == SPECTRE_V2_IBRS;
+}
+
+static void __init spectre_v2_user_select_mitigation(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP))
+ return;
+
+ switch (spectre_v2_user_cmd) {
+ case SPECTRE_V2_USER_CMD_NONE:
+ return;
+ case SPECTRE_V2_USER_CMD_FORCE:
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT;
+ spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT;
+ break;
+ case SPECTRE_V2_USER_CMD_AUTO:
+ if (!should_mitigate_vuln(X86_BUG_SPECTRE_V2_USER))
+ break;
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_PRCTL;
+ if (smt_mitigations == SMT_MITIGATIONS_OFF)
+ break;
+ spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL;
+ break;
+ case SPECTRE_V2_USER_CMD_PRCTL:
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_PRCTL;
+ spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL;
+ break;
+ case SPECTRE_V2_USER_CMD_PRCTL_IBPB:
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT;
+ spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL;
+ break;
+ case SPECTRE_V2_USER_CMD_SECCOMP:
+ if (IS_ENABLED(CONFIG_SECCOMP))
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_SECCOMP;
+ else
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_PRCTL;
+ spectre_v2_user_stibp = spectre_v2_user_ibpb;
+ break;
+ case SPECTRE_V2_USER_CMD_SECCOMP_IBPB:
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT;
+ if (IS_ENABLED(CONFIG_SECCOMP))
+ spectre_v2_user_stibp = SPECTRE_V2_USER_SECCOMP;
+ else
+ spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL;
+ break;
+ }
+
/*
- * Check whether we are able to run this kernel safely on SMP.
+ * At this point, an STIBP mode other than "off" has been set.
+ * If STIBP support is not being forced, check if STIBP always-on
+ * is preferred.
+ */
+ if ((spectre_v2_user_stibp == SPECTRE_V2_USER_PRCTL ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP) &&
+ boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON))
+ spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT_PREFERRED;
+
+ if (!boot_cpu_has(X86_FEATURE_IBPB))
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_NONE;
+
+ if (!boot_cpu_has(X86_FEATURE_STIBP))
+ spectre_v2_user_stibp = SPECTRE_V2_USER_NONE;
+}
+
+static void __init spectre_v2_user_update_mitigation(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP))
+ return;
+
+ /* The spectre_v2 cmd line can override spectre_v2_user options */
+ if (spectre_v2_cmd == SPECTRE_V2_CMD_NONE) {
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_NONE;
+ spectre_v2_user_stibp = SPECTRE_V2_USER_NONE;
+ } else if (spectre_v2_cmd == SPECTRE_V2_CMD_FORCE) {
+ spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT;
+ spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT;
+ }
+
+ /*
+ * If no STIBP, Intel enhanced IBRS is enabled, or SMT impossible, STIBP
+ * is not required.
*
- * - i386 is no longer supported.
- * - In order to run on anything without a TSC, we need to be
- * compiled for a i486.
+ * Intel's Enhanced IBRS also protects against cross-thread branch target
+ * injection in user-mode as the IBRS bit remains always set which
+ * implicitly enables cross-thread protections. However, in legacy IBRS
+ * mode, the IBRS bit is set only on kernel entry and cleared on return
+ * to userspace. AMD Automatic IBRS also does not protect userspace.
+ * These modes therefore disable the implicit cross-thread protection,
+ * so allow for STIBP to be selected in those cases.
*/
- if (boot_cpu_data.x86 < 4)
- panic("Kernel requires i486+ for 'invlpg' and other features");
+ if (!boot_cpu_has(X86_FEATURE_STIBP) ||
+ !cpu_smt_possible() ||
+ (spectre_v2_in_eibrs_mode(spectre_v2_enabled) &&
+ !boot_cpu_has(X86_FEATURE_AUTOIBRS))) {
+ spectre_v2_user_stibp = SPECTRE_V2_USER_NONE;
+ return;
+ }
+
+ if (spectre_v2_user_stibp != SPECTRE_V2_USER_NONE &&
+ (retbleed_mitigation == RETBLEED_MITIGATION_UNRET ||
+ retbleed_mitigation == RETBLEED_MITIGATION_IBPB)) {
+ if (spectre_v2_user_stibp != SPECTRE_V2_USER_STRICT &&
+ spectre_v2_user_stibp != SPECTRE_V2_USER_STRICT_PREFERRED)
+ pr_info("Selecting STIBP always-on mode to complement retbleed mitigation\n");
+ spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT_PREFERRED;
+ }
+ pr_info("%s\n", spectre_v2_user_strings[spectre_v2_user_stibp]);
+}
+
+static void __init spectre_v2_user_apply_mitigation(void)
+{
+ /* Initialize Indirect Branch Prediction Barrier */
+ if (spectre_v2_user_ibpb != SPECTRE_V2_USER_NONE) {
+ static_branch_enable(&switch_vcpu_ibpb);
- init_utsname()->machine[1] =
- '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
- alternative_instructions();
+ switch (spectre_v2_user_ibpb) {
+ case SPECTRE_V2_USER_STRICT:
+ static_branch_enable(&switch_mm_always_ibpb);
+ break;
+ case SPECTRE_V2_USER_PRCTL:
+ case SPECTRE_V2_USER_SECCOMP:
+ static_branch_enable(&switch_mm_cond_ibpb);
+ break;
+ default:
+ break;
+ }
+ pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n",
+ static_key_enabled(&switch_mm_always_ibpb) ?
+ "always-on" : "conditional");
+ }
+}
+
+static const char * const spectre_v2_strings[] = {
+ [SPECTRE_V2_NONE] = "Vulnerable",
+ [SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines",
+ [SPECTRE_V2_LFENCE] = "Vulnerable: LFENCE",
+ [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced / Automatic IBRS",
+ [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced / Automatic IBRS + LFENCE",
+ [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced / Automatic IBRS + Retpolines",
+ [SPECTRE_V2_IBRS] = "Mitigation: IBRS",
+};
+
+static bool nospectre_v2 __ro_after_init;
+
+static int __init nospectre_v2_parse_cmdline(char *str)
+{
+ nospectre_v2 = true;
+ spectre_v2_cmd = SPECTRE_V2_CMD_NONE;
+ return 0;
+}
+early_param("nospectre_v2", nospectre_v2_parse_cmdline);
+
+static int __init spectre_v2_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (nospectre_v2)
+ return 0;
+
+ if (!strcmp(str, "off")) {
+ spectre_v2_cmd = SPECTRE_V2_CMD_NONE;
+ } else if (!strcmp(str, "on")) {
+ spectre_v2_cmd = SPECTRE_V2_CMD_FORCE;
+ setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
+ setup_force_cpu_bug(X86_BUG_SPECTRE_V2_USER);
+ } else if (!strcmp(str, "retpoline")) {
+ spectre_v2_cmd = SPECTRE_V2_CMD_RETPOLINE;
+ } else if (!strcmp(str, "retpoline,amd") ||
+ !strcmp(str, "retpoline,lfence")) {
+ spectre_v2_cmd = SPECTRE_V2_CMD_RETPOLINE_LFENCE;
+ } else if (!strcmp(str, "retpoline,generic")) {
+ spectre_v2_cmd = SPECTRE_V2_CMD_RETPOLINE_GENERIC;
+ } else if (!strcmp(str, "eibrs")) {
+ spectre_v2_cmd = SPECTRE_V2_CMD_EIBRS;
+ } else if (!strcmp(str, "eibrs,lfence")) {
+ spectre_v2_cmd = SPECTRE_V2_CMD_EIBRS_LFENCE;
+ } else if (!strcmp(str, "eibrs,retpoline")) {
+ spectre_v2_cmd = SPECTRE_V2_CMD_EIBRS_RETPOLINE;
+ } else if (!strcmp(str, "auto")) {
+ spectre_v2_cmd = SPECTRE_V2_CMD_AUTO;
+ } else if (!strcmp(str, "ibrs")) {
+ spectre_v2_cmd = SPECTRE_V2_CMD_IBRS;
+ } else {
+ pr_err("Ignoring unknown spectre_v2 option (%s).", str);
+ }
+
+ return 0;
+}
+early_param("spectre_v2", spectre_v2_parse_cmdline);
+
+static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void)
+{
+ if (!IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)) {
+ pr_err("Kernel not compiled with retpoline; no mitigation available!");
+ return SPECTRE_V2_NONE;
+ }
+
+ return SPECTRE_V2_RETPOLINE;
+}
+
+static bool __ro_after_init rrsba_disabled;
+
+/* Disable in-kernel use of non-RSB RET predictors */
+static void __init spec_ctrl_disable_kernel_rrsba(void)
+{
+ if (rrsba_disabled)
+ return;
+
+ if (!(x86_arch_cap_msr & ARCH_CAP_RRSBA)) {
+ rrsba_disabled = true;
+ return;
+ }
+
+ if (!boot_cpu_has(X86_FEATURE_RRSBA_CTRL))
+ return;
+
+ x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S;
+ update_spec_ctrl(x86_spec_ctrl_base);
+ rrsba_disabled = true;
+}
+
+static void __init spectre_v2_select_rsb_mitigation(enum spectre_v2_mitigation mode)
+{
+ /*
+ * WARNING! There are many subtleties to consider when changing *any*
+ * code related to RSB-related mitigations. Before doing so, carefully
+ * read the following document, and update if necessary:
+ *
+ * Documentation/admin-guide/hw-vuln/rsb.rst
+ *
+ * In an overly simplified nutshell:
+ *
+ * - User->user RSB attacks are conditionally mitigated during
+ * context switches by cond_mitigation -> write_ibpb().
+ *
+ * - User->kernel and guest->host attacks are mitigated by eIBRS or
+ * RSB filling.
+ *
+ * Though, depending on config, note that other alternative
+ * mitigations may end up getting used instead, e.g., IBPB on
+ * entry/vmexit, call depth tracking, or return thunks.
+ */
+
+ switch (mode) {
+ case SPECTRE_V2_NONE:
+ break;
+
+ case SPECTRE_V2_EIBRS:
+ case SPECTRE_V2_EIBRS_LFENCE:
+ case SPECTRE_V2_EIBRS_RETPOLINE:
+ if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) {
+ pr_info("Spectre v2 / PBRSB-eIBRS: Retire a single CALL on VMEXIT\n");
+ setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT_LITE);
+ }
+ break;
+
+ case SPECTRE_V2_RETPOLINE:
+ case SPECTRE_V2_LFENCE:
+ case SPECTRE_V2_IBRS:
+ pr_info("Spectre v2 / SpectreRSB: Filling RSB on context switch and VMEXIT\n");
+ setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
+ setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT);
+ break;
+
+ default:
+ pr_warn_once("Unknown Spectre v2 mode, disabling RSB mitigation\n");
+ dump_stack();
+ break;
+ }
+}
+
+/*
+ * Set BHI_DIS_S to prevent indirect branches in kernel to be influenced by
+ * branch history in userspace. Not needed if BHI_NO is set.
+ */
+static bool __init spec_ctrl_bhi_dis(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_BHI_CTRL))
+ return false;
+
+ x86_spec_ctrl_base |= SPEC_CTRL_BHI_DIS_S;
+ update_spec_ctrl(x86_spec_ctrl_base);
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_HW);
+
+ return true;
+}
+
+enum bhi_mitigations {
+ BHI_MITIGATION_OFF,
+ BHI_MITIGATION_AUTO,
+ BHI_MITIGATION_ON,
+ BHI_MITIGATION_VMEXIT_ONLY,
+};
+
+static enum bhi_mitigations bhi_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_SPECTRE_BHI) ? BHI_MITIGATION_AUTO : BHI_MITIGATION_OFF;
+
+static int __init spectre_bhi_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off"))
+ bhi_mitigation = BHI_MITIGATION_OFF;
+ else if (!strcmp(str, "on"))
+ bhi_mitigation = BHI_MITIGATION_ON;
+ else if (!strcmp(str, "vmexit"))
+ bhi_mitigation = BHI_MITIGATION_VMEXIT_ONLY;
+ else
+ pr_err("Ignoring unknown spectre_bhi option (%s)", str);
+
+ return 0;
+}
+early_param("spectre_bhi", spectre_bhi_parse_cmdline);
+
+static void __init bhi_select_mitigation(void)
+{
+ if (!boot_cpu_has(X86_BUG_BHI))
+ bhi_mitigation = BHI_MITIGATION_OFF;
+
+ if (bhi_mitigation != BHI_MITIGATION_AUTO)
+ return;
+
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST)) {
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL))
+ bhi_mitigation = BHI_MITIGATION_ON;
+ else
+ bhi_mitigation = BHI_MITIGATION_VMEXIT_ONLY;
+ } else {
+ bhi_mitigation = BHI_MITIGATION_OFF;
+ }
+}
+
+static void __init bhi_update_mitigation(void)
+{
+ if (spectre_v2_cmd == SPECTRE_V2_CMD_NONE)
+ bhi_mitigation = BHI_MITIGATION_OFF;
+}
+
+static void __init bhi_apply_mitigation(void)
+{
+ if (bhi_mitigation == BHI_MITIGATION_OFF)
+ return;
+
+ /* Retpoline mitigates against BHI unless the CPU has RRSBA behavior */
+ if (boot_cpu_has(X86_FEATURE_RETPOLINE) &&
+ !boot_cpu_has(X86_FEATURE_RETPOLINE_LFENCE)) {
+ spec_ctrl_disable_kernel_rrsba();
+ if (rrsba_disabled)
+ return;
+ }
+
+ if (!IS_ENABLED(CONFIG_X86_64))
+ return;
+
+ /* Mitigate in hardware if supported */
+ if (spec_ctrl_bhi_dis())
+ return;
+
+ if (bhi_mitigation == BHI_MITIGATION_VMEXIT_ONLY) {
+ pr_info("Spectre BHI mitigation: SW BHB clearing on VM exit only\n");
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_VMEXIT);
+ return;
+ }
+
+ pr_info("Spectre BHI mitigation: SW BHB clearing on syscall and VM exit\n");
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP);
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_VMEXIT);
+}
+
+static void __init spectre_v2_select_mitigation(void)
+{
+ if ((spectre_v2_cmd == SPECTRE_V2_CMD_RETPOLINE ||
+ spectre_v2_cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE ||
+ spectre_v2_cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC ||
+ spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_LFENCE ||
+ spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) &&
+ !IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)) {
+ pr_err("RETPOLINE selected but not compiled in. Switching to AUTO select\n");
+ spectre_v2_cmd = SPECTRE_V2_CMD_AUTO;
+ }
+
+ if ((spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS ||
+ spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_LFENCE ||
+ spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) &&
+ !boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) {
+ pr_err("EIBRS selected but CPU doesn't have Enhanced or Automatic IBRS. Switching to AUTO select\n");
+ spectre_v2_cmd = SPECTRE_V2_CMD_AUTO;
+ }
+
+ if ((spectre_v2_cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE ||
+ spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_LFENCE) &&
+ !boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) {
+ pr_err("LFENCE selected, but CPU doesn't have a serializing LFENCE. Switching to AUTO select\n");
+ spectre_v2_cmd = SPECTRE_V2_CMD_AUTO;
+ }
+
+ if (spectre_v2_cmd == SPECTRE_V2_CMD_IBRS && !IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY)) {
+ pr_err("IBRS selected but not compiled in. Switching to AUTO select\n");
+ spectre_v2_cmd = SPECTRE_V2_CMD_AUTO;
+ }
+
+ if (spectre_v2_cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
+ pr_err("IBRS selected but not Intel CPU. Switching to AUTO select\n");
+ spectre_v2_cmd = SPECTRE_V2_CMD_AUTO;
+ }
+
+ if (spectre_v2_cmd == SPECTRE_V2_CMD_IBRS && !boot_cpu_has(X86_FEATURE_IBRS)) {
+ pr_err("IBRS selected but CPU doesn't have IBRS. Switching to AUTO select\n");
+ spectre_v2_cmd = SPECTRE_V2_CMD_AUTO;
+ }
+
+ if (spectre_v2_cmd == SPECTRE_V2_CMD_IBRS && cpu_feature_enabled(X86_FEATURE_XENPV)) {
+ pr_err("IBRS selected but running as XenPV guest. Switching to AUTO select\n");
+ spectre_v2_cmd = SPECTRE_V2_CMD_AUTO;
+ }
+
+ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) {
+ spectre_v2_cmd = SPECTRE_V2_CMD_NONE;
+ return;
+ }
+
+ switch (spectre_v2_cmd) {
+ case SPECTRE_V2_CMD_NONE:
+ return;
+
+ case SPECTRE_V2_CMD_AUTO:
+ if (!should_mitigate_vuln(X86_BUG_SPECTRE_V2))
+ break;
+ fallthrough;
+ case SPECTRE_V2_CMD_FORCE:
+ if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) {
+ spectre_v2_enabled = SPECTRE_V2_EIBRS;
+ break;
+ }
+
+ spectre_v2_enabled = spectre_v2_select_retpoline();
+ break;
+
+ case SPECTRE_V2_CMD_RETPOLINE_LFENCE:
+ pr_err(SPECTRE_V2_LFENCE_MSG);
+ spectre_v2_enabled = SPECTRE_V2_LFENCE;
+ break;
+
+ case SPECTRE_V2_CMD_RETPOLINE_GENERIC:
+ spectre_v2_enabled = SPECTRE_V2_RETPOLINE;
+ break;
+
+ case SPECTRE_V2_CMD_RETPOLINE:
+ spectre_v2_enabled = spectre_v2_select_retpoline();
+ break;
+
+ case SPECTRE_V2_CMD_IBRS:
+ spectre_v2_enabled = SPECTRE_V2_IBRS;
+ break;
+
+ case SPECTRE_V2_CMD_EIBRS:
+ spectre_v2_enabled = SPECTRE_V2_EIBRS;
+ break;
+
+ case SPECTRE_V2_CMD_EIBRS_LFENCE:
+ spectre_v2_enabled = SPECTRE_V2_EIBRS_LFENCE;
+ break;
+
+ case SPECTRE_V2_CMD_EIBRS_RETPOLINE:
+ spectre_v2_enabled = SPECTRE_V2_EIBRS_RETPOLINE;
+ break;
+ }
+}
+
+static void __init spectre_v2_update_mitigation(void)
+{
+ if (spectre_v2_cmd == SPECTRE_V2_CMD_AUTO &&
+ !spectre_v2_in_eibrs_mode(spectre_v2_enabled)) {
+ if (IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY) &&
+ boot_cpu_has_bug(X86_BUG_RETBLEED) &&
+ retbleed_mitigation != RETBLEED_MITIGATION_NONE &&
+ retbleed_mitigation != RETBLEED_MITIGATION_STUFF &&
+ boot_cpu_has(X86_FEATURE_IBRS) &&
+ boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
+ spectre_v2_enabled = SPECTRE_V2_IBRS;
+ }
+ }
+
+ if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
+ pr_info("%s\n", spectre_v2_strings[spectre_v2_enabled]);
+}
+
+static void __init spectre_v2_apply_mitigation(void)
+{
+ if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled())
+ pr_err(SPECTRE_V2_EIBRS_EBPF_MSG);
+
+ if (spectre_v2_in_ibrs_mode(spectre_v2_enabled)) {
+ if (boot_cpu_has(X86_FEATURE_AUTOIBRS)) {
+ msr_set_bit(MSR_EFER, _EFER_AUTOIBRS);
+ } else {
+ x86_spec_ctrl_base |= SPEC_CTRL_IBRS;
+ update_spec_ctrl(x86_spec_ctrl_base);
+ }
+ }
+
+ switch (spectre_v2_enabled) {
+ case SPECTRE_V2_NONE:
+ return;
+
+ case SPECTRE_V2_EIBRS:
+ break;
+
+ case SPECTRE_V2_IBRS:
+ setup_force_cpu_cap(X86_FEATURE_KERNEL_IBRS);
+ if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED))
+ pr_warn(SPECTRE_V2_IBRS_PERF_MSG);
+ break;
+
+ case SPECTRE_V2_LFENCE:
+ case SPECTRE_V2_EIBRS_LFENCE:
+ setup_force_cpu_cap(X86_FEATURE_RETPOLINE_LFENCE);
+ fallthrough;
+
+ case SPECTRE_V2_RETPOLINE:
+ case SPECTRE_V2_EIBRS_RETPOLINE:
+ setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
+ break;
+ }
+
+ /*
+ * Disable alternate RSB predictions in kernel when indirect CALLs and
+ * JMPs gets protection against BHI and Intramode-BTI, but RET
+ * prediction from a non-RSB predictor is still a risk.
+ */
+ if (spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE ||
+ spectre_v2_enabled == SPECTRE_V2_EIBRS_RETPOLINE ||
+ spectre_v2_enabled == SPECTRE_V2_RETPOLINE)
+ spec_ctrl_disable_kernel_rrsba();
+
+ spectre_v2_select_rsb_mitigation(spectre_v2_enabled);
+
+ /*
+ * Retpoline protects the kernel, but doesn't protect firmware. IBRS
+ * and Enhanced IBRS protect firmware too, so enable IBRS around
+ * firmware calls only when IBRS / Enhanced / Automatic IBRS aren't
+ * otherwise enabled.
+ *
+ * Use "spectre_v2_enabled" to check Enhanced IBRS instead of
+ * boot_cpu_has(), because the user might select retpoline on the kernel
+ * command line and if the CPU supports Enhanced IBRS, kernel might
+ * un-intentionally not enable IBRS around firmware calls.
+ */
+ if (boot_cpu_has_bug(X86_BUG_RETBLEED) &&
+ boot_cpu_has(X86_FEATURE_IBPB) &&
+ (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
+ boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)) {
+
+ if (retbleed_mitigation != RETBLEED_MITIGATION_IBPB) {
+ setup_force_cpu_cap(X86_FEATURE_USE_IBPB_FW);
+ pr_info("Enabling Speculation Barrier for firmware calls\n");
+ }
+
+ } else if (boot_cpu_has(X86_FEATURE_IBRS) &&
+ !spectre_v2_in_ibrs_mode(spectre_v2_enabled)) {
+ setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW);
+ pr_info("Enabling Restricted Speculation for firmware calls\n");
+ }
+}
+
+static void update_stibp_msr(void * __unused)
+{
+ u64 val = spec_ctrl_current() | (x86_spec_ctrl_base & SPEC_CTRL_STIBP);
+ update_spec_ctrl(val);
+}
+
+/* Update x86_spec_ctrl_base in case SMT state changed. */
+static void update_stibp_strict(void)
+{
+ u64 mask = x86_spec_ctrl_base & ~SPEC_CTRL_STIBP;
+
+ if (sched_smt_active())
+ mask |= SPEC_CTRL_STIBP;
+
+ if (mask == x86_spec_ctrl_base)
+ return;
+
+ pr_info("Update user space SMT mitigation: STIBP %s\n",
+ mask & SPEC_CTRL_STIBP ? "always-on" : "off");
+ x86_spec_ctrl_base = mask;
+ on_each_cpu(update_stibp_msr, NULL, 1);
+}
+
+/* Update the static key controlling the evaluation of TIF_SPEC_IB */
+static void update_indir_branch_cond(void)
+{
+ if (sched_smt_active())
+ static_branch_enable(&switch_to_cond_stibp);
+ else
+ static_branch_disable(&switch_to_cond_stibp);
+}
+
+#undef pr_fmt
+#define pr_fmt(fmt) fmt
+
+/* Update the static key controlling the MDS CPU buffer clear in idle */
+static void update_mds_branch_idle(void)
+{
+ /*
+ * Enable the idle clearing if SMT is active on CPUs which are
+ * affected only by MSBDS and not any other MDS variant.
+ *
+ * The other variants cannot be mitigated when SMT is enabled, so
+ * clearing the buffers on idle just to prevent the Store Buffer
+ * repartitioning leak would be a window dressing exercise.
+ */
+ if (!boot_cpu_has_bug(X86_BUG_MSBDS_ONLY))
+ return;
+
+ if (sched_smt_active()) {
+ static_branch_enable(&cpu_buf_idle_clear);
+ } else if (mmio_mitigation == MMIO_MITIGATION_OFF ||
+ (x86_arch_cap_msr & ARCH_CAP_FBSDP_NO)) {
+ static_branch_disable(&cpu_buf_idle_clear);
+ }
+}
+
+#undef pr_fmt
+#define pr_fmt(fmt) "Speculative Store Bypass: " fmt
+
+static enum ssb_mitigation ssb_mode __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_SSB) ? SPEC_STORE_BYPASS_AUTO : SPEC_STORE_BYPASS_NONE;
+
+static const char * const ssb_strings[] = {
+ [SPEC_STORE_BYPASS_NONE] = "Vulnerable",
+ [SPEC_STORE_BYPASS_DISABLE] = "Mitigation: Speculative Store Bypass disabled",
+ [SPEC_STORE_BYPASS_PRCTL] = "Mitigation: Speculative Store Bypass disabled via prctl",
+ [SPEC_STORE_BYPASS_SECCOMP] = "Mitigation: Speculative Store Bypass disabled via prctl and seccomp",
+};
+
+static bool nossb __ro_after_init;
+
+static int __init nossb_parse_cmdline(char *str)
+{
+ nossb = true;
+ ssb_mode = SPEC_STORE_BYPASS_NONE;
+ return 0;
+}
+early_param("nospec_store_bypass_disable", nossb_parse_cmdline);
+
+static int __init ssb_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (nossb)
+ return 0;
+
+ if (!strcmp(str, "auto"))
+ ssb_mode = SPEC_STORE_BYPASS_AUTO;
+ else if (!strcmp(str, "on"))
+ ssb_mode = SPEC_STORE_BYPASS_DISABLE;
+ else if (!strcmp(str, "off"))
+ ssb_mode = SPEC_STORE_BYPASS_NONE;
+ else if (!strcmp(str, "prctl"))
+ ssb_mode = SPEC_STORE_BYPASS_PRCTL;
+ else if (!strcmp(str, "seccomp"))
+ ssb_mode = IS_ENABLED(CONFIG_SECCOMP) ?
+ SPEC_STORE_BYPASS_SECCOMP : SPEC_STORE_BYPASS_PRCTL;
+ else
+ pr_err("Ignoring unknown spec_store_bypass_disable option (%s).\n",
+ str);
+
+ return 0;
+}
+early_param("spec_store_bypass_disable", ssb_parse_cmdline);
+
+static void __init ssb_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) {
+ ssb_mode = SPEC_STORE_BYPASS_NONE;
+ return;
+ }
+
+ if (ssb_mode == SPEC_STORE_BYPASS_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_SPEC_STORE_BYPASS))
+ ssb_mode = SPEC_STORE_BYPASS_PRCTL;
+ else
+ ssb_mode = SPEC_STORE_BYPASS_NONE;
+ }
+
+ if (!boot_cpu_has(X86_FEATURE_SSBD))
+ ssb_mode = SPEC_STORE_BYPASS_NONE;
+
+ pr_info("%s\n", ssb_strings[ssb_mode]);
+}
+
+static void __init ssb_apply_mitigation(void)
+{
+ /*
+ * We have three CPU feature flags that are in play here:
+ * - X86_BUG_SPEC_STORE_BYPASS - CPU is susceptible.
+ * - X86_FEATURE_SSBD - CPU is able to turn off speculative store bypass
+ * - X86_FEATURE_SPEC_STORE_BYPASS_DISABLE - engage the mitigation
+ */
+ if (ssb_mode == SPEC_STORE_BYPASS_DISABLE) {
+ setup_force_cpu_cap(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE);
+ /*
+ * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD may
+ * use a completely different MSR and bit dependent on family.
+ */
+ if (!static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) &&
+ !static_cpu_has(X86_FEATURE_AMD_SSBD)) {
+ x86_amd_ssb_disable();
+ } else {
+ x86_spec_ctrl_base |= SPEC_CTRL_SSBD;
+ update_spec_ctrl(x86_spec_ctrl_base);
+ }
+ }
+}
+
+#undef pr_fmt
+#define pr_fmt(fmt) "Speculation prctl: " fmt
+
+static void task_update_spec_tif(struct task_struct *tsk)
+{
+ /* Force the update of the real TIF bits */
+ set_tsk_thread_flag(tsk, TIF_SPEC_FORCE_UPDATE);
+
+ /*
+ * Immediately update the speculation control MSRs for the current
+ * task, but for a non-current task delay setting the CPU
+ * mitigation until it is scheduled next.
+ *
+ * This can only happen for SECCOMP mitigation. For PRCTL it's
+ * always the current task.
+ */
+ if (tsk == current)
+ speculation_ctrl_update_current();
+}
+
+static int l1d_flush_prctl_set(struct task_struct *task, unsigned long ctrl)
+{
+
+ if (!static_branch_unlikely(&switch_mm_cond_l1d_flush))
+ return -EPERM;
+
+ switch (ctrl) {
+ case PR_SPEC_ENABLE:
+ set_ti_thread_flag(&task->thread_info, TIF_SPEC_L1D_FLUSH);
+ return 0;
+ case PR_SPEC_DISABLE:
+ clear_ti_thread_flag(&task->thread_info, TIF_SPEC_L1D_FLUSH);
+ return 0;
+ default:
+ return -ERANGE;
+ }
+}
+
+static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl)
+{
+ if (ssb_mode != SPEC_STORE_BYPASS_PRCTL &&
+ ssb_mode != SPEC_STORE_BYPASS_SECCOMP)
+ return -ENXIO;
+
+ switch (ctrl) {
+ case PR_SPEC_ENABLE:
+ /* If speculation is force disabled, enable is not allowed */
+ if (task_spec_ssb_force_disable(task))
+ return -EPERM;
+ task_clear_spec_ssb_disable(task);
+ task_clear_spec_ssb_noexec(task);
+ task_update_spec_tif(task);
+ break;
+ case PR_SPEC_DISABLE:
+ task_set_spec_ssb_disable(task);
+ task_clear_spec_ssb_noexec(task);
+ task_update_spec_tif(task);
+ break;
+ case PR_SPEC_FORCE_DISABLE:
+ task_set_spec_ssb_disable(task);
+ task_set_spec_ssb_force_disable(task);
+ task_clear_spec_ssb_noexec(task);
+ task_update_spec_tif(task);
+ break;
+ case PR_SPEC_DISABLE_NOEXEC:
+ if (task_spec_ssb_force_disable(task))
+ return -EPERM;
+ task_set_spec_ssb_disable(task);
+ task_set_spec_ssb_noexec(task);
+ task_update_spec_tif(task);
+ break;
+ default:
+ return -ERANGE;
+ }
+ return 0;
+}
+
+static bool is_spec_ib_user_controlled(void)
+{
+ return spectre_v2_user_ibpb == SPECTRE_V2_USER_PRCTL ||
+ spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_PRCTL ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP;
+}
+
+static int ib_prctl_set(struct task_struct *task, unsigned long ctrl)
+{
+ switch (ctrl) {
+ case PR_SPEC_ENABLE:
+ if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE &&
+ spectre_v2_user_stibp == SPECTRE_V2_USER_NONE)
+ return 0;
+
+ /*
+ * With strict mode for both IBPB and STIBP, the instruction
+ * code paths avoid checking this task flag and instead,
+ * unconditionally run the instruction. However, STIBP and IBPB
+ * are independent and either can be set to conditionally
+ * enabled regardless of the mode of the other.
+ *
+ * If either is set to conditional, allow the task flag to be
+ * updated, unless it was force-disabled by a previous prctl
+ * call. Currently, this is possible on an AMD CPU which has the
+ * feature X86_FEATURE_AMD_STIBP_ALWAYS_ON. In this case, if the
+ * kernel is booted with 'spectre_v2_user=seccomp', then
+ * spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP and
+ * spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED.
+ */
+ if (!is_spec_ib_user_controlled() ||
+ task_spec_ib_force_disable(task))
+ return -EPERM;
+
+ task_clear_spec_ib_disable(task);
+ task_update_spec_tif(task);
+ break;
+ case PR_SPEC_DISABLE:
+ case PR_SPEC_FORCE_DISABLE:
+ /*
+ * Indirect branch speculation is always allowed when
+ * mitigation is force disabled.
+ */
+ if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE &&
+ spectre_v2_user_stibp == SPECTRE_V2_USER_NONE)
+ return -EPERM;
+
+ if (!is_spec_ib_user_controlled())
+ return 0;
+
+ task_set_spec_ib_disable(task);
+ if (ctrl == PR_SPEC_FORCE_DISABLE)
+ task_set_spec_ib_force_disable(task);
+ task_update_spec_tif(task);
+ if (task == current)
+ indirect_branch_prediction_barrier();
+ break;
+ default:
+ return -ERANGE;
+ }
+ return 0;
+}
+
+int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which,
+ unsigned long ctrl)
+{
+ switch (which) {
+ case PR_SPEC_STORE_BYPASS:
+ return ssb_prctl_set(task, ctrl);
+ case PR_SPEC_INDIRECT_BRANCH:
+ return ib_prctl_set(task, ctrl);
+ case PR_SPEC_L1D_FLUSH:
+ return l1d_flush_prctl_set(task, ctrl);
+ default:
+ return -ENODEV;
+ }
+}
+
+#ifdef CONFIG_SECCOMP
+void arch_seccomp_spec_mitigate(struct task_struct *task)
+{
+ if (ssb_mode == SPEC_STORE_BYPASS_SECCOMP)
+ ssb_prctl_set(task, PR_SPEC_FORCE_DISABLE);
+ if (spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP)
+ ib_prctl_set(task, PR_SPEC_FORCE_DISABLE);
+}
+#endif
+
+static int l1d_flush_prctl_get(struct task_struct *task)
+{
+ if (!static_branch_unlikely(&switch_mm_cond_l1d_flush))
+ return PR_SPEC_FORCE_DISABLE;
+
+ if (test_ti_thread_flag(&task->thread_info, TIF_SPEC_L1D_FLUSH))
+ return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
+ else
+ return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
+}
+
+static int ssb_prctl_get(struct task_struct *task)
+{
+ switch (ssb_mode) {
+ case SPEC_STORE_BYPASS_NONE:
+ if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
+ return PR_SPEC_ENABLE;
+ return PR_SPEC_NOT_AFFECTED;
+ case SPEC_STORE_BYPASS_DISABLE:
+ return PR_SPEC_DISABLE;
+ case SPEC_STORE_BYPASS_SECCOMP:
+ case SPEC_STORE_BYPASS_PRCTL:
+ case SPEC_STORE_BYPASS_AUTO:
+ if (task_spec_ssb_force_disable(task))
+ return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE;
+ if (task_spec_ssb_noexec(task))
+ return PR_SPEC_PRCTL | PR_SPEC_DISABLE_NOEXEC;
+ if (task_spec_ssb_disable(task))
+ return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
+ return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
+ }
+ BUG();
+}
+
+static int ib_prctl_get(struct task_struct *task)
+{
+ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
+ return PR_SPEC_NOT_AFFECTED;
+
+ if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE &&
+ spectre_v2_user_stibp == SPECTRE_V2_USER_NONE)
+ return PR_SPEC_ENABLE;
+ else if (is_spec_ib_user_controlled()) {
+ if (task_spec_ib_force_disable(task))
+ return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE;
+ if (task_spec_ib_disable(task))
+ return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
+ return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
+ } else if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED)
+ return PR_SPEC_DISABLE;
+ else
+ return PR_SPEC_NOT_AFFECTED;
+}
+
+int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which)
+{
+ switch (which) {
+ case PR_SPEC_STORE_BYPASS:
+ return ssb_prctl_get(task);
+ case PR_SPEC_INDIRECT_BRANCH:
+ return ib_prctl_get(task);
+ case PR_SPEC_L1D_FLUSH:
+ return l1d_flush_prctl_get(task);
+ default:
+ return -ENODEV;
+ }
+}
+
+void x86_spec_ctrl_setup_ap(void)
+{
+ if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL))
+ update_spec_ctrl(x86_spec_ctrl_base);
+
+ if (ssb_mode == SPEC_STORE_BYPASS_DISABLE)
+ x86_amd_ssb_disable();
+}
+
+bool itlb_multihit_kvm_mitigation;
+EXPORT_SYMBOL_FOR_KVM(itlb_multihit_kvm_mitigation);
+
+#undef pr_fmt
+#define pr_fmt(fmt) "L1TF: " fmt
+
+/* Default mitigation for L1TF-affected CPUs */
+enum l1tf_mitigations l1tf_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_L1TF) ? L1TF_MITIGATION_AUTO : L1TF_MITIGATION_OFF;
+EXPORT_SYMBOL_FOR_KVM(l1tf_mitigation);
+enum vmx_l1d_flush_state l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
+EXPORT_SYMBOL_FOR_KVM(l1tf_vmx_mitigation);
+
+/*
+ * These CPUs all support 44bits physical address space internally in the
+ * cache but CPUID can report a smaller number of physical address bits.
+ *
+ * The L1TF mitigation uses the top most address bit for the inversion of
+ * non present PTEs. When the installed memory reaches into the top most
+ * address bit due to memory holes, which has been observed on machines
+ * which report 36bits physical address bits and have 32G RAM installed,
+ * then the mitigation range check in l1tf_select_mitigation() triggers.
+ * This is a false positive because the mitigation is still possible due to
+ * the fact that the cache uses 44bit internally. Use the cache bits
+ * instead of the reported physical bits and adjust them on the affected
+ * machines to 44bit if the reported bits are less than 44.
+ */
+static void override_cache_bits(struct cpuinfo_x86 *c)
+{
+ if (c->x86 != 6)
+ return;
+
+ switch (c->x86_vfm) {
+ case INTEL_NEHALEM:
+ case INTEL_WESTMERE:
+ case INTEL_SANDYBRIDGE:
+ case INTEL_IVYBRIDGE:
+ case INTEL_HASWELL:
+ case INTEL_HASWELL_L:
+ case INTEL_HASWELL_G:
+ case INTEL_BROADWELL:
+ case INTEL_BROADWELL_G:
+ case INTEL_SKYLAKE_L:
+ case INTEL_SKYLAKE:
+ case INTEL_KABYLAKE_L:
+ case INTEL_KABYLAKE:
+ if (c->x86_cache_bits < 44)
+ c->x86_cache_bits = 44;
+ break;
+ }
+}
+
+static void __init l1tf_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
+ l1tf_mitigation = L1TF_MITIGATION_OFF;
+ return;
+ }
+
+ if (l1tf_mitigation != L1TF_MITIGATION_AUTO)
+ return;
+
+ if (!should_mitigate_vuln(X86_BUG_L1TF)) {
+ l1tf_mitigation = L1TF_MITIGATION_OFF;
+ return;
+ }
+
+ if (smt_mitigations == SMT_MITIGATIONS_ON)
+ l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT;
+ else
+ l1tf_mitigation = L1TF_MITIGATION_FLUSH;
+}
+
+static void __init l1tf_apply_mitigation(void)
+{
+ u64 half_pa;
+
+ if (!boot_cpu_has_bug(X86_BUG_L1TF))
+ return;
+
+ override_cache_bits(&boot_cpu_data);
+
+ switch (l1tf_mitigation) {
+ case L1TF_MITIGATION_OFF:
+ case L1TF_MITIGATION_FLUSH_NOWARN:
+ case L1TF_MITIGATION_FLUSH:
+ case L1TF_MITIGATION_AUTO:
+ break;
+ case L1TF_MITIGATION_FLUSH_NOSMT:
+ case L1TF_MITIGATION_FULL:
+ cpu_smt_disable(false);
+ break;
+ case L1TF_MITIGATION_FULL_FORCE:
+ cpu_smt_disable(true);
+ break;
+ }
+
+#if CONFIG_PGTABLE_LEVELS == 2
+ pr_warn("Kernel not compiled for PAE. No mitigation for L1TF\n");
+ return;
+#endif
+
+ half_pa = (u64)l1tf_pfn_limit() << PAGE_SHIFT;
+ if (l1tf_mitigation != L1TF_MITIGATION_OFF &&
+ e820__mapped_any(half_pa, ULLONG_MAX - half_pa, E820_TYPE_RAM)) {
+ pr_warn("System has more than MAX_PA/2 memory. L1TF mitigation not effective.\n");
+ pr_info("You may make it effective by booting the kernel with mem=%llu parameter.\n",
+ half_pa);
+ pr_info("However, doing so will make a part of your RAM unusable.\n");
+ pr_info("Reading https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html might help you decide.\n");
+ return;
+ }
+
+ setup_force_cpu_cap(X86_FEATURE_L1TF_PTEINV);
+}
+
+static int __init l1tf_cmdline(char *str)
+{
+ if (!boot_cpu_has_bug(X86_BUG_L1TF))
+ return 0;
+
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off"))
+ l1tf_mitigation = L1TF_MITIGATION_OFF;
+ else if (!strcmp(str, "flush,nowarn"))
+ l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOWARN;
+ else if (!strcmp(str, "flush"))
+ l1tf_mitigation = L1TF_MITIGATION_FLUSH;
+ else if (!strcmp(str, "flush,nosmt"))
+ l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT;
+ else if (!strcmp(str, "full"))
+ l1tf_mitigation = L1TF_MITIGATION_FULL;
+ else if (!strcmp(str, "full,force"))
+ l1tf_mitigation = L1TF_MITIGATION_FULL_FORCE;
+
+ return 0;
+}
+early_param("l1tf", l1tf_cmdline);
+
+#undef pr_fmt
+#define pr_fmt(fmt) "Speculative Return Stack Overflow: " fmt
+
+static const char * const srso_strings[] = {
+ [SRSO_MITIGATION_NONE] = "Vulnerable",
+ [SRSO_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode",
+ [SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED] = "Vulnerable: Safe RET, no microcode",
+ [SRSO_MITIGATION_MICROCODE] = "Vulnerable: Microcode, no safe RET",
+ [SRSO_MITIGATION_NOSMT] = "Mitigation: SMT disabled",
+ [SRSO_MITIGATION_SAFE_RET] = "Mitigation: Safe RET",
+ [SRSO_MITIGATION_IBPB] = "Mitigation: IBPB",
+ [SRSO_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT only",
+ [SRSO_MITIGATION_BP_SPEC_REDUCE] = "Mitigation: Reduced Speculation"
+};
+
+static int __init srso_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off"))
+ srso_mitigation = SRSO_MITIGATION_NONE;
+ else if (!strcmp(str, "microcode"))
+ srso_mitigation = SRSO_MITIGATION_MICROCODE;
+ else if (!strcmp(str, "safe-ret"))
+ srso_mitigation = SRSO_MITIGATION_SAFE_RET;
+ else if (!strcmp(str, "ibpb"))
+ srso_mitigation = SRSO_MITIGATION_IBPB;
+ else if (!strcmp(str, "ibpb-vmexit"))
+ srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT;
+ else
+ pr_err("Ignoring unknown SRSO option (%s).", str);
+
+ return 0;
+}
+early_param("spec_rstack_overflow", srso_parse_cmdline);
+
+#define SRSO_NOTICE "WARNING: See https://kernel.org/doc/html/latest/admin-guide/hw-vuln/srso.html for mitigation options."
+
+static void __init srso_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_SRSO)) {
+ srso_mitigation = SRSO_MITIGATION_NONE;
+ return;
+ }
+
+ if (srso_mitigation == SRSO_MITIGATION_AUTO) {
+ /*
+ * Use safe-RET if user->kernel or guest->host protection is
+ * required. Otherwise the 'microcode' mitigation is sufficient
+ * to protect the user->user and guest->guest vectors.
+ */
+ if (cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST) ||
+ (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL) &&
+ !boot_cpu_has(X86_FEATURE_SRSO_USER_KERNEL_NO))) {
+ srso_mitigation = SRSO_MITIGATION_SAFE_RET;
+ } else if (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER) ||
+ cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST)) {
+ srso_mitigation = SRSO_MITIGATION_MICROCODE;
+ } else {
+ srso_mitigation = SRSO_MITIGATION_NONE;
+ return;
+ }
+ }
+
+ /* Zen1/2 with SMT off aren't vulnerable to SRSO. */
+ if (boot_cpu_data.x86 < 0x19 && !cpu_smt_possible()) {
+ srso_mitigation = SRSO_MITIGATION_NOSMT;
+ return;
+ }
+
+ if (!boot_cpu_has(X86_FEATURE_IBPB_BRTYPE)) {
+ pr_warn("IBPB-extending microcode not applied!\n");
+ pr_warn(SRSO_NOTICE);
+
+ /*
+ * Safe-RET provides partial mitigation without microcode, but
+ * other mitigations require microcode to provide any
+ * mitigations.
+ */
+ if (srso_mitigation == SRSO_MITIGATION_SAFE_RET)
+ srso_mitigation = SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED;
+ else
+ srso_mitigation = SRSO_MITIGATION_UCODE_NEEDED;
+ }
+
+ switch (srso_mitigation) {
+ case SRSO_MITIGATION_SAFE_RET:
+ case SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED:
+ if (boot_cpu_has(X86_FEATURE_SRSO_USER_KERNEL_NO)) {
+ srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT;
+ goto ibpb_on_vmexit;
+ }
+
+ if (!IS_ENABLED(CONFIG_MITIGATION_SRSO)) {
+ pr_err("WARNING: kernel not compiled with MITIGATION_SRSO.\n");
+ srso_mitigation = SRSO_MITIGATION_NONE;
+ }
+ break;
+ibpb_on_vmexit:
+ case SRSO_MITIGATION_IBPB_ON_VMEXIT:
+ if (boot_cpu_has(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) {
+ pr_notice("Reducing speculation to address VM/HV SRSO attack vector.\n");
+ srso_mitigation = SRSO_MITIGATION_BP_SPEC_REDUCE;
+ break;
+ }
+ fallthrough;
+ case SRSO_MITIGATION_IBPB:
+ if (!IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) {
+ pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n");
+ srso_mitigation = SRSO_MITIGATION_NONE;
+ }
+ break;
+ default:
+ break;
+ }
+}
+
+static void __init srso_update_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_SRSO))
+ return;
+
+ /* If retbleed is using IBPB, that works for SRSO as well */
+ if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB &&
+ boot_cpu_has(X86_FEATURE_IBPB_BRTYPE))
+ srso_mitigation = SRSO_MITIGATION_IBPB;
+
+ pr_info("%s\n", srso_strings[srso_mitigation]);
+}
+
+static void __init srso_apply_mitigation(void)
+{
/*
- * kernel_fpu_begin/end() in check_fpu() relies on the patched
- * alternative instructions.
+ * Clear the feature flag if this mitigation is not selected as that
+ * feature flag controls the BpSpecReduce MSR bit toggling in KVM.
*/
- if (cpu_has_fpu)
- check_fpu();
+ if (srso_mitigation != SRSO_MITIGATION_BP_SPEC_REDUCE)
+ setup_clear_cpu_cap(X86_FEATURE_SRSO_BP_SPEC_REDUCE);
+
+ if (srso_mitigation == SRSO_MITIGATION_NONE) {
+ if (boot_cpu_has(X86_FEATURE_SBPB))
+ x86_pred_cmd = PRED_CMD_SBPB;
+ return;
+ }
+
+ switch (srso_mitigation) {
+ case SRSO_MITIGATION_SAFE_RET:
+ case SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED:
+ /*
+ * Enable the return thunk for generated code
+ * like ftrace, static_call, etc.
+ */
+ setup_force_cpu_cap(X86_FEATURE_RETHUNK);
+ setup_force_cpu_cap(X86_FEATURE_UNRET);
+
+ if (boot_cpu_data.x86 == 0x19) {
+ setup_force_cpu_cap(X86_FEATURE_SRSO_ALIAS);
+ set_return_thunk(srso_alias_return_thunk);
+ } else {
+ setup_force_cpu_cap(X86_FEATURE_SRSO);
+ set_return_thunk(srso_return_thunk);
+ }
+ break;
+ case SRSO_MITIGATION_IBPB:
+ setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB);
+ /*
+ * IBPB on entry already obviates the need for
+ * software-based untraining so clear those in case some
+ * other mitigation like Retbleed has selected them.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_UNRET);
+ setup_clear_cpu_cap(X86_FEATURE_RETHUNK);
+ fallthrough;
+ case SRSO_MITIGATION_IBPB_ON_VMEXIT:
+ setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT);
+ /*
+ * There is no need for RSB filling: entry_ibpb() ensures
+ * all predictions, including the RSB, are invalidated,
+ * regardless of IBPB implementation.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT);
+ break;
+ default:
+ break;
+ }
+}
+
+#undef pr_fmt
+#define pr_fmt(fmt) "VMSCAPE: " fmt
+
+enum vmscape_mitigations {
+ VMSCAPE_MITIGATION_NONE,
+ VMSCAPE_MITIGATION_AUTO,
+ VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER,
+ VMSCAPE_MITIGATION_IBPB_ON_VMEXIT,
+};
+
+static const char * const vmscape_strings[] = {
+ [VMSCAPE_MITIGATION_NONE] = "Vulnerable",
+ /* [VMSCAPE_MITIGATION_AUTO] */
+ [VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER] = "Mitigation: IBPB before exit to userspace",
+ [VMSCAPE_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT",
+};
+
+static enum vmscape_mitigations vmscape_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_VMSCAPE) ? VMSCAPE_MITIGATION_AUTO : VMSCAPE_MITIGATION_NONE;
+
+static int __init vmscape_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off")) {
+ vmscape_mitigation = VMSCAPE_MITIGATION_NONE;
+ } else if (!strcmp(str, "ibpb")) {
+ vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER;
+ } else if (!strcmp(str, "force")) {
+ setup_force_cpu_bug(X86_BUG_VMSCAPE);
+ vmscape_mitigation = VMSCAPE_MITIGATION_AUTO;
+ } else {
+ pr_err("Ignoring unknown vmscape=%s option.\n", str);
+ }
+
+ return 0;
+}
+early_param("vmscape", vmscape_parse_cmdline);
+
+static void __init vmscape_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_VMSCAPE) ||
+ !boot_cpu_has(X86_FEATURE_IBPB)) {
+ vmscape_mitigation = VMSCAPE_MITIGATION_NONE;
+ return;
+ }
+
+ if (vmscape_mitigation == VMSCAPE_MITIGATION_AUTO) {
+ if (should_mitigate_vuln(X86_BUG_VMSCAPE))
+ vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER;
+ else
+ vmscape_mitigation = VMSCAPE_MITIGATION_NONE;
+ }
+}
+
+static void __init vmscape_update_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_VMSCAPE))
+ return;
+
+ if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB ||
+ srso_mitigation == SRSO_MITIGATION_IBPB_ON_VMEXIT)
+ vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_ON_VMEXIT;
+
+ pr_info("%s\n", vmscape_strings[vmscape_mitigation]);
+}
+
+static void __init vmscape_apply_mitigation(void)
+{
+ if (vmscape_mitigation == VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER)
+ setup_force_cpu_cap(X86_FEATURE_IBPB_EXIT_TO_USER);
+}
+
+#undef pr_fmt
+#define pr_fmt(fmt) fmt
+
+#define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n"
+#define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n"
+#define MMIO_MSG_SMT "MMIO Stale Data CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/processor_mmio_stale_data.html for more details.\n"
+#define VMSCAPE_MSG_SMT "VMSCAPE: SMT on, STIBP is required for full protection. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/vmscape.html for more details.\n"
+
+void cpu_bugs_smt_update(void)
+{
+ mutex_lock(&spec_ctrl_mutex);
+
+ if (sched_smt_active() && unprivileged_ebpf_enabled() &&
+ spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE)
+ pr_warn_once(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG);
+
+ switch (spectre_v2_user_stibp) {
+ case SPECTRE_V2_USER_NONE:
+ break;
+ case SPECTRE_V2_USER_STRICT:
+ case SPECTRE_V2_USER_STRICT_PREFERRED:
+ update_stibp_strict();
+ break;
+ case SPECTRE_V2_USER_PRCTL:
+ case SPECTRE_V2_USER_SECCOMP:
+ update_indir_branch_cond();
+ break;
+ }
+
+ switch (mds_mitigation) {
+ case MDS_MITIGATION_FULL:
+ case MDS_MITIGATION_AUTO:
+ case MDS_MITIGATION_VMWERV:
+ if (sched_smt_active() && !boot_cpu_has(X86_BUG_MSBDS_ONLY))
+ pr_warn_once(MDS_MSG_SMT);
+ update_mds_branch_idle();
+ break;
+ case MDS_MITIGATION_OFF:
+ break;
+ }
+
+ switch (taa_mitigation) {
+ case TAA_MITIGATION_VERW:
+ case TAA_MITIGATION_AUTO:
+ case TAA_MITIGATION_UCODE_NEEDED:
+ if (sched_smt_active())
+ pr_warn_once(TAA_MSG_SMT);
+ break;
+ case TAA_MITIGATION_TSX_DISABLED:
+ case TAA_MITIGATION_OFF:
+ break;
+ }
+
+ switch (mmio_mitigation) {
+ case MMIO_MITIGATION_VERW:
+ case MMIO_MITIGATION_AUTO:
+ case MMIO_MITIGATION_UCODE_NEEDED:
+ if (sched_smt_active())
+ pr_warn_once(MMIO_MSG_SMT);
+ break;
+ case MMIO_MITIGATION_OFF:
+ break;
+ }
+
+ switch (tsa_mitigation) {
+ case TSA_MITIGATION_USER_KERNEL:
+ case TSA_MITIGATION_VM:
+ case TSA_MITIGATION_AUTO:
+ case TSA_MITIGATION_FULL:
+ /*
+ * TSA-SQ can potentially lead to info leakage between
+ * SMT threads.
+ */
+ if (sched_smt_active())
+ static_branch_enable(&cpu_buf_idle_clear);
+ else
+ static_branch_disable(&cpu_buf_idle_clear);
+ break;
+ case TSA_MITIGATION_NONE:
+ case TSA_MITIGATION_UCODE_NEEDED:
+ break;
+ }
+
+ switch (vmscape_mitigation) {
+ case VMSCAPE_MITIGATION_NONE:
+ case VMSCAPE_MITIGATION_AUTO:
+ break;
+ case VMSCAPE_MITIGATION_IBPB_ON_VMEXIT:
+ case VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER:
+ /*
+ * Hypervisors can be attacked across-threads, warn for SMT when
+ * STIBP is not already enabled system-wide.
+ *
+ * Intel eIBRS (!AUTOIBRS) implies STIBP on.
+ */
+ if (!sched_smt_active() ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED ||
+ (spectre_v2_in_eibrs_mode(spectre_v2_enabled) &&
+ !boot_cpu_has(X86_FEATURE_AUTOIBRS)))
+ break;
+ pr_warn_once(VMSCAPE_MSG_SMT);
+ break;
+ }
+
+ mutex_unlock(&spec_ctrl_mutex);
+}
+
+void __init cpu_select_mitigations(void)
+{
+ /*
+ * Read the SPEC_CTRL MSR to account for reserved bits which may
+ * have unknown values. AMD64_LS_CFG MSR is cached in the early AMD
+ * init code as it is not enumerated and depends on the family.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) {
+ rdmsrq(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+
+ /*
+ * Previously running kernel (kexec), may have some controls
+ * turned ON. Clear them and let the mitigations setup below
+ * rediscover them based on configuration.
+ */
+ x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK;
+ }
+
+ x86_arch_cap_msr = x86_read_arch_cap_msr();
+
+ cpu_print_attack_vectors();
+
+ /* Select the proper CPU mitigations before patching alternatives: */
+ spectre_v1_select_mitigation();
+ spectre_v2_select_mitigation();
+ retbleed_select_mitigation();
+ spectre_v2_user_select_mitigation();
+ ssb_select_mitigation();
+ l1tf_select_mitigation();
+ mds_select_mitigation();
+ taa_select_mitigation();
+ mmio_select_mitigation();
+ rfds_select_mitigation();
+ srbds_select_mitigation();
+ l1d_flush_select_mitigation();
+ srso_select_mitigation();
+ gds_select_mitigation();
+ its_select_mitigation();
+ bhi_select_mitigation();
+ tsa_select_mitigation();
+ vmscape_select_mitigation();
+
+ /*
+ * After mitigations are selected, some may need to update their
+ * choices.
+ */
+ spectre_v2_update_mitigation();
+ /*
+ * retbleed_update_mitigation() relies on the state set by
+ * spectre_v2_update_mitigation(); specifically it wants to know about
+ * spectre_v2=ibrs.
+ */
+ retbleed_update_mitigation();
+ /*
+ * its_update_mitigation() depends on spectre_v2_update_mitigation()
+ * and retbleed_update_mitigation().
+ */
+ its_update_mitigation();
+
+ /*
+ * spectre_v2_user_update_mitigation() depends on
+ * retbleed_update_mitigation(), specifically the STIBP
+ * selection is forced for UNRET or IBPB.
+ */
+ spectre_v2_user_update_mitigation();
+ mds_update_mitigation();
+ taa_update_mitigation();
+ mmio_update_mitigation();
+ rfds_update_mitigation();
+ bhi_update_mitigation();
+ /* srso_update_mitigation() depends on retbleed_update_mitigation(). */
+ srso_update_mitigation();
+ vmscape_update_mitigation();
+
+ spectre_v1_apply_mitigation();
+ spectre_v2_apply_mitigation();
+ retbleed_apply_mitigation();
+ spectre_v2_user_apply_mitigation();
+ ssb_apply_mitigation();
+ l1tf_apply_mitigation();
+ mds_apply_mitigation();
+ taa_apply_mitigation();
+ mmio_apply_mitigation();
+ rfds_apply_mitigation();
+ srbds_apply_mitigation();
+ srso_apply_mitigation();
+ gds_apply_mitigation();
+ its_apply_mitigation();
+ bhi_apply_mitigation();
+ tsa_apply_mitigation();
+ vmscape_apply_mitigation();
+}
+
+#ifdef CONFIG_SYSFS
+
+#define L1TF_DEFAULT_MSG "Mitigation: PTE Inversion"
+
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+static const char * const l1tf_vmx_states[] = {
+ [VMENTER_L1D_FLUSH_AUTO] = "auto",
+ [VMENTER_L1D_FLUSH_NEVER] = "vulnerable",
+ [VMENTER_L1D_FLUSH_COND] = "conditional cache flushes",
+ [VMENTER_L1D_FLUSH_ALWAYS] = "cache flushes",
+ [VMENTER_L1D_FLUSH_EPT_DISABLED] = "EPT disabled",
+ [VMENTER_L1D_FLUSH_NOT_REQUIRED] = "flush not necessary"
+};
+
+static ssize_t l1tf_show_state(char *buf)
+{
+ if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO)
+ return sysfs_emit(buf, "%s\n", L1TF_DEFAULT_MSG);
+
+ if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_EPT_DISABLED ||
+ (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER &&
+ sched_smt_active())) {
+ return sysfs_emit(buf, "%s; VMX: %s\n", L1TF_DEFAULT_MSG,
+ l1tf_vmx_states[l1tf_vmx_mitigation]);
+ }
+
+ return sysfs_emit(buf, "%s; VMX: %s, SMT %s\n", L1TF_DEFAULT_MSG,
+ l1tf_vmx_states[l1tf_vmx_mitigation],
+ sched_smt_active() ? "vulnerable" : "disabled");
+}
+
+static ssize_t itlb_multihit_show_state(char *buf)
+{
+ if (!boot_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
+ !boot_cpu_has(X86_FEATURE_VMX))
+ return sysfs_emit(buf, "KVM: Mitigation: VMX unsupported\n");
+ else if (!(cr4_read_shadow() & X86_CR4_VMXE))
+ return sysfs_emit(buf, "KVM: Mitigation: VMX disabled\n");
+ else if (itlb_multihit_kvm_mitigation)
+ return sysfs_emit(buf, "KVM: Mitigation: Split huge pages\n");
+ else
+ return sysfs_emit(buf, "KVM: Vulnerable\n");
+}
+#else
+static ssize_t l1tf_show_state(char *buf)
+{
+ return sysfs_emit(buf, "%s\n", L1TF_DEFAULT_MSG);
+}
+
+static ssize_t itlb_multihit_show_state(char *buf)
+{
+ return sysfs_emit(buf, "Processor vulnerable\n");
+}
+#endif
+
+static ssize_t mds_show_state(char *buf)
+{
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
+ return sysfs_emit(buf, "%s; SMT Host state unknown\n",
+ mds_strings[mds_mitigation]);
+ }
+
+ if (boot_cpu_has(X86_BUG_MSBDS_ONLY)) {
+ return sysfs_emit(buf, "%s; SMT %s\n", mds_strings[mds_mitigation],
+ (mds_mitigation == MDS_MITIGATION_OFF ? "vulnerable" :
+ sched_smt_active() ? "mitigated" : "disabled"));
+ }
+
+ return sysfs_emit(buf, "%s; SMT %s\n", mds_strings[mds_mitigation],
+ sched_smt_active() ? "vulnerable" : "disabled");
+}
+
+static ssize_t tsx_async_abort_show_state(char *buf)
+{
+ if ((taa_mitigation == TAA_MITIGATION_TSX_DISABLED) ||
+ (taa_mitigation == TAA_MITIGATION_OFF))
+ return sysfs_emit(buf, "%s\n", taa_strings[taa_mitigation]);
+
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
+ return sysfs_emit(buf, "%s; SMT Host state unknown\n",
+ taa_strings[taa_mitigation]);
+ }
+
+ return sysfs_emit(buf, "%s; SMT %s\n", taa_strings[taa_mitigation],
+ sched_smt_active() ? "vulnerable" : "disabled");
+}
+
+static ssize_t mmio_stale_data_show_state(char *buf)
+{
+ if (mmio_mitigation == MMIO_MITIGATION_OFF)
+ return sysfs_emit(buf, "%s\n", mmio_strings[mmio_mitigation]);
+
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
+ return sysfs_emit(buf, "%s; SMT Host state unknown\n",
+ mmio_strings[mmio_mitigation]);
+ }
+
+ return sysfs_emit(buf, "%s; SMT %s\n", mmio_strings[mmio_mitigation],
+ sched_smt_active() ? "vulnerable" : "disabled");
+}
+
+static ssize_t rfds_show_state(char *buf)
+{
+ return sysfs_emit(buf, "%s\n", rfds_strings[rfds_mitigation]);
+}
+
+static ssize_t old_microcode_show_state(char *buf)
+{
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return sysfs_emit(buf, "Unknown: running under hypervisor");
+
+ return sysfs_emit(buf, "Vulnerable\n");
+}
+
+static ssize_t its_show_state(char *buf)
+{
+ return sysfs_emit(buf, "%s\n", its_strings[its_mitigation]);
+}
+
+static char *stibp_state(void)
+{
+ if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) &&
+ !boot_cpu_has(X86_FEATURE_AUTOIBRS))
+ return "";
+
+ switch (spectre_v2_user_stibp) {
+ case SPECTRE_V2_USER_NONE:
+ return "; STIBP: disabled";
+ case SPECTRE_V2_USER_STRICT:
+ return "; STIBP: forced";
+ case SPECTRE_V2_USER_STRICT_PREFERRED:
+ return "; STIBP: always-on";
+ case SPECTRE_V2_USER_PRCTL:
+ case SPECTRE_V2_USER_SECCOMP:
+ if (static_key_enabled(&switch_to_cond_stibp))
+ return "; STIBP: conditional";
+ }
+ return "";
+}
+
+static char *ibpb_state(void)
+{
+ if (boot_cpu_has(X86_FEATURE_IBPB)) {
+ if (static_key_enabled(&switch_mm_always_ibpb))
+ return "; IBPB: always-on";
+ if (static_key_enabled(&switch_mm_cond_ibpb))
+ return "; IBPB: conditional";
+ return "; IBPB: disabled";
+ }
+ return "";
+}
+
+static char *pbrsb_eibrs_state(void)
+{
+ if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) {
+ if (boot_cpu_has(X86_FEATURE_RSB_VMEXIT_LITE) ||
+ boot_cpu_has(X86_FEATURE_RSB_VMEXIT))
+ return "; PBRSB-eIBRS: SW sequence";
+ else
+ return "; PBRSB-eIBRS: Vulnerable";
+ } else {
+ return "; PBRSB-eIBRS: Not affected";
+ }
+}
+
+static const char *spectre_bhi_state(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_BHI))
+ return "; BHI: Not affected";
+ else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_HW))
+ return "; BHI: BHI_DIS_S";
+ else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP))
+ return "; BHI: SW loop, KVM: SW loop";
+ else if (boot_cpu_has(X86_FEATURE_RETPOLINE) &&
+ !boot_cpu_has(X86_FEATURE_RETPOLINE_LFENCE) &&
+ rrsba_disabled)
+ return "; BHI: Retpoline";
+ else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_VMEXIT))
+ return "; BHI: Vulnerable, KVM: SW loop";
+
+ return "; BHI: Vulnerable";
+}
+
+static ssize_t spectre_v2_show_state(char *buf)
+{
+ if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled())
+ return sysfs_emit(buf, "Vulnerable: eIBRS with unprivileged eBPF\n");
+
+ if (sched_smt_active() && unprivileged_ebpf_enabled() &&
+ spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE)
+ return sysfs_emit(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n");
+
+ return sysfs_emit(buf, "%s%s%s%s%s%s%s%s\n",
+ spectre_v2_strings[spectre_v2_enabled],
+ ibpb_state(),
+ boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? "; IBRS_FW" : "",
+ stibp_state(),
+ boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? "; RSB filling" : "",
+ pbrsb_eibrs_state(),
+ spectre_bhi_state(),
+ /* this should always be at the end */
+ spectre_v2_module_string());
+}
+
+static ssize_t srbds_show_state(char *buf)
+{
+ return sysfs_emit(buf, "%s\n", srbds_strings[srbds_mitigation]);
+}
+
+static ssize_t retbleed_show_state(char *buf)
+{
+ if (retbleed_mitigation == RETBLEED_MITIGATION_UNRET ||
+ retbleed_mitigation == RETBLEED_MITIGATION_IBPB) {
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
+ return sysfs_emit(buf, "Vulnerable: untrained return thunk / IBPB on non-AMD based uarch\n");
+
+ return sysfs_emit(buf, "%s; SMT %s\n", retbleed_strings[retbleed_mitigation],
+ !sched_smt_active() ? "disabled" :
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED ?
+ "enabled with STIBP protection" : "vulnerable");
+ }
+
+ return sysfs_emit(buf, "%s\n", retbleed_strings[retbleed_mitigation]);
+}
+
+static ssize_t srso_show_state(char *buf)
+{
+ return sysfs_emit(buf, "%s\n", srso_strings[srso_mitigation]);
+}
+
+static ssize_t gds_show_state(char *buf)
+{
+ return sysfs_emit(buf, "%s\n", gds_strings[gds_mitigation]);
+}
+
+static ssize_t tsa_show_state(char *buf)
+{
+ return sysfs_emit(buf, "%s\n", tsa_strings[tsa_mitigation]);
+}
+
+static ssize_t vmscape_show_state(char *buf)
+{
+ return sysfs_emit(buf, "%s\n", vmscape_strings[vmscape_mitigation]);
+}
+
+static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
+ char *buf, unsigned int bug)
+{
+ if (!boot_cpu_has_bug(bug))
+ return sysfs_emit(buf, "Not affected\n");
+
+ switch (bug) {
+ case X86_BUG_CPU_MELTDOWN:
+ if (boot_cpu_has(X86_FEATURE_PTI))
+ return sysfs_emit(buf, "Mitigation: PTI\n");
+
+ if (hypervisor_is_type(X86_HYPER_XEN_PV))
+ return sysfs_emit(buf, "Unknown (XEN PV detected, hypervisor mitigation required)\n");
+
+ break;
+
+ case X86_BUG_SPECTRE_V1:
+ return sysfs_emit(buf, "%s\n", spectre_v1_strings[spectre_v1_mitigation]);
+
+ case X86_BUG_SPECTRE_V2:
+ return spectre_v2_show_state(buf);
+
+ case X86_BUG_SPEC_STORE_BYPASS:
+ return sysfs_emit(buf, "%s\n", ssb_strings[ssb_mode]);
+
+ case X86_BUG_L1TF:
+ if (boot_cpu_has(X86_FEATURE_L1TF_PTEINV))
+ return l1tf_show_state(buf);
+ break;
+
+ case X86_BUG_MDS:
+ return mds_show_state(buf);
+
+ case X86_BUG_TAA:
+ return tsx_async_abort_show_state(buf);
+
+ case X86_BUG_ITLB_MULTIHIT:
+ return itlb_multihit_show_state(buf);
+
+ case X86_BUG_SRBDS:
+ return srbds_show_state(buf);
+
+ case X86_BUG_MMIO_STALE_DATA:
+ return mmio_stale_data_show_state(buf);
+
+ case X86_BUG_RETBLEED:
+ return retbleed_show_state(buf);
+
+ case X86_BUG_SRSO:
+ return srso_show_state(buf);
+
+ case X86_BUG_GDS:
+ return gds_show_state(buf);
+
+ case X86_BUG_RFDS:
+ return rfds_show_state(buf);
+
+ case X86_BUG_OLD_MICROCODE:
+ return old_microcode_show_state(buf);
+
+ case X86_BUG_ITS:
+ return its_show_state(buf);
+
+ case X86_BUG_TSA:
+ return tsa_show_state(buf);
+
+ case X86_BUG_VMSCAPE:
+ return vmscape_show_state(buf);
+
+ default:
+ break;
+ }
+
+ return sysfs_emit(buf, "Vulnerable\n");
+}
+
+ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_CPU_MELTDOWN);
+}
+
+ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_SPECTRE_V1);
+}
+
+ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_SPECTRE_V2);
+}
+
+ssize_t cpu_show_spec_store_bypass(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_SPEC_STORE_BYPASS);
+}
+
+ssize_t cpu_show_l1tf(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_L1TF);
+}
+
+ssize_t cpu_show_mds(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_MDS);
+}
+
+ssize_t cpu_show_tsx_async_abort(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_TAA);
+}
+
+ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_ITLB_MULTIHIT);
+}
+
+ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_SRBDS);
+}
+
+ssize_t cpu_show_mmio_stale_data(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_STALE_DATA);
+}
+
+ssize_t cpu_show_retbleed(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_RETBLEED);
+}
+
+ssize_t cpu_show_spec_rstack_overflow(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_SRSO);
+}
+
+ssize_t cpu_show_gds(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_GDS);
+}
+
+ssize_t cpu_show_reg_file_data_sampling(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_RFDS);
+}
+
+ssize_t cpu_show_old_microcode(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_OLD_MICROCODE);
+}
+
+ssize_t cpu_show_indirect_target_selection(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_ITS);
+}
+
+ssize_t cpu_show_tsa(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_TSA);
+}
+
+ssize_t cpu_show_vmscape(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_VMSCAPE);
+}
+#endif
+
+void __warn_thunk(void)
+{
+ WARN_ONCE(1, "Unpatched return thunk in use. This should not happen!\n");
}
diff --git a/arch/x86/kernel/cpu/bugs_64.c b/arch/x86/kernel/cpu/bugs_64.c
deleted file mode 100644
index 04f0fe5af83e..000000000000
--- a/arch/x86/kernel/cpu/bugs_64.c
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (C) 1994 Linus Torvalds
- * Copyright (C) 2000 SuSE
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <asm/alternative.h>
-#include <asm/bugs.h>
-#include <asm/processor.h>
-#include <asm/mtrr.h>
-#include <asm/cacheflush.h>
-
-void __init check_bugs(void)
-{
- identify_boot_cpu();
-#if !defined(CONFIG_SMP)
- printk(KERN_INFO "CPU: ");
- print_cpu_info(&boot_cpu_data);
-#endif
- alternative_instructions();
-
- /*
- * Make sure the first 2MB area is not mapped by huge pages
- * There are typically fixed size MTRRs in there and overlapping
- * MTRRs into large pages causes slow downs.
- *
- * Right now we don't do that with gbpages because there seems
- * very little benefit for that case.
- */
- if (!direct_gbpages)
- set_memory_4k((unsigned long)__va(0), 1);
-}
diff --git a/arch/x86/kernel/cpu/bus_lock.c b/arch/x86/kernel/cpu/bus_lock.c
new file mode 100644
index 000000000000..dbc99a47be45
--- /dev/null
+++ b/arch/x86/kernel/cpu/bus_lock.c
@@ -0,0 +1,433 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define pr_fmt(fmt) "x86/split lock detection: " fmt
+
+#include <linux/semaphore.h>
+#include <linux/workqueue.h>
+#include <linux/delay.h>
+#include <linux/cpuhotplug.h>
+#include <linux/kvm_types.h>
+#include <asm/cpu_device_id.h>
+#include <asm/cmdline.h>
+#include <asm/traps.h>
+#include <asm/cpu.h>
+#include <asm/msr.h>
+
+enum split_lock_detect_state {
+ sld_off = 0,
+ sld_warn,
+ sld_fatal,
+ sld_ratelimit,
+};
+
+/*
+ * Default to sld_off because most systems do not support split lock detection.
+ * sld_state_setup() will switch this to sld_warn on systems that support
+ * split lock/bus lock detect, unless there is a command line override.
+ */
+static enum split_lock_detect_state sld_state __ro_after_init = sld_off;
+static u64 msr_test_ctrl_cache __ro_after_init;
+
+/*
+ * With a name like MSR_TEST_CTL it should go without saying, but don't touch
+ * MSR_TEST_CTL unless the CPU is one of the whitelisted models. Writing it
+ * on CPUs that do not support SLD can cause fireworks, even when writing '0'.
+ */
+static bool cpu_model_supports_sld __ro_after_init;
+
+static const struct {
+ const char *option;
+ enum split_lock_detect_state state;
+} sld_options[] __initconst = {
+ { "off", sld_off },
+ { "warn", sld_warn },
+ { "fatal", sld_fatal },
+ { "ratelimit:", sld_ratelimit },
+};
+
+static struct ratelimit_state bld_ratelimit;
+
+static unsigned int sysctl_sld_mitigate = 1;
+static DEFINE_SEMAPHORE(buslock_sem, 1);
+
+#ifdef CONFIG_PROC_SYSCTL
+static const struct ctl_table sld_sysctls[] = {
+ {
+ .procname = "split_lock_mitigate",
+ .data = &sysctl_sld_mitigate,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+};
+
+static int __init sld_mitigate_sysctl_init(void)
+{
+ register_sysctl_init("kernel", sld_sysctls);
+ return 0;
+}
+
+late_initcall(sld_mitigate_sysctl_init);
+#endif
+
+static inline bool match_option(const char *arg, int arglen, const char *opt)
+{
+ int len = strlen(opt), ratelimit;
+
+ if (strncmp(arg, opt, len))
+ return false;
+
+ /*
+ * Min ratelimit is 1 bus lock/sec.
+ * Max ratelimit is 1000 bus locks/sec.
+ */
+ if (sscanf(arg, "ratelimit:%d", &ratelimit) == 1 &&
+ ratelimit > 0 && ratelimit <= 1000) {
+ ratelimit_state_init(&bld_ratelimit, HZ, ratelimit);
+ ratelimit_set_flags(&bld_ratelimit, RATELIMIT_MSG_ON_RELEASE);
+ return true;
+ }
+
+ return len == arglen;
+}
+
+static bool split_lock_verify_msr(bool on)
+{
+ u64 ctrl, tmp;
+
+ if (rdmsrq_safe(MSR_TEST_CTRL, &ctrl))
+ return false;
+ if (on)
+ ctrl |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT;
+ else
+ ctrl &= ~MSR_TEST_CTRL_SPLIT_LOCK_DETECT;
+ if (wrmsrq_safe(MSR_TEST_CTRL, ctrl))
+ return false;
+ rdmsrq(MSR_TEST_CTRL, tmp);
+ return ctrl == tmp;
+}
+
+static void __init sld_state_setup(void)
+{
+ enum split_lock_detect_state state = sld_warn;
+ char arg[20];
+ int i, ret;
+
+ if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) &&
+ !boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
+ return;
+
+ ret = cmdline_find_option(boot_command_line, "split_lock_detect",
+ arg, sizeof(arg));
+ if (ret >= 0) {
+ for (i = 0; i < ARRAY_SIZE(sld_options); i++) {
+ if (match_option(arg, ret, sld_options[i].option)) {
+ state = sld_options[i].state;
+ break;
+ }
+ }
+ }
+ sld_state = state;
+}
+
+static void __init __split_lock_setup(void)
+{
+ if (!split_lock_verify_msr(false)) {
+ pr_info("MSR access failed: Disabled\n");
+ return;
+ }
+
+ rdmsrq(MSR_TEST_CTRL, msr_test_ctrl_cache);
+
+ if (!split_lock_verify_msr(true)) {
+ pr_info("MSR access failed: Disabled\n");
+ return;
+ }
+
+ /* Restore the MSR to its cached value. */
+ wrmsrq(MSR_TEST_CTRL, msr_test_ctrl_cache);
+
+ setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT);
+}
+
+/*
+ * MSR_TEST_CTRL is per core, but we treat it like a per CPU MSR. Locking
+ * is not implemented as one thread could undo the setting of the other
+ * thread immediately after dropping the lock anyway.
+ */
+static void sld_update_msr(bool on)
+{
+ u64 test_ctrl_val = msr_test_ctrl_cache;
+
+ if (on)
+ test_ctrl_val |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT;
+
+ wrmsrq(MSR_TEST_CTRL, test_ctrl_val);
+}
+
+void split_lock_init(void)
+{
+ /*
+ * #DB for bus lock handles ratelimit and #AC for split lock is
+ * disabled.
+ */
+ if (sld_state == sld_ratelimit) {
+ split_lock_verify_msr(false);
+ return;
+ }
+
+ if (cpu_model_supports_sld)
+ split_lock_verify_msr(sld_state != sld_off);
+}
+
+static void __split_lock_reenable_unlock(struct work_struct *work)
+{
+ sld_update_msr(true);
+ up(&buslock_sem);
+}
+
+static DECLARE_DELAYED_WORK(sl_reenable_unlock, __split_lock_reenable_unlock);
+
+static void __split_lock_reenable(struct work_struct *work)
+{
+ sld_update_msr(true);
+}
+/*
+ * In order for each CPU to schedule its delayed work independently of the
+ * others, delayed work struct must be per-CPU. This is not required when
+ * sysctl_sld_mitigate is enabled because of the semaphore that limits
+ * the number of simultaneously scheduled delayed works to 1.
+ */
+static DEFINE_PER_CPU(struct delayed_work, sl_reenable);
+
+/*
+ * Per-CPU delayed_work can't be statically initialized properly because
+ * the struct address is unknown. Thus per-CPU delayed_work structures
+ * have to be initialized during kernel initialization and after calling
+ * setup_per_cpu_areas().
+ */
+static int __init setup_split_lock_delayed_work(void)
+{
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct delayed_work *work = per_cpu_ptr(&sl_reenable, cpu);
+
+ INIT_DELAYED_WORK(work, __split_lock_reenable);
+ }
+
+ return 0;
+}
+pure_initcall(setup_split_lock_delayed_work);
+
+/*
+ * If a CPU goes offline with pending delayed work to re-enable split lock
+ * detection then the delayed work will be executed on some other CPU. That
+ * handles releasing the buslock_sem, but because it executes on a
+ * different CPU probably won't re-enable split lock detection. This is a
+ * problem on HT systems since the sibling CPU on the same core may then be
+ * left running with split lock detection disabled.
+ *
+ * Unconditionally re-enable detection here.
+ */
+static int splitlock_cpu_offline(unsigned int cpu)
+{
+ sld_update_msr(true);
+
+ return 0;
+}
+
+static void split_lock_warn(unsigned long ip)
+{
+ struct delayed_work *work;
+ int cpu;
+ unsigned int saved_sld_mitigate = READ_ONCE(sysctl_sld_mitigate);
+
+ if (!current->reported_split_lock)
+ pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n",
+ current->comm, current->pid, ip);
+ current->reported_split_lock = 1;
+
+ if (saved_sld_mitigate) {
+ /*
+ * misery factor #1:
+ * sleep 10ms before trying to execute split lock.
+ */
+ if (msleep_interruptible(10) > 0)
+ return;
+ /*
+ * Misery factor #2:
+ * only allow one buslocked disabled core at a time.
+ */
+ if (down_interruptible(&buslock_sem) == -EINTR)
+ return;
+ }
+
+ cpu = get_cpu();
+ work = saved_sld_mitigate ? &sl_reenable_unlock : per_cpu_ptr(&sl_reenable, cpu);
+ schedule_delayed_work_on(cpu, work, 2);
+
+ /* Disable split lock detection on this CPU to make progress */
+ sld_update_msr(false);
+ put_cpu();
+}
+
+bool handle_guest_split_lock(unsigned long ip)
+{
+ if (sld_state == sld_warn) {
+ split_lock_warn(ip);
+ return true;
+ }
+
+ pr_warn_once("#AC: %s/%d %s split_lock trap at address: 0x%lx\n",
+ current->comm, current->pid,
+ sld_state == sld_fatal ? "fatal" : "bogus", ip);
+
+ current->thread.error_code = 0;
+ current->thread.trap_nr = X86_TRAP_AC;
+ force_sig_fault(SIGBUS, BUS_ADRALN, NULL);
+ return false;
+}
+EXPORT_SYMBOL_FOR_KVM(handle_guest_split_lock);
+
+void bus_lock_init(void)
+{
+ u64 val;
+
+ if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
+ return;
+
+ rdmsrq(MSR_IA32_DEBUGCTLMSR, val);
+
+ if ((boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) &&
+ (sld_state == sld_warn || sld_state == sld_fatal)) ||
+ sld_state == sld_off) {
+ /*
+ * Warn and fatal are handled by #AC for split lock if #AC for
+ * split lock is supported.
+ */
+ val &= ~DEBUGCTLMSR_BUS_LOCK_DETECT;
+ } else {
+ val |= DEBUGCTLMSR_BUS_LOCK_DETECT;
+ }
+
+ wrmsrq(MSR_IA32_DEBUGCTLMSR, val);
+}
+
+bool handle_user_split_lock(struct pt_regs *regs, long error_code)
+{
+ if ((regs->flags & X86_EFLAGS_AC) || sld_state == sld_fatal)
+ return false;
+ split_lock_warn(regs->ip);
+ return true;
+}
+
+void handle_bus_lock(struct pt_regs *regs)
+{
+ switch (sld_state) {
+ case sld_off:
+ break;
+ case sld_ratelimit:
+ /* Enforce no more than bld_ratelimit bus locks/sec. */
+ while (!__ratelimit(&bld_ratelimit))
+ msleep(20);
+ /* Warn on the bus lock. */
+ fallthrough;
+ case sld_warn:
+ pr_warn_ratelimited("#DB: %s/%d took a bus_lock trap at address: 0x%lx\n",
+ current->comm, current->pid, regs->ip);
+ break;
+ case sld_fatal:
+ force_sig_fault(SIGBUS, BUS_ADRALN, NULL);
+ break;
+ }
+}
+
+/*
+ * CPU models that are known to have the per-core split-lock detection
+ * feature even though they do not enumerate IA32_CORE_CAPABILITIES.
+ */
+static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = {
+ X86_MATCH_VFM(INTEL_ICELAKE_X, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_L, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_D, 0),
+ {}
+};
+
+static void __init split_lock_setup(struct cpuinfo_x86 *c)
+{
+ const struct x86_cpu_id *m;
+ u64 ia32_core_caps;
+
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return;
+
+ /* Check for CPUs that have support but do not enumerate it: */
+ m = x86_match_cpu(split_lock_cpu_ids);
+ if (m)
+ goto supported;
+
+ if (!cpu_has(c, X86_FEATURE_CORE_CAPABILITIES))
+ return;
+
+ /*
+ * Not all bits in MSR_IA32_CORE_CAPS are architectural, but
+ * MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT is. All CPUs that set
+ * it have split lock detection.
+ */
+ rdmsrq(MSR_IA32_CORE_CAPS, ia32_core_caps);
+ if (ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT)
+ goto supported;
+
+ /* CPU is not in the model list and does not have the MSR bit: */
+ return;
+
+supported:
+ cpu_model_supports_sld = true;
+ __split_lock_setup();
+}
+
+static void sld_state_show(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) &&
+ !boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
+ return;
+
+ switch (sld_state) {
+ case sld_off:
+ pr_info("disabled\n");
+ break;
+ case sld_warn:
+ if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) {
+ pr_info("#AC: crashing the kernel on kernel split_locks and warning on user-space split_locks\n");
+ if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
+ "x86/splitlock", NULL, splitlock_cpu_offline) < 0)
+ pr_warn("No splitlock CPU offline handler\n");
+ } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) {
+ pr_info("#DB: warning on user-space bus_locks\n");
+ }
+ break;
+ case sld_fatal:
+ if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) {
+ pr_info("#AC: crashing the kernel on kernel split_locks and sending SIGBUS on user-space split_locks\n");
+ } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) {
+ pr_info("#DB: sending SIGBUS on user-space bus_locks%s\n",
+ boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) ?
+ " from non-WB" : "");
+ }
+ break;
+ case sld_ratelimit:
+ if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
+ pr_info("#DB: setting system wide bus lock rate limit to %u/sec\n", bld_ratelimit.burst);
+ break;
+ }
+}
+
+void __init sld_setup(struct cpuinfo_x86 *c)
+{
+ split_lock_setup(c);
+ sld_state_setup();
+ sld_state_show();
+}
diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c
new file mode 100644
index 000000000000..51a95b07831f
--- /dev/null
+++ b/arch/x86/kernel/cpu/cacheinfo.c
@@ -0,0 +1,820 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * x86 CPU caches detection and configuration
+ *
+ * Previous changes
+ * - Venkatesh Pallipadi: Cache identification through CPUID(0x4)
+ * - Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure
+ * - Andi Kleen / Andreas Herrmann: CPUID(0x4) emulation on AMD
+ */
+
+#include <linux/cacheinfo.h>
+#include <linux/cpu.h>
+#include <linux/cpuhotplug.h>
+#include <linux/stop_machine.h>
+
+#include <asm/amd/nb.h>
+#include <asm/cacheinfo.h>
+#include <asm/cpufeature.h>
+#include <asm/cpuid/api.h>
+#include <asm/mtrr.h>
+#include <asm/smp.h>
+#include <asm/tlbflush.h>
+
+#include "cpu.h"
+
+/* Shared last level cache maps */
+DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
+
+/* Shared L2 cache maps */
+DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map);
+
+static cpumask_var_t cpu_cacheinfo_mask;
+
+/* Kernel controls MTRR and/or PAT MSRs. */
+unsigned int memory_caching_control __ro_after_init;
+
+enum _cache_type {
+ CTYPE_NULL = 0,
+ CTYPE_DATA = 1,
+ CTYPE_INST = 2,
+ CTYPE_UNIFIED = 3
+};
+
+union _cpuid4_leaf_eax {
+ struct {
+ enum _cache_type type :5;
+ unsigned int level :3;
+ unsigned int is_self_initializing :1;
+ unsigned int is_fully_associative :1;
+ unsigned int reserved :4;
+ unsigned int num_threads_sharing :12;
+ unsigned int num_cores_on_die :6;
+ } split;
+ u32 full;
+};
+
+union _cpuid4_leaf_ebx {
+ struct {
+ unsigned int coherency_line_size :12;
+ unsigned int physical_line_partition :10;
+ unsigned int ways_of_associativity :10;
+ } split;
+ u32 full;
+};
+
+union _cpuid4_leaf_ecx {
+ struct {
+ unsigned int number_of_sets :32;
+ } split;
+ u32 full;
+};
+
+struct _cpuid4_info {
+ union _cpuid4_leaf_eax eax;
+ union _cpuid4_leaf_ebx ebx;
+ union _cpuid4_leaf_ecx ecx;
+ unsigned int id;
+ unsigned long size;
+};
+
+/* Map CPUID(0x4) EAX.cache_type to <linux/cacheinfo.h> types */
+static const enum cache_type cache_type_map[] = {
+ [CTYPE_NULL] = CACHE_TYPE_NOCACHE,
+ [CTYPE_DATA] = CACHE_TYPE_DATA,
+ [CTYPE_INST] = CACHE_TYPE_INST,
+ [CTYPE_UNIFIED] = CACHE_TYPE_UNIFIED,
+};
+
+/*
+ * Fallback AMD CPUID(0x4) emulation
+ * AMD CPUs with TOPOEXT can just use CPUID(0x8000001d)
+ *
+ * @AMD_L2_L3_INVALID_ASSOC: cache info for the respective L2/L3 cache should
+ * be determined from CPUID(0x8000001d) instead of CPUID(0x80000006).
+ */
+
+#define AMD_CPUID4_FULLY_ASSOCIATIVE 0xffff
+#define AMD_L2_L3_INVALID_ASSOC 0x9
+
+union l1_cache {
+ struct {
+ unsigned line_size :8;
+ unsigned lines_per_tag :8;
+ unsigned assoc :8;
+ unsigned size_in_kb :8;
+ };
+ unsigned int val;
+};
+
+union l2_cache {
+ struct {
+ unsigned line_size :8;
+ unsigned lines_per_tag :4;
+ unsigned assoc :4;
+ unsigned size_in_kb :16;
+ };
+ unsigned int val;
+};
+
+union l3_cache {
+ struct {
+ unsigned line_size :8;
+ unsigned lines_per_tag :4;
+ unsigned assoc :4;
+ unsigned res :2;
+ unsigned size_encoded :14;
+ };
+ unsigned int val;
+};
+
+/* L2/L3 associativity mapping */
+static const unsigned short assocs[] = {
+ [1] = 1,
+ [2] = 2,
+ [3] = 3,
+ [4] = 4,
+ [5] = 6,
+ [6] = 8,
+ [8] = 16,
+ [0xa] = 32,
+ [0xb] = 48,
+ [0xc] = 64,
+ [0xd] = 96,
+ [0xe] = 128,
+ [0xf] = AMD_CPUID4_FULLY_ASSOCIATIVE
+};
+
+static const unsigned char levels[] = { 1, 1, 2, 3 };
+static const unsigned char types[] = { 1, 2, 3, 3 };
+
+static void legacy_amd_cpuid4(int index, union _cpuid4_leaf_eax *eax,
+ union _cpuid4_leaf_ebx *ebx, union _cpuid4_leaf_ecx *ecx)
+{
+ unsigned int dummy, line_size, lines_per_tag, assoc, size_in_kb;
+ union l1_cache l1i, l1d, *l1;
+ union l2_cache l2;
+ union l3_cache l3;
+
+ eax->full = 0;
+ ebx->full = 0;
+ ecx->full = 0;
+
+ cpuid(0x80000005, &dummy, &dummy, &l1d.val, &l1i.val);
+ cpuid(0x80000006, &dummy, &dummy, &l2.val, &l3.val);
+
+ l1 = &l1d;
+ switch (index) {
+ case 1:
+ l1 = &l1i;
+ fallthrough;
+ case 0:
+ if (!l1->val)
+ return;
+
+ assoc = (l1->assoc == 0xff) ? AMD_CPUID4_FULLY_ASSOCIATIVE : l1->assoc;
+ line_size = l1->line_size;
+ lines_per_tag = l1->lines_per_tag;
+ size_in_kb = l1->size_in_kb;
+ break;
+ case 2:
+ if (!l2.assoc || l2.assoc == AMD_L2_L3_INVALID_ASSOC)
+ return;
+
+ /* Use x86_cache_size as it might have K7 errata fixes */
+ assoc = assocs[l2.assoc];
+ line_size = l2.line_size;
+ lines_per_tag = l2.lines_per_tag;
+ size_in_kb = __this_cpu_read(cpu_info.x86_cache_size);
+ break;
+ case 3:
+ if (!l3.assoc || l3.assoc == AMD_L2_L3_INVALID_ASSOC)
+ return;
+
+ assoc = assocs[l3.assoc];
+ line_size = l3.line_size;
+ lines_per_tag = l3.lines_per_tag;
+ size_in_kb = l3.size_encoded * 512;
+ if (boot_cpu_has(X86_FEATURE_AMD_DCM)) {
+ size_in_kb = size_in_kb >> 1;
+ assoc = assoc >> 1;
+ }
+ break;
+ default:
+ return;
+ }
+
+ eax->split.is_self_initializing = 1;
+ eax->split.type = types[index];
+ eax->split.level = levels[index];
+ eax->split.num_threads_sharing = 0;
+ eax->split.num_cores_on_die = topology_num_cores_per_package();
+
+ if (assoc == AMD_CPUID4_FULLY_ASSOCIATIVE)
+ eax->split.is_fully_associative = 1;
+
+ ebx->split.coherency_line_size = line_size - 1;
+ ebx->split.ways_of_associativity = assoc - 1;
+ ebx->split.physical_line_partition = lines_per_tag - 1;
+ ecx->split.number_of_sets = (size_in_kb * 1024) / line_size /
+ (ebx->split.ways_of_associativity + 1) - 1;
+}
+
+static int cpuid4_info_fill_done(struct _cpuid4_info *id4, union _cpuid4_leaf_eax eax,
+ union _cpuid4_leaf_ebx ebx, union _cpuid4_leaf_ecx ecx)
+{
+ if (eax.split.type == CTYPE_NULL)
+ return -EIO;
+
+ id4->eax = eax;
+ id4->ebx = ebx;
+ id4->ecx = ecx;
+ id4->size = (ecx.split.number_of_sets + 1) *
+ (ebx.split.coherency_line_size + 1) *
+ (ebx.split.physical_line_partition + 1) *
+ (ebx.split.ways_of_associativity + 1);
+
+ return 0;
+}
+
+static int amd_fill_cpuid4_info(int index, struct _cpuid4_info *id4)
+{
+ union _cpuid4_leaf_eax eax;
+ union _cpuid4_leaf_ebx ebx;
+ union _cpuid4_leaf_ecx ecx;
+ u32 ignored;
+
+ if (boot_cpu_has(X86_FEATURE_TOPOEXT) || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
+ cpuid_count(0x8000001d, index, &eax.full, &ebx.full, &ecx.full, &ignored);
+ else
+ legacy_amd_cpuid4(index, &eax, &ebx, &ecx);
+
+ return cpuid4_info_fill_done(id4, eax, ebx, ecx);
+}
+
+static int intel_fill_cpuid4_info(int index, struct _cpuid4_info *id4)
+{
+ union _cpuid4_leaf_eax eax;
+ union _cpuid4_leaf_ebx ebx;
+ union _cpuid4_leaf_ecx ecx;
+ u32 ignored;
+
+ cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &ignored);
+
+ return cpuid4_info_fill_done(id4, eax, ebx, ecx);
+}
+
+static int fill_cpuid4_info(int index, struct _cpuid4_info *id4)
+{
+ u8 cpu_vendor = boot_cpu_data.x86_vendor;
+
+ return (cpu_vendor == X86_VENDOR_AMD || cpu_vendor == X86_VENDOR_HYGON) ?
+ amd_fill_cpuid4_info(index, id4) :
+ intel_fill_cpuid4_info(index, id4);
+}
+
+static int find_num_cache_leaves(struct cpuinfo_x86 *c)
+{
+ unsigned int eax, ebx, ecx, edx, op;
+ union _cpuid4_leaf_eax cache_eax;
+ int i = -1;
+
+ /* Do a CPUID(op) loop to calculate num_cache_leaves */
+ op = (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) ? 0x8000001d : 4;
+ do {
+ ++i;
+ cpuid_count(op, i, &eax, &ebx, &ecx, &edx);
+ cache_eax.full = eax;
+ } while (cache_eax.split.type != CTYPE_NULL);
+ return i;
+}
+
+/*
+ * The max shared threads number comes from CPUID(0x4) EAX[25-14] with input
+ * ECX as cache index. Then right shift apicid by the number's order to get
+ * cache id for this cache node.
+ */
+static unsigned int get_cache_id(u32 apicid, const struct _cpuid4_info *id4)
+{
+ unsigned long num_threads_sharing;
+ int index_msb;
+
+ num_threads_sharing = 1 + id4->eax.split.num_threads_sharing;
+ index_msb = get_count_order(num_threads_sharing);
+
+ return apicid >> index_msb;
+}
+
+/*
+ * AMD/Hygon CPUs may have multiple LLCs if L3 caches exist.
+ */
+
+void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, u16 die_id)
+{
+ if (!cpuid_amd_hygon_has_l3_cache())
+ return;
+
+ if (c->x86 < 0x17) {
+ /* Pre-Zen: LLC is at the node level */
+ c->topo.llc_id = die_id;
+ } else if (c->x86 == 0x17 && c->x86_model <= 0x1F) {
+ /*
+ * Family 17h up to 1F models: LLC is at the core
+ * complex level. Core complex ID is ApicId[3].
+ */
+ c->topo.llc_id = c->topo.apicid >> 3;
+ } else {
+ /*
+ * Newer families: LLC ID is calculated from the number
+ * of threads sharing the L3 cache.
+ */
+ u32 llc_index = find_num_cache_leaves(c) - 1;
+ struct _cpuid4_info id4 = {};
+
+ if (!amd_fill_cpuid4_info(llc_index, &id4))
+ c->topo.llc_id = get_cache_id(c->topo.apicid, &id4);
+ }
+}
+
+void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c)
+{
+ if (!cpuid_amd_hygon_has_l3_cache())
+ return;
+
+ /*
+ * Hygons are similar to AMD Family 17h up to 1F models: LLC is
+ * at the core complex level. Core complex ID is ApicId[3].
+ */
+ c->topo.llc_id = c->topo.apicid >> 3;
+}
+
+void init_amd_cacheinfo(struct cpuinfo_x86 *c)
+{
+ struct cpu_cacheinfo *ci = get_cpu_cacheinfo(c->cpu_index);
+
+ if (boot_cpu_has(X86_FEATURE_TOPOEXT))
+ ci->num_leaves = find_num_cache_leaves(c);
+ else if (c->extended_cpuid_level >= 0x80000006)
+ ci->num_leaves = (cpuid_edx(0x80000006) & 0xf000) ? 4 : 3;
+}
+
+void init_hygon_cacheinfo(struct cpuinfo_x86 *c)
+{
+ struct cpu_cacheinfo *ci = get_cpu_cacheinfo(c->cpu_index);
+
+ ci->num_leaves = find_num_cache_leaves(c);
+}
+
+static void intel_cacheinfo_done(struct cpuinfo_x86 *c, unsigned int l3,
+ unsigned int l2, unsigned int l1i, unsigned int l1d)
+{
+ /*
+ * If llc_id is still unset, then cpuid_level < 4, which implies
+ * that the only possibility left is SMT. Since CPUID(0x2) doesn't
+ * specify any shared caches and SMT shares all caches, we can
+ * unconditionally set LLC ID to the package ID so that all
+ * threads share it.
+ */
+ if (c->topo.llc_id == BAD_APICID)
+ c->topo.llc_id = c->topo.pkg_id;
+
+ c->x86_cache_size = l3 ? l3 : (l2 ? l2 : l1i + l1d);
+
+ if (!l2)
+ cpu_detect_cache_sizes(c);
+}
+
+/*
+ * Legacy Intel CPUID(0x2) path if CPUID(0x4) is not available.
+ */
+static void intel_cacheinfo_0x2(struct cpuinfo_x86 *c)
+{
+ unsigned int l1i = 0, l1d = 0, l2 = 0, l3 = 0;
+ const struct leaf_0x2_table *desc;
+ union leaf_0x2_regs regs;
+ u8 *ptr;
+
+ if (c->cpuid_level < 2)
+ return;
+
+ cpuid_leaf_0x2(&regs);
+ for_each_cpuid_0x2_desc(regs, ptr, desc) {
+ switch (desc->c_type) {
+ case CACHE_L1_INST: l1i += desc->c_size; break;
+ case CACHE_L1_DATA: l1d += desc->c_size; break;
+ case CACHE_L2: l2 += desc->c_size; break;
+ case CACHE_L3: l3 += desc->c_size; break;
+ }
+ }
+
+ intel_cacheinfo_done(c, l3, l2, l1i, l1d);
+}
+
+static unsigned int calc_cache_topo_id(struct cpuinfo_x86 *c, const struct _cpuid4_info *id4)
+{
+ unsigned int num_threads_sharing;
+ int index_msb;
+
+ num_threads_sharing = 1 + id4->eax.split.num_threads_sharing;
+ index_msb = get_count_order(num_threads_sharing);
+ return c->topo.apicid & ~((1 << index_msb) - 1);
+}
+
+static bool intel_cacheinfo_0x4(struct cpuinfo_x86 *c)
+{
+ struct cpu_cacheinfo *ci = get_cpu_cacheinfo(c->cpu_index);
+ unsigned int l2_id = BAD_APICID, l3_id = BAD_APICID;
+ unsigned int l1d = 0, l1i = 0, l2 = 0, l3 = 0;
+
+ if (c->cpuid_level < 4)
+ return false;
+
+ /*
+ * There should be at least one leaf. A non-zero value means
+ * that the number of leaves has been previously initialized.
+ */
+ if (!ci->num_leaves)
+ ci->num_leaves = find_num_cache_leaves(c);
+
+ if (!ci->num_leaves)
+ return false;
+
+ for (int i = 0; i < ci->num_leaves; i++) {
+ struct _cpuid4_info id4 = {};
+ int ret;
+
+ ret = intel_fill_cpuid4_info(i, &id4);
+ if (ret < 0)
+ continue;
+
+ switch (id4.eax.split.level) {
+ case 1:
+ if (id4.eax.split.type == CTYPE_DATA)
+ l1d = id4.size / 1024;
+ else if (id4.eax.split.type == CTYPE_INST)
+ l1i = id4.size / 1024;
+ break;
+ case 2:
+ l2 = id4.size / 1024;
+ l2_id = calc_cache_topo_id(c, &id4);
+ break;
+ case 3:
+ l3 = id4.size / 1024;
+ l3_id = calc_cache_topo_id(c, &id4);
+ break;
+ default:
+ break;
+ }
+ }
+
+ c->topo.l2c_id = l2_id;
+ c->topo.llc_id = (l3_id == BAD_APICID) ? l2_id : l3_id;
+ intel_cacheinfo_done(c, l3, l2, l1i, l1d);
+ return true;
+}
+
+void init_intel_cacheinfo(struct cpuinfo_x86 *c)
+{
+ /* Don't use CPUID(0x2) if CPUID(0x4) is supported. */
+ if (intel_cacheinfo_0x4(c))
+ return;
+
+ intel_cacheinfo_0x2(c);
+}
+
+/*
+ * <linux/cacheinfo.h> shared_cpu_map setup, AMD/Hygon
+ */
+static int __cache_amd_cpumap_setup(unsigned int cpu, int index,
+ const struct _cpuid4_info *id4)
+{
+ struct cpu_cacheinfo *this_cpu_ci;
+ struct cacheinfo *ci;
+ int i, sibling;
+
+ /*
+ * For L3, always use the pre-calculated cpu_llc_shared_mask
+ * to derive shared_cpu_map.
+ */
+ if (index == 3) {
+ for_each_cpu(i, cpu_llc_shared_mask(cpu)) {
+ this_cpu_ci = get_cpu_cacheinfo(i);
+ if (!this_cpu_ci->info_list)
+ continue;
+
+ ci = this_cpu_ci->info_list + index;
+ for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) {
+ if (!cpu_online(sibling))
+ continue;
+ cpumask_set_cpu(sibling, &ci->shared_cpu_map);
+ }
+ }
+ } else if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
+ unsigned int apicid, nshared, first, last;
+
+ nshared = id4->eax.split.num_threads_sharing + 1;
+ apicid = cpu_data(cpu).topo.apicid;
+ first = apicid - (apicid % nshared);
+ last = first + nshared - 1;
+
+ for_each_online_cpu(i) {
+ this_cpu_ci = get_cpu_cacheinfo(i);
+ if (!this_cpu_ci->info_list)
+ continue;
+
+ apicid = cpu_data(i).topo.apicid;
+ if ((apicid < first) || (apicid > last))
+ continue;
+
+ ci = this_cpu_ci->info_list + index;
+
+ for_each_online_cpu(sibling) {
+ apicid = cpu_data(sibling).topo.apicid;
+ if ((apicid < first) || (apicid > last))
+ continue;
+ cpumask_set_cpu(sibling, &ci->shared_cpu_map);
+ }
+ }
+ } else
+ return 0;
+
+ return 1;
+}
+
+/*
+ * <linux/cacheinfo.h> shared_cpu_map setup, Intel + fallback AMD/Hygon
+ */
+static void __cache_cpumap_setup(unsigned int cpu, int index,
+ const struct _cpuid4_info *id4)
+{
+ struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+ struct cacheinfo *ci, *sibling_ci;
+ unsigned long num_threads_sharing;
+ int index_msb, i;
+
+ if (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) {
+ if (__cache_amd_cpumap_setup(cpu, index, id4))
+ return;
+ }
+
+ ci = this_cpu_ci->info_list + index;
+ num_threads_sharing = 1 + id4->eax.split.num_threads_sharing;
+
+ cpumask_set_cpu(cpu, &ci->shared_cpu_map);
+ if (num_threads_sharing == 1)
+ return;
+
+ index_msb = get_count_order(num_threads_sharing);
+
+ for_each_online_cpu(i)
+ if (cpu_data(i).topo.apicid >> index_msb == c->topo.apicid >> index_msb) {
+ struct cpu_cacheinfo *sib_cpu_ci = get_cpu_cacheinfo(i);
+
+ /* Skip if itself or no cacheinfo */
+ if (i == cpu || !sib_cpu_ci->info_list)
+ continue;
+
+ sibling_ci = sib_cpu_ci->info_list + index;
+ cpumask_set_cpu(i, &ci->shared_cpu_map);
+ cpumask_set_cpu(cpu, &sibling_ci->shared_cpu_map);
+ }
+}
+
+static void ci_info_init(struct cacheinfo *ci, const struct _cpuid4_info *id4,
+ struct amd_northbridge *nb)
+{
+ ci->id = id4->id;
+ ci->attributes = CACHE_ID;
+ ci->level = id4->eax.split.level;
+ ci->type = cache_type_map[id4->eax.split.type];
+ ci->coherency_line_size = id4->ebx.split.coherency_line_size + 1;
+ ci->ways_of_associativity = id4->ebx.split.ways_of_associativity + 1;
+ ci->size = id4->size;
+ ci->number_of_sets = id4->ecx.split.number_of_sets + 1;
+ ci->physical_line_partition = id4->ebx.split.physical_line_partition + 1;
+ ci->priv = nb;
+}
+
+int init_cache_level(unsigned int cpu)
+{
+ struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu);
+
+ /* There should be at least one leaf. */
+ if (!ci->num_leaves)
+ return -ENOENT;
+
+ return 0;
+}
+
+int populate_cache_leaves(unsigned int cpu)
+{
+ struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
+ struct cacheinfo *ci = this_cpu_ci->info_list;
+ u8 cpu_vendor = boot_cpu_data.x86_vendor;
+ u32 apicid = cpu_data(cpu).topo.apicid;
+ struct amd_northbridge *nb = NULL;
+ struct _cpuid4_info id4 = {};
+ int idx, ret;
+
+ for (idx = 0; idx < this_cpu_ci->num_leaves; idx++) {
+ ret = fill_cpuid4_info(idx, &id4);
+ if (ret)
+ return ret;
+
+ id4.id = get_cache_id(apicid, &id4);
+
+ if (cpu_vendor == X86_VENDOR_AMD || cpu_vendor == X86_VENDOR_HYGON)
+ nb = amd_init_l3_cache(idx);
+
+ ci_info_init(ci++, &id4, nb);
+ __cache_cpumap_setup(cpu, idx, &id4);
+ }
+
+ this_cpu_ci->cpu_map_populated = true;
+ return 0;
+}
+
+/*
+ * Disable and enable caches. Needed for changing MTRRs and the PAT MSR.
+ *
+ * Since we are disabling the cache don't allow any interrupts,
+ * they would run extremely slow and would only increase the pain.
+ *
+ * The caller must ensure that local interrupts are disabled and
+ * are reenabled after cache_enable() has been called.
+ */
+static unsigned long saved_cr4;
+static DEFINE_RAW_SPINLOCK(cache_disable_lock);
+
+/*
+ * Cache flushing is the most time-consuming step when programming the
+ * MTRRs. On many Intel CPUs without known erratas, it can be skipped
+ * if the CPU declares cache self-snooping support.
+ */
+static void maybe_flush_caches(void)
+{
+ if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
+ wbinvd();
+}
+
+void cache_disable(void) __acquires(cache_disable_lock)
+{
+ unsigned long cr0;
+
+ /*
+ * This is not ideal since the cache is only flushed/disabled
+ * for this CPU while the MTRRs are changed, but changing this
+ * requires more invasive changes to the way the kernel boots.
+ */
+ raw_spin_lock(&cache_disable_lock);
+
+ /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */
+ cr0 = read_cr0() | X86_CR0_CD;
+ write_cr0(cr0);
+
+ maybe_flush_caches();
+
+ /* Save value of CR4 and clear Page Global Enable (bit 7) */
+ if (cpu_feature_enabled(X86_FEATURE_PGE)) {
+ saved_cr4 = __read_cr4();
+ __write_cr4(saved_cr4 & ~X86_CR4_PGE);
+ }
+
+ /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+ flush_tlb_local();
+
+ if (cpu_feature_enabled(X86_FEATURE_MTRR))
+ mtrr_disable();
+
+ maybe_flush_caches();
+}
+
+void cache_enable(void) __releases(cache_disable_lock)
+{
+ /* Flush TLBs (no need to flush caches - they are disabled) */
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+ flush_tlb_local();
+
+ if (cpu_feature_enabled(X86_FEATURE_MTRR))
+ mtrr_enable();
+
+ /* Enable caches */
+ write_cr0(read_cr0() & ~X86_CR0_CD);
+
+ /* Restore value of CR4 */
+ if (cpu_feature_enabled(X86_FEATURE_PGE))
+ __write_cr4(saved_cr4);
+
+ raw_spin_unlock(&cache_disable_lock);
+}
+
+static void cache_cpu_init(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+
+ if (memory_caching_control & CACHE_MTRR) {
+ cache_disable();
+ mtrr_generic_set_state();
+ cache_enable();
+ }
+
+ if (memory_caching_control & CACHE_PAT)
+ pat_cpu_init();
+
+ local_irq_restore(flags);
+}
+
+static bool cache_aps_delayed_init = true;
+
+void set_cache_aps_delayed_init(bool val)
+{
+ cache_aps_delayed_init = val;
+}
+
+bool get_cache_aps_delayed_init(void)
+{
+ return cache_aps_delayed_init;
+}
+
+static int cache_rendezvous_handler(void *unused)
+{
+ if (get_cache_aps_delayed_init() || !cpu_online(smp_processor_id()))
+ cache_cpu_init();
+
+ return 0;
+}
+
+void __init cache_bp_init(void)
+{
+ mtrr_bp_init();
+ pat_bp_init();
+
+ if (memory_caching_control)
+ cache_cpu_init();
+}
+
+void cache_bp_restore(void)
+{
+ if (memory_caching_control)
+ cache_cpu_init();
+}
+
+static int cache_ap_online(unsigned int cpu)
+{
+ cpumask_set_cpu(cpu, cpu_cacheinfo_mask);
+
+ if (!memory_caching_control || get_cache_aps_delayed_init())
+ return 0;
+
+ /*
+ * Ideally we should hold mtrr_mutex here to avoid MTRR entries
+ * changed, but this routine will be called in CPU boot time,
+ * holding the lock breaks it.
+ *
+ * This routine is called in two cases:
+ *
+ * 1. very early time of software resume, when there absolutely
+ * isn't MTRR entry changes;
+ *
+ * 2. CPU hotadd time. We let mtrr_add/del_page hold cpuhotplug
+ * lock to prevent MTRR entry changes
+ */
+ stop_machine_from_inactive_cpu(cache_rendezvous_handler, NULL,
+ cpu_cacheinfo_mask);
+
+ return 0;
+}
+
+static int cache_ap_offline(unsigned int cpu)
+{
+ cpumask_clear_cpu(cpu, cpu_cacheinfo_mask);
+ return 0;
+}
+
+/*
+ * Delayed cache initialization for all AP's
+ */
+void cache_aps_init(void)
+{
+ if (!memory_caching_control || !get_cache_aps_delayed_init())
+ return;
+
+ stop_machine(cache_rendezvous_handler, NULL, cpu_online_mask);
+ set_cache_aps_delayed_init(false);
+}
+
+static int __init cache_ap_register(void)
+{
+ zalloc_cpumask_var(&cpu_cacheinfo_mask, GFP_KERNEL);
+ cpumask_set_cpu(smp_processor_id(), cpu_cacheinfo_mask);
+
+ cpuhp_setup_state_nocalls(CPUHP_AP_CACHECTRL_STARTING,
+ "x86/cachectrl:starting",
+ cache_ap_online, cache_ap_offline);
+ return 0;
+}
+early_initcall(cache_ap_register);
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index 159103c0b1f4..a3b55db35c96 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -1,244 +1,16 @@
-#include <linux/bitops.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
+// SPDX-License-Identifier: GPL-2.0
-#include <asm/processor.h>
-#include <asm/e820.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
+
+#include <asm/cpu.h>
+#include <asm/cpufeature.h>
+#include <asm/e820/api.h>
#include <asm/mtrr.h>
#include <asm/msr.h>
#include "cpu.h"
-#ifdef CONFIG_X86_OOSTORE
-
-static u32 __cpuinit power2(u32 x)
-{
- u32 s = 1;
-
- while (s <= x)
- s <<= 1;
-
- return s >>= 1;
-}
-
-
-/*
- * Set up an actual MCR
- */
-static void __cpuinit centaur_mcr_insert(int reg, u32 base, u32 size, int key)
-{
- u32 lo, hi;
-
- hi = base & ~0xFFF;
- lo = ~(size-1); /* Size is a power of 2 so this makes a mask */
- lo &= ~0xFFF; /* Remove the ctrl value bits */
- lo |= key; /* Attribute we wish to set */
- wrmsr(reg+MSR_IDT_MCR0, lo, hi);
- mtrr_centaur_report_mcr(reg, lo, hi); /* Tell the mtrr driver */
-}
-
-/*
- * Figure what we can cover with MCR's
- *
- * Shortcut: We know you can't put 4Gig of RAM on a winchip
- */
-static u32 __cpuinit ramtop(void)
-{
- u32 clip = 0xFFFFFFFFUL;
- u32 top = 0;
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- unsigned long start, end;
-
- if (e820.map[i].addr > 0xFFFFFFFFUL)
- continue;
- /*
- * Don't MCR over reserved space. Ignore the ISA hole
- * we frob around that catastrophe already
- */
- if (e820.map[i].type == E820_RESERVED) {
- if (e820.map[i].addr >= 0x100000UL &&
- e820.map[i].addr < clip)
- clip = e820.map[i].addr;
- continue;
- }
- start = e820.map[i].addr;
- end = e820.map[i].addr + e820.map[i].size;
- if (start >= end)
- continue;
- if (end > top)
- top = end;
- }
- /*
- * Everything below 'top' should be RAM except for the ISA hole.
- * Because of the limited MCR's we want to map NV/ACPI into our
- * MCR range for gunk in RAM
- *
- * Clip might cause us to MCR insufficient RAM but that is an
- * acceptable failure mode and should only bite obscure boxes with
- * a VESA hole at 15Mb
- *
- * The second case Clip sometimes kicks in is when the EBDA is marked
- * as reserved. Again we fail safe with reasonable results
- */
- if (top > clip)
- top = clip;
-
- return top;
-}
-
-/*
- * Compute a set of MCR's to give maximum coverage
- */
-static int __cpuinit centaur_mcr_compute(int nr, int key)
-{
- u32 mem = ramtop();
- u32 root = power2(mem);
- u32 base = root;
- u32 top = root;
- u32 floor = 0;
- int ct = 0;
-
- while (ct < nr) {
- u32 fspace = 0;
- u32 high;
- u32 low;
-
- /*
- * Find the largest block we will fill going upwards
- */
- high = power2(mem-top);
-
- /*
- * Find the largest block we will fill going downwards
- */
- low = base/2;
-
- /*
- * Don't fill below 1Mb going downwards as there
- * is an ISA hole in the way.
- */
- if (base <= 1024*1024)
- low = 0;
-
- /*
- * See how much space we could cover by filling below
- * the ISA hole
- */
-
- if (floor == 0)
- fspace = 512*1024;
- else if (floor == 512*1024)
- fspace = 128*1024;
-
- /* And forget ROM space */
-
- /*
- * Now install the largest coverage we get
- */
- if (fspace > high && fspace > low) {
- centaur_mcr_insert(ct, floor, fspace, key);
- floor += fspace;
- } else if (high > low) {
- centaur_mcr_insert(ct, top, high, key);
- top += high;
- } else if (low > 0) {
- base -= low;
- centaur_mcr_insert(ct, base, low, key);
- } else
- break;
- ct++;
- }
- /*
- * We loaded ct values. We now need to set the mask. The caller
- * must do this bit.
- */
- return ct;
-}
-
-static void __cpuinit centaur_create_optimal_mcr(void)
-{
- int used;
- int i;
-
- /*
- * Allocate up to 6 mcrs to mark as much of ram as possible
- * as write combining and weak write ordered.
- *
- * To experiment with: Linux never uses stack operations for
- * mmio spaces so we could globally enable stack operation wc
- *
- * Load the registers with type 31 - full write combining, all
- * writes weakly ordered.
- */
- used = centaur_mcr_compute(6, 31);
-
- /*
- * Wipe unused MCRs
- */
- for (i = used; i < 8; i++)
- wrmsr(MSR_IDT_MCR0+i, 0, 0);
-}
-
-static void __cpuinit winchip2_create_optimal_mcr(void)
-{
- u32 lo, hi;
- int used;
- int i;
-
- /*
- * Allocate up to 6 mcrs to mark as much of ram as possible
- * as write combining, weak store ordered.
- *
- * Load the registers with type 25
- * 8 - weak write ordering
- * 16 - weak read ordering
- * 1 - write combining
- */
- used = centaur_mcr_compute(6, 25);
-
- /*
- * Mark the registers we are using.
- */
- rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
- for (i = 0; i < used; i++)
- lo |= 1<<(9+i);
- wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
-
- /*
- * Wipe unused MCRs
- */
-
- for (i = used; i < 8; i++)
- wrmsr(MSR_IDT_MCR0+i, 0, 0);
-}
-
-/*
- * Handle the MCR key on the Winchip 2.
- */
-static void __cpuinit winchip2_unprotect_mcr(void)
-{
- u32 lo, hi;
- u32 key;
-
- rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
- lo &= ~0x1C0; /* blank bits 8-6 */
- key = (lo>>17) & 7;
- lo |= key<<6; /* replace with unlock key */
- wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
-}
-
-static void __cpuinit winchip2_protect_mcr(void)
-{
- u32 lo, hi;
-
- rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
- lo &= ~0x1C0; /* blank bits 8-6 */
- wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
-}
-#endif /* CONFIG_X86_OOSTORE */
-
#define ACE_PRESENT (1 << 6)
#define ACE_ENABLED (1 << 7)
#define ACE_FCR (1 << 28) /* MSR_VIA_FCR */
@@ -247,7 +19,7 @@ static void __cpuinit winchip2_protect_mcr(void)
#define RNG_ENABLED (1 << 3)
#define RNG_ENABLE (1 << 6) /* MSR_VIA_RNG */
-static void __cpuinit init_c3(struct cpuinfo_x86 *c)
+static void init_c3(struct cpuinfo_x86 *c)
{
u32 lo, hi;
@@ -260,7 +32,7 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
rdmsr(MSR_VIA_FCR, lo, hi);
lo |= ACE_FCR; /* enable ACE unit */
wrmsr(MSR_VIA_FCR, lo, hi);
- printk(KERN_INFO "CPU: Enabled ACE h/w crypto\n");
+ pr_info("CPU: Enabled ACE h/w crypto\n");
}
/* enable RNG unit, if present and disabled */
@@ -268,13 +40,13 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
rdmsr(MSR_VIA_RNG, lo, hi);
lo |= RNG_ENABLE; /* enable RNG unit */
wrmsr(MSR_VIA_RNG, lo, hi);
- printk(KERN_INFO "CPU: Enabled h/w RNG\n");
+ pr_info("CPU: Enabled h/w RNG\n");
}
/* store Centaur Extended Feature Flags as
* word 5 of the CPU capability bit array
*/
- c->x86_capability[5] = cpuid_edx(0xC0000001);
+ c->x86_capability[CPUID_C000_0001_EDX] = cpuid_edx(0xC0000001);
}
#ifdef CONFIG_X86_32
/* Cyrix III family needs CX8 & PGE explicitly enabled. */
@@ -294,7 +66,8 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_REP_GOOD);
}
- cpu_detect_cache_sizes(c);
+ if (c->x86 >= 7)
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
}
enum {
@@ -318,26 +91,27 @@ enum {
EAMD3D = 1<<20,
};
-static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
+static void early_init_centaur(struct cpuinfo_x86 *c)
{
- switch (c->x86) {
#ifdef CONFIG_X86_32
- case 5:
- /* Emulate MTRRs using Centaur's MCR. */
+ /* Emulate MTRRs using Centaur's MCR. */
+ if (c->x86 == 5)
set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR);
- break;
#endif
- case 6:
- if (c->x86_model >= 0xf)
- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
- break;
- }
+ if ((c->x86 == 6 && c->x86_model >= 0xf) ||
+ (c->x86 >= 7))
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+
#ifdef CONFIG_X86_64
set_cpu_cap(c, X86_FEATURE_SYSENTER32);
#endif
+ if (c->x86_power & (1 << 8)) {
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+ set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
+ }
}
-static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
+static void init_centaur(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_32
char *name;
@@ -353,33 +127,32 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
clear_cpu_cap(c, 0*32+31);
#endif
early_init_centaur(c);
- switch (c->x86) {
+ init_intel_cacheinfo(c);
+
+ if (c->cpuid_level > 9) {
+ unsigned int eax = cpuid_eax(10);
+
+ /*
+ * Check for version and the number of counters
+ * Version(eax[7:0]) can't be 0;
+ * Counters(eax[15:8]) should be greater than 1;
+ */
+ if ((eax & 0xff) && (((eax >> 8) & 0xff) > 1))
+ set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
+ }
+
#ifdef CONFIG_X86_32
- case 5:
+ if (c->x86 == 5) {
switch (c->x86_model) {
case 4:
name = "C6";
fcr_set = ECX8|DSMC|EDCTLB|EMMX|ERETSTK;
fcr_clr = DPDC;
- printk(KERN_NOTICE "Disabling bugged TSC.\n");
+ pr_notice("Disabling bugged TSC.\n");
clear_cpu_cap(c, X86_FEATURE_TSC);
-#ifdef CONFIG_X86_OOSTORE
- centaur_create_optimal_mcr();
- /*
- * Enable:
- * write combining on non-stack, non-string
- * write combining on string, all types
- * weak write ordering
- *
- * The C6 original lacks weak read order
- *
- * Note 0x120 is write only on Winchip 1
- */
- wrmsr(MSR_IDT_MCR_CTRL, 0x01F0001F, 0);
-#endif
break;
case 8:
- switch (c->x86_mask) {
+ switch (c->x86_stepping) {
default:
name = "2";
break;
@@ -393,40 +166,12 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
fcr_set = ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK|
E2MMX|EAMD3D;
fcr_clr = DPDC;
-#ifdef CONFIG_X86_OOSTORE
- winchip2_unprotect_mcr();
- winchip2_create_optimal_mcr();
- rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
- /*
- * Enable:
- * write combining on non-stack, non-string
- * write combining on string, all types
- * weak write ordering
- */
- lo |= 31;
- wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
- winchip2_protect_mcr();
-#endif
break;
case 9:
name = "3";
fcr_set = ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK|
E2MMX|EAMD3D;
fcr_clr = DPDC;
-#ifdef CONFIG_X86_OOSTORE
- winchip2_unprotect_mcr();
- winchip2_create_optimal_mcr();
- rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
- /*
- * Enable:
- * write combining on non-stack, non-string
- * write combining on string, all types
- * weak write ordering
- */
- lo |= 31;
- wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
- winchip2_protect_mcr();
-#endif
break;
default:
name = "??";
@@ -436,11 +181,11 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
newlo = (lo|fcr_set) & (~fcr_clr);
if (newlo != lo) {
- printk(KERN_INFO "Centaur FCR was 0x%X now 0x%X\n",
+ pr_info("Centaur FCR was 0x%X now 0x%X\n",
lo, newlo);
wrmsr(MSR_IDT_FCR1, newlo, hi);
} else {
- printk(KERN_INFO "Centaur FCR is 0x%X\n", lo);
+ pr_info("Centaur FCR is 0x%X\n", lo);
}
/* Emulate MTRRs using Centaur's MCR. */
set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR);
@@ -457,21 +202,21 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
c->x86_cache_size = (cc>>24)+(dd>>24);
}
sprintf(c->x86_model_id, "WinChip %s", name);
- break;
+ }
#endif
- case 6:
+ if (c->x86 == 6 || c->x86 >= 7)
init_c3(c);
- break;
- }
#ifdef CONFIG_X86_64
set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
#endif
+
+ init_ia32_feat_ctl(c);
}
-static unsigned int __cpuinit
+#ifdef CONFIG_X86_32
+static unsigned int
centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
{
-#ifdef CONFIG_X86_32
/* VIA C3 CPUs (670-68F) need further shifting. */
if ((c->x86 == 6) && ((c->x86_model == 7) || (c->x86_model == 8)))
size >>= 8;
@@ -482,18 +227,20 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
* - Note, it seems this may only be in engineering samples.
*/
if ((c->x86 == 6) && (c->x86_model == 9) &&
- (c->x86_mask == 1) && (size == 65))
+ (c->x86_stepping == 1) && (size == 65))
size -= 1;
-#endif
return size;
}
+#endif
-static const struct cpu_dev __cpuinitconst centaur_cpu_dev = {
+static const struct cpu_dev centaur_cpu_dev = {
.c_vendor = "Centaur",
.c_ident = { "CentaurHauls" },
.c_early_init = early_init_centaur,
.c_init = init_centaur,
- .c_size_cache = centaur_size_cache,
+#ifdef CONFIG_X86_32
+ .legacy_cache_size = centaur_size_cache,
+#endif
.c_x86_vendor = X86_VENDOR_CENTAUR,
};
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 548bd039784e..e7ab22fce3b5 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1,69 +1,185 @@
-#include <linux/bootmem.h>
+// SPDX-License-Identifier: GPL-2.0-only
+/* cpu_feature_enabled() cannot be used this early */
+#define USE_EARLY_PGTABLE_L5
+
+#include <linux/memblock.h>
#include <linux/linkage.h>
#include <linux/bitops.h>
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/export.h>
+#include <linux/kvm_types.h>
#include <linux/percpu.h>
#include <linux/string.h>
+#include <linux/ctype.h>
#include <linux/delay.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/task.h>
+#include <linux/sched/smt.h>
#include <linux/init.h>
+#include <linux/kprobes.h>
#include <linux/kgdb.h>
+#include <linux/mem_encrypt.h>
#include <linux/smp.h>
+#include <linux/cpu.h>
#include <linux/io.h>
-
-#include <asm/stackprotector.h>
+#include <linux/syscore_ops.h>
+#include <linux/pgtable.h>
+#include <linux/stackprotector.h>
+#include <linux/utsname.h>
+#include <linux/efi.h>
+
+#include <asm/alternative.h>
+#include <asm/cmdline.h>
+#include <asm/cpuid/api.h>
#include <asm/perf_event.h>
#include <asm/mmu_context.h>
+#include <asm/doublefault.h>
#include <asm/archrandom.h>
#include <asm/hypervisor.h>
#include <asm/processor.h>
+#include <asm/tlbflush.h>
#include <asm/debugreg.h>
#include <asm/sections.h>
+#include <asm/vsyscall.h>
#include <linux/topology.h>
#include <linux/cpumask.h>
-#include <asm/pgtable.h>
#include <linux/atomic.h>
#include <asm/proto.h>
#include <asm/setup.h>
#include <asm/apic.h>
#include <asm/desc.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/api.h>
#include <asm/mtrr.h>
+#include <asm/hwcap2.h>
#include <linux/numa.h>
+#include <asm/numa.h>
#include <asm/asm.h>
+#include <asm/bugs.h>
#include <asm/cpu.h>
#include <asm/mce.h>
#include <asm/msr.h>
-#include <asm/pat.h>
+#include <asm/cacheinfo.h>
+#include <asm/memtype.h>
#include <asm/microcode.h>
-#include <asm/microcode_intel.h>
-
-#ifdef CONFIG_X86_LOCAL_APIC
+#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
+#include <asm/fred.h>
#include <asm/uv/uv.h>
-#endif
+#include <asm/ia32.h>
+#include <asm/set_memory.h>
+#include <asm/traps.h>
+#include <asm/sev.h>
+#include <asm/tdx.h>
+#include <asm/posted_intr.h>
+#include <asm/runtime-const.h>
#include "cpu.h"
-/* all of these masks are initialized in setup_cpu_local_masks() */
-cpumask_var_t cpu_initialized_mask;
-cpumask_var_t cpu_callout_mask;
-cpumask_var_t cpu_callin_mask;
+DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
+EXPORT_PER_CPU_SYMBOL(cpu_info);
+
+/* Used for modules: built-in code uses runtime constants */
+unsigned long USER_PTR_MAX;
+EXPORT_SYMBOL(USER_PTR_MAX);
+
+u32 elf_hwcap2 __read_mostly;
+
+/* Number of siblings per CPU package */
+unsigned int __max_threads_per_core __ro_after_init = 1;
+EXPORT_SYMBOL(__max_threads_per_core);
+
+unsigned int __max_dies_per_package __ro_after_init = 1;
+EXPORT_SYMBOL(__max_dies_per_package);
+
+unsigned int __max_logical_packages __ro_after_init = 1;
+EXPORT_SYMBOL(__max_logical_packages);
+
+unsigned int __num_cores_per_package __ro_after_init = 1;
+EXPORT_SYMBOL(__num_cores_per_package);
+
+unsigned int __num_threads_per_package __ro_after_init = 1;
+EXPORT_SYMBOL(__num_threads_per_package);
+
+static struct ppin_info {
+ int feature;
+ int msr_ppin_ctl;
+ int msr_ppin;
+} ppin_info[] = {
+ [X86_VENDOR_INTEL] = {
+ .feature = X86_FEATURE_INTEL_PPIN,
+ .msr_ppin_ctl = MSR_PPIN_CTL,
+ .msr_ppin = MSR_PPIN
+ },
+ [X86_VENDOR_AMD] = {
+ .feature = X86_FEATURE_AMD_PPIN,
+ .msr_ppin_ctl = MSR_AMD_PPIN_CTL,
+ .msr_ppin = MSR_AMD_PPIN
+ },
+};
-/* representing cpus for which sibling maps can be computed */
-cpumask_var_t cpu_sibling_setup_mask;
+static const struct x86_cpu_id ppin_cpuids[] = {
+ X86_MATCH_FEATURE(X86_FEATURE_AMD_PPIN, &ppin_info[X86_VENDOR_AMD]),
+ X86_MATCH_FEATURE(X86_FEATURE_INTEL_PPIN, &ppin_info[X86_VENDOR_INTEL]),
+
+ /* Legacy models without CPUID enumeration */
+ X86_MATCH_VFM(INTEL_IVYBRIDGE_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_HASWELL_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_BROADWELL_D, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_BROADWELL_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_SKYLAKE_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_ICELAKE_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_ICELAKE_D, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_XEON_PHI_KNL, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_XEON_PHI_KNM, &ppin_info[X86_VENDOR_INTEL]),
+
+ {}
+};
-/* correctly size the local cpu masks */
-void __init setup_cpu_local_masks(void)
+static void ppin_init(struct cpuinfo_x86 *c)
{
- alloc_bootmem_cpumask_var(&cpu_initialized_mask);
- alloc_bootmem_cpumask_var(&cpu_callin_mask);
- alloc_bootmem_cpumask_var(&cpu_callout_mask);
- alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
+ const struct x86_cpu_id *id;
+ unsigned long long val;
+ struct ppin_info *info;
+
+ id = x86_match_cpu(ppin_cpuids);
+ if (!id)
+ return;
+
+ /*
+ * Testing the presence of the MSR is not enough. Need to check
+ * that the PPIN_CTL allows reading of the PPIN.
+ */
+ info = (struct ppin_info *)id->driver_data;
+
+ if (rdmsrq_safe(info->msr_ppin_ctl, &val))
+ goto clear_ppin;
+
+ if ((val & 3UL) == 1UL) {
+ /* PPIN locked in disabled mode */
+ goto clear_ppin;
+ }
+
+ /* If PPIN is disabled, try to enable */
+ if (!(val & 2UL)) {
+ wrmsrq_safe(info->msr_ppin_ctl, val | 2UL);
+ rdmsrq_safe(info->msr_ppin_ctl, &val);
+ }
+
+ /* Is the enable bit set? */
+ if (val & 2UL) {
+ c->ppin = native_rdmsrq(info->msr_ppin);
+ set_cpu_cap(c, info->feature);
+ return;
+ }
+
+clear_ppin:
+ setup_clear_cpu_cap(info->feature);
}
-static void __cpuinit default_init(struct cpuinfo_x86 *c)
+static void default_init(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_64
cpu_detect_cache_sizes(c);
@@ -80,13 +196,13 @@ static void __cpuinit default_init(struct cpuinfo_x86 *c)
#endif
}
-static const struct cpu_dev __cpuinitconst default_cpu = {
+static const struct cpu_dev default_cpu = {
.c_init = default_init,
.c_vendor = "Unknown",
.c_x86_vendor = X86_VENDOR_UNKNOWN,
};
-static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
+static const struct cpu_dev *this_cpu = &default_cpu;
DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
#ifdef CONFIG_X86_64
@@ -98,97 +214,83 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
* TLS descriptors are currently at a different place compared to i386.
* Hopefully nobody expects them at a fixed place (Wine?)
*/
- [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff),
- [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff),
- [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff),
- [GDT_ENTRY_DEFAULT_USER32_CS] = GDT_ENTRY_INIT(0xc0fb, 0, 0xfffff),
- [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f3, 0, 0xfffff),
- [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xa0fb, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(DESC_CODE32, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(DESC_CODE64, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(DESC_DATA64, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER32_CS] = GDT_ENTRY_INIT(DESC_CODE32 | DESC_USER, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(DESC_DATA64 | DESC_USER, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(DESC_CODE64 | DESC_USER, 0, 0xfffff),
#else
- [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xc09a, 0, 0xfffff),
- [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
- [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xc0fa, 0, 0xfffff),
- [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f2, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(DESC_CODE32, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(DESC_DATA32, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(DESC_CODE32 | DESC_USER, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(DESC_DATA32 | DESC_USER, 0, 0xfffff),
/*
* Segments used for calling PnP BIOS have byte granularity.
* They code segments and data segments have fixed 64k limits,
* the transfer segment sizes are set at run time.
*/
- /* 32-bit code */
- [GDT_ENTRY_PNPBIOS_CS32] = GDT_ENTRY_INIT(0x409a, 0, 0xffff),
- /* 16-bit code */
- [GDT_ENTRY_PNPBIOS_CS16] = GDT_ENTRY_INIT(0x009a, 0, 0xffff),
- /* 16-bit data */
- [GDT_ENTRY_PNPBIOS_DS] = GDT_ENTRY_INIT(0x0092, 0, 0xffff),
- /* 16-bit data */
- [GDT_ENTRY_PNPBIOS_TS1] = GDT_ENTRY_INIT(0x0092, 0, 0),
- /* 16-bit data */
- [GDT_ENTRY_PNPBIOS_TS2] = GDT_ENTRY_INIT(0x0092, 0, 0),
+ [GDT_ENTRY_PNPBIOS_CS32] = GDT_ENTRY_INIT(DESC_CODE32_BIOS, 0, 0xffff),
+ [GDT_ENTRY_PNPBIOS_CS16] = GDT_ENTRY_INIT(DESC_CODE16, 0, 0xffff),
+ [GDT_ENTRY_PNPBIOS_DS] = GDT_ENTRY_INIT(DESC_DATA16, 0, 0xffff),
+ [GDT_ENTRY_PNPBIOS_TS1] = GDT_ENTRY_INIT(DESC_DATA16, 0, 0),
+ [GDT_ENTRY_PNPBIOS_TS2] = GDT_ENTRY_INIT(DESC_DATA16, 0, 0),
/*
* The APM segments have byte granularity and their bases
* are set at run time. All have 64k limits.
*/
- /* 32-bit code */
- [GDT_ENTRY_APMBIOS_BASE] = GDT_ENTRY_INIT(0x409a, 0, 0xffff),
- /* 16-bit code */
- [GDT_ENTRY_APMBIOS_BASE+1] = GDT_ENTRY_INIT(0x009a, 0, 0xffff),
- /* data */
- [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(0x4092, 0, 0xffff),
-
- [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
- [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
- GDT_STACK_CANARY_INIT
+ [GDT_ENTRY_APMBIOS_BASE] = GDT_ENTRY_INIT(DESC_CODE32_BIOS, 0, 0xffff),
+ [GDT_ENTRY_APMBIOS_BASE+1] = GDT_ENTRY_INIT(DESC_CODE16, 0, 0xffff),
+ [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(DESC_DATA32_BIOS, 0, 0xffff),
+
+ [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(DESC_DATA32, 0, 0xfffff),
+ [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(DESC_DATA32, 0, 0xfffff),
#endif
} };
EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
+SYM_PIC_ALIAS(gdt_page);
-static int __init x86_xsave_setup(char *s)
-{
- setup_clear_cpu_cap(X86_FEATURE_XSAVE);
- setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
- setup_clear_cpu_cap(X86_FEATURE_AVX);
- setup_clear_cpu_cap(X86_FEATURE_AVX2);
- return 1;
-}
-__setup("noxsave", x86_xsave_setup);
-
-static int __init x86_xsaveopt_setup(char *s)
+#ifdef CONFIG_X86_64
+static int __init x86_nopcid_setup(char *s)
{
- setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
- return 1;
-}
-__setup("noxsaveopt", x86_xsaveopt_setup);
+ /* nopcid doesn't accept parameters */
+ if (s)
+ return -EINVAL;
-#ifdef CONFIG_X86_32
-static int cachesize_override __cpuinitdata = -1;
-static int disable_x86_serial_nr __cpuinitdata = 1;
+ /* do not emit a message if the feature is not present */
+ if (!boot_cpu_has(X86_FEATURE_PCID))
+ return 0;
-static int __init cachesize_setup(char *str)
-{
- get_option(&str, &cachesize_override);
- return 1;
+ setup_clear_cpu_cap(X86_FEATURE_PCID);
+ pr_info("nopcid: PCID feature disabled\n");
+ return 0;
}
-__setup("cachesize=", cachesize_setup);
+early_param("nopcid", x86_nopcid_setup);
+#endif
-static int __init x86_fxsr_setup(char *s)
+static int __init x86_noinvpcid_setup(char *s)
{
- setup_clear_cpu_cap(X86_FEATURE_FXSR);
- setup_clear_cpu_cap(X86_FEATURE_XMM);
- return 1;
-}
-__setup("nofxsr", x86_fxsr_setup);
+ /* noinvpcid doesn't accept parameters */
+ if (s)
+ return -EINVAL;
-static int __init x86_sep_setup(char *s)
-{
- setup_clear_cpu_cap(X86_FEATURE_SEP);
- return 1;
+ /* do not emit a message if the feature is not present */
+ if (!boot_cpu_has(X86_FEATURE_INVPCID))
+ return 0;
+
+ setup_clear_cpu_cap(X86_FEATURE_INVPCID);
+ pr_info("noinvpcid: INVPCID feature disabled\n");
+ return 0;
}
-__setup("nosep", x86_sep_setup);
+early_param("noinvpcid", x86_noinvpcid_setup);
/* Standard macro to see if a specific flag is changeable */
-static inline int flag_is_changeable_p(u32 flag)
+static inline bool flag_is_changeable_p(unsigned long flag)
{
- u32 f1, f2;
+ unsigned long f1, f2;
+
+ if (!IS_ENABLED(CONFIG_X86_32))
+ return true;
/*
* Cyrix and IDT cpus allow disabling of CPUID
@@ -211,16 +313,27 @@ static inline int flag_is_changeable_p(u32 flag)
: "=&r" (f1), "=&r" (f2)
: "ir" (flag));
- return ((f1^f2) & flag) != 0;
+ return (f1 ^ f2) & flag;
+}
+
+#ifdef CONFIG_X86_32
+static int cachesize_override = -1;
+static int disable_x86_serial_nr = 1;
+
+static int __init cachesize_setup(char *str)
+{
+ get_option(&str, &cachesize_override);
+ return 1;
}
+__setup("cachesize=", cachesize_setup);
/* Probe for the CPUID instruction */
-int __cpuinit have_cpuid_p(void)
+bool cpuid_feature(void)
{
return flag_is_changeable_p(X86_EFLAGS_ID);
}
-static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
+static void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
{
unsigned long lo, hi;
@@ -233,7 +346,7 @@ static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
lo |= 0x200000;
wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
- printk(KERN_NOTICE "CPU serial number disabled.\n");
+ pr_notice("CPU serial number disabled.\n");
clear_cpu_cap(c, X86_FEATURE_PN);
/* Disabling the serial number may affect the cpuid level */
@@ -247,45 +360,298 @@ static int __init x86_serial_nr_setup(char *s)
}
__setup("serialnumber", x86_serial_nr_setup);
#else
-static inline int flag_is_changeable_p(u32 flag)
+static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
{
- return 1;
}
-static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
+#endif
+
+static __always_inline void setup_smep(struct cpuinfo_x86 *c)
+{
+ if (cpu_has(c, X86_FEATURE_SMEP))
+ cr4_set_bits(X86_CR4_SMEP);
+}
+
+static __always_inline void setup_smap(struct cpuinfo_x86 *c)
+{
+ unsigned long eflags = native_save_fl();
+
+ /* This should have been cleared long ago */
+ BUG_ON(eflags & X86_EFLAGS_AC);
+
+ if (cpu_has(c, X86_FEATURE_SMAP))
+ cr4_set_bits(X86_CR4_SMAP);
+}
+
+static __always_inline void setup_umip(struct cpuinfo_x86 *c)
+{
+ /* Check the boot processor, plus build option for UMIP. */
+ if (!cpu_feature_enabled(X86_FEATURE_UMIP))
+ goto out;
+
+ /* Check the current processor's cpuid bits. */
+ if (!cpu_has(c, X86_FEATURE_UMIP))
+ goto out;
+
+ cr4_set_bits(X86_CR4_UMIP);
+
+ pr_info_once("x86/cpu: User Mode Instruction Prevention (UMIP) activated\n");
+
+ return;
+
+out:
+ /*
+ * Make sure UMIP is disabled in case it was enabled in a
+ * previous boot (e.g., via kexec).
+ */
+ cr4_clear_bits(X86_CR4_UMIP);
+}
+
+static __always_inline void setup_lass(struct cpuinfo_x86 *c)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_LASS))
+ return;
+
+ /*
+ * Legacy vsyscall page access causes a #GP when LASS is active.
+ * Disable LASS because the #GP handler doesn't support vsyscall
+ * emulation.
+ *
+ * Also disable LASS when running under EFI, as some runtime and
+ * boot services rely on 1:1 mappings in the lower half.
+ */
+ if (IS_ENABLED(CONFIG_X86_VSYSCALL_EMULATION) ||
+ IS_ENABLED(CONFIG_EFI)) {
+ setup_clear_cpu_cap(X86_FEATURE_LASS);
+ return;
+ }
+
+ cr4_set_bits(X86_CR4_LASS);
+}
+
+/* These bits should not change their value after CPU init is finished. */
+static const unsigned long cr4_pinned_mask = X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP |
+ X86_CR4_FSGSBASE | X86_CR4_CET | X86_CR4_FRED;
+static DEFINE_STATIC_KEY_FALSE_RO(cr_pinning);
+static unsigned long cr4_pinned_bits __ro_after_init;
+
+void native_write_cr0(unsigned long val)
+{
+ unsigned long bits_missing = 0;
+
+set_register:
+ asm volatile("mov %0,%%cr0": "+r" (val) : : "memory");
+
+ if (static_branch_likely(&cr_pinning)) {
+ if (unlikely((val & X86_CR0_WP) != X86_CR0_WP)) {
+ bits_missing = X86_CR0_WP;
+ val |= bits_missing;
+ goto set_register;
+ }
+ /* Warn after we've set the missing bits. */
+ WARN_ONCE(bits_missing, "CR0 WP bit went missing!?\n");
+ }
+}
+EXPORT_SYMBOL(native_write_cr0);
+
+void __no_profile native_write_cr4(unsigned long val)
{
+ unsigned long bits_changed = 0;
+
+set_register:
+ asm volatile("mov %0,%%cr4": "+r" (val) : : "memory");
+
+ if (static_branch_likely(&cr_pinning)) {
+ if (unlikely((val & cr4_pinned_mask) != cr4_pinned_bits)) {
+ bits_changed = (val & cr4_pinned_mask) ^ cr4_pinned_bits;
+ val = (val & ~cr4_pinned_mask) | cr4_pinned_bits;
+ goto set_register;
+ }
+ /* Warn after we've corrected the changed bits. */
+ WARN_ONCE(bits_changed, "pinned CR4 bits changed: 0x%lx!?\n",
+ bits_changed);
+ }
}
+#if IS_MODULE(CONFIG_LKDTM)
+EXPORT_SYMBOL_GPL(native_write_cr4);
#endif
-static __init int setup_disable_smep(char *arg)
+void cr4_update_irqsoff(unsigned long set, unsigned long clear)
{
- setup_clear_cpu_cap(X86_FEATURE_SMEP);
+ unsigned long newval, cr4 = this_cpu_read(cpu_tlbstate.cr4);
+
+ lockdep_assert_irqs_disabled();
+
+ newval = (cr4 & ~clear) | set;
+ if (newval != cr4) {
+ this_cpu_write(cpu_tlbstate.cr4, newval);
+ __write_cr4(newval);
+ }
+}
+EXPORT_SYMBOL_FOR_KVM(cr4_update_irqsoff);
+
+/* Read the CR4 shadow. */
+unsigned long cr4_read_shadow(void)
+{
+ return this_cpu_read(cpu_tlbstate.cr4);
+}
+EXPORT_SYMBOL_FOR_KVM(cr4_read_shadow);
+
+void cr4_init(void)
+{
+ unsigned long cr4 = __read_cr4();
+
+ if (boot_cpu_has(X86_FEATURE_PCID))
+ cr4 |= X86_CR4_PCIDE;
+ if (static_branch_likely(&cr_pinning))
+ cr4 = (cr4 & ~cr4_pinned_mask) | cr4_pinned_bits;
+
+ __write_cr4(cr4);
+
+ /* Initialize cr4 shadow for this CPU. */
+ this_cpu_write(cpu_tlbstate.cr4, cr4);
+}
+
+/*
+ * Once CPU feature detection is finished (and boot params have been
+ * parsed), record any of the sensitive CR bits that are set, and
+ * enable CR pinning.
+ */
+static void __init setup_cr_pinning(void)
+{
+ cr4_pinned_bits = this_cpu_read(cpu_tlbstate.cr4) & cr4_pinned_mask;
+ static_key_enable(&cr_pinning.key);
+}
+
+static __init int x86_nofsgsbase_setup(char *arg)
+{
+ /* Require an exact match without trailing characters. */
+ if (strlen(arg))
+ return 0;
+
+ /* Do not emit a message if the feature is not present. */
+ if (!boot_cpu_has(X86_FEATURE_FSGSBASE))
+ return 1;
+
+ setup_clear_cpu_cap(X86_FEATURE_FSGSBASE);
+ pr_info("FSGSBASE disabled via kernel command line\n");
return 1;
}
-__setup("nosmep", setup_disable_smep);
+__setup("nofsgsbase", x86_nofsgsbase_setup);
-static __always_inline void setup_smep(struct cpuinfo_x86 *c)
+/*
+ * Protection Keys are not available in 32-bit mode.
+ */
+static bool pku_disabled;
+
+static __always_inline void setup_pku(struct cpuinfo_x86 *c)
{
- if (cpu_has(c, X86_FEATURE_SMEP))
- set_in_cr4(X86_CR4_SMEP);
+ if (c == &boot_cpu_data) {
+ if (pku_disabled || !cpu_feature_enabled(X86_FEATURE_PKU))
+ return;
+ /*
+ * Setting CR4.PKE will cause the X86_FEATURE_OSPKE cpuid
+ * bit to be set. Enforce it.
+ */
+ setup_force_cpu_cap(X86_FEATURE_OSPKE);
+
+ } else if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) {
+ return;
+ }
+
+ cr4_set_bits(X86_CR4_PKE);
+ /* Load the default PKRU value */
+ pkru_write_default();
}
-static __init int setup_disable_smap(char *arg)
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+static __init int setup_disable_pku(char *arg)
{
- setup_clear_cpu_cap(X86_FEATURE_SMAP);
+ /*
+ * Do not clear the X86_FEATURE_PKU bit. All of the
+ * runtime checks are against OSPKE so clearing the
+ * bit does nothing.
+ *
+ * This way, we will see "pku" in cpuinfo, but not
+ * "ospke", which is exactly what we want. It shows
+ * that the CPU has PKU, but the OS has not enabled it.
+ * This happens to be exactly how a system would look
+ * if we disabled the config option.
+ */
+ pr_info("x86: 'nopku' specified, disabling Memory Protection Keys\n");
+ pku_disabled = true;
return 1;
}
-__setup("nosmap", setup_disable_smap);
+__setup("nopku", setup_disable_pku);
+#endif
-static __always_inline void setup_smap(struct cpuinfo_x86 *c)
+#ifdef CONFIG_X86_KERNEL_IBT
+
+__noendbr u64 ibt_save(bool disable)
{
- unsigned long eflags;
+ u64 msr = 0;
- /* This should have been cleared long ago */
- raw_local_save_flags(eflags);
- BUG_ON(eflags & X86_EFLAGS_AC);
+ if (cpu_feature_enabled(X86_FEATURE_IBT)) {
+ rdmsrq(MSR_IA32_S_CET, msr);
+ if (disable)
+ wrmsrq(MSR_IA32_S_CET, msr & ~CET_ENDBR_EN);
+ }
- if (cpu_has(c, X86_FEATURE_SMAP))
- set_in_cr4(X86_CR4_SMAP);
+ return msr;
+}
+
+__noendbr void ibt_restore(u64 save)
+{
+ u64 msr;
+
+ if (cpu_feature_enabled(X86_FEATURE_IBT)) {
+ rdmsrq(MSR_IA32_S_CET, msr);
+ msr &= ~CET_ENDBR_EN;
+ msr |= (save & CET_ENDBR_EN);
+ wrmsrq(MSR_IA32_S_CET, msr);
+ }
+}
+
+#endif
+
+static __always_inline void setup_cet(struct cpuinfo_x86 *c)
+{
+ bool user_shstk, kernel_ibt;
+
+ if (!IS_ENABLED(CONFIG_X86_CET))
+ return;
+
+ kernel_ibt = HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT);
+ user_shstk = cpu_feature_enabled(X86_FEATURE_SHSTK) &&
+ IS_ENABLED(CONFIG_X86_USER_SHADOW_STACK);
+
+ if (!kernel_ibt && !user_shstk)
+ return;
+
+ if (user_shstk)
+ set_cpu_cap(c, X86_FEATURE_USER_SHSTK);
+
+ if (kernel_ibt)
+ wrmsrq(MSR_IA32_S_CET, CET_ENDBR_EN);
+ else
+ wrmsrq(MSR_IA32_S_CET, 0);
+
+ cr4_set_bits(X86_CR4_CET);
+
+ if (kernel_ibt && ibt_selftest()) {
+ pr_err("IBT selftest: Failed!\n");
+ wrmsrq(MSR_IA32_S_CET, 0);
+ setup_clear_cpu_cap(X86_FEATURE_IBT);
+ }
+}
+
+__noendbr void cet_disable(void)
+{
+ if (!(cpu_feature_enabled(X86_FEATURE_IBT) ||
+ cpu_feature_enabled(X86_FEATURE_SHSTK)))
+ return;
+
+ wrmsrq(MSR_IA32_S_CET, 0);
+ wrmsrq(MSR_IA32_U_CET, 0);
}
/*
@@ -298,15 +664,15 @@ struct cpuid_dependent_feature {
u32 level;
};
-static const struct cpuid_dependent_feature __cpuinitconst
+static const struct cpuid_dependent_feature
cpuid_dependent_features[] = {
- { X86_FEATURE_MWAIT, 0x00000005 },
- { X86_FEATURE_DCA, 0x00000009 },
- { X86_FEATURE_XSAVE, 0x0000000d },
+ { X86_FEATURE_MWAIT, CPUID_LEAF_MWAIT },
+ { X86_FEATURE_DCA, CPUID_LEAF_DCA },
+ { X86_FEATURE_XSAVE, CPUID_LEAF_XSTATE },
{ 0, 0 }
};
-static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
+static void filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
{
const struct cpuid_dependent_feature *df;
@@ -330,9 +696,8 @@ static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
if (!warn)
continue;
- printk(KERN_WARNING
- "CPU: CPU feature %s disabled, no CPUID level 0x%x\n",
- x86_cap_flags[df->feature], df->level);
+ pr_warn("CPU: CPU feature %s disabled, no CPUID level 0x%x\n",
+ x86_cap_flags[df->feature], df->level);
}
}
@@ -344,9 +709,10 @@ static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
*/
/* Look up CPU names by table lookup. */
-static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c)
+static const char *table_lookup_model(struct cpuinfo_x86 *c)
{
- const struct cpu_model_info *info;
+#ifdef CONFIG_X86_32
+ const struct legacy_cpu_model_info *info;
if (c->x86_model >= 16)
return NULL; /* Range check */
@@ -354,52 +720,95 @@ static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c)
if (!this_cpu)
return NULL;
- info = this_cpu->c_models;
+ info = this_cpu->legacy_models;
- while (info && info->family) {
+ while (info->family) {
if (info->family == c->x86)
return info->model_names[c->x86_model];
info++;
}
+#endif
return NULL; /* Not found */
}
-__u32 cpu_caps_cleared[NCAPINTS] __cpuinitdata;
-__u32 cpu_caps_set[NCAPINTS] __cpuinitdata;
+/* Aligned to unsigned long to avoid split lock in atomic bitmap ops */
+__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long));
+__u32 cpu_caps_set[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long));
-void load_percpu_segment(int cpu)
-{
#ifdef CONFIG_X86_32
- loadsegment(fs, __KERNEL_PERCPU);
-#else
- loadsegment(gs, 0);
- wrmsrl(MSR_GS_BASE, (unsigned long)per_cpu(irq_stack_union.gs_base, cpu));
+/* The 32-bit entry code needs to find cpu_entry_area. */
+DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
#endif
- load_stack_canary_segment();
+
+/* Load the original GDT from the per-cpu structure */
+void load_direct_gdt(int cpu)
+{
+ struct desc_ptr gdt_descr;
+
+ gdt_descr.address = (long)get_cpu_gdt_rw(cpu);
+ gdt_descr.size = GDT_SIZE - 1;
+ load_gdt(&gdt_descr);
}
+EXPORT_SYMBOL_FOR_KVM(load_direct_gdt);
-/*
- * Current gdt points %fs at the "master" per-cpu area: after this,
- * it's on the real one.
- */
-void switch_to_new_gdt(int cpu)
+/* Load a fixmap remapping of the per-cpu GDT */
+void load_fixmap_gdt(int cpu)
{
struct desc_ptr gdt_descr;
- gdt_descr.address = (long)get_cpu_gdt_table(cpu);
+ gdt_descr.address = (long)get_cpu_gdt_ro(cpu);
gdt_descr.size = GDT_SIZE - 1;
load_gdt(&gdt_descr);
- /* Reload the per-cpu base */
+}
+EXPORT_SYMBOL_GPL(load_fixmap_gdt);
+
+/**
+ * switch_gdt_and_percpu_base - Switch to direct GDT and runtime per CPU base
+ * @cpu: The CPU number for which this is invoked
+ *
+ * Invoked during early boot to switch from early GDT and early per CPU to
+ * the direct GDT and the runtime per CPU area. On 32-bit the percpu base
+ * switch is implicit by loading the direct GDT. On 64bit this requires
+ * to update GSBASE.
+ */
+void __init switch_gdt_and_percpu_base(int cpu)
+{
+ load_direct_gdt(cpu);
- load_percpu_segment(cpu);
+#ifdef CONFIG_X86_64
+ /*
+ * No need to load %gs. It is already correct.
+ *
+ * Writing %gs on 64bit would zero GSBASE which would make any per
+ * CPU operation up to the point of the wrmsrq() fault.
+ *
+ * Set GSBASE to the new offset. Until the wrmsrq() happens the
+ * early mapping is still valid. That means the GSBASE update will
+ * lose any prior per CPU data which was not copied over in
+ * setup_per_cpu_areas().
+ *
+ * This works even with stackprotector enabled because the
+ * per CPU stack canary is 0 in both per CPU areas.
+ */
+ wrmsrq(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu));
+#else
+ /*
+ * %fs is already set to __KERNEL_PERCPU, but after switching GDT
+ * it is required to load FS again so that the 'hidden' part is
+ * updated from the new GDT. Up to this point the early per CPU
+ * translation is active. Any content of the early per CPU data
+ * which was not copied over in setup_per_cpu_areas() is lost.
+ */
+ loadsegment(fs, __KERNEL_PERCPU);
+#endif
}
-static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {};
+static const struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
-static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
+static void get_model_name(struct cpuinfo_x86 *c)
{
unsigned int *v;
- char *p, *q;
+ char *p, *q, *s;
if (c->extended_cpuid_level < 0x80000004)
return;
@@ -410,22 +819,24 @@ static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
c->x86_model_id[48] = 0;
- /*
- * Intel chips right-justify this string for some dumb reason;
- * undo that brain damage:
- */
- p = q = &c->x86_model_id[0];
+ /* Trim whitespace */
+ p = q = s = &c->x86_model_id[0];
+
while (*p == ' ')
p++;
- if (p != q) {
- while (*p)
- *q++ = *p++;
- while (q <= &c->x86_model_id[48])
- *q++ = '\0'; /* Zero-pad the rest */
+
+ while (*p) {
+ /* Note the last non-whitespace index */
+ if (!isspace(*p))
+ s = q;
+
+ *q++ = *p++;
}
+
+ *(s + 1) = '\0';
}
-void __cpuinit cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
+void cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
{
unsigned int n, dummy, ebx, ecx, edx, l2size;
@@ -450,8 +861,8 @@ void __cpuinit cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
#else
/* do processor-specific cache resizing */
- if (this_cpu->c_size_cache)
- l2size = this_cpu->c_size_cache(c, l2size);
+ if (this_cpu->legacy_cache_size)
+ l2size = this_cpu->legacy_cache_size(c, l2size);
/* Allow user to override all this if necessary. */
if (cachesize_override != -1)
@@ -464,87 +875,27 @@ void __cpuinit cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
c->x86_cache_size = l2size;
}
-u16 __read_mostly tlb_lli_4k[NR_INFO];
-u16 __read_mostly tlb_lli_2m[NR_INFO];
-u16 __read_mostly tlb_lli_4m[NR_INFO];
-u16 __read_mostly tlb_lld_4k[NR_INFO];
-u16 __read_mostly tlb_lld_2m[NR_INFO];
-u16 __read_mostly tlb_lld_4m[NR_INFO];
-
-/*
- * tlb_flushall_shift shows the balance point in replacing cr3 write
- * with multiple 'invlpg'. It will do this replacement when
- * flush_tlb_lines <= active_lines/2^tlb_flushall_shift.
- * If tlb_flushall_shift is -1, means the replacement will be disabled.
- */
-s8 __read_mostly tlb_flushall_shift = -1;
+u16 __read_mostly tlb_lli_4k;
+u16 __read_mostly tlb_lli_2m;
+u16 __read_mostly tlb_lli_4m;
+u16 __read_mostly tlb_lld_4k;
+u16 __read_mostly tlb_lld_2m;
+u16 __read_mostly tlb_lld_4m;
+u16 __read_mostly tlb_lld_1g;
-void __cpuinit cpu_detect_tlb(struct cpuinfo_x86 *c)
+static void cpu_detect_tlb(struct cpuinfo_x86 *c)
{
if (this_cpu->c_detect_tlb)
this_cpu->c_detect_tlb(c);
- printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \
- "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \
- "tlb_flushall_shift: %d\n",
- tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES],
- tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES],
- tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES],
- tlb_flushall_shift);
-}
-
-void __cpuinit detect_ht(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_X86_HT
- u32 eax, ebx, ecx, edx;
- int index_msb, core_bits;
- static bool printed;
+ pr_info("Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n",
+ tlb_lli_4k, tlb_lli_2m, tlb_lli_4m);
- if (!cpu_has(c, X86_FEATURE_HT))
- return;
-
- if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
- goto out;
-
- if (cpu_has(c, X86_FEATURE_XTOPOLOGY))
- return;
-
- cpuid(1, &eax, &ebx, &ecx, &edx);
-
- smp_num_siblings = (ebx & 0xff0000) >> 16;
-
- if (smp_num_siblings == 1) {
- printk_once(KERN_INFO "CPU0: Hyper-Threading is disabled\n");
- goto out;
- }
-
- if (smp_num_siblings <= 1)
- goto out;
-
- index_msb = get_count_order(smp_num_siblings);
- c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);
-
- smp_num_siblings = smp_num_siblings / c->x86_max_cores;
-
- index_msb = get_count_order(smp_num_siblings);
-
- core_bits = get_count_order(c->x86_max_cores);
-
- c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) &
- ((1 << core_bits) - 1);
-
-out:
- if (!printed && (c->x86_max_cores * smp_num_siblings) > 1) {
- printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
- c->phys_proc_id);
- printk(KERN_INFO "CPU: Processor Core ID: %d\n",
- c->cpu_core_id);
- printed = 1;
- }
-#endif
+ pr_info("Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n",
+ tlb_lld_4k, tlb_lld_2m, tlb_lld_4m, tlb_lld_1g);
}
-static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
+void get_cpu_vendor(struct cpuinfo_x86 *c)
{
char *v = c->x86_vendor_id;
int i;
@@ -563,15 +914,14 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
}
}
- printk_once(KERN_ERR
- "CPU: vendor_id '%s' unknown, using generic init.\n" \
- "CPU: Your system may be unstable.\n", v);
+ pr_err_once("CPU: vendor_id '%s' unknown, using generic init.\n" \
+ "CPU: Your system may be unstable.\n", v);
c->x86_vendor = X86_VENDOR_UNKNOWN;
this_cpu = &default_cpu;
}
-void __cpuinit cpu_detect(struct cpuinfo_x86 *c)
+void cpu_detect(struct cpuinfo_x86 *c)
{
/* Get vendor name */
cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
@@ -585,14 +935,9 @@ void __cpuinit cpu_detect(struct cpuinfo_x86 *c)
u32 junk, tfms, cap0, misc;
cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
- c->x86 = (tfms >> 8) & 0xf;
- c->x86_model = (tfms >> 4) & 0xf;
- c->x86_mask = tfms & 0xf;
-
- if (c->x86 == 0xf)
- c->x86 += (tfms >> 20) & 0xff;
- if (c->x86 >= 0x6)
- c->x86_model += ((tfms >> 16) & 0xf) << 4;
+ c->x86 = x86_family(tfms);
+ c->x86_model = x86_model(tfms);
+ c->x86_stepping = x86_stepping(tfms);
if (cap0 & (1<<19)) {
c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
@@ -601,60 +946,172 @@ void __cpuinit cpu_detect(struct cpuinfo_x86 *c)
}
}
-void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
+static void apply_forced_caps(struct cpuinfo_x86 *c)
+{
+ int i;
+
+ for (i = 0; i < NCAPINTS + NBUGINTS; i++) {
+ c->x86_capability[i] &= ~cpu_caps_cleared[i];
+ c->x86_capability[i] |= cpu_caps_set[i];
+ }
+}
+
+static void init_speculation_control(struct cpuinfo_x86 *c)
{
- u32 tfms, xlvl;
- u32 ebx;
+ /*
+ * The Intel SPEC_CTRL CPUID bit implies IBRS and IBPB support,
+ * and they also have a different bit for STIBP support. Also,
+ * a hypervisor might have set the individual AMD bits even on
+ * Intel CPUs, for finer-grained selection of what's available.
+ */
+ if (cpu_has(c, X86_FEATURE_SPEC_CTRL)) {
+ set_cpu_cap(c, X86_FEATURE_IBRS);
+ set_cpu_cap(c, X86_FEATURE_IBPB);
+ set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL);
+ }
+
+ if (cpu_has(c, X86_FEATURE_INTEL_STIBP))
+ set_cpu_cap(c, X86_FEATURE_STIBP);
+
+ if (cpu_has(c, X86_FEATURE_SPEC_CTRL_SSBD) ||
+ cpu_has(c, X86_FEATURE_VIRT_SSBD))
+ set_cpu_cap(c, X86_FEATURE_SSBD);
+
+ if (cpu_has(c, X86_FEATURE_AMD_IBRS)) {
+ set_cpu_cap(c, X86_FEATURE_IBRS);
+ set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL);
+ }
+
+ if (cpu_has(c, X86_FEATURE_AMD_IBPB))
+ set_cpu_cap(c, X86_FEATURE_IBPB);
+
+ if (cpu_has(c, X86_FEATURE_AMD_STIBP)) {
+ set_cpu_cap(c, X86_FEATURE_STIBP);
+ set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL);
+ }
+
+ if (cpu_has(c, X86_FEATURE_AMD_SSBD)) {
+ set_cpu_cap(c, X86_FEATURE_SSBD);
+ set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL);
+ clear_cpu_cap(c, X86_FEATURE_VIRT_SSBD);
+ }
+}
+
+void get_cpu_cap(struct cpuinfo_x86 *c)
+{
+ u32 eax, ebx, ecx, edx;
/* Intel-defined flags: level 0x00000001 */
if (c->cpuid_level >= 0x00000001) {
- u32 capability, excap;
+ cpuid(0x00000001, &eax, &ebx, &ecx, &edx);
- cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
- c->x86_capability[0] = capability;
- c->x86_capability[4] = excap;
+ c->x86_capability[CPUID_1_ECX] = ecx;
+ c->x86_capability[CPUID_1_EDX] = edx;
}
+ /* Thermal and Power Management Leaf: level 0x00000006 (eax) */
+ if (c->cpuid_level >= 0x00000006)
+ c->x86_capability[CPUID_6_EAX] = cpuid_eax(0x00000006);
+
/* Additional Intel-defined flags: level 0x00000007 */
if (c->cpuid_level >= 0x00000007) {
- u32 eax, ebx, ecx, edx;
-
cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
-
- c->x86_capability[9] = ebx;
+ c->x86_capability[CPUID_7_0_EBX] = ebx;
+ c->x86_capability[CPUID_7_ECX] = ecx;
+ c->x86_capability[CPUID_7_EDX] = edx;
+
+ /* Check valid sub-leaf index before accessing it */
+ if (eax >= 1) {
+ cpuid_count(0x00000007, 1, &eax, &ebx, &ecx, &edx);
+ c->x86_capability[CPUID_7_1_EAX] = eax;
+ }
}
- /* AMD-defined flags: level 0x80000001 */
- xlvl = cpuid_eax(0x80000000);
- c->extended_cpuid_level = xlvl;
+ /* Extended state features: level 0x0000000d */
+ if (c->cpuid_level >= 0x0000000d) {
+ cpuid_count(0x0000000d, 1, &eax, &ebx, &ecx, &edx);
- if ((xlvl & 0xffff0000) == 0x80000000) {
- if (xlvl >= 0x80000001) {
- c->x86_capability[1] = cpuid_edx(0x80000001);
- c->x86_capability[6] = cpuid_ecx(0x80000001);
- }
+ c->x86_capability[CPUID_D_1_EAX] = eax;
}
- if (c->extended_cpuid_level >= 0x80000008) {
- u32 eax = cpuid_eax(0x80000008);
+ /*
+ * Check if extended CPUID leaves are implemented: Max extended
+ * CPUID leaf must be in the 0x80000001-0x8000ffff range.
+ */
+ eax = cpuid_eax(0x80000000);
+ c->extended_cpuid_level = ((eax & 0xffff0000) == 0x80000000) ? eax : 0;
- c->x86_virt_bits = (eax >> 8) & 0xff;
- c->x86_phys_bits = eax & 0xff;
+ if (c->extended_cpuid_level >= 0x80000001) {
+ cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
+
+ c->x86_capability[CPUID_8000_0001_ECX] = ecx;
+ c->x86_capability[CPUID_8000_0001_EDX] = edx;
}
-#ifdef CONFIG_X86_32
- else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36))
- c->x86_phys_bits = 36;
-#endif
if (c->extended_cpuid_level >= 0x80000007)
c->x86_power = cpuid_edx(0x80000007);
+ if (c->extended_cpuid_level >= 0x80000008) {
+ cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
+ c->x86_capability[CPUID_8000_0008_EBX] = ebx;
+ }
+
+ if (c->extended_cpuid_level >= 0x8000000a)
+ c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a);
+
+ if (c->extended_cpuid_level >= 0x8000001f)
+ c->x86_capability[CPUID_8000_001F_EAX] = cpuid_eax(0x8000001f);
+
+ if (c->extended_cpuid_level >= 0x80000021)
+ c->x86_capability[CPUID_8000_0021_EAX] = cpuid_eax(0x80000021);
+
init_scattered_cpuid_features(c);
+ init_speculation_control(c);
+
+ /*
+ * Clear/Set all flags overridden by options, after probe.
+ * This needs to happen each time we re-probe, which may happen
+ * several times during CPU initialization.
+ */
+ apply_forced_caps(c);
}
-static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
+void get_cpu_address_sizes(struct cpuinfo_x86 *c)
+{
+ u32 eax, ebx, ecx, edx;
+
+ if (!cpu_has(c, X86_FEATURE_CPUID) ||
+ (c->extended_cpuid_level < 0x80000008)) {
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ c->x86_clflush_size = 64;
+ c->x86_phys_bits = 36;
+ c->x86_virt_bits = 48;
+ } else {
+ c->x86_clflush_size = 32;
+ c->x86_virt_bits = 32;
+ c->x86_phys_bits = 32;
+
+ if (cpu_has(c, X86_FEATURE_PAE) ||
+ cpu_has(c, X86_FEATURE_PSE36))
+ c->x86_phys_bits = 36;
+ }
+ } else {
+ cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
+
+ c->x86_virt_bits = (eax >> 8) & 0xff;
+ c->x86_phys_bits = eax & 0xff;
+
+ /* Provide a sane default if not enumerated: */
+ if (!c->x86_clflush_size)
+ c->x86_clflush_size = 32;
+ }
+
+ c->x86_cache_bits = c->x86_phys_bits;
+ c->x86_cache_alignment = c->x86_clflush_size;
+}
+
+static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_X86_32
int i;
/*
@@ -675,7 +1132,629 @@ static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
break;
}
}
+}
+
+#define NO_SPECULATION BIT(0)
+#define NO_MELTDOWN BIT(1)
+#define NO_SSB BIT(2)
+#define NO_L1TF BIT(3)
+#define NO_MDS BIT(4)
+#define MSBDS_ONLY BIT(5)
+#define NO_SWAPGS BIT(6)
+#define NO_ITLB_MULTIHIT BIT(7)
+#define NO_SPECTRE_V2 BIT(8)
+#define NO_MMIO BIT(9)
+#define NO_EIBRS_PBRSB BIT(10)
+#define NO_BHI BIT(11)
+
+#define VULNWL(vendor, family, model, whitelist) \
+ X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, whitelist)
+
+#define VULNWL_INTEL(vfm, whitelist) \
+ X86_MATCH_VFM(vfm, whitelist)
+
+#define VULNWL_AMD(family, whitelist) \
+ VULNWL(AMD, family, X86_MODEL_ANY, whitelist)
+
+#define VULNWL_HYGON(family, whitelist) \
+ VULNWL(HYGON, family, X86_MODEL_ANY, whitelist)
+
+static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
+ VULNWL(ANY, 4, X86_MODEL_ANY, NO_SPECULATION),
+ VULNWL(CENTAUR, 5, X86_MODEL_ANY, NO_SPECULATION),
+ VULNWL(INTEL, 5, X86_MODEL_ANY, NO_SPECULATION),
+ VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION),
+ VULNWL(VORTEX, 5, X86_MODEL_ANY, NO_SPECULATION),
+ VULNWL(VORTEX, 6, X86_MODEL_ANY, NO_SPECULATION),
+
+ /* Intel Family 6 */
+ VULNWL_INTEL(INTEL_TIGERLAKE, NO_MMIO),
+ VULNWL_INTEL(INTEL_TIGERLAKE_L, NO_MMIO),
+ VULNWL_INTEL(INTEL_ALDERLAKE, NO_MMIO),
+ VULNWL_INTEL(INTEL_ALDERLAKE_L, NO_MMIO),
+
+ VULNWL_INTEL(INTEL_ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_BONNELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_BONNELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
+
+ VULNWL_INTEL(INTEL_ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SILVERMONT_D, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+
+ VULNWL_INTEL(INTEL_CORE_YONAH, NO_SSB),
+
+ VULNWL_INTEL(INTEL_ATOM_SILVERMONT_MID2,NO_SSB | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | MSBDS_ONLY),
+ VULNWL_INTEL(INTEL_ATOM_AIRMONT_NP, NO_SSB | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
+
+ VULNWL_INTEL(INTEL_ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
+ VULNWL_INTEL(INTEL_ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
+ VULNWL_INTEL(INTEL_ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB),
+
+ /*
+ * Technically, swapgs isn't serializing on AMD (despite it previously
+ * being documented as such in the APM). But according to AMD, %gs is
+ * updated non-speculatively, and the issuing of %gs-relative memory
+ * operands will be blocked until the %gs update completes, which is
+ * good enough for our purposes.
+ */
+
+ VULNWL_INTEL(INTEL_ATOM_TREMONT, NO_EIBRS_PBRSB),
+ VULNWL_INTEL(INTEL_ATOM_TREMONT_L, NO_EIBRS_PBRSB),
+ VULNWL_INTEL(INTEL_ATOM_TREMONT_D, NO_ITLB_MULTIHIT | NO_EIBRS_PBRSB),
+
+ /* AMD Family 0xf - 0x12 */
+ VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI),
+ VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI),
+ VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI),
+ VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI),
+
+ /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */
+ VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB | NO_BHI),
+ VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB | NO_BHI),
+
+ /* Zhaoxin Family 7 */
+ VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO | NO_BHI),
+ VULNWL(ZHAOXIN, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO | NO_BHI),
+ {}
+};
+
+#define VULNBL(vendor, family, model, blacklist) \
+ X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, blacklist)
+
+#define VULNBL_INTEL_STEPS(vfm, max_stepping, issues) \
+ X86_MATCH_VFM_STEPS(vfm, X86_STEP_MIN, max_stepping, issues)
+
+#define VULNBL_INTEL_TYPE(vfm, cpu_type, issues) \
+ X86_MATCH_VFM_CPU_TYPE(vfm, INTEL_CPU_TYPE_##cpu_type, issues)
+
+#define VULNBL_AMD(family, blacklist) \
+ VULNBL(AMD, family, X86_MODEL_ANY, blacklist)
+
+#define VULNBL_HYGON(family, blacklist) \
+ VULNBL(HYGON, family, X86_MODEL_ANY, blacklist)
+
+#define SRBDS BIT(0)
+/* CPU is affected by X86_BUG_MMIO_STALE_DATA */
+#define MMIO BIT(1)
+/* CPU is affected by Shared Buffers Data Sampling (SBDS), a variant of X86_BUG_MMIO_STALE_DATA */
+#define MMIO_SBDS BIT(2)
+/* CPU is affected by RETbleed, speculating where you would not expect it */
+#define RETBLEED BIT(3)
+/* CPU is affected by SMT (cross-thread) return predictions */
+#define SMT_RSB BIT(4)
+/* CPU is affected by SRSO */
+#define SRSO BIT(5)
+/* CPU is affected by GDS */
+#define GDS BIT(6)
+/* CPU is affected by Register File Data Sampling */
+#define RFDS BIT(7)
+/* CPU is affected by Indirect Target Selection */
+#define ITS BIT(8)
+/* CPU is affected by Indirect Target Selection, but guest-host isolation is not affected */
+#define ITS_NATIVE_ONLY BIT(9)
+/* CPU is affected by Transient Scheduler Attacks */
+#define TSA BIT(10)
+/* CPU is affected by VMSCAPE */
+#define VMSCAPE BIT(11)
+
+static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
+ VULNBL_INTEL_STEPS(INTEL_SANDYBRIDGE_X, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_SANDYBRIDGE, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_IVYBRIDGE_X, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_IVYBRIDGE, X86_STEP_MAX, SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_HASWELL, X86_STEP_MAX, SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_HASWELL_L, X86_STEP_MAX, SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_HASWELL_G, X86_STEP_MAX, SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_HASWELL_X, X86_STEP_MAX, MMIO | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_BROADWELL_D, X86_STEP_MAX, MMIO | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_BROADWELL_X, X86_STEP_MAX, MMIO | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_BROADWELL_G, X86_STEP_MAX, SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_BROADWELL, X86_STEP_MAX, SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, 0x5, MMIO | RETBLEED | GDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, X86_STEP_MAX, MMIO | RETBLEED | GDS | ITS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_SKYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_SKYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, 0xb, MMIO | RETBLEED | GDS | SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | ITS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_KABYLAKE, 0xc, MMIO | RETBLEED | GDS | SRBDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_KABYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | ITS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_CANNONLAKE_L, X86_STEP_MAX, RETBLEED | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_ICELAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS | ITS_NATIVE_ONLY),
+ VULNBL_INTEL_STEPS(INTEL_ICELAKE_D, X86_STEP_MAX, MMIO | GDS | ITS | ITS_NATIVE_ONLY),
+ VULNBL_INTEL_STEPS(INTEL_ICELAKE_X, X86_STEP_MAX, MMIO | GDS | ITS | ITS_NATIVE_ONLY),
+ VULNBL_INTEL_STEPS(INTEL_COMETLAKE, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, 0x0, MMIO | RETBLEED | ITS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_TIGERLAKE_L, X86_STEP_MAX, GDS | ITS | ITS_NATIVE_ONLY),
+ VULNBL_INTEL_STEPS(INTEL_TIGERLAKE, X86_STEP_MAX, GDS | ITS | ITS_NATIVE_ONLY),
+ VULNBL_INTEL_STEPS(INTEL_LAKEFIELD, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED),
+ VULNBL_INTEL_STEPS(INTEL_ROCKETLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | ITS | ITS_NATIVE_ONLY),
+ VULNBL_INTEL_TYPE(INTEL_ALDERLAKE, ATOM, RFDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_ALDERLAKE, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_ALDERLAKE_L, X86_STEP_MAX, RFDS | VMSCAPE),
+ VULNBL_INTEL_TYPE(INTEL_RAPTORLAKE, ATOM, RFDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_P, X86_STEP_MAX, RFDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_S, X86_STEP_MAX, RFDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_METEORLAKE_L, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_ARROWLAKE_H, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_ARROWLAKE, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_ARROWLAKE_U, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_LUNARLAKE_M, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_SAPPHIRERAPIDS_X, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_GRANITERAPIDS_X, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_EMERALDRAPIDS_X, X86_STEP_MAX, VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_GRACEMONT, X86_STEP_MAX, RFDS | VMSCAPE),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT, X86_STEP_MAX, MMIO | MMIO_SBDS | RFDS),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT_D, X86_STEP_MAX, MMIO | RFDS),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RFDS),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT, X86_STEP_MAX, RFDS),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT_D, X86_STEP_MAX, RFDS),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT_PLUS, X86_STEP_MAX, RFDS),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_CRESTMONT_X, X86_STEP_MAX, VMSCAPE),
+
+ VULNBL_AMD(0x15, RETBLEED),
+ VULNBL_AMD(0x16, RETBLEED),
+ VULNBL_AMD(0x17, RETBLEED | SMT_RSB | SRSO | VMSCAPE),
+ VULNBL_HYGON(0x18, RETBLEED | SMT_RSB | SRSO | VMSCAPE),
+ VULNBL_AMD(0x19, SRSO | TSA | VMSCAPE),
+ VULNBL_AMD(0x1a, SRSO | VMSCAPE),
+ {}
+};
+
+static bool __init cpu_matches(const struct x86_cpu_id *table, unsigned long which)
+{
+ const struct x86_cpu_id *m = x86_match_cpu(table);
+
+ return m && !!(m->driver_data & which);
+}
+
+u64 x86_read_arch_cap_msr(void)
+{
+ u64 x86_arch_cap_msr = 0;
+
+ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
+ rdmsrq(MSR_IA32_ARCH_CAPABILITIES, x86_arch_cap_msr);
+
+ return x86_arch_cap_msr;
+}
+
+static bool arch_cap_mmio_immune(u64 x86_arch_cap_msr)
+{
+ return (x86_arch_cap_msr & ARCH_CAP_FBSDP_NO &&
+ x86_arch_cap_msr & ARCH_CAP_PSDP_NO &&
+ x86_arch_cap_msr & ARCH_CAP_SBDR_SSDP_NO);
+}
+
+static bool __init vulnerable_to_rfds(u64 x86_arch_cap_msr)
+{
+ /* The "immunity" bit trumps everything else: */
+ if (x86_arch_cap_msr & ARCH_CAP_RFDS_NO)
+ return false;
+
+ /*
+ * VMMs set ARCH_CAP_RFDS_CLEAR for processors not in the blacklist to
+ * indicate that mitigation is needed because guest is running on a
+ * vulnerable hardware or may migrate to such hardware:
+ */
+ if (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR)
+ return true;
+
+ /* Only consult the blacklist when there is no enumeration: */
+ return cpu_matches(cpu_vuln_blacklist, RFDS);
+}
+
+static bool __init vulnerable_to_its(u64 x86_arch_cap_msr)
+{
+ /* The "immunity" bit trumps everything else: */
+ if (x86_arch_cap_msr & ARCH_CAP_ITS_NO)
+ return false;
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return false;
+
+ /* None of the affected CPUs have BHI_CTRL */
+ if (boot_cpu_has(X86_FEATURE_BHI_CTRL))
+ return false;
+
+ /*
+ * If a VMM did not expose ITS_NO, assume that a guest could
+ * be running on a vulnerable hardware or may migrate to such
+ * hardware.
+ */
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return true;
+
+ if (cpu_matches(cpu_vuln_blacklist, ITS))
+ return true;
+
+ return false;
+}
+
+static struct x86_cpu_id cpu_latest_microcode[] = {
+#include "microcode/intel-ucode-defs.h"
+ {}
+};
+
+static bool __init cpu_has_old_microcode(void)
+{
+ const struct x86_cpu_id *m = x86_match_cpu(cpu_latest_microcode);
+
+ /* Give unknown CPUs a pass: */
+ if (!m) {
+ /* Intel CPUs should be in the list. Warn if not: */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ pr_info("x86/CPU: Model not found in latest microcode list\n");
+ return false;
+ }
+
+ /*
+ * Hosts usually lie to guests with a super high microcode
+ * version. Just ignore what hosts tell guests:
+ */
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return false;
+
+ /* Consider all debug microcode to be old: */
+ if (boot_cpu_data.microcode & BIT(31))
+ return true;
+
+ /* Give new microcode a pass: */
+ if (boot_cpu_data.microcode >= m->driver_data)
+ return false;
+
+ /* Uh oh, too old: */
+ return true;
+}
+
+static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
+{
+ u64 x86_arch_cap_msr = x86_read_arch_cap_msr();
+
+ if (cpu_has_old_microcode()) {
+ pr_warn("x86/CPU: Running old microcode\n");
+ setup_force_cpu_bug(X86_BUG_OLD_MICROCODE);
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
+ }
+
+ /* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */
+ if (!cpu_matches(cpu_vuln_whitelist, NO_ITLB_MULTIHIT) &&
+ !(x86_arch_cap_msr & ARCH_CAP_PSCHANGE_MC_NO))
+ setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT);
+
+ if (cpu_matches(cpu_vuln_whitelist, NO_SPECULATION))
+ return;
+
+ setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
+
+ if (!cpu_matches(cpu_vuln_whitelist, NO_SPECTRE_V2)) {
+ setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
+ setup_force_cpu_bug(X86_BUG_SPECTRE_V2_USER);
+ }
+
+ if (!cpu_matches(cpu_vuln_whitelist, NO_SSB) &&
+ !(x86_arch_cap_msr & ARCH_CAP_SSB_NO) &&
+ !cpu_has(c, X86_FEATURE_AMD_SSB_NO))
+ setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS);
+
+ /*
+ * AMD's AutoIBRS is equivalent to Intel's eIBRS - use the Intel feature
+ * flag and protect from vendor-specific bugs via the whitelist.
+ *
+ * Don't use AutoIBRS when SNP is enabled because it degrades host
+ * userspace indirect branch performance.
+ */
+ if ((x86_arch_cap_msr & ARCH_CAP_IBRS_ALL) ||
+ (cpu_has(c, X86_FEATURE_AUTOIBRS) &&
+ !cpu_feature_enabled(X86_FEATURE_SEV_SNP))) {
+ setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED);
+ if (!cpu_matches(cpu_vuln_whitelist, NO_EIBRS_PBRSB) &&
+ !(x86_arch_cap_msr & ARCH_CAP_PBRSB_NO))
+ setup_force_cpu_bug(X86_BUG_EIBRS_PBRSB);
+ }
+
+ if (!cpu_matches(cpu_vuln_whitelist, NO_MDS) &&
+ !(x86_arch_cap_msr & ARCH_CAP_MDS_NO)) {
+ setup_force_cpu_bug(X86_BUG_MDS);
+ if (cpu_matches(cpu_vuln_whitelist, MSBDS_ONLY))
+ setup_force_cpu_bug(X86_BUG_MSBDS_ONLY);
+ }
+
+ if (!cpu_matches(cpu_vuln_whitelist, NO_SWAPGS))
+ setup_force_cpu_bug(X86_BUG_SWAPGS);
+
+ /*
+ * When the CPU is not mitigated for TAA (TAA_NO=0) set TAA bug when:
+ * - TSX is supported or
+ * - TSX_CTRL is present
+ *
+ * TSX_CTRL check is needed for cases when TSX could be disabled before
+ * the kernel boot e.g. kexec.
+ * TSX_CTRL check alone is not sufficient for cases when the microcode
+ * update is not present or running as guest that don't get TSX_CTRL.
+ */
+ if (!(x86_arch_cap_msr & ARCH_CAP_TAA_NO) &&
+ (cpu_has(c, X86_FEATURE_RTM) ||
+ (x86_arch_cap_msr & ARCH_CAP_TSX_CTRL_MSR)))
+ setup_force_cpu_bug(X86_BUG_TAA);
+
+ /*
+ * SRBDS affects CPUs which support RDRAND or RDSEED and are listed
+ * in the vulnerability blacklist.
+ *
+ * Some of the implications and mitigation of Shared Buffers Data
+ * Sampling (SBDS) are similar to SRBDS. Give SBDS same treatment as
+ * SRBDS.
+ */
+ if ((cpu_has(c, X86_FEATURE_RDRAND) ||
+ cpu_has(c, X86_FEATURE_RDSEED)) &&
+ cpu_matches(cpu_vuln_blacklist, SRBDS | MMIO_SBDS))
+ setup_force_cpu_bug(X86_BUG_SRBDS);
+
+ /*
+ * Processor MMIO Stale Data bug enumeration
+ *
+ * Affected CPU list is generally enough to enumerate the vulnerability,
+ * but for virtualization case check for ARCH_CAP MSR bits also, VMM may
+ * not want the guest to enumerate the bug.
+ */
+ if (!arch_cap_mmio_immune(x86_arch_cap_msr)) {
+ if (cpu_matches(cpu_vuln_blacklist, MMIO))
+ setup_force_cpu_bug(X86_BUG_MMIO_STALE_DATA);
+ }
+
+ if (!cpu_has(c, X86_FEATURE_BTC_NO)) {
+ if (cpu_matches(cpu_vuln_blacklist, RETBLEED) || (x86_arch_cap_msr & ARCH_CAP_RSBA))
+ setup_force_cpu_bug(X86_BUG_RETBLEED);
+ }
+
+ if (cpu_matches(cpu_vuln_blacklist, SMT_RSB))
+ setup_force_cpu_bug(X86_BUG_SMT_RSB);
+
+ if (!cpu_has(c, X86_FEATURE_SRSO_NO)) {
+ if (cpu_matches(cpu_vuln_blacklist, SRSO))
+ setup_force_cpu_bug(X86_BUG_SRSO);
+ }
+
+ /*
+ * Check if CPU is vulnerable to GDS. If running in a virtual machine on
+ * an affected processor, the VMM may have disabled the use of GATHER by
+ * disabling AVX2. The only way to do this in HW is to clear XCR0[2],
+ * which means that AVX will be disabled.
+ */
+ if (cpu_matches(cpu_vuln_blacklist, GDS) && !(x86_arch_cap_msr & ARCH_CAP_GDS_NO) &&
+ boot_cpu_has(X86_FEATURE_AVX))
+ setup_force_cpu_bug(X86_BUG_GDS);
+
+ if (vulnerable_to_rfds(x86_arch_cap_msr))
+ setup_force_cpu_bug(X86_BUG_RFDS);
+
+ /*
+ * Intel parts with eIBRS are vulnerable to BHI attacks. Parts with
+ * BHI_NO still need to use the BHI mitigation to prevent Intra-mode
+ * attacks. When virtualized, eIBRS could be hidden, assume vulnerable.
+ */
+ if (!cpu_matches(cpu_vuln_whitelist, NO_BHI) &&
+ (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED) ||
+ boot_cpu_has(X86_FEATURE_HYPERVISOR)))
+ setup_force_cpu_bug(X86_BUG_BHI);
+
+ if (cpu_has(c, X86_FEATURE_AMD_IBPB) && !cpu_has(c, X86_FEATURE_AMD_IBPB_RET))
+ setup_force_cpu_bug(X86_BUG_IBPB_NO_RET);
+
+ if (vulnerable_to_its(x86_arch_cap_msr)) {
+ setup_force_cpu_bug(X86_BUG_ITS);
+ if (cpu_matches(cpu_vuln_blacklist, ITS_NATIVE_ONLY))
+ setup_force_cpu_bug(X86_BUG_ITS_NATIVE_ONLY);
+ }
+
+ if (c->x86_vendor == X86_VENDOR_AMD) {
+ if (!cpu_has(c, X86_FEATURE_TSA_SQ_NO) ||
+ !cpu_has(c, X86_FEATURE_TSA_L1_NO)) {
+ if (cpu_matches(cpu_vuln_blacklist, TSA) ||
+ /* Enable bug on Zen guests to allow for live migration. */
+ (cpu_has(c, X86_FEATURE_HYPERVISOR) && cpu_has(c, X86_FEATURE_ZEN)))
+ setup_force_cpu_bug(X86_BUG_TSA);
+ }
+ }
+
+ /*
+ * Set the bug only on bare-metal. A nested hypervisor should already be
+ * deploying IBPB to isolate itself from nested guests.
+ */
+ if (cpu_matches(cpu_vuln_blacklist, VMSCAPE) &&
+ !boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ setup_force_cpu_bug(X86_BUG_VMSCAPE);
+
+ if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN))
+ return;
+
+ /* Rogue Data Cache Load? No! */
+ if (x86_arch_cap_msr & ARCH_CAP_RDCL_NO)
+ return;
+
+ setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
+
+ if (cpu_matches(cpu_vuln_whitelist, NO_L1TF))
+ return;
+
+ setup_force_cpu_bug(X86_BUG_L1TF);
+}
+
+/*
+ * The NOPL instruction is supposed to exist on all CPUs of family >= 6;
+ * unfortunately, that's not true in practice because of early VIA
+ * chips and (more importantly) broken virtualizers that are not easy
+ * to detect. In the latter case it doesn't even *fail* reliably, so
+ * probing for it doesn't even work. Disable it completely on 32-bit
+ * unless we can find a reliable way to detect all the broken cases.
+ * Enable it explicitly on 64-bit for non-constant inputs of cpu_has().
+ */
+static void detect_nopl(void)
+{
+#ifdef CONFIG_X86_32
+ setup_clear_cpu_cap(X86_FEATURE_NOPL);
+#else
+ setup_force_cpu_cap(X86_FEATURE_NOPL);
+#endif
+}
+
+static inline bool parse_set_clear_cpuid(char *arg, bool set)
+{
+ char *opt;
+ int taint = 0;
+
+ while (arg) {
+ bool found __maybe_unused = false;
+ unsigned int bit;
+
+ opt = strsep(&arg, ",");
+
+ /*
+ * Handle naked numbers first for feature flags which don't
+ * have names. It doesn't make sense for a bug not to have a
+ * name so don't handle bug flags here.
+ */
+ if (!kstrtouint(opt, 10, &bit)) {
+ if (bit < NCAPINTS * 32) {
+
+ if (set) {
+ pr_warn("setcpuid: force-enabling CPU feature flag:");
+ setup_force_cpu_cap(bit);
+ } else {
+ pr_warn("clearcpuid: force-disabling CPU feature flag:");
+ setup_clear_cpu_cap(bit);
+ }
+ /* empty-string, i.e., ""-defined feature flags */
+ if (!x86_cap_flags[bit])
+ pr_cont(" %d:%d\n", bit >> 5, bit & 31);
+ else
+ pr_cont(" %s\n", x86_cap_flags[bit]);
+
+ taint++;
+ }
+ /*
+ * The assumption is that there are no feature names with only
+ * numbers in the name thus go to the next argument.
+ */
+ continue;
+ }
+
+ for (bit = 0; bit < 32 * (NCAPINTS + NBUGINTS); bit++) {
+ const char *flag;
+ const char *kind;
+
+ if (bit < 32 * NCAPINTS) {
+ flag = x86_cap_flags[bit];
+ kind = "feature";
+ } else {
+ kind = "bug";
+ flag = x86_bug_flags[bit - (32 * NCAPINTS)];
+ }
+
+ if (!flag)
+ continue;
+
+ if (strcmp(flag, opt))
+ continue;
+
+ if (set) {
+ pr_warn("setcpuid: force-enabling CPU %s flag: %s\n",
+ kind, flag);
+ setup_force_cpu_cap(bit);
+ } else {
+ pr_warn("clearcpuid: force-disabling CPU %s flag: %s\n",
+ kind, flag);
+ setup_clear_cpu_cap(bit);
+ }
+ taint++;
+ found = true;
+ break;
+ }
+
+ if (!found)
+ pr_warn("%s: unknown CPU flag: %s", set ? "setcpuid" : "clearcpuid", opt);
+ }
+
+ return taint;
+}
+
+
+/*
+ * We parse cpu parameters early because fpu__init_system() is executed
+ * before parse_early_param().
+ */
+static void __init cpu_parse_early_param(void)
+{
+ bool cpuid_taint = false;
+ char arg[128];
+ int arglen;
+
+#ifdef CONFIG_X86_32
+ if (cmdline_find_option_bool(boot_command_line, "no387"))
+#ifdef CONFIG_MATH_EMULATION
+ setup_clear_cpu_cap(X86_FEATURE_FPU);
+#else
+ pr_err("Option 'no387' required CONFIG_MATH_EMULATION enabled.\n");
+#endif
+
+ if (cmdline_find_option_bool(boot_command_line, "nofxsr"))
+ setup_clear_cpu_cap(X86_FEATURE_FXSR);
#endif
+
+ if (cmdline_find_option_bool(boot_command_line, "noxsave"))
+ setup_clear_cpu_cap(X86_FEATURE_XSAVE);
+
+ if (cmdline_find_option_bool(boot_command_line, "noxsaveopt"))
+ setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+
+ if (cmdline_find_option_bool(boot_command_line, "noxsaves"))
+ setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+
+ if (cmdline_find_option_bool(boot_command_line, "nousershstk"))
+ setup_clear_cpu_cap(X86_FEATURE_USER_SHSTK);
+
+ /* Minimize the gap between FRED is available and available but disabled. */
+ arglen = cmdline_find_option(boot_command_line, "fred", arg, sizeof(arg));
+ if (arglen != 2 || strncmp(arg, "on", 2))
+ setup_clear_cpu_cap(X86_FEATURE_FRED);
+
+ arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg));
+ if (arglen > 0)
+ cpuid_taint |= parse_set_clear_cpuid(arg, false);
+
+ arglen = cmdline_find_option(boot_command_line, "setcpuid", arg, sizeof(arg));
+ if (arglen > 0)
+ cpuid_taint |= parse_set_clear_cpuid(arg, true);
+
+ if (cpuid_taint) {
+ pr_warn("!!! setcpuid=/clearcpuid= in use, this is for TESTING ONLY, may break things horribly. Tainting kernel.\n");
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
+ }
}
/*
@@ -684,58 +1763,82 @@ static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
* cache alignment.
* The others are not touched to avoid unwanted side effects.
*
- * WARNING: this function is only called on the BP. Don't add code here
- * that is supposed to run on all CPUs.
+ * WARNING: this function is only called on the boot CPU. Don't add code
+ * here that is supposed to run on all CPUs.
*/
static void __init early_identify_cpu(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_X86_64
- c->x86_clflush_size = 64;
- c->x86_phys_bits = 36;
- c->x86_virt_bits = 48;
-#else
- c->x86_clflush_size = 32;
- c->x86_phys_bits = 32;
- c->x86_virt_bits = 32;
-#endif
- c->x86_cache_alignment = c->x86_clflush_size;
-
- memset(&c->x86_capability, 0, sizeof c->x86_capability);
+ memset(&c->x86_capability, 0, sizeof(c->x86_capability));
c->extended_cpuid_level = 0;
- if (!have_cpuid_p())
+ if (!cpuid_feature())
identify_cpu_without_cpuid(c);
/* cyrix could have cpuid enabled via c_identify()*/
- if (!have_cpuid_p())
- return;
+ if (cpuid_feature()) {
+ cpu_detect(c);
+ get_cpu_vendor(c);
+ intel_unlock_cpuid_leafs(c);
+ get_cpu_cap(c);
+ setup_force_cpu_cap(X86_FEATURE_CPUID);
+ get_cpu_address_sizes(c);
+ cpu_parse_early_param();
+
+ cpu_init_topology(c);
+
+ if (this_cpu->c_early_init)
+ this_cpu->c_early_init(c);
+
+ c->cpu_index = 0;
+ filter_cpuid_features(c, false);
+ check_cpufeature_deps(c);
+
+ if (this_cpu->c_bsp_init)
+ this_cpu->c_bsp_init(c);
+ } else {
+ setup_clear_cpu_cap(X86_FEATURE_CPUID);
+ get_cpu_address_sizes(c);
+ cpu_init_topology(c);
+ }
- cpu_detect(c);
- get_cpu_vendor(c);
- get_cpu_cap(c);
- fpu_detect(c);
+ setup_force_cpu_cap(X86_FEATURE_ALWAYS);
- if (this_cpu->c_early_init)
- this_cpu->c_early_init(c);
+ cpu_set_bug_bits(c);
- c->cpu_index = 0;
- filter_cpuid_features(c, false);
+ sld_setup(c);
- if (this_cpu->c_bsp_init)
- this_cpu->c_bsp_init(c);
+#ifdef CONFIG_X86_32
+ /*
+ * Regardless of whether PCID is enumerated, the SDM says
+ * that it can't be enabled in 32-bit mode.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_PCID);
+#endif
- setup_force_cpu_cap(X86_FEATURE_ALWAYS);
+ /*
+ * Later in the boot process pgtable_l5_enabled() relies on
+ * cpu_feature_enabled(X86_FEATURE_LA57). If 5-level paging is not
+ * enabled by this point we need to clear the feature bit to avoid
+ * false-positives at the later stage.
+ *
+ * pgtable_l5_enabled() can be false here for several reasons:
+ * - 5-level paging is disabled compile-time;
+ * - it's 32-bit kernel;
+ * - machine doesn't support 5-level paging;
+ * - user specified 'no5lvl' in kernel command line.
+ */
+ if (!pgtable_l5_enabled())
+ setup_clear_cpu_cap(X86_FEATURE_LA57);
+
+ detect_nopl();
+ mca_bsp_init(c);
}
-void __init early_cpu_init(void)
+void __init init_cpu_devs(void)
{
const struct cpu_dev *const *cdev;
int count = 0;
-#ifdef CONFIG_PROCESSOR_SELECT
- printk(KERN_INFO "KERNEL supported cpus:\n");
-#endif
-
for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
const struct cpu_dev *cpudev = *cdev;
@@ -743,90 +1846,144 @@ void __init early_cpu_init(void)
break;
cpu_devs[count] = cpudev;
count++;
+ }
+}
+void __init early_cpu_init(void)
+{
#ifdef CONFIG_PROCESSOR_SELECT
- {
- unsigned int j;
-
- for (j = 0; j < 2; j++) {
- if (!cpudev->c_ident[j])
- continue;
- printk(KERN_INFO " %s %s\n", cpudev->c_vendor,
- cpudev->c_ident[j]);
- }
- }
+ unsigned int i, j;
+
+ pr_info("KERNEL supported cpus:\n");
#endif
+
+ init_cpu_devs();
+
+#ifdef CONFIG_PROCESSOR_SELECT
+ for (i = 0; i < X86_VENDOR_NUM && cpu_devs[i]; i++) {
+ for (j = 0; j < 2; j++) {
+ if (!cpu_devs[i]->c_ident[j])
+ continue;
+ pr_info(" %s %s\n", cpu_devs[i]->c_vendor,
+ cpu_devs[i]->c_ident[j]);
+ }
}
+#endif
+
early_identify_cpu(&boot_cpu_data);
}
-/*
- * The NOPL instruction is supposed to exist on all CPUs of family >= 6;
- * unfortunately, that's not true in practice because of early VIA
- * chips and (more importantly) broken virtualizers that are not easy
- * to detect. In the latter case it doesn't even *fail* reliably, so
- * probing for it doesn't even work. Disable it completely on 32-bit
- * unless we can find a reliable way to detect all the broken cases.
- * Enable it explicitly on 64-bit for non-constant inputs of cpu_has().
- */
-static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
+static bool detect_null_seg_behavior(void)
{
-#ifdef CONFIG_X86_32
- clear_cpu_cap(c, X86_FEATURE_NOPL);
-#else
- set_cpu_cap(c, X86_FEATURE_NOPL);
-#endif
+ /*
+ * Empirically, writing zero to a segment selector on AMD does
+ * not clear the base, whereas writing zero to a segment
+ * selector on Intel does clear the base. Intel's behavior
+ * allows slightly faster context switches in the common case
+ * where GS is unused by the prev and next threads.
+ *
+ * Since neither vendor documents this anywhere that I can see,
+ * detect it directly instead of hard-coding the choice by
+ * vendor.
+ *
+ * I've designated AMD's behavior as the "bug" because it's
+ * counterintuitive and less friendly.
+ */
+
+ unsigned long old_base, tmp;
+ rdmsrq(MSR_FS_BASE, old_base);
+ wrmsrq(MSR_FS_BASE, 1);
+ loadsegment(fs, 0);
+ rdmsrq(MSR_FS_BASE, tmp);
+ wrmsrq(MSR_FS_BASE, old_base);
+ return tmp == 0;
+}
+
+void check_null_seg_clears_base(struct cpuinfo_x86 *c)
+{
+ /* BUG_NULL_SEG is only relevant with 64bit userspace */
+ if (!IS_ENABLED(CONFIG_X86_64))
+ return;
+
+ if (cpu_has(c, X86_FEATURE_NULL_SEL_CLR_BASE))
+ return;
+
+ /*
+ * CPUID bit above wasn't set. If this kernel is still running
+ * as a HV guest, then the HV has decided not to advertize
+ * that CPUID bit for whatever reason. For example, one
+ * member of the migration pool might be vulnerable. Which
+ * means, the bug is present: set the BUG flag and return.
+ */
+ if (cpu_has(c, X86_FEATURE_HYPERVISOR)) {
+ set_cpu_bug(c, X86_BUG_NULL_SEG);
+ return;
+ }
+
+ /*
+ * Zen2 CPUs also have this behaviour, but no CPUID bit.
+ * 0x18 is the respective family for Hygon.
+ */
+ if ((c->x86 == 0x17 || c->x86 == 0x18) &&
+ detect_null_seg_behavior())
+ return;
+
+ /* All the remaining ones are affected */
+ set_cpu_bug(c, X86_BUG_NULL_SEG);
}
-static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
+static void generic_identify(struct cpuinfo_x86 *c)
{
c->extended_cpuid_level = 0;
- if (!have_cpuid_p())
+ if (!cpuid_feature())
identify_cpu_without_cpuid(c);
/* cyrix could have cpuid enabled via c_identify()*/
- if (!have_cpuid_p())
+ if (!cpuid_feature())
return;
cpu_detect(c);
get_cpu_vendor(c);
-
+ intel_unlock_cpuid_leafs(c);
get_cpu_cap(c);
- if (c->cpuid_level >= 0x00000001) {
- c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF;
-#ifdef CONFIG_X86_32
-# ifdef CONFIG_X86_HT
- c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
-# else
- c->apicid = c->initial_apicid;
-# endif
-#endif
- c->phys_proc_id = c->initial_apicid;
- }
+ get_cpu_address_sizes(c);
get_model_name(c); /* Default name */
- detect_nopl(c);
+ /*
+ * ESPFIX is a strange bug. All real CPUs have it. Paravirt
+ * systems that run Linux at CPL > 0 may or may not have the
+ * issue, but, even if they have the issue, there's absolutely
+ * nothing we can do about it because we can't use the real IRET
+ * instruction.
+ *
+ * NB: For the time being, only 32-bit kernels support
+ * X86_BUG_ESPFIX as such. 64-bit kernels directly choose
+ * whether to apply espfix using paravirt hooks. If any
+ * non-paravirt system ever shows up that does *not* have the
+ * ESPFIX issue, we can change this.
+ */
+#ifdef CONFIG_X86_32
+ set_cpu_bug(c, X86_BUG_ESPFIX);
+#endif
}
/*
* This does the hard work of actually picking apart the CPU stuff...
*/
-static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
+static void identify_cpu(struct cpuinfo_x86 *c)
{
int i;
c->loops_per_jiffy = loops_per_jiffy;
- c->x86_cache_size = -1;
+ c->x86_cache_size = 0;
c->x86_vendor = X86_VENDOR_UNKNOWN;
- c->x86_model = c->x86_mask = 0; /* So far unknown... */
+ c->x86_model = c->x86_stepping = 0; /* So far unknown... */
c->x86_vendor_id[0] = '\0'; /* Unset */
c->x86_model_id[0] = '\0'; /* Unset */
- c->x86_max_cores = 1;
- c->x86_coreid_bits = 0;
#ifdef CONFIG_X86_64
c->x86_clflush_size = 64;
c->x86_phys_bits = 36;
@@ -838,22 +1995,26 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
c->x86_virt_bits = 32;
#endif
c->x86_cache_alignment = c->x86_clflush_size;
- memset(&c->x86_capability, 0, sizeof c->x86_capability);
+ memset(&c->x86_capability, 0, sizeof(c->x86_capability));
+#ifdef CONFIG_X86_VMX_FEATURE_NAMES
+ memset(&c->vmx_capability, 0, sizeof(c->vmx_capability));
+#endif
generic_identify(c);
+ cpu_parse_topology(c);
+
if (this_cpu->c_identify)
this_cpu->c_identify(c);
- /* Clear/Set all flags overriden by options, after probe */
- for (i = 0; i < NCAPINTS; i++) {
- c->x86_capability[i] &= ~cpu_caps_cleared[i];
- c->x86_capability[i] |= cpu_caps_set[i];
- }
+ /* Clear/Set all flags overridden by options, after probe */
+ apply_forced_caps(c);
-#ifdef CONFIG_X86_64
- c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
-#endif
+ /*
+ * Set default APIC and TSC_DEADLINE MSR fencing flag. AMD and
+ * Hygon will clear it in ->c_init() below.
+ */
+ set_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
/*
* Vendor-specific initialization. In this section we
@@ -868,12 +2029,21 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
if (this_cpu->c_init)
this_cpu->c_init(c);
+ bus_lock_init();
+
/* Disable the PN if appropriate */
squash_the_stupid_serial_number(c);
- /* Set up SMEP/SMAP */
setup_smep(c);
setup_smap(c);
+ setup_umip(c);
+ setup_lass(c);
+
+ /* Enable FSGSBASE instructions if available. */
+ if (cpu_has(c, X86_FEATURE_FSGSBASE)) {
+ cr4_set_bits(X86_CR4_FSGSBASE);
+ elf_hwcap2 |= HWCAP2_FSGSBASE;
+ }
/*
* The vendor-specific functions might have changed features.
@@ -883,6 +2053,9 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
/* Filter out anything that depends on CPUID levels we don't have */
filter_cpuid_features(c, true);
+ /* Check for unmet dependencies based on the CPUID dependency table */
+ check_cpufeature_deps(c);
+
/* If the model name is still unset, do table lookup. */
if (!c->x86_model_id[0]) {
const char *p;
@@ -895,21 +2068,15 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
c->x86, c->x86_model);
}
-#ifdef CONFIG_X86_64
- detect_ht(c);
-#endif
-
- init_hypervisor(c);
x86_init_rdrand(c);
+ setup_pku(c);
+ setup_cet(c);
/*
- * Clear/Set all flags overriden by options, need do it
+ * Clear/Set all flags overridden by options, need do it
* before following smp all cpus cap AND.
*/
- for (i = 0; i < NCAPINTS; i++) {
- c->x86_capability[i] &= ~cpu_caps_cleared[i];
- c->x86_capability[i] |= cpu_caps_set[i];
- }
+ apply_forced_caps(c);
/*
* On SMP, boot_cpu_data holds the common feature set between
@@ -927,102 +2094,83 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
c->x86_capability[i] |= boot_cpu_data.x86_capability[i];
}
+ ppin_init(c);
+
/* Init Machine Check Exception if available. */
mcheck_cpu_init(c);
- select_idle_routine(c);
-
-#ifdef CONFIG_NUMA
numa_add_cpu(smp_processor_id());
-#endif
}
-#ifdef CONFIG_X86_64
-static void vgetcpu_set_mode(void)
+/*
+ * Set up the CPU state needed to execute SYSENTER/SYSEXIT instructions
+ * on 32-bit kernels:
+ */
+#ifdef CONFIG_X86_32
+void enable_sep_cpu(void)
{
- if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
- vgetcpu_mode = VGETCPU_RDTSCP;
- else
- vgetcpu_mode = VGETCPU_LSL;
+ struct tss_struct *tss;
+ int cpu;
+
+ if (!boot_cpu_has(X86_FEATURE_SEP))
+ return;
+
+ cpu = get_cpu();
+ tss = &per_cpu(cpu_tss_rw, cpu);
+
+ /*
+ * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field --
+ * see the big comment in struct x86_hw_tss's definition.
+ */
+
+ tss->x86_tss.ss1 = __KERNEL_CS;
+ wrmsrq(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1);
+ wrmsrq(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1));
+ wrmsrq(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32);
+
+ put_cpu();
}
#endif
-void __init identify_boot_cpu(void)
+static __init void identify_boot_cpu(void)
{
identify_cpu(&boot_cpu_data);
- init_amd_e400_c1e_mask();
+ if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT))
+ pr_info("CET detected: Indirect Branch Tracking enabled\n");
#ifdef CONFIG_X86_32
- sysenter_setup();
enable_sep_cpu();
-#else
- vgetcpu_set_mode();
#endif
cpu_detect_tlb(&boot_cpu_data);
+ setup_cr_pinning();
+
+ tsx_init();
+ tdx_init();
+ lkgs_init();
}
-void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
+void identify_secondary_cpu(unsigned int cpu)
{
- BUG_ON(c == &boot_cpu_data);
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ /* Copy boot_cpu_data only on the first bringup */
+ if (!c->initialized)
+ *c = boot_cpu_data;
+ c->cpu_index = cpu;
+
identify_cpu(c);
#ifdef CONFIG_X86_32
enable_sep_cpu();
#endif
- mtrr_ap_init();
-}
-
-struct msr_range {
- unsigned min;
- unsigned max;
-};
-
-static const struct msr_range msr_range_array[] __cpuinitconst = {
- { 0x00000000, 0x00000418},
- { 0xc0000000, 0xc000040b},
- { 0xc0010000, 0xc0010142},
- { 0xc0011000, 0xc001103b},
-};
-
-static void __cpuinit __print_cpu_msr(void)
-{
- unsigned index_min, index_max;
- unsigned index;
- u64 val;
- int i;
+ x86_spec_ctrl_setup_ap();
+ update_srbds_msr();
+ if (boot_cpu_has_bug(X86_BUG_GDS))
+ update_gds_msr();
- for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) {
- index_min = msr_range_array[i].min;
- index_max = msr_range_array[i].max;
-
- for (index = index_min; index < index_max; index++) {
- if (rdmsrl_safe(index, &val))
- continue;
- printk(KERN_INFO " MSR%08x: %016llx\n", index, val);
- }
- }
-}
-
-static int show_msr __cpuinitdata;
-
-static __init int setup_show_msr(char *arg)
-{
- int num;
-
- get_option(&arg, &num);
-
- if (num > 0)
- show_msr = num;
- return 1;
-}
-__setup("show_msr=", setup_show_msr);
-
-static __init int setup_noclflush(char *arg)
-{
- setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
- return 1;
+ tsx_ap_init();
+ c->initialized = true;
}
-__setup("noclflush", setup_noclflush);
-void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
+void print_cpu_info(struct cpuinfo_x86 *c)
{
const char *vendor = NULL;
@@ -1034,347 +2182,456 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
}
if (vendor && !strstr(c->x86_model_id, vendor))
- printk(KERN_CONT "%s ", vendor);
+ pr_cont("%s ", vendor);
if (c->x86_model_id[0])
- printk(KERN_CONT "%s", strim(c->x86_model_id));
+ pr_cont("%s", c->x86_model_id);
else
- printk(KERN_CONT "%d86", c->x86);
+ pr_cont("%d86", c->x86);
- printk(KERN_CONT " (fam: %02x, model: %02x", c->x86, c->x86_model);
+ pr_cont(" (family: 0x%x, model: 0x%x", c->x86, c->x86_model);
- if (c->x86_mask || c->cpuid_level >= 0)
- printk(KERN_CONT ", stepping: %02x)\n", c->x86_mask);
+ if (c->x86_stepping || c->cpuid_level >= 0)
+ pr_cont(", stepping: 0x%x)\n", c->x86_stepping);
else
- printk(KERN_CONT ")\n");
-
- print_cpu_msr(c);
+ pr_cont(")\n");
}
-void __cpuinit print_cpu_msr(struct cpuinfo_x86 *c)
+/*
+ * clearcpuid= and setcpuid= were already parsed in cpu_parse_early_param().
+ * These dummy functions prevent them from becoming an environment variable for
+ * init.
+ */
+
+static __init int setup_clearcpuid(char *arg)
{
- if (c->cpu_index < show_msr)
- __print_cpu_msr();
+ return 1;
}
+__setup("clearcpuid=", setup_clearcpuid);
-static __init int setup_disablecpuid(char *arg)
+static __init int setup_setcpuid(char *arg)
{
- int bit;
-
- if (get_option(&arg, &bit) && bit < NCAPINTS*32)
- setup_clear_cpu_cap(bit);
- else
- return 0;
-
return 1;
}
-__setup("clearcpuid=", setup_disablecpuid);
+__setup("setcpuid=", setup_setcpuid);
-#ifdef CONFIG_X86_64
-struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table };
-struct desc_ptr debug_idt_descr = { NR_VECTORS * 16 - 1,
- (unsigned long) debug_idt_table };
+DEFINE_PER_CPU_CACHE_HOT(struct task_struct *, current_task) = &init_task;
+EXPORT_PER_CPU_SYMBOL(current_task);
+EXPORT_PER_CPU_SYMBOL(const_current_task);
-DEFINE_PER_CPU_FIRST(union irq_stack_union,
- irq_stack_union) __aligned(PAGE_SIZE);
+DEFINE_PER_CPU_CACHE_HOT(int, __preempt_count) = INIT_PREEMPT_COUNT;
+EXPORT_PER_CPU_SYMBOL(__preempt_count);
+DEFINE_PER_CPU_CACHE_HOT(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK;
+
+#ifdef CONFIG_X86_64
/*
- * The following four percpu variables are hot. Align current_task to
- * cacheline size such that all four fall in the same cacheline.
+ * Note: Do not make this dependant on CONFIG_MITIGATION_CALL_DEPTH_TRACKING
+ * so that this space is reserved in the hot cache section even when the
+ * mitigation is disabled.
*/
-DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
- &init_task;
-EXPORT_PER_CPU_SYMBOL(current_task);
-
-DEFINE_PER_CPU(unsigned long, kernel_stack) =
- (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE;
-EXPORT_PER_CPU_SYMBOL(kernel_stack);
-
-DEFINE_PER_CPU(char *, irq_stack_ptr) =
- init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
+DEFINE_PER_CPU_CACHE_HOT(u64, __x86_call_depth);
+EXPORT_PER_CPU_SYMBOL(__x86_call_depth);
-DEFINE_PER_CPU(unsigned int, irq_count) = -1;
+static void wrmsrq_cstar(unsigned long val)
+{
+ /*
+ * Intel CPUs do not support 32-bit SYSCALL. Writing to MSR_CSTAR
+ * is so far ignored by the CPU, but raises a #VE trap in a TDX
+ * guest. Avoid the pointless write on all Intel CPUs.
+ */
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ wrmsrq(MSR_CSTAR, val);
+}
-DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
+static inline void idt_syscall_init(void)
+{
+ wrmsrq(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
-/*
- * Special IST stacks which the CPU switches to when it calls
- * an IST-marked descriptor entry. Up to 7 stacks (hardware
- * limit), all of them are 4K, except the debug stack which
- * is 8K.
- */
-static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
- [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
- [DEBUG_STACK - 1] = DEBUG_STKSZ
-};
+ if (ia32_enabled()) {
+ wrmsrq_cstar((unsigned long)entry_SYSCALL_compat);
+ /*
+ * This only works on Intel CPUs.
+ * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
+ * This does not cause SYSENTER to jump to the wrong location, because
+ * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
+ */
+ wrmsrq_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
+ wrmsrq_safe(MSR_IA32_SYSENTER_ESP,
+ (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1));
+ wrmsrq_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
+ } else {
+ wrmsrq_cstar((unsigned long)entry_SYSCALL32_ignore);
+ wrmsrq_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
+ wrmsrq_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
+ wrmsrq_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
+ }
-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
- [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
+ /*
+ * Flags to clear on syscall; clear as much as possible
+ * to minimize user space-kernel interference.
+ */
+ wrmsrq(MSR_SYSCALL_MASK,
+ X86_EFLAGS_CF|X86_EFLAGS_PF|X86_EFLAGS_AF|
+ X86_EFLAGS_ZF|X86_EFLAGS_SF|X86_EFLAGS_TF|
+ X86_EFLAGS_IF|X86_EFLAGS_DF|X86_EFLAGS_OF|
+ X86_EFLAGS_IOPL|X86_EFLAGS_NT|X86_EFLAGS_RF|
+ X86_EFLAGS_AC|X86_EFLAGS_ID);
+}
/* May not be marked __init: used by software suspend */
void syscall_init(void)
{
+ /* The default user and kernel segments */
+ wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
+
/*
- * LSTAR and STAR live in a bit strange symbiosis.
- * They both write to the same internal register. STAR allows to
- * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
+ * Except the IA32_STAR MSR, there is NO need to setup SYSCALL and
+ * SYSENTER MSRs for FRED, because FRED uses the ring 3 FRED
+ * entrypoint for SYSCALL and SYSENTER, and ERETU is the only legit
+ * instruction to return to ring 3 (both sysexit and sysret cause
+ * #UD when FRED is enabled).
*/
- wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
- wrmsrl(MSR_LSTAR, system_call);
- wrmsrl(MSR_CSTAR, ignore_sysret);
+ if (!cpu_feature_enabled(X86_FEATURE_FRED))
+ idt_syscall_init();
+}
+#endif /* CONFIG_X86_64 */
-#ifdef CONFIG_IA32_EMULATION
- syscall32_cpu_init();
+#ifdef CONFIG_STACKPROTECTOR
+DEFINE_PER_CPU_CACHE_HOT(unsigned long, __stack_chk_guard);
+#ifndef CONFIG_SMP
+EXPORT_PER_CPU_SYMBOL(__stack_chk_guard);
+#endif
#endif
- /* Flags to clear on syscall */
- wrmsrl(MSR_SYSCALL_MASK,
- X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|
- X86_EFLAGS_IOPL|X86_EFLAGS_AC);
+static void initialize_debug_regs(void)
+{
+ /* Control register first -- to make sure everything is disabled. */
+ set_debugreg(DR7_FIXED_1, 7);
+ set_debugreg(DR6_RESERVED, 6);
+ /* dr5 and dr4 don't exist */
+ set_debugreg(0, 3);
+ set_debugreg(0, 2);
+ set_debugreg(0, 1);
+ set_debugreg(0, 0);
}
+#ifdef CONFIG_KGDB
/*
- * Copies of the original ist values from the tss are only accessed during
- * debugging, no special alignment required.
+ * Restore debug regs if using kgdbwait and you have a kernel debugger
+ * connection established.
*/
-DEFINE_PER_CPU(struct orig_ist, orig_ist);
-
-static DEFINE_PER_CPU(unsigned long, debug_stack_addr);
-DEFINE_PER_CPU(int, debug_stack_usage);
-
-int is_debug_stack(unsigned long addr)
+static void dbg_restore_debug_regs(void)
{
- return __get_cpu_var(debug_stack_usage) ||
- (addr <= __get_cpu_var(debug_stack_addr) &&
- addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ));
+ if (unlikely(kgdb_connected && arch_kgdb_ops.correct_hw_break))
+ arch_kgdb_ops.correct_hw_break();
}
+#else /* ! CONFIG_KGDB */
+#define dbg_restore_debug_regs()
+#endif /* ! CONFIG_KGDB */
-DEFINE_PER_CPU(u32, debug_idt_ctr);
-
-void debug_stack_set_zero(void)
+static inline void setup_getcpu(int cpu)
{
- this_cpu_inc(debug_idt_ctr);
- load_current_idt();
+ unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu));
+ struct desc_struct d = { };
+
+ if (boot_cpu_has(X86_FEATURE_RDTSCP) || boot_cpu_has(X86_FEATURE_RDPID))
+ wrmsrq(MSR_TSC_AUX, cpudata);
+
+ /* Store CPU and node number in limit. */
+ d.limit0 = cpudata;
+ d.limit1 = cpudata >> 16;
+
+ d.type = 5; /* RO data, expand down, accessed */
+ d.dpl = 3; /* Visible to user code */
+ d.s = 1; /* Not a system segment */
+ d.p = 1; /* Present */
+ d.d = 1; /* 32-bit */
+
+ write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_CPUNODE, &d, DESCTYPE_S);
}
-void debug_stack_reset(void)
+#ifdef CONFIG_X86_64
+static inline void tss_setup_ist(struct tss_struct *tss)
{
- if (WARN_ON(!this_cpu_read(debug_idt_ctr)))
- return;
- if (this_cpu_dec_return(debug_idt_ctr) == 0)
- load_current_idt();
+ /* Set up the per-CPU TSS IST stacks */
+ tss->x86_tss.ist[IST_INDEX_DF] = __this_cpu_ist_top_va(DF);
+ tss->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI);
+ tss->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB);
+ tss->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE);
+ /* Only mapped when SEV-ES is active */
+ tss->x86_tss.ist[IST_INDEX_VC] = __this_cpu_ist_top_va(VC);
}
+#else /* CONFIG_X86_64 */
+static inline void tss_setup_ist(struct tss_struct *tss) { }
+#endif /* !CONFIG_X86_64 */
-#else /* CONFIG_X86_64 */
-
-DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
-EXPORT_PER_CPU_SYMBOL(current_task);
-DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
+static inline void tss_setup_io_bitmap(struct tss_struct *tss)
+{
+ tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET_INVALID;
-#ifdef CONFIG_CC_STACKPROTECTOR
-DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
+#ifdef CONFIG_X86_IOPL_IOPERM
+ tss->io_bitmap.prev_max = 0;
+ tss->io_bitmap.prev_sequence = 0;
+ memset(tss->io_bitmap.bitmap, 0xff, sizeof(tss->io_bitmap.bitmap));
+ /*
+ * Invalidate the extra array entry past the end of the all
+ * permission bitmap as required by the hardware.
+ */
+ tss->io_bitmap.mapall[IO_BITMAP_LONGS] = ~0UL;
#endif
-
-#endif /* CONFIG_X86_64 */
+}
/*
- * Clear all 6 debug registers:
+ * Setup everything needed to handle exceptions from the IDT, including the IST
+ * exceptions which use paranoid_entry().
*/
-static void clear_all_debug_regs(void)
+void cpu_init_exception_handling(bool boot_cpu)
{
- int i;
+ struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
+ int cpu = raw_smp_processor_id();
- for (i = 0; i < 8; i++) {
- /* Ignore db4, db5 */
- if ((i == 4) || (i == 5))
- continue;
+ /* paranoid_entry() gets the CPU number from the GDT */
+ setup_getcpu(cpu);
+
+ /* For IDT mode, IST vectors need to be set in TSS. */
+ if (!cpu_feature_enabled(X86_FEATURE_FRED))
+ tss_setup_ist(tss);
+ tss_setup_io_bitmap(tss);
+ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
- set_debugreg(0, i);
+ load_TR_desc();
+
+ /* GHCB needs to be setup to handle #VC. */
+ setup_ghcb();
+
+ if (cpu_feature_enabled(X86_FEATURE_FRED)) {
+ /* The boot CPU has enabled FRED during early boot */
+ if (!boot_cpu)
+ cpu_init_fred_exceptions();
+
+ cpu_init_fred_rsps();
+ } else {
+ load_current_idt();
}
}
-#ifdef CONFIG_KGDB
-/*
- * Restore debug regs if using kgdbwait and you have a kernel debugger
- * connection established.
- */
-static void dbg_restore_debug_regs(void)
+void __init cpu_init_replace_early_idt(void)
{
- if (unlikely(kgdb_connected && arch_kgdb_ops.correct_hw_break))
- arch_kgdb_ops.correct_hw_break();
+ if (cpu_feature_enabled(X86_FEATURE_FRED))
+ cpu_init_fred_exceptions();
+ else
+ idt_setup_early_pf();
}
-#else /* ! CONFIG_KGDB */
-#define dbg_restore_debug_regs()
-#endif /* ! CONFIG_KGDB */
/*
* cpu_init() initializes state that is per-CPU. Some data is already
- * initialized (naturally) in the bootstrap process, such as the GDT
- * and IDT. We reload them nevertheless, this function acts as a
- * 'CPU state barrier', nothing should get across.
- * A lot of state is already set up in PDA init for 64 bit
+ * initialized (naturally) in the bootstrap process, such as the GDT. We
+ * reload it nevertheless, this function acts as a 'CPU state barrier',
+ * nothing should get across.
*/
-#ifdef CONFIG_X86_64
-
-void __cpuinit cpu_init(void)
+void cpu_init(void)
{
- struct orig_ist *oist;
- struct task_struct *me;
- struct tss_struct *t;
- unsigned long v;
- int cpu;
- int i;
-
- /*
- * Load microcode on this cpu if a valid microcode is available.
- * This is early microcode loading procedure.
- */
- load_ucode_ap();
-
- cpu = stack_smp_processor_id();
- t = &per_cpu(init_tss, cpu);
- oist = &per_cpu(orig_ist, cpu);
+ struct task_struct *cur = current;
+ int cpu = raw_smp_processor_id();
#ifdef CONFIG_NUMA
if (this_cpu_read(numa_node) == 0 &&
early_cpu_to_node(cpu) != NUMA_NO_NODE)
set_numa_node(early_cpu_to_node(cpu));
#endif
+ pr_debug("Initializing CPU#%d\n", cpu);
- me = current;
+ if (IS_ENABLED(CONFIG_X86_64) || cpu_feature_enabled(X86_FEATURE_VME) ||
+ boot_cpu_has(X86_FEATURE_TSC) || boot_cpu_has(X86_FEATURE_DE))
+ cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
- if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask))
- panic("CPU#%d already initialized!\n", cpu);
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ loadsegment(fs, 0);
+ memset(cur->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
+ syscall_init();
- pr_debug("Initializing CPU#%d\n", cpu);
+ wrmsrq(MSR_FS_BASE, 0);
+ wrmsrq(MSR_KERNEL_GS_BASE, 0);
+ barrier();
+
+ x2apic_setup();
+
+ intel_posted_msi_init();
+ }
- clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+ mmgrab(&init_mm);
+ cur->active_mm = &init_mm;
+ BUG_ON(cur->mm);
+ initialize_tlbstate_and_flush();
+ enter_lazy_tlb(&init_mm, cur);
/*
- * Initialize the per-CPU GDT with the boot GDT,
- * and set up the GDT descriptor:
+ * sp0 points to the entry trampoline stack regardless of what task
+ * is running.
*/
+ load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1));
- switch_to_new_gdt(cpu);
- loadsegment(fs, 0);
+ load_mm_ldt(&init_mm);
- load_current_idt();
+ initialize_debug_regs();
+ dbg_restore_debug_regs();
- memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
- syscall_init();
+ doublefault_init_cpu_tss();
- wrmsrl(MSR_FS_BASE, 0);
- wrmsrl(MSR_KERNEL_GS_BASE, 0);
- barrier();
+ if (is_uv_system())
+ uv_cpu_init();
- x86_configure_nx();
- enable_x2apic();
+ load_fixmap_gdt(cpu);
+}
- /*
- * set up and load the per-CPU TSS
- */
- if (!oist->ist[0]) {
- char *estacks = per_cpu(exception_stacks, cpu);
-
- for (v = 0; v < N_EXCEPTION_STACKS; v++) {
- estacks += exception_stack_sizes[v];
- oist->ist[v] = t->x86_tss.ist[v] =
- (unsigned long)estacks;
- if (v == DEBUG_STACK-1)
- per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks;
- }
- }
+#ifdef CONFIG_MICROCODE_LATE_LOADING
+/**
+ * store_cpu_caps() - Store a snapshot of CPU capabilities
+ * @curr_info: Pointer where to store it
+ *
+ * Returns: None
+ */
+void store_cpu_caps(struct cpuinfo_x86 *curr_info)
+{
+ /* Reload CPUID max function as it might've changed. */
+ curr_info->cpuid_level = cpuid_eax(0);
- t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+ /* Copy all capability leafs and pick up the synthetic ones. */
+ memcpy(&curr_info->x86_capability, &boot_cpu_data.x86_capability,
+ sizeof(curr_info->x86_capability));
- /*
- * <= is required because the CPU will access up to
- * 8 bits beyond the end of the IO permission bitmap.
- */
- for (i = 0; i <= IO_BITMAP_LONGS; i++)
- t->io_bitmap[i] = ~0UL;
+ /* Get the hardware CPUID leafs */
+ get_cpu_cap(curr_info);
+}
- atomic_inc(&init_mm.mm_count);
- me->active_mm = &init_mm;
- BUG_ON(me->mm);
- enter_lazy_tlb(&init_mm, me);
+/**
+ * microcode_check() - Check if any CPU capabilities changed after an update.
+ * @prev_info: CPU capabilities stored before an update.
+ *
+ * The microcode loader calls this upon late microcode load to recheck features,
+ * only when microcode has been updated. Caller holds and CPU hotplug lock.
+ *
+ * Return: None
+ */
+void microcode_check(struct cpuinfo_x86 *prev_info)
+{
+ struct cpuinfo_x86 curr_info;
- load_sp0(t, &current->thread);
- set_tss_desc(cpu, t);
- load_TR_desc();
- load_LDT(&init_mm.context);
+ perf_check_microcode();
- clear_all_debug_regs();
- dbg_restore_debug_regs();
+ amd_check_microcode();
- fpu_init();
+ store_cpu_caps(&curr_info);
- if (is_uv_system())
- uv_cpu_init();
+ if (!memcmp(&prev_info->x86_capability, &curr_info.x86_capability,
+ sizeof(prev_info->x86_capability)))
+ return;
+
+ pr_warn("x86/CPU: CPU features have changed after loading microcode, but might not take effect.\n");
+ pr_warn("x86/CPU: Please consider either early loading through initrd/built-in or a potential BIOS update.\n");
}
+#endif
-#else
+/*
+ * Invoked from core CPU hotplug code after hotplug operations
+ */
+void arch_smt_update(void)
+{
+ /* Handle the speculative execution misfeatures */
+ cpu_bugs_smt_update();
+ /* Check whether IPI broadcasting can be enabled */
+ apic_smt_update();
+}
-void __cpuinit cpu_init(void)
+void __init arch_cpu_finalize_init(void)
{
- int cpu = smp_processor_id();
- struct task_struct *curr = current;
- struct tss_struct *t = &per_cpu(init_tss, cpu);
- struct thread_struct *thread = &curr->thread;
+ struct cpuinfo_x86 *c = this_cpu_ptr(&cpu_info);
+
+ identify_boot_cpu();
+
+ select_idle_routine();
- show_ucode_info_early();
+ /*
+ * identify_boot_cpu() initialized SMT support information, let the
+ * core code know.
+ */
+ cpu_smt_set_num_threads(__max_threads_per_core, __max_threads_per_core);
- if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) {
- printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
- for (;;)
- local_irq_enable();
+ if (!IS_ENABLED(CONFIG_SMP)) {
+ pr_info("CPU: ");
+ print_cpu_info(&boot_cpu_data);
}
- printk(KERN_INFO "Initializing CPU#%d\n", cpu);
+ cpu_select_mitigations();
+
+ arch_smt_update();
- if (cpu_has_vme || cpu_has_tsc || cpu_has_de)
- clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+ if (IS_ENABLED(CONFIG_X86_32)) {
+ /*
+ * Check whether this is a real i386 which is not longer
+ * supported and fixup the utsname.
+ */
+ if (boot_cpu_data.x86 < 4)
+ panic("Kernel requires i486+ for 'invlpg' and other features");
- load_current_idt();
- switch_to_new_gdt(cpu);
+ init_utsname()->machine[1] =
+ '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
+ }
/*
- * Set up and load the per-CPU TSS and LDT
+ * Must be before alternatives because it might set or clear
+ * feature bits.
*/
- atomic_inc(&init_mm.mm_count);
- curr->active_mm = &init_mm;
- BUG_ON(curr->mm);
- enter_lazy_tlb(&init_mm, curr);
+ fpu__init_system();
+ fpu__init_cpu();
- load_sp0(t, thread);
- set_tss_desc(cpu, t);
- load_TR_desc();
- load_LDT(&init_mm.context);
+ /*
+ * This needs to follow the FPU initializtion, since EFI depends on it.
+ */
+ if (efi_enabled(EFI_RUNTIME_SERVICES))
+ efi_enter_virtual_mode();
- t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+ /*
+ * Ensure that access to the per CPU representation has the initial
+ * boot CPU configuration.
+ */
+ *c = boot_cpu_data;
+ c->initialized = true;
-#ifdef CONFIG_DOUBLEFAULT
- /* Set up doublefault TSS pointer in the GDT */
- __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
-#endif
+ alternative_instructions();
- clear_all_debug_regs();
- dbg_restore_debug_regs();
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ USER_PTR_MAX = TASK_SIZE_MAX;
- fpu_init();
-}
-#endif
+ /*
+ * Enable this when LAM is gated on LASS support
+ if (cpu_feature_enabled(X86_FEATURE_LAM))
+ USER_PTR_MAX = (1ul << 63) - PAGE_SIZE;
+ */
+ runtime_const_init(ptr, USER_PTR_MAX);
-#ifdef CONFIG_X86_DEBUG_STATIC_CPU_HAS
-void warn_pre_alternatives(void)
-{
- WARN(1, "You're using static_cpu_has before alternatives have run!\n");
-}
-EXPORT_SYMBOL_GPL(warn_pre_alternatives);
-#endif
+ /*
+ * Make sure the first 2MB area is not mapped by huge pages
+ * There are typically fixed size MTRRs in there and overlapping
+ * MTRRs into large pages causes slow downs.
+ *
+ * Right now we don't do that with gbpages because there seems
+ * very little benefit for that case.
+ */
+ if (!direct_gbpages)
+ set_memory_4k((unsigned long)__va(0), 1);
+ } else {
+ fpu__init_check_bugs();
+ }
-inline bool __static_cpu_has_safe(u16 bit)
-{
- return boot_cpu_has(bit);
+ /*
+ * This needs to be called before any devices perform DMA
+ * operations that might use the SWIOTLB bounce buffers. It will
+ * mark the bounce buffers as decrypted so that their usage will
+ * not cause "plain-text" data to be decrypted when accessed. It
+ * must be called after late_time_init() so that Hyper-V x86/x64
+ * hypercalls work when the SWIOTLB bounce buffers are decrypted.
+ */
+ mem_encrypt_init();
}
-EXPORT_SYMBOL_GPL(__static_cpu_has_safe);
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 4041c24ae7db..5c7a3a71191a 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -1,11 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef ARCH_X86_CPU_H
#define ARCH_X86_CPU_H
-struct cpu_model_info {
- int vendor;
- int family;
- const char *model_names[16];
-};
+#include <asm/cpu.h>
+#include <asm/topology.h>
+
+#include "topology.h"
/* attempt to consolidate cpu attributes */
struct cpu_dev {
@@ -14,33 +14,81 @@ struct cpu_dev {
/* some have two possibilities for cpuid string */
const char *c_ident[2];
- struct cpu_model_info c_models[4];
-
void (*c_early_init)(struct cpuinfo_x86 *);
void (*c_bsp_init)(struct cpuinfo_x86 *);
void (*c_init)(struct cpuinfo_x86 *);
void (*c_identify)(struct cpuinfo_x86 *);
void (*c_detect_tlb)(struct cpuinfo_x86 *);
- unsigned int (*c_size_cache)(struct cpuinfo_x86 *, unsigned int);
int c_x86_vendor;
-};
+#ifdef CONFIG_X86_32
+ /* Optional vendor specific routine to obtain the cache size. */
+ unsigned int (*legacy_cache_size)(struct cpuinfo_x86 *,
+ unsigned int);
-struct _tlb_table {
- unsigned char descriptor;
- char tlb_type;
- unsigned int entries;
- /* unsigned int ways; */
- char info[128];
+ /* Family/stepping-based lookup table for model names. */
+ struct legacy_cpu_model_info {
+ int family;
+ const char *model_names[16];
+ } legacy_models[5];
+#endif
};
#define cpu_dev_register(cpu_devX) \
static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \
- __attribute__((__section__(".x86_cpu_dev.init"))) = \
+ __section(".x86_cpu_dev.init") = \
&cpu_devX;
extern const struct cpu_dev *const __x86_cpu_dev_start[],
*const __x86_cpu_dev_end[];
+#ifdef CONFIG_CPU_SUP_INTEL
+extern void __init tsx_init(void);
+void tsx_ap_init(void);
+void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c);
+#else
+static inline void tsx_init(void) { }
+static inline void tsx_ap_init(void) { }
+static inline void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c) { }
+#endif /* CONFIG_CPU_SUP_INTEL */
+
+extern void init_spectral_chicken(struct cpuinfo_x86 *c);
+
extern void get_cpu_cap(struct cpuinfo_x86 *c);
+extern void get_cpu_address_sizes(struct cpuinfo_x86 *c);
extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
+extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
+extern void init_intel_cacheinfo(struct cpuinfo_x86 *c);
+extern void init_amd_cacheinfo(struct cpuinfo_x86 *c);
+extern void init_hygon_cacheinfo(struct cpuinfo_x86 *c);
+
+extern void check_null_seg_clears_base(struct cpuinfo_x86 *c);
+
+void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, u16 die_id);
+void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c);
+
+#if defined(CONFIG_AMD_NB) && defined(CONFIG_SYSFS)
+struct amd_northbridge *amd_init_l3_cache(int index);
+#else
+static inline struct amd_northbridge *amd_init_l3_cache(int index)
+{
+ return NULL;
+}
+#endif
+
+unsigned int aperfmperf_get_khz(int cpu);
+void cpu_select_mitigations(void);
+
+extern void x86_spec_ctrl_setup_ap(void);
+extern void update_srbds_msr(void);
+extern void update_gds_msr(void);
+
+extern enum spectre_v2_mitigation spectre_v2_enabled;
+
+static inline bool spectre_v2_in_eibrs_mode(enum spectre_v2_mitigation mode)
+{
+ return mode == SPECTRE_V2_EIBRS ||
+ mode == SPECTRE_V2_EIBRS_RETPOLINE ||
+ mode == SPECTRE_V2_EIBRS_LFENCE;
+}
+
#endif /* ARCH_X86_CPU_H */
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
new file mode 100644
index 000000000000..146f6f8b0650
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -0,0 +1,192 @@
+/* Declare dependencies between CPUIDs */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <asm/cpufeature.h>
+
+struct cpuid_dep {
+ unsigned int feature;
+ unsigned int depends;
+};
+
+/*
+ * Table of CPUID features that depend on others.
+ *
+ * This only includes dependencies that can be usefully disabled, not
+ * features part of the base set (like FPU).
+ *
+ * Note this all is not __init / __initdata because it can be
+ * called from cpu hotplug. It shouldn't do anything in this case,
+ * but it's difficult to tell that to the init reference checker.
+ */
+static const struct cpuid_dep cpuid_deps[] = {
+ { X86_FEATURE_FXSR, X86_FEATURE_FPU },
+ { X86_FEATURE_XSAVEOPT, X86_FEATURE_XSAVE },
+ { X86_FEATURE_XSAVEC, X86_FEATURE_XSAVE },
+ { X86_FEATURE_XSAVES, X86_FEATURE_XSAVE },
+ { X86_FEATURE_AVX, X86_FEATURE_XSAVE },
+ { X86_FEATURE_PKU, X86_FEATURE_XSAVE },
+ { X86_FEATURE_MPX, X86_FEATURE_XSAVE },
+ { X86_FEATURE_XGETBV1, X86_FEATURE_XSAVE },
+ { X86_FEATURE_APX, X86_FEATURE_XSAVE },
+ { X86_FEATURE_CMOV, X86_FEATURE_FXSR },
+ { X86_FEATURE_MMX, X86_FEATURE_FXSR },
+ { X86_FEATURE_MMXEXT, X86_FEATURE_MMX },
+ { X86_FEATURE_FXSR_OPT, X86_FEATURE_FXSR },
+ { X86_FEATURE_XSAVE, X86_FEATURE_FXSR },
+ { X86_FEATURE_XMM, X86_FEATURE_FXSR },
+ { X86_FEATURE_XMM2, X86_FEATURE_XMM },
+ { X86_FEATURE_XMM3, X86_FEATURE_XMM2 },
+ { X86_FEATURE_XMM4_1, X86_FEATURE_XMM2 },
+ { X86_FEATURE_XMM4_2, X86_FEATURE_XMM2 },
+ { X86_FEATURE_XMM3, X86_FEATURE_XMM2 },
+ { X86_FEATURE_PCLMULQDQ, X86_FEATURE_XMM2 },
+ { X86_FEATURE_SSSE3, X86_FEATURE_XMM2, },
+ { X86_FEATURE_F16C, X86_FEATURE_XMM2, },
+ { X86_FEATURE_AES, X86_FEATURE_XMM2 },
+ { X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 },
+ { X86_FEATURE_GFNI, X86_FEATURE_XMM2 },
+ { X86_FEATURE_AVX_VNNI, X86_FEATURE_AVX },
+ { X86_FEATURE_FMA, X86_FEATURE_AVX },
+ { X86_FEATURE_VAES, X86_FEATURE_AVX },
+ { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX },
+ { X86_FEATURE_AVX2, X86_FEATURE_AVX, },
+ { X86_FEATURE_AVX512F, X86_FEATURE_AVX, },
+ { X86_FEATURE_AVX512IFMA, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512PF, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512ER, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512CD, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512DQ, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512BW, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512_VBMI2, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_AVX512_VNNI, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_AVX512_BITALG, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512_VP2INTERSECT, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_CQM_OCCUP_LLC, X86_FEATURE_CQM_LLC },
+ { X86_FEATURE_CQM_MBM_TOTAL, X86_FEATURE_CQM_LLC },
+ { X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC },
+ { X86_FEATURE_BMEC, X86_FEATURE_CQM_MBM_TOTAL },
+ { X86_FEATURE_BMEC, X86_FEATURE_CQM_MBM_LOCAL },
+ { X86_FEATURE_SDCIAE, X86_FEATURE_CAT_L3 },
+ { X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_AVX512_FP16, X86_FEATURE_AVX512BW },
+ { X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES },
+ { X86_FEATURE_PER_THREAD_MBA, X86_FEATURE_MBA },
+ { X86_FEATURE_SGX_LC, X86_FEATURE_SGX },
+ { X86_FEATURE_SGX1, X86_FEATURE_SGX },
+ { X86_FEATURE_SGX2, X86_FEATURE_SGX1 },
+ { X86_FEATURE_SGX_EUPDATESVN, X86_FEATURE_SGX1 },
+ { X86_FEATURE_SGX_EDECCSSA, X86_FEATURE_SGX1 },
+ { X86_FEATURE_XFD, X86_FEATURE_XSAVES },
+ { X86_FEATURE_XFD, X86_FEATURE_XGETBV1 },
+ { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD },
+ { X86_FEATURE_AMX_FP16, X86_FEATURE_AMX_TILE },
+ { X86_FEATURE_AMX_BF16, X86_FEATURE_AMX_TILE },
+ { X86_FEATURE_AMX_INT8, X86_FEATURE_AMX_TILE },
+ { X86_FEATURE_SHSTK, X86_FEATURE_XSAVES },
+ { X86_FEATURE_FRED, X86_FEATURE_LKGS },
+ { X86_FEATURE_SPEC_CTRL_SSBD, X86_FEATURE_SPEC_CTRL },
+ { X86_FEATURE_LASS, X86_FEATURE_SMAP },
+ {}
+};
+
+static inline void clear_feature(struct cpuinfo_x86 *c, unsigned int feature)
+{
+ /*
+ * Note: This could use the non atomic __*_bit() variants, but the
+ * rest of the cpufeature code uses atomics as well, so keep it for
+ * consistency. Cleanup all of it separately.
+ */
+ if (!c) {
+ clear_cpu_cap(&boot_cpu_data, feature);
+ set_bit(feature, (unsigned long *)cpu_caps_cleared);
+ } else {
+ clear_bit(feature, (unsigned long *)c->x86_capability);
+ }
+}
+
+/* Take the capabilities and the BUG bits into account */
+#define MAX_FEATURE_BITS ((NCAPINTS + NBUGINTS) * sizeof(u32) * 8)
+
+static void do_clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature)
+{
+ DECLARE_BITMAP(disable, MAX_FEATURE_BITS);
+ const struct cpuid_dep *d;
+ bool changed;
+
+ if (WARN_ON(feature >= MAX_FEATURE_BITS))
+ return;
+
+ if (boot_cpu_has(feature))
+ WARN_ON(alternatives_patched);
+
+ clear_feature(c, feature);
+
+ /* Collect all features to disable, handling dependencies */
+ memset(disable, 0, sizeof(disable));
+ __set_bit(feature, disable);
+
+ /* Loop until we get a stable state. */
+ do {
+ changed = false;
+ for (d = cpuid_deps; d->feature; d++) {
+ if (!test_bit(d->depends, disable))
+ continue;
+ if (__test_and_set_bit(d->feature, disable))
+ continue;
+
+ changed = true;
+ clear_feature(c, d->feature);
+ }
+ } while (changed);
+}
+
+void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature)
+{
+ do_clear_cpu_cap(c, feature);
+}
+
+void setup_clear_cpu_cap(unsigned int feature)
+{
+ do_clear_cpu_cap(NULL, feature);
+}
+
+/*
+ * Return the feature "name" if available, otherwise return
+ * the X86_FEATURE_* numerals to make it easier to identify
+ * the feature.
+ */
+static const char *x86_feature_name(unsigned int feature, char *buf)
+{
+ if (x86_cap_flags[feature])
+ return x86_cap_flags[feature];
+
+ snprintf(buf, 16, "%d*32+%2d", feature / 32, feature % 32);
+
+ return buf;
+}
+
+void check_cpufeature_deps(struct cpuinfo_x86 *c)
+{
+ char feature_buf[16], depends_buf[16];
+ const struct cpuid_dep *d;
+
+ for (d = cpuid_deps; d->feature; d++) {
+ if (cpu_has(c, d->feature) && !cpu_has(c, d->depends)) {
+ /*
+ * Only warn about the first unmet dependency on the
+ * first CPU where it is encountered to avoid spamming
+ * the kernel log.
+ */
+ pr_warn_once("x86 CPU feature dependency check failure: CPU%d has '%s' enabled but '%s' disabled. Kernel might be fine, but no guarantees.\n",
+ smp_processor_id(),
+ x86_feature_name(d->feature, feature_buf),
+ x86_feature_name(d->depends, depends_buf));
+ }
+ }
+}
diff --git a/arch/x86/kernel/cpu/cpuid_0x2_table.c b/arch/x86/kernel/cpu/cpuid_0x2_table.c
new file mode 100644
index 000000000000..89bc8db5e9c6
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpuid_0x2_table.c
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/sizes.h>
+
+#include <asm/cpuid/types.h>
+
+#include "cpu.h"
+
+#define CACHE_ENTRY(_desc, _type, _size) \
+ [_desc] = { \
+ .c_type = (_type), \
+ .c_size = (_size) / SZ_1K, \
+ }
+
+#define TLB_ENTRY(_desc, _type, _entries) \
+ [_desc] = { \
+ .t_type = (_type), \
+ .entries = (_entries), \
+ }
+
+const struct leaf_0x2_table cpuid_0x2_table[256] = {
+ CACHE_ENTRY(0x06, CACHE_L1_INST, SZ_8K ), /* 4-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x08, CACHE_L1_INST, SZ_16K ), /* 4-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x09, CACHE_L1_INST, SZ_32K ), /* 4-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x0a, CACHE_L1_DATA, SZ_8K ), /* 2 way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x0c, CACHE_L1_DATA, SZ_16K ), /* 4-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x0d, CACHE_L1_DATA, SZ_16K ), /* 4-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x0e, CACHE_L1_DATA, SZ_24K ), /* 6-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x21, CACHE_L2, SZ_256K ), /* 8-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x22, CACHE_L3, SZ_512K ), /* 4-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x23, CACHE_L3, SZ_1M ), /* 8-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x25, CACHE_L3, SZ_2M ), /* 8-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x29, CACHE_L3, SZ_4M ), /* 8-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x2c, CACHE_L1_DATA, SZ_32K ), /* 8-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x30, CACHE_L1_INST, SZ_32K ), /* 8-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x39, CACHE_L2, SZ_128K ), /* 4-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x3a, CACHE_L2, SZ_192K ), /* 6-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x3b, CACHE_L2, SZ_128K ), /* 2-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x3c, CACHE_L2, SZ_256K ), /* 4-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x3d, CACHE_L2, SZ_384K ), /* 6-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x3e, CACHE_L2, SZ_512K ), /* 4-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x3f, CACHE_L2, SZ_256K ), /* 2-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x41, CACHE_L2, SZ_128K ), /* 4-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x42, CACHE_L2, SZ_256K ), /* 4-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x43, CACHE_L2, SZ_512K ), /* 4-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x44, CACHE_L2, SZ_1M ), /* 4-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x45, CACHE_L2, SZ_2M ), /* 4-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x46, CACHE_L3, SZ_4M ), /* 4-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x47, CACHE_L3, SZ_8M ), /* 8-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x48, CACHE_L2, SZ_3M ), /* 12-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x49, CACHE_L3, SZ_4M ), /* 16-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x4a, CACHE_L3, SZ_6M ), /* 12-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x4b, CACHE_L3, SZ_8M ), /* 16-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x4c, CACHE_L3, SZ_12M ), /* 12-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x4d, CACHE_L3, SZ_16M ), /* 16-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x4e, CACHE_L2, SZ_6M ), /* 24-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x60, CACHE_L1_DATA, SZ_16K ), /* 8-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x66, CACHE_L1_DATA, SZ_8K ), /* 4-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x67, CACHE_L1_DATA, SZ_16K ), /* 4-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x68, CACHE_L1_DATA, SZ_32K ), /* 4-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x78, CACHE_L2, SZ_1M ), /* 4-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x79, CACHE_L2, SZ_128K ), /* 8-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x7a, CACHE_L2, SZ_256K ), /* 8-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x7b, CACHE_L2, SZ_512K ), /* 8-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x7c, CACHE_L2, SZ_1M ), /* 8-way set assoc, sectored cache, 64 byte line size */
+ CACHE_ENTRY(0x7d, CACHE_L2, SZ_2M ), /* 8-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x7f, CACHE_L2, SZ_512K ), /* 2-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x80, CACHE_L2, SZ_512K ), /* 8-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x82, CACHE_L2, SZ_256K ), /* 8-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x83, CACHE_L2, SZ_512K ), /* 8-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x84, CACHE_L2, SZ_1M ), /* 8-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x85, CACHE_L2, SZ_2M ), /* 8-way set assoc, 32 byte line size */
+ CACHE_ENTRY(0x86, CACHE_L2, SZ_512K ), /* 4-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0x87, CACHE_L2, SZ_1M ), /* 8-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xd0, CACHE_L3, SZ_512K ), /* 4-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xd1, CACHE_L3, SZ_1M ), /* 4-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xd2, CACHE_L3, SZ_2M ), /* 4-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xd6, CACHE_L3, SZ_1M ), /* 8-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xd7, CACHE_L3, SZ_2M ), /* 8-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xd8, CACHE_L3, SZ_4M ), /* 12-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xdc, CACHE_L3, SZ_2M ), /* 12-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xdd, CACHE_L3, SZ_4M ), /* 12-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xde, CACHE_L3, SZ_8M ), /* 12-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xe2, CACHE_L3, SZ_2M ), /* 16-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xe3, CACHE_L3, SZ_4M ), /* 16-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xe4, CACHE_L3, SZ_8M ), /* 16-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xea, CACHE_L3, SZ_12M ), /* 24-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xeb, CACHE_L3, SZ_18M ), /* 24-way set assoc, 64 byte line size */
+ CACHE_ENTRY(0xec, CACHE_L3, SZ_24M ), /* 24-way set assoc, 64 byte line size */
+
+ TLB_ENTRY( 0x01, TLB_INST_4K, 32 ), /* TLB_INST 4 KByte pages, 4-way set associative */
+ TLB_ENTRY( 0x02, TLB_INST_4M, 2 ), /* TLB_INST 4 MByte pages, full associative */
+ TLB_ENTRY( 0x03, TLB_DATA_4K, 64 ), /* TLB_DATA 4 KByte pages, 4-way set associative */
+ TLB_ENTRY( 0x04, TLB_DATA_4M, 8 ), /* TLB_DATA 4 MByte pages, 4-way set associative */
+ TLB_ENTRY( 0x05, TLB_DATA_4M, 32 ), /* TLB_DATA 4 MByte pages, 4-way set associative */
+ TLB_ENTRY( 0x0b, TLB_INST_4M, 4 ), /* TLB_INST 4 MByte pages, 4-way set associative */
+ TLB_ENTRY( 0x4f, TLB_INST_4K, 32 ), /* TLB_INST 4 KByte pages */
+ TLB_ENTRY( 0x50, TLB_INST_ALL, 64 ), /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */
+ TLB_ENTRY( 0x51, TLB_INST_ALL, 128 ), /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */
+ TLB_ENTRY( 0x52, TLB_INST_ALL, 256 ), /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */
+ TLB_ENTRY( 0x55, TLB_INST_2M_4M, 7 ), /* TLB_INST 2-MByte or 4-MByte pages, fully associative */
+ TLB_ENTRY( 0x56, TLB_DATA0_4M, 16 ), /* TLB_DATA0 4 MByte pages, 4-way set associative */
+ TLB_ENTRY( 0x57, TLB_DATA0_4K, 16 ), /* TLB_DATA0 4 KByte pages, 4-way associative */
+ TLB_ENTRY( 0x59, TLB_DATA0_4K, 16 ), /* TLB_DATA0 4 KByte pages, fully associative */
+ TLB_ENTRY( 0x5a, TLB_DATA0_2M_4M, 32 ), /* TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative */
+ TLB_ENTRY( 0x5b, TLB_DATA_4K_4M, 64 ), /* TLB_DATA 4 KByte and 4 MByte pages */
+ TLB_ENTRY( 0x5c, TLB_DATA_4K_4M, 128 ), /* TLB_DATA 4 KByte and 4 MByte pages */
+ TLB_ENTRY( 0x5d, TLB_DATA_4K_4M, 256 ), /* TLB_DATA 4 KByte and 4 MByte pages */
+ TLB_ENTRY( 0x61, TLB_INST_4K, 48 ), /* TLB_INST 4 KByte pages, full associative */
+ TLB_ENTRY( 0x63, TLB_DATA_1G_2M_4M, 4 ), /* TLB_DATA 1 GByte pages, 4-way set associative
+ * (plus 32 entries TLB_DATA 2 MByte or 4 MByte pages, not encoded here) */
+ TLB_ENTRY( 0x6b, TLB_DATA_4K, 256 ), /* TLB_DATA 4 KByte pages, 8-way associative */
+ TLB_ENTRY( 0x6c, TLB_DATA_2M_4M, 128 ), /* TLB_DATA 2 MByte or 4 MByte pages, 8-way associative */
+ TLB_ENTRY( 0x6d, TLB_DATA_1G, 16 ), /* TLB_DATA 1 GByte pages, fully associative */
+ TLB_ENTRY( 0x76, TLB_INST_2M_4M, 8 ), /* TLB_INST 2-MByte or 4-MByte pages, fully associative */
+ TLB_ENTRY( 0xb0, TLB_INST_4K, 128 ), /* TLB_INST 4 KByte pages, 4-way set associative */
+ TLB_ENTRY( 0xb1, TLB_INST_2M_4M, 4 ), /* TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries */
+ TLB_ENTRY( 0xb2, TLB_INST_4K, 64 ), /* TLB_INST 4KByte pages, 4-way set associative */
+ TLB_ENTRY( 0xb3, TLB_DATA_4K, 128 ), /* TLB_DATA 4 KByte pages, 4-way set associative */
+ TLB_ENTRY( 0xb4, TLB_DATA_4K, 256 ), /* TLB_DATA 4 KByte pages, 4-way associative */
+ TLB_ENTRY( 0xb5, TLB_INST_4K, 64 ), /* TLB_INST 4 KByte pages, 8-way set associative */
+ TLB_ENTRY( 0xb6, TLB_INST_4K, 128 ), /* TLB_INST 4 KByte pages, 8-way set associative */
+ TLB_ENTRY( 0xba, TLB_DATA_4K, 64 ), /* TLB_DATA 4 KByte pages, 4-way associative */
+ TLB_ENTRY( 0xc0, TLB_DATA_4K_4M, 8 ), /* TLB_DATA 4 KByte and 4 MByte pages, 4-way associative */
+ TLB_ENTRY( 0xc1, STLB_4K_2M, 1024 ), /* STLB 4 KByte and 2 MByte pages, 8-way associative */
+ TLB_ENTRY( 0xc2, TLB_DATA_2M_4M, 16 ), /* TLB_DATA 2 MByte/4MByte pages, 4-way associative */
+ TLB_ENTRY( 0xca, STLB_4K, 512 ), /* STLB 4 KByte pages, 4-way associative */
+};
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 7582f475b163..dfec2c61e354 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -1,6 +1,7 @@
-#include <linux/init.h>
+// SPDX-License-Identifier: GPL-2.0
#include <linux/bitops.h>
#include <linux/delay.h>
+#include <linux/isa-dma.h>
#include <linux/pci.h>
#include <asm/dma.h>
#include <linux/io.h>
@@ -9,13 +10,16 @@
#include <linux/timer.h>
#include <asm/pci-direct.h>
#include <asm/tsc.h>
+#include <asm/cpufeature.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
#include "cpu.h"
/*
* Read NSC/Cyrix DEVID registers (DIR) to get more detailed info. about the CPU
*/
-static void __cpuinit __do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
+static void __do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
{
unsigned char ccr2, ccr3;
@@ -44,7 +48,7 @@ static void __cpuinit __do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
}
}
-static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
+static void do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
{
unsigned long flags;
@@ -59,25 +63,25 @@ static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
* Actually since bugs.h doesn't even reference this perhaps someone should
* fix the documentation ???
*/
-static unsigned char Cx86_dir0_msb __cpuinitdata = 0;
+static unsigned char Cx86_dir0_msb = 0;
-static const char __cpuinitconst Cx86_model[][9] = {
+static const char Cx86_model[][9] = {
"Cx486", "Cx486", "5x86 ", "6x86", "MediaGX ", "6x86MX ",
"M II ", "Unknown"
};
-static const char __cpuinitconst Cx486_name[][5] = {
+static const char Cx486_name[][5] = {
"SLC", "DLC", "SLC2", "DLC2", "SRx", "DRx",
"SRx2", "DRx2"
};
-static const char __cpuinitconst Cx486S_name[][4] = {
+static const char Cx486S_name[][4] = {
"S", "S2", "Se", "S2e"
};
-static const char __cpuinitconst Cx486D_name[][4] = {
+static const char Cx486D_name[][4] = {
"DX", "DX2", "?", "?", "?", "DX4"
};
-static char Cx86_cb[] __cpuinitdata = "?.5x Core/Bus Clock";
-static const char __cpuinitconst cyrix_model_mult1[] = "12??43";
-static const char __cpuinitconst cyrix_model_mult2[] = "12233445";
+static char Cx86_cb[] = "?.5x Core/Bus Clock";
+static const char cyrix_model_mult1[] = "12??43";
+static const char cyrix_model_mult2[] = "12233445";
/*
* Reset the slow-loop (SLOP) bit on the 686(L) which is set by some old
@@ -87,7 +91,7 @@ static const char __cpuinitconst cyrix_model_mult2[] = "12233445";
* FIXME: our newer udelay uses the tsc. We don't need to frob with SLOP
*/
-static void __cpuinit check_cx686_slop(struct cpuinfo_x86 *c)
+static void check_cx686_slop(struct cpuinfo_x86 *c)
{
unsigned long flags;
@@ -104,7 +108,7 @@ static void __cpuinit check_cx686_slop(struct cpuinfo_x86 *c)
local_irq_restore(flags);
if (ccr5 & 2) { /* possible wrong calibration done */
- printk(KERN_INFO "Recalibrating delay loop with SLOP bit reset\n");
+ pr_info("Recalibrating delay loop with SLOP bit reset\n");
calibrate_delay();
c->loops_per_jiffy = loops_per_jiffy;
}
@@ -112,52 +116,52 @@ static void __cpuinit check_cx686_slop(struct cpuinfo_x86 *c)
}
-static void __cpuinit set_cx86_reorder(void)
+static void set_cx86_reorder(void)
{
u8 ccr3;
- printk(KERN_INFO "Enable Memory access reorder on Cyrix/NSC processor.\n");
+ pr_info("Enable Memory access reorder on Cyrix/NSC processor.\n");
ccr3 = getCx86(CX86_CCR3);
setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
/* Load/Store Serialize to mem access disable (=reorder it) */
- setCx86_old(CX86_PCR0, getCx86_old(CX86_PCR0) & ~0x80);
+ setCx86(CX86_PCR0, getCx86(CX86_PCR0) & ~0x80);
/* set load/store serialize from 1GB to 4GB */
ccr3 |= 0xe0;
setCx86(CX86_CCR3, ccr3);
}
-static void __cpuinit set_cx86_memwb(void)
+static void set_cx86_memwb(void)
{
- printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n");
+ pr_info("Enable Memory-Write-back mode on Cyrix/NSC processor.\n");
/* CCR2 bit 2: unlock NW bit */
- setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) & ~0x04);
+ setCx86(CX86_CCR2, getCx86(CX86_CCR2) & ~0x04);
/* set 'Not Write-through' */
write_cr0(read_cr0() | X86_CR0_NW);
/* CCR2 bit 2: lock NW bit and set WT1 */
- setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x14);
+ setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14);
}
/*
* Configure later MediaGX and/or Geode processor.
*/
-static void __cpuinit geode_configure(void)
+static void geode_configure(void)
{
unsigned long flags;
u8 ccr3;
local_irq_save(flags);
- /* Suspend on halt power saving and enable #SUSP pin */
- setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x88);
+ /* Suspend on halt power saving */
+ setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x08);
ccr3 = getCx86(CX86_CCR3);
setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
/* FPU fast, DTE cache, Mem bypass */
- setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x38);
+ setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x38);
setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
set_cx86_memwb();
@@ -166,7 +170,7 @@ static void __cpuinit geode_configure(void)
local_irq_restore(flags);
}
-static void __cpuinit early_init_cyrix(struct cpuinfo_x86 *c)
+static void early_init_cyrix(struct cpuinfo_x86 *c)
{
unsigned char dir0, dir0_msn, dir1 = 0;
@@ -185,7 +189,7 @@ static void __cpuinit early_init_cyrix(struct cpuinfo_x86 *c)
}
}
-static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
+static void init_cyrix(struct cpuinfo_x86 *c)
{
unsigned char dir0, dir0_msn, dir0_lsn, dir1 = 0;
char *buf = c->x86_model_id;
@@ -212,7 +216,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
/* common case step number/rev -- exceptions handled below */
c->x86_model = (dir1 >> 4) + 1;
- c->x86_mask = dir1 & 0xf;
+ c->x86_stepping = dir1 & 0xf;
/* Now cook; the original recipe is by Channing Corn, from Cyrix.
* We do the same thing for each generation: we work out
@@ -253,6 +257,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
break;
case 4: /* MediaGX/GXm or Geode GXM/GXLV/GX1 */
+ case 11: /* GX1 with inverted Device ID */
#ifdef CONFIG_PCI
{
u32 vendor, device;
@@ -269,7 +274,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
* VSA1 we work around however.
*/
- printk(KERN_INFO "Working around Cyrix MediaGX virtual DMA bugs.\n");
+ pr_info("Working around Cyrix MediaGX virtual DMA bugs.\n");
isa_dma_bridge_buggy = 2;
/* We do this before the PCI layer is running. However we
@@ -287,12 +292,12 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
mark_tsc_unstable("cyrix 5510/5520 detected");
}
#endif
- c->x86_cache_size = 16; /* Yep 16K integrated cache thats it */
+ c->x86_cache_size = 16; /* Yep 16K integrated cache that's it */
/* GXm supports extended cpuid levels 'ala' AMD */
if (c->cpuid_level == 2) {
/* Enable cxMMX extensions (GX1 Datasheet 54) */
- setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7) | 1);
+ setCx86(CX86_CCR7, getCx86(CX86_CCR7) | 1);
/*
* GXm : 0x30 ... 0x5f GXm datasheet 51
@@ -315,7 +320,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
if (dir1 > 7) {
dir0_msn++; /* M II */
/* Enable MMX extensions (App note 108) */
- setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7)|1);
+ setCx86(CX86_CCR7, getCx86(CX86_CCR7)|1);
} else {
/* A 6x86MX - it has the bug. */
set_cpu_bug(c, X86_BUG_COMA);
@@ -333,7 +338,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
switch (dir0_lsn) {
case 0xd: /* either a 486SLC or DLC w/o DEVID */
dir0_msn = 0;
- p = Cx486_name[(cpu_has_fpu ? 1 : 0)];
+ p = Cx486_name[!!boot_cpu_has(X86_FEATURE_FPU)];
break;
case 0xe: /* a 486S A step */
@@ -356,7 +361,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
/*
* Handle National Semiconductor branded processors
*/
-static void __cpuinit init_nsc(struct cpuinfo_x86 *c)
+static void init_nsc(struct cpuinfo_x86 *c)
{
/*
* There may be GX1 processors in the wild that are branded
@@ -405,7 +410,7 @@ static inline int test_cyrix_52div(void)
return (unsigned char) (test >> 8) == 0x02;
}
-static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c)
+static void cyrix_identify(struct cpuinfo_x86 *c)
{
/* Detect Cyrix with disabled CPUID */
if (c->x86 == 4 && test_cyrix_52div()) {
@@ -427,13 +432,13 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c)
if (dir0 == 5 || dir0 == 3) {
unsigned char ccr3;
unsigned long flags;
- printk(KERN_INFO "Enabling CPUID on Cyrix processor.\n");
+ pr_info("Enabling CPUID on Cyrix processor.\n");
local_irq_save(flags);
ccr3 = getCx86(CX86_CCR3);
/* enable MAPEN */
setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10);
/* enable cpuid */
- setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x80);
+ setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x80);
/* disable MAPEN */
setCx86(CX86_CCR3, ccr3);
local_irq_restore(flags);
@@ -441,7 +446,7 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c)
}
}
-static const struct cpu_dev __cpuinitconst cyrix_cpu_dev = {
+static const struct cpu_dev cyrix_cpu_dev = {
.c_vendor = "Cyrix",
.c_ident = { "CyrixInstead" },
.c_early_init = early_init_cyrix,
@@ -452,7 +457,7 @@ static const struct cpu_dev __cpuinitconst cyrix_cpu_dev = {
cpu_dev_register(cyrix_cpu_dev);
-static const struct cpu_dev __cpuinitconst nsc_cpu_dev = {
+static const struct cpu_dev nsc_cpu_dev = {
.c_vendor = "NSC",
.c_ident = { "Geode by NSC" },
.c_init = init_nsc,
diff --git a/arch/x86/kernel/cpu/debugfs.c b/arch/x86/kernel/cpu/debugfs.c
new file mode 100644
index 000000000000..1976fef2dfe5
--- /dev/null
+++ b/arch/x86/kernel/cpu/debugfs.c
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/debugfs.h>
+
+#include <asm/apic.h>
+#include <asm/processor.h>
+
+#include "cpu.h"
+
+static int cpu_debug_show(struct seq_file *m, void *p)
+{
+ unsigned long cpu = (unsigned long)m->private;
+ struct cpuinfo_x86 *c = per_cpu_ptr(&cpu_info, cpu);
+
+ seq_printf(m, "online: %d\n", cpu_online(cpu));
+ if (!c->initialized)
+ return 0;
+
+ seq_printf(m, "initial_apicid: 0x%x\n", c->topo.initial_apicid);
+ seq_printf(m, "apicid: 0x%x\n", c->topo.apicid);
+ seq_printf(m, "pkg_id: %u\n", c->topo.pkg_id);
+ seq_printf(m, "die_id: %u\n", c->topo.die_id);
+ seq_printf(m, "cu_id: %u\n", c->topo.cu_id);
+ seq_printf(m, "core_id: %u\n", c->topo.core_id);
+ seq_printf(m, "cpu_type: %s\n", get_topology_cpu_type_name(c));
+ seq_printf(m, "logical_pkg_id: %u\n", c->topo.logical_pkg_id);
+ seq_printf(m, "logical_die_id: %u\n", c->topo.logical_die_id);
+ seq_printf(m, "logical_core_id: %u\n", c->topo.logical_core_id);
+ seq_printf(m, "llc_id: %u\n", c->topo.llc_id);
+ seq_printf(m, "l2c_id: %u\n", c->topo.l2c_id);
+ seq_printf(m, "amd_node_id: %u\n", c->topo.amd_node_id);
+ seq_printf(m, "amd_nodes_per_pkg: %u\n", topology_amd_nodes_per_pkg());
+ seq_printf(m, "num_threads: %u\n", __num_threads_per_package);
+ seq_printf(m, "num_cores: %u\n", __num_cores_per_package);
+ seq_printf(m, "max_dies_per_pkg: %u\n", __max_dies_per_package);
+ seq_printf(m, "max_threads_per_core:%u\n", __max_threads_per_core);
+ return 0;
+}
+
+static int cpu_debug_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, cpu_debug_show, inode->i_private);
+}
+
+static const struct file_operations dfs_cpu_ops = {
+ .open = cpu_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int dom_debug_show(struct seq_file *m, void *p)
+{
+ static const char *domain_names[TOPO_MAX_DOMAIN] = {
+ [TOPO_SMT_DOMAIN] = "Thread",
+ [TOPO_CORE_DOMAIN] = "Core",
+ [TOPO_MODULE_DOMAIN] = "Module",
+ [TOPO_TILE_DOMAIN] = "Tile",
+ [TOPO_DIE_DOMAIN] = "Die",
+ [TOPO_DIEGRP_DOMAIN] = "DieGrp",
+ [TOPO_PKG_DOMAIN] = "Package",
+ };
+ unsigned int dom, nthreads = 1;
+
+ for (dom = 0; dom < TOPO_MAX_DOMAIN; dom++) {
+ nthreads *= x86_topo_system.dom_size[dom];
+ seq_printf(m, "domain: %-10s shift: %u dom_size: %5u max_threads: %5u\n",
+ domain_names[dom], x86_topo_system.dom_shifts[dom],
+ x86_topo_system.dom_size[dom], nthreads);
+ }
+ return 0;
+}
+
+static int dom_debug_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, dom_debug_show, inode->i_private);
+}
+
+static const struct file_operations dfs_dom_ops = {
+ .open = dom_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static __init int cpu_init_debugfs(void)
+{
+ struct dentry *dir, *base = debugfs_create_dir("topo", arch_debugfs_dir);
+ unsigned long id;
+ char name[24];
+
+ debugfs_create_file("domains", 0444, base, NULL, &dfs_dom_ops);
+
+ dir = debugfs_create_dir("cpus", base);
+ for_each_possible_cpu(id) {
+ sprintf(name, "%lu", id);
+ debugfs_create_file(name, 0444, dir, (void *)id, &dfs_cpu_ops);
+ }
+ return 0;
+}
+late_initcall(cpu_init_debugfs);
diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c
new file mode 100644
index 000000000000..d69757246bde
--- /dev/null
+++ b/arch/x86/kernel/cpu/feat_ctl.c
@@ -0,0 +1,215 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/tboot.h>
+
+#include <asm/cpu.h>
+#include <asm/cpufeature.h>
+#include <asm/msr-index.h>
+#include <asm/msr.h>
+#include <asm/processor.h>
+#include <asm/vmx.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt) "x86/cpu: " fmt
+
+#ifdef CONFIG_X86_VMX_FEATURE_NAMES
+enum vmx_feature_leafs {
+ MISC_FEATURES = 0,
+ PRIMARY_CTLS,
+ SECONDARY_CTLS,
+ TERTIARY_CTLS_LOW,
+ TERTIARY_CTLS_HIGH,
+ NR_VMX_FEATURE_WORDS,
+};
+
+#define VMX_F(x) BIT(VMX_FEATURE_##x & 0x1f)
+
+static void init_vmx_capabilities(struct cpuinfo_x86 *c)
+{
+ u32 supported, funcs, ept, vpid, ign, low, high;
+
+ BUILD_BUG_ON(NVMXINTS != NR_VMX_FEATURE_WORDS);
+
+ /*
+ * The high bits contain the allowed-1 settings, i.e. features that can
+ * be turned on. The low bits contain the allowed-0 settings, i.e.
+ * features that can be turned off. Ignore the allowed-0 settings,
+ * if a feature can be turned on then it's supported.
+ *
+ * Use raw rdmsr() for primary processor controls and pin controls MSRs
+ * as they exist on any CPU that supports VMX, i.e. we want the WARN if
+ * the RDMSR faults.
+ */
+ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, ign, supported);
+ c->vmx_capability[PRIMARY_CTLS] = supported;
+
+ rdmsr_safe(MSR_IA32_VMX_PROCBASED_CTLS2, &ign, &supported);
+ c->vmx_capability[SECONDARY_CTLS] = supported;
+
+ /* All 64 bits of tertiary controls MSR are allowed-1 settings. */
+ rdmsr_safe(MSR_IA32_VMX_PROCBASED_CTLS3, &low, &high);
+ c->vmx_capability[TERTIARY_CTLS_LOW] = low;
+ c->vmx_capability[TERTIARY_CTLS_HIGH] = high;
+
+ rdmsr(MSR_IA32_VMX_PINBASED_CTLS, ign, supported);
+ rdmsr_safe(MSR_IA32_VMX_VMFUNC, &ign, &funcs);
+
+ /*
+ * Except for EPT+VPID, which enumerates support for both in a single
+ * MSR, low for EPT, high for VPID.
+ */
+ rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, &ept, &vpid);
+
+ /* Pin, EPT, VPID and VM-Func are merged into a single word. */
+ WARN_ON_ONCE(supported >> 16);
+ WARN_ON_ONCE(funcs >> 4);
+ c->vmx_capability[MISC_FEATURES] = (supported & 0xffff) |
+ ((vpid & 0x1) << 16) |
+ ((funcs & 0xf) << 28);
+
+ /* EPT bits are full on scattered and must be manually handled. */
+ if (ept & VMX_EPT_EXECUTE_ONLY_BIT)
+ c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_EXECUTE_ONLY);
+ if (ept & VMX_EPT_AD_BIT)
+ c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_AD);
+ if (ept & VMX_EPT_1GB_PAGE_BIT)
+ c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_1GB);
+ if (ept & VMX_EPT_PAGE_WALK_5_BIT)
+ c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_5LEVEL);
+
+ /* Synthetic APIC features that are aggregates of multiple features. */
+ if ((c->vmx_capability[PRIMARY_CTLS] & VMX_F(VIRTUAL_TPR)) &&
+ (c->vmx_capability[SECONDARY_CTLS] & VMX_F(VIRT_APIC_ACCESSES)))
+ c->vmx_capability[MISC_FEATURES] |= VMX_F(FLEXPRIORITY);
+
+ if ((c->vmx_capability[PRIMARY_CTLS] & VMX_F(VIRTUAL_TPR)) &&
+ (c->vmx_capability[SECONDARY_CTLS] & VMX_F(APIC_REGISTER_VIRT)) &&
+ (c->vmx_capability[SECONDARY_CTLS] & VMX_F(VIRT_INTR_DELIVERY)) &&
+ (c->vmx_capability[MISC_FEATURES] & VMX_F(POSTED_INTR)))
+ c->vmx_capability[MISC_FEATURES] |= VMX_F(APICV);
+
+ /* Set the synthetic cpufeatures to preserve /proc/cpuinfo's ABI. */
+ if (c->vmx_capability[PRIMARY_CTLS] & VMX_F(VIRTUAL_TPR))
+ set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
+ if (c->vmx_capability[MISC_FEATURES] & VMX_F(FLEXPRIORITY))
+ set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
+ if (c->vmx_capability[MISC_FEATURES] & VMX_F(VIRTUAL_NMIS))
+ set_cpu_cap(c, X86_FEATURE_VNMI);
+ if (c->vmx_capability[SECONDARY_CTLS] & VMX_F(EPT))
+ set_cpu_cap(c, X86_FEATURE_EPT);
+ if (c->vmx_capability[MISC_FEATURES] & VMX_F(EPT_AD))
+ set_cpu_cap(c, X86_FEATURE_EPT_AD);
+ if (c->vmx_capability[MISC_FEATURES] & VMX_F(VPID))
+ set_cpu_cap(c, X86_FEATURE_VPID);
+}
+#endif /* CONFIG_X86_VMX_FEATURE_NAMES */
+
+static int __init nosgx(char *str)
+{
+ setup_clear_cpu_cap(X86_FEATURE_SGX);
+
+ return 0;
+}
+
+early_param("nosgx", nosgx);
+
+void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
+{
+ bool enable_sgx_kvm = false, enable_sgx_driver = false;
+ bool tboot = tboot_enabled();
+ bool enable_vmx;
+ u64 msr;
+
+ if (rdmsrq_safe(MSR_IA32_FEAT_CTL, &msr)) {
+ clear_cpu_cap(c, X86_FEATURE_VMX);
+ clear_cpu_cap(c, X86_FEATURE_SGX);
+ return;
+ }
+
+ enable_vmx = cpu_has(c, X86_FEATURE_VMX) &&
+ IS_ENABLED(CONFIG_KVM_INTEL);
+
+ if (cpu_has(c, X86_FEATURE_SGX) && IS_ENABLED(CONFIG_X86_SGX)) {
+ /*
+ * Separate out SGX driver enabling from KVM. This allows KVM
+ * guests to use SGX even if the kernel SGX driver refuses to
+ * use it. This happens if flexible Launch Control is not
+ * available.
+ */
+ enable_sgx_driver = cpu_has(c, X86_FEATURE_SGX_LC);
+ enable_sgx_kvm = enable_vmx && IS_ENABLED(CONFIG_X86_SGX_KVM);
+ }
+
+ if (msr & FEAT_CTL_LOCKED)
+ goto update_caps;
+
+ /*
+ * Ignore whatever value BIOS left in the MSR to avoid enabling random
+ * features or faulting on the WRMSR.
+ */
+ msr = FEAT_CTL_LOCKED;
+
+ /*
+ * Enable VMX if and only if the kernel may do VMXON at some point,
+ * i.e. KVM is enabled, to avoid unnecessarily adding an attack vector
+ * for the kernel, e.g. using VMX to hide malicious code.
+ */
+ if (enable_vmx) {
+ msr |= FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
+
+ if (tboot)
+ msr |= FEAT_CTL_VMX_ENABLED_INSIDE_SMX;
+ }
+
+ if (enable_sgx_kvm || enable_sgx_driver) {
+ msr |= FEAT_CTL_SGX_ENABLED;
+ if (enable_sgx_driver)
+ msr |= FEAT_CTL_SGX_LC_ENABLED;
+ }
+
+ wrmsrq(MSR_IA32_FEAT_CTL, msr);
+
+update_caps:
+ set_cpu_cap(c, X86_FEATURE_MSR_IA32_FEAT_CTL);
+
+ if (!cpu_has(c, X86_FEATURE_VMX))
+ goto update_sgx;
+
+ if ( (tboot && !(msr & FEAT_CTL_VMX_ENABLED_INSIDE_SMX)) ||
+ (!tboot && !(msr & FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX))) {
+ if (IS_ENABLED(CONFIG_KVM_INTEL))
+ pr_err_once("VMX (%s TXT) disabled by BIOS\n",
+ tboot ? "inside" : "outside");
+ clear_cpu_cap(c, X86_FEATURE_VMX);
+ } else {
+#ifdef CONFIG_X86_VMX_FEATURE_NAMES
+ init_vmx_capabilities(c);
+#endif
+ }
+
+update_sgx:
+ if (!(msr & FEAT_CTL_SGX_ENABLED)) {
+ if (enable_sgx_kvm || enable_sgx_driver)
+ pr_err_once("SGX disabled or unsupported by BIOS.\n");
+ clear_cpu_cap(c, X86_FEATURE_SGX);
+ return;
+ }
+
+ /*
+ * VMX feature bit may be cleared due to being disabled in BIOS,
+ * in which case SGX virtualization cannot be supported either.
+ */
+ if (!cpu_has(c, X86_FEATURE_VMX) && enable_sgx_kvm) {
+ pr_err_once("SGX virtualization disabled due to lack of VMX.\n");
+ enable_sgx_kvm = 0;
+ }
+
+ if (!(msr & FEAT_CTL_SGX_LC_ENABLED) && enable_sgx_driver) {
+ if (!enable_sgx_kvm) {
+ pr_err_once("SGX Launch Control is locked. Disable SGX.\n");
+ clear_cpu_cap(c, X86_FEATURE_SGX);
+ } else {
+ pr_err_once("SGX Launch Control is locked. Support SGX virtualization only.\n");
+ clear_cpu_cap(c, X86_FEATURE_SGX_LC);
+ }
+ }
+}
diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c
new file mode 100644
index 000000000000..1fda6c3a2b65
--- /dev/null
+++ b/arch/x86/kernel/cpu/hygon.c
@@ -0,0 +1,279 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Hygon Processor Support for Linux
+ *
+ * Copyright (C) 2018 Chengdu Haiguang IC Design Co., Ltd.
+ *
+ * Author: Pu Wen <puwen@hygon.cn>
+ */
+#include <linux/io.h>
+
+#include <asm/apic.h>
+#include <asm/cpu.h>
+#include <asm/smp.h>
+#include <asm/numa.h>
+#include <asm/cacheinfo.h>
+#include <asm/spec-ctrl.h>
+#include <asm/delay.h>
+#include <asm/msr.h>
+#include <asm/resctrl.h>
+
+#include "cpu.h"
+
+#ifdef CONFIG_NUMA
+/*
+ * To workaround broken NUMA config. Read the comment in
+ * srat_detect_node().
+ */
+static int nearby_node(int apicid)
+{
+ int i, node;
+
+ for (i = apicid - 1; i >= 0; i--) {
+ node = __apicid_to_node[i];
+ if (node != NUMA_NO_NODE && node_online(node))
+ return node;
+ }
+ for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
+ node = __apicid_to_node[i];
+ if (node != NUMA_NO_NODE && node_online(node))
+ return node;
+ }
+ return first_node(node_online_map); /* Shouldn't happen */
+}
+#endif
+
+static void srat_detect_node(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_NUMA
+ int cpu = smp_processor_id();
+ int node;
+ unsigned int apicid = c->topo.apicid;
+
+ node = numa_cpu_node(cpu);
+ if (node == NUMA_NO_NODE)
+ node = c->topo.llc_id;
+
+ /*
+ * On multi-fabric platform (e.g. Numascale NumaChip) a
+ * platform-specific handler needs to be called to fixup some
+ * IDs of the CPU.
+ */
+ if (x86_cpuinit.fixup_cpu_id)
+ x86_cpuinit.fixup_cpu_id(c, node);
+
+ if (!node_online(node)) {
+ /*
+ * Two possibilities here:
+ *
+ * - The CPU is missing memory and no node was created. In
+ * that case try picking one from a nearby CPU.
+ *
+ * - The APIC IDs differ from the HyperTransport node IDs.
+ * Assume they are all increased by a constant offset, but
+ * in the same order as the HT nodeids. If that doesn't
+ * result in a usable node fall back to the path for the
+ * previous case.
+ *
+ * This workaround operates directly on the mapping between
+ * APIC ID and NUMA node, assuming certain relationship
+ * between APIC ID, HT node ID and NUMA topology. As going
+ * through CPU mapping may alter the outcome, directly
+ * access __apicid_to_node[].
+ */
+ int ht_nodeid = c->topo.initial_apicid;
+
+ if (__apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
+ node = __apicid_to_node[ht_nodeid];
+ /* Pick a nearby node */
+ if (!node_online(node))
+ node = nearby_node(apicid);
+ }
+ numa_set_node(cpu, node);
+#endif
+}
+
+static void bsp_init_hygon(struct cpuinfo_x86 *c)
+{
+ if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) {
+ u64 val;
+
+ rdmsrq(MSR_K7_HWCR, val);
+ if (!(val & BIT(24)))
+ pr_warn(FW_BUG "TSC doesn't count with P0 frequency!\n");
+ }
+
+ if (cpu_has(c, X86_FEATURE_MWAITX))
+ use_mwaitx_delay();
+
+ if (!boot_cpu_has(X86_FEATURE_AMD_SSBD) &&
+ !boot_cpu_has(X86_FEATURE_VIRT_SSBD)) {
+ /*
+ * Try to cache the base value so further operations can
+ * avoid RMW. If that faults, do not enable SSBD.
+ */
+ if (!rdmsrq_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) {
+ setup_force_cpu_cap(X86_FEATURE_LS_CFG_SSBD);
+ setup_force_cpu_cap(X86_FEATURE_SSBD);
+ x86_amd_ls_cfg_ssbd_mask = 1ULL << 10;
+ }
+ }
+
+ resctrl_cpu_detect(c);
+}
+
+static void early_init_hygon(struct cpuinfo_x86 *c)
+{
+ u32 dummy;
+
+ set_cpu_cap(c, X86_FEATURE_K8);
+
+ rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
+
+ /*
+ * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
+ * with P/T states and does not stop in deep C-states
+ */
+ if (c->x86_power & (1 << 8)) {
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+ set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
+ }
+
+ /* Bit 12 of 8000_0007 edx is accumulated power mechanism. */
+ if (c->x86_power & BIT(12))
+ set_cpu_cap(c, X86_FEATURE_ACC_POWER);
+
+ /* Bit 14 indicates the Runtime Average Power Limit interface. */
+ if (c->x86_power & BIT(14))
+ set_cpu_cap(c, X86_FEATURE_RAPL);
+
+#ifdef CONFIG_X86_64
+ set_cpu_cap(c, X86_FEATURE_SYSCALL32);
+#endif
+
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI)
+ /*
+ * ApicID can always be treated as an 8-bit value for Hygon APIC So, we
+ * can safely set X86_FEATURE_EXTD_APICID unconditionally.
+ */
+ if (boot_cpu_has(X86_FEATURE_APIC))
+ set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
+#endif
+
+ /*
+ * This is only needed to tell the kernel whether to use VMCALL
+ * and VMMCALL. VMMCALL is never executed except under virt, so
+ * we can set it unconditionally.
+ */
+ set_cpu_cap(c, X86_FEATURE_VMMCALL);
+}
+
+static void init_hygon(struct cpuinfo_x86 *c)
+{
+ u64 vm_cr;
+
+ early_init_hygon(c);
+
+ /*
+ * Bit 31 in normal CPUID used for nonstandard 3DNow ID;
+ * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
+ */
+ clear_cpu_cap(c, 0*32+31);
+
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+
+ /*
+ * XXX someone from Hygon needs to confirm this DTRT
+ *
+ init_spectral_chicken(c);
+ */
+
+ set_cpu_cap(c, X86_FEATURE_ZEN);
+ set_cpu_cap(c, X86_FEATURE_CPB);
+
+ cpu_detect_cache_sizes(c);
+
+ srat_detect_node(c);
+
+ init_hygon_cacheinfo(c);
+
+ if (cpu_has(c, X86_FEATURE_SVM)) {
+ rdmsrq(MSR_VM_CR, vm_cr);
+ if (vm_cr & SVM_VM_CR_SVM_DIS_MASK) {
+ pr_notice_once("SVM disabled (by BIOS) in MSR_VM_CR\n");
+ clear_cpu_cap(c, X86_FEATURE_SVM);
+ }
+ }
+
+ if (cpu_has(c, X86_FEATURE_XMM2)) {
+ /*
+ * Use LFENCE for execution serialization. On families which
+ * don't have that MSR, LFENCE is already serializing.
+ * msr_set_bit() uses the safe accessors, too, even if the MSR
+ * is not present.
+ */
+ msr_set_bit(MSR_AMD64_DE_CFG,
+ MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT);
+
+ /* A serializing LFENCE stops RDTSC speculation */
+ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
+ }
+
+ /*
+ * Hygon processors have APIC timer running in deep C states.
+ */
+ set_cpu_cap(c, X86_FEATURE_ARAT);
+
+ /* Hygon CPUs don't reset SS attributes on SYSRET, Xen does. */
+ if (!cpu_feature_enabled(X86_FEATURE_XENPV))
+ set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
+
+ check_null_seg_clears_base(c);
+
+ /* Hygon CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */
+ clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
+}
+
+static void cpu_detect_tlb_hygon(struct cpuinfo_x86 *c)
+{
+ u32 ebx, eax, ecx, edx;
+ u16 mask = 0xfff;
+
+ if (c->extended_cpuid_level < 0x80000006)
+ return;
+
+ cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
+
+ tlb_lld_4k = (ebx >> 16) & mask;
+ tlb_lli_4k = ebx & mask;
+
+ /* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
+ if (!((eax >> 16) & mask))
+ tlb_lld_2m = (cpuid_eax(0x80000005) >> 16) & 0xff;
+ else
+ tlb_lld_2m = (eax >> 16) & mask;
+
+ /* a 4M entry uses two 2M entries */
+ tlb_lld_4m = tlb_lld_2m >> 1;
+
+ /* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
+ if (!(eax & mask)) {
+ cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
+ tlb_lli_2m = eax & 0xff;
+ } else
+ tlb_lli_2m = eax & mask;
+
+ tlb_lli_4m = tlb_lli_2m >> 1;
+}
+
+static const struct cpu_dev hygon_cpu_dev = {
+ .c_vendor = "Hygon",
+ .c_ident = { "HygonGenuine" },
+ .c_early_init = early_init_hygon,
+ .c_detect_tlb = cpu_detect_tlb_hygon,
+ .c_bsp_init = bsp_init_hygon,
+ .c_init = init_hygon,
+ .c_x86_vendor = X86_VENDOR_HYGON,
+};
+
+cpu_dev_register(hygon_cpu_dev);
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 1e7e84a02eba..f3e9219845e8 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -21,17 +21,16 @@
*
*/
-#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/export.h>
#include <asm/processor.h>
#include <asm/hypervisor.h>
-/*
- * Hypervisor detect order. This is specified explicitly here because
- * some hypervisors might implement compatibility modes for other
- * hypervisors and therefore need to be detected in specific sequence.
- */
static const __initconst struct hypervisor_x86 * const hypervisors[] =
{
+#ifdef CONFIG_XEN_PV
+ &x86_hyper_xen_pv,
+#endif
#ifdef CONFIG_XEN_PVHVM
&x86_hyper_xen_hvm,
#endif
@@ -40,49 +39,74 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] =
#ifdef CONFIG_KVM_GUEST
&x86_hyper_kvm,
#endif
+#ifdef CONFIG_JAILHOUSE_GUEST
+ &x86_hyper_jailhouse,
+#endif
+#ifdef CONFIG_ACRN_GUEST
+ &x86_hyper_acrn,
+#endif
+#ifdef CONFIG_BHYVE_GUEST
+ &x86_hyper_bhyve,
+#endif
};
-const struct hypervisor_x86 *x86_hyper;
-EXPORT_SYMBOL(x86_hyper);
+enum x86_hypervisor_type x86_hyper_type;
+EXPORT_SYMBOL(x86_hyper_type);
+
+bool __initdata nopv;
+static __init int parse_nopv(char *arg)
+{
+ nopv = true;
+ return 0;
+}
+early_param("nopv", parse_nopv);
-static inline void __init
+static inline const struct hypervisor_x86 * __init
detect_hypervisor_vendor(void)
{
- const struct hypervisor_x86 *h, * const *p;
+ const struct hypervisor_x86 *h = NULL, * const *p;
+ uint32_t pri, max_pri = 0;
for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) {
- h = *p;
- if (h->detect()) {
- x86_hyper = h;
- printk(KERN_INFO "Hypervisor detected: %s\n", h->name);
- break;
+ if (unlikely(nopv) && !(*p)->ignore_nopv)
+ continue;
+
+ pri = (*p)->detect();
+ if (pri > max_pri) {
+ max_pri = pri;
+ h = *p;
}
}
+
+ if (h)
+ pr_info("Hypervisor detected: %s\n", h->name);
+
+ return h;
}
-void __cpuinit init_hypervisor(struct cpuinfo_x86 *c)
+static void __init copy_array(const void *src, void *target, unsigned int size)
{
- if (x86_hyper && x86_hyper->set_cpu_features)
- x86_hyper->set_cpu_features(c);
+ unsigned int i, n = size / sizeof(void *);
+ const void * const *from = (const void * const *)src;
+ const void **to = (const void **)target;
+
+ for (i = 0; i < n; i++)
+ if (from[i])
+ to[i] = from[i];
}
void __init init_hypervisor_platform(void)
{
+ const struct hypervisor_x86 *h;
- detect_hypervisor_vendor();
+ h = detect_hypervisor_vendor();
- if (!x86_hyper)
+ if (!h)
return;
- init_hypervisor(&boot_cpu_data);
-
- if (x86_hyper->init_platform)
- x86_hyper->init_platform();
-}
+ copy_array(&h->init, &x86_init.hyper, sizeof(h->init));
+ copy_array(&h->runtime, &x86_platform.hyper, sizeof(h->runtime));
-bool __init hypervisor_x2apic_available(void)
-{
- return x86_hyper &&
- x86_hyper->x2apic_available &&
- x86_hyper->x2apic_available();
+ x86_hyper_type = h->type;
+ x86_init.hyper.init_platform();
}
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 9b0c441c03f5..98ae4c37c93e 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -1,58 +1,225 @@
-#include <linux/init.h>
-#include <linux/kernel.h>
+// SPDX-License-Identifier: GPL-2.0
-#include <linux/string.h>
#include <linux/bitops.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/minmax.h>
#include <linux/smp.h>
-#include <linux/sched.h>
-#include <linux/thread_info.h>
-#include <linux/module.h>
-#include <linux/uaccess.h>
-
-#include <asm/processor.h>
-#include <asm/pgtable.h>
-#include <asm/msr.h>
-#include <asm/bugs.h>
-#include <asm/cpu.h>
+#include <linux/string.h>
+#include <linux/types.h>
#ifdef CONFIG_X86_64
#include <linux/topology.h>
#endif
+#include <asm/bugs.h>
+#include <asm/cpu_device_id.h>
+#include <asm/cpufeature.h>
+#include <asm/cpu.h>
+#include <asm/cpuid/api.h>
+#include <asm/hwcap2.h>
+#include <asm/intel-family.h>
+#include <asm/microcode.h>
+#include <asm/msr.h>
+#include <asm/numa.h>
+#include <asm/resctrl.h>
+#include <asm/thermal.h>
+#include <asm/uaccess.h>
+
#include "cpu.h"
-#ifdef CONFIG_X86_LOCAL_APIC
-#include <asm/mpspec.h>
-#include <asm/apic.h>
-#endif
+/*
+ * Processors which have self-snooping capability can handle conflicting
+ * memory type across CPUs by snooping its own cache. However, there exists
+ * CPU models in which having conflicting memory types still leads to
+ * unpredictable behavior, machine check errors, or hangs. Clear this
+ * feature to prevent its use on machines with known erratas.
+ */
+static void check_memory_type_self_snoop_errata(struct cpuinfo_x86 *c)
+{
+ switch (c->x86_vfm) {
+ case INTEL_CORE_YONAH:
+ case INTEL_CORE2_MEROM:
+ case INTEL_CORE2_MEROM_L:
+ case INTEL_CORE2_PENRYN:
+ case INTEL_CORE2_DUNNINGTON:
+ case INTEL_NEHALEM:
+ case INTEL_NEHALEM_G:
+ case INTEL_NEHALEM_EP:
+ case INTEL_NEHALEM_EX:
+ case INTEL_WESTMERE:
+ case INTEL_WESTMERE_EP:
+ case INTEL_SANDYBRIDGE:
+ setup_clear_cpu_cap(X86_FEATURE_SELFSNOOP);
+ }
+}
-static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
+static bool ring3mwait_disabled __read_mostly;
+
+static int __init ring3mwait_disable(char *__unused)
{
- u64 misc_enable;
+ ring3mwait_disabled = true;
+ return 1;
+}
+__setup("ring3mwait=disable", ring3mwait_disable);
- /* Unmask CPUID levels if masked: */
- if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
- rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
+static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c)
+{
+ /*
+ * Ring 3 MONITOR/MWAIT feature cannot be detected without
+ * cpu model and family comparison.
+ */
+ if (c->x86 != 6)
+ return;
+ switch (c->x86_vfm) {
+ case INTEL_XEON_PHI_KNL:
+ case INTEL_XEON_PHI_KNM:
+ break;
+ default:
+ return;
+ }
- if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) {
- misc_enable &= ~MSR_IA32_MISC_ENABLE_LIMIT_CPUID;
- wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
- c->cpuid_level = cpuid_eax(0);
- get_cpu_cap(c);
- }
+ if (ring3mwait_disabled)
+ return;
+
+ set_cpu_cap(c, X86_FEATURE_RING3MWAIT);
+ this_cpu_or(msr_misc_features_shadow,
+ 1UL << MSR_MISC_FEATURES_ENABLES_RING3MWAIT_BIT);
+
+ if (c == &boot_cpu_data)
+ ELF_HWCAP2 |= HWCAP2_RING3MWAIT;
+}
+
+/*
+ * Early microcode releases for the Spectre v2 mitigation were broken.
+ * Information taken from;
+ * - https://newsroom.intel.com/wp-content/uploads/sites/11/2018/03/microcode-update-guidance.pdf
+ * - https://kb.vmware.com/s/article/52345
+ * - Microcode revisions observed in the wild
+ * - Release note from 20180108 microcode release
+ */
+struct sku_microcode {
+ u32 vfm;
+ u8 stepping;
+ u32 microcode;
+};
+static const struct sku_microcode spectre_bad_microcodes[] = {
+ { INTEL_KABYLAKE, 0x0B, 0x80 },
+ { INTEL_KABYLAKE, 0x0A, 0x80 },
+ { INTEL_KABYLAKE, 0x09, 0x80 },
+ { INTEL_KABYLAKE_L, 0x0A, 0x80 },
+ { INTEL_KABYLAKE_L, 0x09, 0x80 },
+ { INTEL_SKYLAKE_X, 0x03, 0x0100013e },
+ { INTEL_SKYLAKE_X, 0x04, 0x0200003c },
+ { INTEL_BROADWELL, 0x04, 0x28 },
+ { INTEL_BROADWELL_G, 0x01, 0x1b },
+ { INTEL_BROADWELL_D, 0x02, 0x14 },
+ { INTEL_BROADWELL_D, 0x03, 0x07000011 },
+ { INTEL_BROADWELL_X, 0x01, 0x0b000025 },
+ { INTEL_HASWELL_L, 0x01, 0x21 },
+ { INTEL_HASWELL_G, 0x01, 0x18 },
+ { INTEL_HASWELL, 0x03, 0x23 },
+ { INTEL_HASWELL_X, 0x02, 0x3b },
+ { INTEL_HASWELL_X, 0x04, 0x10 },
+ { INTEL_IVYBRIDGE_X, 0x04, 0x42a },
+ /* Observed in the wild */
+ { INTEL_SANDYBRIDGE_X, 0x06, 0x61b },
+ { INTEL_SANDYBRIDGE_X, 0x07, 0x712 },
+};
+
+static bool bad_spectre_microcode(struct cpuinfo_x86 *c)
+{
+ int i;
+
+ /*
+ * We know that the hypervisor lie to us on the microcode version so
+ * we may as well hope that it is running the correct version.
+ */
+ if (cpu_has(c, X86_FEATURE_HYPERVISOR))
+ return false;
+
+ for (i = 0; i < ARRAY_SIZE(spectre_bad_microcodes); i++) {
+ if (c->x86_vfm == spectre_bad_microcodes[i].vfm &&
+ c->x86_stepping == spectre_bad_microcodes[i].stepping)
+ return (c->microcode <= spectre_bad_microcodes[i].microcode);
}
+ return false;
+}
- if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
- (c->x86 == 0x6 && c->x86_model >= 0x0e))
- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+#define MSR_IA32_TME_ACTIVATE 0x982
+
+/* Helpers to access TME_ACTIVATE MSR */
+#define TME_ACTIVATE_LOCKED(x) (x & 0x1)
+#define TME_ACTIVATE_ENABLED(x) (x & 0x2)
- if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64)) {
- unsigned lower_word;
+#define TME_ACTIVATE_KEYID_BITS(x) ((x >> 32) & 0xf) /* Bits 35:32 */
- wrmsr(MSR_IA32_UCODE_REV, 0, 0);
- /* Required by the SDM */
- sync_core();
- rdmsr(MSR_IA32_UCODE_REV, lower_word, c->microcode);
+static void detect_tme_early(struct cpuinfo_x86 *c)
+{
+ u64 tme_activate;
+ int keyid_bits;
+
+ rdmsrq(MSR_IA32_TME_ACTIVATE, tme_activate);
+
+ if (!TME_ACTIVATE_LOCKED(tme_activate) || !TME_ACTIVATE_ENABLED(tme_activate)) {
+ pr_info_once("x86/tme: not enabled by BIOS\n");
+ clear_cpu_cap(c, X86_FEATURE_TME);
+ return;
+ }
+ pr_info_once("x86/tme: enabled by BIOS\n");
+ keyid_bits = TME_ACTIVATE_KEYID_BITS(tme_activate);
+ if (!keyid_bits)
+ return;
+
+ /*
+ * KeyID bits are set by BIOS and can be present regardless
+ * of whether the kernel is using them. They effectively lower
+ * the number of physical address bits.
+ *
+ * Update cpuinfo_x86::x86_phys_bits accordingly.
+ */
+ c->x86_phys_bits -= keyid_bits;
+ pr_info_once("x86/mktme: BIOS enabled: x86_phys_bits reduced by %d\n",
+ keyid_bits);
+}
+
+void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return;
+
+ if (c->x86_vfm < INTEL_PENTIUM_M_DOTHAN)
+ return;
+
+ /*
+ * The BIOS can have limited CPUID to leaf 2, which breaks feature
+ * enumeration. Unlock it and update the maximum leaf info.
+ */
+ if (msr_clear_bit(MSR_IA32_MISC_ENABLE, MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) > 0)
+ c->cpuid_level = cpuid_eax(0);
+}
+
+static void early_init_intel(struct cpuinfo_x86 *c)
+{
+ u64 misc_enable;
+
+ if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64))
+ c->microcode = intel_get_microcode_revision();
+
+ /* Now if any of them are set, check the blacklist and clear the lot */
+ if ((cpu_has(c, X86_FEATURE_SPEC_CTRL) ||
+ cpu_has(c, X86_FEATURE_INTEL_STIBP) ||
+ cpu_has(c, X86_FEATURE_IBRS) || cpu_has(c, X86_FEATURE_IBPB) ||
+ cpu_has(c, X86_FEATURE_STIBP)) && bad_spectre_microcode(c)) {
+ pr_warn("Intel Spectre v2 broken microcode detected; disabling Speculation Control\n");
+ setup_clear_cpu_cap(X86_FEATURE_IBRS);
+ setup_clear_cpu_cap(X86_FEATURE_IBPB);
+ setup_clear_cpu_cap(X86_FEATURE_STIBP);
+ setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL);
+ setup_clear_cpu_cap(X86_FEATURE_MSR_SPEC_CTRL);
+ setup_clear_cpu_cap(X86_FEATURE_INTEL_STIBP);
+ setup_clear_cpu_cap(X86_FEATURE_SSBD);
+ setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL_SSBD);
}
/*
@@ -63,9 +230,9 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
* need the microcode to have already been loaded... so if it is
* not, recommend a BIOS update and disable large pages.
*/
- if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_mask <= 2 &&
+ if (c->x86_vfm == INTEL_ATOM_BONNELL && c->x86_stepping <= 2 &&
c->microcode < 0x20e) {
- printk(KERN_WARNING "Atom PSE erratum detected, BIOS microcode update recommended\n");
+ pr_warn("Atom PSE erratum detected, BIOS microcode update recommended\n");
clear_cpu_cap(c, X86_FEATURE_PSE);
}
@@ -78,8 +245,8 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
#endif
/* CPUID workaround for 0F33/0F34 CPU */
- if (c->x86 == 0xF && c->x86_model == 0x3
- && (c->x86_mask == 0x3 || c->x86_mask == 0x4))
+ if (c->x86_vfm == INTEL_P4_PRESCOTT &&
+ (c->x86_stepping == 0x3 || c->x86_stepping == 0x4))
c->x86_phys_bits = 36;
/*
@@ -88,72 +255,91 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
*
* It is also reliable across cores and sockets. (but not across
* cabinets - we turn it off in that case explicitly.)
+ *
+ * Use a model-specific check for some older CPUs that have invariant
+ * TSC but may not report it architecturally via 8000_0007.
*/
if (c->x86_power & (1 << 8)) {
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
- if (!check_tsc_unstable())
- sched_clock_stable = 1;
+ } else if ((c->x86_vfm >= INTEL_P4_PRESCOTT && c->x86_vfm <= INTEL_P4_CEDARMILL) ||
+ (c->x86_vfm >= INTEL_CORE_YONAH && c->x86_vfm <= INTEL_IVYBRIDGE)) {
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
}
/* Penwell and Cloverview have the TSC which doesn't sleep on S3 */
- if (c->x86 == 6) {
- switch (c->x86_model) {
- case 0x27: /* Penwell */
- case 0x35: /* Cloverview */
- set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3);
- break;
- default:
- break;
- }
+ switch (c->x86_vfm) {
+ case INTEL_ATOM_SALTWELL_MID:
+ case INTEL_ATOM_SALTWELL_TABLET:
+ case INTEL_ATOM_SILVERMONT_MID:
+ case INTEL_ATOM_AIRMONT_NP:
+ set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3);
+ break;
}
/*
- * There is a known erratum on Pentium III and Core Solo
- * and Core Duo CPUs.
- * " Page with PAT set to WC while associated MTRR is UC
- * may consolidate to UC "
- * Because of this erratum, it is better to stick with
- * setting WC in MTRR rather than using PAT on these CPUs.
+ * PAT is broken on early family 6 CPUs, the last of which
+ * is "Yonah" where the erratum is named "AN7":
*
- * Enable PAT WC only on P4, Core 2 or later CPUs.
+ * Page with PAT (Page Attribute Table) Set to USWC
+ * (Uncacheable Speculative Write Combine) While
+ * Associated MTRR (Memory Type Range Register) Is UC
+ * (Uncacheable) May Consolidate to UC
+ *
+ * Disable PAT and fall back to MTRR on these CPUs.
*/
- if (c->x86 == 6 && c->x86_model < 15)
+ if (c->x86_vfm >= INTEL_PENTIUM_PRO &&
+ c->x86_vfm <= INTEL_CORE_YONAH)
clear_cpu_cap(c, X86_FEATURE_PAT);
-#ifdef CONFIG_KMEMCHECK
/*
- * P4s have a "fast strings" feature which causes single-
- * stepping REP instructions to only generate a #DB on
- * cache-line boundaries.
+ * Modern CPUs are generally expected to have a sane fast string
+ * implementation. However, BIOSes typically have a knob to tweak
+ * the architectural MISC_ENABLE.FAST_STRING enable bit.
*
- * Ingo Molnar reported a Pentium D (model 6) and a Xeon
- * (model 2) with the same problem.
+ * Adhere to the preference and program the Linux-defined fast
+ * string flag and enhanced fast string capabilities accordingly.
*/
- if (c->x86 == 15) {
- rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
-
+ if (c->x86_vfm >= INTEL_PENTIUM_M_DOTHAN) {
+ rdmsrq(MSR_IA32_MISC_ENABLE, misc_enable);
if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) {
- printk(KERN_INFO "kmemcheck: Disabling fast string operations\n");
-
- misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING;
- wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
+ /* X86_FEATURE_ERMS is set based on CPUID */
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+ } else {
+ pr_info("Disabled fast string operations\n");
+ setup_clear_cpu_cap(X86_FEATURE_REP_GOOD);
+ setup_clear_cpu_cap(X86_FEATURE_ERMS);
}
}
-#endif
/*
- * If fast string is not enabled in IA32_MISC_ENABLE for any reason,
- * clear the fast string and enhanced fast string CPU capabilities.
+ * Intel Quark Core DevMan_001.pdf section 6.4.11
+ * "The operating system also is required to invalidate (i.e., flush)
+ * the TLB when any changes are made to any of the page table entries.
+ * The operating system must reload CR3 to cause the TLB to be flushed"
+ *
+ * As a result, boot_cpu_has(X86_FEATURE_PGE) in arch/x86/include/asm/tlbflush.h
+ * should be false so that __flush_tlb_all() causes CR3 instead of CR4.PGE
+ * to be modified.
*/
- if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
- rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
- if (!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) {
- printk(KERN_INFO "Disabled fast string operations\n");
- setup_clear_cpu_cap(X86_FEATURE_REP_GOOD);
- setup_clear_cpu_cap(X86_FEATURE_ERMS);
- }
+ if (c->x86_vfm == INTEL_QUARK_X1000) {
+ pr_info("Disabling PGE capability bit\n");
+ setup_clear_cpu_cap(X86_FEATURE_PGE);
}
+
+ check_memory_type_self_snoop_errata(c);
+
+ /*
+ * Adjust the number of physical bits early because it affects the
+ * valid bits of the MTRR mask registers.
+ */
+ if (cpu_has(c, X86_FEATURE_TME))
+ detect_tme_early(c);
+}
+
+static void bsp_init_intel(struct cpuinfo_x86 *c)
+{
+ resctrl_cpu_detect(c);
}
#ifdef CONFIG_X86_32
@@ -163,20 +349,18 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
* This is called before we do cpu ident work
*/
-int __cpuinit ppro_with_ram_bug(void)
+int ppro_with_ram_bug(void)
{
/* Uses data from early_cpu_detect now */
- if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
- boot_cpu_data.x86 == 6 &&
- boot_cpu_data.x86_model == 1 &&
- boot_cpu_data.x86_mask < 8) {
- printk(KERN_INFO "Pentium Pro with Errata#50 detected. Taking evasive action.\n");
+ if (boot_cpu_data.x86_vfm == INTEL_PENTIUM_PRO &&
+ boot_cpu_data.x86_stepping < 8) {
+ pr_info("Pentium Pro with Errata#50 detected. Taking evasive action.\n");
return 1;
}
return 0;
}
-static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
+static void intel_smp_check(struct cpuinfo_x86 *c)
{
/* calling is from identify_secondary_cpu() ? */
if (!c->cpu_index)
@@ -185,9 +369,8 @@ static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
/*
* Mask B, Pentium, but not Pentium MMX
*/
- if (c->x86 == 5 &&
- c->x86_mask >= 1 && c->x86_mask <= 4 &&
- c->x86_model <= 3) {
+ if (c->x86_vfm >= INTEL_FAM5_START && c->x86_vfm < INTEL_PENTIUM_MMX &&
+ c->x86_stepping >= 1 && c->x86_stepping <= 4) {
/*
* Remember we have B step Pentia with bugs
*/
@@ -196,23 +379,30 @@ static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
}
}
-static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
+static int forcepae;
+static int __init forcepae_setup(char *__unused)
{
- unsigned long lo, hi;
+ forcepae = 1;
+ return 1;
+}
+__setup("forcepae", forcepae_setup);
+static void intel_workarounds(struct cpuinfo_x86 *c)
+{
#ifdef CONFIG_X86_F00F_BUG
/*
- * All current models of Pentium and Pentium with MMX technology CPUs
+ * All models of Pentium and Pentium with MMX technology CPUs
* have the F0 0F bug, which lets nonprivileged users lock up the
* system. Announce that the fault handler will be checking for it.
+ * The Quark is also family 5, but does not have the same bug.
*/
clear_cpu_bug(c, X86_BUG_F00F);
- if (!paravirt_enabled() && c->x86 == 5) {
+ if (c->x86_vfm >= INTEL_FAM5_START && c->x86_vfm < INTEL_QUARK_X1000) {
static int f00f_workaround_enabled;
set_cpu_bug(c, X86_BUG_F00F);
if (!f00f_workaround_enabled) {
- printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n");
+ pr_notice("Intel Pentium with F0 0F bug - workaround enabled.\n");
f00f_workaround_enabled = 1;
}
}
@@ -222,20 +412,30 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until
* model 3 mask 3
*/
- if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633)
+ if ((c->x86_vfm == INTEL_PENTIUM_II_KLAMATH && c->x86_stepping < 3) ||
+ c->x86_vfm < INTEL_PENTIUM_II_KLAMATH)
clear_cpu_cap(c, X86_FEATURE_SEP);
/*
- * P4 Xeon errata 037 workaround.
+ * PAE CPUID issue: many Pentium M report no PAE but may have a
+ * functionally usable PAE implementation.
+ * Forcefully enable PAE if kernel parameter "forcepae" is present.
+ */
+ if (forcepae) {
+ pr_warn("PAE forced!\n");
+ set_cpu_cap(c, X86_FEATURE_PAE);
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE);
+ }
+
+ /*
+ * P4 Xeon erratum 037 workaround.
* Hardware prefetcher may cause stale data to be loaded into the cache.
*/
- if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) {
- rdmsr(MSR_IA32_MISC_ENABLE, lo, hi);
- if ((lo & MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE) == 0) {
- printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n");
- printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n");
- lo |= MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE;
- wrmsr(MSR_IA32_MISC_ENABLE, lo, hi);
+ if (c->x86_vfm == INTEL_P4_WILLAMETTE && c->x86_stepping == 1) {
+ if (msr_set_bit(MSR_IA32_MISC_ENABLE,
+ MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT) > 0) {
+ pr_info("CPU: C0 stepping P4 Xeon detected.\n");
+ pr_info("CPU: Disabling hardware prefetching (Erratum 037)\n");
}
}
@@ -245,42 +445,31 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
* integrated APIC (see 11AP erratum in "Pentium Processor
* Specification Update").
*/
- if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
- (c->x86_mask < 0x6 || c->x86_mask == 0xb))
- set_cpu_cap(c, X86_FEATURE_11AP);
-
+ if (boot_cpu_has(X86_FEATURE_APIC) && c->x86_vfm == INTEL_PENTIUM_75 &&
+ (c->x86_stepping < 0x6 || c->x86_stepping == 0xb))
+ set_cpu_bug(c, X86_BUG_11AP);
#ifdef CONFIG_X86_INTEL_USERCOPY
/*
- * Set up the preferred alignment for movsl bulk memory moves
+ * MOVSL bulk memory moves can be slow when source and dest are not
+ * both 8-byte aligned. PII/PIII only like MOVSL with 8-byte alignment.
+ *
+ * Set the preferred alignment for Pentium Pro and newer processors, as
+ * it has only been tested on these.
*/
- switch (c->x86) {
- case 4: /* 486: untested */
- break;
- case 5: /* Old Pentia: untested */
- break;
- case 6: /* PII/PIII only like movsl with 8-byte alignment */
+ if (c->x86_vfm >= INTEL_PENTIUM_PRO)
movsl_mask.mask = 7;
- break;
- case 15: /* P4 is OK down to 8-byte alignment */
- movsl_mask.mask = 7;
- break;
- }
-#endif
-
-#ifdef CONFIG_X86_NUMAQ
- numaq_tsc_disable();
#endif
intel_smp_check(c);
}
#else
-static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
+static void intel_workarounds(struct cpuinfo_x86 *c)
{
}
#endif
-static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
+static void srat_detect_node(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_NUMA
unsigned node;
@@ -297,78 +486,61 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
#endif
}
-/*
- * find out the number of processor cores on the die
- */
-static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
+static void init_cpuid_fault(struct cpuinfo_x86 *c)
{
- unsigned int eax, ebx, ecx, edx;
+ u64 msr;
- if (c->cpuid_level < 4)
- return 1;
-
- /* Intel has a non-standard dependency on %ecx for this CPUID level. */
- cpuid_count(4, 0, &eax, &ebx, &ecx, &edx);
- if (eax & 0x1f)
- return (eax >> 26) + 1;
- else
- return 1;
+ if (!rdmsrq_safe(MSR_PLATFORM_INFO, &msr)) {
+ if (msr & MSR_PLATFORM_INFO_CPUID_FAULT)
+ set_cpu_cap(c, X86_FEATURE_CPUID_FAULT);
+ }
}
-static void __cpuinit detect_vmx_virtcap(struct cpuinfo_x86 *c)
+static void init_intel_misc_features(struct cpuinfo_x86 *c)
{
- /* Intel VMX MSR indicated features */
-#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000
-#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000
-#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000
-#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001
-#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002
-#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020
-
- u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2;
-
- clear_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
- clear_cpu_cap(c, X86_FEATURE_VNMI);
- clear_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
- clear_cpu_cap(c, X86_FEATURE_EPT);
- clear_cpu_cap(c, X86_FEATURE_VPID);
-
- rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
- msr_ctl = vmx_msr_high | vmx_msr_low;
- if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)
- set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
- if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI)
- set_cpu_cap(c, X86_FEATURE_VNMI);
- if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) {
- rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
- vmx_msr_low, vmx_msr_high);
- msr_ctl2 = vmx_msr_high | vmx_msr_low;
- if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) &&
- (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW))
- set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
- if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT)
- set_cpu_cap(c, X86_FEATURE_EPT);
- if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID)
- set_cpu_cap(c, X86_FEATURE_VPID);
- }
+ u64 msr;
+
+ if (rdmsrq_safe(MSR_MISC_FEATURES_ENABLES, &msr))
+ return;
+
+ /* Clear all MISC features */
+ this_cpu_write(msr_misc_features_shadow, 0);
+
+ /* Check features and update capabilities and shadow control bits */
+ init_cpuid_fault(c);
+ probe_xeon_phi_r3mwait(c);
+
+ msr = this_cpu_read(msr_misc_features_shadow);
+ wrmsrq(MSR_MISC_FEATURES_ENABLES, msr);
}
-static void __cpuinit init_intel(struct cpuinfo_x86 *c)
-{
- unsigned int l2 = 0;
+/*
+ * This is a list of Intel CPUs that are known to suffer from downclocking when
+ * ZMM registers (512-bit vectors) are used. On these CPUs, when the kernel
+ * executes SIMD-optimized code such as cryptography functions or CRCs, it
+ * should prefer 256-bit (YMM) code to 512-bit (ZMM) code.
+ */
+static const struct x86_cpu_id zmm_exclusion_list[] = {
+ X86_MATCH_VFM(INTEL_SKYLAKE_X, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_X, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_D, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_L, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_NNPI, 0),
+ X86_MATCH_VFM(INTEL_TIGERLAKE_L, 0),
+ X86_MATCH_VFM(INTEL_TIGERLAKE, 0),
+ /* Allow Rocket Lake and later, and Sapphire Rapids and later. */
+ {},
+};
+static void init_intel(struct cpuinfo_x86 *c)
+{
early_init_intel(c);
intel_workarounds(c);
- /*
- * Detect the extended topology information if available. This
- * will reinitialise the initial_apicid which will be used
- * in init_intel_cacheinfo()
- */
- detect_extended_topology(c);
+ init_intel_cacheinfo(c);
- l2 = init_intel_cacheinfo(c);
if (c->cpuid_level > 9) {
unsigned eax = cpuid_eax(10);
/* Check for version and the number of counters */
@@ -376,25 +548,33 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
}
- if (cpu_has_xmm2)
+ if (cpu_has(c, X86_FEATURE_XMM2))
set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
- if (cpu_has_ds) {
- unsigned int l1;
+
+ if (boot_cpu_has(X86_FEATURE_DS)) {
+ unsigned int l1, l2;
+
rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
- if (!(l1 & (1<<11)))
+ if (!(l1 & MSR_IA32_MISC_ENABLE_BTS_UNAVAIL))
set_cpu_cap(c, X86_FEATURE_BTS);
- if (!(l1 & (1<<12)))
+ if (!(l1 & MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL))
set_cpu_cap(c, X86_FEATURE_PEBS);
}
- if (c->x86 == 6 && c->x86_model == 29 && cpu_has_clflush)
- set_cpu_cap(c, X86_FEATURE_CLFLUSH_MONITOR);
+ if (boot_cpu_has(X86_FEATURE_CLFLUSH) &&
+ (c->x86_vfm == INTEL_CORE2_DUNNINGTON ||
+ c->x86_vfm == INTEL_NEHALEM_EX ||
+ c->x86_vfm == INTEL_WESTMERE_EX))
+ set_cpu_bug(c, X86_BUG_CLFLUSH_MONITOR);
+
+ if (boot_cpu_has(X86_FEATURE_MWAIT) &&
+ (c->x86_vfm == INTEL_ATOM_GOLDMONT ||
+ c->x86_vfm == INTEL_LUNARLAKE_M))
+ set_cpu_bug(c, X86_BUG_MONITOR);
#ifdef CONFIG_X86_64
if (c->x86 == 15)
c->x86_cache_alignment = c->x86_clflush_size * 2;
- if (c->x86 == 6)
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
#else
/*
* Names for the Pentium II/Celeron processors
@@ -402,6 +582,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
* Dixon is NOT a Celeron.
*/
if (c->x86 == 6) {
+ unsigned int l2 = c->x86_cache_size;
char *p = NULL;
switch (c->x86_model) {
@@ -415,7 +596,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
case 6:
if (l2 == 128)
p = "Celeron (Mendocino)";
- else if (c->x86_mask == 0 || c->x86_mask == 5)
+ else if (c->x86_stepping == 0 || c->x86_stepping == 5)
p = "Celeron-A";
break;
@@ -428,51 +609,25 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
if (p)
strcpy(c->x86_model_id, p);
}
-
- if (c->x86 == 15)
- set_cpu_cap(c, X86_FEATURE_P4);
- if (c->x86 == 6)
- set_cpu_cap(c, X86_FEATURE_P3);
#endif
- if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
- /*
- * let's use the legacy cpuid vector 0x1 and 0x4 for topology
- * detection.
- */
- c->x86_max_cores = intel_num_cpu_cores(c);
-#ifdef CONFIG_X86_32
- detect_ht(c);
-#endif
- }
+ if (x86_match_cpu(zmm_exclusion_list))
+ set_cpu_cap(c, X86_FEATURE_PREFER_YMM);
/* Work around errata */
srat_detect_node(c);
- if (cpu_has(c, X86_FEATURE_VMX))
- detect_vmx_virtcap(c);
+ init_ia32_feat_ctl(c);
- /*
- * Initialize MSR_IA32_ENERGY_PERF_BIAS if BIOS did not.
- * x86_energy_perf_policy(8) is available to change it at run-time
- */
- if (cpu_has(c, X86_FEATURE_EPB)) {
- u64 epb;
-
- rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
- if ((epb & 0xF) == ENERGY_PERF_BIAS_PERFORMANCE) {
- printk_once(KERN_WARNING "ENERGY_PERF_BIAS:"
- " Set to 'normal', was 'performance'\n"
- "ENERGY_PERF_BIAS: View and update with"
- " x86_energy_perf_policy(8)\n");
- epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL;
- wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
- }
- }
+ init_intel_misc_features(c);
+
+ split_lock_init();
+
+ intel_init_thermal(c);
}
#ifdef CONFIG_X86_32
-static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
+static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
{
/*
* Intel PIII Tualatin. This comes in two flavours.
@@ -480,193 +635,98 @@ static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned i
* to determine which, so we use a boottime override
* for the 512kb model, and assume 256 otherwise.
*/
- if ((c->x86 == 6) && (c->x86_model == 11) && (size == 0))
+ if (c->x86_vfm == INTEL_PENTIUM_III_TUALATIN && size == 0)
size = 256;
+
+ /*
+ * Intel Quark SoC X1000 contains a 4-way set associative
+ * 16K cache with a 16 byte cache line and 256 lines per tag
+ */
+ if (c->x86_vfm == INTEL_QUARK_X1000)
+ size = 16;
return size;
}
#endif
-#define TLB_INST_4K 0x01
-#define TLB_INST_4M 0x02
-#define TLB_INST_2M_4M 0x03
-
-#define TLB_INST_ALL 0x05
-#define TLB_INST_1G 0x06
-
-#define TLB_DATA_4K 0x11
-#define TLB_DATA_4M 0x12
-#define TLB_DATA_2M_4M 0x13
-#define TLB_DATA_4K_4M 0x14
-
-#define TLB_DATA_1G 0x16
-
-#define TLB_DATA0_4K 0x21
-#define TLB_DATA0_4M 0x22
-#define TLB_DATA0_2M_4M 0x23
-
-#define STLB_4K 0x41
-
-static const struct _tlb_table intel_tlb_table[] __cpuinitconst = {
- { 0x01, TLB_INST_4K, 32, " TLB_INST 4 KByte pages, 4-way set associative" },
- { 0x02, TLB_INST_4M, 2, " TLB_INST 4 MByte pages, full associative" },
- { 0x03, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way set associative" },
- { 0x04, TLB_DATA_4M, 8, " TLB_DATA 4 MByte pages, 4-way set associative" },
- { 0x05, TLB_DATA_4M, 32, " TLB_DATA 4 MByte pages, 4-way set associative" },
- { 0x0b, TLB_INST_4M, 4, " TLB_INST 4 MByte pages, 4-way set associative" },
- { 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages */" },
- { 0x50, TLB_INST_ALL, 64, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
- { 0x51, TLB_INST_ALL, 128, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
- { 0x52, TLB_INST_ALL, 256, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
- { 0x55, TLB_INST_2M_4M, 7, " TLB_INST 2-MByte or 4-MByte pages, fully associative" },
- { 0x56, TLB_DATA0_4M, 16, " TLB_DATA0 4 MByte pages, 4-way set associative" },
- { 0x57, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, 4-way associative" },
- { 0x59, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, fully associative" },
- { 0x5a, TLB_DATA0_2M_4M, 32, " TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative" },
- { 0x5b, TLB_DATA_4K_4M, 64, " TLB_DATA 4 KByte and 4 MByte pages" },
- { 0x5c, TLB_DATA_4K_4M, 128, " TLB_DATA 4 KByte and 4 MByte pages" },
- { 0x5d, TLB_DATA_4K_4M, 256, " TLB_DATA 4 KByte and 4 MByte pages" },
- { 0xb0, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 4-way set associative" },
- { 0xb1, TLB_INST_2M_4M, 4, " TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries" },
- { 0xb2, TLB_INST_4K, 64, " TLB_INST 4KByte pages, 4-way set associative" },
- { 0xb3, TLB_DATA_4K, 128, " TLB_DATA 4 KByte pages, 4-way set associative" },
- { 0xb4, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 4-way associative" },
- { 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" },
- { 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" },
- { 0xca, STLB_4K, 512, " STLB 4 KByte pages, 4-way associative" },
- { 0x00, 0, 0 }
-};
-
-static void __cpuinit intel_tlb_lookup(const unsigned char desc)
+static void intel_tlb_lookup(const struct leaf_0x2_table *desc)
{
- unsigned char k;
- if (desc == 0)
- return;
-
- /* look up this descriptor in the table */
- for (k = 0; intel_tlb_table[k].descriptor != desc && \
- intel_tlb_table[k].descriptor != 0; k++)
- ;
+ short entries = desc->entries;
- if (intel_tlb_table[k].tlb_type == 0)
- return;
-
- switch (intel_tlb_table[k].tlb_type) {
+ switch (desc->t_type) {
case STLB_4K:
- if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lli_4k = max(tlb_lli_4k, entries);
+ tlb_lld_4k = max(tlb_lld_4k, entries);
+ break;
+ case STLB_4K_2M:
+ tlb_lli_4k = max(tlb_lli_4k, entries);
+ tlb_lld_4k = max(tlb_lld_4k, entries);
+ tlb_lli_2m = max(tlb_lli_2m, entries);
+ tlb_lld_2m = max(tlb_lld_2m, entries);
+ tlb_lli_4m = max(tlb_lli_4m, entries);
+ tlb_lld_4m = max(tlb_lld_4m, entries);
break;
case TLB_INST_ALL:
- if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lli_4k = max(tlb_lli_4k, entries);
+ tlb_lli_2m = max(tlb_lli_2m, entries);
+ tlb_lli_4m = max(tlb_lli_4m, entries);
break;
case TLB_INST_4K:
- if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lli_4k = max(tlb_lli_4k, entries);
break;
case TLB_INST_4M:
- if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lli_4m = max(tlb_lli_4m, entries);
break;
case TLB_INST_2M_4M:
- if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lli_2m = max(tlb_lli_2m, entries);
+ tlb_lli_4m = max(tlb_lli_4m, entries);
break;
case TLB_DATA_4K:
case TLB_DATA0_4K:
- if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lld_4k = max(tlb_lld_4k, entries);
break;
case TLB_DATA_4M:
case TLB_DATA0_4M:
- if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lld_4m = max(tlb_lld_4m, entries);
break;
case TLB_DATA_2M_4M:
case TLB_DATA0_2M_4M:
- if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lld_2m = max(tlb_lld_2m, entries);
+ tlb_lld_4m = max(tlb_lld_4m, entries);
break;
case TLB_DATA_4K_4M:
- if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
- if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
- tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
+ tlb_lld_4k = max(tlb_lld_4k, entries);
+ tlb_lld_4m = max(tlb_lld_4m, entries);
break;
- }
-}
-
-static void __cpuinit intel_tlb_flushall_shift_set(struct cpuinfo_x86 *c)
-{
- switch ((c->x86 << 8) + c->x86_model) {
- case 0x60f: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
- case 0x616: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
- case 0x617: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
- case 0x61d: /* six-core 45 nm xeon "Dunnington" */
- tlb_flushall_shift = -1;
- break;
- case 0x61a: /* 45 nm nehalem, "Bloomfield" */
- case 0x61e: /* 45 nm nehalem, "Lynnfield" */
- case 0x625: /* 32 nm nehalem, "Clarkdale" */
- case 0x62c: /* 32 nm nehalem, "Gulftown" */
- case 0x62e: /* 45 nm nehalem-ex, "Beckton" */
- case 0x62f: /* 32 nm Xeon E7 */
- tlb_flushall_shift = 6;
- break;
- case 0x62a: /* SandyBridge */
- case 0x62d: /* SandyBridge, "Romely-EP" */
- tlb_flushall_shift = 5;
+ case TLB_DATA_1G_2M_4M:
+ tlb_lld_2m = max(tlb_lld_2m, TLB_0x63_2M_4M_ENTRIES);
+ tlb_lld_4m = max(tlb_lld_4m, TLB_0x63_2M_4M_ENTRIES);
+ fallthrough;
+ case TLB_DATA_1G:
+ tlb_lld_1g = max(tlb_lld_1g, entries);
break;
- case 0x63a: /* Ivybridge */
- tlb_flushall_shift = 1;
- break;
- default:
- tlb_flushall_shift = 6;
}
}
-static void __cpuinit intel_detect_tlb(struct cpuinfo_x86 *c)
+static void intel_detect_tlb(struct cpuinfo_x86 *c)
{
- int i, j, n;
- unsigned int regs[4];
- unsigned char *desc = (unsigned char *)regs;
+ const struct leaf_0x2_table *desc;
+ union leaf_0x2_regs regs;
+ u8 *ptr;
if (c->cpuid_level < 2)
return;
- /* Number of times to iterate */
- n = cpuid_eax(2) & 0xFF;
-
- for (i = 0 ; i < n ; i++) {
- cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]);
-
- /* If bit 31 is set, this is an unknown format */
- for (j = 0 ; j < 3 ; j++)
- if (regs[j] & (1 << 31))
- regs[j] = 0;
-
- /* Byte 0 is level count, not a descriptor */
- for (j = 1 ; j < 16 ; j++)
- intel_tlb_lookup(desc[j]);
- }
- intel_tlb_flushall_shift_set(c);
+ cpuid_leaf_0x2(&regs);
+ for_each_cpuid_0x2_desc(regs, ptr, desc)
+ intel_tlb_lookup(desc);
}
-static const struct cpu_dev __cpuinitconst intel_cpu_dev = {
+static const struct cpu_dev intel_cpu_dev = {
.c_vendor = "Intel",
.c_ident = { "GenuineIntel" },
#ifdef CONFIG_X86_32
- .c_models = {
- { .vendor = X86_VENDOR_INTEL, .family = 4, .model_names =
+ .legacy_models = {
+ { .family = 4, .model_names =
{
[0] = "486 DX-25/33",
[1] = "486 DX-50",
@@ -679,7 +739,7 @@ static const struct cpu_dev __cpuinitconst intel_cpu_dev = {
[9] = "486 DX/4-WB"
}
},
- { .vendor = X86_VENDOR_INTEL, .family = 5, .model_names =
+ { .family = 5, .model_names =
{
[0] = "Pentium 60/66 A-step",
[1] = "Pentium 60/66",
@@ -687,10 +747,11 @@ static const struct cpu_dev __cpuinitconst intel_cpu_dev = {
[3] = "OverDrive PODP5V83",
[4] = "Pentium MMX",
[7] = "Mobile Pentium 75 - 200",
- [8] = "Mobile Pentium MMX"
+ [8] = "Mobile Pentium MMX",
+ [9] = "Quark SoC X1000",
}
},
- { .vendor = X86_VENDOR_INTEL, .family = 6, .model_names =
+ { .family = 6, .model_names =
{
[0] = "Pentium Pro A-step",
[1] = "Pentium Pro",
@@ -704,7 +765,7 @@ static const struct cpu_dev __cpuinitconst intel_cpu_dev = {
[11] = "Pentium III (Tualatin)",
}
},
- { .vendor = X86_VENDOR_INTEL, .family = 15, .model_names =
+ { .family = 15, .model_names =
{
[0] = "Pentium 4 (Unknown)",
[1] = "Pentium 4 (Willamette)",
@@ -714,13 +775,13 @@ static const struct cpu_dev __cpuinitconst intel_cpu_dev = {
}
},
},
- .c_size_cache = intel_size_cache,
+ .legacy_cache_size = intel_size_cache,
#endif
.c_detect_tlb = intel_detect_tlb,
.c_early_init = early_init_intel,
+ .c_bsp_init = bsp_init_intel,
.c_init = init_intel,
.c_x86_vendor = X86_VENDOR_INTEL,
};
cpu_dev_register(intel_cpu_dev);
-
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
deleted file mode 100644
index 8dc72dda66fe..000000000000
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ /dev/null
@@ -1,1248 +0,0 @@
-/*
- * Routines to indentify caches on Intel CPU.
- *
- * Changes:
- * Venkatesh Pallipadi : Adding cache identification through cpuid(4)
- * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure.
- * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD.
- */
-
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/device.h>
-#include <linux/compiler.h>
-#include <linux/cpu.h>
-#include <linux/sched.h>
-#include <linux/pci.h>
-
-#include <asm/processor.h>
-#include <linux/smp.h>
-#include <asm/amd_nb.h>
-#include <asm/smp.h>
-
-#define LVL_1_INST 1
-#define LVL_1_DATA 2
-#define LVL_2 3
-#define LVL_3 4
-#define LVL_TRACE 5
-
-struct _cache_table {
- unsigned char descriptor;
- char cache_type;
- short size;
-};
-
-#define MB(x) ((x) * 1024)
-
-/* All the cache descriptor types we care about (no TLB or
- trace cache entries) */
-
-static const struct _cache_table __cpuinitconst cache_table[] =
-{
- { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */
- { 0x08, LVL_1_INST, 16 }, /* 4-way set assoc, 32 byte line size */
- { 0x09, LVL_1_INST, 32 }, /* 4-way set assoc, 64 byte line size */
- { 0x0a, LVL_1_DATA, 8 }, /* 2 way set assoc, 32 byte line size */
- { 0x0c, LVL_1_DATA, 16 }, /* 4-way set assoc, 32 byte line size */
- { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */
- { 0x0e, LVL_1_DATA, 24 }, /* 6-way set assoc, 64 byte line size */
- { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */
- { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */
- { 0x23, LVL_3, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x25, LVL_3, MB(2) }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x29, LVL_3, MB(4) }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x2c, LVL_1_DATA, 32 }, /* 8-way set assoc, 64 byte line size */
- { 0x30, LVL_1_INST, 32 }, /* 8-way set assoc, 64 byte line size */
- { 0x39, LVL_2, 128 }, /* 4-way set assoc, sectored cache, 64 byte line size */
- { 0x3a, LVL_2, 192 }, /* 6-way set assoc, sectored cache, 64 byte line size */
- { 0x3b, LVL_2, 128 }, /* 2-way set assoc, sectored cache, 64 byte line size */
- { 0x3c, LVL_2, 256 }, /* 4-way set assoc, sectored cache, 64 byte line size */
- { 0x3d, LVL_2, 384 }, /* 6-way set assoc, sectored cache, 64 byte line size */
- { 0x3e, LVL_2, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */
- { 0x3f, LVL_2, 256 }, /* 2-way set assoc, 64 byte line size */
- { 0x41, LVL_2, 128 }, /* 4-way set assoc, 32 byte line size */
- { 0x42, LVL_2, 256 }, /* 4-way set assoc, 32 byte line size */
- { 0x43, LVL_2, 512 }, /* 4-way set assoc, 32 byte line size */
- { 0x44, LVL_2, MB(1) }, /* 4-way set assoc, 32 byte line size */
- { 0x45, LVL_2, MB(2) }, /* 4-way set assoc, 32 byte line size */
- { 0x46, LVL_3, MB(4) }, /* 4-way set assoc, 64 byte line size */
- { 0x47, LVL_3, MB(8) }, /* 8-way set assoc, 64 byte line size */
- { 0x48, LVL_2, MB(3) }, /* 12-way set assoc, 64 byte line size */
- { 0x49, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */
- { 0x4a, LVL_3, MB(6) }, /* 12-way set assoc, 64 byte line size */
- { 0x4b, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */
- { 0x4c, LVL_3, MB(12) }, /* 12-way set assoc, 64 byte line size */
- { 0x4d, LVL_3, MB(16) }, /* 16-way set assoc, 64 byte line size */
- { 0x4e, LVL_2, MB(6) }, /* 24-way set assoc, 64 byte line size */
- { 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */
- { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */
- { 0x68, LVL_1_DATA, 32 }, /* 4-way set assoc, sectored cache, 64 byte line size */
- { 0x70, LVL_TRACE, 12 }, /* 8-way set assoc */
- { 0x71, LVL_TRACE, 16 }, /* 8-way set assoc */
- { 0x72, LVL_TRACE, 32 }, /* 8-way set assoc */
- { 0x73, LVL_TRACE, 64 }, /* 8-way set assoc */
- { 0x78, LVL_2, MB(1) }, /* 4-way set assoc, 64 byte line size */
- { 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x7b, LVL_2, 512 }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x7c, LVL_2, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */
- { 0x7d, LVL_2, MB(2) }, /* 8-way set assoc, 64 byte line size */
- { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */
- { 0x80, LVL_2, 512 }, /* 8-way set assoc, 64 byte line size */
- { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */
- { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */
- { 0x84, LVL_2, MB(1) }, /* 8-way set assoc, 32 byte line size */
- { 0x85, LVL_2, MB(2) }, /* 8-way set assoc, 32 byte line size */
- { 0x86, LVL_2, 512 }, /* 4-way set assoc, 64 byte line size */
- { 0x87, LVL_2, MB(1) }, /* 8-way set assoc, 64 byte line size */
- { 0xd0, LVL_3, 512 }, /* 4-way set assoc, 64 byte line size */
- { 0xd1, LVL_3, MB(1) }, /* 4-way set assoc, 64 byte line size */
- { 0xd2, LVL_3, MB(2) }, /* 4-way set assoc, 64 byte line size */
- { 0xd6, LVL_3, MB(1) }, /* 8-way set assoc, 64 byte line size */
- { 0xd7, LVL_3, MB(2) }, /* 8-way set assoc, 64 byte line size */
- { 0xd8, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */
- { 0xdc, LVL_3, MB(2) }, /* 12-way set assoc, 64 byte line size */
- { 0xdd, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */
- { 0xde, LVL_3, MB(8) }, /* 12-way set assoc, 64 byte line size */
- { 0xe2, LVL_3, MB(2) }, /* 16-way set assoc, 64 byte line size */
- { 0xe3, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */
- { 0xe4, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */
- { 0xea, LVL_3, MB(12) }, /* 24-way set assoc, 64 byte line size */
- { 0xeb, LVL_3, MB(18) }, /* 24-way set assoc, 64 byte line size */
- { 0xec, LVL_3, MB(24) }, /* 24-way set assoc, 64 byte line size */
- { 0x00, 0, 0}
-};
-
-
-enum _cache_type {
- CACHE_TYPE_NULL = 0,
- CACHE_TYPE_DATA = 1,
- CACHE_TYPE_INST = 2,
- CACHE_TYPE_UNIFIED = 3
-};
-
-union _cpuid4_leaf_eax {
- struct {
- enum _cache_type type:5;
- unsigned int level:3;
- unsigned int is_self_initializing:1;
- unsigned int is_fully_associative:1;
- unsigned int reserved:4;
- unsigned int num_threads_sharing:12;
- unsigned int num_cores_on_die:6;
- } split;
- u32 full;
-};
-
-union _cpuid4_leaf_ebx {
- struct {
- unsigned int coherency_line_size:12;
- unsigned int physical_line_partition:10;
- unsigned int ways_of_associativity:10;
- } split;
- u32 full;
-};
-
-union _cpuid4_leaf_ecx {
- struct {
- unsigned int number_of_sets:32;
- } split;
- u32 full;
-};
-
-struct _cpuid4_info_regs {
- union _cpuid4_leaf_eax eax;
- union _cpuid4_leaf_ebx ebx;
- union _cpuid4_leaf_ecx ecx;
- unsigned long size;
- struct amd_northbridge *nb;
-};
-
-struct _cpuid4_info {
- struct _cpuid4_info_regs base;
- DECLARE_BITMAP(shared_cpu_map, NR_CPUS);
-};
-
-unsigned short num_cache_leaves;
-
-/* AMD doesn't have CPUID4. Emulate it here to report the same
- information to the user. This makes some assumptions about the machine:
- L2 not shared, no SMT etc. that is currently true on AMD CPUs.
-
- In theory the TLBs could be reported as fake type (they are in "dummy").
- Maybe later */
-union l1_cache {
- struct {
- unsigned line_size:8;
- unsigned lines_per_tag:8;
- unsigned assoc:8;
- unsigned size_in_kb:8;
- };
- unsigned val;
-};
-
-union l2_cache {
- struct {
- unsigned line_size:8;
- unsigned lines_per_tag:4;
- unsigned assoc:4;
- unsigned size_in_kb:16;
- };
- unsigned val;
-};
-
-union l3_cache {
- struct {
- unsigned line_size:8;
- unsigned lines_per_tag:4;
- unsigned assoc:4;
- unsigned res:2;
- unsigned size_encoded:14;
- };
- unsigned val;
-};
-
-static const unsigned short __cpuinitconst assocs[] = {
- [1] = 1,
- [2] = 2,
- [4] = 4,
- [6] = 8,
- [8] = 16,
- [0xa] = 32,
- [0xb] = 48,
- [0xc] = 64,
- [0xd] = 96,
- [0xe] = 128,
- [0xf] = 0xffff /* fully associative - no way to show this currently */
-};
-
-static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 };
-static const unsigned char __cpuinitconst types[] = { 1, 2, 3, 3 };
-
-static void __cpuinit
-amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
- union _cpuid4_leaf_ebx *ebx,
- union _cpuid4_leaf_ecx *ecx)
-{
- unsigned dummy;
- unsigned line_size, lines_per_tag, assoc, size_in_kb;
- union l1_cache l1i, l1d;
- union l2_cache l2;
- union l3_cache l3;
- union l1_cache *l1 = &l1d;
-
- eax->full = 0;
- ebx->full = 0;
- ecx->full = 0;
-
- cpuid(0x80000005, &dummy, &dummy, &l1d.val, &l1i.val);
- cpuid(0x80000006, &dummy, &dummy, &l2.val, &l3.val);
-
- switch (leaf) {
- case 1:
- l1 = &l1i;
- case 0:
- if (!l1->val)
- return;
- assoc = assocs[l1->assoc];
- line_size = l1->line_size;
- lines_per_tag = l1->lines_per_tag;
- size_in_kb = l1->size_in_kb;
- break;
- case 2:
- if (!l2.val)
- return;
- assoc = assocs[l2.assoc];
- line_size = l2.line_size;
- lines_per_tag = l2.lines_per_tag;
- /* cpu_data has errata corrections for K7 applied */
- size_in_kb = __this_cpu_read(cpu_info.x86_cache_size);
- break;
- case 3:
- if (!l3.val)
- return;
- assoc = assocs[l3.assoc];
- line_size = l3.line_size;
- lines_per_tag = l3.lines_per_tag;
- size_in_kb = l3.size_encoded * 512;
- if (boot_cpu_has(X86_FEATURE_AMD_DCM)) {
- size_in_kb = size_in_kb >> 1;
- assoc = assoc >> 1;
- }
- break;
- default:
- return;
- }
-
- eax->split.is_self_initializing = 1;
- eax->split.type = types[leaf];
- eax->split.level = levels[leaf];
- eax->split.num_threads_sharing = 0;
- eax->split.num_cores_on_die = __this_cpu_read(cpu_info.x86_max_cores) - 1;
-
-
- if (assoc == 0xffff)
- eax->split.is_fully_associative = 1;
- ebx->split.coherency_line_size = line_size - 1;
- ebx->split.ways_of_associativity = assoc - 1;
- ebx->split.physical_line_partition = lines_per_tag - 1;
- ecx->split.number_of_sets = (size_in_kb * 1024) / line_size /
- (ebx->split.ways_of_associativity + 1) - 1;
-}
-
-struct _cache_attr {
- struct attribute attr;
- ssize_t (*show)(struct _cpuid4_info *, char *, unsigned int);
- ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count,
- unsigned int);
-};
-
-#if defined(CONFIG_AMD_NB) && defined(CONFIG_SYSFS)
-/*
- * L3 cache descriptors
- */
-static void __cpuinit amd_calc_l3_indices(struct amd_northbridge *nb)
-{
- struct amd_l3_cache *l3 = &nb->l3_cache;
- unsigned int sc0, sc1, sc2, sc3;
- u32 val = 0;
-
- pci_read_config_dword(nb->misc, 0x1C4, &val);
-
- /* calculate subcache sizes */
- l3->subcaches[0] = sc0 = !(val & BIT(0));
- l3->subcaches[1] = sc1 = !(val & BIT(4));
-
- if (boot_cpu_data.x86 == 0x15) {
- l3->subcaches[0] = sc0 += !(val & BIT(1));
- l3->subcaches[1] = sc1 += !(val & BIT(5));
- }
-
- l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9));
- l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13));
-
- l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;
-}
-
-static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index)
-{
- int node;
-
- /* only for L3, and not in virtualized environments */
- if (index < 3)
- return;
-
- node = amd_get_nb_id(smp_processor_id());
- this_leaf->nb = node_to_amd_nb(node);
- if (this_leaf->nb && !this_leaf->nb->l3_cache.indices)
- amd_calc_l3_indices(this_leaf->nb);
-}
-
-/*
- * check whether a slot used for disabling an L3 index is occupied.
- * @l3: L3 cache descriptor
- * @slot: slot number (0..1)
- *
- * @returns: the disabled index if used or negative value if slot free.
- */
-int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned slot)
-{
- unsigned int reg = 0;
-
- pci_read_config_dword(nb->misc, 0x1BC + slot * 4, &reg);
-
- /* check whether this slot is activated already */
- if (reg & (3UL << 30))
- return reg & 0xfff;
-
- return -1;
-}
-
-static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
- unsigned int slot)
-{
- int index;
-
- if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
- return -EINVAL;
-
- index = amd_get_l3_disable_slot(this_leaf->base.nb, slot);
- if (index >= 0)
- return sprintf(buf, "%d\n", index);
-
- return sprintf(buf, "FREE\n");
-}
-
-#define SHOW_CACHE_DISABLE(slot) \
-static ssize_t \
-show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf, \
- unsigned int cpu) \
-{ \
- return show_cache_disable(this_leaf, buf, slot); \
-}
-SHOW_CACHE_DISABLE(0)
-SHOW_CACHE_DISABLE(1)
-
-static void amd_l3_disable_index(struct amd_northbridge *nb, int cpu,
- unsigned slot, unsigned long idx)
-{
- int i;
-
- idx |= BIT(30);
-
- /*
- * disable index in all 4 subcaches
- */
- for (i = 0; i < 4; i++) {
- u32 reg = idx | (i << 20);
-
- if (!nb->l3_cache.subcaches[i])
- continue;
-
- pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg);
-
- /*
- * We need to WBINVD on a core on the node containing the L3
- * cache which indices we disable therefore a simple wbinvd()
- * is not sufficient.
- */
- wbinvd_on_cpu(cpu);
-
- reg |= BIT(31);
- pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg);
- }
-}
-
-/*
- * disable a L3 cache index by using a disable-slot
- *
- * @l3: L3 cache descriptor
- * @cpu: A CPU on the node containing the L3 cache
- * @slot: slot number (0..1)
- * @index: index to disable
- *
- * @return: 0 on success, error status on failure
- */
-int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, unsigned slot,
- unsigned long index)
-{
- int ret = 0;
-
- /* check if @slot is already used or the index is already disabled */
- ret = amd_get_l3_disable_slot(nb, slot);
- if (ret >= 0)
- return -EEXIST;
-
- if (index > nb->l3_cache.indices)
- return -EINVAL;
-
- /* check whether the other slot has disabled the same index already */
- if (index == amd_get_l3_disable_slot(nb, !slot))
- return -EEXIST;
-
- amd_l3_disable_index(nb, cpu, slot, index);
-
- return 0;
-}
-
-static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
- const char *buf, size_t count,
- unsigned int slot)
-{
- unsigned long val = 0;
- int cpu, err = 0;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
- return -EINVAL;
-
- cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
-
- if (strict_strtoul(buf, 10, &val) < 0)
- return -EINVAL;
-
- err = amd_set_l3_disable_slot(this_leaf->base.nb, cpu, slot, val);
- if (err) {
- if (err == -EEXIST)
- pr_warning("L3 slot %d in use/index already disabled!\n",
- slot);
- return err;
- }
- return count;
-}
-
-#define STORE_CACHE_DISABLE(slot) \
-static ssize_t \
-store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \
- const char *buf, size_t count, \
- unsigned int cpu) \
-{ \
- return store_cache_disable(this_leaf, buf, count, slot); \
-}
-STORE_CACHE_DISABLE(0)
-STORE_CACHE_DISABLE(1)
-
-static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
- show_cache_disable_0, store_cache_disable_0);
-static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
- show_cache_disable_1, store_cache_disable_1);
-
-static ssize_t
-show_subcaches(struct _cpuid4_info *this_leaf, char *buf, unsigned int cpu)
-{
- if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
- return -EINVAL;
-
- return sprintf(buf, "%x\n", amd_get_subcaches(cpu));
-}
-
-static ssize_t
-store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count,
- unsigned int cpu)
-{
- unsigned long val;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
- return -EINVAL;
-
- if (strict_strtoul(buf, 16, &val) < 0)
- return -EINVAL;
-
- if (amd_set_subcaches(cpu, val))
- return -EINVAL;
-
- return count;
-}
-
-static struct _cache_attr subcaches =
- __ATTR(subcaches, 0644, show_subcaches, store_subcaches);
-
-#else
-#define amd_init_l3_cache(x, y)
-#endif /* CONFIG_AMD_NB && CONFIG_SYSFS */
-
-static int
-__cpuinit cpuid4_cache_lookup_regs(int index,
- struct _cpuid4_info_regs *this_leaf)
-{
- union _cpuid4_leaf_eax eax;
- union _cpuid4_leaf_ebx ebx;
- union _cpuid4_leaf_ecx ecx;
- unsigned edx;
-
- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
- if (cpu_has_topoext)
- cpuid_count(0x8000001d, index, &eax.full,
- &ebx.full, &ecx.full, &edx);
- else
- amd_cpuid4(index, &eax, &ebx, &ecx);
- amd_init_l3_cache(this_leaf, index);
- } else {
- cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
- }
-
- if (eax.split.type == CACHE_TYPE_NULL)
- return -EIO; /* better error ? */
-
- this_leaf->eax = eax;
- this_leaf->ebx = ebx;
- this_leaf->ecx = ecx;
- this_leaf->size = (ecx.split.number_of_sets + 1) *
- (ebx.split.coherency_line_size + 1) *
- (ebx.split.physical_line_partition + 1) *
- (ebx.split.ways_of_associativity + 1);
- return 0;
-}
-
-static int __cpuinit find_num_cache_leaves(struct cpuinfo_x86 *c)
-{
- unsigned int eax, ebx, ecx, edx, op;
- union _cpuid4_leaf_eax cache_eax;
- int i = -1;
-
- if (c->x86_vendor == X86_VENDOR_AMD)
- op = 0x8000001d;
- else
- op = 4;
-
- do {
- ++i;
- /* Do cpuid(op) loop to find out num_cache_leaves */
- cpuid_count(op, i, &eax, &ebx, &ecx, &edx);
- cache_eax.full = eax;
- } while (cache_eax.split.type != CACHE_TYPE_NULL);
- return i;
-}
-
-void __cpuinit init_amd_cacheinfo(struct cpuinfo_x86 *c)
-{
-
- if (cpu_has_topoext) {
- num_cache_leaves = find_num_cache_leaves(c);
- } else if (c->extended_cpuid_level >= 0x80000006) {
- if (cpuid_edx(0x80000006) & 0xf000)
- num_cache_leaves = 4;
- else
- num_cache_leaves = 3;
- }
-}
-
-unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
-{
- /* Cache sizes */
- unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0;
- unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */
- unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */
- unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb;
-#ifdef CONFIG_X86_HT
- unsigned int cpu = c->cpu_index;
-#endif
-
- if (c->cpuid_level > 3) {
- static int is_initialized;
-
- if (is_initialized == 0) {
- /* Init num_cache_leaves from boot CPU */
- num_cache_leaves = find_num_cache_leaves(c);
- is_initialized++;
- }
-
- /*
- * Whenever possible use cpuid(4), deterministic cache
- * parameters cpuid leaf to find the cache details
- */
- for (i = 0; i < num_cache_leaves; i++) {
- struct _cpuid4_info_regs this_leaf = {};
- int retval;
-
- retval = cpuid4_cache_lookup_regs(i, &this_leaf);
- if (retval < 0)
- continue;
-
- switch (this_leaf.eax.split.level) {
- case 1:
- if (this_leaf.eax.split.type == CACHE_TYPE_DATA)
- new_l1d = this_leaf.size/1024;
- else if (this_leaf.eax.split.type == CACHE_TYPE_INST)
- new_l1i = this_leaf.size/1024;
- break;
- case 2:
- new_l2 = this_leaf.size/1024;
- num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
- index_msb = get_count_order(num_threads_sharing);
- l2_id = c->apicid & ~((1 << index_msb) - 1);
- break;
- case 3:
- new_l3 = this_leaf.size/1024;
- num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
- index_msb = get_count_order(num_threads_sharing);
- l3_id = c->apicid & ~((1 << index_msb) - 1);
- break;
- default:
- break;
- }
- }
- }
- /*
- * Don't use cpuid2 if cpuid4 is supported. For P4, we use cpuid2 for
- * trace cache
- */
- if ((num_cache_leaves == 0 || c->x86 == 15) && c->cpuid_level > 1) {
- /* supports eax=2 call */
- int j, n;
- unsigned int regs[4];
- unsigned char *dp = (unsigned char *)regs;
- int only_trace = 0;
-
- if (num_cache_leaves != 0 && c->x86 == 15)
- only_trace = 1;
-
- /* Number of times to iterate */
- n = cpuid_eax(2) & 0xFF;
-
- for (i = 0 ; i < n ; i++) {
- cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]);
-
- /* If bit 31 is set, this is an unknown format */
- for (j = 0 ; j < 3 ; j++)
- if (regs[j] & (1 << 31))
- regs[j] = 0;
-
- /* Byte 0 is level count, not a descriptor */
- for (j = 1 ; j < 16 ; j++) {
- unsigned char des = dp[j];
- unsigned char k = 0;
-
- /* look up this descriptor in the table */
- while (cache_table[k].descriptor != 0) {
- if (cache_table[k].descriptor == des) {
- if (only_trace && cache_table[k].cache_type != LVL_TRACE)
- break;
- switch (cache_table[k].cache_type) {
- case LVL_1_INST:
- l1i += cache_table[k].size;
- break;
- case LVL_1_DATA:
- l1d += cache_table[k].size;
- break;
- case LVL_2:
- l2 += cache_table[k].size;
- break;
- case LVL_3:
- l3 += cache_table[k].size;
- break;
- case LVL_TRACE:
- trace += cache_table[k].size;
- break;
- }
-
- break;
- }
-
- k++;
- }
- }
- }
- }
-
- if (new_l1d)
- l1d = new_l1d;
-
- if (new_l1i)
- l1i = new_l1i;
-
- if (new_l2) {
- l2 = new_l2;
-#ifdef CONFIG_X86_HT
- per_cpu(cpu_llc_id, cpu) = l2_id;
-#endif
- }
-
- if (new_l3) {
- l3 = new_l3;
-#ifdef CONFIG_X86_HT
- per_cpu(cpu_llc_id, cpu) = l3_id;
-#endif
- }
-
- c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d));
-
- return l2;
-}
-
-#ifdef CONFIG_SYSFS
-
-/* pointer to _cpuid4_info array (for each cache leaf) */
-static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info);
-#define CPUID4_INFO_IDX(x, y) (&((per_cpu(ici_cpuid4_info, x))[y]))
-
-#ifdef CONFIG_SMP
-
-static int __cpuinit cache_shared_amd_cpu_map_setup(unsigned int cpu, int index)
-{
- struct _cpuid4_info *this_leaf;
- int i, sibling;
-
- if (cpu_has_topoext) {
- unsigned int apicid, nshared, first, last;
-
- if (!per_cpu(ici_cpuid4_info, cpu))
- return 0;
-
- this_leaf = CPUID4_INFO_IDX(cpu, index);
- nshared = this_leaf->base.eax.split.num_threads_sharing + 1;
- apicid = cpu_data(cpu).apicid;
- first = apicid - (apicid % nshared);
- last = first + nshared - 1;
-
- for_each_online_cpu(i) {
- apicid = cpu_data(i).apicid;
- if ((apicid < first) || (apicid > last))
- continue;
- if (!per_cpu(ici_cpuid4_info, i))
- continue;
- this_leaf = CPUID4_INFO_IDX(i, index);
-
- for_each_online_cpu(sibling) {
- apicid = cpu_data(sibling).apicid;
- if ((apicid < first) || (apicid > last))
- continue;
- set_bit(sibling, this_leaf->shared_cpu_map);
- }
- }
- } else if (index == 3) {
- for_each_cpu(i, cpu_llc_shared_mask(cpu)) {
- if (!per_cpu(ici_cpuid4_info, i))
- continue;
- this_leaf = CPUID4_INFO_IDX(i, index);
- for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) {
- if (!cpu_online(sibling))
- continue;
- set_bit(sibling, this_leaf->shared_cpu_map);
- }
- }
- } else
- return 0;
-
- return 1;
-}
-
-static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
-{
- struct _cpuid4_info *this_leaf, *sibling_leaf;
- unsigned long num_threads_sharing;
- int index_msb, i;
- struct cpuinfo_x86 *c = &cpu_data(cpu);
-
- if (c->x86_vendor == X86_VENDOR_AMD) {
- if (cache_shared_amd_cpu_map_setup(cpu, index))
- return;
- }
-
- this_leaf = CPUID4_INFO_IDX(cpu, index);
- num_threads_sharing = 1 + this_leaf->base.eax.split.num_threads_sharing;
-
- if (num_threads_sharing == 1)
- cpumask_set_cpu(cpu, to_cpumask(this_leaf->shared_cpu_map));
- else {
- index_msb = get_count_order(num_threads_sharing);
-
- for_each_online_cpu(i) {
- if (cpu_data(i).apicid >> index_msb ==
- c->apicid >> index_msb) {
- cpumask_set_cpu(i,
- to_cpumask(this_leaf->shared_cpu_map));
- if (i != cpu && per_cpu(ici_cpuid4_info, i)) {
- sibling_leaf =
- CPUID4_INFO_IDX(i, index);
- cpumask_set_cpu(cpu, to_cpumask(
- sibling_leaf->shared_cpu_map));
- }
- }
- }
- }
-}
-static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index)
-{
- struct _cpuid4_info *this_leaf, *sibling_leaf;
- int sibling;
-
- this_leaf = CPUID4_INFO_IDX(cpu, index);
- for_each_cpu(sibling, to_cpumask(this_leaf->shared_cpu_map)) {
- sibling_leaf = CPUID4_INFO_IDX(sibling, index);
- cpumask_clear_cpu(cpu,
- to_cpumask(sibling_leaf->shared_cpu_map));
- }
-}
-#else
-static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
-{
-}
-
-static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index)
-{
-}
-#endif
-
-static void __cpuinit free_cache_attributes(unsigned int cpu)
-{
- int i;
-
- for (i = 0; i < num_cache_leaves; i++)
- cache_remove_shared_cpu_map(cpu, i);
-
- kfree(per_cpu(ici_cpuid4_info, cpu));
- per_cpu(ici_cpuid4_info, cpu) = NULL;
-}
-
-static void __cpuinit get_cpu_leaves(void *_retval)
-{
- int j, *retval = _retval, cpu = smp_processor_id();
-
- /* Do cpuid and store the results */
- for (j = 0; j < num_cache_leaves; j++) {
- struct _cpuid4_info *this_leaf = CPUID4_INFO_IDX(cpu, j);
-
- *retval = cpuid4_cache_lookup_regs(j, &this_leaf->base);
- if (unlikely(*retval < 0)) {
- int i;
-
- for (i = 0; i < j; i++)
- cache_remove_shared_cpu_map(cpu, i);
- break;
- }
- cache_shared_cpu_map_setup(cpu, j);
- }
-}
-
-static int __cpuinit detect_cache_attributes(unsigned int cpu)
-{
- int retval;
-
- if (num_cache_leaves == 0)
- return -ENOENT;
-
- per_cpu(ici_cpuid4_info, cpu) = kzalloc(
- sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
- if (per_cpu(ici_cpuid4_info, cpu) == NULL)
- return -ENOMEM;
-
- smp_call_function_single(cpu, get_cpu_leaves, &retval, true);
- if (retval) {
- kfree(per_cpu(ici_cpuid4_info, cpu));
- per_cpu(ici_cpuid4_info, cpu) = NULL;
- }
-
- return retval;
-}
-
-#include <linux/kobject.h>
-#include <linux/sysfs.h>
-#include <linux/cpu.h>
-
-/* pointer to kobject for cpuX/cache */
-static DEFINE_PER_CPU(struct kobject *, ici_cache_kobject);
-
-struct _index_kobject {
- struct kobject kobj;
- unsigned int cpu;
- unsigned short index;
-};
-
-/* pointer to array of kobjects for cpuX/cache/indexY */
-static DEFINE_PER_CPU(struct _index_kobject *, ici_index_kobject);
-#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(ici_index_kobject, x))[y]))
-
-#define show_one_plus(file_name, object, val) \
-static ssize_t show_##file_name(struct _cpuid4_info *this_leaf, char *buf, \
- unsigned int cpu) \
-{ \
- return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \
-}
-
-show_one_plus(level, base.eax.split.level, 0);
-show_one_plus(coherency_line_size, base.ebx.split.coherency_line_size, 1);
-show_one_plus(physical_line_partition, base.ebx.split.physical_line_partition, 1);
-show_one_plus(ways_of_associativity, base.ebx.split.ways_of_associativity, 1);
-show_one_plus(number_of_sets, base.ecx.split.number_of_sets, 1);
-
-static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf,
- unsigned int cpu)
-{
- return sprintf(buf, "%luK\n", this_leaf->base.size / 1024);
-}
-
-static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,
- int type, char *buf)
-{
- ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf;
- int n = 0;
-
- if (len > 1) {
- const struct cpumask *mask;
-
- mask = to_cpumask(this_leaf->shared_cpu_map);
- n = type ?
- cpulist_scnprintf(buf, len-2, mask) :
- cpumask_scnprintf(buf, len-2, mask);
- buf[n++] = '\n';
- buf[n] = '\0';
- }
- return n;
-}
-
-static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf,
- unsigned int cpu)
-{
- return show_shared_cpu_map_func(leaf, 0, buf);
-}
-
-static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf,
- unsigned int cpu)
-{
- return show_shared_cpu_map_func(leaf, 1, buf);
-}
-
-static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf,
- unsigned int cpu)
-{
- switch (this_leaf->base.eax.split.type) {
- case CACHE_TYPE_DATA:
- return sprintf(buf, "Data\n");
- case CACHE_TYPE_INST:
- return sprintf(buf, "Instruction\n");
- case CACHE_TYPE_UNIFIED:
- return sprintf(buf, "Unified\n");
- default:
- return sprintf(buf, "Unknown\n");
- }
-}
-
-#define to_object(k) container_of(k, struct _index_kobject, kobj)
-#define to_attr(a) container_of(a, struct _cache_attr, attr)
-
-#define define_one_ro(_name) \
-static struct _cache_attr _name = \
- __ATTR(_name, 0444, show_##_name, NULL)
-
-define_one_ro(level);
-define_one_ro(type);
-define_one_ro(coherency_line_size);
-define_one_ro(physical_line_partition);
-define_one_ro(ways_of_associativity);
-define_one_ro(number_of_sets);
-define_one_ro(size);
-define_one_ro(shared_cpu_map);
-define_one_ro(shared_cpu_list);
-
-static struct attribute *default_attrs[] = {
- &type.attr,
- &level.attr,
- &coherency_line_size.attr,
- &physical_line_partition.attr,
- &ways_of_associativity.attr,
- &number_of_sets.attr,
- &size.attr,
- &shared_cpu_map.attr,
- &shared_cpu_list.attr,
- NULL
-};
-
-#ifdef CONFIG_AMD_NB
-static struct attribute ** __cpuinit amd_l3_attrs(void)
-{
- static struct attribute **attrs;
- int n;
-
- if (attrs)
- return attrs;
-
- n = ARRAY_SIZE(default_attrs);
-
- if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
- n += 2;
-
- if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
- n += 1;
-
- attrs = kzalloc(n * sizeof (struct attribute *), GFP_KERNEL);
- if (attrs == NULL)
- return attrs = default_attrs;
-
- for (n = 0; default_attrs[n]; n++)
- attrs[n] = default_attrs[n];
-
- if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) {
- attrs[n++] = &cache_disable_0.attr;
- attrs[n++] = &cache_disable_1.attr;
- }
-
- if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
- attrs[n++] = &subcaches.attr;
-
- return attrs;
-}
-#endif
-
-static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
-{
- struct _cache_attr *fattr = to_attr(attr);
- struct _index_kobject *this_leaf = to_object(kobj);
- ssize_t ret;
-
- ret = fattr->show ?
- fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
- buf, this_leaf->cpu) :
- 0;
- return ret;
-}
-
-static ssize_t store(struct kobject *kobj, struct attribute *attr,
- const char *buf, size_t count)
-{
- struct _cache_attr *fattr = to_attr(attr);
- struct _index_kobject *this_leaf = to_object(kobj);
- ssize_t ret;
-
- ret = fattr->store ?
- fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
- buf, count, this_leaf->cpu) :
- 0;
- return ret;
-}
-
-static const struct sysfs_ops sysfs_ops = {
- .show = show,
- .store = store,
-};
-
-static struct kobj_type ktype_cache = {
- .sysfs_ops = &sysfs_ops,
- .default_attrs = default_attrs,
-};
-
-static struct kobj_type ktype_percpu_entry = {
- .sysfs_ops = &sysfs_ops,
-};
-
-static void __cpuinit cpuid4_cache_sysfs_exit(unsigned int cpu)
-{
- kfree(per_cpu(ici_cache_kobject, cpu));
- kfree(per_cpu(ici_index_kobject, cpu));
- per_cpu(ici_cache_kobject, cpu) = NULL;
- per_cpu(ici_index_kobject, cpu) = NULL;
- free_cache_attributes(cpu);
-}
-
-static int __cpuinit cpuid4_cache_sysfs_init(unsigned int cpu)
-{
- int err;
-
- if (num_cache_leaves == 0)
- return -ENOENT;
-
- err = detect_cache_attributes(cpu);
- if (err)
- return err;
-
- /* Allocate all required memory */
- per_cpu(ici_cache_kobject, cpu) =
- kzalloc(sizeof(struct kobject), GFP_KERNEL);
- if (unlikely(per_cpu(ici_cache_kobject, cpu) == NULL))
- goto err_out;
-
- per_cpu(ici_index_kobject, cpu) = kzalloc(
- sizeof(struct _index_kobject) * num_cache_leaves, GFP_KERNEL);
- if (unlikely(per_cpu(ici_index_kobject, cpu) == NULL))
- goto err_out;
-
- return 0;
-
-err_out:
- cpuid4_cache_sysfs_exit(cpu);
- return -ENOMEM;
-}
-
-static DECLARE_BITMAP(cache_dev_map, NR_CPUS);
-
-/* Add/Remove cache interface for CPU device */
-static int __cpuinit cache_add_dev(struct device *dev)
-{
- unsigned int cpu = dev->id;
- unsigned long i, j;
- struct _index_kobject *this_object;
- struct _cpuid4_info *this_leaf;
- int retval;
-
- retval = cpuid4_cache_sysfs_init(cpu);
- if (unlikely(retval < 0))
- return retval;
-
- retval = kobject_init_and_add(per_cpu(ici_cache_kobject, cpu),
- &ktype_percpu_entry,
- &dev->kobj, "%s", "cache");
- if (retval < 0) {
- cpuid4_cache_sysfs_exit(cpu);
- return retval;
- }
-
- for (i = 0; i < num_cache_leaves; i++) {
- this_object = INDEX_KOBJECT_PTR(cpu, i);
- this_object->cpu = cpu;
- this_object->index = i;
-
- this_leaf = CPUID4_INFO_IDX(cpu, i);
-
- ktype_cache.default_attrs = default_attrs;
-#ifdef CONFIG_AMD_NB
- if (this_leaf->base.nb)
- ktype_cache.default_attrs = amd_l3_attrs();
-#endif
- retval = kobject_init_and_add(&(this_object->kobj),
- &ktype_cache,
- per_cpu(ici_cache_kobject, cpu),
- "index%1lu", i);
- if (unlikely(retval)) {
- for (j = 0; j < i; j++)
- kobject_put(&(INDEX_KOBJECT_PTR(cpu, j)->kobj));
- kobject_put(per_cpu(ici_cache_kobject, cpu));
- cpuid4_cache_sysfs_exit(cpu);
- return retval;
- }
- kobject_uevent(&(this_object->kobj), KOBJ_ADD);
- }
- cpumask_set_cpu(cpu, to_cpumask(cache_dev_map));
-
- kobject_uevent(per_cpu(ici_cache_kobject, cpu), KOBJ_ADD);
- return 0;
-}
-
-static void __cpuinit cache_remove_dev(struct device *dev)
-{
- unsigned int cpu = dev->id;
- unsigned long i;
-
- if (per_cpu(ici_cpuid4_info, cpu) == NULL)
- return;
- if (!cpumask_test_cpu(cpu, to_cpumask(cache_dev_map)))
- return;
- cpumask_clear_cpu(cpu, to_cpumask(cache_dev_map));
-
- for (i = 0; i < num_cache_leaves; i++)
- kobject_put(&(INDEX_KOBJECT_PTR(cpu, i)->kobj));
- kobject_put(per_cpu(ici_cache_kobject, cpu));
- cpuid4_cache_sysfs_exit(cpu);
-}
-
-static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
-{
- unsigned int cpu = (unsigned long)hcpu;
- struct device *dev;
-
- dev = get_cpu_device(cpu);
- switch (action) {
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- cache_add_dev(dev);
- break;
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- cache_remove_dev(dev);
- break;
- }
- return NOTIFY_OK;
-}
-
-static struct notifier_block __cpuinitdata cacheinfo_cpu_notifier = {
- .notifier_call = cacheinfo_cpu_callback,
-};
-
-static int __init cache_sysfs_init(void)
-{
- int i;
-
- if (num_cache_leaves == 0)
- return 0;
-
- for_each_online_cpu(i) {
- int err;
- struct device *dev = get_cpu_device(i);
-
- err = cache_add_dev(dev);
- if (err)
- return err;
- }
- register_hotcpu_notifier(&cacheinfo_cpu_notifier);
- return 0;
-}
-
-device_initcall(cache_sysfs_init);
-
-#endif
diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c
new file mode 100644
index 000000000000..2c56f8730f59
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_epb.c
@@ -0,0 +1,244 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel Performance and Energy Bias Hint support.
+ *
+ * Copyright (C) 2019 Intel Corporation
+ *
+ * Author:
+ * Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+ */
+
+#include <linux/cpuhotplug.h>
+#include <linux/cpu.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/syscore_ops.h>
+#include <linux/pm.h>
+
+#include <asm/cpu_device_id.h>
+#include <asm/cpufeature.h>
+#include <asm/msr.h>
+
+/**
+ * DOC: overview
+ *
+ * The Performance and Energy Bias Hint (EPB) allows software to specify its
+ * preference with respect to the power-performance tradeoffs present in the
+ * processor. Generally, the EPB is expected to be set by user space (directly
+ * via sysfs or with the help of the x86_energy_perf_policy tool), but there are
+ * two reasons for the kernel to update it.
+ *
+ * First, there are systems where the platform firmware resets the EPB during
+ * system-wide transitions from sleep states back into the working state
+ * effectively causing the previous EPB updates by user space to be lost.
+ * Thus the kernel needs to save the current EPB values for all CPUs during
+ * system-wide transitions to sleep states and restore them on the way back to
+ * the working state. That can be achieved by saving EPB for secondary CPUs
+ * when they are taken offline during transitions into system sleep states and
+ * for the boot CPU in a syscore suspend operation, so that it can be restored
+ * for the boot CPU in a syscore resume operation and for the other CPUs when
+ * they are brought back online. However, CPUs that are already offline when
+ * a system-wide PM transition is started are not taken offline again, but their
+ * EPB values may still be reset by the platform firmware during the transition,
+ * so in fact it is necessary to save the EPB of any CPU taken offline and to
+ * restore it when the given CPU goes back online at all times.
+ *
+ * Second, on many systems the initial EPB value coming from the platform
+ * firmware is 0 ('performance') and at least on some of them that is because
+ * the platform firmware does not initialize EPB at all with the assumption that
+ * the OS will do that anyway. That sometimes is problematic, as it may cause
+ * the system battery to drain too fast, for example, so it is better to adjust
+ * it on CPU bring-up and if the initial EPB value for a given CPU is 0, the
+ * kernel changes it to 6 ('normal').
+ */
+
+static DEFINE_PER_CPU(u8, saved_epb);
+
+#define EPB_MASK 0x0fULL
+#define EPB_SAVED 0x10ULL
+#define MAX_EPB EPB_MASK
+
+enum energy_perf_value_index {
+ EPB_INDEX_PERFORMANCE,
+ EPB_INDEX_BALANCE_PERFORMANCE,
+ EPB_INDEX_NORMAL,
+ EPB_INDEX_BALANCE_POWERSAVE,
+ EPB_INDEX_POWERSAVE,
+};
+
+static u8 energ_perf_values[] = {
+ [EPB_INDEX_PERFORMANCE] = ENERGY_PERF_BIAS_PERFORMANCE,
+ [EPB_INDEX_BALANCE_PERFORMANCE] = ENERGY_PERF_BIAS_BALANCE_PERFORMANCE,
+ [EPB_INDEX_NORMAL] = ENERGY_PERF_BIAS_NORMAL,
+ [EPB_INDEX_BALANCE_POWERSAVE] = ENERGY_PERF_BIAS_BALANCE_POWERSAVE,
+ [EPB_INDEX_POWERSAVE] = ENERGY_PERF_BIAS_POWERSAVE,
+};
+
+static int intel_epb_save(void *data)
+{
+ u64 epb;
+
+ rdmsrq(MSR_IA32_ENERGY_PERF_BIAS, epb);
+ /*
+ * Ensure that saved_epb will always be nonzero after this write even if
+ * the EPB value read from the MSR is 0.
+ */
+ this_cpu_write(saved_epb, (epb & EPB_MASK) | EPB_SAVED);
+
+ return 0;
+}
+
+static void intel_epb_restore(void *data)
+{
+ u64 val = this_cpu_read(saved_epb);
+ u64 epb;
+
+ rdmsrq(MSR_IA32_ENERGY_PERF_BIAS, epb);
+ if (val) {
+ val &= EPB_MASK;
+ } else {
+ /*
+ * Because intel_epb_save() has not run for the current CPU yet,
+ * it is going online for the first time, so if its EPB value is
+ * 0 ('performance') at this point, assume that it has not been
+ * initialized by the platform firmware and set it to 6
+ * ('normal').
+ */
+ val = epb & EPB_MASK;
+ if (val == ENERGY_PERF_BIAS_PERFORMANCE) {
+ val = energ_perf_values[EPB_INDEX_NORMAL];
+ pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n");
+ }
+ }
+ wrmsrq(MSR_IA32_ENERGY_PERF_BIAS, (epb & ~EPB_MASK) | val);
+}
+
+static const struct syscore_ops intel_epb_syscore_ops = {
+ .suspend = intel_epb_save,
+ .resume = intel_epb_restore,
+};
+
+static struct syscore intel_epb_syscore = {
+ .ops = &intel_epb_syscore_ops,
+};
+
+static const char * const energy_perf_strings[] = {
+ [EPB_INDEX_PERFORMANCE] = "performance",
+ [EPB_INDEX_BALANCE_PERFORMANCE] = "balance-performance",
+ [EPB_INDEX_NORMAL] = "normal",
+ [EPB_INDEX_BALANCE_POWERSAVE] = "balance-power",
+ [EPB_INDEX_POWERSAVE] = "power",
+};
+
+static ssize_t energy_perf_bias_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ unsigned int cpu = dev->id;
+ u64 epb;
+ int ret;
+
+ ret = rdmsrq_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb);
+ if (ret < 0)
+ return ret;
+
+ return sprintf(buf, "%llu\n", epb);
+}
+
+static ssize_t energy_perf_bias_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned int cpu = dev->id;
+ u64 epb, val;
+ int ret;
+
+ ret = __sysfs_match_string(energy_perf_strings,
+ ARRAY_SIZE(energy_perf_strings), buf);
+ if (ret >= 0)
+ val = energ_perf_values[ret];
+ else if (kstrtou64(buf, 0, &val) || val > MAX_EPB)
+ return -EINVAL;
+
+ ret = rdmsrq_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb);
+ if (ret < 0)
+ return ret;
+
+ ret = wrmsrq_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS,
+ (epb & ~EPB_MASK) | val);
+ if (ret < 0)
+ return ret;
+
+ return count;
+}
+
+static DEVICE_ATTR_RW(energy_perf_bias);
+
+static struct attribute *intel_epb_attrs[] = {
+ &dev_attr_energy_perf_bias.attr,
+ NULL
+};
+
+static const struct attribute_group intel_epb_attr_group = {
+ .name = power_group_name,
+ .attrs = intel_epb_attrs
+};
+
+static int intel_epb_online(unsigned int cpu)
+{
+ struct device *cpu_dev = get_cpu_device(cpu);
+
+ intel_epb_restore(NULL);
+ if (!cpuhp_tasks_frozen)
+ sysfs_merge_group(&cpu_dev->kobj, &intel_epb_attr_group);
+
+ return 0;
+}
+
+static int intel_epb_offline(unsigned int cpu)
+{
+ struct device *cpu_dev = get_cpu_device(cpu);
+
+ if (!cpuhp_tasks_frozen)
+ sysfs_unmerge_group(&cpu_dev->kobj, &intel_epb_attr_group);
+
+ intel_epb_save(NULL);
+ return 0;
+}
+
+static const struct x86_cpu_id intel_epb_normal[] = {
+ X86_MATCH_VFM(INTEL_ALDERLAKE_L,
+ ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
+ X86_MATCH_VFM(INTEL_ATOM_GRACEMONT,
+ ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_P,
+ ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
+ {}
+};
+
+static __init int intel_epb_init(void)
+{
+ const struct x86_cpu_id *id = x86_match_cpu(intel_epb_normal);
+ int ret;
+
+ if (!boot_cpu_has(X86_FEATURE_EPB))
+ return -ENODEV;
+
+ if (id)
+ energ_perf_values[EPB_INDEX_NORMAL] = id->driver_data;
+
+ ret = cpuhp_setup_state(CPUHP_AP_X86_INTEL_EPB_ONLINE,
+ "x86/intel/epb:online", intel_epb_online,
+ intel_epb_offline);
+ if (ret < 0)
+ goto err_out_online;
+
+ register_syscore(&intel_epb_syscore);
+ return 0;
+
+err_out_online:
+ cpuhp_remove_state(CPUHP_AP_X86_INTEL_EPB_ONLINE);
+ return ret;
+}
+late_initcall(intel_epb_init);
diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c
index 36565373af87..6af1e8baeb0f 100644
--- a/arch/x86/kernel/cpu/match.c
+++ b/arch/x86/kernel/cpu/match.c
@@ -1,11 +1,40 @@
+// SPDX-License-Identifier: GPL-2.0
#include <asm/cpu_device_id.h>
-#include <asm/processor.h>
+#include <asm/cpufeature.h>
#include <linux/cpu.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/slab.h>
/**
- * x86_match_cpu - match current CPU again an array of x86_cpu_ids
+ * x86_match_vendor_cpu_type - helper function to match the hardware defined
+ * cpu-type for a single entry in the x86_cpu_id
+ * table. Note, this function does not match the
+ * generic cpu-types TOPO_CPU_TYPE_EFFICIENCY and
+ * TOPO_CPU_TYPE_PERFORMANCE.
+ * @c: Pointer to the cpuinfo_x86 structure of the CPU to match.
+ * @m: Pointer to the x86_cpu_id entry to match against.
+ *
+ * Return: true if the cpu-type matches, false otherwise.
+ */
+static bool x86_match_vendor_cpu_type(struct cpuinfo_x86 *c, const struct x86_cpu_id *m)
+{
+ if (m->type == X86_CPU_TYPE_ANY)
+ return true;
+
+ /* Hybrid CPUs are special, they are assumed to match all cpu-types */
+ if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
+ return true;
+
+ if (c->x86_vendor == X86_VENDOR_INTEL)
+ return m->type == c->topo.intel_type;
+ if (c->x86_vendor == X86_VENDOR_AMD)
+ return m->type == c->topo.amd_type;
+
+ return false;
+}
+
+/**
+ * x86_match_cpu - match current CPU against an array of x86_cpu_ids
* @match: Pointer to array of x86_cpu_ids. Last entry terminated with
* {}.
*
@@ -15,12 +44,16 @@
* respective wildcard entries.
*
* A typical table entry would be to match a specific CPU
- * { X86_VENDOR_INTEL, 6, 0x12 }
- * or to match a specific CPU feature
- * { X86_FEATURE_MATCH(X86_FEATURE_FOOBAR) }
+ *
+ * X86_MATCH_VFM_FEATURE(INTEL_BROADWELL, X86_FEATURE_ANY, NULL);
*
* Fields can be wildcarded with %X86_VENDOR_ANY, %X86_FAMILY_ANY,
- * %X86_MODEL_ANY, %X86_FEATURE_ANY or 0 (except for vendor)
+ * %X86_MODEL_ANY, %X86_FEATURE_ANY (except for vendor)
+ *
+ * asm/cpu_device_id.h contains a set of useful macros which are shortcuts
+ * for various common selections. The above can be shortened to:
+ *
+ * X86_MATCH_VFM(INTEL_BROADWELL, NULL);
*
* Arrays used to match for this should also be declared using
* MODULE_DEVICE_TABLE(x86cpu, ...)
@@ -33,59 +66,33 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match)
const struct x86_cpu_id *m;
struct cpuinfo_x86 *c = &boot_cpu_data;
- for (m = match; m->vendor | m->family | m->model | m->feature; m++) {
+ for (m = match; m->flags & X86_CPU_ID_FLAG_ENTRY_VALID; m++) {
if (m->vendor != X86_VENDOR_ANY && c->x86_vendor != m->vendor)
continue;
if (m->family != X86_FAMILY_ANY && c->x86 != m->family)
continue;
if (m->model != X86_MODEL_ANY && c->x86_model != m->model)
continue;
+ if (m->steppings != X86_STEPPING_ANY &&
+ !(BIT(c->x86_stepping) & m->steppings))
+ continue;
if (m->feature != X86_FEATURE_ANY && !cpu_has(c, m->feature))
continue;
+ if (!x86_match_vendor_cpu_type(c, m))
+ continue;
return m;
}
return NULL;
}
EXPORT_SYMBOL(x86_match_cpu);
-ssize_t arch_print_cpu_modalias(struct device *dev,
- struct device_attribute *attr,
- char *bufptr)
+bool x86_match_min_microcode_rev(const struct x86_cpu_id *table)
{
- int size = PAGE_SIZE;
- int i, n;
- char *buf = bufptr;
+ const struct x86_cpu_id *res = x86_match_cpu(table);
- n = snprintf(buf, size, "x86cpu:vendor:%04X:family:%04X:"
- "model:%04X:feature:",
- boot_cpu_data.x86_vendor,
- boot_cpu_data.x86,
- boot_cpu_data.x86_model);
- size -= n;
- buf += n;
- size -= 1;
- for (i = 0; i < NCAPINTS*32; i++) {
- if (boot_cpu_has(i)) {
- n = snprintf(buf, size, ",%04X", i);
- if (n >= size) {
- WARN(1, "x86 features overflow page\n");
- break;
- }
- size -= n;
- buf += n;
- }
- }
- *buf++ = '\n';
- return buf - bufptr;
-}
+ if (!res || res->driver_data > boot_cpu_data.microcode)
+ return false;
-int arch_cpu_uevent(struct device *dev, struct kobj_uevent_env *env)
-{
- char *buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
- if (buf) {
- arch_print_cpu_modalias(NULL, NULL, buf);
- add_uevent_var(env, "MODALIAS=%s", buf);
- kfree(buf);
- }
- return 0;
+ return true;
}
+EXPORT_SYMBOL_GPL(x86_match_min_microcode_rev);
diff --git a/arch/x86/kernel/cpu/mce/Makefile b/arch/x86/kernel/cpu/mce/Makefile
new file mode 100644
index 000000000000..015856abdbb1
--- /dev/null
+++ b/arch/x86/kernel/cpu/mce/Makefile
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-y = core.o severity.o genpool.o
+
+obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o
+obj-$(CONFIG_X86_MCE_INTEL) += intel.o
+obj-$(CONFIG_X86_MCE_AMD) += amd.o
+obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
+
+mce-inject-y := inject.o
+obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
+
+obj-$(CONFIG_ACPI_APEI) += apei.o
+
+obj-$(CONFIG_X86_MCELOG_LEGACY) += dev-mcelog.o
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
new file mode 100644
index 000000000000..3f1dda355307
--- /dev/null
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -0,0 +1,1270 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * (c) 2005-2016 Advanced Micro Devices, Inc.
+ *
+ * Written by Jacob Shin - AMD, Inc.
+ * Maintained by: Borislav Petkov <bp@alien8.de>
+ */
+#include <linux/interrupt.h>
+#include <linux/notifier.h>
+#include <linux/kobject.h>
+#include <linux/percpu.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/sysfs.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
+#include <linux/string.h>
+
+#include <asm/traps.h>
+#include <asm/apic.h>
+#include <asm/mce.h>
+#include <asm/msr.h>
+#include <asm/trace/irq_vectors.h>
+
+#include "internal.h"
+
+#define NR_BLOCKS 5
+#define THRESHOLD_MAX 0xFFF
+#define INT_TYPE_APIC 0x00020000
+#define MASK_VALID_HI 0x80000000
+#define MASK_CNTP_HI 0x40000000
+#define MASK_LOCKED_HI 0x20000000
+#define MASK_LVTOFF_HI 0x00F00000
+#define MASK_COUNT_EN_HI 0x00080000
+#define MASK_INT_TYPE_HI 0x00060000
+#define MASK_OVERFLOW_HI 0x00010000
+#define MASK_ERR_COUNT_HI 0x00000FFF
+#define MASK_BLKPTR_LO 0xFF000000
+#define MCG_XBLK_ADDR 0xC0000400
+
+/* Deferred error settings */
+#define MSR_CU_DEF_ERR 0xC0000410
+#define MASK_DEF_LVTOFF 0x000000F0
+
+/* Scalable MCA: */
+
+/* Threshold LVT offset is at MSR0xC0000410[15:12] */
+#define SMCA_THR_LVT_OFF 0xF000
+
+static bool thresholding_irq_en;
+
+struct mce_amd_cpu_data {
+ mce_banks_t thr_intr_banks;
+ mce_banks_t dfr_intr_banks;
+
+ u32 thr_intr_en: 1,
+ dfr_intr_en: 1,
+ __resv: 30;
+};
+
+static DEFINE_PER_CPU_READ_MOSTLY(struct mce_amd_cpu_data, mce_amd_data);
+
+static const char * const th_names[] = {
+ "load_store",
+ "insn_fetch",
+ "combined_unit",
+ "decode_unit",
+ "northbridge",
+ "execution_unit",
+};
+
+static const char * const smca_umc_block_names[] = {
+ "dram_ecc",
+ "misc_umc"
+};
+
+#define HWID_MCATYPE(hwid, mcatype) (((hwid) << 16) | (mcatype))
+
+struct smca_hwid {
+ unsigned int bank_type; /* Use with smca_bank_types for easy indexing. */
+ u32 hwid_mcatype; /* (hwid,mcatype) tuple */
+};
+
+struct smca_bank {
+ const struct smca_hwid *hwid;
+ u32 id; /* Value of MCA_IPID[InstanceId]. */
+ u8 sysfs_id; /* Value used for sysfs name. */
+ u64 paddrv :1, /* Physical Address Valid bit in MCA_CONFIG */
+ __reserved :63;
+};
+
+static DEFINE_PER_CPU_READ_MOSTLY(struct smca_bank[MAX_NR_BANKS], smca_banks);
+static DEFINE_PER_CPU_READ_MOSTLY(u8[N_SMCA_BANK_TYPES], smca_bank_counts);
+
+static const char * const smca_names[] = {
+ [SMCA_LS ... SMCA_LS_V2] = "load_store",
+ [SMCA_IF] = "insn_fetch",
+ [SMCA_L2_CACHE] = "l2_cache",
+ [SMCA_DE] = "decode_unit",
+ [SMCA_RESERVED] = "reserved",
+ [SMCA_EX] = "execution_unit",
+ [SMCA_FP] = "floating_point",
+ [SMCA_L3_CACHE] = "l3_cache",
+ [SMCA_CS ... SMCA_CS_V2] = "coherent_slave",
+ [SMCA_PIE] = "pie",
+
+ /* UMC v2 is separate because both of them can exist in a single system. */
+ [SMCA_UMC] = "umc",
+ [SMCA_UMC_V2] = "umc_v2",
+ [SMCA_MA_LLC] = "ma_llc",
+ [SMCA_PB] = "param_block",
+ [SMCA_PSP ... SMCA_PSP_V2] = "psp",
+ [SMCA_SMU ... SMCA_SMU_V2] = "smu",
+ [SMCA_MP5] = "mp5",
+ [SMCA_MPDMA] = "mpdma",
+ [SMCA_NBIO] = "nbio",
+ [SMCA_PCIE ... SMCA_PCIE_V2] = "pcie",
+ [SMCA_XGMI_PCS] = "xgmi_pcs",
+ [SMCA_NBIF] = "nbif",
+ [SMCA_SHUB] = "shub",
+ [SMCA_SATA] = "sata",
+ [SMCA_USB] = "usb",
+ [SMCA_USR_DP] = "usr_dp",
+ [SMCA_USR_CP] = "usr_cp",
+ [SMCA_GMI_PCS] = "gmi_pcs",
+ [SMCA_XGMI_PHY] = "xgmi_phy",
+ [SMCA_WAFL_PHY] = "wafl_phy",
+ [SMCA_GMI_PHY] = "gmi_phy",
+};
+
+static const char *smca_get_name(enum smca_bank_types t)
+{
+ if (t >= N_SMCA_BANK_TYPES)
+ return NULL;
+
+ return smca_names[t];
+}
+
+enum smca_bank_types smca_get_bank_type(unsigned int cpu, unsigned int bank)
+{
+ struct smca_bank *b;
+
+ if (bank >= MAX_NR_BANKS)
+ return N_SMCA_BANK_TYPES;
+
+ b = &per_cpu(smca_banks, cpu)[bank];
+ if (!b->hwid)
+ return N_SMCA_BANK_TYPES;
+
+ return b->hwid->bank_type;
+}
+EXPORT_SYMBOL_GPL(smca_get_bank_type);
+
+static const struct smca_hwid smca_hwid_mcatypes[] = {
+ /* { bank_type, hwid_mcatype } */
+
+ /* Reserved type */
+ { SMCA_RESERVED, HWID_MCATYPE(0x00, 0x0) },
+
+ /* ZN Core (HWID=0xB0) MCA types */
+ { SMCA_LS, HWID_MCATYPE(0xB0, 0x0) },
+ { SMCA_LS_V2, HWID_MCATYPE(0xB0, 0x10) },
+ { SMCA_IF, HWID_MCATYPE(0xB0, 0x1) },
+ { SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2) },
+ { SMCA_DE, HWID_MCATYPE(0xB0, 0x3) },
+ /* HWID 0xB0 MCATYPE 0x4 is Reserved */
+ { SMCA_EX, HWID_MCATYPE(0xB0, 0x5) },
+ { SMCA_FP, HWID_MCATYPE(0xB0, 0x6) },
+ { SMCA_L3_CACHE, HWID_MCATYPE(0xB0, 0x7) },
+
+ /* Data Fabric MCA types */
+ { SMCA_CS, HWID_MCATYPE(0x2E, 0x0) },
+ { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1) },
+ { SMCA_CS_V2, HWID_MCATYPE(0x2E, 0x2) },
+ { SMCA_MA_LLC, HWID_MCATYPE(0x2E, 0x4) },
+
+ /* Unified Memory Controller MCA type */
+ { SMCA_UMC, HWID_MCATYPE(0x96, 0x0) },
+ { SMCA_UMC_V2, HWID_MCATYPE(0x96, 0x1) },
+
+ /* Parameter Block MCA type */
+ { SMCA_PB, HWID_MCATYPE(0x05, 0x0) },
+
+ /* Platform Security Processor MCA type */
+ { SMCA_PSP, HWID_MCATYPE(0xFF, 0x0) },
+ { SMCA_PSP_V2, HWID_MCATYPE(0xFF, 0x1) },
+
+ /* System Management Unit MCA type */
+ { SMCA_SMU, HWID_MCATYPE(0x01, 0x0) },
+ { SMCA_SMU_V2, HWID_MCATYPE(0x01, 0x1) },
+
+ /* Microprocessor 5 Unit MCA type */
+ { SMCA_MP5, HWID_MCATYPE(0x01, 0x2) },
+
+ /* MPDMA MCA type */
+ { SMCA_MPDMA, HWID_MCATYPE(0x01, 0x3) },
+
+ /* Northbridge IO Unit MCA type */
+ { SMCA_NBIO, HWID_MCATYPE(0x18, 0x0) },
+
+ /* PCI Express Unit MCA type */
+ { SMCA_PCIE, HWID_MCATYPE(0x46, 0x0) },
+ { SMCA_PCIE_V2, HWID_MCATYPE(0x46, 0x1) },
+
+ { SMCA_XGMI_PCS, HWID_MCATYPE(0x50, 0x0) },
+ { SMCA_NBIF, HWID_MCATYPE(0x6C, 0x0) },
+ { SMCA_SHUB, HWID_MCATYPE(0x80, 0x0) },
+ { SMCA_SATA, HWID_MCATYPE(0xA8, 0x0) },
+ { SMCA_USB, HWID_MCATYPE(0xAA, 0x0) },
+ { SMCA_USR_DP, HWID_MCATYPE(0x170, 0x0) },
+ { SMCA_USR_CP, HWID_MCATYPE(0x180, 0x0) },
+ { SMCA_GMI_PCS, HWID_MCATYPE(0x241, 0x0) },
+ { SMCA_XGMI_PHY, HWID_MCATYPE(0x259, 0x0) },
+ { SMCA_WAFL_PHY, HWID_MCATYPE(0x267, 0x0) },
+ { SMCA_GMI_PHY, HWID_MCATYPE(0x269, 0x0) },
+};
+
+/*
+ * In SMCA enabled processors, we can have multiple banks for a given IP type.
+ * So to define a unique name for each bank, we use a temp c-string to append
+ * the MCA_IPID[InstanceId] to type's name in get_name().
+ *
+ * InstanceId is 32 bits which is 8 characters. Make sure MAX_MCATYPE_NAME_LEN
+ * is greater than 8 plus 1 (for underscore) plus length of longest type name.
+ */
+#define MAX_MCATYPE_NAME_LEN 30
+static char buf_mcatype[MAX_MCATYPE_NAME_LEN];
+
+struct threshold_block {
+ /* This block's number within its bank. */
+ unsigned int block;
+ /* MCA bank number that contains this block. */
+ unsigned int bank;
+ /* CPU which controls this block's MCA bank. */
+ unsigned int cpu;
+ /* MCA_MISC MSR address for this block. */
+ u32 address;
+ /* Enable/Disable APIC interrupt. */
+ bool interrupt_enable;
+ /* Bank can generate an interrupt. */
+ bool interrupt_capable;
+ /* Value upon which threshold interrupt is generated. */
+ u16 threshold_limit;
+ /* sysfs object */
+ struct kobject kobj;
+ /* List of threshold blocks within this block's MCA bank. */
+ struct list_head miscj;
+};
+
+struct threshold_bank {
+ struct kobject *kobj;
+ /* List of threshold blocks within this MCA bank. */
+ struct list_head miscj;
+};
+
+static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
+
+/*
+ * A list of the banks enabled on each logical CPU. Controls which respective
+ * descriptors to initialize later in mce_threshold_create_device().
+ */
+static DEFINE_PER_CPU(u64, bank_map);
+
+static void amd_threshold_interrupt(void);
+static void amd_deferred_error_interrupt(void);
+
+static void default_deferred_error_interrupt(void)
+{
+ pr_err("Unexpected deferred interrupt at vector %x\n", DEFERRED_ERROR_VECTOR);
+}
+void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt;
+
+static void smca_configure(unsigned int bank, unsigned int cpu)
+{
+ struct mce_amd_cpu_data *data = this_cpu_ptr(&mce_amd_data);
+ u8 *bank_counts = this_cpu_ptr(smca_bank_counts);
+ const struct smca_hwid *s_hwid;
+ unsigned int i, hwid_mcatype;
+ u32 high, low;
+ u32 smca_config = MSR_AMD64_SMCA_MCx_CONFIG(bank);
+
+ /* Set appropriate bits in MCA_CONFIG */
+ if (!rdmsr_safe(smca_config, &low, &high)) {
+ /*
+ * OS is required to set the MCAX bit to acknowledge that it is
+ * now using the new MSR ranges and new registers under each
+ * bank. It also means that the OS will configure deferred
+ * errors in the new MCx_CONFIG register. If the bit is not set,
+ * uncorrectable errors will cause a system panic.
+ *
+ * MCA_CONFIG[MCAX] is bit 32 (0 in the high portion of the MSR.)
+ */
+ high |= BIT(0);
+
+ /*
+ * SMCA sets the Deferred Error Interrupt type per bank.
+ *
+ * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us
+ * if the DeferredIntType bit field is available.
+ *
+ * MCA_CONFIG[DeferredIntType] is bits [38:37] ([6:5] in the
+ * high portion of the MSR). OS should set this to 0x1 to enable
+ * APIC based interrupt. First, check that no interrupt has been
+ * set.
+ */
+ if ((low & BIT(5)) && !((high >> 5) & 0x3) && data->dfr_intr_en) {
+ __set_bit(bank, data->dfr_intr_banks);
+ high |= BIT(5);
+ }
+
+ /*
+ * SMCA Corrected Error Interrupt
+ *
+ * MCA_CONFIG[IntPresent] is bit 10, and tells us if the bank can
+ * send an MCA Thresholding interrupt without the OS initializing
+ * this feature. This can be used if the threshold limit is managed
+ * by the platform.
+ *
+ * MCA_CONFIG[IntEn] is bit 40 (8 in the high portion of the MSR).
+ * The OS should set this to inform the platform that the OS is ready
+ * to handle the MCA Thresholding interrupt.
+ */
+ if ((low & BIT(10)) && data->thr_intr_en) {
+ __set_bit(bank, data->thr_intr_banks);
+ high |= BIT(8);
+ }
+
+ this_cpu_ptr(mce_banks_array)[bank].lsb_in_status = !!(low & BIT(8));
+
+ if (low & MCI_CONFIG_PADDRV)
+ this_cpu_ptr(smca_banks)[bank].paddrv = 1;
+
+ wrmsr(smca_config, low, high);
+ }
+
+ if (rdmsr_safe(MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) {
+ pr_warn("Failed to read MCA_IPID for bank %d\n", bank);
+ return;
+ }
+
+ hwid_mcatype = HWID_MCATYPE(high & MCI_IPID_HWID,
+ (high & MCI_IPID_MCATYPE) >> 16);
+
+ for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
+ s_hwid = &smca_hwid_mcatypes[i];
+
+ if (hwid_mcatype == s_hwid->hwid_mcatype) {
+ this_cpu_ptr(smca_banks)[bank].hwid = s_hwid;
+ this_cpu_ptr(smca_banks)[bank].id = low;
+ this_cpu_ptr(smca_banks)[bank].sysfs_id = bank_counts[s_hwid->bank_type]++;
+ break;
+ }
+ }
+}
+
+struct thresh_restart {
+ struct threshold_block *b;
+ int set_lvt_off;
+ int lvt_off;
+ u16 old_limit;
+};
+
+static const char *bank4_names(const struct threshold_block *b)
+{
+ switch (b->address) {
+ /* MSR4_MISC0 */
+ case 0x00000413:
+ return "dram";
+
+ case 0xc0000408:
+ return "ht_links";
+
+ case 0xc0000409:
+ return "l3_cache";
+
+ default:
+ WARN(1, "Funny MSR: 0x%08x\n", b->address);
+ return "";
+ }
+};
+
+
+static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits)
+{
+ /*
+ * bank 4 supports APIC LVT interrupts implicitly since forever.
+ */
+ if (bank == 4)
+ return true;
+
+ /*
+ * IntP: interrupt present; if this bit is set, the thresholding
+ * bank can generate APIC LVT interrupts
+ */
+ return msr_high_bits & BIT(28);
+}
+
+static bool lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
+{
+ int msr = (hi & MASK_LVTOFF_HI) >> 20;
+
+ /*
+ * On SMCA CPUs, LVT offset is programmed at a different MSR, and
+ * the BIOS provides the value. The original field where LVT offset
+ * was set is reserved. Return early here:
+ */
+ if (mce_flags.smca)
+ return false;
+
+ if (apic < 0) {
+ pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt "
+ "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu,
+ b->bank, b->block, b->address, hi, lo);
+ return false;
+ }
+
+ if (apic != msr) {
+ pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d "
+ "for bank %d, block %d (MSR%08X=0x%x%08x)\n",
+ b->cpu, apic, b->bank, b->block, b->address, hi, lo);
+ return false;
+ }
+
+ return true;
+};
+
+/* Reprogram MCx_MISC MSR behind this threshold block. */
+static void threshold_restart_block(void *_tr)
+{
+ struct thresh_restart *tr = _tr;
+ u32 hi, lo;
+
+ /* sysfs write might race against an offline operation */
+ if (!this_cpu_read(threshold_banks) && !tr->set_lvt_off)
+ return;
+
+ rdmsr(tr->b->address, lo, hi);
+
+ /*
+ * Reset error count and overflow bit.
+ * This is done during init or after handling an interrupt.
+ */
+ if (hi & MASK_OVERFLOW_HI || tr->set_lvt_off) {
+ hi &= ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI);
+ hi |= THRESHOLD_MAX - tr->b->threshold_limit;
+ } else if (tr->old_limit) { /* change limit w/o reset */
+ int new_count = (hi & THRESHOLD_MAX) +
+ (tr->old_limit - tr->b->threshold_limit);
+
+ hi = (hi & ~MASK_ERR_COUNT_HI) |
+ (new_count & THRESHOLD_MAX);
+ }
+
+ /* clear IntType */
+ hi &= ~MASK_INT_TYPE_HI;
+
+ if (!tr->b->interrupt_capable)
+ goto done;
+
+ if (tr->set_lvt_off) {
+ if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) {
+ /* set new lvt offset */
+ hi &= ~MASK_LVTOFF_HI;
+ hi |= tr->lvt_off << 20;
+ }
+ }
+
+ if (tr->b->interrupt_enable)
+ hi |= INT_TYPE_APIC;
+
+ done:
+
+ hi |= MASK_COUNT_EN_HI;
+ wrmsr(tr->b->address, lo, hi);
+}
+
+static void threshold_restart_bank(unsigned int bank, bool intr_en)
+{
+ struct threshold_bank **thr_banks = this_cpu_read(threshold_banks);
+ struct threshold_block *block, *tmp;
+ struct thresh_restart tr;
+
+ if (!thr_banks || !thr_banks[bank])
+ return;
+
+ memset(&tr, 0, sizeof(tr));
+
+ list_for_each_entry_safe(block, tmp, &thr_banks[bank]->miscj, miscj) {
+ tr.b = block;
+ tr.b->interrupt_enable = intr_en;
+ threshold_restart_block(&tr);
+ }
+}
+
+/* Try to use the threshold limit reported through APEI. */
+static u16 get_thr_limit(void)
+{
+ u32 thr_limit = mce_get_apei_thr_limit();
+
+ /* Fallback to old default if APEI limit is not available. */
+ if (!thr_limit)
+ return THRESHOLD_MAX;
+
+ return min(thr_limit, THRESHOLD_MAX);
+}
+
+static void mce_threshold_block_init(struct threshold_block *b, int offset)
+{
+ struct thresh_restart tr = {
+ .b = b,
+ .set_lvt_off = 1,
+ .lvt_off = offset,
+ };
+
+ b->threshold_limit = get_thr_limit();
+ threshold_restart_block(&tr);
+};
+
+static int setup_APIC_mce_threshold(int reserved, int new)
+{
+ if (reserved < 0 && !setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR,
+ APIC_EILVT_MSG_FIX, 0))
+ return new;
+
+ return reserved;
+}
+
+static u32 get_block_address(u32 current_addr, u32 low, u32 high,
+ unsigned int bank, unsigned int block,
+ unsigned int cpu)
+{
+ u32 addr = 0, offset = 0;
+
+ if ((bank >= per_cpu(mce_num_banks, cpu)) || (block >= NR_BLOCKS))
+ return addr;
+
+ if (mce_flags.smca) {
+ if (!block)
+ return MSR_AMD64_SMCA_MCx_MISC(bank);
+
+ if (!(low & MASK_BLKPTR_LO))
+ return 0;
+
+ return MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1);
+ }
+
+ /* Fall back to method we used for older processors: */
+ switch (block) {
+ case 0:
+ addr = mca_msr_reg(bank, MCA_MISC);
+ break;
+ case 1:
+ offset = ((low & MASK_BLKPTR_LO) >> 21);
+ if (offset)
+ addr = MCG_XBLK_ADDR + offset;
+ break;
+ default:
+ addr = ++current_addr;
+ }
+ return addr;
+}
+
+static int prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
+ int offset, u32 misc_high)
+{
+ unsigned int cpu = smp_processor_id();
+ struct threshold_block b;
+ int new;
+
+ if (!block)
+ per_cpu(bank_map, cpu) |= BIT_ULL(bank);
+
+ memset(&b, 0, sizeof(b));
+ b.cpu = cpu;
+ b.bank = bank;
+ b.block = block;
+ b.address = addr;
+ b.interrupt_capable = lvt_interrupt_supported(bank, misc_high);
+
+ if (!b.interrupt_capable)
+ goto done;
+
+ __set_bit(bank, this_cpu_ptr(&mce_amd_data)->thr_intr_banks);
+ b.interrupt_enable = 1;
+
+ if (mce_flags.smca)
+ goto done;
+
+ new = (misc_high & MASK_LVTOFF_HI) >> 20;
+ offset = setup_APIC_mce_threshold(offset, new);
+ if (offset == new)
+ thresholding_irq_en = true;
+
+done:
+ mce_threshold_block_init(&b, offset);
+
+ return offset;
+}
+
+bool amd_filter_mce(struct mce *m)
+{
+ enum smca_bank_types bank_type = smca_get_bank_type(m->extcpu, m->bank);
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ /* See Family 17h Models 10h-2Fh Erratum #1114. */
+ if (c->x86 == 0x17 &&
+ c->x86_model >= 0x10 && c->x86_model <= 0x2F &&
+ bank_type == SMCA_IF && XEC(m->status, 0x3f) == 10)
+ return true;
+
+ /* NB GART TLB error reporting is disabled by default. */
+ if (c->x86 < 0x17) {
+ if (m->bank == 4 && XEC(m->status, 0x1f) == 0x5)
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Turn off thresholding banks for the following conditions:
+ * - MC4_MISC thresholding is not supported on Family 0x15.
+ * - Prevent possible spurious interrupts from the IF bank on Family 0x17
+ * Models 0x10-0x2F due to Erratum #1114.
+ */
+static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank)
+{
+ int i, num_msrs;
+ u64 hwcr;
+ bool need_toggle;
+ u32 msrs[NR_BLOCKS];
+
+ if (c->x86 == 0x15 && bank == 4) {
+ msrs[0] = 0x00000413; /* MC4_MISC0 */
+ msrs[1] = 0xc0000408; /* MC4_MISC1 */
+ num_msrs = 2;
+ } else if (c->x86 == 0x17 &&
+ (c->x86_model >= 0x10 && c->x86_model <= 0x2F)) {
+
+ if (smca_get_bank_type(smp_processor_id(), bank) != SMCA_IF)
+ return;
+
+ msrs[0] = MSR_AMD64_SMCA_MCx_MISC(bank);
+ num_msrs = 1;
+ } else {
+ return;
+ }
+
+ rdmsrq(MSR_K7_HWCR, hwcr);
+
+ /* McStatusWrEn has to be set */
+ need_toggle = !(hwcr & BIT(18));
+ if (need_toggle)
+ wrmsrq(MSR_K7_HWCR, hwcr | BIT(18));
+
+ /* Clear CntP bit safely */
+ for (i = 0; i < num_msrs; i++)
+ msr_clear_bit(msrs[i], 62);
+
+ /* restore old settings */
+ if (need_toggle)
+ wrmsrq(MSR_K7_HWCR, hwcr);
+}
+
+static void amd_apply_cpu_quirks(struct cpuinfo_x86 *c)
+{
+ struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
+
+ /* This should be disabled by the BIOS, but isn't always */
+ if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) {
+ /*
+ * disable GART TBL walk error reporting, which
+ * trips off incorrectly with the IOMMU & 3ware
+ * & Cerberus:
+ */
+ clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
+ }
+
+ /*
+ * Various K7s with broken bank 0 around. Always disable
+ * by default.
+ */
+ if (c->x86 == 6 && this_cpu_read(mce_num_banks))
+ mce_banks[0].ctl = 0;
+}
+
+/*
+ * Enable the APIC LVT interrupt vectors once per-CPU. This should be done before hardware is
+ * ready to send interrupts.
+ *
+ * Individual error sources are enabled later during per-bank init.
+ */
+static void smca_enable_interrupt_vectors(void)
+{
+ struct mce_amd_cpu_data *data = this_cpu_ptr(&mce_amd_data);
+ u64 mca_intr_cfg, offset;
+
+ if (!mce_flags.smca || !mce_flags.succor)
+ return;
+
+ if (rdmsrq_safe(MSR_CU_DEF_ERR, &mca_intr_cfg))
+ return;
+
+ offset = (mca_intr_cfg & SMCA_THR_LVT_OFF) >> 12;
+ if (!setup_APIC_eilvt(offset, THRESHOLD_APIC_VECTOR, APIC_EILVT_MSG_FIX, 0))
+ data->thr_intr_en = 1;
+
+ offset = (mca_intr_cfg & MASK_DEF_LVTOFF) >> 4;
+ if (!setup_APIC_eilvt(offset, DEFERRED_ERROR_VECTOR, APIC_EILVT_MSG_FIX, 0))
+ data->dfr_intr_en = 1;
+}
+
+/* cpu init entry point, called from mce.c with preempt off */
+void mce_amd_feature_init(struct cpuinfo_x86 *c)
+{
+ unsigned int bank, block, cpu = smp_processor_id();
+ u32 low = 0, high = 0, address = 0;
+ int offset = -1;
+
+ amd_apply_cpu_quirks(c);
+
+ mce_flags.amd_threshold = 1;
+
+ smca_enable_interrupt_vectors();
+
+ for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) {
+ if (mce_flags.smca) {
+ smca_configure(bank, cpu);
+
+ if (!this_cpu_ptr(&mce_amd_data)->thr_intr_en)
+ continue;
+ }
+
+ disable_err_thresholding(c, bank);
+
+ for (block = 0; block < NR_BLOCKS; ++block) {
+ address = get_block_address(address, low, high, bank, block, cpu);
+ if (!address)
+ break;
+
+ if (rdmsr_safe(address, &low, &high))
+ break;
+
+ if (!(high & MASK_VALID_HI))
+ continue;
+
+ if (!(high & MASK_CNTP_HI) ||
+ (high & MASK_LOCKED_HI))
+ continue;
+
+ offset = prepare_threshold_block(bank, block, address, offset, high);
+ }
+ }
+}
+
+void smca_bsp_init(void)
+{
+ mce_threshold_vector = amd_threshold_interrupt;
+ deferred_error_int_vector = amd_deferred_error_interrupt;
+}
+
+/*
+ * DRAM ECC errors are reported in the Northbridge (bank 4) with
+ * Extended Error Code 8.
+ */
+static bool legacy_mce_is_memory_error(struct mce *m)
+{
+ return m->bank == 4 && XEC(m->status, 0x1f) == 8;
+}
+
+/*
+ * DRAM ECC errors are reported in Unified Memory Controllers with
+ * Extended Error Code 0.
+ */
+static bool smca_mce_is_memory_error(struct mce *m)
+{
+ enum smca_bank_types bank_type;
+
+ if (XEC(m->status, 0x3f))
+ return false;
+
+ bank_type = smca_get_bank_type(m->extcpu, m->bank);
+
+ return bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2;
+}
+
+bool amd_mce_is_memory_error(struct mce *m)
+{
+ if (mce_flags.smca)
+ return smca_mce_is_memory_error(m);
+ else
+ return legacy_mce_is_memory_error(m);
+}
+
+/*
+ * Some AMD systems have an explicit indicator that the value in MCA_ADDR is a
+ * system physical address. Individual cases though, need to be detected for
+ * other systems. Future cases will be added as needed.
+ *
+ * 1) General case
+ * a) Assume address is not usable.
+ * 2) Poison errors
+ * a) Indicated by MCA_STATUS[43]: poison. Defined for all banks except legacy
+ * northbridge (bank 4).
+ * b) Refers to poison consumption in the core. Does not include "no action",
+ * "action optional", or "deferred" error severities.
+ * c) Will include a usable address so that immediate action can be taken.
+ * 3) Northbridge DRAM ECC errors
+ * a) Reported in legacy bank 4 with extended error code (XEC) 8.
+ * b) MCA_STATUS[43] is *not* defined as poison in legacy bank 4. Therefore,
+ * this bit should not be checked.
+ * 4) MCI_STATUS_PADDRVAL is set
+ * a) Will provide a valid system physical address.
+ *
+ * NOTE: SMCA UMC memory errors fall into case #1.
+ */
+bool amd_mce_usable_address(struct mce *m)
+{
+ /* Check special northbridge case 3) first. */
+ if (!mce_flags.smca) {
+ if (legacy_mce_is_memory_error(m))
+ return true;
+ else if (m->bank == 4)
+ return false;
+ }
+
+ if (this_cpu_ptr(smca_banks)[m->bank].paddrv)
+ return m->status & MCI_STATUS_PADDRV;
+
+ /* Check poison bit for all other bank types. */
+ if (m->status & MCI_STATUS_POISON)
+ return true;
+
+ /* Assume address is not usable for all others. */
+ return false;
+}
+
+DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error)
+{
+ trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR);
+ inc_irq_stat(irq_deferred_error_count);
+ deferred_error_int_vector();
+ trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR);
+ apic_eoi();
+}
+
+/* APIC interrupt handler for deferred errors */
+static void amd_deferred_error_interrupt(void)
+{
+ machine_check_poll(MCP_TIMESTAMP, &this_cpu_ptr(&mce_amd_data)->dfr_intr_banks);
+}
+
+void mce_amd_handle_storm(unsigned int bank, bool on)
+{
+ threshold_restart_bank(bank, on);
+}
+
+static void amd_reset_thr_limit(unsigned int bank)
+{
+ threshold_restart_bank(bank, true);
+}
+
+/*
+ * Threshold interrupt handler will service THRESHOLD_APIC_VECTOR. The interrupt
+ * goes off when error_count reaches threshold_limit.
+ */
+static void amd_threshold_interrupt(void)
+{
+ machine_check_poll(MCP_TIMESTAMP, &this_cpu_ptr(&mce_amd_data)->thr_intr_banks);
+}
+
+void amd_clear_bank(struct mce *m)
+{
+ amd_reset_thr_limit(m->bank);
+
+ /* Clear MCA_DESTAT for all deferred errors even those logged in MCA_STATUS. */
+ if (m->status & MCI_STATUS_DEFERRED)
+ mce_wrmsrq(MSR_AMD64_SMCA_MCx_DESTAT(m->bank), 0);
+
+ /* Don't clear MCA_STATUS if MCA_DESTAT was used exclusively. */
+ if (m->kflags & MCE_CHECK_DFR_REGS)
+ return;
+
+ mce_wrmsrq(mca_msr_reg(m->bank, MCA_STATUS), 0);
+}
+
+/*
+ * Sysfs Interface
+ */
+
+struct threshold_attr {
+ struct attribute attr;
+ ssize_t (*show) (struct threshold_block *, char *);
+ ssize_t (*store) (struct threshold_block *, const char *, size_t count);
+};
+
+#define SHOW_FIELDS(name) \
+static ssize_t show_ ## name(struct threshold_block *b, char *buf) \
+{ \
+ return sprintf(buf, "%lu\n", (unsigned long) b->name); \
+}
+SHOW_FIELDS(interrupt_enable)
+SHOW_FIELDS(threshold_limit)
+
+static ssize_t
+store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
+{
+ struct thresh_restart tr;
+ unsigned long new;
+
+ if (!b->interrupt_capable)
+ return -EINVAL;
+
+ if (kstrtoul(buf, 0, &new) < 0)
+ return -EINVAL;
+
+ b->interrupt_enable = !!new;
+
+ memset(&tr, 0, sizeof(tr));
+ tr.b = b;
+
+ if (smp_call_function_single(b->cpu, threshold_restart_block, &tr, 1))
+ return -ENODEV;
+
+ return size;
+}
+
+static ssize_t
+store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
+{
+ struct thresh_restart tr;
+ unsigned long new;
+
+ if (kstrtoul(buf, 0, &new) < 0)
+ return -EINVAL;
+
+ if (new > THRESHOLD_MAX)
+ new = THRESHOLD_MAX;
+ if (new < 1)
+ new = 1;
+
+ memset(&tr, 0, sizeof(tr));
+ tr.old_limit = b->threshold_limit;
+ b->threshold_limit = new;
+ tr.b = b;
+
+ if (smp_call_function_single(b->cpu, threshold_restart_block, &tr, 1))
+ return -ENODEV;
+
+ return size;
+}
+
+static ssize_t show_error_count(struct threshold_block *b, char *buf)
+{
+ u32 lo, hi;
+
+ /* CPU might be offline by now */
+ if (rdmsr_on_cpu(b->cpu, b->address, &lo, &hi))
+ return -ENODEV;
+
+ return sprintf(buf, "%u\n", ((hi & THRESHOLD_MAX) -
+ (THRESHOLD_MAX - b->threshold_limit)));
+}
+
+static struct threshold_attr error_count = {
+ .attr = {.name = __stringify(error_count), .mode = 0444 },
+ .show = show_error_count,
+};
+
+#define RW_ATTR(val) \
+static struct threshold_attr val = { \
+ .attr = {.name = __stringify(val), .mode = 0644 }, \
+ .show = show_## val, \
+ .store = store_## val, \
+};
+
+RW_ATTR(interrupt_enable);
+RW_ATTR(threshold_limit);
+
+static struct attribute *default_attrs[] = {
+ &threshold_limit.attr,
+ &error_count.attr,
+ NULL, /* possibly interrupt_enable if supported, see below */
+ NULL,
+};
+ATTRIBUTE_GROUPS(default);
+
+#define to_block(k) container_of(k, struct threshold_block, kobj)
+#define to_attr(a) container_of(a, struct threshold_attr, attr)
+
+static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+ struct threshold_block *b = to_block(kobj);
+ struct threshold_attr *a = to_attr(attr);
+ ssize_t ret;
+
+ ret = a->show ? a->show(b, buf) : -EIO;
+
+ return ret;
+}
+
+static ssize_t store(struct kobject *kobj, struct attribute *attr,
+ const char *buf, size_t count)
+{
+ struct threshold_block *b = to_block(kobj);
+ struct threshold_attr *a = to_attr(attr);
+ ssize_t ret;
+
+ ret = a->store ? a->store(b, buf, count) : -EIO;
+
+ return ret;
+}
+
+static const struct sysfs_ops threshold_ops = {
+ .show = show,
+ .store = store,
+};
+
+static void threshold_block_release(struct kobject *kobj);
+
+static const struct kobj_type threshold_ktype = {
+ .sysfs_ops = &threshold_ops,
+ .default_groups = default_groups,
+ .release = threshold_block_release,
+};
+
+static const char *get_name(unsigned int cpu, unsigned int bank, struct threshold_block *b)
+{
+ enum smca_bank_types bank_type;
+
+ if (!mce_flags.smca) {
+ if (b && bank == 4)
+ return bank4_names(b);
+
+ return th_names[bank];
+ }
+
+ bank_type = smca_get_bank_type(cpu, bank);
+
+ if (b && (bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2)) {
+ if (b->block < ARRAY_SIZE(smca_umc_block_names))
+ return smca_umc_block_names[b->block];
+ }
+
+ if (b && b->block) {
+ snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN, "th_block_%u", b->block);
+ return buf_mcatype;
+ }
+
+ if (bank_type >= N_SMCA_BANK_TYPES) {
+ snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN, "th_bank_%u", bank);
+ return buf_mcatype;
+ }
+
+ if (per_cpu(smca_bank_counts, cpu)[bank_type] == 1)
+ return smca_get_name(bank_type);
+
+ snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN,
+ "%s_%u", smca_get_name(bank_type),
+ per_cpu(smca_banks, cpu)[bank].sysfs_id);
+ return buf_mcatype;
+}
+
+static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb,
+ unsigned int bank, unsigned int block,
+ u32 address)
+{
+ struct threshold_block *b = NULL;
+ u32 low, high;
+ int err;
+
+ if ((bank >= this_cpu_read(mce_num_banks)) || (block >= NR_BLOCKS))
+ return 0;
+
+ if (rdmsr_safe(address, &low, &high))
+ return 0;
+
+ if (!(high & MASK_VALID_HI)) {
+ if (block)
+ goto recurse;
+ else
+ return 0;
+ }
+
+ if (!(high & MASK_CNTP_HI) ||
+ (high & MASK_LOCKED_HI))
+ goto recurse;
+
+ b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL);
+ if (!b)
+ return -ENOMEM;
+
+ b->block = block;
+ b->bank = bank;
+ b->cpu = cpu;
+ b->address = address;
+ b->interrupt_enable = 0;
+ b->interrupt_capable = lvt_interrupt_supported(bank, high);
+ b->threshold_limit = get_thr_limit();
+
+ if (b->interrupt_capable) {
+ default_attrs[2] = &interrupt_enable.attr;
+ b->interrupt_enable = 1;
+ } else {
+ default_attrs[2] = NULL;
+ }
+
+ list_add(&b->miscj, &tb->miscj);
+
+ mce_threshold_block_init(b, (high & MASK_LVTOFF_HI) >> 20);
+
+ err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(cpu, bank, b));
+ if (err)
+ goto out_free;
+recurse:
+ address = get_block_address(address, low, high, bank, ++block, cpu);
+ if (!address)
+ return 0;
+
+ err = allocate_threshold_blocks(cpu, tb, bank, block, address);
+ if (err)
+ goto out_free;
+
+ if (b)
+ kobject_uevent(&b->kobj, KOBJ_ADD);
+
+ return 0;
+
+out_free:
+ if (b) {
+ list_del(&b->miscj);
+ kobject_put(&b->kobj);
+ }
+ return err;
+}
+
+static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu,
+ unsigned int bank)
+{
+ struct device *dev = this_cpu_read(mce_device);
+ struct threshold_bank *b = NULL;
+ const char *name = get_name(cpu, bank, NULL);
+ int err = 0;
+
+ if (!dev)
+ return -ENODEV;
+
+ b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);
+ if (!b) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /* Associate the bank with the per-CPU MCE device */
+ b->kobj = kobject_create_and_add(name, &dev->kobj);
+ if (!b->kobj) {
+ err = -EINVAL;
+ goto out_free;
+ }
+
+ INIT_LIST_HEAD(&b->miscj);
+
+ err = allocate_threshold_blocks(cpu, b, bank, 0, mca_msr_reg(bank, MCA_MISC));
+ if (err)
+ goto out_kobj;
+
+ bp[bank] = b;
+ return 0;
+
+out_kobj:
+ kobject_put(b->kobj);
+out_free:
+ kfree(b);
+out:
+ return err;
+}
+
+static void threshold_block_release(struct kobject *kobj)
+{
+ kfree(to_block(kobj));
+}
+
+static void threshold_remove_bank(struct threshold_bank *bank)
+{
+ struct threshold_block *pos, *tmp;
+
+ list_for_each_entry_safe(pos, tmp, &bank->miscj, miscj) {
+ list_del(&pos->miscj);
+ kobject_put(&pos->kobj);
+ }
+
+ kobject_put(bank->kobj);
+ kfree(bank);
+}
+
+static void __threshold_remove_device(struct threshold_bank **bp)
+{
+ unsigned int bank, numbanks = this_cpu_read(mce_num_banks);
+
+ for (bank = 0; bank < numbanks; bank++) {
+ if (!bp[bank])
+ continue;
+
+ threshold_remove_bank(bp[bank]);
+ bp[bank] = NULL;
+ }
+ kfree(bp);
+}
+
+void mce_threshold_remove_device(unsigned int cpu)
+{
+ struct threshold_bank **bp = this_cpu_read(threshold_banks);
+
+ if (!bp)
+ return;
+
+ /*
+ * Clear the pointer before cleaning up, so that the interrupt won't
+ * touch anything of this.
+ */
+ this_cpu_write(threshold_banks, NULL);
+
+ __threshold_remove_device(bp);
+ return;
+}
+
+/**
+ * mce_threshold_create_device - Create the per-CPU MCE threshold device
+ * @cpu: The plugged in CPU
+ *
+ * Create directories and files for all valid threshold banks.
+ *
+ * This is invoked from the CPU hotplug callback which was installed in
+ * mcheck_init_device(). The invocation happens in context of the hotplug
+ * thread running on @cpu. The callback is invoked on all CPUs which are
+ * online when the callback is installed or during a real hotplug event.
+ */
+void mce_threshold_create_device(unsigned int cpu)
+{
+ unsigned int numbanks, bank;
+ struct threshold_bank **bp;
+
+ if (!mce_flags.amd_threshold)
+ return;
+
+ bp = this_cpu_read(threshold_banks);
+ if (bp)
+ return;
+
+ numbanks = this_cpu_read(mce_num_banks);
+ bp = kcalloc(numbanks, sizeof(*bp), GFP_KERNEL);
+ if (!bp)
+ return;
+
+ for (bank = 0; bank < numbanks; ++bank) {
+ if (!(this_cpu_read(bank_map) & BIT_ULL(bank)))
+ continue;
+ if (threshold_create_bank(bp, cpu, bank)) {
+ __threshold_remove_device(bp);
+ return;
+ }
+ }
+ this_cpu_write(threshold_banks, bp);
+
+ if (thresholding_irq_en)
+ mce_threshold_vector = amd_threshold_interrupt;
+ return;
+}
diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c
new file mode 100644
index 000000000000..0a89947e47bc
--- /dev/null
+++ b/arch/x86/kernel/cpu/mce/apei.c
@@ -0,0 +1,265 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Bridge between MCE and APEI
+ *
+ * On some machine, corrected memory errors are reported via APEI
+ * generic hardware error source (GHES) instead of corrected Machine
+ * Check. These corrected memory errors can be reported to user space
+ * through /dev/mcelog via faking a corrected Machine Check, so that
+ * the error memory page can be offlined by /sbin/mcelog if the error
+ * count for one page is beyond the threshold.
+ *
+ * For fatal MCE, save MCE record into persistent storage via ERST, so
+ * that the MCE record can be logged after reboot via ERST.
+ *
+ * Copyright 2010 Intel Corp.
+ * Author: Huang Ying <ying.huang@intel.com>
+ */
+
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/acpi.h>
+#include <linux/cper.h>
+#include <acpi/apei.h>
+#include <acpi/ghes.h>
+#include <asm/mce.h>
+
+#include "internal.h"
+
+void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
+{
+ struct mce_hw_err err;
+ struct mce *m;
+ int lsb;
+
+ if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
+ return;
+
+ /*
+ * Even if the ->validation_bits are set for address mask,
+ * to be extra safe, check and reject an error radius '0',
+ * and fall back to the default page size.
+ */
+ if (mem_err->validation_bits & CPER_MEM_VALID_PA_MASK)
+ lsb = find_first_bit((void *)&mem_err->physical_addr_mask, PAGE_SHIFT);
+ else
+ lsb = PAGE_SHIFT;
+
+ mce_prep_record(&err);
+ m = &err.m;
+ m->bank = -1;
+ /* Fake a memory read error with unknown channel */
+ m->status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | MCI_STATUS_MISCV | 0x9f;
+ m->misc = (MCI_MISC_ADDR_PHYS << 6) | lsb;
+
+ if (severity >= GHES_SEV_RECOVERABLE)
+ m->status |= MCI_STATUS_UC;
+
+ if (severity >= GHES_SEV_PANIC) {
+ m->status |= MCI_STATUS_PCC;
+ m->tsc = rdtsc();
+ }
+
+ m->addr = mem_err->physical_addr;
+ mce_log(&err);
+}
+EXPORT_SYMBOL_GPL(apei_mce_report_mem_error);
+
+int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id)
+{
+ const u64 *i_mce = ((const u64 *) (ctx_info + 1));
+ unsigned int cpu, num_regs;
+ bool apicid_found = false;
+ struct mce_hw_err err;
+ struct mce *m;
+
+ if (!boot_cpu_has(X86_FEATURE_SMCA))
+ return -EINVAL;
+
+ /*
+ * The starting address of the register array extracted from BERT must
+ * match with the first expected register in the register layout of
+ * SMCA address space. This address corresponds to banks's MCA_STATUS
+ * register.
+ *
+ * Match any MCi_STATUS register by turning off bank numbers.
+ */
+ if ((ctx_info->msr_addr & MSR_AMD64_SMCA_MC0_STATUS) !=
+ MSR_AMD64_SMCA_MC0_STATUS)
+ return -EINVAL;
+
+ /*
+ * The number of registers in the register array is determined by
+ * Register Array Size/8 as defined in UEFI spec v2.8, sec N.2.4.2.2.
+ * Sanity-check registers array size.
+ */
+ num_regs = ctx_info->reg_arr_size >> 3;
+ if (!num_regs)
+ return -EINVAL;
+
+ for_each_possible_cpu(cpu) {
+ if (cpu_data(cpu).topo.initial_apicid == lapic_id) {
+ apicid_found = true;
+ break;
+ }
+ }
+
+ if (!apicid_found)
+ return -EINVAL;
+
+ m = &err.m;
+ memset(&err, 0, sizeof(struct mce_hw_err));
+ mce_prep_record_common(m);
+ mce_prep_record_per_cpu(cpu, m);
+
+ m->bank = (ctx_info->msr_addr >> 4) & 0xFF;
+
+ /*
+ * The SMCA register layout is fixed and includes 16 registers.
+ * The end of the array may be variable, but the beginning is known.
+ * Cap the number of registers to expected max (15).
+ */
+ if (num_regs > 15)
+ num_regs = 15;
+
+ switch (num_regs) {
+ /* MCA_SYND2 */
+ case 15:
+ err.vendor.amd.synd2 = *(i_mce + 14);
+ fallthrough;
+ /* MCA_SYND1 */
+ case 14:
+ err.vendor.amd.synd1 = *(i_mce + 13);
+ fallthrough;
+ /* MCA_MISC4 */
+ case 13:
+ /* MCA_MISC3 */
+ case 12:
+ /* MCA_MISC2 */
+ case 11:
+ /* MCA_MISC1 */
+ case 10:
+ /* MCA_DEADDR */
+ case 9:
+ /* MCA_DESTAT */
+ case 8:
+ /* reserved */
+ case 7:
+ /* MCA_SYND */
+ case 6:
+ m->synd = *(i_mce + 5);
+ fallthrough;
+ /* MCA_IPID */
+ case 5:
+ m->ipid = *(i_mce + 4);
+ fallthrough;
+ /* MCA_CONFIG */
+ case 4:
+ /* MCA_MISC0 */
+ case 3:
+ m->misc = *(i_mce + 2);
+ fallthrough;
+ /* MCA_ADDR */
+ case 2:
+ m->addr = *(i_mce + 1);
+ fallthrough;
+ /* MCA_STATUS */
+ case 1:
+ m->status = *i_mce;
+ }
+
+ mce_log(&err);
+
+ return 0;
+}
+
+#define CPER_CREATOR_MCE \
+ GUID_INIT(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c, \
+ 0x64, 0x90, 0xb8, 0x9d)
+#define CPER_SECTION_TYPE_MCE \
+ GUID_INIT(0xfe08ffbe, 0x95e4, 0x4be7, 0xbc, 0x73, 0x40, 0x96, \
+ 0x04, 0x4a, 0x38, 0xfc)
+
+/*
+ * CPER specification (in UEFI specification 2.3 appendix N) requires
+ * byte-packed.
+ */
+struct cper_mce_record {
+ struct cper_record_header hdr;
+ struct cper_section_descriptor sec_hdr;
+ struct mce mce;
+} __packed;
+
+int apei_write_mce(struct mce *m)
+{
+ struct cper_mce_record rcd;
+
+ memset(&rcd, 0, sizeof(rcd));
+ memcpy(rcd.hdr.signature, CPER_SIG_RECORD, CPER_SIG_SIZE);
+ rcd.hdr.revision = CPER_RECORD_REV;
+ rcd.hdr.signature_end = CPER_SIG_END;
+ rcd.hdr.section_count = 1;
+ rcd.hdr.error_severity = CPER_SEV_FATAL;
+ /* timestamp, platform_id, partition_id are all invalid */
+ rcd.hdr.validation_bits = 0;
+ rcd.hdr.record_length = sizeof(rcd);
+ rcd.hdr.creator_id = CPER_CREATOR_MCE;
+ rcd.hdr.notification_type = CPER_NOTIFY_MCE;
+ rcd.hdr.record_id = cper_next_record_id();
+ rcd.hdr.flags = CPER_HW_ERROR_FLAGS_PREVERR;
+
+ rcd.sec_hdr.section_offset = (void *)&rcd.mce - (void *)&rcd;
+ rcd.sec_hdr.section_length = sizeof(rcd.mce);
+ rcd.sec_hdr.revision = CPER_SEC_REV;
+ /* fru_id and fru_text is invalid */
+ rcd.sec_hdr.validation_bits = 0;
+ rcd.sec_hdr.flags = CPER_SEC_PRIMARY;
+ rcd.sec_hdr.section_type = CPER_SECTION_TYPE_MCE;
+ rcd.sec_hdr.section_severity = CPER_SEV_FATAL;
+
+ memcpy(&rcd.mce, m, sizeof(*m));
+
+ return erst_write(&rcd.hdr);
+}
+
+ssize_t apei_read_mce(struct mce *m, u64 *record_id)
+{
+ struct cper_mce_record rcd;
+ int rc, pos;
+
+ rc = erst_get_record_id_begin(&pos);
+ if (rc)
+ return rc;
+retry:
+ rc = erst_get_record_id_next(&pos, record_id);
+ if (rc)
+ goto out;
+ /* no more record */
+ if (*record_id == APEI_ERST_INVALID_RECORD_ID)
+ goto out;
+ rc = erst_read_record(*record_id, &rcd.hdr, sizeof(rcd), sizeof(rcd),
+ &CPER_CREATOR_MCE);
+ /* someone else has cleared the record, try next one */
+ if (rc == -ENOENT)
+ goto retry;
+ else if (rc < 0)
+ goto out;
+
+ memcpy(m, &rcd.mce, sizeof(*m));
+ rc = sizeof(*m);
+out:
+ erst_get_record_id_end();
+
+ return rc;
+}
+
+/* Check whether there is record in ERST */
+int apei_check_mce(void)
+{
+ return erst_get_record_count();
+}
+
+int apei_clear_mce(u64 record_id)
+{
+ return erst_clear(record_id);
+}
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
new file mode 100644
index 000000000000..34440021e8cf
--- /dev/null
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -0,0 +1,2970 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Machine check handler.
+ *
+ * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ * Rest from unknown author(s).
+ * 2004 Andi Kleen. Rewrote most of it.
+ * Copyright 2008 Intel Corporation
+ * Author: Andi Kleen
+ */
+
+#include <linux/thread_info.h>
+#include <linux/capability.h>
+#include <linux/miscdevice.h>
+#include <linux/ratelimit.h>
+#include <linux/rcupdate.h>
+#include <linux/kobject.h>
+#include <linux/uaccess.h>
+#include <linux/kdebug.h>
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/string.h>
+#include <linux/device.h>
+#include <linux/syscore_ops.h>
+#include <linux/delay.h>
+#include <linux/ctype.h>
+#include <linux/sched.h>
+#include <linux/sysfs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/kmod.h>
+#include <linux/poll.h>
+#include <linux/nmi.h>
+#include <linux/cpu.h>
+#include <linux/ras.h>
+#include <linux/smp.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/debugfs.h>
+#include <linux/irq_work.h>
+#include <linux/export.h>
+#include <linux/set_memory.h>
+#include <linux/sync_core.h>
+#include <linux/task_work.h>
+#include <linux/hardirq.h>
+#include <linux/kexec.h>
+#include <linux/vmcore_info.h>
+
+#include <asm/fred.h>
+#include <asm/cpu_device_id.h>
+#include <asm/processor.h>
+#include <asm/traps.h>
+#include <asm/tlbflush.h>
+#include <asm/mce.h>
+#include <asm/msr.h>
+#include <asm/reboot.h>
+#include <asm/tdx.h>
+
+#include "internal.h"
+
+/* sysfs synchronization */
+static DEFINE_MUTEX(mce_sysfs_mutex);
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/mce.h>
+
+#define SPINUNIT 100 /* 100ns */
+
+DEFINE_PER_CPU(unsigned, mce_exception_count);
+
+DEFINE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks);
+
+DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);
+
+#define ATTR_LEN 16
+/* One object for each MCE bank, shared by all CPUs */
+struct mce_bank_dev {
+ struct device_attribute attr; /* device attribute */
+ char attrname[ATTR_LEN]; /* attribute name */
+ u8 bank; /* bank number */
+};
+static struct mce_bank_dev mce_bank_devs[MAX_NR_BANKS];
+
+struct mce_vendor_flags mce_flags __read_mostly;
+
+struct mca_config mca_cfg __read_mostly = {
+ .bootlog = -1,
+ .monarch_timeout = -1
+};
+
+static DEFINE_PER_CPU(struct mce_hw_err, hw_errs_seen);
+static unsigned long mce_need_notify;
+
+/*
+ * MCA banks polled by the period polling timer for corrected events.
+ * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
+ */
+DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
+ [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
+};
+
+/*
+ * MCA banks controlled through firmware first for corrected errors.
+ * This is a global list of banks for which we won't enable CMCI and we
+ * won't poll. Firmware controls these banks and is responsible for
+ * reporting corrected errors through GHES. Uncorrected/recoverable
+ * errors are still notified through a machine check.
+ */
+mce_banks_t mce_banks_ce_disabled;
+
+static struct work_struct mce_work;
+static struct irq_work mce_irq_work;
+
+/*
+ * CPU/chipset specific EDAC code can register a notifier call here to print
+ * MCE errors in a human-readable form.
+ */
+BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
+
+void mce_prep_record_common(struct mce *m)
+{
+ m->cpuid = cpuid_eax(1);
+ m->cpuvendor = boot_cpu_data.x86_vendor;
+ m->mcgcap = native_rdmsrq(MSR_IA32_MCG_CAP);
+ /* need the internal __ version to avoid deadlocks */
+ m->time = __ktime_get_real_seconds();
+}
+
+void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m)
+{
+ m->cpu = cpu;
+ m->extcpu = cpu;
+ m->apicid = cpu_data(cpu).topo.initial_apicid;
+ m->microcode = cpu_data(cpu).microcode;
+ m->ppin = topology_ppin(cpu);
+ m->socketid = topology_physical_package_id(cpu);
+}
+
+/* Do initial initialization of struct mce_hw_err */
+void mce_prep_record(struct mce_hw_err *err)
+{
+ struct mce *m = &err->m;
+
+ memset(err, 0, sizeof(struct mce_hw_err));
+ mce_prep_record_common(m);
+ mce_prep_record_per_cpu(smp_processor_id(), m);
+}
+
+DEFINE_PER_CPU(struct mce, injectm);
+EXPORT_PER_CPU_SYMBOL_GPL(injectm);
+
+void mce_log(struct mce_hw_err *err)
+{
+ if (mce_gen_pool_add(err))
+ irq_work_queue(&mce_irq_work);
+}
+EXPORT_SYMBOL_GPL(mce_log);
+
+void mce_register_decode_chain(struct notifier_block *nb)
+{
+ if (WARN_ON(nb->priority < MCE_PRIO_LOWEST ||
+ nb->priority > MCE_PRIO_HIGHEST))
+ return;
+
+ blocking_notifier_chain_register(&x86_mce_decoder_chain, nb);
+}
+EXPORT_SYMBOL_GPL(mce_register_decode_chain);
+
+void mce_unregister_decode_chain(struct notifier_block *nb)
+{
+ blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
+}
+EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
+
+static void __print_mce(struct mce_hw_err *err)
+{
+ struct mce *m = &err->m;
+
+ pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
+ m->extcpu,
+ (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""),
+ m->mcgstatus, m->bank, m->status);
+
+ if (m->ip) {
+ pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
+ !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
+ m->cs, m->ip);
+
+ if (m->cs == __KERNEL_CS)
+ pr_cont("{%pS}", (void *)(unsigned long)m->ip);
+ pr_cont("\n");
+ }
+
+ pr_emerg(HW_ERR "TSC %llx ", m->tsc);
+ if (m->addr)
+ pr_cont("ADDR %llx ", m->addr);
+ if (m->misc)
+ pr_cont("MISC %llx ", m->misc);
+ if (m->ppin)
+ pr_cont("PPIN %llx ", m->ppin);
+
+ if (mce_flags.smca) {
+ if (m->synd)
+ pr_cont("SYND %llx ", m->synd);
+ if (err->vendor.amd.synd1)
+ pr_cont("SYND1 %llx ", err->vendor.amd.synd1);
+ if (err->vendor.amd.synd2)
+ pr_cont("SYND2 %llx ", err->vendor.amd.synd2);
+ if (m->ipid)
+ pr_cont("IPID %llx ", m->ipid);
+ }
+
+ pr_cont("\n");
+
+ /*
+ * Note this output is parsed by external tools and old fields
+ * should not be changed.
+ */
+ pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
+ m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
+ m->microcode);
+}
+
+static void print_mce(struct mce_hw_err *err)
+{
+ struct mce *m = &err->m;
+
+ __print_mce(err);
+
+ if (m->cpuvendor != X86_VENDOR_AMD && m->cpuvendor != X86_VENDOR_HYGON)
+ pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
+}
+
+#define PANIC_TIMEOUT 5 /* 5 seconds */
+
+static atomic_t mce_panicked;
+
+static int fake_panic;
+static atomic_t mce_fake_panicked;
+
+/* Panic in progress. Enable interrupts and wait for final IPI */
+static void wait_for_panic(void)
+{
+ long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
+
+ preempt_disable();
+ local_irq_enable();
+ while (timeout-- > 0)
+ udelay(1);
+ if (panic_timeout == 0)
+ panic_timeout = mca_cfg.panic_timeout;
+ panic("Panicing machine check CPU died");
+}
+
+static const char *mce_dump_aux_info(struct mce *m)
+{
+ if (boot_cpu_has_bug(X86_BUG_TDX_PW_MCE))
+ return tdx_dump_mce_info(m);
+
+ return NULL;
+}
+
+static noinstr void mce_panic(const char *msg, struct mce_hw_err *final, char *exp)
+{
+ struct llist_node *pending;
+ struct mce_evt_llist *l;
+ int apei_err = 0;
+ const char *memmsg;
+
+ /*
+ * Allow instrumentation around external facilities usage. Not that it
+ * matters a whole lot since the machine is going to panic anyway.
+ */
+ instrumentation_begin();
+
+ if (!fake_panic) {
+ /*
+ * Make sure only one CPU runs in machine check panic
+ */
+ if (atomic_inc_return(&mce_panicked) > 1)
+ wait_for_panic();
+ barrier();
+
+ bust_spinlocks(1);
+ console_verbose();
+ } else {
+ /* Don't log too much for fake panic */
+ if (atomic_inc_return(&mce_fake_panicked) > 1)
+ goto out;
+ }
+ pending = mce_gen_pool_prepare_records();
+ /* First print corrected ones that are still unlogged */
+ llist_for_each_entry(l, pending, llnode) {
+ struct mce_hw_err *err = &l->err;
+ struct mce *m = &err->m;
+ if (!(m->status & MCI_STATUS_UC)) {
+ print_mce(err);
+ if (!apei_err)
+ apei_err = apei_write_mce(m);
+ }
+ }
+ /* Now print uncorrected but with the final one last */
+ llist_for_each_entry(l, pending, llnode) {
+ struct mce_hw_err *err = &l->err;
+ struct mce *m = &err->m;
+ if (!(m->status & MCI_STATUS_UC))
+ continue;
+ if (!final || mce_cmp(m, &final->m)) {
+ print_mce(err);
+ if (!apei_err)
+ apei_err = apei_write_mce(m);
+ }
+ }
+ if (final) {
+ print_mce(final);
+ if (!apei_err)
+ apei_err = apei_write_mce(&final->m);
+ }
+ if (exp)
+ pr_emerg(HW_ERR "Machine check: %s\n", exp);
+
+ memmsg = mce_dump_aux_info(&final->m);
+ if (memmsg)
+ pr_emerg(HW_ERR "Machine check: %s\n", memmsg);
+
+ if (!fake_panic) {
+ if (panic_timeout == 0)
+ panic_timeout = mca_cfg.panic_timeout;
+
+ /*
+ * Kdump skips the poisoned page in order to avoid
+ * touching the error bits again. Poison the page even
+ * if the error is fatal and the machine is about to
+ * panic.
+ */
+ if (kexec_crash_loaded()) {
+ if (final && (final->m.status & MCI_STATUS_ADDRV)) {
+ struct page *p;
+ p = pfn_to_online_page(final->m.addr >> PAGE_SHIFT);
+ if (p)
+ SetPageHWPoison(p);
+ }
+ }
+ panic(msg);
+ } else
+ pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
+
+out:
+ instrumentation_end();
+}
+
+/* Support code for software error injection */
+
+static int msr_to_offset(u32 msr)
+{
+ unsigned bank = __this_cpu_read(injectm.bank);
+
+ if (msr == mca_cfg.rip_msr)
+ return offsetof(struct mce, ip);
+ if (msr == mca_msr_reg(bank, MCA_STATUS))
+ return offsetof(struct mce, status);
+ if (msr == mca_msr_reg(bank, MCA_ADDR))
+ return offsetof(struct mce, addr);
+ if (msr == mca_msr_reg(bank, MCA_MISC))
+ return offsetof(struct mce, misc);
+ if (msr == MSR_IA32_MCG_STATUS)
+ return offsetof(struct mce, mcgstatus);
+ return -1;
+}
+
+void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr)
+{
+ if (wrmsr) {
+ pr_emerg("MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n",
+ (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax,
+ regs->ip, (void *)regs->ip);
+ } else {
+ pr_emerg("MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n",
+ (unsigned int)regs->cx, regs->ip, (void *)regs->ip);
+ }
+
+ show_stack_regs(regs);
+
+ panic("MCA architectural violation!\n");
+
+ while (true)
+ cpu_relax();
+}
+
+/* MSR access wrappers used for error injection */
+noinstr u64 mce_rdmsrq(u32 msr)
+{
+ EAX_EDX_DECLARE_ARGS(val, low, high);
+
+ if (__this_cpu_read(injectm.finished)) {
+ int offset;
+ u64 ret;
+
+ instrumentation_begin();
+
+ offset = msr_to_offset(msr);
+ if (offset < 0)
+ ret = 0;
+ else
+ ret = *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);
+
+ instrumentation_end();
+
+ return ret;
+ }
+
+ /*
+ * RDMSR on MCA MSRs should not fault. If they do, this is very much an
+ * architectural violation and needs to be reported to hw vendor. Panic
+ * the box to not allow any further progress.
+ */
+ asm volatile("1: rdmsr\n"
+ "2:\n"
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR_IN_MCE)
+ : EAX_EDX_RET(val, low, high) : "c" (msr));
+
+
+ return EAX_EDX_VAL(val, low, high);
+}
+
+noinstr void mce_wrmsrq(u32 msr, u64 v)
+{
+ u32 low, high;
+
+ if (__this_cpu_read(injectm.finished)) {
+ int offset;
+
+ instrumentation_begin();
+
+ offset = msr_to_offset(msr);
+ if (offset >= 0)
+ *(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v;
+
+ instrumentation_end();
+
+ return;
+ }
+
+ low = (u32)v;
+ high = (u32)(v >> 32);
+
+ /* See comment in mce_rdmsrq() */
+ asm volatile("1: wrmsr\n"
+ "2:\n"
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR_IN_MCE)
+ : : "c" (msr), "a"(low), "d" (high) : "memory");
+}
+
+/*
+ * Collect all global (w.r.t. this processor) status about this machine
+ * check into our "mce" struct so that we can use it later to assess
+ * the severity of the problem as we read per-bank specific details.
+ */
+static noinstr void mce_gather_info(struct mce_hw_err *err, struct pt_regs *regs)
+{
+ struct mce *m;
+ /*
+ * Enable instrumentation around mce_prep_record() which calls external
+ * facilities.
+ */
+ instrumentation_begin();
+ mce_prep_record(err);
+ instrumentation_end();
+
+ m = &err->m;
+ m->mcgstatus = mce_rdmsrq(MSR_IA32_MCG_STATUS);
+ if (regs) {
+ /*
+ * Get the address of the instruction at the time of
+ * the machine check error.
+ */
+ if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
+ m->ip = regs->ip;
+ m->cs = regs->cs;
+
+ /*
+ * When in VM86 mode make the cs look like ring 3
+ * always. This is a lie, but it's better than passing
+ * the additional vm86 bit around everywhere.
+ */
+ if (v8086_mode(regs))
+ m->cs |= 3;
+ }
+ /* Use accurate RIP reporting if available. */
+ if (mca_cfg.rip_msr)
+ m->ip = mce_rdmsrq(mca_cfg.rip_msr);
+ }
+}
+
+bool mce_available(struct cpuinfo_x86 *c)
+{
+ if (mca_cfg.disabled)
+ return false;
+ return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
+}
+
+static void mce_schedule_work(void)
+{
+ if (!mce_gen_pool_empty())
+ schedule_work(&mce_work);
+}
+
+static void mce_irq_work_cb(struct irq_work *entry)
+{
+ mce_schedule_work();
+}
+
+bool mce_usable_address(struct mce *m)
+{
+ if (!(m->status & MCI_STATUS_ADDRV))
+ return false;
+
+ switch (m->cpuvendor) {
+ case X86_VENDOR_AMD:
+ return amd_mce_usable_address(m);
+
+ case X86_VENDOR_INTEL:
+ case X86_VENDOR_ZHAOXIN:
+ return intel_mce_usable_address(m);
+
+ default:
+ return true;
+ }
+}
+EXPORT_SYMBOL_GPL(mce_usable_address);
+
+bool mce_is_memory_error(struct mce *m)
+{
+ switch (m->cpuvendor) {
+ case X86_VENDOR_AMD:
+ case X86_VENDOR_HYGON:
+ return amd_mce_is_memory_error(m);
+
+ case X86_VENDOR_INTEL:
+ case X86_VENDOR_ZHAOXIN:
+ /*
+ * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
+ *
+ * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
+ * indicating a memory error. Bit 8 is used for indicating a
+ * cache hierarchy error. The combination of bit 2 and bit 3
+ * is used for indicating a `generic' cache hierarchy error
+ * But we can't just blindly check the above bits, because if
+ * bit 11 is set, then it is a bus/interconnect error - and
+ * either way the above bits just gives more detail on what
+ * bus/interconnect error happened. Note that bit 12 can be
+ * ignored, as it's the "filter" bit.
+ */
+ return (m->status & 0xef80) == BIT(7) ||
+ (m->status & 0xef00) == BIT(8) ||
+ (m->status & 0xeffc) == 0xc;
+
+ default:
+ return false;
+ }
+}
+EXPORT_SYMBOL_GPL(mce_is_memory_error);
+
+static bool whole_page(struct mce *m)
+{
+ if (!mca_cfg.ser || !(m->status & MCI_STATUS_MISCV))
+ return true;
+
+ return MCI_MISC_ADDR_LSB(m->misc) >= PAGE_SHIFT;
+}
+
+bool mce_is_correctable(struct mce *m)
+{
+ if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
+ return false;
+
+ if (m->cpuvendor == X86_VENDOR_HYGON && m->status & MCI_STATUS_DEFERRED)
+ return false;
+
+ if (m->status & MCI_STATUS_UC)
+ return false;
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(mce_is_correctable);
+
+/*
+ * Notify the user(s) about new machine check events.
+ * Can be called from interrupt context, but not from machine check/NMI
+ * context.
+ */
+static bool mce_notify_irq(void)
+{
+ /* Not more than two messages every minute */
+ static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
+
+ if (test_and_clear_bit(0, &mce_need_notify)) {
+ mce_work_trigger();
+
+ if (__ratelimit(&ratelimit))
+ pr_info(HW_ERR "Machine check events logged\n");
+
+ return true;
+ }
+
+ return false;
+}
+
+static int mce_early_notifier(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct mce_hw_err *err = to_mce_hw_err(data);
+
+ if (!err)
+ return NOTIFY_DONE;
+
+ /* Emit the trace record: */
+ trace_mce_record(err);
+
+ set_bit(0, &mce_need_notify);
+
+ mce_notify_irq();
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block early_nb = {
+ .notifier_call = mce_early_notifier,
+ .priority = MCE_PRIO_EARLY,
+};
+
+static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct mce *mce = (struct mce *)data;
+ unsigned long pfn;
+
+ if (!mce || !mce_usable_address(mce))
+ return NOTIFY_DONE;
+
+ if (mce->severity != MCE_AO_SEVERITY &&
+ mce->severity != MCE_DEFERRED_SEVERITY)
+ return NOTIFY_DONE;
+
+ pfn = (mce->addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT;
+ if (!memory_failure(pfn, 0)) {
+ set_mce_nospec(pfn);
+ mce->kflags |= MCE_HANDLED_UC;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block mce_uc_nb = {
+ .notifier_call = uc_decode_notifier,
+ .priority = MCE_PRIO_UC,
+};
+
+static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct mce_hw_err *err = to_mce_hw_err(data);
+
+ if (!err)
+ return NOTIFY_DONE;
+
+ if (mca_cfg.print_all || !(err->m.kflags))
+ __print_mce(err);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block mce_default_nb = {
+ .notifier_call = mce_default_notifier,
+ /* lowest prio, we want it to run last. */
+ .priority = MCE_PRIO_LOWEST,
+};
+
+/*
+ * Read ADDR and MISC registers.
+ */
+static noinstr void mce_read_aux(struct mce_hw_err *err, int i)
+{
+ struct mce *m = &err->m;
+
+ if (m->status & MCI_STATUS_MISCV)
+ m->misc = mce_rdmsrq(mca_msr_reg(i, MCA_MISC));
+
+ if (m->status & MCI_STATUS_ADDRV) {
+ if (m->kflags & MCE_CHECK_DFR_REGS)
+ m->addr = mce_rdmsrq(MSR_AMD64_SMCA_MCx_DEADDR(i));
+ else
+ m->addr = mce_rdmsrq(mca_msr_reg(i, MCA_ADDR));
+
+ /*
+ * Mask the reported address by the reported granularity.
+ */
+ if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
+ u8 shift = MCI_MISC_ADDR_LSB(m->misc);
+ m->addr >>= shift;
+ m->addr <<= shift;
+ }
+
+ smca_extract_err_addr(m);
+ }
+
+ if (mce_flags.smca) {
+ m->ipid = mce_rdmsrq(MSR_AMD64_SMCA_MCx_IPID(i));
+
+ if (m->status & MCI_STATUS_SYNDV) {
+ m->synd = mce_rdmsrq(MSR_AMD64_SMCA_MCx_SYND(i));
+ err->vendor.amd.synd1 = mce_rdmsrq(MSR_AMD64_SMCA_MCx_SYND1(i));
+ err->vendor.amd.synd2 = mce_rdmsrq(MSR_AMD64_SMCA_MCx_SYND2(i));
+ }
+ }
+}
+
+DEFINE_PER_CPU(unsigned, mce_poll_count);
+
+/*
+ * We have three scenarios for checking for Deferred errors:
+ *
+ * 1) Non-SMCA systems check MCA_STATUS and log error if found.
+ * 2) SMCA systems check MCA_STATUS. If error is found then log it and also
+ * clear MCA_DESTAT.
+ * 3) SMCA systems check MCA_DESTAT, if error was not found in MCA_STATUS, and
+ * log it.
+ */
+static bool smca_should_log_poll_error(struct mce *m)
+{
+ if (m->status & MCI_STATUS_VAL)
+ return true;
+
+ m->status = mce_rdmsrq(MSR_AMD64_SMCA_MCx_DESTAT(m->bank));
+ if ((m->status & MCI_STATUS_VAL) && (m->status & MCI_STATUS_DEFERRED)) {
+ m->kflags |= MCE_CHECK_DFR_REGS;
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Newer Intel systems that support software error
+ * recovery need to make additional checks. Other
+ * CPUs should skip over uncorrected errors, but log
+ * everything else.
+ */
+static bool ser_should_log_poll_error(struct mce *m)
+{
+ /* Log "not enabled" (speculative) errors */
+ if (!(m->status & MCI_STATUS_EN))
+ return true;
+
+ /*
+ * Log UCNA (SDM: 15.6.3 "UCR Error Classification")
+ * UC == 1 && PCC == 0 && S == 0
+ */
+ if (!(m->status & MCI_STATUS_PCC) && !(m->status & MCI_STATUS_S))
+ return true;
+
+ return false;
+}
+
+static bool should_log_poll_error(enum mcp_flags flags, struct mce_hw_err *err)
+{
+ struct mce *m = &err->m;
+
+ if (mce_flags.smca)
+ return smca_should_log_poll_error(m);
+
+ /* If this entry is not valid, ignore it. */
+ if (!(m->status & MCI_STATUS_VAL))
+ return false;
+
+ /*
+ * If we are logging everything (at CPU online) or this
+ * is a corrected error, then we must log it.
+ */
+ if ((flags & MCP_UC) || !(m->status & MCI_STATUS_UC))
+ return true;
+
+ if (mca_cfg.ser)
+ return ser_should_log_poll_error(m);
+
+ if (m->status & MCI_STATUS_UC)
+ return false;
+
+ return true;
+}
+
+static void clear_bank(struct mce *m)
+{
+ if (m->cpuvendor == X86_VENDOR_AMD)
+ return amd_clear_bank(m);
+
+ mce_wrmsrq(mca_msr_reg(m->bank, MCA_STATUS), 0);
+}
+
+/*
+ * Poll for corrected events or events that happened before reset.
+ * Those are just logged through /dev/mcelog.
+ *
+ * This is executed in standard interrupt context.
+ *
+ * Note: spec recommends to panic for fatal unsignalled
+ * errors here. However this would be quite problematic --
+ * we would need to reimplement the Monarch handling and
+ * it would mess up the exclusion between exception handler
+ * and poll handler -- * so we skip this for now.
+ * These cases should not happen anyways, or only when the CPU
+ * is already totally * confused. In this case it's likely it will
+ * not fully execute the machine check handler either.
+ */
+void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
+{
+ struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
+ struct mce_hw_err err;
+ struct mce *m;
+ int i;
+
+ this_cpu_inc(mce_poll_count);
+
+ mce_gather_info(&err, NULL);
+ m = &err.m;
+
+ if (flags & MCP_TIMESTAMP)
+ m->tsc = rdtsc();
+
+ for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
+ if (!mce_banks[i].ctl || !test_bit(i, *b))
+ continue;
+
+ m->misc = 0;
+ m->addr = 0;
+ m->bank = i;
+
+ barrier();
+ m->status = mce_rdmsrq(mca_msr_reg(i, MCA_STATUS));
+
+ /*
+ * Update storm tracking here, before checking for the
+ * MCI_STATUS_VAL bit. Valid corrected errors count
+ * towards declaring, or maintaining, storm status. No
+ * error in a bank counts towards avoiding, or ending,
+ * storm status.
+ */
+ if (!mca_cfg.cmci_disabled)
+ mce_track_storm(m);
+
+ /* Verify that the error should be logged based on hardware conditions. */
+ if (!should_log_poll_error(flags, &err))
+ continue;
+
+ mce_read_aux(&err, i);
+ m->severity = mce_severity(m, NULL, NULL, false);
+ /*
+ * Don't get the IP here because it's unlikely to
+ * have anything to do with the actual error location.
+ */
+
+ if (mca_cfg.dont_log_ce && !mce_usable_address(m))
+ goto clear_it;
+
+ if (flags & MCP_QUEUE_LOG)
+ mce_gen_pool_add(&err);
+ else
+ mce_log(&err);
+
+clear_it:
+ clear_bank(m);
+ }
+
+ /*
+ * Don't clear MCG_STATUS here because it's only defined for
+ * exceptions.
+ */
+
+ sync_core();
+}
+EXPORT_SYMBOL_GPL(machine_check_poll);
+
+/*
+ * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
+ * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
+ * Vol 3B Table 15-20). But this confuses both the code that determines
+ * whether the machine check occurred in kernel or user mode, and also
+ * the severity assessment code. Pretend that EIPV was set, and take the
+ * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
+ */
+static __always_inline void
+quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
+{
+ if (bank != 0)
+ return;
+ if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
+ return;
+ if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
+ MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
+ MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
+ MCACOD)) !=
+ (MCI_STATUS_UC|MCI_STATUS_EN|
+ MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
+ MCI_STATUS_AR|MCACOD_INSTR))
+ return;
+
+ m->mcgstatus |= MCG_STATUS_EIPV;
+ m->ip = regs->ip;
+ m->cs = regs->cs;
+}
+
+/*
+ * Disable fast string copy and return from the MCE handler upon the first SRAR
+ * MCE on bank 1 due to a CPU erratum on Intel Skylake/Cascade Lake/Cooper Lake
+ * CPUs.
+ * The fast string copy instructions ("REP; MOVS*") could consume an
+ * uncorrectable memory error in the cache line _right after_ the desired region
+ * to copy and raise an MCE with RIP pointing to the instruction _after_ the
+ * "REP; MOVS*".
+ * This mitigation addresses the issue completely with the caveat of performance
+ * degradation on the CPU affected. This is still better than the OS crashing on
+ * MCEs raised on an irrelevant process due to "REP; MOVS*" accesses from a
+ * kernel context (e.g., copy_page).
+ *
+ * Returns true when fast string copy on CPU has been disabled.
+ */
+static noinstr bool quirk_skylake_repmov(void)
+{
+ u64 mcgstatus = mce_rdmsrq(MSR_IA32_MCG_STATUS);
+ u64 misc_enable = mce_rdmsrq(MSR_IA32_MISC_ENABLE);
+ u64 mc1_status;
+
+ /*
+ * Apply the quirk only to local machine checks, i.e., no broadcast
+ * sync is needed.
+ */
+ if (!(mcgstatus & MCG_STATUS_LMCES) ||
+ !(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING))
+ return false;
+
+ mc1_status = mce_rdmsrq(MSR_IA32_MCx_STATUS(1));
+
+ /* Check for a software-recoverable data fetch error. */
+ if ((mc1_status &
+ (MCI_STATUS_VAL | MCI_STATUS_OVER | MCI_STATUS_UC | MCI_STATUS_EN |
+ MCI_STATUS_ADDRV | MCI_STATUS_MISCV | MCI_STATUS_PCC |
+ MCI_STATUS_AR | MCI_STATUS_S)) ==
+ (MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN |
+ MCI_STATUS_ADDRV | MCI_STATUS_MISCV |
+ MCI_STATUS_AR | MCI_STATUS_S)) {
+ misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING;
+ mce_wrmsrq(MSR_IA32_MISC_ENABLE, misc_enable);
+ mce_wrmsrq(MSR_IA32_MCx_STATUS(1), 0);
+
+ instrumentation_begin();
+ pr_err_once("Erratum detected, disable fast string copy instructions.\n");
+ instrumentation_end();
+
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Some Zen-based Instruction Fetch Units set EIPV=RIPV=0 on poison consumption
+ * errors. This means mce_gather_info() will not save the "ip" and "cs" registers.
+ *
+ * However, the context is still valid, so save the "cs" register for later use.
+ *
+ * The "ip" register is truly unknown, so don't save it or fixup EIPV/RIPV.
+ *
+ * The Instruction Fetch Unit is at MCA bank 1 for all affected systems.
+ */
+static __always_inline void quirk_zen_ifu(int bank, struct mce *m, struct pt_regs *regs)
+{
+ if (bank != 1)
+ return;
+ if (!(m->status & MCI_STATUS_POISON))
+ return;
+
+ m->cs = regs->cs;
+}
+
+/*
+ * Do a quick check if any of the events requires a panic.
+ * This decides if we keep the events around or clear them.
+ */
+static __always_inline int mce_no_way_out(struct mce_hw_err *err, char **msg, unsigned long *validp,
+ struct pt_regs *regs)
+{
+ struct mce *m = &err->m;
+ char *tmp = *msg;
+ int i;
+
+ for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
+ m->status = mce_rdmsrq(mca_msr_reg(i, MCA_STATUS));
+ if (!(m->status & MCI_STATUS_VAL))
+ continue;
+
+ arch___set_bit(i, validp);
+ if (mce_flags.snb_ifu_quirk)
+ quirk_sandybridge_ifu(i, m, regs);
+
+ if (mce_flags.zen_ifu_quirk)
+ quirk_zen_ifu(i, m, regs);
+
+ m->bank = i;
+ if (mce_severity(m, regs, &tmp, true) >= MCE_PANIC_SEVERITY) {
+ mce_read_aux(err, i);
+ *msg = tmp;
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Variable to establish order between CPUs while scanning.
+ * Each CPU spins initially until executing is equal its number.
+ */
+static atomic_t mce_executing;
+
+/*
+ * Defines order of CPUs on entry. First CPU becomes Monarch.
+ */
+static atomic_t mce_callin;
+
+/*
+ * Track which CPUs entered the MCA broadcast synchronization and which not in
+ * order to print holdouts.
+ */
+static cpumask_t mce_missing_cpus = CPU_MASK_ALL;
+
+/*
+ * Check if a timeout waiting for other CPUs happened.
+ */
+static noinstr int mce_timed_out(u64 *t, const char *msg)
+{
+ int ret = 0;
+
+ /* Enable instrumentation around calls to external facilities */
+ instrumentation_begin();
+
+ /*
+ * The others already did panic for some reason.
+ * Bail out like in a timeout.
+ * rmb() to tell the compiler that system_state
+ * might have been modified by someone else.
+ */
+ rmb();
+ if (atomic_read(&mce_panicked))
+ wait_for_panic();
+ if (!mca_cfg.monarch_timeout)
+ goto out;
+ if ((s64)*t < SPINUNIT) {
+ if (cpumask_and(&mce_missing_cpus, cpu_online_mask, &mce_missing_cpus))
+ pr_emerg("CPUs not responding to MCE broadcast (may include false positives): %*pbl\n",
+ cpumask_pr_args(&mce_missing_cpus));
+ mce_panic(msg, NULL, NULL);
+
+ ret = 1;
+ goto out;
+ }
+ *t -= SPINUNIT;
+
+out:
+ touch_nmi_watchdog();
+
+ instrumentation_end();
+
+ return ret;
+}
+
+/*
+ * The Monarch's reign. The Monarch is the CPU who entered
+ * the machine check handler first. It waits for the others to
+ * raise the exception too and then grades them. When any
+ * error is fatal panic. Only then let the others continue.
+ *
+ * The other CPUs entering the MCE handler will be controlled by the
+ * Monarch. They are called Subjects.
+ *
+ * This way we prevent any potential data corruption in a unrecoverable case
+ * and also makes sure always all CPU's errors are examined.
+ *
+ * Also this detects the case of a machine check event coming from outer
+ * space (not detected by any CPUs) In this case some external agent wants
+ * us to shut down, so panic too.
+ *
+ * The other CPUs might still decide to panic if the handler happens
+ * in a unrecoverable place, but in this case the system is in a semi-stable
+ * state and won't corrupt anything by itself. It's ok to let the others
+ * continue for a bit first.
+ *
+ * All the spin loops have timeouts; when a timeout happens a CPU
+ * typically elects itself to be Monarch.
+ */
+static void mce_reign(void)
+{
+ struct mce_hw_err *err = NULL;
+ struct mce *m = NULL;
+ int global_worst = 0;
+ char *msg = NULL;
+ int cpu;
+
+ /*
+ * This CPU is the Monarch and the other CPUs have run
+ * through their handlers.
+ * Grade the severity of the errors of all the CPUs.
+ */
+ for_each_possible_cpu(cpu) {
+ struct mce_hw_err *etmp = &per_cpu(hw_errs_seen, cpu);
+ struct mce *mtmp = &etmp->m;
+
+ if (mtmp->severity > global_worst) {
+ global_worst = mtmp->severity;
+ err = &per_cpu(hw_errs_seen, cpu);
+ m = &err->m;
+ }
+ }
+
+ /*
+ * Cannot recover? Panic here then.
+ * This dumps all the mces in the log buffer and stops the
+ * other CPUs.
+ */
+ if (m && global_worst >= MCE_PANIC_SEVERITY) {
+ /* call mce_severity() to get "msg" for panic */
+ mce_severity(m, NULL, &msg, true);
+ mce_panic("Fatal machine check", err, msg);
+ }
+
+ /*
+ * For UC somewhere we let the CPU who detects it handle it.
+ * Also must let continue the others, otherwise the handling
+ * CPU could deadlock on a lock.
+ */
+
+ /*
+ * No machine check event found. Must be some external
+ * source or one CPU is hung. Panic.
+ */
+ if (global_worst <= MCE_KEEP_SEVERITY)
+ mce_panic("Fatal machine check from unknown source", NULL, NULL);
+
+ /*
+ * Now clear all the hw_errs_seen so that they don't reappear on
+ * the next mce.
+ */
+ for_each_possible_cpu(cpu)
+ memset(&per_cpu(hw_errs_seen, cpu), 0, sizeof(struct mce_hw_err));
+}
+
+static atomic_t global_nwo;
+
+/*
+ * Start of Monarch synchronization. This waits until all CPUs have
+ * entered the exception handler and then determines if any of them
+ * saw a fatal event that requires panic. Then it executes them
+ * in the entry order.
+ * TBD double check parallel CPU hotunplug
+ */
+static noinstr int mce_start(int *no_way_out)
+{
+ u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
+ int order, ret = -1;
+
+ if (!timeout)
+ return ret;
+
+ raw_atomic_add(*no_way_out, &global_nwo);
+ /*
+ * Rely on the implied barrier below, such that global_nwo
+ * is updated before mce_callin.
+ */
+ order = raw_atomic_inc_return(&mce_callin);
+ arch_cpumask_clear_cpu(smp_processor_id(), &mce_missing_cpus);
+
+ /* Enable instrumentation around calls to external facilities */
+ instrumentation_begin();
+
+ /*
+ * Wait for everyone.
+ */
+ while (raw_atomic_read(&mce_callin) != num_online_cpus()) {
+ if (mce_timed_out(&timeout,
+ "Timeout: Not all CPUs entered broadcast exception handler")) {
+ raw_atomic_set(&global_nwo, 0);
+ goto out;
+ }
+ ndelay(SPINUNIT);
+ }
+
+ /*
+ * mce_callin should be read before global_nwo
+ */
+ smp_rmb();
+
+ if (order == 1) {
+ /*
+ * Monarch: Starts executing now, the others wait.
+ */
+ raw_atomic_set(&mce_executing, 1);
+ } else {
+ /*
+ * Subject: Now start the scanning loop one by one in
+ * the original callin order.
+ * This way when there are any shared banks it will be
+ * only seen by one CPU before cleared, avoiding duplicates.
+ */
+ while (raw_atomic_read(&mce_executing) < order) {
+ if (mce_timed_out(&timeout,
+ "Timeout: Subject CPUs unable to finish machine check processing")) {
+ raw_atomic_set(&global_nwo, 0);
+ goto out;
+ }
+ ndelay(SPINUNIT);
+ }
+ }
+
+ /*
+ * Cache the global no_way_out state.
+ */
+ *no_way_out = raw_atomic_read(&global_nwo);
+
+ ret = order;
+
+out:
+ instrumentation_end();
+
+ return ret;
+}
+
+/*
+ * Synchronize between CPUs after main scanning loop.
+ * This invokes the bulk of the Monarch processing.
+ */
+static noinstr int mce_end(int order)
+{
+ u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
+ int ret = -1;
+
+ /* Allow instrumentation around external facilities. */
+ instrumentation_begin();
+
+ if (!timeout)
+ goto reset;
+ if (order < 0)
+ goto reset;
+
+ /*
+ * Allow others to run.
+ */
+ atomic_inc(&mce_executing);
+
+ if (order == 1) {
+ /*
+ * Monarch: Wait for everyone to go through their scanning
+ * loops.
+ */
+ while (atomic_read(&mce_executing) <= num_online_cpus()) {
+ if (mce_timed_out(&timeout,
+ "Timeout: Monarch CPU unable to finish machine check processing"))
+ goto reset;
+ ndelay(SPINUNIT);
+ }
+
+ mce_reign();
+ barrier();
+ ret = 0;
+ } else {
+ /*
+ * Subject: Wait for Monarch to finish.
+ */
+ while (atomic_read(&mce_executing) != 0) {
+ if (mce_timed_out(&timeout,
+ "Timeout: Monarch CPU did not finish machine check processing"))
+ goto reset;
+ ndelay(SPINUNIT);
+ }
+
+ /*
+ * Don't reset anything. That's done by the Monarch.
+ */
+ ret = 0;
+ goto out;
+ }
+
+ /*
+ * Reset all global state.
+ */
+reset:
+ atomic_set(&global_nwo, 0);
+ atomic_set(&mce_callin, 0);
+ cpumask_setall(&mce_missing_cpus);
+ barrier();
+
+ /*
+ * Let others run again.
+ */
+ atomic_set(&mce_executing, 0);
+
+out:
+ instrumentation_end();
+
+ return ret;
+}
+
+static __always_inline void mce_clear_state(unsigned long *toclear)
+{
+ int i;
+
+ for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
+ if (arch_test_bit(i, toclear))
+ mce_wrmsrq(mca_msr_reg(i, MCA_STATUS), 0);
+ }
+}
+
+/*
+ * Cases where we avoid rendezvous handler timeout:
+ * 1) If this CPU is offline.
+ *
+ * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
+ * skip those CPUs which remain looping in the 1st kernel - see
+ * crash_nmi_callback().
+ *
+ * Note: there still is a small window between kexec-ing and the new,
+ * kdump kernel establishing a new #MC handler where a broadcasted MCE
+ * might not get handled properly.
+ */
+static noinstr bool mce_check_crashing_cpu(void)
+{
+ unsigned int cpu = smp_processor_id();
+
+ if (arch_cpu_is_offline(cpu) ||
+ (crashing_cpu != -1 && crashing_cpu != cpu)) {
+ u64 mcgstatus;
+
+ mcgstatus = native_rdmsrq(MSR_IA32_MCG_STATUS);
+
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) {
+ if (mcgstatus & MCG_STATUS_LMCES)
+ return false;
+ }
+
+ if (mcgstatus & MCG_STATUS_RIPV) {
+ native_wrmsrq(MSR_IA32_MCG_STATUS, 0);
+ return true;
+ }
+ }
+ return false;
+}
+
+static __always_inline int
+__mc_scan_banks(struct mce_hw_err *err, struct pt_regs *regs,
+ struct mce_hw_err *final, unsigned long *toclear,
+ unsigned long *valid_banks, int no_way_out, int *worst)
+{
+ struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
+ struct mca_config *cfg = &mca_cfg;
+ int severity, i, taint = 0;
+ struct mce *m = &err->m;
+
+ for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
+ arch___clear_bit(i, toclear);
+ if (!arch_test_bit(i, valid_banks))
+ continue;
+
+ if (!mce_banks[i].ctl)
+ continue;
+
+ m->misc = 0;
+ m->addr = 0;
+ m->bank = i;
+
+ m->status = mce_rdmsrq(mca_msr_reg(i, MCA_STATUS));
+ if (!(m->status & MCI_STATUS_VAL))
+ continue;
+
+ /*
+ * Corrected or non-signaled errors are handled by
+ * machine_check_poll(). Leave them alone, unless this panics.
+ */
+ if (!(m->status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
+ !no_way_out)
+ continue;
+
+ /* Set taint even when machine check was not enabled. */
+ taint++;
+
+ severity = mce_severity(m, regs, NULL, true);
+
+ /*
+ * When machine check was for corrected/deferred handler don't
+ * touch, unless we're panicking.
+ */
+ if ((severity == MCE_KEEP_SEVERITY ||
+ severity == MCE_UCNA_SEVERITY) && !no_way_out)
+ continue;
+
+ arch___set_bit(i, toclear);
+
+ /* Machine check event was not enabled. Clear, but ignore. */
+ if (severity == MCE_NO_SEVERITY)
+ continue;
+
+ mce_read_aux(err, i);
+
+ /* assuming valid severity level != 0 */
+ m->severity = severity;
+
+ /*
+ * Enable instrumentation around the mce_log() call which is
+ * done in #MC context, where instrumentation is disabled.
+ */
+ instrumentation_begin();
+ mce_log(err);
+ instrumentation_end();
+
+ if (severity > *worst) {
+ *final = *err;
+ *worst = severity;
+ }
+ }
+
+ /* mce_clear_state will clear *final, save locally for use later */
+ *err = *final;
+
+ return taint;
+}
+
+static void kill_me_now(struct callback_head *ch)
+{
+ struct task_struct *p = container_of(ch, struct task_struct, mce_kill_me);
+
+ p->mce_count = 0;
+ force_sig(SIGBUS);
+}
+
+static void kill_me_maybe(struct callback_head *cb)
+{
+ struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
+ int flags = MF_ACTION_REQUIRED;
+ unsigned long pfn;
+ int ret;
+
+ p->mce_count = 0;
+ pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr);
+
+ if (!p->mce_ripv)
+ flags |= MF_MUST_KILL;
+
+ pfn = (p->mce_addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT;
+ ret = memory_failure(pfn, flags);
+ if (!ret) {
+ set_mce_nospec(pfn);
+ sync_core();
+ return;
+ }
+
+ /*
+ * -EHWPOISON from memory_failure() means that it already sent SIGBUS
+ * to the current process with the proper error info,
+ * -EOPNOTSUPP means hwpoison_filter() filtered the error event,
+ *
+ * In both cases, no further processing is required.
+ */
+ if (ret == -EHWPOISON || ret == -EOPNOTSUPP)
+ return;
+
+ pr_err("Memory error not recovered");
+ kill_me_now(cb);
+}
+
+static void kill_me_never(struct callback_head *cb)
+{
+ struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
+ unsigned long pfn;
+
+ p->mce_count = 0;
+ pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr);
+ pfn = (p->mce_addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT;
+ if (!memory_failure(pfn, 0))
+ set_mce_nospec(pfn);
+}
+
+static void queue_task_work(struct mce_hw_err *err, char *msg, void (*func)(struct callback_head *))
+{
+ int count = ++current->mce_count;
+ struct mce *m = &err->m;
+
+ /* First call, save all the details */
+ if (count == 1) {
+ current->mce_addr = m->addr;
+ current->mce_kflags = m->kflags;
+ current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV);
+ current->mce_whole_page = whole_page(m);
+ current->mce_kill_me.func = func;
+ }
+
+ /* Ten is likely overkill. Don't expect more than two faults before task_work() */
+ if (count > 10)
+ mce_panic("Too many consecutive machine checks while accessing user data",
+ err, msg);
+
+ /* Second or later call, make sure page address matches the one from first call */
+ if (count > 1 && (current->mce_addr >> PAGE_SHIFT) != (m->addr >> PAGE_SHIFT))
+ mce_panic("Consecutive machine checks to different user pages", err, msg);
+
+ /* Do not call task_work_add() more than once */
+ if (count > 1)
+ return;
+
+ task_work_add(current, &current->mce_kill_me, TWA_RESUME);
+}
+
+/* Handle unconfigured int18 (should never happen) */
+static noinstr void unexpected_machine_check(struct pt_regs *regs)
+{
+ instrumentation_begin();
+ pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
+ smp_processor_id());
+ instrumentation_end();
+}
+
+/*
+ * The actual machine check handler. This only handles real exceptions when
+ * something got corrupted coming in through int 18.
+ *
+ * This is executed in #MC context not subject to normal locking rules.
+ * This implies that most kernel services cannot be safely used. Don't even
+ * think about putting a printk in there!
+ *
+ * On Intel systems this is entered on all CPUs in parallel through
+ * MCE broadcast. However some CPUs might be broken beyond repair,
+ * so be always careful when synchronizing with others.
+ *
+ * Tracing and kprobes are disabled: if we interrupted a kernel context
+ * with IF=1, we need to minimize stack usage. There are also recursion
+ * issues: if the machine check was due to a failure of the memory
+ * backing the user stack, tracing that reads the user stack will cause
+ * potentially infinite recursion.
+ *
+ * Currently, the #MC handler calls out to a number of external facilities
+ * and, therefore, allows instrumentation around them. The optimal thing to
+ * have would be to do the absolutely minimal work required in #MC context
+ * and have instrumentation disabled only around that. Further processing can
+ * then happen in process context where instrumentation is allowed. Achieving
+ * that requires careful auditing and modifications. Until then, the code
+ * allows instrumentation temporarily, where required. *
+ */
+noinstr void do_machine_check(struct pt_regs *regs)
+{
+ int worst = 0, order, no_way_out, kill_current_task, lmce, taint = 0;
+ DECLARE_BITMAP(valid_banks, MAX_NR_BANKS) = { 0 };
+ DECLARE_BITMAP(toclear, MAX_NR_BANKS) = { 0 };
+ struct mce_hw_err *final;
+ struct mce_hw_err err;
+ char *msg = NULL;
+ struct mce *m;
+
+ if (unlikely(mce_flags.p5))
+ return pentium_machine_check(regs);
+ else if (unlikely(mce_flags.winchip))
+ return winchip_machine_check(regs);
+ else if (unlikely(!mca_cfg.initialized))
+ return unexpected_machine_check(regs);
+
+ if (mce_flags.skx_repmov_quirk && quirk_skylake_repmov())
+ goto clear;
+
+ /*
+ * Establish sequential order between the CPUs entering the machine
+ * check handler.
+ */
+ order = -1;
+
+ /*
+ * If no_way_out gets set, there is no safe way to recover from this
+ * MCE.
+ */
+ no_way_out = 0;
+
+ /*
+ * If kill_current_task is not set, there might be a way to recover from this
+ * error.
+ */
+ kill_current_task = 0;
+
+ /*
+ * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES
+ * on Intel.
+ */
+ lmce = 1;
+
+ this_cpu_inc(mce_exception_count);
+
+ mce_gather_info(&err, regs);
+ m = &err.m;
+ m->tsc = rdtsc();
+
+ final = this_cpu_ptr(&hw_errs_seen);
+ *final = err;
+
+ no_way_out = mce_no_way_out(&err, &msg, valid_banks, regs);
+
+ barrier();
+
+ /*
+ * When no restart IP might need to kill or panic.
+ * Assume the worst for now, but if we find the
+ * severity is MCE_AR_SEVERITY we have other options.
+ */
+ if (!(m->mcgstatus & MCG_STATUS_RIPV))
+ kill_current_task = 1;
+ /*
+ * Check if this MCE is signaled to only this logical processor,
+ * on Intel, Zhaoxin only.
+ */
+ if (m->cpuvendor == X86_VENDOR_INTEL ||
+ m->cpuvendor == X86_VENDOR_ZHAOXIN)
+ lmce = m->mcgstatus & MCG_STATUS_LMCES;
+
+ /*
+ * Local machine check may already know that we have to panic.
+ * Broadcast machine check begins rendezvous in mce_start()
+ * Go through all banks in exclusion of the other CPUs. This way we
+ * don't report duplicated events on shared banks because the first one
+ * to see it will clear it.
+ */
+ if (lmce) {
+ if (no_way_out)
+ mce_panic("Fatal local machine check", &err, msg);
+ } else {
+ order = mce_start(&no_way_out);
+ }
+
+ taint = __mc_scan_banks(&err, regs, final, toclear, valid_banks, no_way_out, &worst);
+
+ if (!no_way_out)
+ mce_clear_state(toclear);
+
+ /*
+ * Do most of the synchronization with other CPUs.
+ * When there's any problem use only local no_way_out state.
+ */
+ if (!lmce) {
+ if (mce_end(order) < 0) {
+ if (!no_way_out)
+ no_way_out = worst >= MCE_PANIC_SEVERITY;
+
+ if (no_way_out)
+ mce_panic("Fatal machine check on current CPU", &err, msg);
+ }
+ } else {
+ /*
+ * If there was a fatal machine check we should have
+ * already called mce_panic earlier in this function.
+ * Since we re-read the banks, we might have found
+ * something new. Check again to see if we found a
+ * fatal error. We call "mce_severity()" again to
+ * make sure we have the right "msg".
+ */
+ if (worst >= MCE_PANIC_SEVERITY) {
+ mce_severity(m, regs, &msg, true);
+ mce_panic("Local fatal machine check!", &err, msg);
+ }
+ }
+
+ /*
+ * Enable instrumentation around the external facilities like task_work_add()
+ * (via queue_task_work()), fixup_exception() etc. For now, that is. Fixing this
+ * properly would need a lot more involved reorganization.
+ */
+ instrumentation_begin();
+
+ if (taint)
+ add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
+
+ if (worst != MCE_AR_SEVERITY && !kill_current_task)
+ goto out;
+
+ /* Fault was in user mode and we need to take some action */
+ if ((m->cs & 3) == 3) {
+ /* If this triggers there is no way to recover. Die hard. */
+ BUG_ON(!on_thread_stack() || !user_mode(regs));
+
+ if (!mce_usable_address(m))
+ queue_task_work(&err, msg, kill_me_now);
+ else
+ queue_task_work(&err, msg, kill_me_maybe);
+
+ } else if (m->mcgstatus & MCG_STATUS_SEAM_NR) {
+ /*
+ * Saved RIP on stack makes it look like the machine check
+ * was taken in the kernel on the instruction following
+ * the entry to SEAM mode. But MCG_STATUS_SEAM_NR indicates
+ * that the machine check was taken inside SEAM non-root
+ * mode. CPU core has already marked that guest as dead.
+ * It is OK for the kernel to resume execution at the
+ * apparent point of the machine check as the fault did
+ * not occur there. Mark the page as poisoned so it won't
+ * be added to free list when the guest is terminated.
+ */
+ if (mce_usable_address(m)) {
+ struct page *p = pfn_to_online_page(m->addr >> PAGE_SHIFT);
+
+ if (p)
+ SetPageHWPoison(p);
+ }
+ } else {
+ /*
+ * Handle an MCE which has happened in kernel space but from
+ * which the kernel can recover: ex_has_fault_handler() has
+ * already verified that the rIP at which the error happened is
+ * a rIP from which the kernel can recover (by jumping to
+ * recovery code specified in _ASM_EXTABLE_FAULT()) and the
+ * corresponding exception handler which would do that is the
+ * proper one.
+ */
+ if (m->kflags & MCE_IN_KERNEL_RECOV) {
+ if (!fixup_exception(regs, X86_TRAP_MC, 0, 0))
+ mce_panic("Failed kernel mode recovery", &err, msg);
+ }
+
+ if (m->kflags & MCE_IN_KERNEL_COPYIN)
+ queue_task_work(&err, msg, kill_me_never);
+ }
+
+out:
+ /* Given it didn't panic, mark it as recoverable */
+ hwerr_log_error_type(HWERR_RECOV_OTHERS);
+
+ instrumentation_end();
+
+clear:
+ mce_wrmsrq(MSR_IA32_MCG_STATUS, 0);
+}
+EXPORT_SYMBOL_GPL(do_machine_check);
+
+#ifndef CONFIG_MEMORY_FAILURE
+int memory_failure(unsigned long pfn, int flags)
+{
+ /* mce_severity() should not hand us an ACTION_REQUIRED error */
+ BUG_ON(flags & MF_ACTION_REQUIRED);
+ pr_err("Uncorrected memory error in page 0x%lx ignored\n"
+ "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
+ pfn);
+
+ return 0;
+}
+#endif
+
+/*
+ * Periodic polling timer for "silent" machine check errors. If the
+ * poller finds an MCE, poll 2x faster. When the poller finds no more
+ * errors, poll 2x slower (up to check_interval seconds).
+ */
+static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
+
+static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
+static DEFINE_PER_CPU(struct timer_list, mce_timer);
+
+static void __start_timer(struct timer_list *t, unsigned long interval)
+{
+ unsigned long when = jiffies + interval;
+ unsigned long flags;
+
+ local_irq_save(flags);
+
+ if (!timer_pending(t) || time_before(when, t->expires))
+ mod_timer(t, round_jiffies(when));
+
+ local_irq_restore(flags);
+}
+
+static void mc_poll_banks_default(void)
+{
+ machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
+}
+
+void (*mc_poll_banks)(void) = mc_poll_banks_default;
+
+static bool should_enable_timer(unsigned long iv)
+{
+ return !mca_cfg.ignore_ce && iv;
+}
+
+static void mce_timer_fn(struct timer_list *t)
+{
+ struct timer_list *cpu_t = this_cpu_ptr(&mce_timer);
+ unsigned long iv;
+
+ WARN_ON(cpu_t != t);
+
+ iv = __this_cpu_read(mce_next_interval);
+
+ if (mce_available(this_cpu_ptr(&cpu_info)))
+ mc_poll_banks();
+
+ /*
+ * Alert userspace if needed. If we logged an MCE, reduce the polling
+ * interval, otherwise increase the polling interval.
+ */
+ if (mce_notify_irq())
+ iv = max(iv / 2, (unsigned long) HZ/100);
+ else
+ iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
+
+ if (mce_get_storm_mode()) {
+ __start_timer(t, HZ);
+ } else if (should_enable_timer(iv)) {
+ __this_cpu_write(mce_next_interval, iv);
+ __start_timer(t, iv);
+ }
+}
+
+/*
+ * When a storm starts on any bank on this CPU, switch to polling
+ * once per second. When the storm ends, revert to the default
+ * polling interval.
+ */
+void mce_timer_kick(bool storm)
+{
+ struct timer_list *t = this_cpu_ptr(&mce_timer);
+
+ mce_set_storm_mode(storm);
+
+ if (storm)
+ __start_timer(t, HZ);
+ else
+ __this_cpu_write(mce_next_interval, check_interval * HZ);
+}
+
+/* Must not be called in IRQ context where timer_delete_sync() can deadlock */
+static void mce_timer_delete_all(void)
+{
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ timer_delete_sync(&per_cpu(mce_timer, cpu));
+}
+
+static void __mcheck_cpu_mce_banks_init(void)
+{
+ struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
+ u8 n_banks = this_cpu_read(mce_num_banks);
+ int i;
+
+ for (i = 0; i < n_banks; i++) {
+ struct mce_bank *b = &mce_banks[i];
+
+ /*
+ * Init them all by default.
+ *
+ * The required vendor quirks will be applied before
+ * __mcheck_cpu_init_prepare_banks() does the final bank setup.
+ */
+ b->ctl = -1ULL;
+ b->init = true;
+ }
+}
+
+/*
+ * Initialize Machine Checks for a CPU.
+ */
+static void __mcheck_cpu_cap_init(void)
+{
+ u64 cap;
+ u8 b;
+
+ rdmsrq(MSR_IA32_MCG_CAP, cap);
+
+ b = cap & MCG_BANKCNT_MASK;
+
+ if (b > MAX_NR_BANKS) {
+ pr_warn("CPU%d: Using only %u machine check banks out of %u\n",
+ smp_processor_id(), MAX_NR_BANKS, b);
+ b = MAX_NR_BANKS;
+ }
+
+ this_cpu_write(mce_num_banks, b);
+
+ __mcheck_cpu_mce_banks_init();
+}
+
+static void __mcheck_cpu_init_generic(void)
+{
+ u64 cap;
+
+ rdmsrq(MSR_IA32_MCG_CAP, cap);
+ if (cap & MCG_CTL_P)
+ wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
+}
+
+static void __mcheck_cpu_init_prepare_banks(void)
+{
+ struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
+ u64 msrval;
+ int i;
+
+ /*
+ * Log the machine checks left over from the previous reset. Log them
+ * only, do not start processing them. That will happen in mcheck_late_init()
+ * when all consumers have been registered on the notifier chain.
+ */
+ if (mca_cfg.bootlog) {
+ mce_banks_t all_banks;
+
+ bitmap_fill(all_banks, MAX_NR_BANKS);
+ machine_check_poll(MCP_UC | MCP_QUEUE_LOG, &all_banks);
+ }
+
+ for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
+ struct mce_bank *b = &mce_banks[i];
+
+ if (!b->init)
+ continue;
+
+ wrmsrq(mca_msr_reg(i, MCA_CTL), b->ctl);
+ wrmsrq(mca_msr_reg(i, MCA_STATUS), 0);
+
+ rdmsrq(mca_msr_reg(i, MCA_CTL), msrval);
+ b->init = !!msrval;
+ }
+}
+
+static void amd_apply_global_quirks(struct cpuinfo_x86 *c)
+{
+ if (c->x86 < 0x11 && mca_cfg.bootlog < 0) {
+ /*
+ * Lots of broken BIOS around that don't clear them
+ * by default and leave crap in there. Don't log:
+ */
+ mca_cfg.bootlog = 0;
+ }
+
+ /*
+ * overflow_recov is supported for F15h Models 00h-0fh
+ * even though we don't have a CPUID bit for it.
+ */
+ if (c->x86 == 0x15 && c->x86_model <= 0xf)
+ mce_flags.overflow_recov = 1;
+
+ if (c->x86 >= 0x17 && c->x86 <= 0x1A)
+ mce_flags.zen_ifu_quirk = 1;
+}
+
+static void intel_apply_global_quirks(struct cpuinfo_x86 *c)
+{
+ /* Older CPUs (prior to family 6) don't need quirks. */
+ if (c->x86_vfm < INTEL_PENTIUM_PRO)
+ return;
+
+ /*
+ * All newer Intel systems support MCE broadcasting. Enable
+ * synchronization with a one second timeout.
+ */
+ if (c->x86_vfm >= INTEL_CORE_YONAH && mca_cfg.monarch_timeout < 0)
+ mca_cfg.monarch_timeout = USEC_PER_SEC;
+
+ /*
+ * There are also broken BIOSes on some Pentium M and
+ * earlier systems:
+ */
+ if (c->x86_vfm < INTEL_CORE_YONAH && mca_cfg.bootlog < 0)
+ mca_cfg.bootlog = 0;
+
+ if (c->x86_vfm == INTEL_SANDYBRIDGE_X)
+ mce_flags.snb_ifu_quirk = 1;
+
+ /*
+ * Skylake, Cascacde Lake and Cooper Lake require a quirk on
+ * rep movs.
+ */
+ if (c->x86_vfm == INTEL_SKYLAKE_X)
+ mce_flags.skx_repmov_quirk = 1;
+}
+
+static void zhaoxin_apply_global_quirks(struct cpuinfo_x86 *c)
+{
+ /*
+ * All newer Zhaoxin CPUs support MCE broadcasting. Enable
+ * synchronization with a one second timeout.
+ */
+ if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
+ if (mca_cfg.monarch_timeout < 0)
+ mca_cfg.monarch_timeout = USEC_PER_SEC;
+ }
+}
+
+static bool __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
+{
+ if (c->x86 != 5)
+ return false;
+
+ switch (c->x86_vendor) {
+ case X86_VENDOR_INTEL:
+ intel_p5_mcheck_init(c);
+ mce_flags.p5 = 1;
+ return true;
+ case X86_VENDOR_CENTAUR:
+ winchip_mcheck_init(c);
+ mce_flags.winchip = 1;
+ return true;
+ default:
+ return false;
+ }
+
+ return false;
+}
+
+static void mce_centaur_feature_init(struct cpuinfo_x86 *c)
+{
+ struct mca_config *cfg = &mca_cfg;
+
+ /*
+ * All newer Centaur CPUs support MCE broadcasting. Enable
+ * synchronization with a one second timeout.
+ */
+ if ((c->x86 == 6 && c->x86_model == 0xf && c->x86_stepping >= 0xe) ||
+ c->x86 > 6) {
+ if (cfg->monarch_timeout < 0)
+ cfg->monarch_timeout = USEC_PER_SEC;
+ }
+}
+
+static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c)
+{
+ struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
+
+ /*
+ * These CPUs have MCA bank 8 which reports only one error type called
+ * SVAD (System View Address Decoder). The reporting of that error is
+ * controlled by IA32_MC8.CTL.0.
+ *
+ * If enabled, prefetching on these CPUs will cause SVAD MCE when
+ * virtual machines start and result in a system panic. Always disable
+ * bank 8 SVAD error by default.
+ */
+ if ((c->x86 == 7 && c->x86_model == 0x1b) ||
+ (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
+ if (this_cpu_read(mce_num_banks) > 8)
+ mce_banks[8].ctl = 0;
+ }
+
+ intel_init_cmci();
+ intel_init_lmce();
+}
+
+static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c)
+{
+ intel_clear_lmce();
+}
+
+static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
+{
+ switch (c->x86_vendor) {
+ case X86_VENDOR_INTEL:
+ mce_intel_feature_init(c);
+ break;
+
+ case X86_VENDOR_AMD:
+ case X86_VENDOR_HYGON:
+ mce_amd_feature_init(c);
+ break;
+
+ case X86_VENDOR_CENTAUR:
+ mce_centaur_feature_init(c);
+ break;
+
+ case X86_VENDOR_ZHAOXIN:
+ mce_zhaoxin_feature_init(c);
+ break;
+
+ default:
+ break;
+ }
+}
+
+static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
+{
+ switch (c->x86_vendor) {
+ case X86_VENDOR_INTEL:
+ mce_intel_feature_clear(c);
+ break;
+
+ case X86_VENDOR_ZHAOXIN:
+ mce_zhaoxin_feature_clear(c);
+ break;
+
+ default:
+ break;
+ }
+}
+
+static void mce_start_timer(struct timer_list *t)
+{
+ unsigned long iv = check_interval * HZ;
+
+ if (should_enable_timer(iv)) {
+ this_cpu_write(mce_next_interval, iv);
+ __start_timer(t, iv);
+ }
+}
+
+static void __mcheck_cpu_setup_timer(void)
+{
+ struct timer_list *t = this_cpu_ptr(&mce_timer);
+
+ timer_setup(t, mce_timer_fn, TIMER_PINNED);
+}
+
+static void __mcheck_cpu_init_timer(void)
+{
+ struct timer_list *t = this_cpu_ptr(&mce_timer);
+
+ timer_setup(t, mce_timer_fn, TIMER_PINNED);
+ mce_start_timer(t);
+}
+
+bool filter_mce(struct mce *m)
+{
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ return amd_filter_mce(m);
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ return intel_filter_mce(m);
+
+ return false;
+}
+
+static __always_inline void exc_machine_check_kernel(struct pt_regs *regs)
+{
+ irqentry_state_t irq_state;
+
+ WARN_ON_ONCE(user_mode(regs));
+
+ /*
+ * Only required when from kernel mode. See
+ * mce_check_crashing_cpu() for details.
+ */
+ if (mca_cfg.initialized && mce_check_crashing_cpu())
+ return;
+
+ irq_state = irqentry_nmi_enter(regs);
+
+ do_machine_check(regs);
+
+ irqentry_nmi_exit(regs, irq_state);
+}
+
+static __always_inline void exc_machine_check_user(struct pt_regs *regs)
+{
+ irqentry_enter_from_user_mode(regs);
+
+ do_machine_check(regs);
+
+ irqentry_exit_to_user_mode(regs);
+}
+
+#ifdef CONFIG_X86_64
+/* MCE hit kernel mode */
+DEFINE_IDTENTRY_MCE(exc_machine_check)
+{
+ unsigned long dr7;
+
+ dr7 = local_db_save();
+ exc_machine_check_kernel(regs);
+ local_db_restore(dr7);
+}
+
+/* The user mode variant. */
+DEFINE_IDTENTRY_MCE_USER(exc_machine_check)
+{
+ unsigned long dr7;
+
+ dr7 = local_db_save();
+ exc_machine_check_user(regs);
+ local_db_restore(dr7);
+}
+
+#ifdef CONFIG_X86_FRED
+/*
+ * When occurred on different ring level, i.e., from user or kernel
+ * context, #MCE needs to be handled on different stack: User #MCE
+ * on current task stack, while kernel #MCE on a dedicated stack.
+ *
+ * This is exactly how FRED event delivery invokes an exception
+ * handler: ring 3 event on level 0 stack, i.e., current task stack;
+ * ring 0 event on the #MCE dedicated stack specified in the
+ * IA32_FRED_STKLVLS MSR. So unlike IDT, the FRED machine check entry
+ * stub doesn't do stack switch.
+ */
+DEFINE_FREDENTRY_MCE(exc_machine_check)
+{
+ unsigned long dr7;
+
+ dr7 = local_db_save();
+ if (user_mode(regs))
+ exc_machine_check_user(regs);
+ else
+ exc_machine_check_kernel(regs);
+ local_db_restore(dr7);
+}
+#endif
+#else
+/* 32bit unified entry point */
+DEFINE_IDTENTRY_RAW(exc_machine_check)
+{
+ unsigned long dr7;
+
+ dr7 = local_db_save();
+ if (user_mode(regs))
+ exc_machine_check_user(regs);
+ else
+ exc_machine_check_kernel(regs);
+ local_db_restore(dr7);
+}
+#endif
+
+void mca_bsp_init(struct cpuinfo_x86 *c)
+{
+ u64 cap;
+
+ if (!mce_available(c))
+ return;
+
+ if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
+ mca_cfg.disabled = 1;
+ pr_info("unknown CPU type - not enabling MCE support\n");
+ return;
+ }
+
+ mce_flags.overflow_recov = cpu_feature_enabled(X86_FEATURE_OVERFLOW_RECOV);
+ mce_flags.succor = cpu_feature_enabled(X86_FEATURE_SUCCOR);
+ mce_flags.smca = cpu_feature_enabled(X86_FEATURE_SMCA);
+
+ if (mce_flags.smca)
+ smca_bsp_init();
+
+ rdmsrq(MSR_IA32_MCG_CAP, cap);
+
+ /* Use accurate RIP reporting if available. */
+ if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
+ mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
+
+ if (cap & MCG_SER_P)
+ mca_cfg.ser = 1;
+
+ switch (c->x86_vendor) {
+ case X86_VENDOR_AMD:
+ amd_apply_global_quirks(c);
+ break;
+ case X86_VENDOR_INTEL:
+ intel_apply_global_quirks(c);
+ break;
+ case X86_VENDOR_ZHAOXIN:
+ zhaoxin_apply_global_quirks(c);
+ break;
+ }
+
+ if (mca_cfg.monarch_timeout < 0)
+ mca_cfg.monarch_timeout = 0;
+ if (mca_cfg.bootlog != 0)
+ mca_cfg.panic_timeout = 30;
+}
+
+/*
+ * Called for each booted CPU to set up machine checks.
+ * Must be called with preempt off:
+ */
+void mcheck_cpu_init(struct cpuinfo_x86 *c)
+{
+ if (mca_cfg.disabled)
+ return;
+
+ if (__mcheck_cpu_ancient_init(c))
+ return;
+
+ if (!mce_available(c))
+ return;
+
+ __mcheck_cpu_cap_init();
+
+ if (!mce_gen_pool_init()) {
+ mca_cfg.disabled = 1;
+ pr_emerg("Couldn't allocate MCE records pool!\n");
+ return;
+ }
+
+ mca_cfg.initialized = 1;
+
+ __mcheck_cpu_init_generic();
+ __mcheck_cpu_init_vendor(c);
+ __mcheck_cpu_init_prepare_banks();
+ __mcheck_cpu_setup_timer();
+ cr4_set_bits(X86_CR4_MCE);
+}
+
+/*
+ * Called for each booted CPU to clear some machine checks opt-ins
+ */
+void mcheck_cpu_clear(struct cpuinfo_x86 *c)
+{
+ if (mca_cfg.disabled)
+ return;
+
+ if (!mce_available(c))
+ return;
+
+ /*
+ * Possibly to clear general settings generic to x86
+ * __mcheck_cpu_clear_generic(c);
+ */
+ __mcheck_cpu_clear_vendor(c);
+
+}
+
+static void __mce_disable_bank(void *arg)
+{
+ int bank = *((int *)arg);
+ __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
+ cmci_disable_bank(bank);
+}
+
+void mce_disable_bank(int bank)
+{
+ if (bank >= this_cpu_read(mce_num_banks)) {
+ pr_warn(FW_BUG
+ "Ignoring request to disable invalid MCA bank %d.\n",
+ bank);
+ return;
+ }
+ set_bit(bank, mce_banks_ce_disabled);
+ on_each_cpu(__mce_disable_bank, &bank, 1);
+}
+
+/*
+ * mce=off Disables machine check
+ * mce=no_cmci Disables CMCI
+ * mce=no_lmce Disables LMCE
+ * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
+ * mce=print_all Print all machine check logs to console
+ * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
+ * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
+ * monarchtimeout is how long to wait for other CPUs on machine
+ * check, or 0 to not wait
+ * mce=bootlog Log MCEs from before booting. Disabled by default on AMD Fam10h
+ and older.
+ * mce=nobootlog Don't log MCEs from before booting.
+ * mce=bios_cmci_threshold Don't program the CMCI threshold
+ * mce=recovery force enable copy_mc_fragile()
+ */
+static int __init mcheck_enable(char *str)
+{
+ struct mca_config *cfg = &mca_cfg;
+
+ if (*str == 0) {
+ enable_p5_mce();
+ return 1;
+ }
+ if (*str == '=')
+ str++;
+ if (!strcmp(str, "off"))
+ cfg->disabled = 1;
+ else if (!strcmp(str, "no_cmci"))
+ cfg->cmci_disabled = true;
+ else if (!strcmp(str, "no_lmce"))
+ cfg->lmce_disabled = 1;
+ else if (!strcmp(str, "dont_log_ce"))
+ cfg->dont_log_ce = true;
+ else if (!strcmp(str, "print_all"))
+ cfg->print_all = true;
+ else if (!strcmp(str, "ignore_ce"))
+ cfg->ignore_ce = true;
+ else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
+ cfg->bootlog = (str[0] == 'b');
+ else if (!strcmp(str, "bios_cmci_threshold"))
+ cfg->bios_cmci_threshold = 1;
+ else if (!strcmp(str, "recovery"))
+ cfg->recovery = 1;
+ else if (isdigit(str[0]))
+ get_option(&str, &(cfg->monarch_timeout));
+ else {
+ pr_info("mce argument %s ignored. Please use /sys\n", str);
+ return 0;
+ }
+ return 1;
+}
+__setup("mce", mcheck_enable);
+
+int __init mcheck_init(void)
+{
+ mce_register_decode_chain(&early_nb);
+ mce_register_decode_chain(&mce_uc_nb);
+ mce_register_decode_chain(&mce_default_nb);
+
+ INIT_WORK(&mce_work, mce_gen_pool_process);
+ init_irq_work(&mce_irq_work, mce_irq_work_cb);
+
+ return 0;
+}
+
+/*
+ * mce_syscore: PM support
+ */
+
+/*
+ * Disable machine checks on suspend and shutdown. We can't really handle
+ * them later.
+ */
+static void mce_disable_error_reporting(void)
+{
+ struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
+ int i;
+
+ for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
+ struct mce_bank *b = &mce_banks[i];
+
+ if (b->init)
+ wrmsrq(mca_msr_reg(i, MCA_CTL), 0);
+ }
+ return;
+}
+
+static void vendor_disable_error_reporting(void)
+{
+ /*
+ * Don't clear on Intel or AMD or Hygon or Zhaoxin CPUs. Some of these
+ * MSRs are socket-wide. Disabling them for just a single offlined CPU
+ * is bad, since it will inhibit reporting for all shared resources on
+ * the socket like the last level cache (LLC), the integrated memory
+ * controller (iMC), etc.
+ */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ||
+ boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ||
+ boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
+ boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN)
+ return;
+
+ mce_disable_error_reporting();
+}
+
+static int mce_syscore_suspend(void *data)
+{
+ vendor_disable_error_reporting();
+ return 0;
+}
+
+static void mce_syscore_shutdown(void *data)
+{
+ vendor_disable_error_reporting();
+}
+
+/*
+ * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
+ * Only one CPU is active at this time, the others get re-added later using
+ * CPU hotplug:
+ */
+static void mce_syscore_resume(void *data)
+{
+ __mcheck_cpu_init_generic();
+ __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info));
+ __mcheck_cpu_init_prepare_banks();
+ cr4_set_bits(X86_CR4_MCE);
+}
+
+static const struct syscore_ops mce_syscore_ops = {
+ .suspend = mce_syscore_suspend,
+ .shutdown = mce_syscore_shutdown,
+ .resume = mce_syscore_resume,
+};
+
+static struct syscore mce_syscore = {
+ .ops = &mce_syscore_ops,
+};
+
+/*
+ * mce_device: Sysfs support
+ */
+
+static void mce_cpu_restart(void *data)
+{
+ if (!mce_available(raw_cpu_ptr(&cpu_info)))
+ return;
+ __mcheck_cpu_init_generic();
+ __mcheck_cpu_init_prepare_banks();
+ __mcheck_cpu_init_timer();
+ cr4_set_bits(X86_CR4_MCE);
+}
+
+/* Reinit MCEs after user configuration changes */
+static void mce_restart(void)
+{
+ mce_timer_delete_all();
+ on_each_cpu(mce_cpu_restart, NULL, 1);
+ mce_schedule_work();
+}
+
+/* Toggle features for corrected errors */
+static void mce_disable_cmci(void *data)
+{
+ if (!mce_available(raw_cpu_ptr(&cpu_info)))
+ return;
+ cmci_clear();
+}
+
+static void mce_enable_ce(void *all)
+{
+ if (!mce_available(raw_cpu_ptr(&cpu_info)))
+ return;
+ cmci_reenable();
+ cmci_recheck();
+ if (all)
+ __mcheck_cpu_init_timer();
+}
+
+static const struct bus_type mce_subsys = {
+ .name = "machinecheck",
+ .dev_name = "machinecheck",
+};
+
+DEFINE_PER_CPU(struct device *, mce_device);
+
+static inline struct mce_bank_dev *attr_to_bank(struct device_attribute *attr)
+{
+ return container_of(attr, struct mce_bank_dev, attr);
+}
+
+static ssize_t show_bank(struct device *s, struct device_attribute *attr,
+ char *buf)
+{
+ u8 bank = attr_to_bank(attr)->bank;
+ struct mce_bank *b;
+
+ if (bank >= per_cpu(mce_num_banks, s->id))
+ return -EINVAL;
+
+ b = &per_cpu(mce_banks_array, s->id)[bank];
+
+ if (!b->init)
+ return -ENODEV;
+
+ return sprintf(buf, "%llx\n", b->ctl);
+}
+
+static ssize_t set_bank(struct device *s, struct device_attribute *attr,
+ const char *buf, size_t size)
+{
+ u8 bank = attr_to_bank(attr)->bank;
+ struct mce_bank *b;
+ u64 new;
+
+ if (kstrtou64(buf, 0, &new) < 0)
+ return -EINVAL;
+
+ if (bank >= per_cpu(mce_num_banks, s->id))
+ return -EINVAL;
+
+ b = &per_cpu(mce_banks_array, s->id)[bank];
+ if (!b->init)
+ return -ENODEV;
+
+ b->ctl = new;
+
+ mutex_lock(&mce_sysfs_mutex);
+ mce_restart();
+ mutex_unlock(&mce_sysfs_mutex);
+
+ return size;
+}
+
+static ssize_t set_ignore_ce(struct device *s,
+ struct device_attribute *attr,
+ const char *buf, size_t size)
+{
+ u64 new;
+
+ if (kstrtou64(buf, 0, &new) < 0)
+ return -EINVAL;
+
+ mutex_lock(&mce_sysfs_mutex);
+ if (mca_cfg.ignore_ce ^ !!new) {
+ if (new) {
+ /* disable ce features */
+ mce_timer_delete_all();
+ on_each_cpu(mce_disable_cmci, NULL, 1);
+ mca_cfg.ignore_ce = true;
+ } else {
+ /* enable ce features */
+ mca_cfg.ignore_ce = false;
+ on_each_cpu(mce_enable_ce, (void *)1, 1);
+ }
+ }
+ mutex_unlock(&mce_sysfs_mutex);
+
+ return size;
+}
+
+static ssize_t set_cmci_disabled(struct device *s,
+ struct device_attribute *attr,
+ const char *buf, size_t size)
+{
+ u64 new;
+
+ if (kstrtou64(buf, 0, &new) < 0)
+ return -EINVAL;
+
+ mutex_lock(&mce_sysfs_mutex);
+ if (mca_cfg.cmci_disabled ^ !!new) {
+ if (new) {
+ /* disable cmci */
+ on_each_cpu(mce_disable_cmci, NULL, 1);
+ mca_cfg.cmci_disabled = true;
+ } else {
+ /* enable cmci */
+ mca_cfg.cmci_disabled = false;
+ on_each_cpu(mce_enable_ce, NULL, 1);
+ }
+ }
+ mutex_unlock(&mce_sysfs_mutex);
+
+ return size;
+}
+
+static ssize_t store_int_with_restart(struct device *s,
+ struct device_attribute *attr,
+ const char *buf, size_t size)
+{
+ unsigned long old_check_interval = check_interval;
+ ssize_t ret = device_store_ulong(s, attr, buf, size);
+
+ if (check_interval == old_check_interval)
+ return ret;
+
+ mutex_lock(&mce_sysfs_mutex);
+ mce_restart();
+ mutex_unlock(&mce_sysfs_mutex);
+
+ return ret;
+}
+
+static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
+static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
+static DEVICE_BOOL_ATTR(print_all, 0644, mca_cfg.print_all);
+
+static struct dev_ext_attribute dev_attr_check_interval = {
+ __ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
+ &check_interval
+};
+
+static struct dev_ext_attribute dev_attr_ignore_ce = {
+ __ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
+ &mca_cfg.ignore_ce
+};
+
+static struct dev_ext_attribute dev_attr_cmci_disabled = {
+ __ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
+ &mca_cfg.cmci_disabled
+};
+
+static struct device_attribute *mce_device_attrs[] = {
+ &dev_attr_check_interval.attr,
+#ifdef CONFIG_X86_MCELOG_LEGACY
+ &dev_attr_trigger,
+#endif
+ &dev_attr_monarch_timeout.attr,
+ &dev_attr_dont_log_ce.attr,
+ &dev_attr_print_all.attr,
+ &dev_attr_ignore_ce.attr,
+ &dev_attr_cmci_disabled.attr,
+ NULL
+};
+
+static cpumask_var_t mce_device_initialized;
+
+static void mce_device_release(struct device *dev)
+{
+ kfree(dev);
+}
+
+/* Per CPU device init. All of the CPUs still share the same bank device: */
+static int mce_device_create(unsigned int cpu)
+{
+ struct device *dev;
+ int err;
+ int i, j;
+
+ dev = per_cpu(mce_device, cpu);
+ if (dev)
+ return 0;
+
+ dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+ if (!dev)
+ return -ENOMEM;
+ dev->id = cpu;
+ dev->bus = &mce_subsys;
+ dev->release = &mce_device_release;
+
+ err = device_register(dev);
+ if (err) {
+ put_device(dev);
+ return err;
+ }
+
+ for (i = 0; mce_device_attrs[i]; i++) {
+ err = device_create_file(dev, mce_device_attrs[i]);
+ if (err)
+ goto error;
+ }
+ for (j = 0; j < per_cpu(mce_num_banks, cpu); j++) {
+ err = device_create_file(dev, &mce_bank_devs[j].attr);
+ if (err)
+ goto error2;
+ }
+ cpumask_set_cpu(cpu, mce_device_initialized);
+ per_cpu(mce_device, cpu) = dev;
+
+ return 0;
+error2:
+ while (--j >= 0)
+ device_remove_file(dev, &mce_bank_devs[j].attr);
+error:
+ while (--i >= 0)
+ device_remove_file(dev, mce_device_attrs[i]);
+
+ device_unregister(dev);
+
+ return err;
+}
+
+static void mce_device_remove(unsigned int cpu)
+{
+ struct device *dev = per_cpu(mce_device, cpu);
+ int i;
+
+ if (!cpumask_test_cpu(cpu, mce_device_initialized))
+ return;
+
+ for (i = 0; mce_device_attrs[i]; i++)
+ device_remove_file(dev, mce_device_attrs[i]);
+
+ for (i = 0; i < per_cpu(mce_num_banks, cpu); i++)
+ device_remove_file(dev, &mce_bank_devs[i].attr);
+
+ device_unregister(dev);
+ cpumask_clear_cpu(cpu, mce_device_initialized);
+ per_cpu(mce_device, cpu) = NULL;
+}
+
+/* Make sure there are no machine checks on offlined CPUs. */
+static void mce_disable_cpu(void)
+{
+ if (!mce_available(raw_cpu_ptr(&cpu_info)))
+ return;
+
+ if (!cpuhp_tasks_frozen)
+ cmci_clear();
+
+ vendor_disable_error_reporting();
+}
+
+static void mce_reenable_cpu(void)
+{
+ struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
+ int i;
+
+ if (!mce_available(raw_cpu_ptr(&cpu_info)))
+ return;
+
+ if (!cpuhp_tasks_frozen)
+ cmci_reenable();
+ for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
+ struct mce_bank *b = &mce_banks[i];
+
+ if (b->init)
+ wrmsrq(mca_msr_reg(i, MCA_CTL), b->ctl);
+ }
+}
+
+static int mce_cpu_dead(unsigned int cpu)
+{
+ /* intentionally ignoring frozen here */
+ if (!cpuhp_tasks_frozen)
+ cmci_rediscover();
+ return 0;
+}
+
+static int mce_cpu_online(unsigned int cpu)
+{
+ struct timer_list *t = this_cpu_ptr(&mce_timer);
+
+ mce_device_create(cpu);
+ mce_threshold_create_device(cpu);
+ mce_reenable_cpu();
+ mce_start_timer(t);
+ return 0;
+}
+
+static int mce_cpu_pre_down(unsigned int cpu)
+{
+ struct timer_list *t = this_cpu_ptr(&mce_timer);
+
+ mce_disable_cpu();
+ timer_delete_sync(t);
+ mce_threshold_remove_device(cpu);
+ mce_device_remove(cpu);
+ return 0;
+}
+
+static __init void mce_init_banks(void)
+{
+ int i;
+
+ for (i = 0; i < MAX_NR_BANKS; i++) {
+ struct mce_bank_dev *b = &mce_bank_devs[i];
+ struct device_attribute *a = &b->attr;
+
+ b->bank = i;
+
+ sysfs_attr_init(&a->attr);
+ a->attr.name = b->attrname;
+ snprintf(b->attrname, ATTR_LEN, "bank%d", i);
+
+ a->attr.mode = 0644;
+ a->show = show_bank;
+ a->store = set_bank;
+ }
+}
+
+/*
+ * When running on XEN, this initcall is ordered against the XEN mcelog
+ * initcall:
+ *
+ * device_initcall(xen_late_init_mcelog);
+ * device_initcall_sync(mcheck_init_device);
+ */
+static __init int mcheck_init_device(void)
+{
+ int err;
+
+ /*
+ * Check if we have a spare virtual bit. This will only become
+ * a problem if/when we move beyond 5-level page tables.
+ */
+ MAYBE_BUILD_BUG_ON(__VIRTUAL_MASK_SHIFT >= 63);
+
+ if (!mce_available(&boot_cpu_data)) {
+ err = -EIO;
+ goto err_out;
+ }
+
+ if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+
+ mce_init_banks();
+
+ err = subsys_system_register(&mce_subsys, NULL);
+ if (err)
+ goto err_out_mem;
+
+ err = cpuhp_setup_state(CPUHP_X86_MCE_DEAD, "x86/mce:dead", NULL,
+ mce_cpu_dead);
+ if (err)
+ goto err_out_mem;
+
+ /*
+ * Invokes mce_cpu_online() on all CPUs which are online when
+ * the state is installed.
+ */
+ err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online",
+ mce_cpu_online, mce_cpu_pre_down);
+ if (err < 0)
+ goto err_out_online;
+
+ register_syscore(&mce_syscore);
+
+ return 0;
+
+err_out_online:
+ cpuhp_remove_state(CPUHP_X86_MCE_DEAD);
+
+err_out_mem:
+ free_cpumask_var(mce_device_initialized);
+
+err_out:
+ pr_err("Unable to init MCE device (rc: %d)\n", err);
+
+ return err;
+}
+device_initcall_sync(mcheck_init_device);
+
+/*
+ * Old style boot options parsing. Only for compatibility.
+ */
+static int __init mcheck_disable(char *str)
+{
+ mca_cfg.disabled = 1;
+ return 1;
+}
+__setup("nomce", mcheck_disable);
+
+#ifdef CONFIG_DEBUG_FS
+struct dentry *mce_get_debugfs_dir(void)
+{
+ static struct dentry *dmce;
+
+ if (!dmce)
+ dmce = debugfs_create_dir("mce", NULL);
+
+ return dmce;
+}
+
+static void mce_reset(void)
+{
+ atomic_set(&mce_fake_panicked, 0);
+ atomic_set(&mce_executing, 0);
+ atomic_set(&mce_callin, 0);
+ atomic_set(&global_nwo, 0);
+ cpumask_setall(&mce_missing_cpus);
+}
+
+static int fake_panic_get(void *data, u64 *val)
+{
+ *val = fake_panic;
+ return 0;
+}
+
+static int fake_panic_set(void *data, u64 val)
+{
+ mce_reset();
+ fake_panic = val;
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(fake_panic_fops, fake_panic_get, fake_panic_set,
+ "%llu\n");
+
+static void __init mcheck_debugfs_init(void)
+{
+ struct dentry *dmce;
+
+ dmce = mce_get_debugfs_dir();
+ debugfs_create_file_unsafe("fake_panic", 0444, dmce, NULL,
+ &fake_panic_fops);
+}
+#else
+static void __init mcheck_debugfs_init(void) { }
+#endif
+
+static int __init mcheck_late_init(void)
+{
+ if (mca_cfg.recovery)
+ enable_copy_mc_fragile();
+
+ mcheck_debugfs_init();
+
+ /*
+ * Flush out everything that has been logged during early boot, now that
+ * everything has been initialized (workqueues, decoders, ...).
+ */
+ mce_schedule_work();
+
+ return 0;
+}
+late_initcall(mcheck_late_init);
diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c b/arch/x86/kernel/cpu/mce/dev-mcelog.c
new file mode 100644
index 000000000000..8d023239ce18
--- /dev/null
+++ b/arch/x86/kernel/cpu/mce/dev-mcelog.c
@@ -0,0 +1,365 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * /dev/mcelog driver
+ *
+ * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ * Rest from unknown author(s).
+ * 2004 Andi Kleen. Rewrote most of it.
+ * Copyright 2008 Intel Corporation
+ * Author: Andi Kleen
+ */
+
+#include <linux/miscdevice.h>
+#include <linux/slab.h>
+#include <linux/kmod.h>
+#include <linux/poll.h>
+
+#include "internal.h"
+
+static BLOCKING_NOTIFIER_HEAD(mce_injector_chain);
+
+static DEFINE_MUTEX(mce_chrdev_read_mutex);
+
+static char mce_helper[128];
+static char *mce_helper_argv[2] = { mce_helper, NULL };
+
+/*
+ * Lockless MCE logging infrastructure.
+ * This avoids deadlocks on printk locks without having to break locks. Also
+ * separate MCEs from kernel messages to avoid bogus bug reports.
+ */
+
+static struct mce_log_buffer *mcelog;
+
+static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
+
+static int dev_mce_log(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct mce *mce = (struct mce *)data;
+ unsigned int entry;
+
+ if (mce->kflags & MCE_HANDLED_CEC)
+ return NOTIFY_DONE;
+
+ mutex_lock(&mce_chrdev_read_mutex);
+
+ entry = mcelog->next;
+
+ /*
+ * When the buffer fills up discard new entries. Assume that the
+ * earlier errors are the more interesting ones:
+ */
+ if (entry >= mcelog->len) {
+ set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog->flags);
+ goto unlock;
+ }
+
+ mcelog->next = entry + 1;
+
+ memcpy(mcelog->entry + entry, mce, sizeof(struct mce));
+ mcelog->entry[entry].finished = 1;
+ mcelog->entry[entry].kflags = 0;
+
+ /* wake processes polling /dev/mcelog */
+ wake_up_interruptible(&mce_chrdev_wait);
+
+unlock:
+ mutex_unlock(&mce_chrdev_read_mutex);
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+ mce->kflags |= MCE_HANDLED_MCELOG;
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block dev_mcelog_nb = {
+ .notifier_call = dev_mce_log,
+ .priority = MCE_PRIO_MCELOG,
+};
+
+static void mce_do_trigger(struct work_struct *work)
+{
+ call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
+}
+
+static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
+
+
+void mce_work_trigger(void)
+{
+ if (mce_helper[0])
+ schedule_work(&mce_trigger_work);
+}
+
+static ssize_t
+show_trigger(struct device *s, struct device_attribute *attr, char *buf)
+{
+ strcpy(buf, mce_helper);
+ strcat(buf, "\n");
+ return strlen(mce_helper) + 1;
+}
+
+static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
+ const char *buf, size_t siz)
+{
+ char *p;
+
+ strscpy(mce_helper, buf, sizeof(mce_helper));
+ p = strchr(mce_helper, '\n');
+
+ if (p)
+ *p = 0;
+
+ return strlen(mce_helper) + !!p;
+}
+
+DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
+
+/*
+ * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
+ */
+
+static DEFINE_SPINLOCK(mce_chrdev_state_lock);
+static int mce_chrdev_open_count; /* #times opened */
+static int mce_chrdev_open_exclu; /* already open exclusive? */
+
+static int mce_chrdev_open(struct inode *inode, struct file *file)
+{
+ spin_lock(&mce_chrdev_state_lock);
+
+ if (mce_chrdev_open_exclu ||
+ (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
+ spin_unlock(&mce_chrdev_state_lock);
+
+ return -EBUSY;
+ }
+
+ if (file->f_flags & O_EXCL)
+ mce_chrdev_open_exclu = 1;
+ mce_chrdev_open_count++;
+
+ spin_unlock(&mce_chrdev_state_lock);
+
+ return nonseekable_open(inode, file);
+}
+
+static int mce_chrdev_release(struct inode *inode, struct file *file)
+{
+ spin_lock(&mce_chrdev_state_lock);
+
+ mce_chrdev_open_count--;
+ mce_chrdev_open_exclu = 0;
+
+ spin_unlock(&mce_chrdev_state_lock);
+
+ return 0;
+}
+
+static int mce_apei_read_done;
+
+/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
+static int __mce_read_apei(char __user **ubuf, size_t usize)
+{
+ int rc;
+ u64 record_id;
+ struct mce m;
+
+ if (usize < sizeof(struct mce))
+ return -EINVAL;
+
+ rc = apei_read_mce(&m, &record_id);
+ /* Error or no more MCE record */
+ if (rc <= 0) {
+ mce_apei_read_done = 1;
+ /*
+ * When ERST is disabled, mce_chrdev_read() should return
+ * "no record" instead of "no device."
+ */
+ if (rc == -ENODEV)
+ return 0;
+ return rc;
+ }
+ rc = -EFAULT;
+ if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
+ return rc;
+ /*
+ * In fact, we should have cleared the record after that has
+ * been flushed to the disk or sent to network in
+ * /sbin/mcelog, but we have no interface to support that now,
+ * so just clear it to avoid duplication.
+ */
+ rc = apei_clear_mce(record_id);
+ if (rc) {
+ mce_apei_read_done = 1;
+ return rc;
+ }
+ *ubuf += sizeof(struct mce);
+
+ return 0;
+}
+
+static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
+ size_t usize, loff_t *off)
+{
+ char __user *buf = ubuf;
+ unsigned next;
+ int i, err;
+
+ mutex_lock(&mce_chrdev_read_mutex);
+
+ if (!mce_apei_read_done) {
+ err = __mce_read_apei(&buf, usize);
+ if (err || buf != ubuf)
+ goto out;
+ }
+
+ /* Only supports full reads right now */
+ err = -EINVAL;
+ if (*off != 0 || usize < mcelog->len * sizeof(struct mce))
+ goto out;
+
+ next = mcelog->next;
+ err = 0;
+
+ for (i = 0; i < next; i++) {
+ struct mce *m = &mcelog->entry[i];
+
+ err |= copy_to_user(buf, m, sizeof(*m));
+ buf += sizeof(*m);
+ }
+
+ memset(mcelog->entry, 0, next * sizeof(struct mce));
+ mcelog->next = 0;
+
+ if (err)
+ err = -EFAULT;
+
+out:
+ mutex_unlock(&mce_chrdev_read_mutex);
+
+ return err ? err : buf - ubuf;
+}
+
+static __poll_t mce_chrdev_poll(struct file *file, poll_table *wait)
+{
+ poll_wait(file, &mce_chrdev_wait, wait);
+ if (READ_ONCE(mcelog->next))
+ return EPOLLIN | EPOLLRDNORM;
+ if (!mce_apei_read_done && apei_check_mce())
+ return EPOLLIN | EPOLLRDNORM;
+ return 0;
+}
+
+static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
+ unsigned long arg)
+{
+ int __user *p = (int __user *)arg;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case MCE_GET_RECORD_LEN:
+ return put_user(sizeof(struct mce), p);
+ case MCE_GET_LOG_LEN:
+ return put_user(mcelog->len, p);
+ case MCE_GETCLEAR_FLAGS:
+ return put_user(xchg(&mcelog->flags, 0), p);
+ default:
+ return -ENOTTY;
+ }
+}
+
+void mce_register_injector_chain(struct notifier_block *nb)
+{
+ blocking_notifier_chain_register(&mce_injector_chain, nb);
+}
+EXPORT_SYMBOL_GPL(mce_register_injector_chain);
+
+void mce_unregister_injector_chain(struct notifier_block *nb)
+{
+ blocking_notifier_chain_unregister(&mce_injector_chain, nb);
+}
+EXPORT_SYMBOL_GPL(mce_unregister_injector_chain);
+
+static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
+ size_t usize, loff_t *off)
+{
+ struct mce m;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ /*
+ * There are some cases where real MSR reads could slip
+ * through.
+ */
+ if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA))
+ return -EIO;
+
+ if ((unsigned long)usize > sizeof(struct mce))
+ usize = sizeof(struct mce);
+ if (copy_from_user(&m, ubuf, usize))
+ return -EFAULT;
+
+ if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu))
+ return -EINVAL;
+
+ /*
+ * Need to give user space some time to set everything up,
+ * so do it a jiffy or two later everywhere.
+ */
+ schedule_timeout(2);
+
+ blocking_notifier_call_chain(&mce_injector_chain, 0, &m);
+
+ return usize;
+}
+
+static const struct file_operations mce_chrdev_ops = {
+ .open = mce_chrdev_open,
+ .release = mce_chrdev_release,
+ .read = mce_chrdev_read,
+ .write = mce_chrdev_write,
+ .poll = mce_chrdev_poll,
+ .unlocked_ioctl = mce_chrdev_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
+};
+
+static struct miscdevice mce_chrdev_device = {
+ MISC_MCELOG_MINOR,
+ "mcelog",
+ &mce_chrdev_ops,
+};
+
+static __init int dev_mcelog_init_device(void)
+{
+ int mce_log_len;
+ int err;
+
+ mce_log_len = max(MCE_LOG_MIN_LEN, num_online_cpus());
+ mcelog = kzalloc(struct_size(mcelog, entry, mce_log_len), GFP_KERNEL);
+ if (!mcelog)
+ return -ENOMEM;
+
+ memcpy(mcelog->signature, MCE_LOG_SIGNATURE, sizeof(mcelog->signature));
+ mcelog->len = mce_log_len;
+ mcelog->recordlen = sizeof(struct mce);
+
+ /* register character device /dev/mcelog */
+ err = misc_register(&mce_chrdev_device);
+ if (err) {
+ if (err == -EBUSY)
+ /* Xen dom0 might have registered the device already. */
+ pr_info("Unable to init device /dev/mcelog, already registered");
+ else
+ pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
+
+ kfree(mcelog);
+ return err;
+ }
+
+ mce_register_decode_chain(&dev_mcelog_nb);
+ return 0;
+}
+device_initcall_sync(dev_mcelog_init_device);
diff --git a/arch/x86/kernel/cpu/mce/genpool.c b/arch/x86/kernel/cpu/mce/genpool.c
new file mode 100644
index 000000000000..3ca9c007a666
--- /dev/null
+++ b/arch/x86/kernel/cpu/mce/genpool.c
@@ -0,0 +1,156 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * MCE event pool management in MCE context
+ *
+ * Copyright (C) 2015 Intel Corp.
+ * Author: Chen, Gong <gong.chen@linux.intel.com>
+ */
+#include <linux/smp.h>
+#include <linux/mm.h>
+#include <linux/genalloc.h>
+#include <linux/llist.h>
+#include "internal.h"
+
+/*
+ * printk() is not safe in MCE context. This is a lock-less memory allocator
+ * used to save error information organized in a lock-less list.
+ *
+ * This memory pool is only to be used to save MCE records in MCE context.
+ * MCE events are rare, so a fixed size memory pool should be enough.
+ * Allocate on a sliding scale based on number of CPUs.
+ */
+#define MCE_MIN_ENTRIES 80
+#define MCE_PER_CPU 2
+
+static struct gen_pool *mce_evt_pool;
+static LLIST_HEAD(mce_event_llist);
+
+/*
+ * Compare the record "t" with each of the records on list "l" to see if
+ * an equivalent one is present in the list.
+ */
+static bool is_duplicate_mce_record(struct mce_evt_llist *t, struct mce_evt_llist *l)
+{
+ struct mce_hw_err *err1, *err2;
+ struct mce_evt_llist *node;
+
+ err1 = &t->err;
+
+ llist_for_each_entry(node, &l->llnode, llnode) {
+ err2 = &node->err;
+
+ if (!mce_cmp(&err1->m, &err2->m))
+ return true;
+ }
+ return false;
+}
+
+/*
+ * The system has panicked - we'd like to peruse the list of MCE records
+ * that have been queued, but not seen by anyone yet. The list is in
+ * reverse time order, so we need to reverse it. While doing that we can
+ * also drop duplicate records (these were logged because some banks are
+ * shared between cores or by all threads on a socket).
+ */
+struct llist_node *mce_gen_pool_prepare_records(void)
+{
+ struct llist_node *head;
+ LLIST_HEAD(new_head);
+ struct mce_evt_llist *node, *t;
+
+ head = llist_del_all(&mce_event_llist);
+ if (!head)
+ return NULL;
+
+ /* squeeze out duplicates while reversing order */
+ llist_for_each_entry_safe(node, t, head, llnode) {
+ if (!is_duplicate_mce_record(node, t))
+ llist_add(&node->llnode, &new_head);
+ }
+
+ return new_head.first;
+}
+
+void mce_gen_pool_process(struct work_struct *__unused)
+{
+ struct mce_evt_llist *node, *tmp;
+ struct llist_node *head;
+ struct mce *mce;
+
+ head = llist_del_all(&mce_event_llist);
+ if (!head)
+ return;
+
+ head = llist_reverse_order(head);
+ llist_for_each_entry_safe(node, tmp, head, llnode) {
+ mce = &node->err.m;
+ blocking_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
+ gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node));
+ }
+}
+
+bool mce_gen_pool_empty(void)
+{
+ return llist_empty(&mce_event_llist);
+}
+
+bool mce_gen_pool_add(struct mce_hw_err *err)
+{
+ struct mce_evt_llist *node;
+
+ if (filter_mce(&err->m))
+ return false;
+
+ if (!mce_evt_pool)
+ return false;
+
+ node = (void *)gen_pool_alloc(mce_evt_pool, sizeof(*node));
+ if (!node) {
+ pr_warn_ratelimited("MCE records pool full!\n");
+ return false;
+ }
+
+ memcpy(&node->err, err, sizeof(*err));
+ llist_add(&node->llnode, &mce_event_llist);
+
+ return true;
+}
+
+static bool mce_gen_pool_create(void)
+{
+ int mce_numrecords, mce_poolsz, order;
+ struct gen_pool *gpool;
+ void *mce_pool;
+
+ order = order_base_2(sizeof(struct mce_evt_llist));
+ gpool = gen_pool_create(order, -1);
+ if (!gpool)
+ return false;
+
+ mce_numrecords = max(MCE_MIN_ENTRIES, num_possible_cpus() * MCE_PER_CPU);
+ mce_poolsz = mce_numrecords * (1 << order);
+ mce_pool = kmalloc(mce_poolsz, GFP_KERNEL);
+ if (!mce_pool) {
+ gen_pool_destroy(gpool);
+ return false;
+ }
+
+ if (gen_pool_add(gpool, (unsigned long)mce_pool, mce_poolsz, -1)) {
+ gen_pool_destroy(gpool);
+ kfree(mce_pool);
+ return false;
+ }
+
+ mce_evt_pool = gpool;
+
+ return true;
+}
+
+bool mce_gen_pool_init(void)
+{
+ /* Just init mce_gen_pool once. */
+ if (mce_evt_pool)
+ return true;
+
+ return mce_gen_pool_create();
+}
diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c
new file mode 100644
index 000000000000..d02c4f556cd0
--- /dev/null
+++ b/arch/x86/kernel/cpu/mce/inject.c
@@ -0,0 +1,805 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Machine check injection support.
+ * Copyright 2008 Intel Corporation.
+ *
+ * Authors:
+ * Andi Kleen
+ * Ying Huang
+ *
+ * The AMD part (from mce_amd_inj.c): a simple MCE injection facility
+ * for testing different aspects of the RAS code. This driver should be
+ * built as module so that it can be loaded on production kernels for
+ * testing purposes.
+ *
+ * Copyright (c) 2010-17: Borislav Petkov <bp@alien8.de>
+ * Advanced Micro Devices Inc.
+ */
+
+#include <linux/cpu.h>
+#include <linux/debugfs.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
+#include <linux/pci.h>
+#include <linux/uaccess.h>
+
+#include <asm/amd/nb.h>
+#include <asm/apic.h>
+#include <asm/irq_vectors.h>
+#include <asm/mce.h>
+#include <asm/msr.h>
+#include <asm/nmi.h>
+#include <asm/smp.h>
+
+#include "internal.h"
+
+static bool hw_injection_possible = true;
+
+/*
+ * Collect all the MCi_XXX settings
+ */
+static struct mce i_mce;
+static struct dentry *dfs_inj;
+
+#define MAX_FLAG_OPT_SIZE 4
+#define NBCFG 0x44
+
+enum injection_type {
+ SW_INJ = 0, /* SW injection, simply decode the error */
+ HW_INJ, /* Trigger a #MC */
+ DFR_INT_INJ, /* Trigger Deferred error interrupt */
+ THR_INT_INJ, /* Trigger threshold interrupt */
+ N_INJ_TYPES,
+};
+
+static const char * const flags_options[] = {
+ [SW_INJ] = "sw",
+ [HW_INJ] = "hw",
+ [DFR_INT_INJ] = "df",
+ [THR_INT_INJ] = "th",
+ NULL
+};
+
+/* Set default injection to SW_INJ */
+static enum injection_type inj_type = SW_INJ;
+
+#define MCE_INJECT_SET(reg) \
+static int inj_##reg##_set(void *data, u64 val) \
+{ \
+ struct mce *m = (struct mce *)data; \
+ \
+ m->reg = val; \
+ return 0; \
+}
+
+MCE_INJECT_SET(status);
+MCE_INJECT_SET(misc);
+MCE_INJECT_SET(addr);
+MCE_INJECT_SET(synd);
+
+#define MCE_INJECT_GET(reg) \
+static int inj_##reg##_get(void *data, u64 *val) \
+{ \
+ struct mce *m = (struct mce *)data; \
+ \
+ *val = m->reg; \
+ return 0; \
+}
+
+MCE_INJECT_GET(status);
+MCE_INJECT_GET(misc);
+MCE_INJECT_GET(addr);
+MCE_INJECT_GET(synd);
+MCE_INJECT_GET(ipid);
+
+DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n");
+
+/* Use the user provided IPID value on a sw injection. */
+static int inj_ipid_set(void *data, u64 val)
+{
+ struct mce *m = (struct mce *)data;
+
+ if (cpu_feature_enabled(X86_FEATURE_SMCA)) {
+ if (inj_type == SW_INJ)
+ m->ipid = val;
+ }
+
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(ipid_fops, inj_ipid_get, inj_ipid_set, "%llx\n");
+
+static void setup_inj_struct(struct mce *m)
+{
+ memset(m, 0, sizeof(struct mce));
+
+ m->cpuvendor = boot_cpu_data.x86_vendor;
+ m->time = ktime_get_real_seconds();
+ m->cpuid = cpuid_eax(1);
+ m->microcode = boot_cpu_data.microcode;
+}
+
+/* Update fake mce registers on current CPU. */
+static void inject_mce(struct mce *m)
+{
+ struct mce *i = &per_cpu(injectm, m->extcpu);
+
+ /* Make sure no one reads partially written injectm */
+ i->finished = 0;
+ mb();
+ m->finished = 0;
+ /* First set the fields after finished */
+ i->extcpu = m->extcpu;
+ mb();
+ /* Now write record in order, finished last (except above) */
+ memcpy(i, m, sizeof(struct mce));
+ /* Finally activate it */
+ mb();
+ i->finished = 1;
+}
+
+static void raise_poll(struct mce *m)
+{
+ unsigned long flags;
+ mce_banks_t b;
+
+ memset(&b, 0xff, sizeof(mce_banks_t));
+ local_irq_save(flags);
+ machine_check_poll(0, &b);
+ local_irq_restore(flags);
+ m->finished = 0;
+}
+
+static void raise_exception(struct mce *m, struct pt_regs *pregs)
+{
+ struct pt_regs regs;
+ unsigned long flags;
+
+ if (!pregs) {
+ memset(&regs, 0, sizeof(struct pt_regs));
+ regs.ip = m->ip;
+ regs.cs = m->cs;
+ pregs = &regs;
+ }
+ /* do_machine_check() expects interrupts disabled -- at least */
+ local_irq_save(flags);
+ do_machine_check(pregs);
+ local_irq_restore(flags);
+ m->finished = 0;
+}
+
+static cpumask_var_t mce_inject_cpumask;
+static DEFINE_MUTEX(mce_inject_mutex);
+
+static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs)
+{
+ int cpu = smp_processor_id();
+ struct mce *m = this_cpu_ptr(&injectm);
+ if (!cpumask_test_cpu(cpu, mce_inject_cpumask))
+ return NMI_DONE;
+ cpumask_clear_cpu(cpu, mce_inject_cpumask);
+ if (m->inject_flags & MCJ_EXCEPTION)
+ raise_exception(m, regs);
+ else if (m->status)
+ raise_poll(m);
+ return NMI_HANDLED;
+}
+
+static void mce_irq_ipi(void *info)
+{
+ int cpu = smp_processor_id();
+ struct mce *m = this_cpu_ptr(&injectm);
+
+ if (cpumask_test_cpu(cpu, mce_inject_cpumask) &&
+ m->inject_flags & MCJ_EXCEPTION) {
+ cpumask_clear_cpu(cpu, mce_inject_cpumask);
+ raise_exception(m, NULL);
+ }
+}
+
+/* Inject mce on current CPU */
+static int raise_local(void)
+{
+ struct mce *m = this_cpu_ptr(&injectm);
+ int context = MCJ_CTX(m->inject_flags);
+ int ret = 0;
+ int cpu = m->extcpu;
+
+ if (m->inject_flags & MCJ_EXCEPTION) {
+ pr_info("Triggering MCE exception on CPU %d\n", cpu);
+ switch (context) {
+ case MCJ_CTX_IRQ:
+ /*
+ * Could do more to fake interrupts like
+ * calling irq_enter, but the necessary
+ * machinery isn't exported currently.
+ */
+ fallthrough;
+ case MCJ_CTX_PROCESS:
+ raise_exception(m, NULL);
+ break;
+ default:
+ pr_info("Invalid MCE context\n");
+ ret = -EINVAL;
+ }
+ pr_info("MCE exception done on CPU %d\n", cpu);
+ } else if (m->status) {
+ pr_info("Starting machine check poll CPU %d\n", cpu);
+ raise_poll(m);
+ pr_info("Machine check poll done on CPU %d\n", cpu);
+ } else
+ m->finished = 0;
+
+ return ret;
+}
+
+static void __maybe_unused raise_mce(struct mce *m)
+{
+ int context = MCJ_CTX(m->inject_flags);
+
+ inject_mce(m);
+
+ if (context == MCJ_CTX_RANDOM)
+ return;
+
+ if (m->inject_flags & (MCJ_IRQ_BROADCAST | MCJ_NMI_BROADCAST)) {
+ unsigned long start;
+ int cpu;
+
+ cpus_read_lock();
+ cpumask_copy(mce_inject_cpumask, cpu_online_mask);
+ cpumask_clear_cpu(get_cpu(), mce_inject_cpumask);
+ for_each_online_cpu(cpu) {
+ struct mce *mcpu = &per_cpu(injectm, cpu);
+ if (!mcpu->finished ||
+ MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM)
+ cpumask_clear_cpu(cpu, mce_inject_cpumask);
+ }
+ if (!cpumask_empty(mce_inject_cpumask)) {
+ if (m->inject_flags & MCJ_IRQ_BROADCAST) {
+ /*
+ * don't wait because mce_irq_ipi is necessary
+ * to be sync with following raise_local
+ */
+ preempt_disable();
+ smp_call_function_many(mce_inject_cpumask,
+ mce_irq_ipi, NULL, 0);
+ preempt_enable();
+ } else if (m->inject_flags & MCJ_NMI_BROADCAST)
+ __apic_send_IPI_mask(mce_inject_cpumask, NMI_VECTOR);
+ }
+ start = jiffies;
+ while (!cpumask_empty(mce_inject_cpumask)) {
+ if (!time_before(jiffies, start + 2*HZ)) {
+ pr_err("Timeout waiting for mce inject %lx\n",
+ *cpumask_bits(mce_inject_cpumask));
+ break;
+ }
+ cpu_relax();
+ }
+ raise_local();
+ put_cpu();
+ cpus_read_unlock();
+ } else {
+ preempt_disable();
+ raise_local();
+ preempt_enable();
+ }
+}
+
+static int mce_inject_raise(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct mce *m = (struct mce *)data;
+
+ if (!m)
+ return NOTIFY_DONE;
+
+ mutex_lock(&mce_inject_mutex);
+ raise_mce(m);
+ mutex_unlock(&mce_inject_mutex);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block inject_nb = {
+ .notifier_call = mce_inject_raise,
+};
+
+/*
+ * Caller needs to be make sure this cpu doesn't disappear
+ * from under us, i.e.: get_cpu/put_cpu.
+ */
+static int toggle_hw_mce_inject(unsigned int cpu, bool enable)
+{
+ u32 l, h;
+ int err;
+
+ err = rdmsr_on_cpu(cpu, MSR_K7_HWCR, &l, &h);
+ if (err) {
+ pr_err("%s: error reading HWCR\n", __func__);
+ return err;
+ }
+
+ enable ? (l |= BIT(18)) : (l &= ~BIT(18));
+
+ err = wrmsr_on_cpu(cpu, MSR_K7_HWCR, l, h);
+ if (err)
+ pr_err("%s: error writing HWCR\n", __func__);
+
+ return err;
+}
+
+static int __set_inj(const char *buf)
+{
+ int i;
+
+ for (i = 0; i < N_INJ_TYPES; i++) {
+ if (!strncmp(flags_options[i], buf, strlen(flags_options[i]))) {
+ if (i > SW_INJ && !hw_injection_possible)
+ continue;
+ inj_type = i;
+ return 0;
+ }
+ }
+ return -EINVAL;
+}
+
+static ssize_t flags_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[MAX_FLAG_OPT_SIZE];
+ int n;
+
+ n = sprintf(buf, "%s\n", flags_options[inj_type]);
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, n);
+}
+
+static ssize_t flags_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[MAX_FLAG_OPT_SIZE], *__buf;
+ int err;
+
+ if (!cnt || cnt > MAX_FLAG_OPT_SIZE)
+ return -EINVAL;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt - 1] = 0;
+
+ /* strip whitespace */
+ __buf = strstrip(buf);
+
+ err = __set_inj(__buf);
+ if (err) {
+ pr_err("%s: Invalid flags value: %s\n", __func__, __buf);
+ return err;
+ }
+
+ *ppos += cnt;
+
+ return cnt;
+}
+
+static const struct file_operations flags_fops = {
+ .read = flags_read,
+ .write = flags_write,
+ .llseek = generic_file_llseek,
+};
+
+/*
+ * On which CPU to inject?
+ */
+MCE_INJECT_GET(extcpu);
+
+static int inj_extcpu_set(void *data, u64 val)
+{
+ struct mce *m = (struct mce *)data;
+
+ if (val >= nr_cpu_ids || !cpu_online(val)) {
+ pr_err("%s: Invalid CPU: %llu\n", __func__, val);
+ return -EINVAL;
+ }
+ m->extcpu = val;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(extcpu_fops, inj_extcpu_get, inj_extcpu_set, "%llu\n");
+
+static void trigger_mce(void *info)
+{
+ asm volatile("int $18");
+}
+
+static void trigger_dfr_int(void *info)
+{
+ asm volatile("int %0" :: "i" (DEFERRED_ERROR_VECTOR));
+}
+
+static void trigger_thr_int(void *info)
+{
+ asm volatile("int %0" :: "i" (THRESHOLD_APIC_VECTOR));
+}
+
+static u32 get_nbc_for_node(int node_id)
+{
+ u32 cores_per_node;
+
+ cores_per_node = topology_num_threads_per_package() / topology_amd_nodes_per_pkg();
+ return cores_per_node * node_id;
+}
+
+static void toggle_nb_mca_mst_cpu(u16 nid)
+{
+ struct amd_northbridge *nb;
+ struct pci_dev *F3;
+ u32 val;
+ int err;
+
+ nb = node_to_amd_nb(nid);
+ if (!nb)
+ return;
+
+ F3 = nb->misc;
+ if (!F3)
+ return;
+
+ err = pci_read_config_dword(F3, NBCFG, &val);
+ if (err) {
+ pr_err("%s: Error reading F%dx%03x.\n",
+ __func__, PCI_FUNC(F3->devfn), NBCFG);
+ return;
+ }
+
+ if (val & BIT(27))
+ return;
+
+ pr_err("%s: Set D18F3x44[NbMcaToMstCpuEn] which BIOS hasn't done.\n",
+ __func__);
+
+ val |= BIT(27);
+ err = pci_write_config_dword(F3, NBCFG, val);
+ if (err)
+ pr_err("%s: Error writing F%dx%03x.\n",
+ __func__, PCI_FUNC(F3->devfn), NBCFG);
+}
+
+static void prepare_msrs(void *info)
+{
+ struct mce m = *(struct mce *)info;
+ u8 b = m.bank;
+
+ wrmsrq(MSR_IA32_MCG_STATUS, m.mcgstatus);
+
+ if (boot_cpu_has(X86_FEATURE_SMCA)) {
+ if (m.inject_flags == DFR_INT_INJ) {
+ wrmsrq(MSR_AMD64_SMCA_MCx_DESTAT(b), m.status);
+ wrmsrq(MSR_AMD64_SMCA_MCx_DEADDR(b), m.addr);
+ } else {
+ wrmsrq(MSR_AMD64_SMCA_MCx_STATUS(b), m.status);
+ wrmsrq(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr);
+ }
+
+ wrmsrq(MSR_AMD64_SMCA_MCx_SYND(b), m.synd);
+
+ if (m.misc)
+ wrmsrq(MSR_AMD64_SMCA_MCx_MISC(b), m.misc);
+ } else {
+ wrmsrq(MSR_IA32_MCx_STATUS(b), m.status);
+ wrmsrq(MSR_IA32_MCx_ADDR(b), m.addr);
+
+ if (m.misc)
+ wrmsrq(MSR_IA32_MCx_MISC(b), m.misc);
+ }
+}
+
+static void do_inject(void)
+{
+ unsigned int cpu = i_mce.extcpu;
+ struct mce_hw_err err;
+ u64 mcg_status = 0;
+ u8 b = i_mce.bank;
+
+ i_mce.tsc = rdtsc_ordered();
+
+ i_mce.status |= MCI_STATUS_VAL;
+
+ if (i_mce.misc)
+ i_mce.status |= MCI_STATUS_MISCV;
+
+ if (i_mce.synd)
+ i_mce.status |= MCI_STATUS_SYNDV;
+
+ if (inj_type == SW_INJ) {
+ err.m = i_mce;
+ mce_log(&err);
+ return;
+ }
+
+ /* prep MCE global settings for the injection */
+ mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV;
+
+ if (!(i_mce.status & MCI_STATUS_PCC))
+ mcg_status |= MCG_STATUS_RIPV;
+
+ /*
+ * Ensure necessary status bits for deferred errors:
+ * - MCx_STATUS[Deferred]: make sure it is a deferred error
+ * - MCx_STATUS[UC] cleared: deferred errors are _not_ UC
+ */
+ if (inj_type == DFR_INT_INJ) {
+ i_mce.status |= MCI_STATUS_DEFERRED;
+ i_mce.status &= ~MCI_STATUS_UC;
+ }
+
+ /*
+ * For multi node CPUs, logging and reporting of bank 4 errors happens
+ * only on the node base core. Refer to D18F3x44[NbMcaToMstCpuEn] for
+ * Fam10h and later BKDGs.
+ */
+ if (boot_cpu_has(X86_FEATURE_AMD_DCM) &&
+ b == 4 &&
+ boot_cpu_data.x86 < 0x17) {
+ toggle_nb_mca_mst_cpu(topology_amd_node_id(cpu));
+ cpu = get_nbc_for_node(topology_amd_node_id(cpu));
+ }
+
+ cpus_read_lock();
+ if (!cpu_online(cpu))
+ goto err;
+
+ toggle_hw_mce_inject(cpu, true);
+
+ i_mce.mcgstatus = mcg_status;
+ i_mce.inject_flags = inj_type;
+ smp_call_function_single(cpu, prepare_msrs, &i_mce, 0);
+
+ toggle_hw_mce_inject(cpu, false);
+
+ switch (inj_type) {
+ case DFR_INT_INJ:
+ smp_call_function_single(cpu, trigger_dfr_int, NULL, 0);
+ break;
+ case THR_INT_INJ:
+ smp_call_function_single(cpu, trigger_thr_int, NULL, 0);
+ break;
+ default:
+ smp_call_function_single(cpu, trigger_mce, NULL, 0);
+ }
+
+err:
+ cpus_read_unlock();
+
+}
+
+/*
+ * This denotes into which bank we're injecting and triggers
+ * the injection, at the same time.
+ */
+static int inj_bank_set(void *data, u64 val)
+{
+ struct mce *m = (struct mce *)data;
+ u8 n_banks;
+ u64 cap;
+
+ /* Get bank count on target CPU so we can handle non-uniform values. */
+ rdmsrq_on_cpu(m->extcpu, MSR_IA32_MCG_CAP, &cap);
+ n_banks = cap & MCG_BANKCNT_MASK;
+
+ if (val >= n_banks) {
+ pr_err("MCA bank %llu non-existent on CPU%d\n", val, m->extcpu);
+ return -EINVAL;
+ }
+
+ m->bank = val;
+
+ /*
+ * sw-only injection allows to write arbitrary values into the MCA
+ * registers because it tests only the decoding paths.
+ */
+ if (inj_type == SW_INJ)
+ goto inject;
+
+ /*
+ * Read IPID value to determine if a bank is populated on the target
+ * CPU.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_SMCA)) {
+ u64 ipid;
+
+ if (rdmsrq_on_cpu(m->extcpu, MSR_AMD64_SMCA_MCx_IPID(val), &ipid)) {
+ pr_err("Error reading IPID on CPU%d\n", m->extcpu);
+ return -EINVAL;
+ }
+
+ if (!ipid) {
+ pr_err("Cannot inject into unpopulated bank %llu\n", val);
+ return -ENODEV;
+ }
+ }
+
+inject:
+ do_inject();
+
+ /* Reset injection struct */
+ setup_inj_struct(&i_mce);
+
+ return 0;
+}
+
+MCE_INJECT_GET(bank);
+
+DEFINE_SIMPLE_ATTRIBUTE(bank_fops, inj_bank_get, inj_bank_set, "%llu\n");
+
+static const char readme_msg[] =
+"Description of the files and their usages:\n"
+"\n"
+"Note1: i refers to the bank number below.\n"
+"Note2: See respective BKDGs for the exact bit definitions of the files below\n"
+"as they mirror the hardware registers.\n"
+"\n"
+"status:\t Set MCi_STATUS: the bits in that MSR control the error type and\n"
+"\t attributes of the error which caused the MCE.\n"
+"\n"
+"misc:\t Set MCi_MISC: provide auxiliary info about the error. It is mostly\n"
+"\t used for error thresholding purposes and its validity is indicated by\n"
+"\t MCi_STATUS[MiscV].\n"
+"\n"
+"synd:\t Set MCi_SYND: provide syndrome info about the error. Only valid on\n"
+"\t Scalable MCA systems, and its validity is indicated by MCi_STATUS[SyndV].\n"
+"\n"
+"addr:\t Error address value to be written to MCi_ADDR. Log address information\n"
+"\t associated with the error.\n"
+"\n"
+"cpu:\t The CPU to inject the error on.\n"
+"\n"
+"bank:\t Specify the bank you want to inject the error into: the number of\n"
+"\t banks in a processor varies and is family/model-specific, therefore, the\n"
+"\t supplied value is sanity-checked. Setting the bank value also triggers the\n"
+"\t injection.\n"
+"\n"
+"flags:\t Injection type to be performed. Writing to this file will trigger a\n"
+"\t real machine check, an APIC interrupt or invoke the error decoder routines\n"
+"\t for AMD processors.\n"
+"\n"
+"\t Allowed error injection types:\n"
+"\t - \"sw\": Software error injection. Decode error to a human-readable \n"
+"\t format only. Safe to use.\n"
+"\t - \"hw\": Hardware error injection. Causes the #MC exception handler to \n"
+"\t handle the error. Be warned: might cause system panic if MCi_STATUS[PCC] \n"
+"\t is set. Therefore, consider setting (debugfs_mountpoint)/mce/fake_panic \n"
+"\t before injecting.\n"
+"\t - \"df\": Trigger APIC interrupt for Deferred error. Causes deferred \n"
+"\t error APIC interrupt handler to handle the error if the feature is \n"
+"\t is present in hardware. \n"
+"\t - \"th\": Trigger APIC interrupt for Threshold errors. Causes threshold \n"
+"\t APIC interrupt handler to handle the error. \n"
+"\n"
+"ipid:\t IPID (AMD-specific)\n"
+"\n";
+
+static ssize_t
+inj_readme_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return simple_read_from_buffer(ubuf, cnt, ppos,
+ readme_msg, strlen(readme_msg));
+}
+
+static const struct file_operations readme_fops = {
+ .read = inj_readme_read,
+};
+
+static struct dfs_node {
+ char *name;
+ const struct file_operations *fops;
+ umode_t perm;
+} dfs_fls[] = {
+ { .name = "status", .fops = &status_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "misc", .fops = &misc_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "addr", .fops = &addr_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "synd", .fops = &synd_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "ipid", .fops = &ipid_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "bank", .fops = &bank_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "flags", .fops = &flags_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "cpu", .fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "README", .fops = &readme_fops, .perm = S_IRUSR | S_IRGRP | S_IROTH },
+};
+
+static void __init debugfs_init(void)
+{
+ unsigned int i;
+
+ dfs_inj = debugfs_create_dir("mce-inject", NULL);
+
+ for (i = 0; i < ARRAY_SIZE(dfs_fls); i++)
+ debugfs_create_file(dfs_fls[i].name, dfs_fls[i].perm, dfs_inj,
+ &i_mce, dfs_fls[i].fops);
+}
+
+static void check_hw_inj_possible(void)
+{
+ int cpu;
+ u8 bank;
+
+ /*
+ * This behavior exists only on SMCA systems though its not directly
+ * related to SMCA.
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_SMCA))
+ return;
+
+ cpu = get_cpu();
+
+ for (bank = 0; bank < MAX_NR_BANKS; ++bank) {
+ u64 status = MCI_STATUS_VAL, ipid;
+
+ /* Check whether bank is populated */
+ rdmsrq(MSR_AMD64_SMCA_MCx_IPID(bank), ipid);
+ if (!ipid)
+ continue;
+
+ toggle_hw_mce_inject(cpu, true);
+
+ wrmsrq_safe(mca_msr_reg(bank, MCA_STATUS), status);
+ rdmsrq_safe(mca_msr_reg(bank, MCA_STATUS), &status);
+ wrmsrq_safe(mca_msr_reg(bank, MCA_STATUS), 0);
+
+ if (!status) {
+ hw_injection_possible = false;
+ pr_warn("Platform does not allow *hardware* error injection."
+ "Try using APEI EINJ instead.\n");
+ }
+
+ toggle_hw_mce_inject(cpu, false);
+
+ break;
+ }
+
+ put_cpu();
+}
+
+static int __init inject_init(void)
+{
+ if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL))
+ return -ENOMEM;
+
+ check_hw_inj_possible();
+
+ debugfs_init();
+
+ register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, "mce_notify");
+ mce_register_injector_chain(&inject_nb);
+
+ setup_inj_struct(&i_mce);
+
+ pr_info("Machine check injector initialized\n");
+
+ return 0;
+}
+
+static void __exit inject_exit(void)
+{
+
+ mce_unregister_injector_chain(&inject_nb);
+ unregister_nmi_handler(NMI_LOCAL, "mce_notify");
+
+ debugfs_remove_recursive(dfs_inj);
+ dfs_inj = NULL;
+
+ memset(&dfs_fls, 0, sizeof(dfs_fls));
+
+ free_cpumask_var(mce_inject_cpumask);
+}
+
+module_init(inject_init);
+module_exit(inject_exit);
+MODULE_DESCRIPTION("Machine check injection support");
+MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
new file mode 100644
index 000000000000..4655223ba560
--- /dev/null
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -0,0 +1,537 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel specific MCE features.
+ * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
+ * Copyright (C) 2008, 2009 Intel Corporation
+ * Author: Andi Kleen
+ */
+
+#include <linux/gfp.h>
+#include <linux/interrupt.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/cpumask.h>
+#include <asm/apic.h>
+#include <asm/cpufeature.h>
+#include <asm/cpu_device_id.h>
+#include <asm/processor.h>
+#include <asm/msr.h>
+#include <asm/mce.h>
+
+#include "internal.h"
+
+/*
+ * Support for Intel Correct Machine Check Interrupts. This allows
+ * the CPU to raise an interrupt when a corrected machine check happened.
+ * Normally we pick those up using a regular polling timer.
+ * Also supports reliable discovery of shared banks.
+ */
+
+/*
+ * CMCI can be delivered to multiple cpus that share a machine check bank
+ * so we need to designate a single cpu to process errors logged in each bank
+ * in the interrupt handler (otherwise we would have many races and potential
+ * double reporting of the same error).
+ * Note that this can change when a cpu is offlined or brought online since
+ * some MCA banks are shared across cpus. When a cpu is offlined, cmci_clear()
+ * disables CMCI on all banks owned by the cpu and clears this bitfield. At
+ * this point, cmci_rediscover() kicks in and a different cpu may end up
+ * taking ownership of some of the shared MCA banks that were previously
+ * owned by the offlined cpu.
+ */
+static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
+
+/*
+ * cmci_discover_lock protects against parallel discovery attempts
+ * which could race against each other.
+ */
+static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
+
+/*
+ * On systems that do support CMCI but it's disabled, polling for MCEs can
+ * cause the same event to be reported multiple times because IA32_MCi_STATUS
+ * is shared by the same package.
+ */
+static DEFINE_SPINLOCK(cmci_poll_lock);
+
+/* Linux non-storm CMCI threshold (may be overridden by BIOS) */
+#define CMCI_THRESHOLD 1
+
+/*
+ * MCi_CTL2 threshold for each bank when there is no storm.
+ * Default value for each bank may have been set by BIOS.
+ */
+static u16 cmci_threshold[MAX_NR_BANKS];
+
+/*
+ * High threshold to limit CMCI rate during storms. Max supported is
+ * 0x7FFF. Use this slightly smaller value so it has a distinctive
+ * signature when some asks "Why am I not seeing all corrected errors?"
+ * A high threshold is used instead of just disabling CMCI for a
+ * bank because both corrected and uncorrected errors may be logged
+ * in the same bank and signalled with CMCI. The threshold only applies
+ * to corrected errors, so keeping CMCI enabled means that uncorrected
+ * errors will still be processed in a timely fashion.
+ */
+#define CMCI_STORM_THRESHOLD 32749
+
+static bool cmci_supported(int *banks)
+{
+ u64 cap;
+
+ if (mca_cfg.cmci_disabled || mca_cfg.ignore_ce)
+ return false;
+
+ /*
+ * Vendor check is not strictly needed, but the initial
+ * initialization is vendor keyed and this
+ * makes sure none of the backdoors are entered otherwise.
+ */
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN)
+ return false;
+
+ if (!boot_cpu_has(X86_FEATURE_APIC) || lapic_get_maxlvt() < 6)
+ return false;
+
+ rdmsrq(MSR_IA32_MCG_CAP, cap);
+ *banks = min_t(unsigned, MAX_NR_BANKS, cap & MCG_BANKCNT_MASK);
+ return !!(cap & MCG_CMCI_P);
+}
+
+static bool lmce_supported(void)
+{
+ u64 tmp;
+
+ if (mca_cfg.lmce_disabled)
+ return false;
+
+ rdmsrq(MSR_IA32_MCG_CAP, tmp);
+
+ /*
+ * LMCE depends on recovery support in the processor. Hence both
+ * MCG_SER_P and MCG_LMCE_P should be present in MCG_CAP.
+ */
+ if ((tmp & (MCG_SER_P | MCG_LMCE_P)) !=
+ (MCG_SER_P | MCG_LMCE_P))
+ return false;
+
+ /*
+ * BIOS should indicate support for LMCE by setting bit 20 in
+ * IA32_FEAT_CTL without which touching MCG_EXT_CTL will generate a #GP
+ * fault. The MSR must also be locked for LMCE_ENABLED to take effect.
+ * WARN if the MSR isn't locked as init_ia32_feat_ctl() unconditionally
+ * locks the MSR in the event that it wasn't already locked by BIOS.
+ */
+ rdmsrq(MSR_IA32_FEAT_CTL, tmp);
+ if (WARN_ON_ONCE(!(tmp & FEAT_CTL_LOCKED)))
+ return false;
+
+ return tmp & FEAT_CTL_LMCE_ENABLED;
+}
+
+/*
+ * Set a new CMCI threshold value. Preserve the state of the
+ * MCI_CTL2_CMCI_EN bit in case this happens during a
+ * cmci_rediscover() operation.
+ */
+static void cmci_set_threshold(int bank, int thresh)
+{
+ unsigned long flags;
+ u64 val;
+
+ raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+ rdmsrq(MSR_IA32_MCx_CTL2(bank), val);
+ val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
+ wrmsrq(MSR_IA32_MCx_CTL2(bank), val | thresh);
+ raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
+}
+
+void mce_intel_handle_storm(int bank, bool on)
+{
+ if (on)
+ cmci_set_threshold(bank, CMCI_STORM_THRESHOLD);
+ else
+ cmci_set_threshold(bank, cmci_threshold[bank]);
+}
+
+/*
+ * The interrupt handler. This is called on every event.
+ * Just call the poller directly to log any events.
+ * This could in theory increase the threshold under high load,
+ * but doesn't for now.
+ */
+static void intel_threshold_interrupt(void)
+{
+ machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
+}
+
+/*
+ * Check all the reasons why current CPU cannot claim
+ * ownership of a bank.
+ * 1: CPU already owns this bank
+ * 2: BIOS owns this bank
+ * 3: Some other CPU owns this bank
+ */
+static bool cmci_skip_bank(int bank, u64 *val)
+{
+ unsigned long *owned = (void *)this_cpu_ptr(&mce_banks_owned);
+
+ if (test_bit(bank, owned))
+ return true;
+
+ /* Skip banks in firmware first mode */
+ if (test_bit(bank, mce_banks_ce_disabled))
+ return true;
+
+ rdmsrq(MSR_IA32_MCx_CTL2(bank), *val);
+
+ /* Already owned by someone else? */
+ if (*val & MCI_CTL2_CMCI_EN) {
+ clear_bit(bank, owned);
+ __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Decide which CMCI interrupt threshold to use:
+ * 1: If this bank is in storm mode from whichever CPU was
+ * the previous owner, stay in storm mode.
+ * 2: If ignoring any threshold set by BIOS, set Linux default
+ * 3: Try to honor BIOS threshold (unless buggy BIOS set it at zero).
+ */
+static u64 cmci_pick_threshold(u64 val, int *bios_zero_thresh)
+{
+ if ((val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD)
+ return val;
+
+ if (!mca_cfg.bios_cmci_threshold) {
+ val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
+ val |= CMCI_THRESHOLD;
+ } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) {
+ /*
+ * If bios_cmci_threshold boot option was specified
+ * but the threshold is zero, we'll try to initialize
+ * it to 1.
+ */
+ *bios_zero_thresh = 1;
+ val |= CMCI_THRESHOLD;
+ }
+
+ return val;
+}
+
+/*
+ * Try to claim ownership of a bank.
+ */
+static void cmci_claim_bank(int bank, u64 val, int bios_zero_thresh, int *bios_wrong_thresh)
+{
+ struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
+
+ val |= MCI_CTL2_CMCI_EN;
+ wrmsrq(MSR_IA32_MCx_CTL2(bank), val);
+ rdmsrq(MSR_IA32_MCx_CTL2(bank), val);
+
+ /* If the enable bit did not stick, this bank should be polled. */
+ if (!(val & MCI_CTL2_CMCI_EN)) {
+ WARN_ON(!test_bit(bank, this_cpu_ptr(mce_poll_banks)));
+ storm->banks[bank].poll_only = true;
+ return;
+ }
+
+ /* This CPU successfully set the enable bit. */
+ set_bit(bank, (void *)this_cpu_ptr(&mce_banks_owned));
+
+ if ((val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD) {
+ pr_notice("CPU%d BANK%d CMCI inherited storm\n", smp_processor_id(), bank);
+ mce_inherit_storm(bank);
+ cmci_storm_begin(bank);
+ } else {
+ __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
+ }
+
+ /*
+ * We are able to set thresholds for some banks that
+ * had a threshold of 0. This means the BIOS has not
+ * set the thresholds properly or does not work with
+ * this boot option. Note down now and report later.
+ */
+ if (mca_cfg.bios_cmci_threshold && bios_zero_thresh &&
+ (val & MCI_CTL2_CMCI_THRESHOLD_MASK))
+ *bios_wrong_thresh = 1;
+
+ /* Save default threshold for each bank */
+ if (cmci_threshold[bank] == 0)
+ cmci_threshold[bank] = val & MCI_CTL2_CMCI_THRESHOLD_MASK;
+}
+
+/*
+ * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
+ * on this CPU. Use the algorithm recommended in the SDM to discover shared
+ * banks. Called during initial bootstrap, and also for hotplug CPU operations
+ * to rediscover/reassign machine check banks.
+ */
+static void cmci_discover(int banks)
+{
+ int bios_wrong_thresh = 0;
+ unsigned long flags;
+ int i;
+
+ raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+ for (i = 0; i < banks; i++) {
+ u64 val;
+ int bios_zero_thresh = 0;
+
+ if (cmci_skip_bank(i, &val))
+ continue;
+
+ val = cmci_pick_threshold(val, &bios_zero_thresh);
+ cmci_claim_bank(i, val, bios_zero_thresh, &bios_wrong_thresh);
+ }
+ raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
+ if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) {
+ pr_info_once(
+ "bios_cmci_threshold: Some banks do not have valid thresholds set\n");
+ pr_info_once(
+ "bios_cmci_threshold: Make sure your BIOS supports this boot option\n");
+ }
+}
+
+/*
+ * Just in case we missed an event during initialization check
+ * all the CMCI owned banks.
+ */
+void cmci_recheck(void)
+{
+ unsigned long flags;
+ int banks;
+
+ if (!mce_available(raw_cpu_ptr(&cpu_info)) || !cmci_supported(&banks))
+ return;
+
+ local_irq_save(flags);
+ machine_check_poll(0, this_cpu_ptr(&mce_banks_owned));
+ local_irq_restore(flags);
+}
+
+/* Caller must hold the lock on cmci_discover_lock */
+static void __cmci_disable_bank(int bank)
+{
+ u64 val;
+
+ if (!test_bit(bank, this_cpu_ptr(mce_banks_owned)))
+ return;
+ rdmsrq(MSR_IA32_MCx_CTL2(bank), val);
+ val &= ~MCI_CTL2_CMCI_EN;
+ wrmsrq(MSR_IA32_MCx_CTL2(bank), val);
+ __clear_bit(bank, this_cpu_ptr(mce_banks_owned));
+
+ if ((val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD)
+ cmci_storm_end(bank);
+}
+
+/*
+ * Disable CMCI on this CPU for all banks it owns when it goes down.
+ * This allows other CPUs to claim the banks on rediscovery.
+ */
+void cmci_clear(void)
+{
+ unsigned long flags;
+ int i;
+ int banks;
+
+ if (!cmci_supported(&banks))
+ return;
+ raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+ for (i = 0; i < banks; i++)
+ __cmci_disable_bank(i);
+ raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
+}
+
+static void cmci_rediscover_work_func(void *arg)
+{
+ int banks;
+
+ /* Recheck banks in case CPUs don't all have the same */
+ if (cmci_supported(&banks))
+ cmci_discover(banks);
+}
+
+/* After a CPU went down cycle through all the others and rediscover */
+void cmci_rediscover(void)
+{
+ int banks;
+
+ if (!cmci_supported(&banks))
+ return;
+
+ on_each_cpu(cmci_rediscover_work_func, NULL, 1);
+}
+
+/*
+ * Reenable CMCI on this CPU in case a CPU down failed.
+ */
+void cmci_reenable(void)
+{
+ int banks;
+ if (cmci_supported(&banks))
+ cmci_discover(banks);
+}
+
+void cmci_disable_bank(int bank)
+{
+ int banks;
+ unsigned long flags;
+
+ if (!cmci_supported(&banks))
+ return;
+
+ raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+ __cmci_disable_bank(bank);
+ raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
+}
+
+/* Bank polling function when CMCI is disabled. */
+static void cmci_mc_poll_banks(void)
+{
+ spin_lock(&cmci_poll_lock);
+ machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
+ spin_unlock(&cmci_poll_lock);
+}
+
+void intel_init_cmci(void)
+{
+ int banks;
+
+ if (!cmci_supported(&banks)) {
+ mc_poll_banks = cmci_mc_poll_banks;
+ return;
+ }
+
+ mce_threshold_vector = intel_threshold_interrupt;
+ cmci_discover(banks);
+ /*
+ * For CPU #0 this runs with still disabled APIC, but that's
+ * ok because only the vector is set up. We still do another
+ * check for the banks later for CPU #0 just to make sure
+ * to not miss any events.
+ */
+ apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED);
+ cmci_recheck();
+}
+
+void intel_init_lmce(void)
+{
+ u64 val;
+
+ if (!lmce_supported())
+ return;
+
+ rdmsrq(MSR_IA32_MCG_EXT_CTL, val);
+
+ if (!(val & MCG_EXT_CTL_LMCE_EN))
+ wrmsrq(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN);
+}
+
+void intel_clear_lmce(void)
+{
+ u64 val;
+
+ if (!lmce_supported())
+ return;
+
+ rdmsrq(MSR_IA32_MCG_EXT_CTL, val);
+ val &= ~MCG_EXT_CTL_LMCE_EN;
+ wrmsrq(MSR_IA32_MCG_EXT_CTL, val);
+}
+
+/*
+ * Enable additional error logs from the integrated
+ * memory controller on processors that support this.
+ */
+static void intel_imc_init(struct cpuinfo_x86 *c)
+{
+ u64 error_control;
+
+ switch (c->x86_vfm) {
+ case INTEL_SANDYBRIDGE_X:
+ case INTEL_IVYBRIDGE_X:
+ case INTEL_HASWELL_X:
+ if (rdmsrq_safe(MSR_ERROR_CONTROL, &error_control))
+ return;
+ error_control |= 2;
+ wrmsrq_safe(MSR_ERROR_CONTROL, error_control);
+ break;
+ }
+}
+
+static void intel_apply_cpu_quirks(struct cpuinfo_x86 *c)
+{
+ /*
+ * SDM documents that on family 6 bank 0 should not be written
+ * because it aliases to another special BIOS controlled
+ * register.
+ * But it's not aliased anymore on model 0x1a+
+ * Don't ignore bank 0 completely because there could be a
+ * valid event later, merely don't write CTL0.
+ *
+ * Older CPUs (prior to family 6) can't reach this point and already
+ * return early due to the check of __mcheck_cpu_ancient_init().
+ */
+ if (c->x86_vfm < INTEL_NEHALEM_EP && this_cpu_read(mce_num_banks))
+ this_cpu_ptr(mce_banks_array)[0].init = false;
+}
+
+void mce_intel_feature_init(struct cpuinfo_x86 *c)
+{
+ intel_apply_cpu_quirks(c);
+ intel_init_cmci();
+ intel_init_lmce();
+ intel_imc_init(c);
+}
+
+void mce_intel_feature_clear(struct cpuinfo_x86 *c)
+{
+ intel_clear_lmce();
+ cmci_clear();
+}
+
+bool intel_filter_mce(struct mce *m)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ /* MCE errata HSD131, HSM142, HSW131, BDM48, HSM142 and SKX37 */
+ if ((c->x86_vfm == INTEL_HASWELL ||
+ c->x86_vfm == INTEL_HASWELL_L ||
+ c->x86_vfm == INTEL_BROADWELL ||
+ c->x86_vfm == INTEL_HASWELL_G ||
+ c->x86_vfm == INTEL_SKYLAKE_X) &&
+ (m->bank == 0) &&
+ ((m->status & 0xa0000000ffffffff) == 0x80000000000f0005))
+ return true;
+
+ return false;
+}
+
+/*
+ * Check if the address reported by the CPU is in a format we can parse.
+ * It would be possible to add code for most other cases, but all would
+ * be somewhat complicated (e.g. segment offset would require an instruction
+ * parser). So only support physical addresses up to page granularity for now.
+ */
+bool intel_mce_usable_address(struct mce *m)
+{
+ if (!(m->status & MCI_STATUS_MISCV))
+ return false;
+
+ if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
+ return false;
+
+ if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
+ return false;
+
+ return true;
+}
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
new file mode 100644
index 000000000000..a31cf984619c
--- /dev/null
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -0,0 +1,352 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __X86_MCE_INTERNAL_H__
+#define __X86_MCE_INTERNAL_H__
+
+#undef pr_fmt
+#define pr_fmt(fmt) "mce: " fmt
+
+#include <linux/device.h>
+#include <asm/mce.h>
+
+enum severity_level {
+ MCE_NO_SEVERITY,
+ MCE_DEFERRED_SEVERITY,
+ MCE_UCNA_SEVERITY = MCE_DEFERRED_SEVERITY,
+ MCE_KEEP_SEVERITY,
+ MCE_SOME_SEVERITY,
+ MCE_AO_SEVERITY,
+ MCE_UC_SEVERITY,
+ MCE_AR_SEVERITY,
+ MCE_PANIC_SEVERITY,
+};
+
+extern struct blocking_notifier_head x86_mce_decoder_chain;
+
+#define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */
+
+struct mce_evt_llist {
+ struct llist_node llnode;
+ struct mce_hw_err err;
+};
+
+void mce_gen_pool_process(struct work_struct *__unused);
+bool mce_gen_pool_empty(void);
+bool mce_gen_pool_add(struct mce_hw_err *err);
+bool mce_gen_pool_init(void);
+struct llist_node *mce_gen_pool_prepare_records(void);
+
+int mce_severity(struct mce *a, struct pt_regs *regs, char **msg, bool is_excp);
+struct dentry *mce_get_debugfs_dir(void);
+
+extern mce_banks_t mce_banks_ce_disabled;
+
+#ifdef CONFIG_X86_MCE_INTEL
+void mce_intel_handle_storm(int bank, bool on);
+void cmci_disable_bank(int bank);
+void intel_init_cmci(void);
+void intel_init_lmce(void);
+void intel_clear_lmce(void);
+bool intel_filter_mce(struct mce *m);
+bool intel_mce_usable_address(struct mce *m);
+#else
+static inline void mce_intel_handle_storm(int bank, bool on) { }
+static inline void cmci_disable_bank(int bank) { }
+static inline void intel_init_cmci(void) { }
+static inline void intel_init_lmce(void) { }
+static inline void intel_clear_lmce(void) { }
+static inline bool intel_filter_mce(struct mce *m) { return false; }
+static inline bool intel_mce_usable_address(struct mce *m) { return false; }
+#endif
+
+void mce_timer_kick(bool storm);
+
+#ifdef CONFIG_X86_MCE_THRESHOLD
+void cmci_storm_begin(unsigned int bank);
+void cmci_storm_end(unsigned int bank);
+void mce_track_storm(struct mce *mce);
+void mce_inherit_storm(unsigned int bank);
+bool mce_get_storm_mode(void);
+void mce_set_storm_mode(bool storm);
+u32 mce_get_apei_thr_limit(void);
+#else
+static inline void cmci_storm_begin(unsigned int bank) {}
+static inline void cmci_storm_end(unsigned int bank) {}
+static inline void mce_track_storm(struct mce *mce) {}
+static inline void mce_inherit_storm(unsigned int bank) {}
+static inline bool mce_get_storm_mode(void) { return false; }
+static inline void mce_set_storm_mode(bool storm) {}
+static inline u32 mce_get_apei_thr_limit(void) { return 0; }
+#endif
+
+/*
+ * history: Bitmask tracking errors occurrence. Each set bit
+ * represents an error seen.
+ *
+ * timestamp: Last time (in jiffies) that the bank was polled.
+ * in_storm_mode: Is this bank in storm mode?
+ * poll_only: Bank does not support CMCI, skip storm tracking.
+ */
+struct storm_bank {
+ u64 history;
+ u64 timestamp;
+ bool in_storm_mode;
+ bool poll_only;
+};
+
+#define NUM_HISTORY_BITS (sizeof(u64) * BITS_PER_BYTE)
+
+/* How many errors within the history buffer mark the start of a storm. */
+#define STORM_BEGIN_THRESHOLD 5
+
+/*
+ * How many polls of machine check bank without an error before declaring
+ * the storm is over. Since it is tracked by the bitmasks in the history
+ * field of struct storm_bank the mask is 30 bits [0 ... 29].
+ */
+#define STORM_END_POLL_THRESHOLD 29
+
+/*
+ * banks: per-cpu, per-bank details
+ * stormy_bank_count: count of MC banks in storm state
+ * poll_mode: CPU is in poll mode
+ */
+struct mca_storm_desc {
+ struct storm_bank banks[MAX_NR_BANKS];
+ u8 stormy_bank_count;
+ bool poll_mode;
+};
+
+DECLARE_PER_CPU(struct mca_storm_desc, storm_desc);
+
+#ifdef CONFIG_ACPI_APEI
+int apei_write_mce(struct mce *m);
+ssize_t apei_read_mce(struct mce *m, u64 *record_id);
+int apei_check_mce(void);
+int apei_clear_mce(u64 record_id);
+#else
+static inline int apei_write_mce(struct mce *m)
+{
+ return -EINVAL;
+}
+static inline ssize_t apei_read_mce(struct mce *m, u64 *record_id)
+{
+ return 0;
+}
+static inline int apei_check_mce(void)
+{
+ return 0;
+}
+static inline int apei_clear_mce(u64 record_id)
+{
+ return -EINVAL;
+}
+#endif
+
+/*
+ * We consider records to be equivalent if bank+status+addr+misc all match.
+ * This is only used when the system is going down because of a fatal error
+ * to avoid cluttering the console log with essentially repeated information.
+ * In normal processing all errors seen are logged.
+ */
+static inline bool mce_cmp(struct mce *m1, struct mce *m2)
+{
+ return m1->bank != m2->bank ||
+ m1->status != m2->status ||
+ m1->addr != m2->addr ||
+ m1->misc != m2->misc;
+}
+
+extern struct device_attribute dev_attr_trigger;
+
+#ifdef CONFIG_X86_MCELOG_LEGACY
+void mce_work_trigger(void);
+void mce_register_injector_chain(struct notifier_block *nb);
+void mce_unregister_injector_chain(struct notifier_block *nb);
+#else
+static inline void mce_work_trigger(void) { }
+static inline void mce_register_injector_chain(struct notifier_block *nb) { }
+static inline void mce_unregister_injector_chain(struct notifier_block *nb) { }
+#endif
+
+struct mca_config {
+ __u64 lmce_disabled : 1,
+ disabled : 1,
+ ser : 1,
+ recovery : 1,
+ bios_cmci_threshold : 1,
+ /* Proper #MC exception handler is set */
+ initialized : 1,
+ __reserved : 58;
+
+ bool dont_log_ce;
+ bool cmci_disabled;
+ bool ignore_ce;
+ bool print_all;
+
+ int monarch_timeout;
+ int panic_timeout;
+ u32 rip_msr;
+ s8 bootlog;
+};
+
+extern struct mca_config mca_cfg;
+DECLARE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks);
+
+struct mce_vendor_flags {
+ /*
+ * Indicates that overflow conditions are not fatal, when set.
+ */
+ __u64 overflow_recov : 1,
+
+ /*
+ * (AMD) SUCCOR stands for S/W UnCorrectable error COntainment and
+ * Recovery. It indicates support for data poisoning in HW and deferred
+ * error interrupts.
+ */
+ succor : 1,
+
+ /*
+ * (AMD) SMCA: This bit indicates support for Scalable MCA which expands
+ * the register space for each MCA bank and also increases number of
+ * banks. Also, to accommodate the new banks and registers, the MCA
+ * register space is moved to a new MSR range.
+ */
+ smca : 1,
+
+ /* Zen IFU quirk */
+ zen_ifu_quirk : 1,
+
+ /* AMD-style error thresholding banks present. */
+ amd_threshold : 1,
+
+ /* Pentium, family 5-style MCA */
+ p5 : 1,
+
+ /* Centaur Winchip C6-style MCA */
+ winchip : 1,
+
+ /* SandyBridge IFU quirk */
+ snb_ifu_quirk : 1,
+
+ /* Skylake, Cascade Lake, Cooper Lake REP;MOVS* quirk */
+ skx_repmov_quirk : 1,
+
+ __reserved_0 : 55;
+};
+
+extern struct mce_vendor_flags mce_flags;
+
+struct mce_bank {
+ /* subevents to enable */
+ u64 ctl;
+
+ /* initialise bank? */
+ __u64 init : 1,
+
+ /*
+ * (AMD) MCA_CONFIG[McaLsbInStatusSupported]: When set, this bit indicates
+ * the LSB field is found in MCA_STATUS and not in MCA_ADDR.
+ */
+ lsb_in_status : 1,
+
+ __reserved_1 : 62;
+};
+
+DECLARE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);
+
+enum mca_msr {
+ MCA_CTL,
+ MCA_STATUS,
+ MCA_ADDR,
+ MCA_MISC,
+};
+
+/* Decide whether to add MCE record to MCE event pool or filter it out. */
+extern bool filter_mce(struct mce *m);
+void mce_prep_record_common(struct mce *m);
+void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m);
+
+#ifdef CONFIG_X86_MCE_AMD
+void mce_threshold_create_device(unsigned int cpu);
+void mce_threshold_remove_device(unsigned int cpu);
+void mce_amd_handle_storm(unsigned int bank, bool on);
+extern bool amd_filter_mce(struct mce *m);
+bool amd_mce_usable_address(struct mce *m);
+void amd_clear_bank(struct mce *m);
+
+/*
+ * If MCA_CONFIG[McaLsbInStatusSupported] is set, extract ErrAddr in bits
+ * [56:0] of MCA_STATUS, else in bits [55:0] of MCA_ADDR.
+ */
+static __always_inline void smca_extract_err_addr(struct mce *m)
+{
+ u8 lsb;
+
+ if (!mce_flags.smca)
+ return;
+
+ if (this_cpu_ptr(mce_banks_array)[m->bank].lsb_in_status) {
+ lsb = (m->status >> 24) & 0x3f;
+
+ m->addr &= GENMASK_ULL(56, lsb);
+
+ return;
+ }
+
+ lsb = (m->addr >> 56) & 0x3f;
+
+ m->addr &= GENMASK_ULL(55, lsb);
+}
+
+void smca_bsp_init(void);
+#else
+static inline void mce_threshold_create_device(unsigned int cpu) { }
+static inline void mce_threshold_remove_device(unsigned int cpu) { }
+static inline void mce_amd_handle_storm(unsigned int bank, bool on) { }
+static inline bool amd_filter_mce(struct mce *m) { return false; }
+static inline bool amd_mce_usable_address(struct mce *m) { return false; }
+static inline void amd_clear_bank(struct mce *m) { }
+static inline void smca_extract_err_addr(struct mce *m) { }
+static inline void smca_bsp_init(void) { }
+#endif
+
+#ifdef CONFIG_X86_ANCIENT_MCE
+void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
+void winchip_mcheck_init(struct cpuinfo_x86 *c);
+noinstr void pentium_machine_check(struct pt_regs *regs);
+noinstr void winchip_machine_check(struct pt_regs *regs);
+static inline void enable_p5_mce(void) { mce_p5_enabled = 1; }
+#else
+static __always_inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {}
+static __always_inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {}
+static __always_inline void enable_p5_mce(void) {}
+static __always_inline void pentium_machine_check(struct pt_regs *regs) {}
+static __always_inline void winchip_machine_check(struct pt_regs *regs) {}
+#endif
+
+noinstr u64 mce_rdmsrq(u32 msr);
+noinstr void mce_wrmsrq(u32 msr, u64 v);
+
+static __always_inline u32 mca_msr_reg(int bank, enum mca_msr reg)
+{
+ if (cpu_feature_enabled(X86_FEATURE_SMCA)) {
+ switch (reg) {
+ case MCA_CTL: return MSR_AMD64_SMCA_MCx_CTL(bank);
+ case MCA_ADDR: return MSR_AMD64_SMCA_MCx_ADDR(bank);
+ case MCA_MISC: return MSR_AMD64_SMCA_MCx_MISC(bank);
+ case MCA_STATUS: return MSR_AMD64_SMCA_MCx_STATUS(bank);
+ }
+ }
+
+ switch (reg) {
+ case MCA_CTL: return MSR_IA32_MCx_CTL(bank);
+ case MCA_ADDR: return MSR_IA32_MCx_ADDR(bank);
+ case MCA_MISC: return MSR_IA32_MCx_MISC(bank);
+ case MCA_STATUS: return MSR_IA32_MCx_STATUS(bank);
+ }
+
+ return 0;
+}
+
+extern void (*mc_poll_banks)(void);
+#endif /* __X86_MCE_INTERNAL_H__ */
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mce/p5.c
index 1c044b1ccc59..2272ad53fc33 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mce/p5.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* P5 specific Machine Check Exception Reporting
* (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
@@ -5,35 +6,39 @@
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/types.h>
-#include <linux/init.h>
#include <linux/smp.h>
+#include <linux/hardirq.h>
#include <asm/processor.h>
+#include <asm/traps.h>
+#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/msr.h>
+#include "internal.h"
+
/* By default disabled */
int mce_p5_enabled __read_mostly;
/* Machine check handler for Pentium class Intel CPUs: */
-static void pentium_machine_check(struct pt_regs *regs, long error_code)
+noinstr void pentium_machine_check(struct pt_regs *regs)
{
u32 loaddr, hi, lotype;
+ instrumentation_begin();
rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
- printk(KERN_EMERG
- "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n",
- smp_processor_id(), loaddr, lotype);
+ pr_emerg("CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n",
+ smp_processor_id(), loaddr, lotype);
if (lotype & (1<<5)) {
- printk(KERN_EMERG
- "CPU#%d: Possible thermal failure (CPU on fire ?).\n",
- smp_processor_id());
+ pr_emerg("CPU#%d: Possible thermal failure (CPU on fire ?).\n",
+ smp_processor_id());
}
add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
+ instrumentation_end();
}
/* Set up machine check reporting for processors with Intel style MCE: */
@@ -49,19 +54,13 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
if (!cpu_has(c, X86_FEATURE_MCE))
return;
- machine_check_vector = pentium_machine_check;
- /* Make sure the vector pointer is visible before we enable MCEs: */
- wmb();
-
/* Read registers before enabling: */
rdmsr(MSR_IA32_P5_MC_ADDR, l, h);
rdmsr(MSR_IA32_P5_MC_TYPE, l, h);
- printk(KERN_INFO
- "Intel old style machine check architecture supported.\n");
+ pr_info("Intel old style machine check architecture supported.\n");
/* Enable MCE: */
- set_in_cr4(X86_CR4_MCE);
- printk(KERN_INFO
- "Intel old style machine check reporting enabled on CPU#%d.\n",
- smp_processor_id());
+ cr4_set_bits(X86_CR4_MCE);
+ pr_info("Intel old style machine check reporting enabled on CPU#%d.\n",
+ smp_processor_id());
}
diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c
new file mode 100644
index 000000000000..2235a7477436
--- /dev/null
+++ b/arch/x86/kernel/cpu/mce/severity.c
@@ -0,0 +1,489 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * MCE grading rules.
+ * Copyright 2008, 2009 Intel Corporation.
+ *
+ * Author: Andi Kleen
+ */
+#include <linux/kernel.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+
+#include <asm/mce.h>
+#include <asm/cpu_device_id.h>
+#include <asm/traps.h>
+#include <asm/insn.h>
+#include <asm/insn-eval.h>
+
+#include "internal.h"
+
+/*
+ * Grade an mce by severity. In general the most severe ones are processed
+ * first. Since there are quite a lot of combinations test the bits in a
+ * table-driven way. The rules are simply processed in order, first
+ * match wins.
+ *
+ * Note this is only used for machine check exceptions, the corrected
+ * errors use much simpler rules. The exceptions still check for the corrected
+ * errors, but only to leave them alone for the CMCI handler (except for
+ * panic situations)
+ */
+
+enum context { IN_KERNEL = 1, IN_USER = 2, IN_KERNEL_RECOV = 3 };
+enum ser { SER_REQUIRED = 1, NO_SER = 2 };
+enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 };
+
+static struct severity {
+ u64 mask;
+ u64 result;
+ unsigned char sev;
+ unsigned short mcgmask;
+ unsigned short mcgres;
+ unsigned char ser;
+ unsigned char context;
+ unsigned char excp;
+ unsigned char covered;
+ unsigned int cpu_vfm;
+ unsigned char cpu_minstepping;
+ unsigned char bank_lo, bank_hi;
+ char *msg;
+} severities[] = {
+#define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c }
+#define BANK_RANGE(l, h) .bank_lo = l, .bank_hi = h
+#define VFM_STEPPING(m, s) .cpu_vfm = m, .cpu_minstepping = s
+#define KERNEL .context = IN_KERNEL
+#define USER .context = IN_USER
+#define KERNEL_RECOV .context = IN_KERNEL_RECOV
+#define SER .ser = SER_REQUIRED
+#define NOSER .ser = NO_SER
+#define EXCP .excp = EXCP_CONTEXT
+#define NOEXCP .excp = NO_EXCP
+#define BITCLR(x) .mask = x, .result = 0
+#define BITSET(x) .mask = x, .result = x
+#define MCGMASK(x, y) .mcgmask = x, .mcgres = y
+#define MASK(x, y) .mask = x, .result = y
+#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
+#define MCI_UC_AR (MCI_STATUS_UC|MCI_STATUS_AR)
+#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
+#define MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV)
+
+ MCESEV(
+ NO, "Invalid",
+ BITCLR(MCI_STATUS_VAL)
+ ),
+ MCESEV(
+ NO, "Not enabled",
+ EXCP, BITCLR(MCI_STATUS_EN)
+ ),
+ MCESEV(
+ PANIC, "Processor context corrupt",
+ BITSET(MCI_STATUS_PCC)
+ ),
+ /* When MCIP is not set something is very confused */
+ MCESEV(
+ PANIC, "MCIP not set in MCA handler",
+ EXCP, MCGMASK(MCG_STATUS_MCIP, 0)
+ ),
+ /* Neither return not error IP -- no chance to recover -> PANIC */
+ MCESEV(
+ PANIC, "Neither restart nor error IP",
+ EXCP, MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0)
+ ),
+ MCESEV(
+ PANIC, "In kernel and no restart IP",
+ EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
+ ),
+ MCESEV(
+ PANIC, "In kernel and no restart IP",
+ EXCP, KERNEL_RECOV, MCGMASK(MCG_STATUS_RIPV, 0)
+ ),
+ MCESEV(
+ KEEP, "Corrected error",
+ NOSER, BITCLR(MCI_STATUS_UC)
+ ),
+ /*
+ * known AO MCACODs reported via MCE or CMC:
+ *
+ * SRAO could be signaled either via a machine check exception or
+ * CMCI with the corresponding bit S 1 or 0. So we don't need to
+ * check bit S for SRAO.
+ */
+ MCESEV(
+ AO, "Action optional: memory scrubbing error",
+ SER, MASK(MCI_UC_AR|MCACOD_SCRUBMSK, MCI_STATUS_UC|MCACOD_SCRUB)
+ ),
+ MCESEV(
+ AO, "Action optional: last level cache writeback error",
+ SER, MASK(MCI_UC_AR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB)
+ ),
+ /*
+ * Quirk for Skylake/Cascade Lake. Patrol scrubber may be configured
+ * to report uncorrected errors using CMCI with a special signature.
+ * UC=0, MSCOD=0x0010, MCACOD=binary(000X 0000 1100 XXXX) reported
+ * in one of the memory controller banks.
+ * Set severity to "AO" for same action as normal patrol scrub error.
+ */
+ MCESEV(
+ AO, "Uncorrected Patrol Scrub Error",
+ SER, MASK(MCI_STATUS_UC|MCI_ADDR|0xffffeff0, MCI_ADDR|0x001000c0),
+ VFM_STEPPING(INTEL_SKYLAKE_X, 4), BANK_RANGE(13, 18)
+ ),
+
+ /* ignore OVER for UCNA */
+ MCESEV(
+ UCNA, "Uncorrected no action required",
+ SER, MASK(MCI_UC_SAR, MCI_STATUS_UC)
+ ),
+ MCESEV(
+ PANIC, "Illegal combination (UCNA with AR=1)",
+ SER,
+ MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR)
+ ),
+ MCESEV(
+ KEEP, "Non signaled machine check",
+ SER, BITCLR(MCI_STATUS_S)
+ ),
+
+ MCESEV(
+ PANIC, "Action required with lost events",
+ SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR)
+ ),
+
+ /* known AR MCACODs: */
+#ifdef CONFIG_MEMORY_FAILURE
+ MCESEV(
+ KEEP, "Action required but unaffected thread is continuable",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR, MCI_UC_SAR|MCI_ADDR),
+ MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, MCG_STATUS_RIPV)
+ ),
+ MCESEV(
+ AR, "Action required: data load in error recoverable area of kernel",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+ KERNEL_RECOV
+ ),
+ MCESEV(
+ AR, "Action required: data load error in a user process",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+ USER
+ ),
+ MCESEV(
+ AR, "Action required: instruction fetch error in a user process",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
+ USER
+ ),
+ MCESEV(
+ AR, "Data load error in SEAM non-root mode",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+ MCGMASK(MCG_STATUS_SEAM_NR, MCG_STATUS_SEAM_NR),
+ KERNEL
+ ),
+ MCESEV(
+ AR, "Instruction fetch error in SEAM non-root mode",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
+ MCGMASK(MCG_STATUS_SEAM_NR, MCG_STATUS_SEAM_NR),
+ KERNEL
+ ),
+ MCESEV(
+ PANIC, "Data load in unrecoverable area of kernel",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+ KERNEL
+ ),
+ MCESEV(
+ PANIC, "Instruction fetch error in kernel",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
+ KERNEL
+ ),
+#endif
+ MCESEV(
+ PANIC, "Action required: unknown MCACOD",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)
+ ),
+
+ MCESEV(
+ SOME, "Action optional: unknown MCACOD",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S)
+ ),
+ MCESEV(
+ SOME, "Action optional with lost events",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_S)
+ ),
+
+ MCESEV(
+ PANIC, "Overflowed uncorrected",
+ BITSET(MCI_STATUS_OVER|MCI_STATUS_UC)
+ ),
+ MCESEV(
+ PANIC, "Uncorrected in kernel",
+ BITSET(MCI_STATUS_UC),
+ KERNEL
+ ),
+ MCESEV(
+ UC, "Uncorrected",
+ BITSET(MCI_STATUS_UC)
+ ),
+ MCESEV(
+ SOME, "No match",
+ BITSET(0)
+ ) /* always matches. keep at end */
+};
+
+#define mc_recoverable(mcg) (((mcg) & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) == \
+ (MCG_STATUS_RIPV|MCG_STATUS_EIPV))
+
+static bool is_copy_from_user(struct pt_regs *regs)
+{
+ u8 insn_buf[MAX_INSN_SIZE];
+ unsigned long addr;
+ struct insn insn;
+ int ret;
+
+ if (!regs)
+ return false;
+
+ if (copy_from_kernel_nofault(insn_buf, (void *)regs->ip, MAX_INSN_SIZE))
+ return false;
+
+ ret = insn_decode_kernel(&insn, insn_buf);
+ if (ret < 0)
+ return false;
+
+ switch (insn.opcode.value) {
+ /* MOV mem,reg */
+ case 0x8A: case 0x8B:
+ /* MOVZ mem,reg */
+ case 0xB60F: case 0xB70F:
+ addr = (unsigned long)insn_get_addr_ref(&insn, regs);
+ break;
+ /* REP MOVS */
+ case 0xA4: case 0xA5:
+ addr = regs->si;
+ break;
+ default:
+ return false;
+ }
+
+ if (fault_in_kernel_space(addr))
+ return false;
+
+ current->mce_vaddr = (void __user *)addr;
+
+ return true;
+}
+
+/*
+ * If mcgstatus indicated that ip/cs on the stack were
+ * no good, then "m->cs" will be zero and we will have
+ * to assume the worst case (IN_KERNEL) as we actually
+ * have no idea what we were executing when the machine
+ * check hit.
+ * If we do have a good "m->cs" (or a faked one in the
+ * case we were executing in VM86 mode) we can use it to
+ * distinguish an exception taken in user from from one
+ * taken in the kernel.
+ */
+static noinstr int error_context(struct mce *m, struct pt_regs *regs)
+{
+ int fixup_type;
+ bool copy_user;
+
+ if ((m->cs & 3) == 3)
+ return IN_USER;
+
+ if (!mc_recoverable(m->mcgstatus))
+ return IN_KERNEL;
+
+ /* Allow instrumentation around external facilities usage. */
+ instrumentation_begin();
+ fixup_type = ex_get_fixup_type(m->ip);
+ copy_user = is_copy_from_user(regs);
+ instrumentation_end();
+
+ if (copy_user) {
+ m->kflags |= MCE_IN_KERNEL_COPYIN | MCE_IN_KERNEL_RECOV;
+ return IN_KERNEL_RECOV;
+ }
+
+ switch (fixup_type) {
+ case EX_TYPE_FAULT_MCE_SAFE:
+ case EX_TYPE_DEFAULT_MCE_SAFE:
+ m->kflags |= MCE_IN_KERNEL_RECOV;
+ return IN_KERNEL_RECOV;
+
+ default:
+ return IN_KERNEL;
+ }
+}
+
+/* See AMD PPR(s) section Machine Check Error Handling. */
+static noinstr int mce_severity_amd(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp)
+{
+ char *panic_msg = NULL;
+ int ret;
+
+ /*
+ * Default return value: Action required, the error must be handled
+ * immediately.
+ */
+ ret = MCE_AR_SEVERITY;
+
+ /* Processor Context Corrupt, no need to fumble too much, die! */
+ if (m->status & MCI_STATUS_PCC) {
+ panic_msg = "Processor Context Corrupt";
+ ret = MCE_PANIC_SEVERITY;
+ goto out;
+ }
+
+ if (m->status & MCI_STATUS_DEFERRED) {
+ ret = MCE_DEFERRED_SEVERITY;
+ goto out;
+ }
+
+ /*
+ * If the UC bit is not set, the system either corrected or deferred
+ * the error. No action will be required after logging the error.
+ */
+ if (!(m->status & MCI_STATUS_UC)) {
+ ret = MCE_KEEP_SEVERITY;
+ goto out;
+ }
+
+ /*
+ * On MCA overflow, without the MCA overflow recovery feature the
+ * system will not be able to recover, panic.
+ */
+ if ((m->status & MCI_STATUS_OVER) && !mce_flags.overflow_recov) {
+ panic_msg = "Overflowed uncorrected error without MCA Overflow Recovery";
+ ret = MCE_PANIC_SEVERITY;
+ goto out;
+ }
+
+ if (!mce_flags.succor) {
+ panic_msg = "Uncorrected error without MCA Recovery";
+ ret = MCE_PANIC_SEVERITY;
+ goto out;
+ }
+
+ if (error_context(m, regs) == IN_KERNEL) {
+ panic_msg = "Uncorrected unrecoverable error in kernel context";
+ ret = MCE_PANIC_SEVERITY;
+ }
+
+out:
+ if (msg && panic_msg)
+ *msg = panic_msg;
+
+ return ret;
+}
+
+static noinstr int mce_severity_intel(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp)
+{
+ enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP);
+ enum context ctx = error_context(m, regs);
+ struct severity *s;
+
+ for (s = severities;; s++) {
+ if ((m->status & s->mask) != s->result)
+ continue;
+ if ((m->mcgstatus & s->mcgmask) != s->mcgres)
+ continue;
+ if (s->ser == SER_REQUIRED && !mca_cfg.ser)
+ continue;
+ if (s->ser == NO_SER && mca_cfg.ser)
+ continue;
+ if (s->context && ctx != s->context)
+ continue;
+ if (s->excp && excp != s->excp)
+ continue;
+ if (s->cpu_vfm && boot_cpu_data.x86_vfm != s->cpu_vfm)
+ continue;
+ if (s->cpu_minstepping && boot_cpu_data.x86_stepping < s->cpu_minstepping)
+ continue;
+ if (s->bank_lo && (m->bank < s->bank_lo || m->bank > s->bank_hi))
+ continue;
+ if (msg)
+ *msg = s->msg;
+ s->covered = 1;
+
+ return s->sev;
+ }
+}
+
+int noinstr mce_severity(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp)
+{
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
+ boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
+ return mce_severity_amd(m, regs, msg, is_excp);
+ else
+ return mce_severity_intel(m, regs, msg, is_excp);
+}
+
+#ifdef CONFIG_DEBUG_FS
+static void *s_start(struct seq_file *f, loff_t *pos)
+{
+ if (*pos >= ARRAY_SIZE(severities))
+ return NULL;
+ return &severities[*pos];
+}
+
+static void *s_next(struct seq_file *f, void *data, loff_t *pos)
+{
+ if (++(*pos) >= ARRAY_SIZE(severities))
+ return NULL;
+ return &severities[*pos];
+}
+
+static void s_stop(struct seq_file *f, void *data)
+{
+}
+
+static int s_show(struct seq_file *f, void *data)
+{
+ struct severity *ser = data;
+ seq_printf(f, "%d\t%s\n", ser->covered, ser->msg);
+ return 0;
+}
+
+static const struct seq_operations severities_seq_ops = {
+ .start = s_start,
+ .next = s_next,
+ .stop = s_stop,
+ .show = s_show,
+};
+
+static int severities_coverage_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &severities_seq_ops);
+}
+
+static ssize_t severities_coverage_write(struct file *file,
+ const char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ int i;
+ for (i = 0; i < ARRAY_SIZE(severities); i++)
+ severities[i].covered = 0;
+ return count;
+}
+
+static const struct file_operations severities_coverage_fops = {
+ .open = severities_coverage_open,
+ .release = seq_release,
+ .read = seq_read,
+ .write = severities_coverage_write,
+ .llseek = seq_lseek,
+};
+
+static int __init severities_debugfs_init(void)
+{
+ struct dentry *dmce;
+
+ dmce = mce_get_debugfs_dir();
+
+ debugfs_create_file("severities-coverage", 0444, dmce, NULL,
+ &severities_coverage_fops);
+ return 0;
+}
+late_initcall(severities_debugfs_init);
+#endif /* CONFIG_DEBUG_FS */
diff --git a/arch/x86/kernel/cpu/mce/threshold.c b/arch/x86/kernel/cpu/mce/threshold.c
new file mode 100644
index 000000000000..0d13c9ffcba0
--- /dev/null
+++ b/arch/x86/kernel/cpu/mce/threshold.c
@@ -0,0 +1,163 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Common corrected MCE threshold handler code:
+ */
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+
+#include <asm/irq_vectors.h>
+#include <asm/traps.h>
+#include <asm/apic.h>
+#include <asm/mce.h>
+#include <asm/trace/irq_vectors.h>
+
+#include "internal.h"
+
+static u32 mce_apei_thr_limit;
+
+void mce_save_apei_thr_limit(u32 thr_limit)
+{
+ mce_apei_thr_limit = thr_limit;
+ pr_info("HEST corrected error threshold limit: %u\n", thr_limit);
+}
+
+u32 mce_get_apei_thr_limit(void)
+{
+ return mce_apei_thr_limit;
+}
+
+static void default_threshold_interrupt(void)
+{
+ pr_err("Unexpected threshold interrupt at vector %x\n",
+ THRESHOLD_APIC_VECTOR);
+}
+
+void (*mce_threshold_vector)(void) = default_threshold_interrupt;
+
+DEFINE_IDTENTRY_SYSVEC(sysvec_threshold)
+{
+ trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR);
+ inc_irq_stat(irq_threshold_count);
+ mce_threshold_vector();
+ trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR);
+ apic_eoi();
+}
+
+DEFINE_PER_CPU(struct mca_storm_desc, storm_desc);
+
+void mce_inherit_storm(unsigned int bank)
+{
+ struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
+
+ /*
+ * Previous CPU owning this bank had put it into storm mode,
+ * but the precise history of that storm is unknown. Assume
+ * the worst (all recent polls of the bank found a valid error
+ * logged). This will avoid the new owner prematurely declaring
+ * the storm has ended.
+ */
+ storm->banks[bank].history = ~0ull;
+ storm->banks[bank].timestamp = jiffies;
+}
+
+bool mce_get_storm_mode(void)
+{
+ return __this_cpu_read(storm_desc.poll_mode);
+}
+
+void mce_set_storm_mode(bool storm)
+{
+ __this_cpu_write(storm_desc.poll_mode, storm);
+}
+
+static void mce_handle_storm(unsigned int bank, bool on)
+{
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_INTEL:
+ mce_intel_handle_storm(bank, on);
+ break;
+ case X86_VENDOR_AMD:
+ mce_amd_handle_storm(bank, on);
+ break;
+ }
+}
+
+void cmci_storm_begin(unsigned int bank)
+{
+ struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
+
+ __set_bit(bank, this_cpu_ptr(mce_poll_banks));
+ storm->banks[bank].in_storm_mode = true;
+
+ /*
+ * If this is the first bank on this CPU to enter storm mode
+ * start polling.
+ */
+ if (++storm->stormy_bank_count == 1)
+ mce_timer_kick(true);
+}
+
+void cmci_storm_end(unsigned int bank)
+{
+ struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
+
+ if (!mce_flags.amd_threshold)
+ __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
+ storm->banks[bank].history = 0;
+ storm->banks[bank].in_storm_mode = false;
+
+ /* If no banks left in storm mode, stop polling. */
+ if (!--storm->stormy_bank_count)
+ mce_timer_kick(false);
+}
+
+void mce_track_storm(struct mce *mce)
+{
+ struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
+ unsigned long now = jiffies, delta;
+ unsigned int shift = 1;
+ u64 history = 0;
+
+ /* No tracking needed for banks that do not support CMCI */
+ if (storm->banks[mce->bank].poll_only)
+ return;
+
+ /*
+ * When a bank is in storm mode it is polled once per second and
+ * the history mask will record about the last minute of poll results.
+ * If it is not in storm mode, then the bank is only checked when
+ * there is a CMCI interrupt. Check how long it has been since
+ * this bank was last checked, and adjust the amount of "shift"
+ * to apply to history.
+ */
+ if (!storm->banks[mce->bank].in_storm_mode) {
+ delta = now - storm->banks[mce->bank].timestamp;
+ shift = (delta + HZ) / HZ;
+ }
+
+ /* If it has been a long time since the last poll, clear history. */
+ if (shift < NUM_HISTORY_BITS)
+ history = storm->banks[mce->bank].history << shift;
+
+ storm->banks[mce->bank].timestamp = now;
+
+ /* History keeps track of corrected errors. VAL=1 && UC=0 */
+ if ((mce->status & MCI_STATUS_VAL) && mce_is_correctable(mce))
+ history |= 1;
+
+ storm->banks[mce->bank].history = history;
+
+ if (storm->banks[mce->bank].in_storm_mode) {
+ if (history & GENMASK_ULL(STORM_END_POLL_THRESHOLD, 0))
+ return;
+ printk_deferred(KERN_NOTICE "CPU%d BANK%d CMCI storm subsided\n", smp_processor_id(), mce->bank);
+ mce_handle_storm(mce->bank, false);
+ cmci_storm_end(mce->bank);
+ } else {
+ if (hweight64(history) < STORM_BEGIN_THRESHOLD)
+ return;
+ printk_deferred(KERN_NOTICE "CPU%d BANK%d CMCI storm detected\n", smp_processor_id(), mce->bank);
+ mce_handle_storm(mce->bank, true);
+ cmci_storm_begin(mce->bank);
+ }
+}
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mce/winchip.c
index e9a701aecaa1..6c99f2941909 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mce/winchip.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* IDT Winchip specific Machine Check Exception Reporting
* (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
@@ -5,17 +6,23 @@
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/types.h>
-#include <linux/init.h>
+#include <linux/hardirq.h>
#include <asm/processor.h>
+#include <asm/traps.h>
+#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/msr.h>
+#include "internal.h"
+
/* Machine check handler for WinChip C6: */
-static void winchip_machine_check(struct pt_regs *regs, long error_code)
+noinstr void winchip_machine_check(struct pt_regs *regs)
{
- printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
+ instrumentation_begin();
+ pr_emerg("CPU0: Machine Check Exception.\n");
add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
+ instrumentation_end();
}
/* Set up machine check reporting on the Winchip C6 series */
@@ -23,17 +30,12 @@ void winchip_mcheck_init(struct cpuinfo_x86 *c)
{
u32 lo, hi;
- machine_check_vector = winchip_machine_check;
- /* Make sure the vector pointer is visible before we enable MCEs: */
- wmb();
-
rdmsr(MSR_IDT_FCR1, lo, hi);
lo |= (1<<2); /* Enable EIERRINT (int 18 MCE) */
lo &= ~(1<<4); /* Enable MCE */
wrmsr(MSR_IDT_FCR1, lo, hi);
- set_in_cr4(X86_CR4_MCE);
+ cr4_set_bits(X86_CR4_MCE);
- printk(KERN_INFO
- "Winchip machine check reporting enabled on CPU#0.\n");
+ pr_info("Winchip machine check reporting enabled on CPU#0.\n");
}
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
deleted file mode 100644
index bb34b03af252..000000000000
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ /dev/null
@@ -1,11 +0,0 @@
-obj-y = mce.o mce-severity.o
-
-obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o
-obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o
-obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o
-obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
-obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
-
-obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o
-
-obj-$(CONFIG_ACPI_APEI) += mce-apei.o
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
deleted file mode 100644
index cd8b166a1735..000000000000
--- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
- * Bridge between MCE and APEI
- *
- * On some machine, corrected memory errors are reported via APEI
- * generic hardware error source (GHES) instead of corrected Machine
- * Check. These corrected memory errors can be reported to user space
- * through /dev/mcelog via faking a corrected Machine Check, so that
- * the error memory page can be offlined by /sbin/mcelog if the error
- * count for one page is beyond the threshold.
- *
- * For fatal MCE, save MCE record into persistent storage via ERST, so
- * that the MCE record can be logged after reboot via ERST.
- *
- * Copyright 2010 Intel Corp.
- * Author: Huang Ying <ying.huang@intel.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version
- * 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#include <linux/export.h>
-#include <linux/kernel.h>
-#include <linux/acpi.h>
-#include <linux/cper.h>
-#include <acpi/apei.h>
-#include <asm/mce.h>
-
-#include "mce-internal.h"
-
-void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err)
-{
- struct mce m;
-
- /* Only corrected MC is reported */
- if (!corrected || !(mem_err->validation_bits &
- CPER_MEM_VALID_PHYSICAL_ADDRESS))
- return;
-
- mce_setup(&m);
- m.bank = 1;
- /* Fake a memory read corrected error with unknown channel */
- m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f;
- m.addr = mem_err->physical_addr;
- mce_log(&m);
- mce_notify_irq();
-}
-EXPORT_SYMBOL_GPL(apei_mce_report_mem_error);
-
-#define CPER_CREATOR_MCE \
- UUID_LE(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c, \
- 0x64, 0x90, 0xb8, 0x9d)
-#define CPER_SECTION_TYPE_MCE \
- UUID_LE(0xfe08ffbe, 0x95e4, 0x4be7, 0xbc, 0x73, 0x40, 0x96, \
- 0x04, 0x4a, 0x38, 0xfc)
-
-/*
- * CPER specification (in UEFI specification 2.3 appendix N) requires
- * byte-packed.
- */
-struct cper_mce_record {
- struct cper_record_header hdr;
- struct cper_section_descriptor sec_hdr;
- struct mce mce;
-} __packed;
-
-int apei_write_mce(struct mce *m)
-{
- struct cper_mce_record rcd;
-
- memset(&rcd, 0, sizeof(rcd));
- memcpy(rcd.hdr.signature, CPER_SIG_RECORD, CPER_SIG_SIZE);
- rcd.hdr.revision = CPER_RECORD_REV;
- rcd.hdr.signature_end = CPER_SIG_END;
- rcd.hdr.section_count = 1;
- rcd.hdr.error_severity = CPER_SEV_FATAL;
- /* timestamp, platform_id, partition_id are all invalid */
- rcd.hdr.validation_bits = 0;
- rcd.hdr.record_length = sizeof(rcd);
- rcd.hdr.creator_id = CPER_CREATOR_MCE;
- rcd.hdr.notification_type = CPER_NOTIFY_MCE;
- rcd.hdr.record_id = cper_next_record_id();
- rcd.hdr.flags = CPER_HW_ERROR_FLAGS_PREVERR;
-
- rcd.sec_hdr.section_offset = (void *)&rcd.mce - (void *)&rcd;
- rcd.sec_hdr.section_length = sizeof(rcd.mce);
- rcd.sec_hdr.revision = CPER_SEC_REV;
- /* fru_id and fru_text is invalid */
- rcd.sec_hdr.validation_bits = 0;
- rcd.sec_hdr.flags = CPER_SEC_PRIMARY;
- rcd.sec_hdr.section_type = CPER_SECTION_TYPE_MCE;
- rcd.sec_hdr.section_severity = CPER_SEV_FATAL;
-
- memcpy(&rcd.mce, m, sizeof(*m));
-
- return erst_write(&rcd.hdr);
-}
-
-ssize_t apei_read_mce(struct mce *m, u64 *record_id)
-{
- struct cper_mce_record rcd;
- int rc, pos;
-
- rc = erst_get_record_id_begin(&pos);
- if (rc)
- return rc;
-retry:
- rc = erst_get_record_id_next(&pos, record_id);
- if (rc)
- goto out;
- /* no more record */
- if (*record_id == APEI_ERST_INVALID_RECORD_ID)
- goto out;
- rc = erst_read(*record_id, &rcd.hdr, sizeof(rcd));
- /* someone else has cleared the record, try next one */
- if (rc == -ENOENT)
- goto retry;
- else if (rc < 0)
- goto out;
- /* try to skip other type records in storage */
- else if (rc != sizeof(rcd) ||
- uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE))
- goto retry;
- memcpy(m, &rcd.mce, sizeof(*m));
- rc = sizeof(*m);
-out:
- erst_get_record_id_end();
-
- return rc;
-}
-
-/* Check whether there is record in ERST */
-int apei_check_mce(void)
-{
- return erst_get_record_count();
-}
-
-int apei_clear_mce(u64 record_id)
-{
- return erst_clear(record_id);
-}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
deleted file mode 100644
index 5ac2d1fb28bc..000000000000
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ /dev/null
@@ -1,256 +0,0 @@
-/*
- * Machine check injection support.
- * Copyright 2008 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
- *
- * Authors:
- * Andi Kleen
- * Ying Huang
- */
-#include <linux/uaccess.h>
-#include <linux/module.h>
-#include <linux/timer.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/fs.h>
-#include <linux/preempt.h>
-#include <linux/smp.h>
-#include <linux/notifier.h>
-#include <linux/kdebug.h>
-#include <linux/cpu.h>
-#include <linux/sched.h>
-#include <linux/gfp.h>
-#include <asm/mce.h>
-#include <asm/apic.h>
-#include <asm/nmi.h>
-
-/* Update fake mce registers on current CPU. */
-static void inject_mce(struct mce *m)
-{
- struct mce *i = &per_cpu(injectm, m->extcpu);
-
- /* Make sure no one reads partially written injectm */
- i->finished = 0;
- mb();
- m->finished = 0;
- /* First set the fields after finished */
- i->extcpu = m->extcpu;
- mb();
- /* Now write record in order, finished last (except above) */
- memcpy(i, m, sizeof(struct mce));
- /* Finally activate it */
- mb();
- i->finished = 1;
-}
-
-static void raise_poll(struct mce *m)
-{
- unsigned long flags;
- mce_banks_t b;
-
- memset(&b, 0xff, sizeof(mce_banks_t));
- local_irq_save(flags);
- machine_check_poll(0, &b);
- local_irq_restore(flags);
- m->finished = 0;
-}
-
-static void raise_exception(struct mce *m, struct pt_regs *pregs)
-{
- struct pt_regs regs;
- unsigned long flags;
-
- if (!pregs) {
- memset(&regs, 0, sizeof(struct pt_regs));
- regs.ip = m->ip;
- regs.cs = m->cs;
- pregs = &regs;
- }
- /* in mcheck exeception handler, irq will be disabled */
- local_irq_save(flags);
- do_machine_check(pregs, 0);
- local_irq_restore(flags);
- m->finished = 0;
-}
-
-static cpumask_var_t mce_inject_cpumask;
-static DEFINE_MUTEX(mce_inject_mutex);
-
-static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs)
-{
- int cpu = smp_processor_id();
- struct mce *m = &__get_cpu_var(injectm);
- if (!cpumask_test_cpu(cpu, mce_inject_cpumask))
- return NMI_DONE;
- cpumask_clear_cpu(cpu, mce_inject_cpumask);
- if (m->inject_flags & MCJ_EXCEPTION)
- raise_exception(m, regs);
- else if (m->status)
- raise_poll(m);
- return NMI_HANDLED;
-}
-
-static void mce_irq_ipi(void *info)
-{
- int cpu = smp_processor_id();
- struct mce *m = &__get_cpu_var(injectm);
-
- if (cpumask_test_cpu(cpu, mce_inject_cpumask) &&
- m->inject_flags & MCJ_EXCEPTION) {
- cpumask_clear_cpu(cpu, mce_inject_cpumask);
- raise_exception(m, NULL);
- }
-}
-
-/* Inject mce on current CPU */
-static int raise_local(void)
-{
- struct mce *m = &__get_cpu_var(injectm);
- int context = MCJ_CTX(m->inject_flags);
- int ret = 0;
- int cpu = m->extcpu;
-
- if (m->inject_flags & MCJ_EXCEPTION) {
- printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu);
- switch (context) {
- case MCJ_CTX_IRQ:
- /*
- * Could do more to fake interrupts like
- * calling irq_enter, but the necessary
- * machinery isn't exported currently.
- */
- /*FALL THROUGH*/
- case MCJ_CTX_PROCESS:
- raise_exception(m, NULL);
- break;
- default:
- printk(KERN_INFO "Invalid MCE context\n");
- ret = -EINVAL;
- }
- printk(KERN_INFO "MCE exception done on CPU %d\n", cpu);
- } else if (m->status) {
- printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu);
- raise_poll(m);
- mce_notify_irq();
- printk(KERN_INFO "Machine check poll done on CPU %d\n", cpu);
- } else
- m->finished = 0;
-
- return ret;
-}
-
-static void raise_mce(struct mce *m)
-{
- int context = MCJ_CTX(m->inject_flags);
-
- inject_mce(m);
-
- if (context == MCJ_CTX_RANDOM)
- return;
-
-#ifdef CONFIG_X86_LOCAL_APIC
- if (m->inject_flags & (MCJ_IRQ_BROADCAST | MCJ_NMI_BROADCAST)) {
- unsigned long start;
- int cpu;
-
- get_online_cpus();
- cpumask_copy(mce_inject_cpumask, cpu_online_mask);
- cpumask_clear_cpu(get_cpu(), mce_inject_cpumask);
- for_each_online_cpu(cpu) {
- struct mce *mcpu = &per_cpu(injectm, cpu);
- if (!mcpu->finished ||
- MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM)
- cpumask_clear_cpu(cpu, mce_inject_cpumask);
- }
- if (!cpumask_empty(mce_inject_cpumask)) {
- if (m->inject_flags & MCJ_IRQ_BROADCAST) {
- /*
- * don't wait because mce_irq_ipi is necessary
- * to be sync with following raise_local
- */
- preempt_disable();
- smp_call_function_many(mce_inject_cpumask,
- mce_irq_ipi, NULL, 0);
- preempt_enable();
- } else if (m->inject_flags & MCJ_NMI_BROADCAST)
- apic->send_IPI_mask(mce_inject_cpumask,
- NMI_VECTOR);
- }
- start = jiffies;
- while (!cpumask_empty(mce_inject_cpumask)) {
- if (!time_before(jiffies, start + 2*HZ)) {
- printk(KERN_ERR
- "Timeout waiting for mce inject %lx\n",
- *cpumask_bits(mce_inject_cpumask));
- break;
- }
- cpu_relax();
- }
- raise_local();
- put_cpu();
- put_online_cpus();
- } else
-#endif
- {
- preempt_disable();
- raise_local();
- preempt_enable();
- }
-}
-
-/* Error injection interface */
-static ssize_t mce_write(struct file *filp, const char __user *ubuf,
- size_t usize, loff_t *off)
-{
- struct mce m;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
- /*
- * There are some cases where real MSR reads could slip
- * through.
- */
- if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA))
- return -EIO;
-
- if ((unsigned long)usize > sizeof(struct mce))
- usize = sizeof(struct mce);
- if (copy_from_user(&m, ubuf, usize))
- return -EFAULT;
-
- if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu))
- return -EINVAL;
-
- /*
- * Need to give user space some time to set everything up,
- * so do it a jiffie or two later everywhere.
- */
- schedule_timeout(2);
-
- mutex_lock(&mce_inject_mutex);
- raise_mce(&m);
- mutex_unlock(&mce_inject_mutex);
- return usize;
-}
-
-static int inject_init(void)
-{
- if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL))
- return -ENOMEM;
- printk(KERN_INFO "Machine check injector initialized\n");
- register_mce_write_callback(mce_write);
- register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0,
- "mce_notify");
- return 0;
-}
-
-module_init(inject_init);
-/*
- * Cannot tolerate unloading currently because we cannot
- * guarantee all openers of mce_chrdev will get a reference to us.
- */
-MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
deleted file mode 100644
index 5b7d4fa5d3b7..000000000000
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ /dev/null
@@ -1,63 +0,0 @@
-#include <linux/device.h>
-#include <asm/mce.h>
-
-enum severity_level {
- MCE_NO_SEVERITY,
- MCE_KEEP_SEVERITY,
- MCE_SOME_SEVERITY,
- MCE_AO_SEVERITY,
- MCE_UC_SEVERITY,
- MCE_AR_SEVERITY,
- MCE_PANIC_SEVERITY,
-};
-
-#define ATTR_LEN 16
-
-/* One object for each MCE bank, shared by all CPUs */
-struct mce_bank {
- u64 ctl; /* subevents to enable */
- unsigned char init; /* initialise bank? */
- struct device_attribute attr; /* device attribute */
- char attrname[ATTR_LEN]; /* attribute name */
-};
-
-int mce_severity(struct mce *a, int tolerant, char **msg);
-struct dentry *mce_get_debugfs_dir(void);
-
-extern struct mce_bank *mce_banks;
-
-#ifdef CONFIG_X86_MCE_INTEL
-unsigned long mce_intel_adjust_timer(unsigned long interval);
-void mce_intel_cmci_poll(void);
-void mce_intel_hcpu_update(unsigned long cpu);
-#else
-# define mce_intel_adjust_timer mce_adjust_timer_default
-static inline void mce_intel_cmci_poll(void) { }
-static inline void mce_intel_hcpu_update(unsigned long cpu) { }
-#endif
-
-void mce_timer_kick(unsigned long interval);
-
-#ifdef CONFIG_ACPI_APEI
-int apei_write_mce(struct mce *m);
-ssize_t apei_read_mce(struct mce *m, u64 *record_id);
-int apei_check_mce(void);
-int apei_clear_mce(u64 record_id);
-#else
-static inline int apei_write_mce(struct mce *m)
-{
- return -EINVAL;
-}
-static inline ssize_t apei_read_mce(struct mce *m, u64 *record_id)
-{
- return 0;
-}
-static inline int apei_check_mce(void)
-{
- return 0;
-}
-static inline int apei_clear_mce(u64 record_id)
-{
- return -EINVAL;
-}
-#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
deleted file mode 100644
index e2703520d120..000000000000
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ /dev/null
@@ -1,283 +0,0 @@
-/*
- * MCE grading rules.
- * Copyright 2008, 2009 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
- *
- * Author: Andi Kleen
- */
-#include <linux/kernel.h>
-#include <linux/seq_file.h>
-#include <linux/init.h>
-#include <linux/debugfs.h>
-#include <asm/mce.h>
-
-#include "mce-internal.h"
-
-/*
- * Grade an mce by severity. In general the most severe ones are processed
- * first. Since there are quite a lot of combinations test the bits in a
- * table-driven way. The rules are simply processed in order, first
- * match wins.
- *
- * Note this is only used for machine check exceptions, the corrected
- * errors use much simpler rules. The exceptions still check for the corrected
- * errors, but only to leave them alone for the CMCI handler (except for
- * panic situations)
- */
-
-enum context { IN_KERNEL = 1, IN_USER = 2 };
-enum ser { SER_REQUIRED = 1, NO_SER = 2 };
-
-static struct severity {
- u64 mask;
- u64 result;
- unsigned char sev;
- unsigned char mcgmask;
- unsigned char mcgres;
- unsigned char ser;
- unsigned char context;
- unsigned char covered;
- char *msg;
-} severities[] = {
-#define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c }
-#define KERNEL .context = IN_KERNEL
-#define USER .context = IN_USER
-#define SER .ser = SER_REQUIRED
-#define NOSER .ser = NO_SER
-#define BITCLR(x) .mask = x, .result = 0
-#define BITSET(x) .mask = x, .result = x
-#define MCGMASK(x, y) .mcgmask = x, .mcgres = y
-#define MASK(x, y) .mask = x, .result = y
-#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
-#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
-#define MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV)
-
- MCESEV(
- NO, "Invalid",
- BITCLR(MCI_STATUS_VAL)
- ),
- MCESEV(
- NO, "Not enabled",
- BITCLR(MCI_STATUS_EN)
- ),
- MCESEV(
- PANIC, "Processor context corrupt",
- BITSET(MCI_STATUS_PCC)
- ),
- /* When MCIP is not set something is very confused */
- MCESEV(
- PANIC, "MCIP not set in MCA handler",
- MCGMASK(MCG_STATUS_MCIP, 0)
- ),
- /* Neither return not error IP -- no chance to recover -> PANIC */
- MCESEV(
- PANIC, "Neither restart nor error IP",
- MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0)
- ),
- MCESEV(
- PANIC, "In kernel and no restart IP",
- KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
- ),
- MCESEV(
- KEEP, "Corrected error",
- NOSER, BITCLR(MCI_STATUS_UC)
- ),
-
- /* ignore OVER for UCNA */
- MCESEV(
- KEEP, "Uncorrected no action required",
- SER, MASK(MCI_UC_SAR, MCI_STATUS_UC)
- ),
- MCESEV(
- PANIC, "Illegal combination (UCNA with AR=1)",
- SER,
- MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR)
- ),
- MCESEV(
- KEEP, "Non signalled machine check",
- SER, BITCLR(MCI_STATUS_S)
- ),
-
- MCESEV(
- PANIC, "Action required with lost events",
- SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR)
- ),
-
- /* known AR MCACODs: */
-#ifdef CONFIG_MEMORY_FAILURE
- MCESEV(
- KEEP, "Action required but unaffected thread is continuable",
- SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR),
- MCGMASK(MCG_STATUS_RIPV, MCG_STATUS_RIPV)
- ),
- MCESEV(
- AR, "Action required: data load error in a user process",
- SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
- USER
- ),
- MCESEV(
- AR, "Action required: instruction fetch error in a user process",
- SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
- USER
- ),
-#endif
- MCESEV(
- PANIC, "Action required: unknown MCACOD",
- SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)
- ),
-
- /* known AO MCACODs: */
- MCESEV(
- AO, "Action optional: memory scrubbing error",
- SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD_SCRUBMSK, MCI_UC_S|MCACOD_SCRUB)
- ),
- MCESEV(
- AO, "Action optional: last level cache writeback error",
- SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|MCACOD_L3WB)
- ),
- MCESEV(
- SOME, "Action optional: unknown MCACOD",
- SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S)
- ),
- MCESEV(
- SOME, "Action optional with lost events",
- SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_S)
- ),
-
- MCESEV(
- PANIC, "Overflowed uncorrected",
- BITSET(MCI_STATUS_OVER|MCI_STATUS_UC)
- ),
- MCESEV(
- UC, "Uncorrected",
- BITSET(MCI_STATUS_UC)
- ),
- MCESEV(
- SOME, "No match",
- BITSET(0)
- ) /* always matches. keep at end */
-};
-
-/*
- * If mcgstatus indicated that ip/cs on the stack were
- * no good, then "m->cs" will be zero and we will have
- * to assume the worst case (IN_KERNEL) as we actually
- * have no idea what we were executing when the machine
- * check hit.
- * If we do have a good "m->cs" (or a faked one in the
- * case we were executing in VM86 mode) we can use it to
- * distinguish an exception taken in user from from one
- * taken in the kernel.
- */
-static int error_context(struct mce *m)
-{
- return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
-}
-
-int mce_severity(struct mce *m, int tolerant, char **msg)
-{
- enum context ctx = error_context(m);
- struct severity *s;
-
- for (s = severities;; s++) {
- if ((m->status & s->mask) != s->result)
- continue;
- if ((m->mcgstatus & s->mcgmask) != s->mcgres)
- continue;
- if (s->ser == SER_REQUIRED && !mca_cfg.ser)
- continue;
- if (s->ser == NO_SER && mca_cfg.ser)
- continue;
- if (s->context && ctx != s->context)
- continue;
- if (msg)
- *msg = s->msg;
- s->covered = 1;
- if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) {
- if (panic_on_oops || tolerant < 1)
- return MCE_PANIC_SEVERITY;
- }
- return s->sev;
- }
-}
-
-#ifdef CONFIG_DEBUG_FS
-static void *s_start(struct seq_file *f, loff_t *pos)
-{
- if (*pos >= ARRAY_SIZE(severities))
- return NULL;
- return &severities[*pos];
-}
-
-static void *s_next(struct seq_file *f, void *data, loff_t *pos)
-{
- if (++(*pos) >= ARRAY_SIZE(severities))
- return NULL;
- return &severities[*pos];
-}
-
-static void s_stop(struct seq_file *f, void *data)
-{
-}
-
-static int s_show(struct seq_file *f, void *data)
-{
- struct severity *ser = data;
- seq_printf(f, "%d\t%s\n", ser->covered, ser->msg);
- return 0;
-}
-
-static const struct seq_operations severities_seq_ops = {
- .start = s_start,
- .next = s_next,
- .stop = s_stop,
- .show = s_show,
-};
-
-static int severities_coverage_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &severities_seq_ops);
-}
-
-static ssize_t severities_coverage_write(struct file *file,
- const char __user *ubuf,
- size_t count, loff_t *ppos)
-{
- int i;
- for (i = 0; i < ARRAY_SIZE(severities); i++)
- severities[i].covered = 0;
- return count;
-}
-
-static const struct file_operations severities_coverage_fops = {
- .open = severities_coverage_open,
- .release = seq_release,
- .read = seq_read,
- .write = severities_coverage_write,
- .llseek = seq_lseek,
-};
-
-static int __init severities_debugfs_init(void)
-{
- struct dentry *dmce, *fsev;
-
- dmce = mce_get_debugfs_dir();
- if (!dmce)
- goto err_out;
-
- fsev = debugfs_create_file("severities-coverage", 0444, dmce, NULL,
- &severities_coverage_fops);
- if (!fsev)
- goto err_out;
-
- return 0;
-
-err_out:
- return -ENOMEM;
-}
-late_initcall(severities_debugfs_init);
-#endif /* CONFIG_DEBUG_FS */
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
deleted file mode 100644
index bf49cdbb010f..000000000000
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ /dev/null
@@ -1,2485 +0,0 @@
-/*
- * Machine check handler.
- *
- * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
- * Rest from unknown author(s).
- * 2004 Andi Kleen. Rewrote most of it.
- * Copyright 2008 Intel Corporation
- * Author: Andi Kleen
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/thread_info.h>
-#include <linux/capability.h>
-#include <linux/miscdevice.h>
-#include <linux/ratelimit.h>
-#include <linux/kallsyms.h>
-#include <linux/rcupdate.h>
-#include <linux/kobject.h>
-#include <linux/uaccess.h>
-#include <linux/kdebug.h>
-#include <linux/kernel.h>
-#include <linux/percpu.h>
-#include <linux/string.h>
-#include <linux/device.h>
-#include <linux/syscore_ops.h>
-#include <linux/delay.h>
-#include <linux/ctype.h>
-#include <linux/sched.h>
-#include <linux/sysfs.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/kmod.h>
-#include <linux/poll.h>
-#include <linux/nmi.h>
-#include <linux/cpu.h>
-#include <linux/smp.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/debugfs.h>
-#include <linux/irq_work.h>
-#include <linux/export.h>
-
-#include <asm/processor.h>
-#include <asm/mce.h>
-#include <asm/msr.h>
-
-#include "mce-internal.h"
-
-static DEFINE_MUTEX(mce_chrdev_read_mutex);
-
-#define rcu_dereference_check_mce(p) \
- rcu_dereference_index_check((p), \
- rcu_read_lock_sched_held() || \
- lockdep_is_held(&mce_chrdev_read_mutex))
-
-#define CREATE_TRACE_POINTS
-#include <trace/events/mce.h>
-
-#define SPINUNIT 100 /* 100ns */
-
-atomic_t mce_entry;
-
-DEFINE_PER_CPU(unsigned, mce_exception_count);
-
-struct mce_bank *mce_banks __read_mostly;
-
-struct mca_config mca_cfg __read_mostly = {
- .bootlog = -1,
- /*
- * Tolerant levels:
- * 0: always panic on uncorrected errors, log corrected errors
- * 1: panic or SIGBUS on uncorrected errors, log corrected errors
- * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors
- * 3: never panic or SIGBUS, log all errors (for testing only)
- */
- .tolerant = 1,
- .monarch_timeout = -1
-};
-
-/* User mode helper program triggered by machine check event */
-static unsigned long mce_need_notify;
-static char mce_helper[128];
-static char *mce_helper_argv[2] = { mce_helper, NULL };
-
-static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
-
-static DEFINE_PER_CPU(struct mce, mces_seen);
-static int cpu_missing;
-
-/*
- * MCA banks polled by the period polling timer for corrected events.
- * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
- */
-DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
- [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
-};
-
-static DEFINE_PER_CPU(struct work_struct, mce_work);
-
-static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
-
-/*
- * CPU/chipset specific EDAC code can register a notifier call here to print
- * MCE errors in a human-readable form.
- */
-ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
-
-/* Do initial initialization of a struct mce */
-void mce_setup(struct mce *m)
-{
- memset(m, 0, sizeof(struct mce));
- m->cpu = m->extcpu = smp_processor_id();
- rdtscll(m->tsc);
- /* We hope get_seconds stays lockless */
- m->time = get_seconds();
- m->cpuvendor = boot_cpu_data.x86_vendor;
- m->cpuid = cpuid_eax(1);
- m->socketid = cpu_data(m->extcpu).phys_proc_id;
- m->apicid = cpu_data(m->extcpu).initial_apicid;
- rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
-}
-
-DEFINE_PER_CPU(struct mce, injectm);
-EXPORT_PER_CPU_SYMBOL_GPL(injectm);
-
-/*
- * Lockless MCE logging infrastructure.
- * This avoids deadlocks on printk locks without having to break locks. Also
- * separate MCEs from kernel messages to avoid bogus bug reports.
- */
-
-static struct mce_log mcelog = {
- .signature = MCE_LOG_SIGNATURE,
- .len = MCE_LOG_LEN,
- .recordlen = sizeof(struct mce),
-};
-
-void mce_log(struct mce *mce)
-{
- unsigned next, entry;
- int ret = 0;
-
- /* Emit the trace record: */
- trace_mce_record(mce);
-
- ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
- if (ret == NOTIFY_STOP)
- return;
-
- mce->finished = 0;
- wmb();
- for (;;) {
- entry = rcu_dereference_check_mce(mcelog.next);
- for (;;) {
-
- /*
- * When the buffer fills up discard new entries.
- * Assume that the earlier errors are the more
- * interesting ones:
- */
- if (entry >= MCE_LOG_LEN) {
- set_bit(MCE_OVERFLOW,
- (unsigned long *)&mcelog.flags);
- return;
- }
- /* Old left over entry. Skip: */
- if (mcelog.entry[entry].finished) {
- entry++;
- continue;
- }
- break;
- }
- smp_rmb();
- next = entry + 1;
- if (cmpxchg(&mcelog.next, entry, next) == entry)
- break;
- }
- memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
- wmb();
- mcelog.entry[entry].finished = 1;
- wmb();
-
- mce->finished = 1;
- set_bit(0, &mce_need_notify);
-}
-
-static void drain_mcelog_buffer(void)
-{
- unsigned int next, i, prev = 0;
-
- next = ACCESS_ONCE(mcelog.next);
-
- do {
- struct mce *m;
-
- /* drain what was logged during boot */
- for (i = prev; i < next; i++) {
- unsigned long start = jiffies;
- unsigned retries = 1;
-
- m = &mcelog.entry[i];
-
- while (!m->finished) {
- if (time_after_eq(jiffies, start + 2*retries))
- retries++;
-
- cpu_relax();
-
- if (!m->finished && retries >= 4) {
- pr_err("skipping error being logged currently!\n");
- break;
- }
- }
- smp_rmb();
- atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
- }
-
- memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m));
- prev = next;
- next = cmpxchg(&mcelog.next, prev, 0);
- } while (next != prev);
-}
-
-
-void mce_register_decode_chain(struct notifier_block *nb)
-{
- atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
- drain_mcelog_buffer();
-}
-EXPORT_SYMBOL_GPL(mce_register_decode_chain);
-
-void mce_unregister_decode_chain(struct notifier_block *nb)
-{
- atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
-}
-EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
-
-static void print_mce(struct mce *m)
-{
- int ret = 0;
-
- pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
- m->extcpu, m->mcgstatus, m->bank, m->status);
-
- if (m->ip) {
- pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
- !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
- m->cs, m->ip);
-
- if (m->cs == __KERNEL_CS)
- print_symbol("{%s}", m->ip);
- pr_cont("\n");
- }
-
- pr_emerg(HW_ERR "TSC %llx ", m->tsc);
- if (m->addr)
- pr_cont("ADDR %llx ", m->addr);
- if (m->misc)
- pr_cont("MISC %llx ", m->misc);
-
- pr_cont("\n");
- /*
- * Note this output is parsed by external tools and old fields
- * should not be changed.
- */
- pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
- m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
- cpu_data(m->extcpu).microcode);
-
- /*
- * Print out human-readable details about the MCE error,
- * (if the CPU has an implementation for that)
- */
- ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
- if (ret == NOTIFY_STOP)
- return;
-
- pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
-}
-
-#define PANIC_TIMEOUT 5 /* 5 seconds */
-
-static atomic_t mce_paniced;
-
-static int fake_panic;
-static atomic_t mce_fake_paniced;
-
-/* Panic in progress. Enable interrupts and wait for final IPI */
-static void wait_for_panic(void)
-{
- long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
-
- preempt_disable();
- local_irq_enable();
- while (timeout-- > 0)
- udelay(1);
- if (panic_timeout == 0)
- panic_timeout = mca_cfg.panic_timeout;
- panic("Panicing machine check CPU died");
-}
-
-static void mce_panic(char *msg, struct mce *final, char *exp)
-{
- int i, apei_err = 0;
-
- if (!fake_panic) {
- /*
- * Make sure only one CPU runs in machine check panic
- */
- if (atomic_inc_return(&mce_paniced) > 1)
- wait_for_panic();
- barrier();
-
- bust_spinlocks(1);
- console_verbose();
- } else {
- /* Don't log too much for fake panic */
- if (atomic_inc_return(&mce_fake_paniced) > 1)
- return;
- }
- /* First print corrected ones that are still unlogged */
- for (i = 0; i < MCE_LOG_LEN; i++) {
- struct mce *m = &mcelog.entry[i];
- if (!(m->status & MCI_STATUS_VAL))
- continue;
- if (!(m->status & MCI_STATUS_UC)) {
- print_mce(m);
- if (!apei_err)
- apei_err = apei_write_mce(m);
- }
- }
- /* Now print uncorrected but with the final one last */
- for (i = 0; i < MCE_LOG_LEN; i++) {
- struct mce *m = &mcelog.entry[i];
- if (!(m->status & MCI_STATUS_VAL))
- continue;
- if (!(m->status & MCI_STATUS_UC))
- continue;
- if (!final || memcmp(m, final, sizeof(struct mce))) {
- print_mce(m);
- if (!apei_err)
- apei_err = apei_write_mce(m);
- }
- }
- if (final) {
- print_mce(final);
- if (!apei_err)
- apei_err = apei_write_mce(final);
- }
- if (cpu_missing)
- pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
- if (exp)
- pr_emerg(HW_ERR "Machine check: %s\n", exp);
- if (!fake_panic) {
- if (panic_timeout == 0)
- panic_timeout = mca_cfg.panic_timeout;
- panic(msg);
- } else
- pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
-}
-
-/* Support code for software error injection */
-
-static int msr_to_offset(u32 msr)
-{
- unsigned bank = __this_cpu_read(injectm.bank);
-
- if (msr == mca_cfg.rip_msr)
- return offsetof(struct mce, ip);
- if (msr == MSR_IA32_MCx_STATUS(bank))
- return offsetof(struct mce, status);
- if (msr == MSR_IA32_MCx_ADDR(bank))
- return offsetof(struct mce, addr);
- if (msr == MSR_IA32_MCx_MISC(bank))
- return offsetof(struct mce, misc);
- if (msr == MSR_IA32_MCG_STATUS)
- return offsetof(struct mce, mcgstatus);
- return -1;
-}
-
-/* MSR access wrappers used for error injection */
-static u64 mce_rdmsrl(u32 msr)
-{
- u64 v;
-
- if (__this_cpu_read(injectm.finished)) {
- int offset = msr_to_offset(msr);
-
- if (offset < 0)
- return 0;
- return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
- }
-
- if (rdmsrl_safe(msr, &v)) {
- WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr);
- /*
- * Return zero in case the access faulted. This should
- * not happen normally but can happen if the CPU does
- * something weird, or if the code is buggy.
- */
- v = 0;
- }
-
- return v;
-}
-
-static void mce_wrmsrl(u32 msr, u64 v)
-{
- if (__this_cpu_read(injectm.finished)) {
- int offset = msr_to_offset(msr);
-
- if (offset >= 0)
- *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
- return;
- }
- wrmsrl(msr, v);
-}
-
-/*
- * Collect all global (w.r.t. this processor) status about this machine
- * check into our "mce" struct so that we can use it later to assess
- * the severity of the problem as we read per-bank specific details.
- */
-static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
-{
- mce_setup(m);
-
- m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
- if (regs) {
- /*
- * Get the address of the instruction at the time of
- * the machine check error.
- */
- if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
- m->ip = regs->ip;
- m->cs = regs->cs;
-
- /*
- * When in VM86 mode make the cs look like ring 3
- * always. This is a lie, but it's better than passing
- * the additional vm86 bit around everywhere.
- */
- if (v8086_mode(regs))
- m->cs |= 3;
- }
- /* Use accurate RIP reporting if available. */
- if (mca_cfg.rip_msr)
- m->ip = mce_rdmsrl(mca_cfg.rip_msr);
- }
-}
-
-/*
- * Simple lockless ring to communicate PFNs from the exception handler with the
- * process context work function. This is vastly simplified because there's
- * only a single reader and a single writer.
- */
-#define MCE_RING_SIZE 16 /* we use one entry less */
-
-struct mce_ring {
- unsigned short start;
- unsigned short end;
- unsigned long ring[MCE_RING_SIZE];
-};
-static DEFINE_PER_CPU(struct mce_ring, mce_ring);
-
-/* Runs with CPU affinity in workqueue */
-static int mce_ring_empty(void)
-{
- struct mce_ring *r = &__get_cpu_var(mce_ring);
-
- return r->start == r->end;
-}
-
-static int mce_ring_get(unsigned long *pfn)
-{
- struct mce_ring *r;
- int ret = 0;
-
- *pfn = 0;
- get_cpu();
- r = &__get_cpu_var(mce_ring);
- if (r->start == r->end)
- goto out;
- *pfn = r->ring[r->start];
- r->start = (r->start + 1) % MCE_RING_SIZE;
- ret = 1;
-out:
- put_cpu();
- return ret;
-}
-
-/* Always runs in MCE context with preempt off */
-static int mce_ring_add(unsigned long pfn)
-{
- struct mce_ring *r = &__get_cpu_var(mce_ring);
- unsigned next;
-
- next = (r->end + 1) % MCE_RING_SIZE;
- if (next == r->start)
- return -1;
- r->ring[r->end] = pfn;
- wmb();
- r->end = next;
- return 0;
-}
-
-int mce_available(struct cpuinfo_x86 *c)
-{
- if (mca_cfg.disabled)
- return 0;
- return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
-}
-
-static void mce_schedule_work(void)
-{
- if (!mce_ring_empty())
- schedule_work(&__get_cpu_var(mce_work));
-}
-
-DEFINE_PER_CPU(struct irq_work, mce_irq_work);
-
-static void mce_irq_work_cb(struct irq_work *entry)
-{
- mce_notify_irq();
- mce_schedule_work();
-}
-
-static void mce_report_event(struct pt_regs *regs)
-{
- if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
- mce_notify_irq();
- /*
- * Triggering the work queue here is just an insurance
- * policy in case the syscall exit notify handler
- * doesn't run soon enough or ends up running on the
- * wrong CPU (can happen when audit sleeps)
- */
- mce_schedule_work();
- return;
- }
-
- irq_work_queue(&__get_cpu_var(mce_irq_work));
-}
-
-/*
- * Read ADDR and MISC registers.
- */
-static void mce_read_aux(struct mce *m, int i)
-{
- if (m->status & MCI_STATUS_MISCV)
- m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
- if (m->status & MCI_STATUS_ADDRV) {
- m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
-
- /*
- * Mask the reported address by the reported granularity.
- */
- if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
- u8 shift = MCI_MISC_ADDR_LSB(m->misc);
- m->addr >>= shift;
- m->addr <<= shift;
- }
- }
-}
-
-DEFINE_PER_CPU(unsigned, mce_poll_count);
-
-/*
- * Poll for corrected events or events that happened before reset.
- * Those are just logged through /dev/mcelog.
- *
- * This is executed in standard interrupt context.
- *
- * Note: spec recommends to panic for fatal unsignalled
- * errors here. However this would be quite problematic --
- * we would need to reimplement the Monarch handling and
- * it would mess up the exclusion between exception handler
- * and poll hander -- * so we skip this for now.
- * These cases should not happen anyways, or only when the CPU
- * is already totally * confused. In this case it's likely it will
- * not fully execute the machine check handler either.
- */
-void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
-{
- struct mce m;
- int i;
-
- this_cpu_inc(mce_poll_count);
-
- mce_gather_info(&m, NULL);
-
- for (i = 0; i < mca_cfg.banks; i++) {
- if (!mce_banks[i].ctl || !test_bit(i, *b))
- continue;
-
- m.misc = 0;
- m.addr = 0;
- m.bank = i;
- m.tsc = 0;
-
- barrier();
- m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
- if (!(m.status & MCI_STATUS_VAL))
- continue;
-
- /*
- * Uncorrected or signalled events are handled by the exception
- * handler when it is enabled, so don't process those here.
- *
- * TBD do the same check for MCI_STATUS_EN here?
- */
- if (!(flags & MCP_UC) &&
- (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC)))
- continue;
-
- mce_read_aux(&m, i);
-
- if (!(flags & MCP_TIMESTAMP))
- m.tsc = 0;
- /*
- * Don't get the IP here because it's unlikely to
- * have anything to do with the actual error location.
- */
- if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
- mce_log(&m);
-
- /*
- * Clear state for this bank.
- */
- mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
- }
-
- /*
- * Don't clear MCG_STATUS here because it's only defined for
- * exceptions.
- */
-
- sync_core();
-}
-EXPORT_SYMBOL_GPL(machine_check_poll);
-
-/*
- * Do a quick check if any of the events requires a panic.
- * This decides if we keep the events around or clear them.
- */
-static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
- struct pt_regs *regs)
-{
- int i, ret = 0;
-
- for (i = 0; i < mca_cfg.banks; i++) {
- m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
- if (m->status & MCI_STATUS_VAL) {
- __set_bit(i, validp);
- if (quirk_no_way_out)
- quirk_no_way_out(i, m, regs);
- }
- if (mce_severity(m, mca_cfg.tolerant, msg) >= MCE_PANIC_SEVERITY)
- ret = 1;
- }
- return ret;
-}
-
-/*
- * Variable to establish order between CPUs while scanning.
- * Each CPU spins initially until executing is equal its number.
- */
-static atomic_t mce_executing;
-
-/*
- * Defines order of CPUs on entry. First CPU becomes Monarch.
- */
-static atomic_t mce_callin;
-
-/*
- * Check if a timeout waiting for other CPUs happened.
- */
-static int mce_timed_out(u64 *t)
-{
- /*
- * The others already did panic for some reason.
- * Bail out like in a timeout.
- * rmb() to tell the compiler that system_state
- * might have been modified by someone else.
- */
- rmb();
- if (atomic_read(&mce_paniced))
- wait_for_panic();
- if (!mca_cfg.monarch_timeout)
- goto out;
- if ((s64)*t < SPINUNIT) {
- /* CHECKME: Make panic default for 1 too? */
- if (mca_cfg.tolerant < 1)
- mce_panic("Timeout synchronizing machine check over CPUs",
- NULL, NULL);
- cpu_missing = 1;
- return 1;
- }
- *t -= SPINUNIT;
-out:
- touch_nmi_watchdog();
- return 0;
-}
-
-/*
- * The Monarch's reign. The Monarch is the CPU who entered
- * the machine check handler first. It waits for the others to
- * raise the exception too and then grades them. When any
- * error is fatal panic. Only then let the others continue.
- *
- * The other CPUs entering the MCE handler will be controlled by the
- * Monarch. They are called Subjects.
- *
- * This way we prevent any potential data corruption in a unrecoverable case
- * and also makes sure always all CPU's errors are examined.
- *
- * Also this detects the case of a machine check event coming from outer
- * space (not detected by any CPUs) In this case some external agent wants
- * us to shut down, so panic too.
- *
- * The other CPUs might still decide to panic if the handler happens
- * in a unrecoverable place, but in this case the system is in a semi-stable
- * state and won't corrupt anything by itself. It's ok to let the others
- * continue for a bit first.
- *
- * All the spin loops have timeouts; when a timeout happens a CPU
- * typically elects itself to be Monarch.
- */
-static void mce_reign(void)
-{
- int cpu;
- struct mce *m = NULL;
- int global_worst = 0;
- char *msg = NULL;
- char *nmsg = NULL;
-
- /*
- * This CPU is the Monarch and the other CPUs have run
- * through their handlers.
- * Grade the severity of the errors of all the CPUs.
- */
- for_each_possible_cpu(cpu) {
- int severity = mce_severity(&per_cpu(mces_seen, cpu),
- mca_cfg.tolerant,
- &nmsg);
- if (severity > global_worst) {
- msg = nmsg;
- global_worst = severity;
- m = &per_cpu(mces_seen, cpu);
- }
- }
-
- /*
- * Cannot recover? Panic here then.
- * This dumps all the mces in the log buffer and stops the
- * other CPUs.
- */
- if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
- mce_panic("Fatal Machine check", m, msg);
-
- /*
- * For UC somewhere we let the CPU who detects it handle it.
- * Also must let continue the others, otherwise the handling
- * CPU could deadlock on a lock.
- */
-
- /*
- * No machine check event found. Must be some external
- * source or one CPU is hung. Panic.
- */
- if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
- mce_panic("Machine check from unknown source", NULL, NULL);
-
- /*
- * Now clear all the mces_seen so that they don't reappear on
- * the next mce.
- */
- for_each_possible_cpu(cpu)
- memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
-}
-
-static atomic_t global_nwo;
-
-/*
- * Start of Monarch synchronization. This waits until all CPUs have
- * entered the exception handler and then determines if any of them
- * saw a fatal event that requires panic. Then it executes them
- * in the entry order.
- * TBD double check parallel CPU hotunplug
- */
-static int mce_start(int *no_way_out)
-{
- int order;
- int cpus = num_online_cpus();
- u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
-
- if (!timeout)
- return -1;
-
- atomic_add(*no_way_out, &global_nwo);
- /*
- * global_nwo should be updated before mce_callin
- */
- smp_wmb();
- order = atomic_inc_return(&mce_callin);
-
- /*
- * Wait for everyone.
- */
- while (atomic_read(&mce_callin) != cpus) {
- if (mce_timed_out(&timeout)) {
- atomic_set(&global_nwo, 0);
- return -1;
- }
- ndelay(SPINUNIT);
- }
-
- /*
- * mce_callin should be read before global_nwo
- */
- smp_rmb();
-
- if (order == 1) {
- /*
- * Monarch: Starts executing now, the others wait.
- */
- atomic_set(&mce_executing, 1);
- } else {
- /*
- * Subject: Now start the scanning loop one by one in
- * the original callin order.
- * This way when there are any shared banks it will be
- * only seen by one CPU before cleared, avoiding duplicates.
- */
- while (atomic_read(&mce_executing) < order) {
- if (mce_timed_out(&timeout)) {
- atomic_set(&global_nwo, 0);
- return -1;
- }
- ndelay(SPINUNIT);
- }
- }
-
- /*
- * Cache the global no_way_out state.
- */
- *no_way_out = atomic_read(&global_nwo);
-
- return order;
-}
-
-/*
- * Synchronize between CPUs after main scanning loop.
- * This invokes the bulk of the Monarch processing.
- */
-static int mce_end(int order)
-{
- int ret = -1;
- u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
-
- if (!timeout)
- goto reset;
- if (order < 0)
- goto reset;
-
- /*
- * Allow others to run.
- */
- atomic_inc(&mce_executing);
-
- if (order == 1) {
- /* CHECKME: Can this race with a parallel hotplug? */
- int cpus = num_online_cpus();
-
- /*
- * Monarch: Wait for everyone to go through their scanning
- * loops.
- */
- while (atomic_read(&mce_executing) <= cpus) {
- if (mce_timed_out(&timeout))
- goto reset;
- ndelay(SPINUNIT);
- }
-
- mce_reign();
- barrier();
- ret = 0;
- } else {
- /*
- * Subject: Wait for Monarch to finish.
- */
- while (atomic_read(&mce_executing) != 0) {
- if (mce_timed_out(&timeout))
- goto reset;
- ndelay(SPINUNIT);
- }
-
- /*
- * Don't reset anything. That's done by the Monarch.
- */
- return 0;
- }
-
- /*
- * Reset all global state.
- */
-reset:
- atomic_set(&global_nwo, 0);
- atomic_set(&mce_callin, 0);
- barrier();
-
- /*
- * Let others run again.
- */
- atomic_set(&mce_executing, 0);
- return ret;
-}
-
-/*
- * Check if the address reported by the CPU is in a format we can parse.
- * It would be possible to add code for most other cases, but all would
- * be somewhat complicated (e.g. segment offset would require an instruction
- * parser). So only support physical addresses up to page granuality for now.
- */
-static int mce_usable_address(struct mce *m)
-{
- if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
- return 0;
- if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
- return 0;
- if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
- return 0;
- return 1;
-}
-
-static void mce_clear_state(unsigned long *toclear)
-{
- int i;
-
- for (i = 0; i < mca_cfg.banks; i++) {
- if (test_bit(i, toclear))
- mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
- }
-}
-
-/*
- * Need to save faulting physical address associated with a process
- * in the machine check handler some place where we can grab it back
- * later in mce_notify_process()
- */
-#define MCE_INFO_MAX 16
-
-struct mce_info {
- atomic_t inuse;
- struct task_struct *t;
- __u64 paddr;
- int restartable;
-} mce_info[MCE_INFO_MAX];
-
-static void mce_save_info(__u64 addr, int c)
-{
- struct mce_info *mi;
-
- for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) {
- if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) {
- mi->t = current;
- mi->paddr = addr;
- mi->restartable = c;
- return;
- }
- }
-
- mce_panic("Too many concurrent recoverable errors", NULL, NULL);
-}
-
-static struct mce_info *mce_find_info(void)
-{
- struct mce_info *mi;
-
- for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++)
- if (atomic_read(&mi->inuse) && mi->t == current)
- return mi;
- return NULL;
-}
-
-static void mce_clear_info(struct mce_info *mi)
-{
- atomic_set(&mi->inuse, 0);
-}
-
-/*
- * The actual machine check handler. This only handles real
- * exceptions when something got corrupted coming in through int 18.
- *
- * This is executed in NMI context not subject to normal locking rules. This
- * implies that most kernel services cannot be safely used. Don't even
- * think about putting a printk in there!
- *
- * On Intel systems this is entered on all CPUs in parallel through
- * MCE broadcast. However some CPUs might be broken beyond repair,
- * so be always careful when synchronizing with others.
- */
-void do_machine_check(struct pt_regs *regs, long error_code)
-{
- struct mca_config *cfg = &mca_cfg;
- struct mce m, *final;
- int i;
- int worst = 0;
- int severity;
- /*
- * Establish sequential order between the CPUs entering the machine
- * check handler.
- */
- int order;
- /*
- * If no_way_out gets set, there is no safe way to recover from this
- * MCE. If mca_cfg.tolerant is cranked up, we'll try anyway.
- */
- int no_way_out = 0;
- /*
- * If kill_it gets set, there might be a way to recover from this
- * error.
- */
- int kill_it = 0;
- DECLARE_BITMAP(toclear, MAX_NR_BANKS);
- DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
- char *msg = "Unknown";
-
- atomic_inc(&mce_entry);
-
- this_cpu_inc(mce_exception_count);
-
- if (!cfg->banks)
- goto out;
-
- mce_gather_info(&m, regs);
-
- final = &__get_cpu_var(mces_seen);
- *final = m;
-
- memset(valid_banks, 0, sizeof(valid_banks));
- no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
-
- barrier();
-
- /*
- * When no restart IP might need to kill or panic.
- * Assume the worst for now, but if we find the
- * severity is MCE_AR_SEVERITY we have other options.
- */
- if (!(m.mcgstatus & MCG_STATUS_RIPV))
- kill_it = 1;
-
- /*
- * Go through all the banks in exclusion of the other CPUs.
- * This way we don't report duplicated events on shared banks
- * because the first one to see it will clear it.
- */
- order = mce_start(&no_way_out);
- for (i = 0; i < cfg->banks; i++) {
- __clear_bit(i, toclear);
- if (!test_bit(i, valid_banks))
- continue;
- if (!mce_banks[i].ctl)
- continue;
-
- m.misc = 0;
- m.addr = 0;
- m.bank = i;
-
- m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
- if ((m.status & MCI_STATUS_VAL) == 0)
- continue;
-
- /*
- * Non uncorrected or non signaled errors are handled by
- * machine_check_poll. Leave them alone, unless this panics.
- */
- if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
- !no_way_out)
- continue;
-
- /*
- * Set taint even when machine check was not enabled.
- */
- add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
-
- severity = mce_severity(&m, cfg->tolerant, NULL);
-
- /*
- * When machine check was for corrected handler don't touch,
- * unless we're panicing.
- */
- if (severity == MCE_KEEP_SEVERITY && !no_way_out)
- continue;
- __set_bit(i, toclear);
- if (severity == MCE_NO_SEVERITY) {
- /*
- * Machine check event was not enabled. Clear, but
- * ignore.
- */
- continue;
- }
-
- mce_read_aux(&m, i);
-
- /*
- * Action optional error. Queue address for later processing.
- * When the ring overflows we just ignore the AO error.
- * RED-PEN add some logging mechanism when
- * usable_address or mce_add_ring fails.
- * RED-PEN don't ignore overflow for mca_cfg.tolerant == 0
- */
- if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
- mce_ring_add(m.addr >> PAGE_SHIFT);
-
- mce_log(&m);
-
- if (severity > worst) {
- *final = m;
- worst = severity;
- }
- }
-
- /* mce_clear_state will clear *final, save locally for use later */
- m = *final;
-
- if (!no_way_out)
- mce_clear_state(toclear);
-
- /*
- * Do most of the synchronization with other CPUs.
- * When there's any problem use only local no_way_out state.
- */
- if (mce_end(order) < 0)
- no_way_out = worst >= MCE_PANIC_SEVERITY;
-
- /*
- * At insane "tolerant" levels we take no action. Otherwise
- * we only die if we have no other choice. For less serious
- * issues we try to recover, or limit damage to the current
- * process.
- */
- if (cfg->tolerant < 3) {
- if (no_way_out)
- mce_panic("Fatal machine check on current CPU", &m, msg);
- if (worst == MCE_AR_SEVERITY) {
- /* schedule action before return to userland */
- mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV);
- set_thread_flag(TIF_MCE_NOTIFY);
- } else if (kill_it) {
- force_sig(SIGBUS, current);
- }
- }
-
- if (worst > 0)
- mce_report_event(regs);
- mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
-out:
- atomic_dec(&mce_entry);
- sync_core();
-}
-EXPORT_SYMBOL_GPL(do_machine_check);
-
-#ifndef CONFIG_MEMORY_FAILURE
-int memory_failure(unsigned long pfn, int vector, int flags)
-{
- /* mce_severity() should not hand us an ACTION_REQUIRED error */
- BUG_ON(flags & MF_ACTION_REQUIRED);
- pr_err("Uncorrected memory error in page 0x%lx ignored\n"
- "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
- pfn);
-
- return 0;
-}
-#endif
-
-/*
- * Called in process context that interrupted by MCE and marked with
- * TIF_MCE_NOTIFY, just before returning to erroneous userland.
- * This code is allowed to sleep.
- * Attempt possible recovery such as calling the high level VM handler to
- * process any corrupted pages, and kill/signal current process if required.
- * Action required errors are handled here.
- */
-void mce_notify_process(void)
-{
- unsigned long pfn;
- struct mce_info *mi = mce_find_info();
- int flags = MF_ACTION_REQUIRED;
-
- if (!mi)
- mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
- pfn = mi->paddr >> PAGE_SHIFT;
-
- clear_thread_flag(TIF_MCE_NOTIFY);
-
- pr_err("Uncorrected hardware memory error in user-access at %llx",
- mi->paddr);
- /*
- * We must call memory_failure() here even if the current process is
- * doomed. We still need to mark the page as poisoned and alert any
- * other users of the page.
- */
- if (!mi->restartable)
- flags |= MF_MUST_KILL;
- if (memory_failure(pfn, MCE_VECTOR, flags) < 0) {
- pr_err("Memory error not recovered");
- force_sig(SIGBUS, current);
- }
- mce_clear_info(mi);
-}
-
-/*
- * Action optional processing happens here (picking up
- * from the list of faulting pages that do_machine_check()
- * placed into the "ring").
- */
-static void mce_process_work(struct work_struct *dummy)
-{
- unsigned long pfn;
-
- while (mce_ring_get(&pfn))
- memory_failure(pfn, MCE_VECTOR, 0);
-}
-
-#ifdef CONFIG_X86_MCE_INTEL
-/***
- * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
- * @cpu: The CPU on which the event occurred.
- * @status: Event status information
- *
- * This function should be called by the thermal interrupt after the
- * event has been processed and the decision was made to log the event
- * further.
- *
- * The status parameter will be saved to the 'status' field of 'struct mce'
- * and historically has been the register value of the
- * MSR_IA32_THERMAL_STATUS (Intel) msr.
- */
-void mce_log_therm_throt_event(__u64 status)
-{
- struct mce m;
-
- mce_setup(&m);
- m.bank = MCE_THERMAL_BANK;
- m.status = status;
- mce_log(&m);
-}
-#endif /* CONFIG_X86_MCE_INTEL */
-
-/*
- * Periodic polling timer for "silent" machine check errors. If the
- * poller finds an MCE, poll 2x faster. When the poller finds no more
- * errors, poll 2x slower (up to check_interval seconds).
- */
-static unsigned long check_interval = 5 * 60; /* 5 minutes */
-
-static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
-static DEFINE_PER_CPU(struct timer_list, mce_timer);
-
-static unsigned long mce_adjust_timer_default(unsigned long interval)
-{
- return interval;
-}
-
-static unsigned long (*mce_adjust_timer)(unsigned long interval) =
- mce_adjust_timer_default;
-
-static void mce_timer_fn(unsigned long data)
-{
- struct timer_list *t = &__get_cpu_var(mce_timer);
- unsigned long iv;
-
- WARN_ON(smp_processor_id() != data);
-
- if (mce_available(__this_cpu_ptr(&cpu_info))) {
- machine_check_poll(MCP_TIMESTAMP,
- &__get_cpu_var(mce_poll_banks));
- mce_intel_cmci_poll();
- }
-
- /*
- * Alert userspace if needed. If we logged an MCE, reduce the
- * polling interval, otherwise increase the polling interval.
- */
- iv = __this_cpu_read(mce_next_interval);
- if (mce_notify_irq()) {
- iv = max(iv / 2, (unsigned long) HZ/100);
- } else {
- iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
- iv = mce_adjust_timer(iv);
- }
- __this_cpu_write(mce_next_interval, iv);
- /* Might have become 0 after CMCI storm subsided */
- if (iv) {
- t->expires = jiffies + iv;
- add_timer_on(t, smp_processor_id());
- }
-}
-
-/*
- * Ensure that the timer is firing in @interval from now.
- */
-void mce_timer_kick(unsigned long interval)
-{
- struct timer_list *t = &__get_cpu_var(mce_timer);
- unsigned long when = jiffies + interval;
- unsigned long iv = __this_cpu_read(mce_next_interval);
-
- if (timer_pending(t)) {
- if (time_before(when, t->expires))
- mod_timer_pinned(t, when);
- } else {
- t->expires = round_jiffies(when);
- add_timer_on(t, smp_processor_id());
- }
- if (interval < iv)
- __this_cpu_write(mce_next_interval, interval);
-}
-
-/* Must not be called in IRQ context where del_timer_sync() can deadlock */
-static void mce_timer_delete_all(void)
-{
- int cpu;
-
- for_each_online_cpu(cpu)
- del_timer_sync(&per_cpu(mce_timer, cpu));
-}
-
-static void mce_do_trigger(struct work_struct *work)
-{
- call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
-}
-
-static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
-
-/*
- * Notify the user(s) about new machine check events.
- * Can be called from interrupt context, but not from machine check/NMI
- * context.
- */
-int mce_notify_irq(void)
-{
- /* Not more than two messages every minute */
- static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
-
- if (test_and_clear_bit(0, &mce_need_notify)) {
- /* wake processes polling /dev/mcelog */
- wake_up_interruptible(&mce_chrdev_wait);
-
- if (mce_helper[0])
- schedule_work(&mce_trigger_work);
-
- if (__ratelimit(&ratelimit))
- pr_info(HW_ERR "Machine check events logged\n");
-
- return 1;
- }
- return 0;
-}
-EXPORT_SYMBOL_GPL(mce_notify_irq);
-
-static int __cpuinit __mcheck_cpu_mce_banks_init(void)
-{
- int i;
- u8 num_banks = mca_cfg.banks;
-
- mce_banks = kzalloc(num_banks * sizeof(struct mce_bank), GFP_KERNEL);
- if (!mce_banks)
- return -ENOMEM;
-
- for (i = 0; i < num_banks; i++) {
- struct mce_bank *b = &mce_banks[i];
-
- b->ctl = -1ULL;
- b->init = 1;
- }
- return 0;
-}
-
-/*
- * Initialize Machine Checks for a CPU.
- */
-static int __cpuinit __mcheck_cpu_cap_init(void)
-{
- unsigned b;
- u64 cap;
-
- rdmsrl(MSR_IA32_MCG_CAP, cap);
-
- b = cap & MCG_BANKCNT_MASK;
- if (!mca_cfg.banks)
- pr_info("CPU supports %d MCE banks\n", b);
-
- if (b > MAX_NR_BANKS) {
- pr_warn("Using only %u machine check banks out of %u\n",
- MAX_NR_BANKS, b);
- b = MAX_NR_BANKS;
- }
-
- /* Don't support asymmetric configurations today */
- WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks);
- mca_cfg.banks = b;
-
- if (!mce_banks) {
- int err = __mcheck_cpu_mce_banks_init();
-
- if (err)
- return err;
- }
-
- /* Use accurate RIP reporting if available. */
- if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
- mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
-
- if (cap & MCG_SER_P)
- mca_cfg.ser = true;
-
- return 0;
-}
-
-static void __mcheck_cpu_init_generic(void)
-{
- enum mcp_flags m_fl = 0;
- mce_banks_t all_banks;
- u64 cap;
- int i;
-
- if (!mca_cfg.bootlog)
- m_fl = MCP_DONTLOG;
-
- /*
- * Log the machine checks left over from the previous reset.
- */
- bitmap_fill(all_banks, MAX_NR_BANKS);
- machine_check_poll(MCP_UC | m_fl, &all_banks);
-
- set_in_cr4(X86_CR4_MCE);
-
- rdmsrl(MSR_IA32_MCG_CAP, cap);
- if (cap & MCG_CTL_P)
- wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
-
- for (i = 0; i < mca_cfg.banks; i++) {
- struct mce_bank *b = &mce_banks[i];
-
- if (!b->init)
- continue;
- wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
- wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
- }
-}
-
-/*
- * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
- * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
- * Vol 3B Table 15-20). But this confuses both the code that determines
- * whether the machine check occurred in kernel or user mode, and also
- * the severity assessment code. Pretend that EIPV was set, and take the
- * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
- */
-static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
-{
- if (bank != 0)
- return;
- if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
- return;
- if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
- MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
- MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
- MCACOD)) !=
- (MCI_STATUS_UC|MCI_STATUS_EN|
- MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
- MCI_STATUS_AR|MCACOD_INSTR))
- return;
-
- m->mcgstatus |= MCG_STATUS_EIPV;
- m->ip = regs->ip;
- m->cs = regs->cs;
-}
-
-/* Add per CPU specific workarounds here */
-static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
-{
- struct mca_config *cfg = &mca_cfg;
-
- if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
- pr_info("unknown CPU type - not enabling MCE support\n");
- return -EOPNOTSUPP;
- }
-
- /* This should be disabled by the BIOS, but isn't always */
- if (c->x86_vendor == X86_VENDOR_AMD) {
- if (c->x86 == 15 && cfg->banks > 4) {
- /*
- * disable GART TBL walk error reporting, which
- * trips off incorrectly with the IOMMU & 3ware
- * & Cerberus:
- */
- clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
- }
- if (c->x86 <= 17 && cfg->bootlog < 0) {
- /*
- * Lots of broken BIOS around that don't clear them
- * by default and leave crap in there. Don't log:
- */
- cfg->bootlog = 0;
- }
- /*
- * Various K7s with broken bank 0 around. Always disable
- * by default.
- */
- if (c->x86 == 6 && cfg->banks > 0)
- mce_banks[0].ctl = 0;
-
- /*
- * Turn off MC4_MISC thresholding banks on those models since
- * they're not supported there.
- */
- if (c->x86 == 0x15 &&
- (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
- int i;
- u64 val, hwcr;
- bool need_toggle;
- u32 msrs[] = {
- 0x00000413, /* MC4_MISC0 */
- 0xc0000408, /* MC4_MISC1 */
- };
-
- rdmsrl(MSR_K7_HWCR, hwcr);
-
- /* McStatusWrEn has to be set */
- need_toggle = !(hwcr & BIT(18));
-
- if (need_toggle)
- wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
-
- for (i = 0; i < ARRAY_SIZE(msrs); i++) {
- rdmsrl(msrs[i], val);
-
- /* CntP bit set? */
- if (val & BIT_64(62)) {
- val &= ~BIT_64(62);
- wrmsrl(msrs[i], val);
- }
- }
-
- /* restore old settings */
- if (need_toggle)
- wrmsrl(MSR_K7_HWCR, hwcr);
- }
- }
-
- if (c->x86_vendor == X86_VENDOR_INTEL) {
- /*
- * SDM documents that on family 6 bank 0 should not be written
- * because it aliases to another special BIOS controlled
- * register.
- * But it's not aliased anymore on model 0x1a+
- * Don't ignore bank 0 completely because there could be a
- * valid event later, merely don't write CTL0.
- */
-
- if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0)
- mce_banks[0].init = 0;
-
- /*
- * All newer Intel systems support MCE broadcasting. Enable
- * synchronization with a one second timeout.
- */
- if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
- cfg->monarch_timeout < 0)
- cfg->monarch_timeout = USEC_PER_SEC;
-
- /*
- * There are also broken BIOSes on some Pentium M and
- * earlier systems:
- */
- if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
- cfg->bootlog = 0;
-
- if (c->x86 == 6 && c->x86_model == 45)
- quirk_no_way_out = quirk_sandybridge_ifu;
- }
- if (cfg->monarch_timeout < 0)
- cfg->monarch_timeout = 0;
- if (cfg->bootlog != 0)
- cfg->panic_timeout = 30;
-
- return 0;
-}
-
-static int __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
-{
- if (c->x86 != 5)
- return 0;
-
- switch (c->x86_vendor) {
- case X86_VENDOR_INTEL:
- intel_p5_mcheck_init(c);
- return 1;
- break;
- case X86_VENDOR_CENTAUR:
- winchip_mcheck_init(c);
- return 1;
- break;
- }
-
- return 0;
-}
-
-static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
-{
- switch (c->x86_vendor) {
- case X86_VENDOR_INTEL:
- mce_intel_feature_init(c);
- mce_adjust_timer = mce_intel_adjust_timer;
- break;
- case X86_VENDOR_AMD:
- mce_amd_feature_init(c);
- break;
- default:
- break;
- }
-}
-
-static void mce_start_timer(unsigned int cpu, struct timer_list *t)
-{
- unsigned long iv = mce_adjust_timer(check_interval * HZ);
-
- __this_cpu_write(mce_next_interval, iv);
-
- if (mca_cfg.ignore_ce || !iv)
- return;
-
- t->expires = round_jiffies(jiffies + iv);
- add_timer_on(t, smp_processor_id());
-}
-
-static void __mcheck_cpu_init_timer(void)
-{
- struct timer_list *t = &__get_cpu_var(mce_timer);
- unsigned int cpu = smp_processor_id();
-
- setup_timer(t, mce_timer_fn, cpu);
- mce_start_timer(cpu, t);
-}
-
-/* Handle unconfigured int18 (should never happen) */
-static void unexpected_machine_check(struct pt_regs *regs, long error_code)
-{
- pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
- smp_processor_id());
-}
-
-/* Call the installed machine check handler for this CPU setup. */
-void (*machine_check_vector)(struct pt_regs *, long error_code) =
- unexpected_machine_check;
-
-/*
- * Called for each booted CPU to set up machine checks.
- * Must be called with preempt off:
- */
-void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
-{
- if (mca_cfg.disabled)
- return;
-
- if (__mcheck_cpu_ancient_init(c))
- return;
-
- if (!mce_available(c))
- return;
-
- if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
- mca_cfg.disabled = true;
- return;
- }
-
- machine_check_vector = do_machine_check;
-
- __mcheck_cpu_init_generic();
- __mcheck_cpu_init_vendor(c);
- __mcheck_cpu_init_timer();
- INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
- init_irq_work(&__get_cpu_var(mce_irq_work), &mce_irq_work_cb);
-}
-
-/*
- * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
- */
-
-static DEFINE_SPINLOCK(mce_chrdev_state_lock);
-static int mce_chrdev_open_count; /* #times opened */
-static int mce_chrdev_open_exclu; /* already open exclusive? */
-
-static int mce_chrdev_open(struct inode *inode, struct file *file)
-{
- spin_lock(&mce_chrdev_state_lock);
-
- if (mce_chrdev_open_exclu ||
- (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
- spin_unlock(&mce_chrdev_state_lock);
-
- return -EBUSY;
- }
-
- if (file->f_flags & O_EXCL)
- mce_chrdev_open_exclu = 1;
- mce_chrdev_open_count++;
-
- spin_unlock(&mce_chrdev_state_lock);
-
- return nonseekable_open(inode, file);
-}
-
-static int mce_chrdev_release(struct inode *inode, struct file *file)
-{
- spin_lock(&mce_chrdev_state_lock);
-
- mce_chrdev_open_count--;
- mce_chrdev_open_exclu = 0;
-
- spin_unlock(&mce_chrdev_state_lock);
-
- return 0;
-}
-
-static void collect_tscs(void *data)
-{
- unsigned long *cpu_tsc = (unsigned long *)data;
-
- rdtscll(cpu_tsc[smp_processor_id()]);
-}
-
-static int mce_apei_read_done;
-
-/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
-static int __mce_read_apei(char __user **ubuf, size_t usize)
-{
- int rc;
- u64 record_id;
- struct mce m;
-
- if (usize < sizeof(struct mce))
- return -EINVAL;
-
- rc = apei_read_mce(&m, &record_id);
- /* Error or no more MCE record */
- if (rc <= 0) {
- mce_apei_read_done = 1;
- /*
- * When ERST is disabled, mce_chrdev_read() should return
- * "no record" instead of "no device."
- */
- if (rc == -ENODEV)
- return 0;
- return rc;
- }
- rc = -EFAULT;
- if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
- return rc;
- /*
- * In fact, we should have cleared the record after that has
- * been flushed to the disk or sent to network in
- * /sbin/mcelog, but we have no interface to support that now,
- * so just clear it to avoid duplication.
- */
- rc = apei_clear_mce(record_id);
- if (rc) {
- mce_apei_read_done = 1;
- return rc;
- }
- *ubuf += sizeof(struct mce);
-
- return 0;
-}
-
-static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
- size_t usize, loff_t *off)
-{
- char __user *buf = ubuf;
- unsigned long *cpu_tsc;
- unsigned prev, next;
- int i, err;
-
- cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
- if (!cpu_tsc)
- return -ENOMEM;
-
- mutex_lock(&mce_chrdev_read_mutex);
-
- if (!mce_apei_read_done) {
- err = __mce_read_apei(&buf, usize);
- if (err || buf != ubuf)
- goto out;
- }
-
- next = rcu_dereference_check_mce(mcelog.next);
-
- /* Only supports full reads right now */
- err = -EINVAL;
- if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
- goto out;
-
- err = 0;
- prev = 0;
- do {
- for (i = prev; i < next; i++) {
- unsigned long start = jiffies;
- struct mce *m = &mcelog.entry[i];
-
- while (!m->finished) {
- if (time_after_eq(jiffies, start + 2)) {
- memset(m, 0, sizeof(*m));
- goto timeout;
- }
- cpu_relax();
- }
- smp_rmb();
- err |= copy_to_user(buf, m, sizeof(*m));
- buf += sizeof(*m);
-timeout:
- ;
- }
-
- memset(mcelog.entry + prev, 0,
- (next - prev) * sizeof(struct mce));
- prev = next;
- next = cmpxchg(&mcelog.next, prev, 0);
- } while (next != prev);
-
- synchronize_sched();
-
- /*
- * Collect entries that were still getting written before the
- * synchronize.
- */
- on_each_cpu(collect_tscs, cpu_tsc, 1);
-
- for (i = next; i < MCE_LOG_LEN; i++) {
- struct mce *m = &mcelog.entry[i];
-
- if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
- err |= copy_to_user(buf, m, sizeof(*m));
- smp_rmb();
- buf += sizeof(*m);
- memset(m, 0, sizeof(*m));
- }
- }
-
- if (err)
- err = -EFAULT;
-
-out:
- mutex_unlock(&mce_chrdev_read_mutex);
- kfree(cpu_tsc);
-
- return err ? err : buf - ubuf;
-}
-
-static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
-{
- poll_wait(file, &mce_chrdev_wait, wait);
- if (rcu_access_index(mcelog.next))
- return POLLIN | POLLRDNORM;
- if (!mce_apei_read_done && apei_check_mce())
- return POLLIN | POLLRDNORM;
- return 0;
-}
-
-static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
- unsigned long arg)
-{
- int __user *p = (int __user *)arg;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- switch (cmd) {
- case MCE_GET_RECORD_LEN:
- return put_user(sizeof(struct mce), p);
- case MCE_GET_LOG_LEN:
- return put_user(MCE_LOG_LEN, p);
- case MCE_GETCLEAR_FLAGS: {
- unsigned flags;
-
- do {
- flags = mcelog.flags;
- } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
-
- return put_user(flags, p);
- }
- default:
- return -ENOTTY;
- }
-}
-
-static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
- size_t usize, loff_t *off);
-
-void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
- const char __user *ubuf,
- size_t usize, loff_t *off))
-{
- mce_write = fn;
-}
-EXPORT_SYMBOL_GPL(register_mce_write_callback);
-
-ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
- size_t usize, loff_t *off)
-{
- if (mce_write)
- return mce_write(filp, ubuf, usize, off);
- else
- return -EINVAL;
-}
-
-static const struct file_operations mce_chrdev_ops = {
- .open = mce_chrdev_open,
- .release = mce_chrdev_release,
- .read = mce_chrdev_read,
- .write = mce_chrdev_write,
- .poll = mce_chrdev_poll,
- .unlocked_ioctl = mce_chrdev_ioctl,
- .llseek = no_llseek,
-};
-
-static struct miscdevice mce_chrdev_device = {
- MISC_MCELOG_MINOR,
- "mcelog",
- &mce_chrdev_ops,
-};
-
-/*
- * mce=off Disables machine check
- * mce=no_cmci Disables CMCI
- * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
- * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
- * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
- * monarchtimeout is how long to wait for other CPUs on machine
- * check, or 0 to not wait
- * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
- * mce=nobootlog Don't log MCEs from before booting.
- * mce=bios_cmci_threshold Don't program the CMCI threshold
- */
-static int __init mcheck_enable(char *str)
-{
- struct mca_config *cfg = &mca_cfg;
-
- if (*str == 0) {
- enable_p5_mce();
- return 1;
- }
- if (*str == '=')
- str++;
- if (!strcmp(str, "off"))
- cfg->disabled = true;
- else if (!strcmp(str, "no_cmci"))
- cfg->cmci_disabled = true;
- else if (!strcmp(str, "dont_log_ce"))
- cfg->dont_log_ce = true;
- else if (!strcmp(str, "ignore_ce"))
- cfg->ignore_ce = true;
- else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
- cfg->bootlog = (str[0] == 'b');
- else if (!strcmp(str, "bios_cmci_threshold"))
- cfg->bios_cmci_threshold = true;
- else if (isdigit(str[0])) {
- get_option(&str, &(cfg->tolerant));
- if (*str == ',') {
- ++str;
- get_option(&str, &(cfg->monarch_timeout));
- }
- } else {
- pr_info("mce argument %s ignored. Please use /sys\n", str);
- return 0;
- }
- return 1;
-}
-__setup("mce", mcheck_enable);
-
-int __init mcheck_init(void)
-{
- mcheck_intel_therm_init();
-
- return 0;
-}
-
-/*
- * mce_syscore: PM support
- */
-
-/*
- * Disable machine checks on suspend and shutdown. We can't really handle
- * them later.
- */
-static int mce_disable_error_reporting(void)
-{
- int i;
-
- for (i = 0; i < mca_cfg.banks; i++) {
- struct mce_bank *b = &mce_banks[i];
-
- if (b->init)
- wrmsrl(MSR_IA32_MCx_CTL(i), 0);
- }
- return 0;
-}
-
-static int mce_syscore_suspend(void)
-{
- return mce_disable_error_reporting();
-}
-
-static void mce_syscore_shutdown(void)
-{
- mce_disable_error_reporting();
-}
-
-/*
- * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
- * Only one CPU is active at this time, the others get re-added later using
- * CPU hotplug:
- */
-static void mce_syscore_resume(void)
-{
- __mcheck_cpu_init_generic();
- __mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info));
-}
-
-static struct syscore_ops mce_syscore_ops = {
- .suspend = mce_syscore_suspend,
- .shutdown = mce_syscore_shutdown,
- .resume = mce_syscore_resume,
-};
-
-/*
- * mce_device: Sysfs support
- */
-
-static void mce_cpu_restart(void *data)
-{
- if (!mce_available(__this_cpu_ptr(&cpu_info)))
- return;
- __mcheck_cpu_init_generic();
- __mcheck_cpu_init_timer();
-}
-
-/* Reinit MCEs after user configuration changes */
-static void mce_restart(void)
-{
- mce_timer_delete_all();
- on_each_cpu(mce_cpu_restart, NULL, 1);
-}
-
-/* Toggle features for corrected errors */
-static void mce_disable_cmci(void *data)
-{
- if (!mce_available(__this_cpu_ptr(&cpu_info)))
- return;
- cmci_clear();
-}
-
-static void mce_enable_ce(void *all)
-{
- if (!mce_available(__this_cpu_ptr(&cpu_info)))
- return;
- cmci_reenable();
- cmci_recheck();
- if (all)
- __mcheck_cpu_init_timer();
-}
-
-static struct bus_type mce_subsys = {
- .name = "machinecheck",
- .dev_name = "machinecheck",
-};
-
-DEFINE_PER_CPU(struct device *, mce_device);
-
-__cpuinitdata
-void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
-
-static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
-{
- return container_of(attr, struct mce_bank, attr);
-}
-
-static ssize_t show_bank(struct device *s, struct device_attribute *attr,
- char *buf)
-{
- return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
-}
-
-static ssize_t set_bank(struct device *s, struct device_attribute *attr,
- const char *buf, size_t size)
-{
- u64 new;
-
- if (strict_strtoull(buf, 0, &new) < 0)
- return -EINVAL;
-
- attr_to_bank(attr)->ctl = new;
- mce_restart();
-
- return size;
-}
-
-static ssize_t
-show_trigger(struct device *s, struct device_attribute *attr, char *buf)
-{
- strcpy(buf, mce_helper);
- strcat(buf, "\n");
- return strlen(mce_helper) + 1;
-}
-
-static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
- const char *buf, size_t siz)
-{
- char *p;
-
- strncpy(mce_helper, buf, sizeof(mce_helper));
- mce_helper[sizeof(mce_helper)-1] = 0;
- p = strchr(mce_helper, '\n');
-
- if (p)
- *p = 0;
-
- return strlen(mce_helper) + !!p;
-}
-
-static ssize_t set_ignore_ce(struct device *s,
- struct device_attribute *attr,
- const char *buf, size_t size)
-{
- u64 new;
-
- if (strict_strtoull(buf, 0, &new) < 0)
- return -EINVAL;
-
- if (mca_cfg.ignore_ce ^ !!new) {
- if (new) {
- /* disable ce features */
- mce_timer_delete_all();
- on_each_cpu(mce_disable_cmci, NULL, 1);
- mca_cfg.ignore_ce = true;
- } else {
- /* enable ce features */
- mca_cfg.ignore_ce = false;
- on_each_cpu(mce_enable_ce, (void *)1, 1);
- }
- }
- return size;
-}
-
-static ssize_t set_cmci_disabled(struct device *s,
- struct device_attribute *attr,
- const char *buf, size_t size)
-{
- u64 new;
-
- if (strict_strtoull(buf, 0, &new) < 0)
- return -EINVAL;
-
- if (mca_cfg.cmci_disabled ^ !!new) {
- if (new) {
- /* disable cmci */
- on_each_cpu(mce_disable_cmci, NULL, 1);
- mca_cfg.cmci_disabled = true;
- } else {
- /* enable cmci */
- mca_cfg.cmci_disabled = false;
- on_each_cpu(mce_enable_ce, NULL, 1);
- }
- }
- return size;
-}
-
-static ssize_t store_int_with_restart(struct device *s,
- struct device_attribute *attr,
- const char *buf, size_t size)
-{
- ssize_t ret = device_store_int(s, attr, buf, size);
- mce_restart();
- return ret;
-}
-
-static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
-static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
-static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
-static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
-
-static struct dev_ext_attribute dev_attr_check_interval = {
- __ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
- &check_interval
-};
-
-static struct dev_ext_attribute dev_attr_ignore_ce = {
- __ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
- &mca_cfg.ignore_ce
-};
-
-static struct dev_ext_attribute dev_attr_cmci_disabled = {
- __ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
- &mca_cfg.cmci_disabled
-};
-
-static struct device_attribute *mce_device_attrs[] = {
- &dev_attr_tolerant.attr,
- &dev_attr_check_interval.attr,
- &dev_attr_trigger,
- &dev_attr_monarch_timeout.attr,
- &dev_attr_dont_log_ce.attr,
- &dev_attr_ignore_ce.attr,
- &dev_attr_cmci_disabled.attr,
- NULL
-};
-
-static cpumask_var_t mce_device_initialized;
-
-static void mce_device_release(struct device *dev)
-{
- kfree(dev);
-}
-
-/* Per cpu device init. All of the cpus still share the same ctrl bank: */
-static __cpuinit int mce_device_create(unsigned int cpu)
-{
- struct device *dev;
- int err;
- int i, j;
-
- if (!mce_available(&boot_cpu_data))
- return -EIO;
-
- dev = kzalloc(sizeof *dev, GFP_KERNEL);
- if (!dev)
- return -ENOMEM;
- dev->id = cpu;
- dev->bus = &mce_subsys;
- dev->release = &mce_device_release;
-
- err = device_register(dev);
- if (err)
- return err;
-
- for (i = 0; mce_device_attrs[i]; i++) {
- err = device_create_file(dev, mce_device_attrs[i]);
- if (err)
- goto error;
- }
- for (j = 0; j < mca_cfg.banks; j++) {
- err = device_create_file(dev, &mce_banks[j].attr);
- if (err)
- goto error2;
- }
- cpumask_set_cpu(cpu, mce_device_initialized);
- per_cpu(mce_device, cpu) = dev;
-
- return 0;
-error2:
- while (--j >= 0)
- device_remove_file(dev, &mce_banks[j].attr);
-error:
- while (--i >= 0)
- device_remove_file(dev, mce_device_attrs[i]);
-
- device_unregister(dev);
-
- return err;
-}
-
-static __cpuinit void mce_device_remove(unsigned int cpu)
-{
- struct device *dev = per_cpu(mce_device, cpu);
- int i;
-
- if (!cpumask_test_cpu(cpu, mce_device_initialized))
- return;
-
- for (i = 0; mce_device_attrs[i]; i++)
- device_remove_file(dev, mce_device_attrs[i]);
-
- for (i = 0; i < mca_cfg.banks; i++)
- device_remove_file(dev, &mce_banks[i].attr);
-
- device_unregister(dev);
- cpumask_clear_cpu(cpu, mce_device_initialized);
- per_cpu(mce_device, cpu) = NULL;
-}
-
-/* Make sure there are no machine checks on offlined CPUs. */
-static void __cpuinit mce_disable_cpu(void *h)
-{
- unsigned long action = *(unsigned long *)h;
- int i;
-
- if (!mce_available(__this_cpu_ptr(&cpu_info)))
- return;
-
- if (!(action & CPU_TASKS_FROZEN))
- cmci_clear();
- for (i = 0; i < mca_cfg.banks; i++) {
- struct mce_bank *b = &mce_banks[i];
-
- if (b->init)
- wrmsrl(MSR_IA32_MCx_CTL(i), 0);
- }
-}
-
-static void __cpuinit mce_reenable_cpu(void *h)
-{
- unsigned long action = *(unsigned long *)h;
- int i;
-
- if (!mce_available(__this_cpu_ptr(&cpu_info)))
- return;
-
- if (!(action & CPU_TASKS_FROZEN))
- cmci_reenable();
- for (i = 0; i < mca_cfg.banks; i++) {
- struct mce_bank *b = &mce_banks[i];
-
- if (b->init)
- wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
- }
-}
-
-/* Get notified when a cpu comes on/off. Be hotplug friendly. */
-static int __cpuinit
-mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
-{
- unsigned int cpu = (unsigned long)hcpu;
- struct timer_list *t = &per_cpu(mce_timer, cpu);
-
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_ONLINE:
- mce_device_create(cpu);
- if (threshold_cpu_callback)
- threshold_cpu_callback(action, cpu);
- break;
- case CPU_DEAD:
- if (threshold_cpu_callback)
- threshold_cpu_callback(action, cpu);
- mce_device_remove(cpu);
- mce_intel_hcpu_update(cpu);
- break;
- case CPU_DOWN_PREPARE:
- smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
- del_timer_sync(t);
- break;
- case CPU_DOWN_FAILED:
- smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
- mce_start_timer(cpu, t);
- break;
- }
-
- if (action == CPU_POST_DEAD) {
- /* intentionally ignoring frozen here */
- cmci_rediscover();
- }
-
- return NOTIFY_OK;
-}
-
-static struct notifier_block mce_cpu_notifier __cpuinitdata = {
- .notifier_call = mce_cpu_callback,
-};
-
-static __init void mce_init_banks(void)
-{
- int i;
-
- for (i = 0; i < mca_cfg.banks; i++) {
- struct mce_bank *b = &mce_banks[i];
- struct device_attribute *a = &b->attr;
-
- sysfs_attr_init(&a->attr);
- a->attr.name = b->attrname;
- snprintf(b->attrname, ATTR_LEN, "bank%d", i);
-
- a->attr.mode = 0644;
- a->show = show_bank;
- a->store = set_bank;
- }
-}
-
-static __init int mcheck_init_device(void)
-{
- int err;
- int i = 0;
-
- if (!mce_available(&boot_cpu_data))
- return -EIO;
-
- zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);
-
- mce_init_banks();
-
- err = subsys_system_register(&mce_subsys, NULL);
- if (err)
- return err;
-
- for_each_online_cpu(i) {
- err = mce_device_create(i);
- if (err)
- return err;
- }
-
- register_syscore_ops(&mce_syscore_ops);
- register_hotcpu_notifier(&mce_cpu_notifier);
-
- /* register character device /dev/mcelog */
- misc_register(&mce_chrdev_device);
-
- return err;
-}
-device_initcall_sync(mcheck_init_device);
-
-/*
- * Old style boot options parsing. Only for compatibility.
- */
-static int __init mcheck_disable(char *str)
-{
- mca_cfg.disabled = true;
- return 1;
-}
-__setup("nomce", mcheck_disable);
-
-#ifdef CONFIG_DEBUG_FS
-struct dentry *mce_get_debugfs_dir(void)
-{
- static struct dentry *dmce;
-
- if (!dmce)
- dmce = debugfs_create_dir("mce", NULL);
-
- return dmce;
-}
-
-static void mce_reset(void)
-{
- cpu_missing = 0;
- atomic_set(&mce_fake_paniced, 0);
- atomic_set(&mce_executing, 0);
- atomic_set(&mce_callin, 0);
- atomic_set(&global_nwo, 0);
-}
-
-static int fake_panic_get(void *data, u64 *val)
-{
- *val = fake_panic;
- return 0;
-}
-
-static int fake_panic_set(void *data, u64 val)
-{
- mce_reset();
- fake_panic = val;
- return 0;
-}
-
-DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
- fake_panic_set, "%llu\n");
-
-static int __init mcheck_debugfs_init(void)
-{
- struct dentry *dmce, *ffake_panic;
-
- dmce = mce_get_debugfs_dir();
- if (!dmce)
- return -ENOMEM;
- ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
- &fake_panic_fops);
- if (!ffake_panic)
- return -ENOMEM;
-
- return 0;
-}
-late_initcall(mcheck_debugfs_init);
-#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
deleted file mode 100644
index 9cb52767999a..000000000000
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ /dev/null
@@ -1,791 +0,0 @@
-/*
- * (c) 2005-2012 Advanced Micro Devices, Inc.
- * Your use of this code is subject to the terms and conditions of the
- * GNU general public license version 2. See "COPYING" or
- * http://www.gnu.org/licenses/gpl.html
- *
- * Written by Jacob Shin - AMD, Inc.
- *
- * Maintained by: Borislav Petkov <bp@alien8.de>
- *
- * April 2006
- * - added support for AMD Family 0x10 processors
- * May 2012
- * - major scrubbing
- *
- * All MC4_MISCi registers are shared between multi-cores
- */
-#include <linux/interrupt.h>
-#include <linux/notifier.h>
-#include <linux/kobject.h>
-#include <linux/percpu.h>
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/sysfs.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/cpu.h>
-#include <linux/smp.h>
-
-#include <asm/amd_nb.h>
-#include <asm/apic.h>
-#include <asm/idle.h>
-#include <asm/mce.h>
-#include <asm/msr.h>
-
-#define NR_BLOCKS 9
-#define THRESHOLD_MAX 0xFFF
-#define INT_TYPE_APIC 0x00020000
-#define MASK_VALID_HI 0x80000000
-#define MASK_CNTP_HI 0x40000000
-#define MASK_LOCKED_HI 0x20000000
-#define MASK_LVTOFF_HI 0x00F00000
-#define MASK_COUNT_EN_HI 0x00080000
-#define MASK_INT_TYPE_HI 0x00060000
-#define MASK_OVERFLOW_HI 0x00010000
-#define MASK_ERR_COUNT_HI 0x00000FFF
-#define MASK_BLKPTR_LO 0xFF000000
-#define MCG_XBLK_ADDR 0xC0000400
-
-static const char * const th_names[] = {
- "load_store",
- "insn_fetch",
- "combined_unit",
- "",
- "northbridge",
- "execution_unit",
-};
-
-static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
-static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
-
-static void amd_threshold_interrupt(void);
-
-/*
- * CPU Initialization
- */
-
-struct thresh_restart {
- struct threshold_block *b;
- int reset;
- int set_lvt_off;
- int lvt_off;
- u16 old_limit;
-};
-
-static inline bool is_shared_bank(int bank)
-{
- /* Bank 4 is for northbridge reporting and is thus shared */
- return (bank == 4);
-}
-
-static const char * const bank4_names(struct threshold_block *b)
-{
- switch (b->address) {
- /* MSR4_MISC0 */
- case 0x00000413:
- return "dram";
-
- case 0xc0000408:
- return "ht_links";
-
- case 0xc0000409:
- return "l3_cache";
-
- default:
- WARN(1, "Funny MSR: 0x%08x\n", b->address);
- return "";
- }
-};
-
-
-static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits)
-{
- /*
- * bank 4 supports APIC LVT interrupts implicitly since forever.
- */
- if (bank == 4)
- return true;
-
- /*
- * IntP: interrupt present; if this bit is set, the thresholding
- * bank can generate APIC LVT interrupts
- */
- return msr_high_bits & BIT(28);
-}
-
-static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
-{
- int msr = (hi & MASK_LVTOFF_HI) >> 20;
-
- if (apic < 0) {
- pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt "
- "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu,
- b->bank, b->block, b->address, hi, lo);
- return 0;
- }
-
- if (apic != msr) {
- pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d "
- "for bank %d, block %d (MSR%08X=0x%x%08x)\n",
- b->cpu, apic, b->bank, b->block, b->address, hi, lo);
- return 0;
- }
-
- return 1;
-};
-
-/*
- * Called via smp_call_function_single(), must be called with correct
- * cpu affinity.
- */
-static void threshold_restart_bank(void *_tr)
-{
- struct thresh_restart *tr = _tr;
- u32 hi, lo;
-
- rdmsr(tr->b->address, lo, hi);
-
- if (tr->b->threshold_limit < (hi & THRESHOLD_MAX))
- tr->reset = 1; /* limit cannot be lower than err count */
-
- if (tr->reset) { /* reset err count and overflow bit */
- hi =
- (hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
- (THRESHOLD_MAX - tr->b->threshold_limit);
- } else if (tr->old_limit) { /* change limit w/o reset */
- int new_count = (hi & THRESHOLD_MAX) +
- (tr->old_limit - tr->b->threshold_limit);
-
- hi = (hi & ~MASK_ERR_COUNT_HI) |
- (new_count & THRESHOLD_MAX);
- }
-
- /* clear IntType */
- hi &= ~MASK_INT_TYPE_HI;
-
- if (!tr->b->interrupt_capable)
- goto done;
-
- if (tr->set_lvt_off) {
- if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) {
- /* set new lvt offset */
- hi &= ~MASK_LVTOFF_HI;
- hi |= tr->lvt_off << 20;
- }
- }
-
- if (tr->b->interrupt_enable)
- hi |= INT_TYPE_APIC;
-
- done:
-
- hi |= MASK_COUNT_EN_HI;
- wrmsr(tr->b->address, lo, hi);
-}
-
-static void mce_threshold_block_init(struct threshold_block *b, int offset)
-{
- struct thresh_restart tr = {
- .b = b,
- .set_lvt_off = 1,
- .lvt_off = offset,
- };
-
- b->threshold_limit = THRESHOLD_MAX;
- threshold_restart_bank(&tr);
-};
-
-static int setup_APIC_mce(int reserved, int new)
-{
- if (reserved < 0 && !setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR,
- APIC_EILVT_MSG_FIX, 0))
- return new;
-
- return reserved;
-}
-
-/* cpu init entry point, called from mce.c with preempt off */
-void mce_amd_feature_init(struct cpuinfo_x86 *c)
-{
- struct threshold_block b;
- unsigned int cpu = smp_processor_id();
- u32 low = 0, high = 0, address = 0;
- unsigned int bank, block;
- int offset = -1;
-
- for (bank = 0; bank < mca_cfg.banks; ++bank) {
- for (block = 0; block < NR_BLOCKS; ++block) {
- if (block == 0)
- address = MSR_IA32_MC0_MISC + bank * 4;
- else if (block == 1) {
- address = (low & MASK_BLKPTR_LO) >> 21;
- if (!address)
- break;
-
- address += MCG_XBLK_ADDR;
- } else
- ++address;
-
- if (rdmsr_safe(address, &low, &high))
- break;
-
- if (!(high & MASK_VALID_HI))
- continue;
-
- if (!(high & MASK_CNTP_HI) ||
- (high & MASK_LOCKED_HI))
- continue;
-
- if (!block)
- per_cpu(bank_map, cpu) |= (1 << bank);
-
- memset(&b, 0, sizeof(b));
- b.cpu = cpu;
- b.bank = bank;
- b.block = block;
- b.address = address;
- b.interrupt_capable = lvt_interrupt_supported(bank, high);
-
- if (b.interrupt_capable) {
- int new = (high & MASK_LVTOFF_HI) >> 20;
- offset = setup_APIC_mce(offset, new);
- }
-
- mce_threshold_block_init(&b, offset);
- mce_threshold_vector = amd_threshold_interrupt;
- }
- }
-}
-
-/*
- * APIC Interrupt Handler
- */
-
-/*
- * threshold interrupt handler will service THRESHOLD_APIC_VECTOR.
- * the interrupt goes off when error_count reaches threshold_limit.
- * the handler will simply log mcelog w/ software defined bank number.
- */
-static void amd_threshold_interrupt(void)
-{
- u32 low = 0, high = 0, address = 0;
- unsigned int bank, block;
- struct mce m;
-
- mce_setup(&m);
-
- /* assume first bank caused it */
- for (bank = 0; bank < mca_cfg.banks; ++bank) {
- if (!(per_cpu(bank_map, m.cpu) & (1 << bank)))
- continue;
- for (block = 0; block < NR_BLOCKS; ++block) {
- if (block == 0) {
- address = MSR_IA32_MC0_MISC + bank * 4;
- } else if (block == 1) {
- address = (low & MASK_BLKPTR_LO) >> 21;
- if (!address)
- break;
- address += MCG_XBLK_ADDR;
- } else {
- ++address;
- }
-
- if (rdmsr_safe(address, &low, &high))
- break;
-
- if (!(high & MASK_VALID_HI)) {
- if (block)
- continue;
- else
- break;
- }
-
- if (!(high & MASK_CNTP_HI) ||
- (high & MASK_LOCKED_HI))
- continue;
-
- /*
- * Log the machine check that caused the threshold
- * event.
- */
- machine_check_poll(MCP_TIMESTAMP,
- &__get_cpu_var(mce_poll_banks));
-
- if (high & MASK_OVERFLOW_HI) {
- rdmsrl(address, m.misc);
- rdmsrl(MSR_IA32_MC0_STATUS + bank * 4,
- m.status);
- m.bank = K8_MCE_THRESHOLD_BASE
- + bank * NR_BLOCKS
- + block;
- mce_log(&m);
- return;
- }
- }
- }
-}
-
-/*
- * Sysfs Interface
- */
-
-struct threshold_attr {
- struct attribute attr;
- ssize_t (*show) (struct threshold_block *, char *);
- ssize_t (*store) (struct threshold_block *, const char *, size_t count);
-};
-
-#define SHOW_FIELDS(name) \
-static ssize_t show_ ## name(struct threshold_block *b, char *buf) \
-{ \
- return sprintf(buf, "%lu\n", (unsigned long) b->name); \
-}
-SHOW_FIELDS(interrupt_enable)
-SHOW_FIELDS(threshold_limit)
-
-static ssize_t
-store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
-{
- struct thresh_restart tr;
- unsigned long new;
-
- if (!b->interrupt_capable)
- return -EINVAL;
-
- if (strict_strtoul(buf, 0, &new) < 0)
- return -EINVAL;
-
- b->interrupt_enable = !!new;
-
- memset(&tr, 0, sizeof(tr));
- tr.b = b;
-
- smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
-
- return size;
-}
-
-static ssize_t
-store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
-{
- struct thresh_restart tr;
- unsigned long new;
-
- if (strict_strtoul(buf, 0, &new) < 0)
- return -EINVAL;
-
- if (new > THRESHOLD_MAX)
- new = THRESHOLD_MAX;
- if (new < 1)
- new = 1;
-
- memset(&tr, 0, sizeof(tr));
- tr.old_limit = b->threshold_limit;
- b->threshold_limit = new;
- tr.b = b;
-
- smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
-
- return size;
-}
-
-static ssize_t show_error_count(struct threshold_block *b, char *buf)
-{
- u32 lo, hi;
-
- rdmsr_on_cpu(b->cpu, b->address, &lo, &hi);
-
- return sprintf(buf, "%u\n", ((hi & THRESHOLD_MAX) -
- (THRESHOLD_MAX - b->threshold_limit)));
-}
-
-static struct threshold_attr error_count = {
- .attr = {.name = __stringify(error_count), .mode = 0444 },
- .show = show_error_count,
-};
-
-#define RW_ATTR(val) \
-static struct threshold_attr val = { \
- .attr = {.name = __stringify(val), .mode = 0644 }, \
- .show = show_## val, \
- .store = store_## val, \
-};
-
-RW_ATTR(interrupt_enable);
-RW_ATTR(threshold_limit);
-
-static struct attribute *default_attrs[] = {
- &threshold_limit.attr,
- &error_count.attr,
- NULL, /* possibly interrupt_enable if supported, see below */
- NULL,
-};
-
-#define to_block(k) container_of(k, struct threshold_block, kobj)
-#define to_attr(a) container_of(a, struct threshold_attr, attr)
-
-static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
-{
- struct threshold_block *b = to_block(kobj);
- struct threshold_attr *a = to_attr(attr);
- ssize_t ret;
-
- ret = a->show ? a->show(b, buf) : -EIO;
-
- return ret;
-}
-
-static ssize_t store(struct kobject *kobj, struct attribute *attr,
- const char *buf, size_t count)
-{
- struct threshold_block *b = to_block(kobj);
- struct threshold_attr *a = to_attr(attr);
- ssize_t ret;
-
- ret = a->store ? a->store(b, buf, count) : -EIO;
-
- return ret;
-}
-
-static const struct sysfs_ops threshold_ops = {
- .show = show,
- .store = store,
-};
-
-static struct kobj_type threshold_ktype = {
- .sysfs_ops = &threshold_ops,
- .default_attrs = default_attrs,
-};
-
-static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
- unsigned int bank,
- unsigned int block,
- u32 address)
-{
- struct threshold_block *b = NULL;
- u32 low, high;
- int err;
-
- if ((bank >= mca_cfg.banks) || (block >= NR_BLOCKS))
- return 0;
-
- if (rdmsr_safe_on_cpu(cpu, address, &low, &high))
- return 0;
-
- if (!(high & MASK_VALID_HI)) {
- if (block)
- goto recurse;
- else
- return 0;
- }
-
- if (!(high & MASK_CNTP_HI) ||
- (high & MASK_LOCKED_HI))
- goto recurse;
-
- b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL);
- if (!b)
- return -ENOMEM;
-
- b->block = block;
- b->bank = bank;
- b->cpu = cpu;
- b->address = address;
- b->interrupt_enable = 0;
- b->interrupt_capable = lvt_interrupt_supported(bank, high);
- b->threshold_limit = THRESHOLD_MAX;
-
- if (b->interrupt_capable)
- threshold_ktype.default_attrs[2] = &interrupt_enable.attr;
- else
- threshold_ktype.default_attrs[2] = NULL;
-
- INIT_LIST_HEAD(&b->miscj);
-
- if (per_cpu(threshold_banks, cpu)[bank]->blocks) {
- list_add(&b->miscj,
- &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj);
- } else {
- per_cpu(threshold_banks, cpu)[bank]->blocks = b;
- }
-
- err = kobject_init_and_add(&b->kobj, &threshold_ktype,
- per_cpu(threshold_banks, cpu)[bank]->kobj,
- (bank == 4 ? bank4_names(b) : th_names[bank]));
- if (err)
- goto out_free;
-recurse:
- if (!block) {
- address = (low & MASK_BLKPTR_LO) >> 21;
- if (!address)
- return 0;
- address += MCG_XBLK_ADDR;
- } else {
- ++address;
- }
-
- err = allocate_threshold_blocks(cpu, bank, ++block, address);
- if (err)
- goto out_free;
-
- if (b)
- kobject_uevent(&b->kobj, KOBJ_ADD);
-
- return err;
-
-out_free:
- if (b) {
- kobject_put(&b->kobj);
- list_del(&b->miscj);
- kfree(b);
- }
- return err;
-}
-
-static __cpuinit int __threshold_add_blocks(struct threshold_bank *b)
-{
- struct list_head *head = &b->blocks->miscj;
- struct threshold_block *pos = NULL;
- struct threshold_block *tmp = NULL;
- int err = 0;
-
- err = kobject_add(&b->blocks->kobj, b->kobj, b->blocks->kobj.name);
- if (err)
- return err;
-
- list_for_each_entry_safe(pos, tmp, head, miscj) {
-
- err = kobject_add(&pos->kobj, b->kobj, pos->kobj.name);
- if (err) {
- list_for_each_entry_safe_reverse(pos, tmp, head, miscj)
- kobject_del(&pos->kobj);
-
- return err;
- }
- }
- return err;
-}
-
-static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
-{
- struct device *dev = per_cpu(mce_device, cpu);
- struct amd_northbridge *nb = NULL;
- struct threshold_bank *b = NULL;
- const char *name = th_names[bank];
- int err = 0;
-
- if (is_shared_bank(bank)) {
- nb = node_to_amd_nb(amd_get_nb_id(cpu));
-
- /* threshold descriptor already initialized on this node? */
- if (nb && nb->bank4) {
- /* yes, use it */
- b = nb->bank4;
- err = kobject_add(b->kobj, &dev->kobj, name);
- if (err)
- goto out;
-
- per_cpu(threshold_banks, cpu)[bank] = b;
- atomic_inc(&b->cpus);
-
- err = __threshold_add_blocks(b);
-
- goto out;
- }
- }
-
- b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);
- if (!b) {
- err = -ENOMEM;
- goto out;
- }
-
- b->kobj = kobject_create_and_add(name, &dev->kobj);
- if (!b->kobj) {
- err = -EINVAL;
- goto out_free;
- }
-
- per_cpu(threshold_banks, cpu)[bank] = b;
-
- if (is_shared_bank(bank)) {
- atomic_set(&b->cpus, 1);
-
- /* nb is already initialized, see above */
- if (nb) {
- WARN_ON(nb->bank4);
- nb->bank4 = b;
- }
- }
-
- err = allocate_threshold_blocks(cpu, bank, 0,
- MSR_IA32_MC0_MISC + bank * 4);
- if (!err)
- goto out;
-
- out_free:
- kfree(b);
-
- out:
- return err;
-}
-
-/* create dir/files for all valid threshold banks */
-static __cpuinit int threshold_create_device(unsigned int cpu)
-{
- unsigned int bank;
- struct threshold_bank **bp;
- int err = 0;
-
- bp = kzalloc(sizeof(struct threshold_bank *) * mca_cfg.banks,
- GFP_KERNEL);
- if (!bp)
- return -ENOMEM;
-
- per_cpu(threshold_banks, cpu) = bp;
-
- for (bank = 0; bank < mca_cfg.banks; ++bank) {
- if (!(per_cpu(bank_map, cpu) & (1 << bank)))
- continue;
- err = threshold_create_bank(cpu, bank);
- if (err)
- return err;
- }
-
- return err;
-}
-
-static void deallocate_threshold_block(unsigned int cpu,
- unsigned int bank)
-{
- struct threshold_block *pos = NULL;
- struct threshold_block *tmp = NULL;
- struct threshold_bank *head = per_cpu(threshold_banks, cpu)[bank];
-
- if (!head)
- return;
-
- list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) {
- kobject_put(&pos->kobj);
- list_del(&pos->miscj);
- kfree(pos);
- }
-
- kfree(per_cpu(threshold_banks, cpu)[bank]->blocks);
- per_cpu(threshold_banks, cpu)[bank]->blocks = NULL;
-}
-
-static void __threshold_remove_blocks(struct threshold_bank *b)
-{
- struct threshold_block *pos = NULL;
- struct threshold_block *tmp = NULL;
-
- kobject_del(b->kobj);
-
- list_for_each_entry_safe(pos, tmp, &b->blocks->miscj, miscj)
- kobject_del(&pos->kobj);
-}
-
-static void threshold_remove_bank(unsigned int cpu, int bank)
-{
- struct amd_northbridge *nb;
- struct threshold_bank *b;
-
- b = per_cpu(threshold_banks, cpu)[bank];
- if (!b)
- return;
-
- if (!b->blocks)
- goto free_out;
-
- if (is_shared_bank(bank)) {
- if (!atomic_dec_and_test(&b->cpus)) {
- __threshold_remove_blocks(b);
- per_cpu(threshold_banks, cpu)[bank] = NULL;
- return;
- } else {
- /*
- * the last CPU on this node using the shared bank is
- * going away, remove that bank now.
- */
- nb = node_to_amd_nb(amd_get_nb_id(cpu));
- nb->bank4 = NULL;
- }
- }
-
- deallocate_threshold_block(cpu, bank);
-
-free_out:
- kobject_del(b->kobj);
- kobject_put(b->kobj);
- kfree(b);
- per_cpu(threshold_banks, cpu)[bank] = NULL;
-}
-
-static void threshold_remove_device(unsigned int cpu)
-{
- unsigned int bank;
-
- for (bank = 0; bank < mca_cfg.banks; ++bank) {
- if (!(per_cpu(bank_map, cpu) & (1 << bank)))
- continue;
- threshold_remove_bank(cpu, bank);
- }
- kfree(per_cpu(threshold_banks, cpu));
-}
-
-/* get notified when a cpu comes on/off */
-static void __cpuinit
-amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu)
-{
- switch (action) {
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- threshold_create_device(cpu);
- break;
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- threshold_remove_device(cpu);
- break;
- default:
- break;
- }
-}
-
-static __init int threshold_init_device(void)
-{
- unsigned lcpu = 0;
-
- /* to hit CPUs online before the notifier is up */
- for_each_online_cpu(lcpu) {
- int err = threshold_create_device(lcpu);
-
- if (err)
- return err;
- }
- threshold_cpu_callback = amd_64_threshold_cpu_callback;
-
- return 0;
-}
-/*
- * there are 3 funcs which need to be _initcalled in a logic sequence:
- * 1. xen_late_init_mcelog
- * 2. mcheck_init_device
- * 3. threshold_init_device
- *
- * xen_late_init_mcelog must register xen_mce_chrdev_device before
- * native mce_chrdev_device registration if running under xen platform;
- *
- * mcheck_init_device should be inited before threshold_init_device to
- * initialize mce_device, otherwise a NULL ptr dereference will cause panic.
- *
- * so we use following _initcalls
- * 1. device_initcall(xen_late_init_mcelog);
- * 2. device_initcall_sync(mcheck_init_device);
- * 3. late_initcall(threshold_init_device);
- *
- * when running under xen, the initcall order is 1,2,3;
- * on baremetal, we skip 1 and we do only 2 and 3.
- */
-late_initcall(threshold_init_device);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
deleted file mode 100644
index d56405309dc1..000000000000
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ /dev/null
@@ -1,353 +0,0 @@
-/*
- * Intel specific MCE features.
- * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
- * Copyright (C) 2008, 2009 Intel Corporation
- * Author: Andi Kleen
- */
-
-#include <linux/gfp.h>
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/percpu.h>
-#include <linux/sched.h>
-#include <asm/apic.h>
-#include <asm/processor.h>
-#include <asm/msr.h>
-#include <asm/mce.h>
-
-#include "mce-internal.h"
-
-/*
- * Support for Intel Correct Machine Check Interrupts. This allows
- * the CPU to raise an interrupt when a corrected machine check happened.
- * Normally we pick those up using a regular polling timer.
- * Also supports reliable discovery of shared banks.
- */
-
-/*
- * CMCI can be delivered to multiple cpus that share a machine check bank
- * so we need to designate a single cpu to process errors logged in each bank
- * in the interrupt handler (otherwise we would have many races and potential
- * double reporting of the same error).
- * Note that this can change when a cpu is offlined or brought online since
- * some MCA banks are shared across cpus. When a cpu is offlined, cmci_clear()
- * disables CMCI on all banks owned by the cpu and clears this bitfield. At
- * this point, cmci_rediscover() kicks in and a different cpu may end up
- * taking ownership of some of the shared MCA banks that were previously
- * owned by the offlined cpu.
- */
-static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
-
-/*
- * cmci_discover_lock protects against parallel discovery attempts
- * which could race against each other.
- */
-static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
-
-#define CMCI_THRESHOLD 1
-#define CMCI_POLL_INTERVAL (30 * HZ)
-#define CMCI_STORM_INTERVAL (1 * HZ)
-#define CMCI_STORM_THRESHOLD 15
-
-static DEFINE_PER_CPU(unsigned long, cmci_time_stamp);
-static DEFINE_PER_CPU(unsigned int, cmci_storm_cnt);
-static DEFINE_PER_CPU(unsigned int, cmci_storm_state);
-
-enum {
- CMCI_STORM_NONE,
- CMCI_STORM_ACTIVE,
- CMCI_STORM_SUBSIDED,
-};
-
-static atomic_t cmci_storm_on_cpus;
-
-static int cmci_supported(int *banks)
-{
- u64 cap;
-
- if (mca_cfg.cmci_disabled || mca_cfg.ignore_ce)
- return 0;
-
- /*
- * Vendor check is not strictly needed, but the initial
- * initialization is vendor keyed and this
- * makes sure none of the backdoors are entered otherwise.
- */
- if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
- return 0;
- if (!cpu_has_apic || lapic_get_maxlvt() < 6)
- return 0;
- rdmsrl(MSR_IA32_MCG_CAP, cap);
- *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff);
- return !!(cap & MCG_CMCI_P);
-}
-
-void mce_intel_cmci_poll(void)
-{
- if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE)
- return;
- machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
-}
-
-void mce_intel_hcpu_update(unsigned long cpu)
-{
- if (per_cpu(cmci_storm_state, cpu) == CMCI_STORM_ACTIVE)
- atomic_dec(&cmci_storm_on_cpus);
-
- per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE;
-}
-
-unsigned long mce_intel_adjust_timer(unsigned long interval)
-{
- int r;
-
- if (interval < CMCI_POLL_INTERVAL)
- return interval;
-
- switch (__this_cpu_read(cmci_storm_state)) {
- case CMCI_STORM_ACTIVE:
- /*
- * We switch back to interrupt mode once the poll timer has
- * silenced itself. That means no events recorded and the
- * timer interval is back to our poll interval.
- */
- __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED);
- r = atomic_sub_return(1, &cmci_storm_on_cpus);
- if (r == 0)
- pr_notice("CMCI storm subsided: switching to interrupt mode\n");
- /* FALLTHROUGH */
-
- case CMCI_STORM_SUBSIDED:
- /*
- * We wait for all cpus to go back to SUBSIDED
- * state. When that happens we switch back to
- * interrupt mode.
- */
- if (!atomic_read(&cmci_storm_on_cpus)) {
- __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE);
- cmci_reenable();
- cmci_recheck();
- }
- return CMCI_POLL_INTERVAL;
- default:
- /*
- * We have shiny weather. Let the poll do whatever it
- * thinks.
- */
- return interval;
- }
-}
-
-static bool cmci_storm_detect(void)
-{
- unsigned int cnt = __this_cpu_read(cmci_storm_cnt);
- unsigned long ts = __this_cpu_read(cmci_time_stamp);
- unsigned long now = jiffies;
- int r;
-
- if (__this_cpu_read(cmci_storm_state) != CMCI_STORM_NONE)
- return true;
-
- if (time_before_eq(now, ts + CMCI_STORM_INTERVAL)) {
- cnt++;
- } else {
- cnt = 1;
- __this_cpu_write(cmci_time_stamp, now);
- }
- __this_cpu_write(cmci_storm_cnt, cnt);
-
- if (cnt <= CMCI_STORM_THRESHOLD)
- return false;
-
- cmci_clear();
- __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
- r = atomic_add_return(1, &cmci_storm_on_cpus);
- mce_timer_kick(CMCI_POLL_INTERVAL);
-
- if (r == 1)
- pr_notice("CMCI storm detected: switching to poll mode\n");
- return true;
-}
-
-/*
- * The interrupt handler. This is called on every event.
- * Just call the poller directly to log any events.
- * This could in theory increase the threshold under high load,
- * but doesn't for now.
- */
-static void intel_threshold_interrupt(void)
-{
- if (cmci_storm_detect())
- return;
- machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
- mce_notify_irq();
-}
-
-/*
- * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
- * on this CPU. Use the algorithm recommended in the SDM to discover shared
- * banks.
- */
-static void cmci_discover(int banks)
-{
- unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned);
- unsigned long flags;
- int i;
- int bios_wrong_thresh = 0;
-
- raw_spin_lock_irqsave(&cmci_discover_lock, flags);
- for (i = 0; i < banks; i++) {
- u64 val;
- int bios_zero_thresh = 0;
-
- if (test_bit(i, owned))
- continue;
-
- rdmsrl(MSR_IA32_MCx_CTL2(i), val);
-
- /* Already owned by someone else? */
- if (val & MCI_CTL2_CMCI_EN) {
- clear_bit(i, owned);
- __clear_bit(i, __get_cpu_var(mce_poll_banks));
- continue;
- }
-
- if (!mca_cfg.bios_cmci_threshold) {
- val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
- val |= CMCI_THRESHOLD;
- } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) {
- /*
- * If bios_cmci_threshold boot option was specified
- * but the threshold is zero, we'll try to initialize
- * it to 1.
- */
- bios_zero_thresh = 1;
- val |= CMCI_THRESHOLD;
- }
-
- val |= MCI_CTL2_CMCI_EN;
- wrmsrl(MSR_IA32_MCx_CTL2(i), val);
- rdmsrl(MSR_IA32_MCx_CTL2(i), val);
-
- /* Did the enable bit stick? -- the bank supports CMCI */
- if (val & MCI_CTL2_CMCI_EN) {
- set_bit(i, owned);
- __clear_bit(i, __get_cpu_var(mce_poll_banks));
- /*
- * We are able to set thresholds for some banks that
- * had a threshold of 0. This means the BIOS has not
- * set the thresholds properly or does not work with
- * this boot option. Note down now and report later.
- */
- if (mca_cfg.bios_cmci_threshold && bios_zero_thresh &&
- (val & MCI_CTL2_CMCI_THRESHOLD_MASK))
- bios_wrong_thresh = 1;
- } else {
- WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
- }
- }
- raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
- if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) {
- pr_info_once(
- "bios_cmci_threshold: Some banks do not have valid thresholds set\n");
- pr_info_once(
- "bios_cmci_threshold: Make sure your BIOS supports this boot option\n");
- }
-}
-
-/*
- * Just in case we missed an event during initialization check
- * all the CMCI owned banks.
- */
-void cmci_recheck(void)
-{
- unsigned long flags;
- int banks;
-
- if (!mce_available(__this_cpu_ptr(&cpu_info)) || !cmci_supported(&banks))
- return;
- local_irq_save(flags);
- machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
- local_irq_restore(flags);
-}
-
-/*
- * Disable CMCI on this CPU for all banks it owns when it goes down.
- * This allows other CPUs to claim the banks on rediscovery.
- */
-void cmci_clear(void)
-{
- unsigned long flags;
- int i;
- int banks;
- u64 val;
-
- if (!cmci_supported(&banks))
- return;
- raw_spin_lock_irqsave(&cmci_discover_lock, flags);
- for (i = 0; i < banks; i++) {
- if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
- continue;
- /* Disable CMCI */
- rdmsrl(MSR_IA32_MCx_CTL2(i), val);
- val &= ~MCI_CTL2_CMCI_EN;
- wrmsrl(MSR_IA32_MCx_CTL2(i), val);
- __clear_bit(i, __get_cpu_var(mce_banks_owned));
- }
- raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
-}
-
-static void cmci_rediscover_work_func(void *arg)
-{
- int banks;
-
- /* Recheck banks in case CPUs don't all have the same */
- if (cmci_supported(&banks))
- cmci_discover(banks);
-}
-
-/* After a CPU went down cycle through all the others and rediscover */
-void cmci_rediscover(void)
-{
- int banks;
-
- if (!cmci_supported(&banks))
- return;
-
- on_each_cpu(cmci_rediscover_work_func, NULL, 1);
-}
-
-/*
- * Reenable CMCI on this CPU in case a CPU down failed.
- */
-void cmci_reenable(void)
-{
- int banks;
- if (cmci_supported(&banks))
- cmci_discover(banks);
-}
-
-static void intel_init_cmci(void)
-{
- int banks;
-
- if (!cmci_supported(&banks))
- return;
-
- mce_threshold_vector = intel_threshold_interrupt;
- cmci_discover(banks);
- /*
- * For CPU #0 this runs with still disabled APIC, but that's
- * ok because only the vector is set up. We still do another
- * check for the banks later for CPU #0 just to make sure
- * to not miss any events.
- */
- apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED);
- cmci_recheck();
-}
-
-void mce_intel_feature_init(struct cpuinfo_x86 *c)
-{
- intel_init_thermal(c);
- intel_init_cmci();
-}
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
deleted file mode 100644
index 41e8e00a6637..000000000000
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ /dev/null
@@ -1,584 +0,0 @@
-/*
- * Thermal throttle event support code (such as syslog messaging and rate
- * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c).
- *
- * This allows consistent reporting of CPU thermal throttle events.
- *
- * Maintains a counter in /sys that keeps track of the number of thermal
- * events, such that the user knows how bad the thermal problem might be
- * (since the logging to syslog and mcelog is rate limited).
- *
- * Author: Dmitriy Zavin (dmitriyz@google.com)
- *
- * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c.
- * Inspired by Ross Biro's and Al Borchers' counter code.
- */
-#include <linux/interrupt.h>
-#include <linux/notifier.h>
-#include <linux/jiffies.h>
-#include <linux/kernel.h>
-#include <linux/percpu.h>
-#include <linux/export.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <linux/cpu.h>
-
-#include <asm/processor.h>
-#include <asm/apic.h>
-#include <asm/idle.h>
-#include <asm/mce.h>
-#include <asm/msr.h>
-#include <asm/trace/irq_vectors.h>
-
-/* How long to wait between reporting thermal events */
-#define CHECK_INTERVAL (300 * HZ)
-
-#define THERMAL_THROTTLING_EVENT 0
-#define POWER_LIMIT_EVENT 1
-
-/*
- * Current thermal event state:
- */
-struct _thermal_state {
- bool new_event;
- int event;
- u64 next_check;
- unsigned long count;
- unsigned long last_count;
-};
-
-struct thermal_state {
- struct _thermal_state core_throttle;
- struct _thermal_state core_power_limit;
- struct _thermal_state package_throttle;
- struct _thermal_state package_power_limit;
- struct _thermal_state core_thresh0;
- struct _thermal_state core_thresh1;
- struct _thermal_state pkg_thresh0;
- struct _thermal_state pkg_thresh1;
-};
-
-/* Callback to handle core threshold interrupts */
-int (*platform_thermal_notify)(__u64 msr_val);
-EXPORT_SYMBOL(platform_thermal_notify);
-
-/* Callback to handle core package threshold_interrupts */
-int (*platform_thermal_package_notify)(__u64 msr_val);
-EXPORT_SYMBOL_GPL(platform_thermal_package_notify);
-
-/* Callback support of rate control, return true, if
- * callback has rate control */
-bool (*platform_thermal_package_rate_control)(void);
-EXPORT_SYMBOL_GPL(platform_thermal_package_rate_control);
-
-
-static DEFINE_PER_CPU(struct thermal_state, thermal_state);
-
-static atomic_t therm_throt_en = ATOMIC_INIT(0);
-
-static u32 lvtthmr_init __read_mostly;
-
-#ifdef CONFIG_SYSFS
-#define define_therm_throt_device_one_ro(_name) \
- static DEVICE_ATTR(_name, 0444, \
- therm_throt_device_show_##_name, \
- NULL) \
-
-#define define_therm_throt_device_show_func(event, name) \
- \
-static ssize_t therm_throt_device_show_##event##_##name( \
- struct device *dev, \
- struct device_attribute *attr, \
- char *buf) \
-{ \
- unsigned int cpu = dev->id; \
- ssize_t ret; \
- \
- preempt_disable(); /* CPU hotplug */ \
- if (cpu_online(cpu)) { \
- ret = sprintf(buf, "%lu\n", \
- per_cpu(thermal_state, cpu).event.name); \
- } else \
- ret = 0; \
- preempt_enable(); \
- \
- return ret; \
-}
-
-define_therm_throt_device_show_func(core_throttle, count);
-define_therm_throt_device_one_ro(core_throttle_count);
-
-define_therm_throt_device_show_func(core_power_limit, count);
-define_therm_throt_device_one_ro(core_power_limit_count);
-
-define_therm_throt_device_show_func(package_throttle, count);
-define_therm_throt_device_one_ro(package_throttle_count);
-
-define_therm_throt_device_show_func(package_power_limit, count);
-define_therm_throt_device_one_ro(package_power_limit_count);
-
-static struct attribute *thermal_throttle_attrs[] = {
- &dev_attr_core_throttle_count.attr,
- NULL
-};
-
-static struct attribute_group thermal_attr_group = {
- .attrs = thermal_throttle_attrs,
- .name = "thermal_throttle"
-};
-#endif /* CONFIG_SYSFS */
-
-#define CORE_LEVEL 0
-#define PACKAGE_LEVEL 1
-
-/***
- * therm_throt_process - Process thermal throttling event from interrupt
- * @curr: Whether the condition is current or not (boolean), since the
- * thermal interrupt normally gets called both when the thermal
- * event begins and once the event has ended.
- *
- * This function is called by the thermal interrupt after the
- * IRQ has been acknowledged.
- *
- * It will take care of rate limiting and printing messages to the syslog.
- *
- * Returns: 0 : Event should NOT be further logged, i.e. still in
- * "timeout" from previous log message.
- * 1 : Event should be logged further, and a message has been
- * printed to the syslog.
- */
-static int therm_throt_process(bool new_event, int event, int level)
-{
- struct _thermal_state *state;
- unsigned int this_cpu = smp_processor_id();
- bool old_event;
- u64 now;
- struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
-
- now = get_jiffies_64();
- if (level == CORE_LEVEL) {
- if (event == THERMAL_THROTTLING_EVENT)
- state = &pstate->core_throttle;
- else if (event == POWER_LIMIT_EVENT)
- state = &pstate->core_power_limit;
- else
- return 0;
- } else if (level == PACKAGE_LEVEL) {
- if (event == THERMAL_THROTTLING_EVENT)
- state = &pstate->package_throttle;
- else if (event == POWER_LIMIT_EVENT)
- state = &pstate->package_power_limit;
- else
- return 0;
- } else
- return 0;
-
- old_event = state->new_event;
- state->new_event = new_event;
-
- if (new_event)
- state->count++;
-
- if (time_before64(now, state->next_check) &&
- state->count != state->last_count)
- return 0;
-
- state->next_check = now + CHECK_INTERVAL;
- state->last_count = state->count;
-
- /* if we just entered the thermal event */
- if (new_event) {
- if (event == THERMAL_THROTTLING_EVENT)
- printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
- this_cpu,
- level == CORE_LEVEL ? "Core" : "Package",
- state->count);
- return 1;
- }
- if (old_event) {
- if (event == THERMAL_THROTTLING_EVENT)
- printk(KERN_INFO "CPU%d: %s temperature/speed normal\n",
- this_cpu,
- level == CORE_LEVEL ? "Core" : "Package");
- return 1;
- }
-
- return 0;
-}
-
-static int thresh_event_valid(int level, int event)
-{
- struct _thermal_state *state;
- unsigned int this_cpu = smp_processor_id();
- struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
- u64 now = get_jiffies_64();
-
- if (level == PACKAGE_LEVEL)
- state = (event == 0) ? &pstate->pkg_thresh0 :
- &pstate->pkg_thresh1;
- else
- state = (event == 0) ? &pstate->core_thresh0 :
- &pstate->core_thresh1;
-
- if (time_before64(now, state->next_check))
- return 0;
-
- state->next_check = now + CHECK_INTERVAL;
-
- return 1;
-}
-
-static bool int_pln_enable;
-static int __init int_pln_enable_setup(char *s)
-{
- int_pln_enable = true;
-
- return 1;
-}
-__setup("int_pln_enable", int_pln_enable_setup);
-
-#ifdef CONFIG_SYSFS
-/* Add/Remove thermal_throttle interface for CPU device: */
-static __cpuinit int thermal_throttle_add_dev(struct device *dev,
- unsigned int cpu)
-{
- int err;
- struct cpuinfo_x86 *c = &cpu_data(cpu);
-
- err = sysfs_create_group(&dev->kobj, &thermal_attr_group);
- if (err)
- return err;
-
- if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
- err = sysfs_add_file_to_group(&dev->kobj,
- &dev_attr_core_power_limit_count.attr,
- thermal_attr_group.name);
- if (cpu_has(c, X86_FEATURE_PTS)) {
- err = sysfs_add_file_to_group(&dev->kobj,
- &dev_attr_package_throttle_count.attr,
- thermal_attr_group.name);
- if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
- err = sysfs_add_file_to_group(&dev->kobj,
- &dev_attr_package_power_limit_count.attr,
- thermal_attr_group.name);
- }
-
- return err;
-}
-
-static __cpuinit void thermal_throttle_remove_dev(struct device *dev)
-{
- sysfs_remove_group(&dev->kobj, &thermal_attr_group);
-}
-
-/* Mutex protecting device creation against CPU hotplug: */
-static DEFINE_MUTEX(therm_cpu_lock);
-
-/* Get notified when a cpu comes on/off. Be hotplug friendly. */
-static __cpuinit int
-thermal_throttle_cpu_callback(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
-{
- unsigned int cpu = (unsigned long)hcpu;
- struct device *dev;
- int err = 0;
-
- dev = get_cpu_device(cpu);
-
- switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- mutex_lock(&therm_cpu_lock);
- err = thermal_throttle_add_dev(dev, cpu);
- mutex_unlock(&therm_cpu_lock);
- WARN_ON(err);
- break;
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- mutex_lock(&therm_cpu_lock);
- thermal_throttle_remove_dev(dev);
- mutex_unlock(&therm_cpu_lock);
- break;
- }
- return notifier_from_errno(err);
-}
-
-static struct notifier_block thermal_throttle_cpu_notifier __cpuinitdata =
-{
- .notifier_call = thermal_throttle_cpu_callback,
-};
-
-static __init int thermal_throttle_init_device(void)
-{
- unsigned int cpu = 0;
- int err;
-
- if (!atomic_read(&therm_throt_en))
- return 0;
-
- register_hotcpu_notifier(&thermal_throttle_cpu_notifier);
-
-#ifdef CONFIG_HOTPLUG_CPU
- mutex_lock(&therm_cpu_lock);
-#endif
- /* connect live CPUs to sysfs */
- for_each_online_cpu(cpu) {
- err = thermal_throttle_add_dev(get_cpu_device(cpu), cpu);
- WARN_ON(err);
- }
-#ifdef CONFIG_HOTPLUG_CPU
- mutex_unlock(&therm_cpu_lock);
-#endif
-
- return 0;
-}
-device_initcall(thermal_throttle_init_device);
-
-#endif /* CONFIG_SYSFS */
-
-static void notify_package_thresholds(__u64 msr_val)
-{
- bool notify_thres_0 = false;
- bool notify_thres_1 = false;
-
- if (!platform_thermal_package_notify)
- return;
-
- /* lower threshold check */
- if (msr_val & THERM_LOG_THRESHOLD0)
- notify_thres_0 = true;
- /* higher threshold check */
- if (msr_val & THERM_LOG_THRESHOLD1)
- notify_thres_1 = true;
-
- if (!notify_thres_0 && !notify_thres_1)
- return;
-
- if (platform_thermal_package_rate_control &&
- platform_thermal_package_rate_control()) {
- /* Rate control is implemented in callback */
- platform_thermal_package_notify(msr_val);
- return;
- }
-
- /* lower threshold reached */
- if (notify_thres_0 && thresh_event_valid(PACKAGE_LEVEL, 0))
- platform_thermal_package_notify(msr_val);
- /* higher threshold reached */
- if (notify_thres_1 && thresh_event_valid(PACKAGE_LEVEL, 1))
- platform_thermal_package_notify(msr_val);
-}
-
-static void notify_thresholds(__u64 msr_val)
-{
- /* check whether the interrupt handler is defined;
- * otherwise simply return
- */
- if (!platform_thermal_notify)
- return;
-
- /* lower threshold reached */
- if ((msr_val & THERM_LOG_THRESHOLD0) &&
- thresh_event_valid(CORE_LEVEL, 0))
- platform_thermal_notify(msr_val);
- /* higher threshold reached */
- if ((msr_val & THERM_LOG_THRESHOLD1) &&
- thresh_event_valid(CORE_LEVEL, 1))
- platform_thermal_notify(msr_val);
-}
-
-/* Thermal transition interrupt handler */
-static void intel_thermal_interrupt(void)
-{
- __u64 msr_val;
-
- rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
-
- /* Check for violation of core thermal thresholds*/
- notify_thresholds(msr_val);
-
- if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
- THERMAL_THROTTLING_EVENT,
- CORE_LEVEL) != 0)
- mce_log_therm_throt_event(msr_val);
-
- if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable)
- therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
- POWER_LIMIT_EVENT,
- CORE_LEVEL);
-
- if (this_cpu_has(X86_FEATURE_PTS)) {
- rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
- /* check violations of package thermal thresholds */
- notify_package_thresholds(msr_val);
- therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
- THERMAL_THROTTLING_EVENT,
- PACKAGE_LEVEL);
- if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable)
- therm_throt_process(msr_val &
- PACKAGE_THERM_STATUS_POWER_LIMIT,
- POWER_LIMIT_EVENT,
- PACKAGE_LEVEL);
- }
-}
-
-static void unexpected_thermal_interrupt(void)
-{
- printk(KERN_ERR "CPU%d: Unexpected LVT thermal interrupt!\n",
- smp_processor_id());
-}
-
-static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
-
-static inline void __smp_thermal_interrupt(void)
-{
- inc_irq_stat(irq_thermal_count);
- smp_thermal_vector();
-}
-
-asmlinkage void smp_thermal_interrupt(struct pt_regs *regs)
-{
- entering_irq();
- __smp_thermal_interrupt();
- exiting_ack_irq();
-}
-
-asmlinkage void smp_trace_thermal_interrupt(struct pt_regs *regs)
-{
- entering_irq();
- trace_thermal_apic_entry(THERMAL_APIC_VECTOR);
- __smp_thermal_interrupt();
- trace_thermal_apic_exit(THERMAL_APIC_VECTOR);
- exiting_ack_irq();
-}
-
-/* Thermal monitoring depends on APIC, ACPI and clock modulation */
-static int intel_thermal_supported(struct cpuinfo_x86 *c)
-{
- if (!cpu_has_apic)
- return 0;
- if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
- return 0;
- return 1;
-}
-
-void __init mcheck_intel_therm_init(void)
-{
- /*
- * This function is only called on boot CPU. Save the init thermal
- * LVT value on BSP and use that value to restore APs' thermal LVT
- * entry BIOS programmed later
- */
- if (intel_thermal_supported(&boot_cpu_data))
- lvtthmr_init = apic_read(APIC_LVTTHMR);
-}
-
-void intel_init_thermal(struct cpuinfo_x86 *c)
-{
- unsigned int cpu = smp_processor_id();
- int tm2 = 0;
- u32 l, h;
-
- if (!intel_thermal_supported(c))
- return;
-
- /*
- * First check if its enabled already, in which case there might
- * be some SMM goo which handles it, so we can't even put a handler
- * since it might be delivered via SMI already:
- */
- rdmsr(MSR_IA32_MISC_ENABLE, l, h);
-
- h = lvtthmr_init;
- /*
- * The initial value of thermal LVT entries on all APs always reads
- * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI
- * sequence to them and LVT registers are reset to 0s except for
- * the mask bits which are set to 1s when APs receive INIT IPI.
- * If BIOS takes over the thermal interrupt and sets its interrupt
- * delivery mode to SMI (not fixed), it restores the value that the
- * BIOS has programmed on AP based on BSP's info we saved since BIOS
- * is always setting the same value for all threads/cores.
- */
- if ((h & APIC_DM_FIXED_MASK) != APIC_DM_FIXED)
- apic_write(APIC_LVTTHMR, lvtthmr_init);
-
-
- if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
- printk(KERN_DEBUG
- "CPU%d: Thermal monitoring handled by SMI\n", cpu);
- return;
- }
-
- /* Check whether a vector already exists */
- if (h & APIC_VECTOR_MASK) {
- printk(KERN_DEBUG
- "CPU%d: Thermal LVT vector (%#x) already installed\n",
- cpu, (h & APIC_VECTOR_MASK));
- return;
- }
-
- /* early Pentium M models use different method for enabling TM2 */
- if (cpu_has(c, X86_FEATURE_TM2)) {
- if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) {
- rdmsr(MSR_THERM2_CTL, l, h);
- if (l & MSR_THERM2_CTL_TM_SELECT)
- tm2 = 1;
- } else if (l & MSR_IA32_MISC_ENABLE_TM2)
- tm2 = 1;
- }
-
- /* We'll mask the thermal vector in the lapic till we're ready: */
- h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
- apic_write(APIC_LVTTHMR, h);
-
- rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
- if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable)
- wrmsr(MSR_IA32_THERM_INTERRUPT,
- (l | (THERM_INT_LOW_ENABLE
- | THERM_INT_HIGH_ENABLE)) & ~THERM_INT_PLN_ENABLE, h);
- else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
- wrmsr(MSR_IA32_THERM_INTERRUPT,
- l | (THERM_INT_LOW_ENABLE
- | THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h);
- else
- wrmsr(MSR_IA32_THERM_INTERRUPT,
- l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
-
- if (cpu_has(c, X86_FEATURE_PTS)) {
- rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
- if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable)
- wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
- (l | (PACKAGE_THERM_INT_LOW_ENABLE
- | PACKAGE_THERM_INT_HIGH_ENABLE))
- & ~PACKAGE_THERM_INT_PLN_ENABLE, h);
- else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
- wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
- l | (PACKAGE_THERM_INT_LOW_ENABLE
- | PACKAGE_THERM_INT_HIGH_ENABLE
- | PACKAGE_THERM_INT_PLN_ENABLE), h);
- else
- wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
- l | (PACKAGE_THERM_INT_LOW_ENABLE
- | PACKAGE_THERM_INT_HIGH_ENABLE), h);
- }
-
- smp_thermal_vector = intel_thermal_interrupt;
-
- rdmsr(MSR_IA32_MISC_ENABLE, l, h);
- wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
-
- /* Unmask the thermal vector: */
- l = apic_read(APIC_LVTTHMR);
- apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
-
- printk_once(KERN_INFO "CPU0: Thermal monitoring enabled (%s)\n",
- tm2 ? "TM2" : "TM1");
-
- /* enable thermal throttle processing */
- atomic_set(&therm_throt_en, 1);
-}
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
deleted file mode 100644
index fe6b1c86645b..000000000000
--- a/arch/x86/kernel/cpu/mcheck/threshold.c
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Common corrected MCE threshold handler code:
- */
-#include <linux/interrupt.h>
-#include <linux/kernel.h>
-
-#include <asm/irq_vectors.h>
-#include <asm/apic.h>
-#include <asm/idle.h>
-#include <asm/mce.h>
-#include <asm/trace/irq_vectors.h>
-
-static void default_threshold_interrupt(void)
-{
- printk(KERN_ERR "Unexpected threshold interrupt at vector %x\n",
- THRESHOLD_APIC_VECTOR);
-}
-
-void (*mce_threshold_vector)(void) = default_threshold_interrupt;
-
-static inline void __smp_threshold_interrupt(void)
-{
- inc_irq_stat(irq_threshold_count);
- mce_threshold_vector();
-}
-
-asmlinkage void smp_threshold_interrupt(void)
-{
- entering_irq();
- __smp_threshold_interrupt();
- exiting_ack_irq();
-}
-
-asmlinkage void smp_trace_threshold_interrupt(void)
-{
- entering_irq();
- trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR);
- __smp_threshold_interrupt();
- trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR);
- exiting_ack_irq();
-}
diff --git a/arch/x86/kernel/cpu/microcode/Makefile b/arch/x86/kernel/cpu/microcode/Makefile
new file mode 100644
index 000000000000..193d98b33a0a
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/Makefile
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0-only
+microcode-y := core.o
+obj-$(CONFIG_MICROCODE) += microcode.o
+microcode-$(CONFIG_CPU_SUP_INTEL) += intel.o
+microcode-$(CONFIG_CPU_SUP_AMD) += amd.o
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
new file mode 100644
index 000000000000..3821a985f4ff
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -0,0 +1,1306 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AMD CPU Microcode Update Driver for Linux
+ *
+ * This driver allows to upgrade microcode on F10h AMD
+ * CPUs and later.
+ *
+ * Copyright (C) 2008-2011 Advanced Micro Devices Inc.
+ * 2013-2018 Borislav Petkov <bp@alien8.de>
+ *
+ * Author: Peter Oruba <peter.oruba@amd.com>
+ *
+ * Based on work by:
+ * Tigran Aivazian <aivazian.tigran@gmail.com>
+ *
+ * early loader:
+ * Copyright (C) 2013 Advanced Micro Devices, Inc.
+ *
+ * Author: Jacob Shin <jacob.shin@amd.com>
+ * Fixes: Borislav Petkov <bp@suse.de>
+ */
+#define pr_fmt(fmt) "microcode: " fmt
+
+#include <linux/earlycpio.h>
+#include <linux/firmware.h>
+#include <linux/bsearch.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include <linux/initrd.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+
+#include <crypto/sha2.h>
+
+#include <asm/microcode.h>
+#include <asm/processor.h>
+#include <asm/cmdline.h>
+#include <asm/setup.h>
+#include <asm/cpu.h>
+#include <asm/msr.h>
+#include <asm/tlb.h>
+
+#include "internal.h"
+
+struct ucode_patch {
+ struct list_head plist;
+ void *data;
+ unsigned int size;
+ u32 patch_id;
+ u16 equiv_cpu;
+};
+
+static LIST_HEAD(microcode_cache);
+
+#define UCODE_MAGIC 0x00414d44
+#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000
+#define UCODE_UCODE_TYPE 0x00000001
+
+#define SECTION_HDR_SIZE 8
+#define CONTAINER_HDR_SZ 12
+
+struct equiv_cpu_entry {
+ u32 installed_cpu;
+ u32 fixed_errata_mask;
+ u32 fixed_errata_compare;
+ u16 equiv_cpu;
+ u16 res;
+} __packed;
+
+struct microcode_header_amd {
+ u32 data_code;
+ u32 patch_id;
+ u16 mc_patch_data_id;
+ u8 mc_patch_data_len;
+ u8 init_flag;
+ u32 mc_patch_data_checksum;
+ u32 nb_dev_id;
+ u32 sb_dev_id;
+ u16 processor_rev_id;
+ u8 nb_rev_id;
+ u8 sb_rev_id;
+ u8 bios_api_rev;
+ u8 reserved1[3];
+ u32 match_reg[8];
+} __packed;
+
+struct microcode_amd {
+ struct microcode_header_amd hdr;
+ unsigned int mpb[];
+};
+
+static struct equiv_cpu_table {
+ unsigned int num_entries;
+ struct equiv_cpu_entry *entry;
+} equiv_table;
+
+union zen_patch_rev {
+ struct {
+ __u32 rev : 8,
+ stepping : 4,
+ model : 4,
+ __reserved : 4,
+ ext_model : 4,
+ ext_fam : 8;
+ };
+ __u32 ucode_rev;
+};
+
+union cpuid_1_eax {
+ struct {
+ __u32 stepping : 4,
+ model : 4,
+ family : 4,
+ __reserved0 : 4,
+ ext_model : 4,
+ ext_fam : 8,
+ __reserved1 : 4;
+ };
+ __u32 full;
+};
+
+/*
+ * This points to the current valid container of microcode patches which we will
+ * save from the initrd/builtin before jettisoning its contents. @mc is the
+ * microcode patch we found to match.
+ */
+struct cont_desc {
+ struct microcode_amd *mc;
+ u32 psize;
+ u8 *data;
+ size_t size;
+};
+
+/*
+ * Microcode patch container file is prepended to the initrd in cpio
+ * format. See Documentation/arch/x86/microcode.rst
+ */
+static const char
+ucode_path[] __maybe_unused = "kernel/x86/microcode/AuthenticAMD.bin";
+
+/*
+ * This is CPUID(1).EAX on the BSP. It is used in two ways:
+ *
+ * 1. To ignore the equivalence table on Zen1 and newer.
+ *
+ * 2. To match which patches to load because the patch revision ID
+ * already contains the f/m/s for which the microcode is destined
+ * for.
+ */
+static u32 bsp_cpuid_1_eax __ro_after_init;
+
+static bool sha_check = true;
+
+struct patch_digest {
+ u32 patch_id;
+ u8 sha256[SHA256_DIGEST_SIZE];
+};
+
+#include "amd_shas.c"
+
+static int cmp_id(const void *key, const void *elem)
+{
+ struct patch_digest *pd = (struct patch_digest *)elem;
+ u32 patch_id = *(u32 *)key;
+
+ if (patch_id == pd->patch_id)
+ return 0;
+ else if (patch_id < pd->patch_id)
+ return -1;
+ else
+ return 1;
+}
+
+static u32 cpuid_to_ucode_rev(unsigned int val)
+{
+ union zen_patch_rev p = {};
+ union cpuid_1_eax c;
+
+ c.full = val;
+
+ p.stepping = c.stepping;
+ p.model = c.model;
+ p.ext_model = c.ext_model;
+ p.ext_fam = c.ext_fam;
+
+ return p.ucode_rev;
+}
+
+static u32 get_cutoff_revision(u32 rev)
+{
+ switch (rev >> 8) {
+ case 0x80012: return 0x8001277; break;
+ case 0x80082: return 0x800820f; break;
+ case 0x83010: return 0x830107c; break;
+ case 0x86001: return 0x860010e; break;
+ case 0x86081: return 0x8608108; break;
+ case 0x87010: return 0x8701034; break;
+ case 0x8a000: return 0x8a0000a; break;
+ case 0xa0010: return 0xa00107a; break;
+ case 0xa0011: return 0xa0011da; break;
+ case 0xa0012: return 0xa001243; break;
+ case 0xa0082: return 0xa00820e; break;
+ case 0xa1011: return 0xa101153; break;
+ case 0xa1012: return 0xa10124e; break;
+ case 0xa1081: return 0xa108109; break;
+ case 0xa2010: return 0xa20102f; break;
+ case 0xa2012: return 0xa201212; break;
+ case 0xa4041: return 0xa404109; break;
+ case 0xa5000: return 0xa500013; break;
+ case 0xa6012: return 0xa60120a; break;
+ case 0xa7041: return 0xa704109; break;
+ case 0xa7052: return 0xa705208; break;
+ case 0xa7080: return 0xa708009; break;
+ case 0xa70c0: return 0xa70C009; break;
+ case 0xaa001: return 0xaa00116; break;
+ case 0xaa002: return 0xaa00218; break;
+ case 0xb0021: return 0xb002146; break;
+ case 0xb0081: return 0xb008111; break;
+ case 0xb1010: return 0xb101046; break;
+ case 0xb2040: return 0xb204031; break;
+ case 0xb4040: return 0xb404031; break;
+ case 0xb4041: return 0xb404101; break;
+ case 0xb6000: return 0xb600031; break;
+ case 0xb6080: return 0xb608031; break;
+ case 0xb7000: return 0xb700031; break;
+ default: break;
+
+ }
+ return 0;
+}
+
+static bool need_sha_check(u32 cur_rev)
+{
+ u32 cutoff;
+
+ if (!cur_rev) {
+ cur_rev = cpuid_to_ucode_rev(bsp_cpuid_1_eax);
+ pr_info_once("No current revision, generating the lowest one: 0x%x\n", cur_rev);
+ }
+
+ cutoff = get_cutoff_revision(cur_rev);
+ if (cutoff)
+ return cur_rev <= cutoff;
+
+ pr_info("You should not be seeing this. Please send the following couple of lines to x86-<at>-kernel.org\n");
+ pr_info("CPUID(1).EAX: 0x%x, current revision: 0x%x\n", bsp_cpuid_1_eax, cur_rev);
+ return true;
+}
+
+static bool cpu_has_entrysign(void)
+{
+ unsigned int fam = x86_family(bsp_cpuid_1_eax);
+ unsigned int model = x86_model(bsp_cpuid_1_eax);
+
+ if (fam == 0x17 || fam == 0x19)
+ return true;
+
+ if (fam == 0x1a) {
+ if (model <= 0x2f ||
+ (0x40 <= model && model <= 0x4f) ||
+ (0x60 <= model && model <= 0x6f))
+ return true;
+ }
+
+ return false;
+}
+
+static bool verify_sha256_digest(u32 patch_id, u32 cur_rev, const u8 *data, unsigned int len)
+{
+ struct patch_digest *pd = NULL;
+ u8 digest[SHA256_DIGEST_SIZE];
+ int i;
+
+ if (!cpu_has_entrysign())
+ return true;
+
+ if (!need_sha_check(cur_rev))
+ return true;
+
+ if (!sha_check)
+ return true;
+
+ pd = bsearch(&patch_id, phashes, ARRAY_SIZE(phashes), sizeof(struct patch_digest), cmp_id);
+ if (!pd) {
+ pr_err("No sha256 digest for patch ID: 0x%x found\n", patch_id);
+ return false;
+ }
+
+ sha256(data, len, digest);
+
+ if (memcmp(digest, pd->sha256, sizeof(digest))) {
+ pr_err("Patch 0x%x SHA256 digest mismatch!\n", patch_id);
+
+ for (i = 0; i < SHA256_DIGEST_SIZE; i++)
+ pr_cont("0x%x ", digest[i]);
+ pr_info("\n");
+
+ return false;
+ }
+
+ return true;
+}
+
+static union cpuid_1_eax ucode_rev_to_cpuid(unsigned int val)
+{
+ union zen_patch_rev p;
+ union cpuid_1_eax c;
+
+ p.ucode_rev = val;
+ c.full = 0;
+
+ c.stepping = p.stepping;
+ c.model = p.model;
+ c.ext_model = p.ext_model;
+ c.family = 0xf;
+ c.ext_fam = p.ext_fam;
+
+ return c;
+}
+
+static u32 get_patch_level(void)
+{
+ u32 rev, dummy __always_unused;
+
+ if (IS_ENABLED(CONFIG_MICROCODE_DBG)) {
+ int cpu = smp_processor_id();
+
+ if (!microcode_rev[cpu]) {
+ if (!base_rev)
+ base_rev = cpuid_to_ucode_rev(bsp_cpuid_1_eax);
+
+ microcode_rev[cpu] = base_rev;
+
+ ucode_dbg("CPU%d, base_rev: 0x%x\n", cpu, base_rev);
+ }
+
+ return microcode_rev[cpu];
+ }
+
+ native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
+
+ return rev;
+}
+
+static u16 find_equiv_id(struct equiv_cpu_table *et, u32 sig)
+{
+ unsigned int i;
+
+ /* Zen and newer do not need an equivalence table. */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17)
+ return 0;
+
+ if (!et || !et->num_entries)
+ return 0;
+
+ for (i = 0; i < et->num_entries; i++) {
+ struct equiv_cpu_entry *e = &et->entry[i];
+
+ if (sig == e->installed_cpu)
+ return e->equiv_cpu;
+ }
+ return 0;
+}
+
+/*
+ * Check whether there is a valid microcode container file at the beginning
+ * of @buf of size @buf_size.
+ */
+static bool verify_container(const u8 *buf, size_t buf_size)
+{
+ u32 cont_magic;
+
+ if (buf_size <= CONTAINER_HDR_SZ) {
+ ucode_dbg("Truncated microcode container header.\n");
+ return false;
+ }
+
+ cont_magic = *(const u32 *)buf;
+ if (cont_magic != UCODE_MAGIC) {
+ ucode_dbg("Invalid magic value (0x%08x).\n", cont_magic);
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Check whether there is a valid, non-truncated CPU equivalence table at the
+ * beginning of @buf of size @buf_size.
+ */
+static bool verify_equivalence_table(const u8 *buf, size_t buf_size)
+{
+ const u32 *hdr = (const u32 *)buf;
+ u32 cont_type, equiv_tbl_len;
+
+ if (!verify_container(buf, buf_size))
+ return false;
+
+ /* Zen and newer do not need an equivalence table. */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17)
+ return true;
+
+ cont_type = hdr[1];
+ if (cont_type != UCODE_EQUIV_CPU_TABLE_TYPE) {
+ ucode_dbg("Wrong microcode container equivalence table type: %u.\n",
+ cont_type);
+ return false;
+ }
+
+ buf_size -= CONTAINER_HDR_SZ;
+
+ equiv_tbl_len = hdr[2];
+ if (equiv_tbl_len < sizeof(struct equiv_cpu_entry) ||
+ buf_size < equiv_tbl_len) {
+ ucode_dbg("Truncated equivalence table.\n");
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Check whether there is a valid, non-truncated microcode patch section at the
+ * beginning of @buf of size @buf_size.
+ *
+ * On success, @sh_psize returns the patch size according to the section header,
+ * to the caller.
+ */
+static bool __verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize)
+{
+ u32 p_type, p_size;
+ const u32 *hdr;
+
+ if (buf_size < SECTION_HDR_SIZE) {
+ ucode_dbg("Truncated patch section.\n");
+ return false;
+ }
+
+ hdr = (const u32 *)buf;
+ p_type = hdr[0];
+ p_size = hdr[1];
+
+ if (p_type != UCODE_UCODE_TYPE) {
+ ucode_dbg("Invalid type field (0x%x) in container file section header.\n",
+ p_type);
+ return false;
+ }
+
+ if (p_size < sizeof(struct microcode_header_amd)) {
+ ucode_dbg("Patch of size %u too short.\n", p_size);
+ return false;
+ }
+
+ *sh_psize = p_size;
+
+ return true;
+}
+
+/*
+ * Check whether the passed remaining file @buf_size is large enough to contain
+ * a patch of the indicated @sh_psize (and also whether this size does not
+ * exceed the per-family maximum). @sh_psize is the size read from the section
+ * header.
+ */
+static bool __verify_patch_size(u32 sh_psize, size_t buf_size)
+{
+ u8 family = x86_family(bsp_cpuid_1_eax);
+ u32 max_size;
+
+ if (family >= 0x15)
+ goto ret;
+
+#define F1XH_MPB_MAX_SIZE 2048
+#define F14H_MPB_MAX_SIZE 1824
+
+ switch (family) {
+ case 0x10 ... 0x12:
+ max_size = F1XH_MPB_MAX_SIZE;
+ break;
+ case 0x14:
+ max_size = F14H_MPB_MAX_SIZE;
+ break;
+ default:
+ WARN(1, "%s: WTF family: 0x%x\n", __func__, family);
+ return false;
+ }
+
+ if (sh_psize > max_size)
+ return false;
+
+ret:
+ /* Working with the whole buffer so < is ok. */
+ return sh_psize <= buf_size;
+}
+
+/*
+ * Verify the patch in @buf.
+ *
+ * Returns:
+ * negative: on error
+ * positive: patch is not for this family, skip it
+ * 0: success
+ */
+static int verify_patch(const u8 *buf, size_t buf_size, u32 *patch_size)
+{
+ u8 family = x86_family(bsp_cpuid_1_eax);
+ struct microcode_header_amd *mc_hdr;
+ u32 cur_rev, cutoff, patch_rev;
+ u32 sh_psize;
+ u16 proc_id;
+ u8 patch_fam;
+
+ if (!__verify_patch_section(buf, buf_size, &sh_psize))
+ return -1;
+
+ /*
+ * The section header length is not included in this indicated size
+ * but is present in the leftover file length so we need to subtract
+ * it before passing this value to the function below.
+ */
+ buf_size -= SECTION_HDR_SIZE;
+
+ /*
+ * Check if the remaining buffer is big enough to contain a patch of
+ * size sh_psize, as the section claims.
+ */
+ if (buf_size < sh_psize) {
+ ucode_dbg("Patch of size %u truncated.\n", sh_psize);
+ return -1;
+ }
+
+ if (!__verify_patch_size(sh_psize, buf_size)) {
+ ucode_dbg("Per-family patch size mismatch.\n");
+ return -1;
+ }
+
+ *patch_size = sh_psize;
+
+ mc_hdr = (struct microcode_header_amd *)(buf + SECTION_HDR_SIZE);
+ if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) {
+ pr_err("Patch-ID 0x%08x: chipset-specific code unsupported.\n", mc_hdr->patch_id);
+ return -1;
+ }
+
+ proc_id = mc_hdr->processor_rev_id;
+ patch_fam = 0xf + (proc_id >> 12);
+
+ if (patch_fam != family)
+ return 1;
+
+ cur_rev = get_patch_level();
+
+ /* No cutoff revision means old/unaffected by signing algorithm weakness => matches */
+ cutoff = get_cutoff_revision(cur_rev);
+ if (!cutoff)
+ goto ok;
+
+ patch_rev = mc_hdr->patch_id;
+
+ ucode_dbg("cur_rev: 0x%x, cutoff: 0x%x, patch_rev: 0x%x\n",
+ cur_rev, cutoff, patch_rev);
+
+ if (cur_rev <= cutoff && patch_rev <= cutoff)
+ goto ok;
+
+ if (cur_rev > cutoff && patch_rev > cutoff)
+ goto ok;
+
+ return 1;
+
+ok:
+ ucode_dbg("Patch-ID 0x%08x: family: 0x%x\n", mc_hdr->patch_id, patch_fam);
+
+ return 0;
+}
+
+static bool mc_patch_matches(struct microcode_amd *mc, u16 eq_id)
+{
+ /* Zen and newer do not need an equivalence table. */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17)
+ return ucode_rev_to_cpuid(mc->hdr.patch_id).full == bsp_cpuid_1_eax;
+ else
+ return eq_id == mc->hdr.processor_rev_id;
+}
+
+/*
+ * This scans the ucode blob for the proper container as we can have multiple
+ * containers glued together.
+ *
+ * Returns the amount of bytes consumed while scanning. @desc contains all the
+ * data we're going to use in later stages of the application.
+ */
+static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc)
+{
+ struct equiv_cpu_table table;
+ size_t orig_size = size;
+ u32 *hdr = (u32 *)ucode;
+ u16 eq_id;
+ u8 *buf;
+
+ if (!verify_equivalence_table(ucode, size))
+ return 0;
+
+ buf = ucode;
+
+ table.entry = (struct equiv_cpu_entry *)(buf + CONTAINER_HDR_SZ);
+ table.num_entries = hdr[2] / sizeof(struct equiv_cpu_entry);
+
+ /*
+ * Find the equivalence ID of our CPU in this table. Even if this table
+ * doesn't contain a patch for the CPU, scan through the whole container
+ * so that it can be skipped in case there are other containers appended.
+ */
+ eq_id = find_equiv_id(&table, bsp_cpuid_1_eax);
+
+ buf += hdr[2] + CONTAINER_HDR_SZ;
+ size -= hdr[2] + CONTAINER_HDR_SZ;
+
+ /*
+ * Scan through the rest of the container to find where it ends. We do
+ * some basic sanity-checking too.
+ */
+ while (size > 0) {
+ struct microcode_amd *mc;
+ u32 patch_size;
+ int ret;
+
+ ret = verify_patch(buf, size, &patch_size);
+ if (ret < 0) {
+ /*
+ * Patch verification failed, skip to the next container, if
+ * there is one. Before exit, check whether that container has
+ * found a patch already. If so, use it.
+ */
+ goto out;
+ } else if (ret > 0) {
+ goto skip;
+ }
+
+ mc = (struct microcode_amd *)(buf + SECTION_HDR_SIZE);
+
+ if (mc_patch_matches(mc, eq_id)) {
+ desc->psize = patch_size;
+ desc->mc = mc;
+
+ ucode_dbg(" match: size: %d\n", patch_size);
+ }
+
+skip:
+ /* Skip patch section header too: */
+ buf += patch_size + SECTION_HDR_SIZE;
+ size -= patch_size + SECTION_HDR_SIZE;
+ }
+
+out:
+ /*
+ * If we have found a patch (desc->mc), it means we're looking at the
+ * container which has a patch for this CPU so return 0 to mean, @ucode
+ * already points to the proper container. Otherwise, we return the size
+ * we scanned so that we can advance to the next container in the
+ * buffer.
+ */
+ if (desc->mc) {
+ desc->data = ucode;
+ desc->size = orig_size - size;
+
+ return 0;
+ }
+
+ return orig_size - size;
+}
+
+/*
+ * Scan the ucode blob for the proper container as we can have multiple
+ * containers glued together.
+ */
+static void scan_containers(u8 *ucode, size_t size, struct cont_desc *desc)
+{
+ while (size) {
+ size_t s = parse_container(ucode, size, desc);
+ if (!s)
+ return;
+
+ /* catch wraparound */
+ if (size >= s) {
+ ucode += s;
+ size -= s;
+ } else {
+ return;
+ }
+ }
+}
+
+static bool __apply_microcode_amd(struct microcode_amd *mc, u32 *cur_rev,
+ unsigned int psize)
+{
+ unsigned long p_addr = (unsigned long)&mc->hdr.data_code;
+
+ if (!verify_sha256_digest(mc->hdr.patch_id, *cur_rev, (const u8 *)p_addr, psize))
+ return false;
+
+ native_wrmsrq(MSR_AMD64_PATCH_LOADER, p_addr);
+
+ if (x86_family(bsp_cpuid_1_eax) == 0x17) {
+ unsigned long p_addr_end = p_addr + psize - 1;
+
+ invlpg(p_addr);
+
+ /*
+ * Flush next page too if patch image is crossing a page
+ * boundary.
+ */
+ if (p_addr >> PAGE_SHIFT != p_addr_end >> PAGE_SHIFT)
+ invlpg(p_addr_end);
+ }
+
+ if (IS_ENABLED(CONFIG_MICROCODE_DBG))
+ microcode_rev[smp_processor_id()] = mc->hdr.patch_id;
+
+ /* verify patch application was successful */
+ *cur_rev = get_patch_level();
+
+ ucode_dbg("updated rev: 0x%x\n", *cur_rev);
+
+ if (*cur_rev != mc->hdr.patch_id)
+ return false;
+
+ return true;
+}
+
+static bool get_builtin_microcode(struct cpio_data *cp)
+{
+ char fw_name[36] = "amd-ucode/microcode_amd.bin";
+ u8 family = x86_family(bsp_cpuid_1_eax);
+ struct firmware fw;
+
+ if (IS_ENABLED(CONFIG_X86_32))
+ return false;
+
+ if (family >= 0x15)
+ snprintf(fw_name, sizeof(fw_name),
+ "amd-ucode/microcode_amd_fam%02hhxh.bin", family);
+
+ if (firmware_request_builtin(&fw, fw_name)) {
+ cp->size = fw.size;
+ cp->data = (void *)fw.data;
+ return true;
+ }
+
+ return false;
+}
+
+static bool __init find_blobs_in_containers(struct cpio_data *ret)
+{
+ struct cpio_data cp;
+ bool found;
+
+ if (!get_builtin_microcode(&cp))
+ cp = find_microcode_in_initrd(ucode_path);
+
+ found = cp.data && cp.size;
+ if (found)
+ *ret = cp;
+
+ return found;
+}
+
+/*
+ * Early load occurs before we can vmalloc(). So we look for the microcode
+ * patch container file in initrd, traverse equivalent cpu table, look for a
+ * matching microcode patch, and update, all in initrd memory in place.
+ * When vmalloc() is available for use later -- on 64-bit during first AP load,
+ * and on 32-bit during save_microcode_in_initrd() -- we can call
+ * load_microcode_amd() to save equivalent cpu table and microcode patches in
+ * kernel heap memory.
+ */
+void __init load_ucode_amd_bsp(struct early_load_data *ed, unsigned int cpuid_1_eax)
+{
+ struct cont_desc desc = { };
+ struct microcode_amd *mc;
+ struct cpio_data cp = { };
+ char buf[4];
+ u32 rev;
+
+ if (cmdline_find_option(boot_command_line, "microcode.amd_sha_check", buf, 4)) {
+ if (!strncmp(buf, "off", 3)) {
+ sha_check = false;
+ pr_warn_once("It is a very very bad idea to disable the blobs SHA check!\n");
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
+ }
+ }
+
+ bsp_cpuid_1_eax = cpuid_1_eax;
+
+ rev = get_patch_level();
+ ed->old_rev = rev;
+
+ /* Needed in load_microcode_amd() */
+ ucode_cpu_info[0].cpu_sig.sig = cpuid_1_eax;
+
+ if (!find_blobs_in_containers(&cp))
+ return;
+
+ scan_containers(cp.data, cp.size, &desc);
+
+ mc = desc.mc;
+ if (!mc)
+ return;
+
+ /*
+ * Allow application of the same revision to pick up SMT-specific
+ * changes even if the revision of the other SMT thread is already
+ * up-to-date.
+ */
+ if (ed->old_rev > mc->hdr.patch_id)
+ return;
+
+ if (__apply_microcode_amd(mc, &rev, desc.psize))
+ ed->new_rev = rev;
+}
+
+static inline bool patch_cpus_equivalent(struct ucode_patch *p,
+ struct ucode_patch *n,
+ bool ignore_stepping)
+{
+ /* Zen and newer hardcode the f/m/s in the patch ID */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17) {
+ union cpuid_1_eax p_cid = ucode_rev_to_cpuid(p->patch_id);
+ union cpuid_1_eax n_cid = ucode_rev_to_cpuid(n->patch_id);
+
+ if (ignore_stepping) {
+ p_cid.stepping = 0;
+ n_cid.stepping = 0;
+ }
+
+ return p_cid.full == n_cid.full;
+ } else {
+ return p->equiv_cpu == n->equiv_cpu;
+ }
+}
+
+/*
+ * a small, trivial cache of per-family ucode patches
+ */
+static struct ucode_patch *cache_find_patch(struct ucode_cpu_info *uci, u16 equiv_cpu)
+{
+ struct ucode_patch *p;
+ struct ucode_patch n;
+
+ n.equiv_cpu = equiv_cpu;
+ n.patch_id = uci->cpu_sig.rev;
+
+ list_for_each_entry(p, &microcode_cache, plist)
+ if (patch_cpus_equivalent(p, &n, false))
+ return p;
+
+ return NULL;
+}
+
+static inline int patch_newer(struct ucode_patch *p, struct ucode_patch *n)
+{
+ /* Zen and newer hardcode the f/m/s in the patch ID */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17) {
+ union zen_patch_rev zp, zn;
+
+ zp.ucode_rev = p->patch_id;
+ zn.ucode_rev = n->patch_id;
+
+ if (zn.stepping != zp.stepping)
+ return -1;
+
+ return zn.rev > zp.rev;
+ } else {
+ return n->patch_id > p->patch_id;
+ }
+}
+
+static void update_cache(struct ucode_patch *new_patch)
+{
+ struct ucode_patch *p;
+ int ret;
+
+ list_for_each_entry(p, &microcode_cache, plist) {
+ if (patch_cpus_equivalent(p, new_patch, true)) {
+ ret = patch_newer(p, new_patch);
+ if (ret < 0)
+ continue;
+ else if (!ret) {
+ /* we already have the latest patch */
+ kfree(new_patch->data);
+ kfree(new_patch);
+ return;
+ }
+
+ list_replace(&p->plist, &new_patch->plist);
+ kfree(p->data);
+ kfree(p);
+ return;
+ }
+ }
+ /* no patch found, add it */
+ list_add_tail(&new_patch->plist, &microcode_cache);
+}
+
+static void free_cache(void)
+{
+ struct ucode_patch *p, *tmp;
+
+ list_for_each_entry_safe(p, tmp, &microcode_cache, plist) {
+ __list_del(p->plist.prev, p->plist.next);
+ kfree(p->data);
+ kfree(p);
+ }
+}
+
+static struct ucode_patch *find_patch(unsigned int cpu)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ u16 equiv_id = 0;
+
+ uci->cpu_sig.rev = get_patch_level();
+
+ if (x86_family(bsp_cpuid_1_eax) < 0x17) {
+ equiv_id = find_equiv_id(&equiv_table, uci->cpu_sig.sig);
+ if (!equiv_id)
+ return NULL;
+ }
+
+ return cache_find_patch(uci, equiv_id);
+}
+
+void reload_ucode_amd(unsigned int cpu)
+{
+ u32 rev, dummy __always_unused;
+ struct microcode_amd *mc;
+ struct ucode_patch *p;
+
+ p = find_patch(cpu);
+ if (!p)
+ return;
+
+ mc = p->data;
+
+ rev = get_patch_level();
+ if (rev < mc->hdr.patch_id) {
+ if (__apply_microcode_amd(mc, &rev, p->size))
+ pr_info_once("reload revision: 0x%08x\n", rev);
+ }
+}
+
+static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ struct ucode_patch *p;
+
+ csig->sig = cpuid_eax(0x00000001);
+ csig->rev = get_patch_level();
+
+ /*
+ * a patch could have been loaded early, set uci->mc so that
+ * mc_bp_resume() can call apply_microcode()
+ */
+ p = find_patch(cpu);
+ if (p && (p->patch_id == csig->rev))
+ uci->mc = p->data;
+
+ return 0;
+}
+
+static enum ucode_state apply_microcode_amd(int cpu)
+{
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+ struct microcode_amd *mc_amd;
+ struct ucode_cpu_info *uci;
+ struct ucode_patch *p;
+ enum ucode_state ret;
+ u32 rev;
+
+ BUG_ON(raw_smp_processor_id() != cpu);
+
+ uci = ucode_cpu_info + cpu;
+
+ p = find_patch(cpu);
+ if (!p)
+ return UCODE_NFOUND;
+
+ rev = uci->cpu_sig.rev;
+
+ mc_amd = p->data;
+ uci->mc = p->data;
+
+ /* need to apply patch? */
+ if (rev > mc_amd->hdr.patch_id) {
+ ret = UCODE_OK;
+ goto out;
+ }
+
+ if (!__apply_microcode_amd(mc_amd, &rev, p->size)) {
+ pr_err("CPU%d: update failed for patch_level=0x%08x\n",
+ cpu, mc_amd->hdr.patch_id);
+ return UCODE_ERROR;
+ }
+
+ rev = mc_amd->hdr.patch_id;
+ ret = UCODE_UPDATED;
+
+out:
+ uci->cpu_sig.rev = rev;
+ c->microcode = rev;
+
+ /* Update boot_cpu_data's revision too, if we're on the BSP: */
+ if (c->cpu_index == boot_cpu_data.cpu_index)
+ boot_cpu_data.microcode = rev;
+
+ return ret;
+}
+
+void load_ucode_amd_ap(unsigned int cpuid_1_eax)
+{
+ unsigned int cpu = smp_processor_id();
+
+ ucode_cpu_info[cpu].cpu_sig.sig = cpuid_1_eax;
+ apply_microcode_amd(cpu);
+}
+
+static size_t install_equiv_cpu_table(const u8 *buf, size_t buf_size)
+{
+ u32 equiv_tbl_len;
+ const u32 *hdr;
+
+ if (!verify_equivalence_table(buf, buf_size))
+ return 0;
+
+ hdr = (const u32 *)buf;
+ equiv_tbl_len = hdr[2];
+
+ /* Zen and newer do not need an equivalence table. */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17)
+ goto out;
+
+ equiv_table.entry = vmalloc(equiv_tbl_len);
+ if (!equiv_table.entry) {
+ pr_err("failed to allocate equivalent CPU table\n");
+ return 0;
+ }
+
+ memcpy(equiv_table.entry, buf + CONTAINER_HDR_SZ, equiv_tbl_len);
+ equiv_table.num_entries = equiv_tbl_len / sizeof(struct equiv_cpu_entry);
+
+out:
+ /* add header length */
+ return equiv_tbl_len + CONTAINER_HDR_SZ;
+}
+
+static void free_equiv_cpu_table(void)
+{
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17)
+ return;
+
+ vfree(equiv_table.entry);
+ memset(&equiv_table, 0, sizeof(equiv_table));
+}
+
+static void cleanup(void)
+{
+ free_equiv_cpu_table();
+ free_cache();
+}
+
+/*
+ * Return a non-negative value even if some of the checks failed so that
+ * we can skip over the next patch. If we return a negative value, we
+ * signal a grave error like a memory allocation has failed and the
+ * driver cannot continue functioning normally. In such cases, we tear
+ * down everything we've used up so far and exit.
+ */
+static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover,
+ unsigned int *patch_size)
+{
+ struct microcode_header_amd *mc_hdr;
+ struct ucode_patch *patch;
+ u16 proc_id;
+ int ret;
+
+ ret = verify_patch(fw, leftover, patch_size);
+ if (ret)
+ return ret;
+
+ patch = kzalloc(sizeof(*patch), GFP_KERNEL);
+ if (!patch) {
+ pr_err("Patch allocation failure.\n");
+ return -EINVAL;
+ }
+
+ patch->data = kmemdup(fw + SECTION_HDR_SIZE, *patch_size, GFP_KERNEL);
+ if (!patch->data) {
+ pr_err("Patch data allocation failure.\n");
+ kfree(patch);
+ return -EINVAL;
+ }
+ patch->size = *patch_size;
+
+ mc_hdr = (struct microcode_header_amd *)(fw + SECTION_HDR_SIZE);
+ proc_id = mc_hdr->processor_rev_id;
+
+ INIT_LIST_HEAD(&patch->plist);
+ patch->patch_id = mc_hdr->patch_id;
+ patch->equiv_cpu = proc_id;
+
+ ucode_dbg("%s: Adding patch_id: 0x%08x, proc_id: 0x%04x\n",
+ __func__, patch->patch_id, proc_id);
+
+ /* ... and add to cache. */
+ update_cache(patch);
+
+ return 0;
+}
+
+/* Scan the blob in @data and add microcode patches to the cache. */
+static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, size_t size)
+{
+ u8 *fw = (u8 *)data;
+ size_t offset;
+
+ offset = install_equiv_cpu_table(data, size);
+ if (!offset)
+ return UCODE_ERROR;
+
+ fw += offset;
+ size -= offset;
+
+ if (*(u32 *)fw != UCODE_UCODE_TYPE) {
+ pr_err("invalid type field in container file section header\n");
+ free_equiv_cpu_table();
+ return UCODE_ERROR;
+ }
+
+ while (size > 0) {
+ unsigned int crnt_size = 0;
+ int ret;
+
+ ret = verify_and_add_patch(family, fw, size, &crnt_size);
+ if (ret < 0)
+ return UCODE_ERROR;
+
+ fw += crnt_size + SECTION_HDR_SIZE;
+ size -= (crnt_size + SECTION_HDR_SIZE);
+ }
+
+ return UCODE_OK;
+}
+
+static enum ucode_state _load_microcode_amd(u8 family, const u8 *data, size_t size)
+{
+ enum ucode_state ret;
+
+ /* free old equiv table */
+ free_equiv_cpu_table();
+
+ ret = __load_microcode_amd(family, data, size);
+ if (ret != UCODE_OK)
+ cleanup();
+
+ return ret;
+}
+
+static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size)
+{
+ struct cpuinfo_x86 *c;
+ unsigned int nid, cpu;
+ struct ucode_patch *p;
+ enum ucode_state ret;
+
+ ret = _load_microcode_amd(family, data, size);
+ if (ret != UCODE_OK)
+ return ret;
+
+ for_each_node_with_cpus(nid) {
+ cpu = cpumask_first(cpumask_of_node(nid));
+ c = &cpu_data(cpu);
+
+ p = find_patch(cpu);
+ if (!p)
+ continue;
+
+ if (c->microcode >= p->patch_id)
+ continue;
+
+ ret = UCODE_NEW;
+ }
+
+ return ret;
+}
+
+static int __init save_microcode_in_initrd(void)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+ struct cont_desc desc = { 0 };
+ unsigned int cpuid_1_eax;
+ enum ucode_state ret;
+ struct cpio_data cp;
+
+ if (microcode_loader_disabled() || c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10)
+ return 0;
+
+ cpuid_1_eax = native_cpuid_eax(1);
+
+ if (!find_blobs_in_containers(&cp))
+ return -EINVAL;
+
+ scan_containers(cp.data, cp.size, &desc);
+ if (!desc.mc)
+ return -EINVAL;
+
+ ret = _load_microcode_amd(x86_family(cpuid_1_eax), desc.data, desc.size);
+ if (ret > UCODE_UPDATED)
+ return -EINVAL;
+
+ return 0;
+}
+early_initcall(save_microcode_in_initrd);
+
+/*
+ * AMD microcode firmware naming convention, up to family 15h they are in
+ * the legacy file:
+ *
+ * amd-ucode/microcode_amd.bin
+ *
+ * This legacy file is always smaller than 2K in size.
+ *
+ * Beginning with family 15h, they are in family-specific firmware files:
+ *
+ * amd-ucode/microcode_amd_fam15h.bin
+ * amd-ucode/microcode_amd_fam16h.bin
+ * ...
+ *
+ * These might be larger than 2K.
+ */
+static enum ucode_state request_microcode_amd(int cpu, struct device *device)
+{
+ char fw_name[36] = "amd-ucode/microcode_amd.bin";
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+ enum ucode_state ret = UCODE_NFOUND;
+ const struct firmware *fw;
+
+ if (force_minrev)
+ return UCODE_NFOUND;
+
+ if (c->x86 >= 0x15)
+ snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86);
+
+ if (request_firmware_direct(&fw, (const char *)fw_name, device)) {
+ ucode_dbg("failed to load file %s\n", fw_name);
+ goto out;
+ }
+
+ ret = UCODE_ERROR;
+ if (!verify_container(fw->data, fw->size))
+ goto fw_release;
+
+ ret = load_microcode_amd(c->x86, fw->data, fw->size);
+
+ fw_release:
+ release_firmware(fw);
+
+ out:
+ return ret;
+}
+
+static void microcode_fini_cpu_amd(int cpu)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+ uci->mc = NULL;
+}
+
+static void finalize_late_load_amd(int result)
+{
+ if (result)
+ cleanup();
+}
+
+static struct microcode_ops microcode_amd_ops = {
+ .request_microcode_fw = request_microcode_amd,
+ .collect_cpu_info = collect_cpu_info_amd,
+ .apply_microcode = apply_microcode_amd,
+ .microcode_fini_cpu = microcode_fini_cpu_amd,
+ .finalize_late_load = finalize_late_load_amd,
+ .nmi_safe = true,
+};
+
+struct microcode_ops * __init init_amd_microcode(void)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
+ pr_warn("AMD CPU family 0x%x not supported\n", c->x86);
+ return NULL;
+ }
+ return &microcode_amd_ops;
+}
+
+void __exit exit_amd_microcode(void)
+{
+ cleanup();
+}
diff --git a/arch/x86/kernel/cpu/microcode/amd_shas.c b/arch/x86/kernel/cpu/microcode/amd_shas.c
new file mode 100644
index 000000000000..1fd349cfc802
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/amd_shas.c
@@ -0,0 +1,556 @@
+/* Keep 'em sorted. */
+static const struct patch_digest phashes[] = {
+ { 0x8001227, {
+ 0x99,0xc0,0x9b,0x2b,0xcc,0x9f,0x52,0x1b,
+ 0x1a,0x5f,0x1d,0x83,0xa1,0x6c,0xc4,0x46,
+ 0xe2,0x6c,0xda,0x73,0xfb,0x2d,0x23,0xa8,
+ 0x77,0xdc,0x15,0x31,0x33,0x4a,0x46,0x18,
+ }
+ },
+ { 0x8001250, {
+ 0xc0,0x0b,0x6b,0x19,0xfd,0x5c,0x39,0x60,
+ 0xd5,0xc3,0x57,0x46,0x54,0xe4,0xd1,0xaa,
+ 0xa8,0xf7,0x1f,0xa8,0x6a,0x60,0x3e,0xe3,
+ 0x27,0x39,0x8e,0x53,0x30,0xf8,0x49,0x19,
+ }
+ },
+ { 0x800126e, {
+ 0xf3,0x8b,0x2b,0xb6,0x34,0xe3,0xc8,0x2c,
+ 0xef,0xec,0x63,0x6d,0xc8,0x76,0x77,0xb3,
+ 0x25,0x5a,0xb7,0x52,0x8c,0x83,0x26,0xe6,
+ 0x4c,0xbe,0xbf,0xe9,0x7d,0x22,0x6a,0x43,
+ }
+ },
+ { 0x800126f, {
+ 0x2b,0x5a,0xf2,0x9c,0xdd,0xd2,0x7f,0xec,
+ 0xec,0x96,0x09,0x57,0xb0,0x96,0x29,0x8b,
+ 0x2e,0x26,0x91,0xf0,0x49,0x33,0x42,0x18,
+ 0xdd,0x4b,0x65,0x5a,0xd4,0x15,0x3d,0x33,
+ }
+ },
+ { 0x800820d, {
+ 0x68,0x98,0x83,0xcd,0x22,0x0d,0xdd,0x59,
+ 0x73,0x2c,0x5b,0x37,0x1f,0x84,0x0e,0x67,
+ 0x96,0x43,0x83,0x0c,0x46,0x44,0xab,0x7c,
+ 0x7b,0x65,0x9e,0x57,0xb5,0x90,0x4b,0x0e,
+ }
+ },
+ { 0x8301025, {
+ 0xe4,0x7d,0xdb,0x1e,0x14,0xb4,0x5e,0x36,
+ 0x8f,0x3e,0x48,0x88,0x3c,0x6d,0x76,0xa1,
+ 0x59,0xc6,0xc0,0x72,0x42,0xdf,0x6c,0x30,
+ 0x6f,0x0b,0x28,0x16,0x61,0xfc,0x79,0x77,
+ }
+ },
+ { 0x8301055, {
+ 0x81,0x7b,0x99,0x1b,0xae,0x2d,0x4f,0x9a,
+ 0xef,0x13,0xce,0xb5,0x10,0xaf,0x6a,0xea,
+ 0xe5,0xb0,0x64,0x98,0x10,0x68,0x34,0x3b,
+ 0x9d,0x7a,0xd6,0x22,0x77,0x5f,0xb3,0x5b,
+ }
+ },
+ { 0x8301072, {
+ 0xcf,0x76,0xa7,0x1a,0x49,0xdf,0x2a,0x5e,
+ 0x9e,0x40,0x70,0xe5,0xdd,0x8a,0xa8,0x28,
+ 0x20,0xdc,0x91,0xd8,0x2c,0xa6,0xa0,0xb1,
+ 0x2d,0x22,0x26,0x94,0x4b,0x40,0x85,0x30,
+ }
+ },
+ { 0x830107a, {
+ 0x2a,0x65,0x8c,0x1a,0x5e,0x07,0x21,0x72,
+ 0xdf,0x90,0xa6,0x51,0x37,0xd3,0x4b,0x34,
+ 0xc4,0xda,0x03,0xe1,0x8a,0x6c,0xfb,0x20,
+ 0x04,0xb2,0x81,0x05,0xd4,0x87,0xf4,0x0a,
+ }
+ },
+ { 0x830107b, {
+ 0xb3,0x43,0x13,0x63,0x56,0xc1,0x39,0xad,
+ 0x10,0xa6,0x2b,0xcc,0x02,0xe6,0x76,0x2a,
+ 0x1e,0x39,0x58,0x3e,0x23,0x6e,0xa4,0x04,
+ 0x95,0xea,0xf9,0x6d,0xc2,0x8a,0x13,0x19,
+ }
+ },
+ { 0x830107c, {
+ 0x21,0x64,0xde,0xfb,0x9f,0x68,0x96,0x47,
+ 0x70,0x5c,0xe2,0x8f,0x18,0x52,0x6a,0xac,
+ 0xa4,0xd2,0x2e,0xe0,0xde,0x68,0x66,0xc3,
+ 0xeb,0x1e,0xd3,0x3f,0xbc,0x51,0x1d,0x38,
+ }
+ },
+ { 0x860010d, {
+ 0x86,0xb6,0x15,0x83,0xbc,0x3b,0x9c,0xe0,
+ 0xb3,0xef,0x1d,0x99,0x84,0x35,0x15,0xf7,
+ 0x7c,0x2a,0xc6,0x42,0xdb,0x73,0x07,0x5c,
+ 0x7d,0xc3,0x02,0xb5,0x43,0x06,0x5e,0xf8,
+ }
+ },
+ { 0x8608108, {
+ 0x14,0xfe,0x57,0x86,0x49,0xc8,0x68,0xe2,
+ 0x11,0xa3,0xcb,0x6e,0xff,0x6e,0xd5,0x38,
+ 0xfe,0x89,0x1a,0xe0,0x67,0xbf,0xc4,0xcc,
+ 0x1b,0x9f,0x84,0x77,0x2b,0x9f,0xaa,0xbd,
+ }
+ },
+ { 0x8701034, {
+ 0xc3,0x14,0x09,0xa8,0x9c,0x3f,0x8d,0x83,
+ 0x9b,0x4c,0xa5,0xb7,0x64,0x8b,0x91,0x5d,
+ 0x85,0x6a,0x39,0x26,0x1e,0x14,0x41,0xa8,
+ 0x75,0xea,0xa6,0xf9,0xc9,0xd1,0xea,0x2b,
+ }
+ },
+ { 0x8a00008, {
+ 0xd7,0x2a,0x93,0xdc,0x05,0x2f,0xa5,0x6e,
+ 0x0c,0x61,0x2c,0x07,0x9f,0x38,0xe9,0x8e,
+ 0xef,0x7d,0x2a,0x05,0x4d,0x56,0xaf,0x72,
+ 0xe7,0x56,0x47,0x6e,0x60,0x27,0xd5,0x8c,
+ }
+ },
+ { 0x8a0000a, {
+ 0x73,0x31,0x26,0x22,0xd4,0xf9,0xee,0x3c,
+ 0x07,0x06,0xe7,0xb9,0xad,0xd8,0x72,0x44,
+ 0x33,0x31,0xaa,0x7d,0xc3,0x67,0x0e,0xdb,
+ 0x47,0xb5,0xaa,0xbc,0xf5,0xbb,0xd9,0x20,
+ }
+ },
+ { 0xa00104c, {
+ 0x3c,0x8a,0xfe,0x04,0x62,0xd8,0x6d,0xbe,
+ 0xa7,0x14,0x28,0x64,0x75,0xc0,0xa3,0x76,
+ 0xb7,0x92,0x0b,0x97,0x0a,0x8e,0x9c,0x5b,
+ 0x1b,0xc8,0x9d,0x3a,0x1e,0x81,0x3d,0x3b,
+ }
+ },
+ { 0xa00104e, {
+ 0xc4,0x35,0x82,0x67,0xd2,0x86,0xe5,0xb2,
+ 0xfd,0x69,0x12,0x38,0xc8,0x77,0xba,0xe0,
+ 0x70,0xf9,0x77,0x89,0x10,0xa6,0x74,0x4e,
+ 0x56,0x58,0x13,0xf5,0x84,0x70,0x28,0x0b,
+ }
+ },
+ { 0xa001053, {
+ 0x92,0x0e,0xf4,0x69,0x10,0x3b,0xf9,0x9d,
+ 0x31,0x1b,0xa6,0x99,0x08,0x7d,0xd7,0x25,
+ 0x7e,0x1e,0x89,0xba,0x35,0x8d,0xac,0xcb,
+ 0x3a,0xb4,0xdf,0x58,0x12,0xcf,0xc0,0xc3,
+ }
+ },
+ { 0xa001058, {
+ 0x33,0x7d,0xa9,0xb5,0x4e,0x62,0x13,0x36,
+ 0xef,0x66,0xc9,0xbd,0x0a,0xa6,0x3b,0x19,
+ 0xcb,0xf5,0xc2,0xc3,0x55,0x47,0x20,0xec,
+ 0x1f,0x7b,0xa1,0x44,0x0e,0x8e,0xa4,0xb2,
+ }
+ },
+ { 0xa001075, {
+ 0x39,0x02,0x82,0xd0,0x7c,0x26,0x43,0xe9,
+ 0x26,0xa3,0xd9,0x96,0xf7,0x30,0x13,0x0a,
+ 0x8a,0x0e,0xac,0xe7,0x1d,0xdc,0xe2,0x0f,
+ 0xcb,0x9e,0x8d,0xbc,0xd2,0xa2,0x44,0xe0,
+ }
+ },
+ { 0xa001078, {
+ 0x2d,0x67,0xc7,0x35,0xca,0xef,0x2f,0x25,
+ 0x4c,0x45,0x93,0x3f,0x36,0x01,0x8c,0xce,
+ 0xa8,0x5b,0x07,0xd3,0xc1,0x35,0x3c,0x04,
+ 0x20,0xa2,0xfc,0xdc,0xe6,0xce,0x26,0x3e,
+ }
+ },
+ { 0xa001079, {
+ 0x43,0xe2,0x05,0x9c,0xfd,0xb7,0x5b,0xeb,
+ 0x5b,0xe9,0xeb,0x3b,0x96,0xf4,0xe4,0x93,
+ 0x73,0x45,0x3e,0xac,0x8d,0x3b,0xe4,0xdb,
+ 0x10,0x31,0xc1,0xe4,0xa2,0xd0,0x5a,0x8a,
+ }
+ },
+ { 0xa00107a, {
+ 0x5f,0x92,0xca,0xff,0xc3,0x59,0x22,0x5f,
+ 0x02,0xa0,0x91,0x3b,0x4a,0x45,0x10,0xfd,
+ 0x19,0xe1,0x8a,0x6d,0x9a,0x92,0xc1,0x3f,
+ 0x75,0x78,0xac,0x78,0x03,0x1d,0xdb,0x18,
+ }
+ },
+ { 0xa001143, {
+ 0x56,0xca,0xf7,0x43,0x8a,0x4c,0x46,0x80,
+ 0xec,0xde,0xe5,0x9c,0x50,0x84,0x9a,0x42,
+ 0x27,0xe5,0x51,0x84,0x8f,0x19,0xc0,0x8d,
+ 0x0c,0x25,0xb4,0xb0,0x8f,0x10,0xf3,0xf8,
+ }
+ },
+ { 0xa001144, {
+ 0x42,0xd5,0x9b,0xa7,0xd6,0x15,0x29,0x41,
+ 0x61,0xc4,0x72,0x3f,0xf3,0x06,0x78,0x4b,
+ 0x65,0xf3,0x0e,0xfa,0x9c,0x87,0xde,0x25,
+ 0xbd,0xb3,0x9a,0xf4,0x75,0x13,0x53,0xdc,
+ }
+ },
+ { 0xa00115d, {
+ 0xd4,0xc4,0x49,0x36,0x89,0x0b,0x47,0xdd,
+ 0xfb,0x2f,0x88,0x3b,0x5f,0xf2,0x8e,0x75,
+ 0xc6,0x6c,0x37,0x5a,0x90,0x25,0x94,0x3e,
+ 0x36,0x9c,0xae,0x02,0x38,0x6c,0xf5,0x05,
+ }
+ },
+ { 0xa001173, {
+ 0x28,0xbb,0x9b,0xd1,0xa0,0xa0,0x7e,0x3a,
+ 0x59,0x20,0xc0,0xa9,0xb2,0x5c,0xc3,0x35,
+ 0x53,0x89,0xe1,0x4c,0x93,0x2f,0x1d,0xc3,
+ 0xe5,0xf7,0xf3,0xc8,0x9b,0x61,0xaa,0x9e,
+ }
+ },
+ { 0xa0011a8, {
+ 0x97,0xc6,0x16,0x65,0x99,0xa4,0x85,0x3b,
+ 0xf6,0xce,0xaa,0x49,0x4a,0x3a,0xc5,0xb6,
+ 0x78,0x25,0xbc,0x53,0xaf,0x5d,0xcf,0xf4,
+ 0x23,0x12,0xbb,0xb1,0xbc,0x8a,0x02,0x2e,
+ }
+ },
+ { 0xa0011ce, {
+ 0xcf,0x1c,0x90,0xa3,0x85,0x0a,0xbf,0x71,
+ 0x94,0x0e,0x80,0x86,0x85,0x4f,0xd7,0x86,
+ 0xae,0x38,0x23,0x28,0x2b,0x35,0x9b,0x4e,
+ 0xfe,0xb8,0xcd,0x3d,0x3d,0x39,0xc9,0x6a,
+ }
+ },
+ { 0xa0011d1, {
+ 0xdf,0x0e,0xca,0xde,0xf6,0xce,0x5c,0x1e,
+ 0x4c,0xec,0xd7,0x71,0x83,0xcc,0xa8,0x09,
+ 0xc7,0xc5,0xfe,0xb2,0xf7,0x05,0xd2,0xc5,
+ 0x12,0xdd,0xe4,0xf3,0x92,0x1c,0x3d,0xb8,
+ }
+ },
+ { 0xa0011d3, {
+ 0x91,0xe6,0x10,0xd7,0x57,0xb0,0x95,0x0b,
+ 0x9a,0x24,0xee,0xf7,0xcf,0x56,0xc1,0xa6,
+ 0x4a,0x52,0x7d,0x5f,0x9f,0xdf,0xf6,0x00,
+ 0x65,0xf7,0xea,0xe8,0x2a,0x88,0xe2,0x26,
+ }
+ },
+ { 0xa0011d5, {
+ 0xed,0x69,0x89,0xf4,0xeb,0x64,0xc2,0x13,
+ 0xe0,0x51,0x1f,0x03,0x26,0x52,0x7d,0xb7,
+ 0x93,0x5d,0x65,0xca,0xb8,0x12,0x1d,0x62,
+ 0x0d,0x5b,0x65,0x34,0x69,0xb2,0x62,0x21,
+ }
+ },
+ { 0xa0011d7, {
+ 0x35,0x07,0xcd,0x40,0x94,0xbc,0x81,0x6b,
+ 0xfc,0x61,0x56,0x1a,0xe2,0xdb,0x96,0x12,
+ 0x1c,0x1c,0x31,0xb1,0x02,0x6f,0xe5,0xd2,
+ 0xfe,0x1b,0x04,0x03,0x2c,0x8f,0x4c,0x36,
+ }
+ },
+ { 0xa001223, {
+ 0xfb,0x32,0x5f,0xc6,0x83,0x4f,0x8c,0xb8,
+ 0xa4,0x05,0xf9,0x71,0x53,0x01,0x16,0xc4,
+ 0x83,0x75,0x94,0xdd,0xeb,0x7e,0xb7,0x15,
+ 0x8e,0x3b,0x50,0x29,0x8a,0x9c,0xcc,0x45,
+ }
+ },
+ { 0xa001224, {
+ 0x0e,0x0c,0xdf,0xb4,0x89,0xee,0x35,0x25,
+ 0xdd,0x9e,0xdb,0xc0,0x69,0x83,0x0a,0xad,
+ 0x26,0xa9,0xaa,0x9d,0xfc,0x3c,0xea,0xf9,
+ 0x6c,0xdc,0xd5,0x6d,0x8b,0x6e,0x85,0x4a,
+ }
+ },
+ { 0xa001227, {
+ 0xab,0xc6,0x00,0x69,0x4b,0x50,0x87,0xad,
+ 0x5f,0x0e,0x8b,0xea,0x57,0x38,0xce,0x1d,
+ 0x0f,0x75,0x26,0x02,0xf6,0xd6,0x96,0xe9,
+ 0x87,0xb9,0xd6,0x20,0x27,0x7c,0xd2,0xe0,
+ }
+ },
+ { 0xa001229, {
+ 0x7f,0x49,0x49,0x48,0x46,0xa5,0x50,0xa6,
+ 0x28,0x89,0x98,0xe2,0x9e,0xb4,0x7f,0x75,
+ 0x33,0xa7,0x04,0x02,0xe4,0x82,0xbf,0xb4,
+ 0xa5,0x3a,0xba,0x24,0x8d,0x31,0x10,0x1d,
+ }
+ },
+ { 0xa00122e, {
+ 0x56,0x94,0xa9,0x5d,0x06,0x68,0xfe,0xaf,
+ 0xdf,0x7a,0xff,0x2d,0xdf,0x74,0x0f,0x15,
+ 0x66,0xfb,0x00,0xb5,0x51,0x97,0x9b,0xfa,
+ 0xcb,0x79,0x85,0x46,0x25,0xb4,0xd2,0x10,
+ }
+ },
+ { 0xa001231, {
+ 0x0b,0x46,0xa5,0xfc,0x18,0x15,0xa0,0x9e,
+ 0xa6,0xdc,0xb7,0xff,0x17,0xf7,0x30,0x64,
+ 0xd4,0xda,0x9e,0x1b,0xc3,0xfc,0x02,0x3b,
+ 0xe2,0xc6,0x0e,0x41,0x54,0xb5,0x18,0xdd,
+ }
+ },
+ { 0xa001234, {
+ 0x88,0x8d,0xed,0xab,0xb5,0xbd,0x4e,0xf7,
+ 0x7f,0xd4,0x0e,0x95,0x34,0x91,0xff,0xcc,
+ 0xfb,0x2a,0xcd,0xf7,0xd5,0xdb,0x4c,0x9b,
+ 0xd6,0x2e,0x73,0x50,0x8f,0x83,0x79,0x1a,
+ }
+ },
+ { 0xa001236, {
+ 0x3d,0x30,0x00,0xb9,0x71,0xba,0x87,0x78,
+ 0xa8,0x43,0x55,0xc4,0x26,0x59,0xcf,0x9d,
+ 0x93,0xce,0x64,0x0e,0x8b,0x72,0x11,0x8b,
+ 0xa3,0x8f,0x51,0xe9,0xca,0x98,0xaa,0x25,
+ }
+ },
+ { 0xa001238, {
+ 0x72,0xf7,0x4b,0x0c,0x7d,0x58,0x65,0xcc,
+ 0x00,0xcc,0x57,0x16,0x68,0x16,0xf8,0x2a,
+ 0x1b,0xb3,0x8b,0xe1,0xb6,0x83,0x8c,0x7e,
+ 0xc0,0xcd,0x33,0xf2,0x8d,0xf9,0xef,0x59,
+ }
+ },
+ { 0xa00123b, {
+ 0xef,0xa1,0x1e,0x71,0xf1,0xc3,0x2c,0xe2,
+ 0xc3,0xef,0x69,0x41,0x7a,0x54,0xca,0xc3,
+ 0x8f,0x62,0x84,0xee,0xc2,0x39,0xd9,0x28,
+ 0x95,0xa7,0x12,0x49,0x1e,0x30,0x71,0x72,
+ }
+ },
+ { 0xa00820c, {
+ 0xa8,0x0c,0x81,0xc0,0xa6,0x00,0xe7,0xf3,
+ 0x5f,0x65,0xd3,0xb9,0x6f,0xea,0x93,0x63,
+ 0xf1,0x8c,0x88,0x45,0xd7,0x82,0x80,0xd1,
+ 0xe1,0x3b,0x8d,0xb2,0xf8,0x22,0x03,0xe2,
+ }
+ },
+ { 0xa00820d, {
+ 0xf9,0x2a,0xc0,0xf4,0x9e,0xa4,0x87,0xa4,
+ 0x7d,0x87,0x00,0xfd,0xab,0xda,0x19,0xca,
+ 0x26,0x51,0x32,0xc1,0x57,0x91,0xdf,0xc1,
+ 0x05,0xeb,0x01,0x7c,0x5a,0x95,0x21,0xb7,
+ }
+ },
+ { 0xa10113e, {
+ 0x05,0x3c,0x66,0xd7,0xa9,0x5a,0x33,0x10,
+ 0x1b,0xf8,0x9c,0x8f,0xed,0xfc,0xa7,0xa0,
+ 0x15,0xe3,0x3f,0x4b,0x1d,0x0d,0x0a,0xd5,
+ 0xfa,0x90,0xc4,0xed,0x9d,0x90,0xaf,0x53,
+ }
+ },
+ { 0xa101144, {
+ 0xb3,0x0b,0x26,0x9a,0xf8,0x7c,0x02,0x26,
+ 0x35,0x84,0x53,0xa4,0xd3,0x2c,0x7c,0x09,
+ 0x68,0x7b,0x96,0xb6,0x93,0xef,0xde,0xbc,
+ 0xfd,0x4b,0x15,0xd2,0x81,0xd3,0x51,0x47,
+ }
+ },
+ { 0xa101148, {
+ 0x20,0xd5,0x6f,0x40,0x4a,0xf6,0x48,0x90,
+ 0xc2,0x93,0x9a,0xc2,0xfd,0xac,0xef,0x4f,
+ 0xfa,0xc0,0x3d,0x92,0x3c,0x6d,0x01,0x08,
+ 0xf1,0x5e,0xb0,0xde,0xb4,0x98,0xae,0xc4,
+ }
+ },
+ { 0xa10114c, {
+ 0x9e,0xb6,0xa2,0xd9,0x87,0x38,0xc5,0x64,
+ 0xd8,0x88,0xfa,0x78,0x98,0xf9,0x6f,0x74,
+ 0x39,0x90,0x1b,0xa5,0xcf,0x5e,0xb4,0x2a,
+ 0x02,0xff,0xd4,0x8c,0x71,0x8b,0xe2,0xc0,
+ }
+ },
+ { 0xa10123e, {
+ 0x03,0xb9,0x2c,0x76,0x48,0x93,0xc9,0x18,
+ 0xfb,0x56,0xfd,0xf7,0xe2,0x1d,0xca,0x4d,
+ 0x1d,0x13,0x53,0x63,0xfe,0x42,0x6f,0xfc,
+ 0x19,0x0f,0xf1,0xfc,0xa7,0xdd,0x89,0x1b,
+ }
+ },
+ { 0xa101244, {
+ 0x71,0x56,0xb5,0x9f,0x21,0xbf,0xb3,0x3c,
+ 0x8c,0xd7,0x36,0xd0,0x34,0x52,0x1b,0xb1,
+ 0x46,0x2f,0x04,0xf0,0x37,0xd8,0x1e,0x72,
+ 0x24,0xa2,0x80,0x84,0x83,0x65,0x84,0xc0,
+ }
+ },
+ { 0xa101248, {
+ 0xed,0x3b,0x95,0xa6,0x68,0xa7,0x77,0x3e,
+ 0xfc,0x17,0x26,0xe2,0x7b,0xd5,0x56,0x22,
+ 0x2c,0x1d,0xef,0xeb,0x56,0xdd,0xba,0x6e,
+ 0x1b,0x7d,0x64,0x9d,0x4b,0x53,0x13,0x75,
+ }
+ },
+ { 0xa10124c, {
+ 0x29,0xea,0xf1,0x2c,0xb2,0xe4,0xef,0x90,
+ 0xa4,0xcd,0x1d,0x86,0x97,0x17,0x61,0x46,
+ 0xfc,0x22,0xcb,0x57,0x75,0x19,0xc8,0xcc,
+ 0x0c,0xf5,0xbc,0xac,0x81,0x9d,0x9a,0xd2,
+ }
+ },
+ { 0xa108108, {
+ 0xed,0xc2,0xec,0xa1,0x15,0xc6,0x65,0xe9,
+ 0xd0,0xef,0x39,0xaa,0x7f,0x55,0x06,0xc6,
+ 0xf5,0xd4,0x3f,0x7b,0x14,0xd5,0x60,0x2c,
+ 0x28,0x1e,0x9c,0x59,0x69,0x99,0x4d,0x16,
+ }
+ },
+ { 0xa108109, {
+ 0x85,0xb4,0xbd,0x7c,0x49,0xa7,0xbd,0xfa,
+ 0x49,0x36,0x80,0x81,0xc5,0xb7,0x39,0x1b,
+ 0x9a,0xaa,0x50,0xde,0x9b,0xe9,0x32,0x35,
+ 0x42,0x7e,0x51,0x4f,0x52,0x2c,0x28,0x59,
+ }
+ },
+ { 0xa20102d, {
+ 0xf9,0x6e,0xf2,0x32,0xd3,0x0f,0x5f,0x11,
+ 0x59,0xa1,0xfe,0xcc,0xcd,0x9b,0x42,0x89,
+ 0x8b,0x89,0x2f,0xb5,0xbb,0x82,0xef,0x23,
+ 0x8c,0xe9,0x19,0x3e,0xcc,0x3f,0x7b,0xb4,
+ }
+ },
+ { 0xa20102e, {
+ 0xbe,0x1f,0x32,0x04,0x0d,0x3c,0x9c,0xdd,
+ 0xe1,0xa4,0xbf,0x76,0x3a,0xec,0xc2,0xf6,
+ 0x11,0x00,0xa7,0xaf,0x0f,0xe5,0x02,0xc5,
+ 0x54,0x3a,0x1f,0x8c,0x16,0xb5,0xff,0xbe,
+ }
+ },
+ { 0xa201210, {
+ 0xe8,0x6d,0x51,0x6a,0x8e,0x72,0xf3,0xfe,
+ 0x6e,0x16,0xbc,0x62,0x59,0x40,0x17,0xe9,
+ 0x6d,0x3d,0x0e,0x6b,0xa7,0xac,0xe3,0x68,
+ 0xf7,0x55,0xf0,0x13,0xbb,0x22,0xf6,0x41,
+ }
+ },
+ { 0xa201211, {
+ 0x69,0xa1,0x17,0xec,0xd0,0xf6,0x6c,0x95,
+ 0xe2,0x1e,0xc5,0x59,0x1a,0x52,0x0a,0x27,
+ 0xc4,0xed,0xd5,0x59,0x1f,0xbf,0x00,0xff,
+ 0x08,0x88,0xb5,0xe1,0x12,0xb6,0xcc,0x27,
+ }
+ },
+ { 0xa404107, {
+ 0xbb,0x04,0x4e,0x47,0xdd,0x5e,0x26,0x45,
+ 0x1a,0xc9,0x56,0x24,0xa4,0x4c,0x82,0xb0,
+ 0x8b,0x0d,0x9f,0xf9,0x3a,0xdf,0xc6,0x81,
+ 0x13,0xbc,0xc5,0x25,0xe4,0xc5,0xc3,0x99,
+ }
+ },
+ { 0xa404108, {
+ 0x69,0x67,0x43,0x06,0xf8,0x0c,0x62,0xdc,
+ 0xa4,0x21,0x30,0x4f,0x0f,0x21,0x2c,0xcb,
+ 0xcc,0x37,0xf1,0x1c,0xc3,0xf8,0x2f,0x19,
+ 0xdf,0x53,0x53,0x46,0xb1,0x15,0xea,0x00,
+ }
+ },
+ { 0xa500011, {
+ 0x23,0x3d,0x70,0x7d,0x03,0xc3,0xc4,0xf4,
+ 0x2b,0x82,0xc6,0x05,0xda,0x80,0x0a,0xf1,
+ 0xd7,0x5b,0x65,0x3a,0x7d,0xab,0xdf,0xa2,
+ 0x11,0x5e,0x96,0x7e,0x71,0xe9,0xfc,0x74,
+ }
+ },
+ { 0xa500012, {
+ 0xeb,0x74,0x0d,0x47,0xa1,0x8e,0x09,0xe4,
+ 0x93,0x4c,0xad,0x03,0x32,0x4c,0x38,0x16,
+ 0x10,0x39,0xdd,0x06,0xaa,0xce,0xd6,0x0f,
+ 0x62,0x83,0x9d,0x8e,0x64,0x55,0xbe,0x63,
+ }
+ },
+ { 0xa601209, {
+ 0x66,0x48,0xd4,0x09,0x05,0xcb,0x29,0x32,
+ 0x66,0xb7,0x9a,0x76,0xcd,0x11,0xf3,0x30,
+ 0x15,0x86,0xcc,0x5d,0x97,0x0f,0xc0,0x46,
+ 0xe8,0x73,0xe2,0xd6,0xdb,0xd2,0x77,0x1d,
+ }
+ },
+ { 0xa60120a, {
+ 0x0c,0x8b,0x3d,0xfd,0x52,0x52,0x85,0x7d,
+ 0x20,0x3a,0xe1,0x7e,0xa4,0x21,0x3b,0x7b,
+ 0x17,0x86,0xae,0xac,0x13,0xb8,0x63,0x9d,
+ 0x06,0x01,0xd0,0xa0,0x51,0x9a,0x91,0x2c,
+ }
+ },
+ { 0xa704107, {
+ 0xf3,0xc6,0x58,0x26,0xee,0xac,0x3f,0xd6,
+ 0xce,0xa1,0x72,0x47,0x3b,0xba,0x2b,0x93,
+ 0x2a,0xad,0x8e,0x6b,0xea,0x9b,0xb7,0xc2,
+ 0x64,0x39,0x71,0x8c,0xce,0xe7,0x41,0x39,
+ }
+ },
+ { 0xa704108, {
+ 0xd7,0x55,0x15,0x2b,0xfe,0xc4,0xbc,0x93,
+ 0xec,0x91,0xa0,0xae,0x45,0xb7,0xc3,0x98,
+ 0x4e,0xff,0x61,0x77,0x88,0xc2,0x70,0x49,
+ 0xe0,0x3a,0x1d,0x84,0x38,0x52,0xbf,0x5a,
+ }
+ },
+ { 0xa705206, {
+ 0x8d,0xc0,0x76,0xbd,0x58,0x9f,0x8f,0xa4,
+ 0x12,0x9d,0x21,0xfb,0x48,0x21,0xbc,0xe7,
+ 0x67,0x6f,0x04,0x18,0xae,0x20,0x87,0x4b,
+ 0x03,0x35,0xe9,0xbe,0xfb,0x06,0xdf,0xfc,
+ }
+ },
+ { 0xa705208, {
+ 0x30,0x1d,0x55,0x24,0xbc,0x6b,0x5a,0x19,
+ 0x0c,0x7d,0x1d,0x74,0xaa,0xd1,0xeb,0xd2,
+ 0x16,0x62,0xf7,0x5b,0xe1,0x1f,0x18,0x11,
+ 0x5c,0xf0,0x94,0x90,0x26,0xec,0x69,0xff,
+ }
+ },
+ { 0xa708007, {
+ 0x6b,0x76,0xcc,0x78,0xc5,0x8a,0xa3,0xe3,
+ 0x32,0x2d,0x79,0xe4,0xc3,0x80,0xdb,0xb2,
+ 0x07,0xaa,0x3a,0xe0,0x57,0x13,0x72,0x80,
+ 0xdf,0x92,0x73,0x84,0x87,0x3c,0x73,0x93,
+ }
+ },
+ { 0xa708008, {
+ 0x08,0x6e,0xf0,0x22,0x4b,0x8e,0xc4,0x46,
+ 0x58,0x34,0xe6,0x47,0xa2,0x28,0xfd,0xab,
+ 0x22,0x3d,0xdd,0xd8,0x52,0x9e,0x1d,0x16,
+ 0xfa,0x01,0x68,0x14,0x79,0x3e,0xe8,0x6b,
+ }
+ },
+ { 0xa70c005, {
+ 0x88,0x5d,0xfb,0x79,0x64,0xd8,0x46,0x3b,
+ 0x4a,0x83,0x8e,0x77,0x7e,0xcf,0xb3,0x0f,
+ 0x1f,0x1f,0xf1,0x97,0xeb,0xfe,0x56,0x55,
+ 0xee,0x49,0xac,0xe1,0x8b,0x13,0xc5,0x13,
+ }
+ },
+ { 0xa70c008, {
+ 0x0f,0xdb,0x37,0xa1,0x10,0xaf,0xd4,0x21,
+ 0x94,0x0d,0xa4,0xa2,0xe9,0x86,0x6c,0x0e,
+ 0x85,0x7c,0x36,0x30,0xa3,0x3a,0x78,0x66,
+ 0x18,0x10,0x60,0x0d,0x78,0x3d,0x44,0xd0,
+ }
+ },
+ { 0xaa00116, {
+ 0xe8,0x4c,0x2c,0x88,0xa1,0xac,0x24,0x63,
+ 0x65,0xe5,0xaa,0x2d,0x16,0xa9,0xc3,0xf5,
+ 0xfe,0x1d,0x5e,0x65,0xc7,0xaa,0x92,0x4d,
+ 0x91,0xee,0x76,0xbb,0x4c,0x66,0x78,0xc9,
+ }
+ },
+ { 0xaa00212, {
+ 0xbd,0x57,0x5d,0x0a,0x0a,0x30,0xc1,0x75,
+ 0x95,0x58,0x5e,0x93,0x02,0x28,0x43,0x71,
+ 0xed,0x42,0x29,0xc8,0xec,0x34,0x2b,0xb2,
+ 0x1a,0x65,0x4b,0xfe,0x07,0x0f,0x34,0xa1,
+ }
+ },
+ { 0xaa00213, {
+ 0xed,0x58,0xb7,0x76,0x81,0x7f,0xd9,0x3a,
+ 0x1a,0xff,0x8b,0x34,0xb8,0x4a,0x99,0x0f,
+ 0x28,0x49,0x6c,0x56,0x2b,0xdc,0xb7,0xed,
+ 0x96,0xd5,0x9d,0xc1,0x7a,0xd4,0x51,0x9b,
+ }
+ },
+ { 0xaa00215, {
+ 0x55,0xd3,0x28,0xcb,0x87,0xa9,0x32,0xe9,
+ 0x4e,0x85,0x4b,0x7c,0x6b,0xd5,0x7c,0xd4,
+ 0x1b,0x51,0x71,0x3a,0x0e,0x0b,0xdc,0x9b,
+ 0x68,0x2f,0x46,0xee,0xfe,0xc6,0x6d,0xef,
+ }
+ },
+ { 0xaa00216, {
+ 0x79,0xfb,0x5b,0x9f,0xb6,0xe6,0xa8,0xf5,
+ 0x4e,0x7c,0x4f,0x8e,0x1d,0xad,0xd0,0x08,
+ 0xc2,0x43,0x7c,0x8b,0xe6,0xdb,0xd0,0xd2,
+ 0xe8,0x39,0x26,0xc1,0xe5,0x5a,0x48,0xf1,
+ }
+ },
+};
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
new file mode 100644
index 000000000000..68049f171860
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -0,0 +1,926 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * CPU Microcode Update Driver for Linux
+ *
+ * Copyright (C) 2000-2006 Tigran Aivazian <aivazian.tigran@gmail.com>
+ * 2006 Shaohua Li <shaohua.li@intel.com>
+ * 2013-2016 Borislav Petkov <bp@alien8.de>
+ *
+ * X86 CPU microcode early update for Linux:
+ *
+ * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
+ * H Peter Anvin" <hpa@zytor.com>
+ * (C) 2015 Borislav Petkov <bp@alien8.de>
+ *
+ * This driver allows to upgrade microcode on x86 processors.
+ */
+
+#define pr_fmt(fmt) "microcode: " fmt
+
+#include <linux/stop_machine.h>
+#include <linux/device/faux.h>
+#include <linux/syscore_ops.h>
+#include <linux/miscdevice.h>
+#include <linux/capability.h>
+#include <linux/firmware.h>
+#include <linux/cpumask.h>
+#include <linux/kernel.h>
+#include <linux/delay.h>
+#include <linux/mutex.h>
+#include <linux/cpu.h>
+#include <linux/nmi.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+
+#include <asm/apic.h>
+#include <asm/cpu_device_id.h>
+#include <asm/perf_event.h>
+#include <asm/processor.h>
+#include <asm/cmdline.h>
+#include <asm/msr.h>
+#include <asm/setup.h>
+
+#include "internal.h"
+
+static struct microcode_ops *microcode_ops;
+static bool dis_ucode_ldr;
+
+bool force_minrev = IS_ENABLED(CONFIG_MICROCODE_LATE_FORCE_MINREV);
+
+/*
+ * Those below should be behind CONFIG_MICROCODE_DBG ifdeffery but in
+ * order to not uglify the code with ifdeffery and use IS_ENABLED()
+ * instead, leave them in. When microcode debugging is not enabled,
+ * those are meaningless anyway.
+ */
+/* base microcode revision for debugging */
+u32 base_rev;
+u32 microcode_rev[NR_CPUS] = {};
+
+/*
+ * Synchronization.
+ *
+ * All non cpu-hotplug-callback call sites use:
+ *
+ * - cpus_read_lock/unlock() to synchronize with
+ * the cpu-hotplug-callback call sites.
+ *
+ * We guarantee that only a single cpu is being
+ * updated at any particular moment of time.
+ */
+struct ucode_cpu_info ucode_cpu_info[NR_CPUS];
+
+/*
+ * Those patch levels cannot be updated to newer ones and thus should be final.
+ */
+static u32 final_levels[] = {
+ 0x01000098,
+ 0x0100009f,
+ 0x010000af,
+ 0, /* T-101 terminator */
+};
+
+struct early_load_data early_data;
+
+/*
+ * Check the current patch level on this CPU.
+ *
+ * Returns:
+ * - true: if update should stop
+ * - false: otherwise
+ */
+static bool amd_check_current_patch_level(void)
+{
+ u32 lvl, dummy, i;
+ u32 *levels;
+
+ if (x86_cpuid_vendor() != X86_VENDOR_AMD)
+ return false;
+
+ native_rdmsr(MSR_AMD64_PATCH_LEVEL, lvl, dummy);
+
+ levels = final_levels;
+
+ for (i = 0; levels[i]; i++) {
+ if (lvl == levels[i])
+ return true;
+ }
+ return false;
+}
+
+bool __init microcode_loader_disabled(void)
+{
+ if (dis_ucode_ldr)
+ return true;
+
+ /*
+ * Disable when:
+ *
+ * 1) The CPU does not support CPUID.
+ *
+ * 2) Bit 31 in CPUID[1]:ECX is clear
+ * The bit is reserved for hypervisor use. This is still not
+ * completely accurate as XEN PV guests don't see that CPUID bit
+ * set, but that's good enough as they don't land on the BSP
+ * path anyway.
+ *
+ * 3) Certain AMD patch levels are not allowed to be
+ * overwritten.
+ */
+ if (!cpuid_feature() ||
+ ((native_cpuid_ecx(1) & BIT(31)) &&
+ !IS_ENABLED(CONFIG_MICROCODE_DBG)) ||
+ amd_check_current_patch_level())
+ dis_ucode_ldr = true;
+
+ return dis_ucode_ldr;
+}
+
+static void __init early_parse_cmdline(void)
+{
+ char cmd_buf[64] = {};
+ char *s, *p = cmd_buf;
+
+ if (cmdline_find_option(boot_command_line, "microcode", cmd_buf, sizeof(cmd_buf)) > 0) {
+ while ((s = strsep(&p, ","))) {
+ if (IS_ENABLED(CONFIG_MICROCODE_DBG)) {
+ if (strstr(s, "base_rev=")) {
+ /* advance to the option arg */
+ strsep(&s, "=");
+ if (kstrtouint(s, 16, &base_rev)) { ; }
+ }
+ }
+
+ if (!strcmp("force_minrev", s))
+ force_minrev = true;
+
+ if (!strcmp(s, "dis_ucode_ldr"))
+ dis_ucode_ldr = true;
+ }
+ }
+
+ /* old, compat option */
+ if (cmdline_find_option_bool(boot_command_line, "dis_ucode_ldr") > 0)
+ dis_ucode_ldr = true;
+}
+
+void __init load_ucode_bsp(void)
+{
+ unsigned int cpuid_1_eax;
+ bool intel = true;
+
+ early_parse_cmdline();
+
+ if (microcode_loader_disabled())
+ return;
+
+ cpuid_1_eax = native_cpuid_eax(1);
+
+ switch (x86_cpuid_vendor()) {
+ case X86_VENDOR_INTEL:
+ if (x86_family(cpuid_1_eax) < 6)
+ return;
+ break;
+
+ case X86_VENDOR_AMD:
+ if (x86_family(cpuid_1_eax) < 0x10)
+ return;
+ intel = false;
+ break;
+
+ default:
+ return;
+ }
+
+ if (intel)
+ load_ucode_intel_bsp(&early_data);
+ else
+ load_ucode_amd_bsp(&early_data, cpuid_1_eax);
+}
+
+void load_ucode_ap(void)
+{
+ unsigned int cpuid_1_eax;
+
+ /*
+ * Can't use microcode_loader_disabled() here - .init section
+ * hell. It doesn't have to either - the BSP variant must've
+ * parsed cmdline already anyway.
+ */
+ if (dis_ucode_ldr)
+ return;
+
+ cpuid_1_eax = native_cpuid_eax(1);
+
+ switch (x86_cpuid_vendor()) {
+ case X86_VENDOR_INTEL:
+ if (x86_family(cpuid_1_eax) >= 6)
+ load_ucode_intel_ap();
+ break;
+ case X86_VENDOR_AMD:
+ if (x86_family(cpuid_1_eax) >= 0x10)
+ load_ucode_amd_ap(cpuid_1_eax);
+ break;
+ default:
+ break;
+ }
+}
+
+struct cpio_data __init find_microcode_in_initrd(const char *path)
+{
+#ifdef CONFIG_BLK_DEV_INITRD
+ unsigned long start = 0;
+ size_t size;
+
+#ifdef CONFIG_X86_32
+ size = boot_params.hdr.ramdisk_size;
+ /* Early load on BSP has a temporary mapping. */
+ if (size)
+ start = initrd_start_early;
+
+#else /* CONFIG_X86_64 */
+ size = (unsigned long)boot_params.ext_ramdisk_size << 32;
+ size |= boot_params.hdr.ramdisk_size;
+
+ if (size) {
+ start = (unsigned long)boot_params.ext_ramdisk_image << 32;
+ start |= boot_params.hdr.ramdisk_image;
+ start += PAGE_OFFSET;
+ }
+#endif
+
+ /*
+ * Fixup the start address: after reserve_initrd() runs, initrd_start
+ * has the virtual address of the beginning of the initrd. It also
+ * possibly relocates the ramdisk. In either case, initrd_start contains
+ * the updated address so use that instead.
+ */
+ if (initrd_start)
+ start = initrd_start;
+
+ return find_cpio_data(path, (void *)start, size, NULL);
+#else /* !CONFIG_BLK_DEV_INITRD */
+ return (struct cpio_data){ NULL, 0, "" };
+#endif
+}
+
+static void reload_early_microcode(unsigned int cpu)
+{
+ int vendor, family;
+
+ vendor = x86_cpuid_vendor();
+ family = x86_cpuid_family();
+
+ switch (vendor) {
+ case X86_VENDOR_INTEL:
+ if (family >= 6)
+ reload_ucode_intel();
+ break;
+ case X86_VENDOR_AMD:
+ if (family >= 0x10)
+ reload_ucode_amd(cpu);
+ break;
+ default:
+ break;
+ }
+}
+
+/* fake device for request_firmware */
+static struct faux_device *microcode_fdev;
+
+#ifdef CONFIG_MICROCODE_LATE_LOADING
+/*
+ * Late loading dance. Why the heavy-handed stomp_machine effort?
+ *
+ * - HT siblings must be idle and not execute other code while the other sibling
+ * is loading microcode in order to avoid any negative interactions caused by
+ * the loading.
+ *
+ * - In addition, microcode update on the cores must be serialized until this
+ * requirement can be relaxed in the future. Right now, this is conservative
+ * and good.
+ */
+enum sibling_ctrl {
+ /* Spinwait with timeout */
+ SCTRL_WAIT,
+ /* Invoke the microcode_apply() callback */
+ SCTRL_APPLY,
+ /* Proceed without invoking the microcode_apply() callback */
+ SCTRL_DONE,
+};
+
+struct microcode_ctrl {
+ enum sibling_ctrl ctrl;
+ enum ucode_state result;
+ unsigned int ctrl_cpu;
+ bool nmi_enabled;
+};
+
+DEFINE_STATIC_KEY_FALSE(microcode_nmi_handler_enable);
+static DEFINE_PER_CPU(struct microcode_ctrl, ucode_ctrl);
+static atomic_t late_cpus_in, offline_in_nmi;
+static unsigned int loops_per_usec;
+static cpumask_t cpu_offline_mask;
+
+static noinstr bool wait_for_cpus(atomic_t *cnt)
+{
+ unsigned int timeout, loops;
+
+ WARN_ON_ONCE(raw_atomic_dec_return(cnt) < 0);
+
+ for (timeout = 0; timeout < USEC_PER_SEC; timeout++) {
+ if (!raw_atomic_read(cnt))
+ return true;
+
+ for (loops = 0; loops < loops_per_usec; loops++)
+ cpu_relax();
+
+ /* If invoked directly, tickle the NMI watchdog */
+ if (!microcode_ops->use_nmi && !(timeout % USEC_PER_MSEC)) {
+ instrumentation_begin();
+ touch_nmi_watchdog();
+ instrumentation_end();
+ }
+ }
+ /* Prevent the late comers from making progress and let them time out */
+ raw_atomic_inc(cnt);
+ return false;
+}
+
+static noinstr bool wait_for_ctrl(void)
+{
+ unsigned int timeout, loops;
+
+ for (timeout = 0; timeout < USEC_PER_SEC; timeout++) {
+ if (raw_cpu_read(ucode_ctrl.ctrl) != SCTRL_WAIT)
+ return true;
+
+ for (loops = 0; loops < loops_per_usec; loops++)
+ cpu_relax();
+
+ /* If invoked directly, tickle the NMI watchdog */
+ if (!microcode_ops->use_nmi && !(timeout % USEC_PER_MSEC)) {
+ instrumentation_begin();
+ touch_nmi_watchdog();
+ instrumentation_end();
+ }
+ }
+ return false;
+}
+
+/*
+ * Protected against instrumentation up to the point where the primary
+ * thread completed the update. See microcode_nmi_handler() for details.
+ */
+static noinstr bool load_secondary_wait(unsigned int ctrl_cpu)
+{
+ /* Initial rendezvous to ensure that all CPUs have arrived */
+ if (!wait_for_cpus(&late_cpus_in)) {
+ raw_cpu_write(ucode_ctrl.result, UCODE_TIMEOUT);
+ return false;
+ }
+
+ /*
+ * Wait for primary threads to complete. If one of them hangs due
+ * to the update, there is no way out. This is non-recoverable
+ * because the CPU might hold locks or resources and confuse the
+ * scheduler, watchdogs etc. There is no way to safely evacuate the
+ * machine.
+ */
+ if (wait_for_ctrl())
+ return true;
+
+ instrumentation_begin();
+ panic("Microcode load: Primary CPU %d timed out\n", ctrl_cpu);
+ instrumentation_end();
+}
+
+/*
+ * Protected against instrumentation up to the point where the primary
+ * thread completed the update. See microcode_nmi_handler() for details.
+ */
+static noinstr void load_secondary(unsigned int cpu)
+{
+ unsigned int ctrl_cpu = raw_cpu_read(ucode_ctrl.ctrl_cpu);
+ enum ucode_state ret;
+
+ if (!load_secondary_wait(ctrl_cpu)) {
+ instrumentation_begin();
+ pr_err_once("load: %d CPUs timed out\n",
+ atomic_read(&late_cpus_in) - 1);
+ instrumentation_end();
+ return;
+ }
+
+ /* Primary thread completed. Allow to invoke instrumentable code */
+ instrumentation_begin();
+ /*
+ * If the primary succeeded then invoke the apply() callback,
+ * otherwise copy the state from the primary thread.
+ */
+ if (this_cpu_read(ucode_ctrl.ctrl) == SCTRL_APPLY)
+ ret = microcode_ops->apply_microcode(cpu);
+ else
+ ret = per_cpu(ucode_ctrl.result, ctrl_cpu);
+
+ this_cpu_write(ucode_ctrl.result, ret);
+ this_cpu_write(ucode_ctrl.ctrl, SCTRL_DONE);
+ instrumentation_end();
+}
+
+static void __load_primary(unsigned int cpu)
+{
+ struct cpumask *secondaries = topology_sibling_cpumask(cpu);
+ enum sibling_ctrl ctrl;
+ enum ucode_state ret;
+ unsigned int sibling;
+
+ /* Initial rendezvous to ensure that all CPUs have arrived */
+ if (!wait_for_cpus(&late_cpus_in)) {
+ this_cpu_write(ucode_ctrl.result, UCODE_TIMEOUT);
+ pr_err_once("load: %d CPUs timed out\n", atomic_read(&late_cpus_in) - 1);
+ return;
+ }
+
+ ret = microcode_ops->apply_microcode(cpu);
+ this_cpu_write(ucode_ctrl.result, ret);
+ this_cpu_write(ucode_ctrl.ctrl, SCTRL_DONE);
+
+ /*
+ * If the update was successful, let the siblings run the apply()
+ * callback. If not, tell them it's done. This also covers the
+ * case where the CPU has uniform loading at package or system
+ * scope implemented but does not advertise it.
+ */
+ if (ret == UCODE_UPDATED || ret == UCODE_OK)
+ ctrl = SCTRL_APPLY;
+ else
+ ctrl = SCTRL_DONE;
+
+ for_each_cpu(sibling, secondaries) {
+ if (sibling != cpu)
+ per_cpu(ucode_ctrl.ctrl, sibling) = ctrl;
+ }
+}
+
+static bool kick_offline_cpus(unsigned int nr_offl)
+{
+ unsigned int cpu, timeout;
+
+ for_each_cpu(cpu, &cpu_offline_mask) {
+ /* Enable the rendezvous handler and send NMI */
+ per_cpu(ucode_ctrl.nmi_enabled, cpu) = true;
+ apic_send_nmi_to_offline_cpu(cpu);
+ }
+
+ /* Wait for them to arrive */
+ for (timeout = 0; timeout < (USEC_PER_SEC / 2); timeout++) {
+ if (atomic_read(&offline_in_nmi) == nr_offl)
+ return true;
+ udelay(1);
+ }
+ /* Let the others time out */
+ return false;
+}
+
+static void release_offline_cpus(void)
+{
+ unsigned int cpu;
+
+ for_each_cpu(cpu, &cpu_offline_mask)
+ per_cpu(ucode_ctrl.ctrl, cpu) = SCTRL_DONE;
+}
+
+static void load_primary(unsigned int cpu)
+{
+ unsigned int nr_offl = cpumask_weight(&cpu_offline_mask);
+ bool proceed = true;
+
+ /* Kick soft-offlined SMT siblings if required */
+ if (!cpu && nr_offl)
+ proceed = kick_offline_cpus(nr_offl);
+
+ /* If the soft-offlined CPUs did not respond, abort */
+ if (proceed)
+ __load_primary(cpu);
+
+ /* Unconditionally release soft-offlined SMT siblings if required */
+ if (!cpu && nr_offl)
+ release_offline_cpus();
+}
+
+/*
+ * Minimal stub rendezvous handler for soft-offlined CPUs which participate
+ * in the NMI rendezvous to protect against a concurrent NMI on affected
+ * CPUs.
+ */
+void noinstr microcode_offline_nmi_handler(void)
+{
+ if (!raw_cpu_read(ucode_ctrl.nmi_enabled))
+ return;
+ raw_cpu_write(ucode_ctrl.nmi_enabled, false);
+ raw_cpu_write(ucode_ctrl.result, UCODE_OFFLINE);
+ raw_atomic_inc(&offline_in_nmi);
+ wait_for_ctrl();
+}
+
+static noinstr bool microcode_update_handler(void)
+{
+ unsigned int cpu = raw_smp_processor_id();
+
+ if (raw_cpu_read(ucode_ctrl.ctrl_cpu) == cpu) {
+ instrumentation_begin();
+ load_primary(cpu);
+ instrumentation_end();
+ } else {
+ load_secondary(cpu);
+ }
+
+ instrumentation_begin();
+ touch_nmi_watchdog();
+ instrumentation_end();
+
+ return true;
+}
+
+/*
+ * Protection against instrumentation is required for CPUs which are not
+ * safe against an NMI which is delivered to the secondary SMT sibling
+ * while the primary thread updates the microcode. Instrumentation can end
+ * up in #INT3, #DB and #PF. The IRET from those exceptions reenables NMI
+ * which is the opposite of what the NMI rendezvous is trying to achieve.
+ *
+ * The primary thread is safe versus instrumentation as the actual
+ * microcode update handles this correctly. It's only the sibling code
+ * path which must be NMI safe until the primary thread completed the
+ * update.
+ */
+bool noinstr microcode_nmi_handler(void)
+{
+ if (!raw_cpu_read(ucode_ctrl.nmi_enabled))
+ return false;
+
+ raw_cpu_write(ucode_ctrl.nmi_enabled, false);
+ return microcode_update_handler();
+}
+
+static int load_cpus_stopped(void *unused)
+{
+ if (microcode_ops->use_nmi) {
+ /* Enable the NMI handler and raise NMI */
+ this_cpu_write(ucode_ctrl.nmi_enabled, true);
+ apic->send_IPI(smp_processor_id(), NMI_VECTOR);
+ } else {
+ /* Just invoke the handler directly */
+ microcode_update_handler();
+ }
+ return 0;
+}
+
+static int load_late_stop_cpus(bool is_safe)
+{
+ unsigned int cpu, updated = 0, failed = 0, timedout = 0, siblings = 0;
+ unsigned int nr_offl, offline = 0;
+ int old_rev = boot_cpu_data.microcode;
+ struct cpuinfo_x86 prev_info;
+
+ if (!is_safe) {
+ pr_err("Late microcode loading without minimal revision check.\n");
+ pr_err("You should switch to early loading, if possible.\n");
+ }
+
+ /*
+ * Pre-load the microcode image into a staging device. This
+ * process is preemptible and does not require stopping CPUs.
+ * Successful staging simplifies the subsequent late-loading
+ * process, reducing rendezvous time.
+ *
+ * Even if the transfer fails, the update will proceed as usual.
+ */
+ if (microcode_ops->use_staging)
+ microcode_ops->stage_microcode();
+
+ atomic_set(&late_cpus_in, num_online_cpus());
+ atomic_set(&offline_in_nmi, 0);
+ loops_per_usec = loops_per_jiffy / (TICK_NSEC / 1000);
+
+ /*
+ * Take a snapshot before the microcode update in order to compare and
+ * check whether any bits changed after an update.
+ */
+ store_cpu_caps(&prev_info);
+
+ if (microcode_ops->use_nmi)
+ static_branch_enable_cpuslocked(&microcode_nmi_handler_enable);
+
+ stop_machine_cpuslocked(load_cpus_stopped, NULL, cpu_online_mask);
+
+ if (microcode_ops->use_nmi)
+ static_branch_disable_cpuslocked(&microcode_nmi_handler_enable);
+
+ /* Analyze the results */
+ for_each_cpu_and(cpu, cpu_present_mask, &cpus_booted_once_mask) {
+ switch (per_cpu(ucode_ctrl.result, cpu)) {
+ case UCODE_UPDATED: updated++; break;
+ case UCODE_TIMEOUT: timedout++; break;
+ case UCODE_OK: siblings++; break;
+ case UCODE_OFFLINE: offline++; break;
+ default: failed++; break;
+ }
+ }
+
+ if (microcode_ops->finalize_late_load)
+ microcode_ops->finalize_late_load(!updated);
+
+ if (!updated) {
+ /* Nothing changed. */
+ if (!failed && !timedout)
+ return 0;
+
+ nr_offl = cpumask_weight(&cpu_offline_mask);
+ if (offline < nr_offl) {
+ pr_warn("%u offline siblings did not respond.\n",
+ nr_offl - atomic_read(&offline_in_nmi));
+ return -EIO;
+ }
+ pr_err("update failed: %u CPUs failed %u CPUs timed out\n",
+ failed, timedout);
+ return -EIO;
+ }
+
+ if (!is_safe || failed || timedout)
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
+
+ pr_info("load: updated on %u primary CPUs with %u siblings\n", updated, siblings);
+ if (failed || timedout) {
+ pr_err("load incomplete. %u CPUs timed out or failed\n",
+ num_online_cpus() - (updated + siblings));
+ }
+ pr_info("revision: 0x%x -> 0x%x\n", old_rev, boot_cpu_data.microcode);
+ microcode_check(&prev_info);
+
+ return updated + siblings == num_online_cpus() ? 0 : -EIO;
+}
+
+/*
+ * This function does two things:
+ *
+ * 1) Ensure that all required CPUs which are present and have been booted
+ * once are online.
+ *
+ * To pass this check, all primary threads must be online.
+ *
+ * If the microcode load is not safe against NMI then all SMT threads
+ * must be online as well because they still react to NMIs when they are
+ * soft-offlined and parked in one of the play_dead() variants. So if a
+ * NMI hits while the primary thread updates the microcode the resulting
+ * behaviour is undefined. The default play_dead() implementation on
+ * modern CPUs uses MWAIT, which is also not guaranteed to be safe
+ * against a microcode update which affects MWAIT.
+ *
+ * As soft-offlined CPUs still react on NMIs, the SMT sibling
+ * restriction can be lifted when the vendor driver signals to use NMI
+ * for rendezvous and the APIC provides a mechanism to send an NMI to a
+ * soft-offlined CPU. The soft-offlined CPUs are then able to
+ * participate in the rendezvous in a trivial stub handler.
+ *
+ * 2) Initialize the per CPU control structure and create a cpumask
+ * which contains "offline"; secondary threads, so they can be handled
+ * correctly by a control CPU.
+ */
+static bool setup_cpus(void)
+{
+ struct microcode_ctrl ctrl = { .ctrl = SCTRL_WAIT, .result = -1, };
+ bool allow_smt_offline;
+ unsigned int cpu;
+
+ allow_smt_offline = microcode_ops->nmi_safe ||
+ (microcode_ops->use_nmi && apic->nmi_to_offline_cpu);
+
+ cpumask_clear(&cpu_offline_mask);
+
+ for_each_cpu_and(cpu, cpu_present_mask, &cpus_booted_once_mask) {
+ /*
+ * Offline CPUs sit in one of the play_dead() functions
+ * with interrupts disabled, but they still react on NMIs
+ * and execute arbitrary code. Also MWAIT being updated
+ * while the offline CPU sits there is not necessarily safe
+ * on all CPU variants.
+ *
+ * Mark them in the offline_cpus mask which will be handled
+ * by CPU0 later in the update process.
+ *
+ * Ensure that the primary thread is online so that it is
+ * guaranteed that all cores are updated.
+ */
+ if (!cpu_online(cpu)) {
+ if (topology_is_primary_thread(cpu) || !allow_smt_offline) {
+ pr_err("CPU %u not online, loading aborted\n", cpu);
+ return false;
+ }
+ cpumask_set_cpu(cpu, &cpu_offline_mask);
+ per_cpu(ucode_ctrl, cpu) = ctrl;
+ continue;
+ }
+
+ /*
+ * Initialize the per CPU state. This is core scope for now,
+ * but prepared to take package or system scope into account.
+ */
+ ctrl.ctrl_cpu = cpumask_first(topology_sibling_cpumask(cpu));
+ per_cpu(ucode_ctrl, cpu) = ctrl;
+ }
+ return true;
+}
+
+static int load_late_locked(void)
+{
+ if (!setup_cpus())
+ return -EBUSY;
+
+ switch (microcode_ops->request_microcode_fw(0, &microcode_fdev->dev)) {
+ case UCODE_NEW:
+ return load_late_stop_cpus(false);
+ case UCODE_NEW_SAFE:
+ return load_late_stop_cpus(true);
+ case UCODE_NFOUND:
+ return -ENOENT;
+ case UCODE_OK:
+ return 0;
+ default:
+ return -EBADFD;
+ }
+}
+
+static ssize_t reload_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t size)
+{
+ unsigned long val;
+ ssize_t ret;
+
+ ret = kstrtoul(buf, 0, &val);
+ if (ret || val != 1)
+ return -EINVAL;
+
+ cpus_read_lock();
+ ret = load_late_locked();
+ cpus_read_unlock();
+
+ return ret ? : size;
+}
+
+static DEVICE_ATTR_WO(reload);
+#endif
+
+static ssize_t version_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
+
+ return sprintf(buf, "0x%x\n", uci->cpu_sig.rev);
+}
+
+static ssize_t processor_flags_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
+
+ return sprintf(buf, "0x%x\n", uci->cpu_sig.pf);
+}
+
+static DEVICE_ATTR_RO(version);
+static DEVICE_ATTR_RO(processor_flags);
+
+static struct attribute *mc_default_attrs[] = {
+ &dev_attr_version.attr,
+ &dev_attr_processor_flags.attr,
+ NULL
+};
+
+static const struct attribute_group mc_attr_group = {
+ .attrs = mc_default_attrs,
+ .name = "microcode",
+};
+
+static void microcode_fini_cpu(int cpu)
+{
+ if (microcode_ops->microcode_fini_cpu)
+ microcode_ops->microcode_fini_cpu(cpu);
+}
+
+/**
+ * microcode_bsp_resume - Update boot CPU microcode during resume.
+ */
+void microcode_bsp_resume(void)
+{
+ int cpu = smp_processor_id();
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+ if (uci->mc)
+ microcode_ops->apply_microcode(cpu);
+ else
+ reload_early_microcode(cpu);
+}
+
+static void microcode_bsp_syscore_resume(void *data)
+{
+ microcode_bsp_resume();
+}
+
+static const struct syscore_ops mc_syscore_ops = {
+ .resume = microcode_bsp_syscore_resume,
+};
+
+static struct syscore mc_syscore = {
+ .ops = &mc_syscore_ops,
+};
+
+static int mc_cpu_online(unsigned int cpu)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ struct device *dev = get_cpu_device(cpu);
+
+ memset(uci, 0, sizeof(*uci));
+
+ microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig);
+ cpu_data(cpu).microcode = uci->cpu_sig.rev;
+ if (!cpu)
+ boot_cpu_data.microcode = uci->cpu_sig.rev;
+
+ if (sysfs_create_group(&dev->kobj, &mc_attr_group))
+ pr_err("Failed to create group for CPU%d\n", cpu);
+ return 0;
+}
+
+static int mc_cpu_down_prep(unsigned int cpu)
+{
+ struct device *dev = get_cpu_device(cpu);
+
+ microcode_fini_cpu(cpu);
+ sysfs_remove_group(&dev->kobj, &mc_attr_group);
+ return 0;
+}
+
+static struct attribute *cpu_root_microcode_attrs[] = {
+#ifdef CONFIG_MICROCODE_LATE_LOADING
+ &dev_attr_reload.attr,
+#endif
+ NULL
+};
+
+static const struct attribute_group cpu_root_microcode_group = {
+ .name = "microcode",
+ .attrs = cpu_root_microcode_attrs,
+};
+
+static int __init microcode_init(void)
+{
+ struct device *dev_root;
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+ int error;
+
+ if (microcode_loader_disabled())
+ return -EINVAL;
+
+ if (c->x86_vendor == X86_VENDOR_INTEL)
+ microcode_ops = init_intel_microcode();
+ else if (c->x86_vendor == X86_VENDOR_AMD)
+ microcode_ops = init_amd_microcode();
+ else
+ pr_err("no support for this CPU vendor\n");
+
+ if (!microcode_ops)
+ return -ENODEV;
+
+ pr_info_once("Current revision: 0x%08x\n", (early_data.new_rev ?: early_data.old_rev));
+
+ if (early_data.new_rev)
+ pr_info_once("Updated early from: 0x%08x\n", early_data.old_rev);
+
+ microcode_fdev = faux_device_create("microcode", NULL, NULL);
+ if (!microcode_fdev)
+ return -ENODEV;
+
+ dev_root = bus_get_dev_root(&cpu_subsys);
+ if (dev_root) {
+ error = sysfs_create_group(&dev_root->kobj, &cpu_root_microcode_group);
+ put_device(dev_root);
+ if (error) {
+ pr_err("Error creating microcode group!\n");
+ goto out_pdev;
+ }
+ }
+
+ register_syscore(&mc_syscore);
+ cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/microcode:online",
+ mc_cpu_online, mc_cpu_down_prep);
+
+ return 0;
+
+ out_pdev:
+ faux_device_destroy(microcode_fdev);
+ return error;
+
+}
+late_initcall(microcode_init);
diff --git a/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h b/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h
new file mode 100644
index 000000000000..2d48e6593540
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h
@@ -0,0 +1,160 @@
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x03, .steppings = 0x0004, .driver_data = 0x2 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x05, .steppings = 0x0001, .driver_data = 0x45 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x05, .steppings = 0x0002, .driver_data = 0x40 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x05, .steppings = 0x0004, .driver_data = 0x2c },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x05, .steppings = 0x0008, .driver_data = 0x10 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x06, .steppings = 0x0001, .driver_data = 0xa },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x06, .steppings = 0x0020, .driver_data = 0x3 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x06, .steppings = 0x0400, .driver_data = 0xd },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x06, .steppings = 0x2000, .driver_data = 0x7 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x07, .steppings = 0x0002, .driver_data = 0x14 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x07, .steppings = 0x0004, .driver_data = 0x38 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x07, .steppings = 0x0008, .driver_data = 0x2e },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x08, .steppings = 0x0002, .driver_data = 0x11 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x08, .steppings = 0x0008, .driver_data = 0x8 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x08, .steppings = 0x0040, .driver_data = 0xc },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x08, .steppings = 0x0400, .driver_data = 0x5 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x09, .steppings = 0x0020, .driver_data = 0x47 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0a, .steppings = 0x0001, .driver_data = 0x3 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0a, .steppings = 0x0002, .driver_data = 0x1 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0b, .steppings = 0x0002, .driver_data = 0x1d },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0b, .steppings = 0x0010, .driver_data = 0x2 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0d, .steppings = 0x0040, .driver_data = 0x18 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0e, .steppings = 0x0100, .driver_data = 0x39 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0e, .steppings = 0x1000, .driver_data = 0x59 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0004, .driver_data = 0x5d },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0040, .driver_data = 0xd2 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0080, .driver_data = 0x6b },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0400, .driver_data = 0x95 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0800, .driver_data = 0xbc },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x2000, .driver_data = 0xa4 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x16, .steppings = 0x0002, .driver_data = 0x44 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x17, .steppings = 0x0040, .driver_data = 0x60f },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x17, .steppings = 0x0080, .driver_data = 0x70a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x17, .steppings = 0x0400, .driver_data = 0xa0b },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1a, .steppings = 0x0010, .driver_data = 0x12 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1a, .steppings = 0x0020, .driver_data = 0x1d },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1c, .steppings = 0x0004, .driver_data = 0x219 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1c, .steppings = 0x0400, .driver_data = 0x107 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1d, .steppings = 0x0002, .driver_data = 0x29 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1e, .steppings = 0x0020, .driver_data = 0xa },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x25, .steppings = 0x0004, .driver_data = 0x11 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x25, .steppings = 0x0020, .driver_data = 0x7 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x26, .steppings = 0x0002, .driver_data = 0x105 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2a, .steppings = 0x0080, .driver_data = 0x2f },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2c, .steppings = 0x0004, .driver_data = 0x1f },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2d, .steppings = 0x0040, .driver_data = 0x621 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2d, .steppings = 0x0080, .driver_data = 0x71a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2e, .steppings = 0x0040, .driver_data = 0xd },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2f, .steppings = 0x0004, .driver_data = 0x3b },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x37, .steppings = 0x0100, .driver_data = 0x838 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x37, .steppings = 0x0200, .driver_data = 0x90d },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3a, .steppings = 0x0200, .driver_data = 0x21 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3c, .steppings = 0x0008, .driver_data = 0x28 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3d, .steppings = 0x0010, .driver_data = 0x2f },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3e, .steppings = 0x0010, .driver_data = 0x42e },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3e, .steppings = 0x0040, .driver_data = 0x600 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3e, .steppings = 0x0080, .driver_data = 0x715 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3f, .steppings = 0x0004, .driver_data = 0x49 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3f, .steppings = 0x0010, .driver_data = 0x1a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x45, .steppings = 0x0002, .driver_data = 0x26 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x46, .steppings = 0x0002, .driver_data = 0x1c },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x47, .steppings = 0x0002, .driver_data = 0x22 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x4c, .steppings = 0x0008, .driver_data = 0x368 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x4c, .steppings = 0x0010, .driver_data = 0x411 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x4d, .steppings = 0x0100, .driver_data = 0x12d },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x4e, .steppings = 0x0008, .driver_data = 0xf0 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0008, .driver_data = 0x1000191 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0010, .driver_data = 0x2007006 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0020, .driver_data = 0x3000010 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0080, .driver_data = 0x5003901 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0800, .driver_data = 0x7002b01 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0004, .driver_data = 0x1c },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0008, .driver_data = 0x700001c },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0010, .driver_data = 0xf00001a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0020, .driver_data = 0xe000015 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5c, .steppings = 0x0004, .driver_data = 0x14 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5c, .steppings = 0x0200, .driver_data = 0x48 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5c, .steppings = 0x0400, .driver_data = 0x28 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5e, .steppings = 0x0008, .driver_data = 0xf0 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5f, .steppings = 0x0002, .driver_data = 0x3e },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x66, .steppings = 0x0008, .driver_data = 0x2a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6a, .steppings = 0x0020, .driver_data = 0xc0002f0 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6a, .steppings = 0x0040, .driver_data = 0xd000404 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6c, .steppings = 0x0002, .driver_data = 0x10002d0 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7a, .steppings = 0x0002, .driver_data = 0x42 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7a, .steppings = 0x0100, .driver_data = 0x26 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7e, .steppings = 0x0020, .driver_data = 0xca },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8a, .steppings = 0x0002, .driver_data = 0x33 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8c, .steppings = 0x0002, .driver_data = 0xbc },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8c, .steppings = 0x0004, .driver_data = 0x3c },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8d, .steppings = 0x0002, .driver_data = 0x56 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0200, .driver_data = 0xf6 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0400, .driver_data = 0xf6 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0800, .driver_data = 0xf6 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x1000, .driver_data = 0x100 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0010, .driver_data = 0x2c0003f7 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0020, .driver_data = 0x2c0003f7 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0040, .driver_data = 0x2c0003f7 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0080, .driver_data = 0x2b000639 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0100, .driver_data = 0x2c0003f7 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x96, .steppings = 0x0002, .driver_data = 0x1a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x97, .steppings = 0x0004, .driver_data = 0x3a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x97, .steppings = 0x0020, .driver_data = 0x3a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9a, .steppings = 0x0008, .driver_data = 0x437 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9a, .steppings = 0x0010, .driver_data = 0x437 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9c, .steppings = 0x0001, .driver_data = 0x24000026 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0200, .driver_data = 0xf8 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0400, .driver_data = 0xfa },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0800, .driver_data = 0xf6 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x1000, .driver_data = 0xf8 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x2000, .driver_data = 0x104 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0004, .driver_data = 0x100 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0008, .driver_data = 0x100 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0020, .driver_data = 0x100 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa6, .steppings = 0x0001, .driver_data = 0x102 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa6, .steppings = 0x0002, .driver_data = 0x100 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa7, .steppings = 0x0002, .driver_data = 0x64 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xaa, .steppings = 0x0010, .driver_data = 0x24 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xad, .steppings = 0x0002, .driver_data = 0xa0000d1 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xaf, .steppings = 0x0008, .driver_data = 0x3000341 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xb5, .steppings = 0x0001, .driver_data = 0xa },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xb7, .steppings = 0x0002, .driver_data = 0x12f },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xb7, .steppings = 0x0010, .driver_data = 0x12f },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0004, .driver_data = 0x4128 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0008, .driver_data = 0x4128 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0100, .driver_data = 0x4128 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbd, .steppings = 0x0002, .driver_data = 0x11f },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbe, .steppings = 0x0001, .driver_data = 0x1d },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0004, .driver_data = 0x3a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0020, .driver_data = 0x3a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0040, .driver_data = 0x3a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0080, .driver_data = 0x3a },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xc5, .steppings = 0x0004, .driver_data = 0x118 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xc6, .steppings = 0x0004, .driver_data = 0x118 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xc6, .steppings = 0x0010, .driver_data = 0x118 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xca, .steppings = 0x0004, .driver_data = 0x118 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xcf, .steppings = 0x0002, .driver_data = 0x210002a9 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xcf, .steppings = 0x0004, .driver_data = 0x210002a9 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x00, .steppings = 0x0080, .driver_data = 0x12 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x00, .steppings = 0x0400, .driver_data = 0x15 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x01, .steppings = 0x0004, .driver_data = 0x2e },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0010, .driver_data = 0x21 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0020, .driver_data = 0x2c },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0040, .driver_data = 0x10 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0080, .driver_data = 0x39 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0200, .driver_data = 0x2f },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x03, .steppings = 0x0004, .driver_data = 0xa },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x03, .steppings = 0x0008, .driver_data = 0xc },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x03, .steppings = 0x0010, .driver_data = 0x17 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0002, .driver_data = 0x17 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0008, .driver_data = 0x5 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0010, .driver_data = 0x6 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0080, .driver_data = 0x3 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0100, .driver_data = 0xe },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0200, .driver_data = 0x3 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0400, .driver_data = 0x4 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x06, .steppings = 0x0004, .driver_data = 0xf },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x06, .steppings = 0x0010, .driver_data = 0x4 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x06, .steppings = 0x0020, .driver_data = 0x8 },
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x06, .steppings = 0x0100, .driver_data = 0x9 },
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
new file mode 100644
index 000000000000..8744f3adc2a0
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -0,0 +1,1016 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Intel CPU Microcode Update Driver for Linux
+ *
+ * Copyright (C) 2000-2006 Tigran Aivazian <aivazian.tigran@gmail.com>
+ * 2006 Shaohua Li <shaohua.li@intel.com>
+ *
+ * Intel CPU microcode early update for Linux
+ *
+ * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
+ * H Peter Anvin" <hpa@zytor.com>
+ */
+#define pr_fmt(fmt) "microcode: " fmt
+#include <linux/earlycpio.h>
+#include <linux/firmware.h>
+#include <linux/pci_ids.h>
+#include <linux/uaccess.h>
+#include <linux/initrd.h>
+#include <linux/kernel.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/cpu.h>
+#include <linux/uio.h>
+#include <linux/io.h>
+#include <linux/mm.h>
+
+#include <asm/cpu_device_id.h>
+#include <asm/processor.h>
+#include <asm/tlbflush.h>
+#include <asm/setup.h>
+#include <asm/msr.h>
+
+#include "internal.h"
+
+static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin";
+
+#define UCODE_BSP_LOADED ((struct microcode_intel *)0x1UL)
+
+/* Defines for the microcode staging mailbox interface */
+#define MBOX_REG_NUM 4
+#define MBOX_REG_SIZE sizeof(u32)
+
+#define MBOX_CONTROL_OFFSET 0x0
+#define MBOX_STATUS_OFFSET 0x4
+#define MBOX_WRDATA_OFFSET 0x8
+#define MBOX_RDDATA_OFFSET 0xc
+
+#define MASK_MBOX_CTRL_ABORT BIT(0)
+#define MASK_MBOX_CTRL_GO BIT(31)
+
+#define MASK_MBOX_STATUS_ERROR BIT(2)
+#define MASK_MBOX_STATUS_READY BIT(31)
+
+#define MASK_MBOX_RESP_SUCCESS BIT(0)
+#define MASK_MBOX_RESP_PROGRESS BIT(1)
+#define MASK_MBOX_RESP_ERROR BIT(2)
+
+#define MBOX_CMD_LOAD 0x3
+#define MBOX_OBJ_STAGING 0xb
+#define MBOX_HEADER(size) ((PCI_VENDOR_ID_INTEL) | \
+ (MBOX_OBJ_STAGING << 16) | \
+ ((u64)((size) / sizeof(u32)) << 32))
+
+/* The size of each mailbox header */
+#define MBOX_HEADER_SIZE sizeof(u64)
+/* The size of staging hardware response */
+#define MBOX_RESPONSE_SIZE sizeof(u64)
+
+#define MBOX_XACTION_TIMEOUT_MS (10 * MSEC_PER_SEC)
+
+/* Current microcode patch used in early patching on the APs. */
+static struct microcode_intel *ucode_patch_va __read_mostly;
+static struct microcode_intel *ucode_patch_late __read_mostly;
+
+/* last level cache size per core */
+static unsigned int llc_size_per_core __ro_after_init;
+
+/* microcode format is extended from prescott processors */
+struct extended_signature {
+ unsigned int sig;
+ unsigned int pf;
+ unsigned int cksum;
+};
+
+struct extended_sigtable {
+ unsigned int count;
+ unsigned int cksum;
+ unsigned int reserved[3];
+ struct extended_signature sigs[];
+};
+
+/**
+ * struct staging_state - Track the current staging process state
+ *
+ * @mmio_base: MMIO base address for staging
+ * @ucode_len: Total size of the microcode image
+ * @chunk_size: Size of each data piece
+ * @bytes_sent: Total bytes transmitted so far
+ * @offset: Current offset in the microcode image
+ */
+struct staging_state {
+ void __iomem *mmio_base;
+ unsigned int ucode_len;
+ unsigned int chunk_size;
+ unsigned int bytes_sent;
+ unsigned int offset;
+};
+
+#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
+#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable))
+#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature))
+
+static inline unsigned int get_totalsize(struct microcode_header_intel *hdr)
+{
+ return hdr->datasize ? hdr->totalsize : DEFAULT_UCODE_TOTALSIZE;
+}
+
+static inline unsigned int exttable_size(struct extended_sigtable *et)
+{
+ return et->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE;
+}
+
+void intel_collect_cpu_info(struct cpu_signature *sig)
+{
+ sig->sig = cpuid_eax(1);
+ sig->pf = 0;
+ sig->rev = intel_get_microcode_revision();
+
+ if (IFM(x86_family(sig->sig), x86_model(sig->sig)) >= INTEL_PENTIUM_III_DESCHUTES) {
+ unsigned int val[2];
+
+ /* get processor flags from MSR 0x17 */
+ native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
+ sig->pf = 1 << ((val[1] >> 18) & 7);
+ }
+}
+EXPORT_SYMBOL_GPL(intel_collect_cpu_info);
+
+static inline bool cpu_signatures_match(struct cpu_signature *s1, unsigned int sig2,
+ unsigned int pf2)
+{
+ if (s1->sig != sig2)
+ return false;
+
+ /* Processor flags are either both 0 or they intersect. */
+ return ((!s1->pf && !pf2) || (s1->pf & pf2));
+}
+
+bool intel_find_matching_signature(void *mc, struct cpu_signature *sig)
+{
+ struct microcode_header_intel *mc_hdr = mc;
+ struct extended_signature *ext_sig;
+ struct extended_sigtable *ext_hdr;
+ int i;
+
+ if (cpu_signatures_match(sig, mc_hdr->sig, mc_hdr->pf))
+ return true;
+
+ /* Look for ext. headers: */
+ if (get_totalsize(mc_hdr) <= intel_microcode_get_datasize(mc_hdr) + MC_HEADER_SIZE)
+ return false;
+
+ ext_hdr = mc + intel_microcode_get_datasize(mc_hdr) + MC_HEADER_SIZE;
+ ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE;
+
+ for (i = 0; i < ext_hdr->count; i++) {
+ if (cpu_signatures_match(sig, ext_sig->sig, ext_sig->pf))
+ return true;
+ ext_sig++;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(intel_find_matching_signature);
+
+/**
+ * intel_microcode_sanity_check() - Sanity check microcode file.
+ * @mc: Pointer to the microcode file contents.
+ * @print_err: Display failure reason if true, silent if false.
+ * @hdr_type: Type of file, i.e. normal microcode file or In Field Scan file.
+ * Validate if the microcode header type matches with the type
+ * specified here.
+ *
+ * Validate certain header fields and verify if computed checksum matches
+ * with the one specified in the header.
+ *
+ * Return: 0 if the file passes all the checks, -EINVAL if any of the checks
+ * fail.
+ */
+int intel_microcode_sanity_check(void *mc, bool print_err, int hdr_type)
+{
+ unsigned long total_size, data_size, ext_table_size;
+ struct microcode_header_intel *mc_header = mc;
+ struct extended_sigtable *ext_header = NULL;
+ u32 sum, orig_sum, ext_sigcount = 0, i;
+ struct extended_signature *ext_sig;
+
+ total_size = get_totalsize(mc_header);
+ data_size = intel_microcode_get_datasize(mc_header);
+
+ if (data_size + MC_HEADER_SIZE > total_size) {
+ if (print_err)
+ pr_err("Error: bad microcode data file size.\n");
+ return -EINVAL;
+ }
+
+ if (mc_header->ldrver != 1 || mc_header->hdrver != hdr_type) {
+ if (print_err)
+ pr_err("Error: invalid/unknown microcode update format. Header type %d\n",
+ mc_header->hdrver);
+ return -EINVAL;
+ }
+
+ ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
+ if (ext_table_size) {
+ u32 ext_table_sum = 0;
+ u32 *ext_tablep;
+
+ if (ext_table_size < EXT_HEADER_SIZE ||
+ ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
+ if (print_err)
+ pr_err("Error: truncated extended signature table.\n");
+ return -EINVAL;
+ }
+
+ ext_header = mc + MC_HEADER_SIZE + data_size;
+ if (ext_table_size != exttable_size(ext_header)) {
+ if (print_err)
+ pr_err("Error: extended signature table size mismatch.\n");
+ return -EFAULT;
+ }
+
+ ext_sigcount = ext_header->count;
+
+ /*
+ * Check extended table checksum: the sum of all dwords that
+ * comprise a valid table must be 0.
+ */
+ ext_tablep = (u32 *)ext_header;
+
+ i = ext_table_size / sizeof(u32);
+ while (i--)
+ ext_table_sum += ext_tablep[i];
+
+ if (ext_table_sum) {
+ if (print_err)
+ pr_warn("Bad extended signature table checksum, aborting.\n");
+ return -EINVAL;
+ }
+ }
+
+ /*
+ * Calculate the checksum of update data and header. The checksum of
+ * valid update data and header including the extended signature table
+ * must be 0.
+ */
+ orig_sum = 0;
+ i = (MC_HEADER_SIZE + data_size) / sizeof(u32);
+ while (i--)
+ orig_sum += ((u32 *)mc)[i];
+
+ if (orig_sum) {
+ if (print_err)
+ pr_err("Bad microcode data checksum, aborting.\n");
+ return -EINVAL;
+ }
+
+ if (!ext_table_size)
+ return 0;
+
+ /*
+ * Check extended signature checksum: 0 => valid.
+ */
+ for (i = 0; i < ext_sigcount; i++) {
+ ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
+ EXT_SIGNATURE_SIZE * i;
+
+ sum = (mc_header->sig + mc_header->pf + mc_header->cksum) -
+ (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
+ if (sum) {
+ if (print_err)
+ pr_err("Bad extended signature checksum, aborting.\n");
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(intel_microcode_sanity_check);
+
+static void update_ucode_pointer(struct microcode_intel *mc)
+{
+ kvfree(ucode_patch_va);
+
+ /*
+ * Save the virtual address for early loading and for eventual free
+ * on late loading.
+ */
+ ucode_patch_va = mc;
+}
+
+static void save_microcode_patch(struct microcode_intel *patch)
+{
+ unsigned int size = get_totalsize(&patch->hdr);
+ struct microcode_intel *mc;
+
+ mc = kvmemdup(patch, size, GFP_KERNEL);
+ if (mc)
+ update_ucode_pointer(mc);
+ else
+ pr_err("Unable to allocate microcode memory size: %u\n", size);
+}
+
+/* Scan blob for microcode matching the boot CPUs family, model, stepping */
+static __init struct microcode_intel *scan_microcode(void *data, size_t size,
+ struct ucode_cpu_info *uci,
+ bool save)
+{
+ struct microcode_header_intel *mc_header;
+ struct microcode_intel *patch = NULL;
+ u32 cur_rev = uci->cpu_sig.rev;
+ unsigned int mc_size;
+
+ for (; size >= sizeof(struct microcode_header_intel); size -= mc_size, data += mc_size) {
+ mc_header = (struct microcode_header_intel *)data;
+
+ mc_size = get_totalsize(mc_header);
+ if (!mc_size || mc_size > size ||
+ intel_microcode_sanity_check(data, false, MC_HEADER_TYPE_MICROCODE) < 0)
+ break;
+
+ if (!intel_find_matching_signature(data, &uci->cpu_sig))
+ continue;
+
+ /*
+ * For saving the early microcode, find the matching revision which
+ * was loaded on the BSP.
+ *
+ * On the BSP during early boot, find a newer revision than
+ * actually loaded in the CPU.
+ */
+ if (save) {
+ if (cur_rev != mc_header->rev)
+ continue;
+ } else if (cur_rev >= mc_header->rev) {
+ continue;
+ }
+
+ patch = data;
+ cur_rev = mc_header->rev;
+ }
+
+ return size ? NULL : patch;
+}
+
+static inline u32 read_mbox_dword(void __iomem *mmio_base)
+{
+ u32 dword = readl(mmio_base + MBOX_RDDATA_OFFSET);
+
+ /* Acknowledge read completion to the staging hardware */
+ writel(0, mmio_base + MBOX_RDDATA_OFFSET);
+ return dword;
+}
+
+static inline void write_mbox_dword(void __iomem *mmio_base, u32 dword)
+{
+ writel(dword, mmio_base + MBOX_WRDATA_OFFSET);
+}
+
+static inline u64 read_mbox_header(void __iomem *mmio_base)
+{
+ u32 high, low;
+
+ low = read_mbox_dword(mmio_base);
+ high = read_mbox_dword(mmio_base);
+
+ return ((u64)high << 32) | low;
+}
+
+static inline void write_mbox_header(void __iomem *mmio_base, u64 value)
+{
+ write_mbox_dword(mmio_base, value);
+ write_mbox_dword(mmio_base, value >> 32);
+}
+
+static void write_mbox_data(void __iomem *mmio_base, u32 *chunk, unsigned int chunk_bytes)
+{
+ int i;
+
+ /*
+ * The MMIO space is mapped as Uncached (UC). Each write arrives
+ * at the device as an individual transaction in program order.
+ * The device can then reassemble the sequence accordingly.
+ */
+ for (i = 0; i < chunk_bytes / sizeof(u32); i++)
+ write_mbox_dword(mmio_base, chunk[i]);
+}
+
+/*
+ * Prepare for a new microcode transfer: reset hardware and record the
+ * image size.
+ */
+static void init_stage(struct staging_state *ss)
+{
+ ss->ucode_len = get_totalsize(&ucode_patch_late->hdr);
+
+ /*
+ * Abort any ongoing process, effectively resetting the device.
+ * Unlike regular mailbox data processing requests, this
+ * operation does not require a status check.
+ */
+ writel(MASK_MBOX_CTRL_ABORT, ss->mmio_base + MBOX_CONTROL_OFFSET);
+}
+
+/*
+ * Update the chunk size and decide whether another chunk can be sent.
+ * This accounts for remaining data and retry limits.
+ */
+static bool can_send_next_chunk(struct staging_state *ss, int *err)
+{
+ /* A page size or remaining bytes if this is the final chunk */
+ ss->chunk_size = min(PAGE_SIZE, ss->ucode_len - ss->offset);
+
+ /*
+ * Each microcode image is divided into chunks, each at most
+ * one page size. A 10-chunk image would typically require 10
+ * transactions.
+ *
+ * However, the hardware managing the mailbox has limited
+ * resources and may not cache the entire image, potentially
+ * requesting the same chunk multiple times.
+ *
+ * To tolerate this behavior, allow up to twice the expected
+ * number of transactions (i.e., a 10-chunk image can take up to
+ * 20 attempts).
+ *
+ * If the number of attempts exceeds this limit, treat it as
+ * exceeding the maximum allowed transfer size.
+ */
+ if (ss->bytes_sent + ss->chunk_size > ss->ucode_len * 2) {
+ *err = -EMSGSIZE;
+ return false;
+ }
+
+ *err = 0;
+ return true;
+}
+
+/*
+ * The hardware indicates completion by returning a sentinel end offset.
+ */
+static inline bool is_end_offset(u32 offset)
+{
+ return offset == UINT_MAX;
+}
+
+/*
+ * Determine whether staging is complete: either the hardware signaled
+ * the end offset, or no more transactions are permitted (retry limit
+ * reached).
+ */
+static inline bool staging_is_complete(struct staging_state *ss, int *err)
+{
+ return is_end_offset(ss->offset) || !can_send_next_chunk(ss, err);
+}
+
+/*
+ * Wait for the hardware to complete a transaction.
+ * Return 0 on success, or an error code on failure.
+ */
+static int wait_for_transaction(struct staging_state *ss)
+{
+ u32 timeout, status;
+
+ /* Allow time for hardware to complete the operation: */
+ for (timeout = 0; timeout < MBOX_XACTION_TIMEOUT_MS; timeout++) {
+ msleep(1);
+
+ status = readl(ss->mmio_base + MBOX_STATUS_OFFSET);
+ /* Break out early if the hardware is ready: */
+ if (status & MASK_MBOX_STATUS_READY)
+ break;
+ }
+
+ /* Check for explicit error response */
+ if (status & MASK_MBOX_STATUS_ERROR)
+ return -EIO;
+
+ /*
+ * Hardware has neither responded to the action nor signaled any
+ * error. Treat this as a timeout.
+ */
+ if (!(status & MASK_MBOX_STATUS_READY))
+ return -ETIMEDOUT;
+
+ return 0;
+}
+
+/*
+ * Transmit a chunk of the microcode image to the hardware.
+ * Return 0 on success, or an error code on failure.
+ */
+static int send_data_chunk(struct staging_state *ss, void *ucode_ptr)
+{
+ u32 *src_chunk = ucode_ptr + ss->offset;
+ u16 mbox_size;
+
+ /*
+ * Write a 'request' mailbox object in this order:
+ * 1. Mailbox header includes total size
+ * 2. Command header specifies the load operation
+ * 3. Data section contains a microcode chunk
+ *
+ * Thus, the mailbox size is two headers plus the chunk size.
+ */
+ mbox_size = MBOX_HEADER_SIZE * 2 + ss->chunk_size;
+ write_mbox_header(ss->mmio_base, MBOX_HEADER(mbox_size));
+ write_mbox_header(ss->mmio_base, MBOX_CMD_LOAD);
+ write_mbox_data(ss->mmio_base, src_chunk, ss->chunk_size);
+ ss->bytes_sent += ss->chunk_size;
+
+ /* Notify the hardware that the mailbox is ready for processing. */
+ writel(MASK_MBOX_CTRL_GO, ss->mmio_base + MBOX_CONTROL_OFFSET);
+
+ return wait_for_transaction(ss);
+}
+
+/*
+ * Retrieve the next offset from the hardware response.
+ * Return 0 on success, or an error code on failure.
+ */
+static int fetch_next_offset(struct staging_state *ss)
+{
+ const u64 expected_header = MBOX_HEADER(MBOX_HEADER_SIZE + MBOX_RESPONSE_SIZE);
+ u32 offset, status;
+ u64 header;
+
+ /*
+ * The 'response' mailbox returns three fields, in order:
+ * 1. Header
+ * 2. Next offset in the microcode image
+ * 3. Status flags
+ */
+ header = read_mbox_header(ss->mmio_base);
+ offset = read_mbox_dword(ss->mmio_base);
+ status = read_mbox_dword(ss->mmio_base);
+
+ /* All valid responses must start with the expected header. */
+ if (header != expected_header) {
+ pr_err_once("staging: invalid response header (0x%llx)\n", header);
+ return -EBADR;
+ }
+
+ /*
+ * Verify the offset: If not at the end marker, it must not
+ * exceed the microcode image length.
+ */
+ if (!is_end_offset(offset) && offset > ss->ucode_len) {
+ pr_err_once("staging: invalid offset (%u) past the image end (%u)\n",
+ offset, ss->ucode_len);
+ return -EINVAL;
+ }
+
+ /* Hardware may report errors explicitly in the status field */
+ if (status & MASK_MBOX_RESP_ERROR)
+ return -EPROTO;
+
+ ss->offset = offset;
+ return 0;
+}
+
+/*
+ * Handle the staging process using the mailbox MMIO interface. The
+ * microcode image is transferred in chunks until completion.
+ * Return 0 on success or an error code on failure.
+ */
+static int do_stage(u64 mmio_pa)
+{
+ struct staging_state ss = {};
+ int err;
+
+ ss.mmio_base = ioremap(mmio_pa, MBOX_REG_NUM * MBOX_REG_SIZE);
+ if (WARN_ON_ONCE(!ss.mmio_base))
+ return -EADDRNOTAVAIL;
+
+ init_stage(&ss);
+
+ /* Perform the staging process while within the retry limit */
+ while (!staging_is_complete(&ss, &err)) {
+ /* Send a chunk of microcode each time: */
+ err = send_data_chunk(&ss, ucode_patch_late);
+ if (err)
+ break;
+ /*
+ * Then, ask the hardware which piece of the image it
+ * needs next. The same piece may be sent more than once.
+ */
+ err = fetch_next_offset(&ss);
+ if (err)
+ break;
+ }
+
+ iounmap(ss.mmio_base);
+
+ return err;
+}
+
+static void stage_microcode(void)
+{
+ unsigned int pkg_id = UINT_MAX;
+ int cpu, err;
+ u64 mmio_pa;
+
+ if (!IS_ALIGNED(get_totalsize(&ucode_patch_late->hdr), sizeof(u32))) {
+ pr_err("Microcode image 32-bit misaligned (0x%x), staging failed.\n",
+ get_totalsize(&ucode_patch_late->hdr));
+ return;
+ }
+
+ lockdep_assert_cpus_held();
+
+ /*
+ * The MMIO address is unique per package, and all the SMT
+ * primary threads are online here. Find each MMIO space by
+ * their package IDs to avoid duplicate staging.
+ */
+ for_each_cpu(cpu, cpu_primary_thread_mask) {
+ if (topology_logical_package_id(cpu) == pkg_id)
+ continue;
+
+ pkg_id = topology_logical_package_id(cpu);
+
+ err = rdmsrq_on_cpu(cpu, MSR_IA32_MCU_STAGING_MBOX_ADDR, &mmio_pa);
+ if (WARN_ON_ONCE(err))
+ return;
+
+ err = do_stage(mmio_pa);
+ if (err) {
+ pr_err("Error: staging failed (%d) for CPU%d at package %u.\n",
+ err, cpu, pkg_id);
+ return;
+ }
+ }
+
+ pr_info("Staging of patch revision 0x%x succeeded.\n", ucode_patch_late->hdr.rev);
+}
+
+static enum ucode_state __apply_microcode(struct ucode_cpu_info *uci,
+ struct microcode_intel *mc,
+ u32 *cur_rev)
+{
+ u32 rev;
+
+ if (!mc)
+ return UCODE_NFOUND;
+
+ /*
+ * Save us the MSR write below - which is a particular expensive
+ * operation - when the other hyperthread has updated the microcode
+ * already.
+ */
+ *cur_rev = intel_get_microcode_revision();
+ if (*cur_rev >= mc->hdr.rev) {
+ uci->cpu_sig.rev = *cur_rev;
+ return UCODE_OK;
+ }
+
+ /* write microcode via MSR 0x79 */
+ native_wrmsrq(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits);
+
+ rev = intel_get_microcode_revision();
+ if (rev != mc->hdr.rev)
+ return UCODE_ERROR;
+
+ uci->cpu_sig.rev = rev;
+ return UCODE_UPDATED;
+}
+
+static enum ucode_state apply_microcode_early(struct ucode_cpu_info *uci)
+{
+ struct microcode_intel *mc = uci->mc;
+ u32 cur_rev;
+
+ return __apply_microcode(uci, mc, &cur_rev);
+}
+
+static __init bool load_builtin_intel_microcode(struct cpio_data *cp)
+{
+ unsigned int eax = 1, ebx, ecx = 0, edx;
+ struct firmware fw;
+ char name[30];
+
+ if (IS_ENABLED(CONFIG_X86_32))
+ return false;
+
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+
+ sprintf(name, "intel-ucode/%02x-%02x-%02x",
+ x86_family(eax), x86_model(eax), x86_stepping(eax));
+
+ if (firmware_request_builtin(&fw, name)) {
+ cp->size = fw.size;
+ cp->data = (void *)fw.data;
+ return true;
+ }
+ return false;
+}
+
+static __init struct microcode_intel *get_microcode_blob(struct ucode_cpu_info *uci, bool save)
+{
+ struct cpio_data cp;
+
+ intel_collect_cpu_info(&uci->cpu_sig);
+
+ if (!load_builtin_intel_microcode(&cp))
+ cp = find_microcode_in_initrd(ucode_path);
+
+ if (!(cp.data && cp.size))
+ return NULL;
+
+ return scan_microcode(cp.data, cp.size, uci, save);
+}
+
+/*
+ * Invoked from an early init call to save the microcode blob which was
+ * selected during early boot when mm was not usable. The microcode must be
+ * saved because initrd is going away. It's an early init call so the APs
+ * just can use the pointer and do not have to scan initrd/builtin firmware
+ * again.
+ */
+static int __init save_builtin_microcode(void)
+{
+ struct ucode_cpu_info uci;
+
+ if (xchg(&ucode_patch_va, NULL) != UCODE_BSP_LOADED)
+ return 0;
+
+ if (microcode_loader_disabled() || boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return 0;
+
+ uci.mc = get_microcode_blob(&uci, true);
+ if (uci.mc)
+ save_microcode_patch(uci.mc);
+ return 0;
+}
+early_initcall(save_builtin_microcode);
+
+/* Load microcode on BSP from initrd or builtin blobs */
+void __init load_ucode_intel_bsp(struct early_load_data *ed)
+{
+ struct ucode_cpu_info uci;
+
+ uci.mc = get_microcode_blob(&uci, false);
+ ed->old_rev = uci.cpu_sig.rev;
+
+ if (uci.mc && apply_microcode_early(&uci) == UCODE_UPDATED) {
+ ucode_patch_va = UCODE_BSP_LOADED;
+ ed->new_rev = uci.cpu_sig.rev;
+ }
+}
+
+void load_ucode_intel_ap(void)
+{
+ struct ucode_cpu_info uci;
+
+ uci.mc = ucode_patch_va;
+ if (uci.mc)
+ apply_microcode_early(&uci);
+}
+
+/* Reload microcode on resume */
+void reload_ucode_intel(void)
+{
+ struct ucode_cpu_info uci = { .mc = ucode_patch_va, };
+
+ if (uci.mc)
+ apply_microcode_early(&uci);
+}
+
+static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
+{
+ intel_collect_cpu_info(csig);
+ return 0;
+}
+
+static enum ucode_state apply_microcode_late(int cpu)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ struct microcode_intel *mc = ucode_patch_late;
+ enum ucode_state ret;
+ u32 cur_rev;
+
+ if (WARN_ON_ONCE(smp_processor_id() != cpu))
+ return UCODE_ERROR;
+
+ ret = __apply_microcode(uci, mc, &cur_rev);
+ if (ret != UCODE_UPDATED && ret != UCODE_OK)
+ return ret;
+
+ cpu_data(cpu).microcode = uci->cpu_sig.rev;
+ if (!cpu)
+ boot_cpu_data.microcode = uci->cpu_sig.rev;
+
+ return ret;
+}
+
+static bool ucode_validate_minrev(struct microcode_header_intel *mc_header)
+{
+ int cur_rev = boot_cpu_data.microcode;
+
+ /*
+ * When late-loading, ensure the header declares a minimum revision
+ * required to perform a late-load. The previously reserved field
+ * is 0 in older microcode blobs.
+ */
+ if (!mc_header->min_req_ver) {
+ pr_info("Unsafe microcode update: Microcode header does not specify a required min version\n");
+ return false;
+ }
+
+ /*
+ * Check whether the current revision is either greater or equal to
+ * to the minimum revision specified in the header.
+ */
+ if (cur_rev < mc_header->min_req_ver) {
+ pr_info("Unsafe microcode update: Current revision 0x%x too old\n", cur_rev);
+ pr_info("Current should be at 0x%x or higher. Use early loading instead\n", mc_header->min_req_ver);
+ return false;
+ }
+ return true;
+}
+
+static enum ucode_state parse_microcode_blobs(int cpu, struct iov_iter *iter)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ bool is_safe, new_is_safe = false;
+ int cur_rev = uci->cpu_sig.rev;
+ unsigned int curr_mc_size = 0;
+ u8 *new_mc = NULL, *mc = NULL;
+
+ while (iov_iter_count(iter)) {
+ struct microcode_header_intel mc_header;
+ unsigned int mc_size, data_size;
+ u8 *data;
+
+ if (!copy_from_iter_full(&mc_header, sizeof(mc_header), iter)) {
+ pr_err("error! Truncated or inaccessible header in microcode data file\n");
+ goto fail;
+ }
+
+ mc_size = get_totalsize(&mc_header);
+ if (mc_size < sizeof(mc_header)) {
+ pr_err("error! Bad data in microcode data file (totalsize too small)\n");
+ goto fail;
+ }
+ data_size = mc_size - sizeof(mc_header);
+ if (data_size > iov_iter_count(iter)) {
+ pr_err("error! Bad data in microcode data file (truncated file?)\n");
+ goto fail;
+ }
+
+ /* For performance reasons, reuse mc area when possible */
+ if (!mc || mc_size > curr_mc_size) {
+ kvfree(mc);
+ mc = kvmalloc(mc_size, GFP_KERNEL);
+ if (!mc)
+ goto fail;
+ curr_mc_size = mc_size;
+ }
+
+ memcpy(mc, &mc_header, sizeof(mc_header));
+ data = mc + sizeof(mc_header);
+ if (!copy_from_iter_full(data, data_size, iter) ||
+ intel_microcode_sanity_check(mc, true, MC_HEADER_TYPE_MICROCODE) < 0)
+ goto fail;
+
+ if (cur_rev >= mc_header.rev)
+ continue;
+
+ if (!intel_find_matching_signature(mc, &uci->cpu_sig))
+ continue;
+
+ is_safe = ucode_validate_minrev(&mc_header);
+ if (force_minrev && !is_safe)
+ continue;
+
+ kvfree(new_mc);
+ cur_rev = mc_header.rev;
+ new_mc = mc;
+ new_is_safe = is_safe;
+ mc = NULL;
+ }
+
+ if (iov_iter_count(iter))
+ goto fail;
+
+ kvfree(mc);
+ if (!new_mc)
+ return UCODE_NFOUND;
+
+ ucode_patch_late = (struct microcode_intel *)new_mc;
+ return new_is_safe ? UCODE_NEW_SAFE : UCODE_NEW;
+
+fail:
+ kvfree(mc);
+ kvfree(new_mc);
+ return UCODE_ERROR;
+}
+
+static bool is_blacklisted(unsigned int cpu)
+{
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ /*
+ * Late loading on model 79 with microcode revision less than 0x0b000021
+ * and LLC size per core bigger than 2.5MB may result in a system hang.
+ * This behavior is documented in item BDX90, #334165 (Intel Xeon
+ * Processor E7-8800/4800 v4 Product Family).
+ */
+ if (c->x86_vfm == INTEL_BROADWELL_X &&
+ c->x86_stepping == 0x01 &&
+ llc_size_per_core > 2621440 &&
+ c->microcode < 0x0b000021) {
+ pr_err_once("Erratum BDX90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode);
+ pr_err_once("Please consider either early loading through initrd/built-in or a potential BIOS update.\n");
+ return true;
+ }
+
+ return false;
+}
+
+static enum ucode_state request_microcode_fw(int cpu, struct device *device)
+{
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+ const struct firmware *firmware;
+ struct iov_iter iter;
+ enum ucode_state ret;
+ struct kvec kvec;
+ char name[30];
+
+ if (is_blacklisted(cpu))
+ return UCODE_NFOUND;
+
+ sprintf(name, "intel-ucode/%02x-%02x-%02x",
+ c->x86, c->x86_model, c->x86_stepping);
+
+ if (request_firmware_direct(&firmware, name, device)) {
+ pr_debug("data file %s load failed\n", name);
+ return UCODE_NFOUND;
+ }
+
+ kvec.iov_base = (void *)firmware->data;
+ kvec.iov_len = firmware->size;
+ iov_iter_kvec(&iter, ITER_SOURCE, &kvec, 1, firmware->size);
+ ret = parse_microcode_blobs(cpu, &iter);
+
+ release_firmware(firmware);
+
+ return ret;
+}
+
+static void finalize_late_load(int result)
+{
+ if (!result)
+ update_ucode_pointer(ucode_patch_late);
+ else
+ kvfree(ucode_patch_late);
+ ucode_patch_late = NULL;
+}
+
+static struct microcode_ops microcode_intel_ops = {
+ .request_microcode_fw = request_microcode_fw,
+ .collect_cpu_info = collect_cpu_info,
+ .apply_microcode = apply_microcode_late,
+ .finalize_late_load = finalize_late_load,
+ .stage_microcode = stage_microcode,
+ .use_nmi = IS_ENABLED(CONFIG_X86_64),
+};
+
+static __init void calc_llc_size_per_core(struct cpuinfo_x86 *c)
+{
+ u64 llc_size = c->x86_cache_size * 1024ULL;
+
+ do_div(llc_size, topology_num_cores_per_package());
+ llc_size_per_core = (unsigned int)llc_size;
+}
+
+static __init bool staging_available(void)
+{
+ u64 val;
+
+ val = x86_read_arch_cap_msr();
+ if (!(val & ARCH_CAP_MCU_ENUM))
+ return false;
+
+ rdmsrq(MSR_IA32_MCU_ENUMERATION, val);
+ return !!(val & MCU_STAGING);
+}
+
+struct microcode_ops * __init init_intel_microcode(void)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
+ cpu_has(c, X86_FEATURE_IA64)) {
+ pr_err("Intel CPU family 0x%x not supported\n", c->x86);
+ return NULL;
+ }
+
+ if (staging_available()) {
+ microcode_intel_ops.use_staging = true;
+ pr_info("Enabled staging feature.\n");
+ }
+
+ calc_llc_size_per_core(c);
+
+ return &microcode_intel_ops;
+}
diff --git a/arch/x86/kernel/cpu/microcode/internal.h b/arch/x86/kernel/cpu/microcode/internal.h
new file mode 100644
index 000000000000..a10b547eda1e
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/internal.h
@@ -0,0 +1,136 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _X86_MICROCODE_INTERNAL_H
+#define _X86_MICROCODE_INTERNAL_H
+
+#include <linux/earlycpio.h>
+#include <linux/initrd.h>
+
+#include <asm/cpu.h>
+#include <asm/microcode.h>
+
+struct device;
+
+enum ucode_state {
+ UCODE_OK = 0,
+ UCODE_NEW,
+ UCODE_NEW_SAFE,
+ UCODE_UPDATED,
+ UCODE_NFOUND,
+ UCODE_ERROR,
+ UCODE_TIMEOUT,
+ UCODE_OFFLINE,
+};
+
+struct microcode_ops {
+ enum ucode_state (*request_microcode_fw)(int cpu, struct device *dev);
+ void (*microcode_fini_cpu)(int cpu);
+
+ /*
+ * The generic 'microcode_core' part guarantees that the callbacks
+ * below run on a target CPU when they are being called.
+ * See also the "Synchronization" section in microcode_core.c.
+ */
+ enum ucode_state (*apply_microcode)(int cpu);
+ void (*stage_microcode)(void);
+ int (*collect_cpu_info)(int cpu, struct cpu_signature *csig);
+ void (*finalize_late_load)(int result);
+ unsigned int nmi_safe : 1,
+ use_nmi : 1,
+ use_staging : 1;
+};
+
+struct early_load_data {
+ u32 old_rev;
+ u32 new_rev;
+};
+
+extern struct early_load_data early_data;
+extern struct ucode_cpu_info ucode_cpu_info[];
+extern u32 microcode_rev[NR_CPUS];
+extern u32 base_rev;
+
+struct cpio_data find_microcode_in_initrd(const char *path);
+
+#define MAX_UCODE_COUNT 128
+
+#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24))
+#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u')
+#define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I')
+#define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l')
+#define CPUID_AMD1 QCHAR('A', 'u', 't', 'h')
+#define CPUID_AMD2 QCHAR('e', 'n', 't', 'i')
+#define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D')
+
+#define CPUID_IS(a, b, c, ebx, ecx, edx) \
+ (!(((ebx) ^ (a)) | ((edx) ^ (b)) | ((ecx) ^ (c))))
+
+/*
+ * In early loading microcode phase on BSP, boot_cpu_data is not set up yet.
+ * x86_cpuid_vendor() gets vendor id for BSP.
+ *
+ * In 32 bit AP case, accessing boot_cpu_data needs linear address. To simplify
+ * coding, we still use x86_cpuid_vendor() to get vendor id for AP.
+ *
+ * x86_cpuid_vendor() gets vendor information directly from CPUID.
+ */
+static inline int x86_cpuid_vendor(void)
+{
+ u32 eax = 0x00000000;
+ u32 ebx, ecx = 0, edx;
+
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+
+ if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx))
+ return X86_VENDOR_INTEL;
+
+ if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx))
+ return X86_VENDOR_AMD;
+
+ return X86_VENDOR_UNKNOWN;
+}
+
+static inline unsigned int x86_cpuid_family(void)
+{
+ u32 eax = 0x00000001;
+ u32 ebx, ecx = 0, edx;
+
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+
+ return x86_family(eax);
+}
+
+extern bool force_minrev;
+
+#ifdef CONFIG_CPU_SUP_AMD
+void load_ucode_amd_bsp(struct early_load_data *ed, unsigned int family);
+void load_ucode_amd_ap(unsigned int family);
+void reload_ucode_amd(unsigned int cpu);
+struct microcode_ops *init_amd_microcode(void);
+void exit_amd_microcode(void);
+#else /* CONFIG_CPU_SUP_AMD */
+static inline void load_ucode_amd_bsp(struct early_load_data *ed, unsigned int family) { }
+static inline void load_ucode_amd_ap(unsigned int family) { }
+static inline void reload_ucode_amd(unsigned int cpu) { }
+static inline struct microcode_ops *init_amd_microcode(void) { return NULL; }
+static inline void exit_amd_microcode(void) { }
+#endif /* !CONFIG_CPU_SUP_AMD */
+
+#ifdef CONFIG_CPU_SUP_INTEL
+void load_ucode_intel_bsp(struct early_load_data *ed);
+void load_ucode_intel_ap(void);
+void reload_ucode_intel(void);
+struct microcode_ops *init_intel_microcode(void);
+#else /* CONFIG_CPU_SUP_INTEL */
+static inline void load_ucode_intel_bsp(struct early_load_data *ed) { }
+static inline void load_ucode_intel_ap(void) { }
+static inline void reload_ucode_intel(void) { }
+static inline struct microcode_ops *init_intel_microcode(void) { return NULL; }
+#endif /* !CONFIG_CPU_SUP_INTEL */
+
+#define ucode_dbg(fmt, ...) \
+({ \
+ if (IS_ENABLED(CONFIG_MICROCODE_DBG)) \
+ pr_info(fmt, ##__VA_ARGS__); \
+})
+
+#endif /* _X86_MICROCODE_INTERNAL_H */
diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh
index 2bf616505499..68f537347466 100644
--- a/arch/x86/kernel/cpu/mkcapflags.sh
+++ b/arch/x86/kernel/cpu/mkcapflags.sh
@@ -1,23 +1,28 @@
#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
#
-# Generate the x86_cap_flags[] array from include/asm/cpufeature.h
+# Generate the x86_cap/bug_flags[] arrays from include/asm/cpufeatures.h
#
-IN=$1
-OUT=$2
+set -e
-TABS="$(printf '\t\t\t\t\t')"
-trap 'rm "$OUT"' EXIT
+OUT=$1
-(
- echo "#ifndef _ASM_X86_CPUFEATURE_H"
- echo "#include <asm/cpufeature.h>"
- echo "#endif"
- echo ""
- echo "const char * const x86_cap_flags[NCAPINTS*32] = {"
+dump_array()
+{
+ ARRAY=$1
+ SIZE=$2
+ PFX=$3
+ POSTFIX=$4
+ IN=$5
+
+ PFX_SZ=$(echo $PFX | wc -c)
+ TABS="$(printf '\t\t\t\t\t')"
- # Iterate through any input lines starting with #define X86_FEATURE_
- sed -n -e 's/\t/ /g' -e 's/^ *# *define *X86_FEATURE_//p' $IN |
+ echo "const char * const $ARRAY[$SIZE] = {"
+
+ # Iterate through any input lines starting with #define $PFX
+ sed -n -e 's/\t/ /g' -e "s/^ *# *define *$PFX//p" $IN |
while read i
do
# Name is everything up to the first whitespace
@@ -25,17 +30,44 @@ trap 'rm "$OUT"' EXIT
# If the /* comment */ starts with a quote string, grab that.
VALUE="$(echo "$i" | sed -n 's@.*/\* *\("[^"]*"\).*\*/@\1@p')"
- [ -z "$VALUE" ] && VALUE="\"$NAME\""
- [ "$VALUE" == '""' ] && continue
+ [ ! "$VALUE" ] && continue
# Name is uppercase, VALUE is all lowercase
VALUE="$(echo "$VALUE" | tr A-Z a-z)"
- TABCOUNT=$(( ( 5*8 - 14 - $(echo "$NAME" | wc -c) ) / 8 ))
- printf "\t[%s]%.*s = %s,\n" \
- "X86_FEATURE_$NAME" "$TABCOUNT" "$TABS" "$VALUE"
+ if [ -n "$POSTFIX" ]; then
+ T=$(( $PFX_SZ + $(echo $POSTFIX | wc -c) + 2 ))
+ TABS="$(printf '\t\t\t\t\t\t')"
+ TABCOUNT=$(( ( 6*8 - ($T + 1) - $(echo "$NAME" | wc -c) ) / 8 ))
+ printf "\t[%s - %s]%.*s = %s,\n" "$PFX$NAME" "$POSTFIX" "$TABCOUNT" "$TABS" "$VALUE"
+ else
+ TABCOUNT=$(( ( 5*8 - ($PFX_SZ + 1) - $(echo "$NAME" | wc -c) ) / 8 ))
+ printf "\t[%s]%.*s = %s,\n" "$PFX$NAME" "$TABCOUNT" "$TABS" "$VALUE"
+ fi
done
echo "};"
+}
+
+trap 'rm "$OUT"' EXIT
+
+(
+ echo "#ifndef _ASM_X86_CPUFEATURES_H"
+ echo "#include <asm/cpufeatures.h>"
+ echo "#endif"
+ echo ""
+
+ dump_array "x86_cap_flags" "NCAPINTS*32" "X86_FEATURE_" "" $2
+ echo ""
+
+ dump_array "x86_bug_flags" "NBUGINTS*32" "X86_BUG_" "NCAPINTS*32" $2
+ echo ""
+
+ echo "#ifdef CONFIG_X86_VMX_FEATURE_NAMES"
+ echo "#ifndef _ASM_X86_VMXFEATURES_H"
+ echo "#include <asm/vmxfeatures.h>"
+ echo "#endif"
+ dump_array "x86_vmx_flags" "NVMXINTS*32" "VMX_FEATURE_" "" $3
+ echo "#endif /* CONFIG_X86_VMX_FEATURE_NAMES */"
) > $OUT
trap - EXIT
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 8f4be53ea04b..579fb2c64cfd 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -1,123 +1,771 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* HyperV Detection code.
*
* Copyright (C) 2010, Novell, Inc.
* Author : K. Y. Srinivasan <ksrinivasan@novell.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 of the License.
- *
*/
#include <linux/types.h>
#include <linux/time.h>
#include <linux/clocksource.h>
-#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/export.h>
#include <linux/hardirq.h>
+#include <linux/efi.h>
#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/kexec.h>
+#include <linux/random.h>
#include <asm/processor.h>
#include <asm/hypervisor.h>
-#include <asm/hyperv.h>
+#include <hyperv/hvhdk.h>
#include <asm/mshyperv.h>
#include <asm/desc.h>
-#include <asm/idle.h>
+#include <asm/idtentry.h>
#include <asm/irq_regs.h>
+#include <asm/i8259.h>
+#include <asm/apic.h>
+#include <asm/timer.h>
+#include <asm/reboot.h>
+#include <asm/msr.h>
+#include <asm/nmi.h>
+#include <clocksource/hyperv_timer.h>
+#include <asm/numa.h>
+#include <asm/svm.h>
+/* Is Linux running on nested Microsoft Hypervisor */
+bool hv_nested;
struct ms_hyperv_info ms_hyperv;
-EXPORT_SYMBOL_GPL(ms_hyperv);
-static bool __init ms_hyperv_platform(void)
+#if IS_ENABLED(CONFIG_HYPERV)
+/*
+ * When running with the paravisor, controls proxying the synthetic interrupts
+ * from the host
+ */
+static bool hv_para_sint_proxy;
+
+static inline unsigned int hv_get_nested_msr(unsigned int reg)
+{
+ if (hv_is_sint_msr(reg))
+ return reg - HV_X64_MSR_SINT0 + HV_X64_MSR_NESTED_SINT0;
+
+ switch (reg) {
+ case HV_X64_MSR_SIMP:
+ return HV_X64_MSR_NESTED_SIMP;
+ case HV_X64_MSR_SIEFP:
+ return HV_X64_MSR_NESTED_SIEFP;
+ case HV_X64_MSR_SVERSION:
+ return HV_X64_MSR_NESTED_SVERSION;
+ case HV_X64_MSR_SCONTROL:
+ return HV_X64_MSR_NESTED_SCONTROL;
+ case HV_X64_MSR_EOM:
+ return HV_X64_MSR_NESTED_EOM;
+ default:
+ return reg;
+ }
+}
+
+u64 hv_get_non_nested_msr(unsigned int reg)
+{
+ u64 value;
+
+ if (hv_is_synic_msr(reg) && ms_hyperv.paravisor_present)
+ hv_ivm_msr_read(reg, &value);
+ else
+ rdmsrq(reg, value);
+ return value;
+}
+EXPORT_SYMBOL_GPL(hv_get_non_nested_msr);
+
+void hv_set_non_nested_msr(unsigned int reg, u64 value)
+{
+ if (hv_is_synic_msr(reg) && ms_hyperv.paravisor_present) {
+ /* The hypervisor will get the intercept. */
+ hv_ivm_msr_write(reg, value);
+
+ /* Using wrmsrq so the following goes to the paravisor. */
+ if (hv_is_sint_msr(reg)) {
+ union hv_synic_sint sint = { .as_uint64 = value };
+
+ sint.proxy = hv_para_sint_proxy;
+ native_wrmsrq(reg, sint.as_uint64);
+ }
+ } else {
+ native_wrmsrq(reg, value);
+ }
+}
+EXPORT_SYMBOL_GPL(hv_set_non_nested_msr);
+
+/*
+ * Enable or disable proxying synthetic interrupts
+ * to the paravisor.
+ */
+void hv_para_set_sint_proxy(bool enable)
+{
+ hv_para_sint_proxy = enable;
+}
+
+/*
+ * Get the SynIC register value from the paravisor.
+ */
+u64 hv_para_get_synic_register(unsigned int reg)
+{
+ if (WARN_ON(!ms_hyperv.paravisor_present || !hv_is_synic_msr(reg)))
+ return ~0ULL;
+ return native_read_msr(reg);
+}
+
+/*
+ * Set the SynIC register value with the paravisor.
+ */
+void hv_para_set_synic_register(unsigned int reg, u64 val)
+{
+ if (WARN_ON(!ms_hyperv.paravisor_present || !hv_is_synic_msr(reg)))
+ return;
+ native_write_msr(reg, val);
+}
+
+u64 hv_get_msr(unsigned int reg)
+{
+ if (hv_nested)
+ reg = hv_get_nested_msr(reg);
+
+ return hv_get_non_nested_msr(reg);
+}
+EXPORT_SYMBOL_GPL(hv_get_msr);
+
+void hv_set_msr(unsigned int reg, u64 value)
+{
+ if (hv_nested)
+ reg = hv_get_nested_msr(reg);
+
+ hv_set_non_nested_msr(reg, value);
+}
+EXPORT_SYMBOL_GPL(hv_set_msr);
+
+static void (*mshv_handler)(void);
+static void (*vmbus_handler)(void);
+static void (*hv_stimer0_handler)(void);
+static void (*hv_kexec_handler)(void);
+static void (*hv_crash_handler)(struct pt_regs *regs);
+
+DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_callback)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ inc_irq_stat(irq_hv_callback_count);
+ if (mshv_handler)
+ mshv_handler();
+
+ if (vmbus_handler)
+ vmbus_handler();
+
+ if (ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED)
+ apic_eoi();
+
+ set_irq_regs(old_regs);
+}
+
+void hv_setup_mshv_handler(void (*handler)(void))
+{
+ mshv_handler = handler;
+}
+
+void hv_setup_vmbus_handler(void (*handler)(void))
+{
+ vmbus_handler = handler;
+}
+
+void hv_remove_vmbus_handler(void)
+{
+ /* We have no way to deallocate the interrupt gate */
+ vmbus_handler = NULL;
+}
+
+/*
+ * Routines to do per-architecture handling of stimer0
+ * interrupts when in Direct Mode
+ */
+DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_stimer0)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ inc_irq_stat(hyperv_stimer0_count);
+ if (hv_stimer0_handler)
+ hv_stimer0_handler();
+ add_interrupt_randomness(HYPERV_STIMER0_VECTOR);
+ apic_eoi();
+
+ set_irq_regs(old_regs);
+}
+
+/* For x86/x64, override weak placeholders in hyperv_timer.c */
+void hv_setup_stimer0_handler(void (*handler)(void))
+{
+ hv_stimer0_handler = handler;
+}
+
+void hv_remove_stimer0_handler(void)
+{
+ /* We have no way to deallocate the interrupt gate */
+ hv_stimer0_handler = NULL;
+}
+
+void hv_setup_kexec_handler(void (*handler)(void))
+{
+ hv_kexec_handler = handler;
+}
+
+void hv_remove_kexec_handler(void)
+{
+ hv_kexec_handler = NULL;
+}
+
+void hv_setup_crash_handler(void (*handler)(struct pt_regs *regs))
+{
+ hv_crash_handler = handler;
+}
+
+void hv_remove_crash_handler(void)
+{
+ hv_crash_handler = NULL;
+}
+
+#ifdef CONFIG_KEXEC_CORE
+static void hv_machine_shutdown(void)
+{
+ if (kexec_in_progress && hv_kexec_handler)
+ hv_kexec_handler();
+
+ /*
+ * Call hv_cpu_die() on all the CPUs, otherwise later the hypervisor
+ * corrupts the old VP Assist Pages and can crash the kexec kernel.
+ */
+ if (kexec_in_progress)
+ cpuhp_remove_state(CPUHP_AP_HYPERV_ONLINE);
+
+ /* The function calls stop_other_cpus(). */
+ native_machine_shutdown();
+
+ /* Disable the hypercall page when there is only 1 active CPU. */
+ if (kexec_in_progress)
+ hyperv_cleanup();
+}
+#endif /* CONFIG_KEXEC_CORE */
+
+#ifdef CONFIG_CRASH_DUMP
+static void hv_guest_crash_shutdown(struct pt_regs *regs)
+{
+ if (hv_crash_handler)
+ hv_crash_handler(regs);
+
+ /* The function calls crash_smp_send_stop(). */
+ native_machine_crash_shutdown(regs);
+
+ /* Disable the hypercall page when there is only 1 active CPU. */
+ hyperv_cleanup();
+}
+#endif /* CONFIG_CRASH_DUMP */
+
+static u64 hv_ref_counter_at_suspend;
+static void (*old_save_sched_clock_state)(void);
+static void (*old_restore_sched_clock_state)(void);
+
+/*
+ * Hyper-V clock counter resets during hibernation. Save and restore clock
+ * offset during suspend/resume, while also considering the time passed
+ * before suspend. This is to make sure that sched_clock using hv tsc page
+ * based clocksource, proceeds from where it left off during suspend and
+ * it shows correct time for the timestamps of kernel messages after resume.
+ */
+static void save_hv_clock_tsc_state(void)
+{
+ hv_ref_counter_at_suspend = hv_read_reference_counter();
+}
+
+static void restore_hv_clock_tsc_state(void)
+{
+ /*
+ * Adjust the offsets used by hv tsc clocksource to
+ * account for the time spent before hibernation.
+ * adjusted value = reference counter (time) at suspend
+ * - reference counter (time) now.
+ */
+ hv_adj_sched_clock_offset(hv_ref_counter_at_suspend - hv_read_reference_counter());
+}
+
+/*
+ * Functions to override save_sched_clock_state and restore_sched_clock_state
+ * functions of x86_platform. The Hyper-V clock counter is reset during
+ * suspend-resume and the offset used to measure time needs to be
+ * corrected, post resume.
+ */
+static void hv_save_sched_clock_state(void)
+{
+ old_save_sched_clock_state();
+ save_hv_clock_tsc_state();
+}
+
+static void hv_restore_sched_clock_state(void)
+{
+ restore_hv_clock_tsc_state();
+ old_restore_sched_clock_state();
+}
+
+static void __init x86_setup_ops_for_tsc_pg_clock(void)
+{
+ if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE))
+ return;
+
+ old_save_sched_clock_state = x86_platform.save_sched_clock_state;
+ x86_platform.save_sched_clock_state = hv_save_sched_clock_state;
+
+ old_restore_sched_clock_state = x86_platform.restore_sched_clock_state;
+ x86_platform.restore_sched_clock_state = hv_restore_sched_clock_state;
+}
+
+#ifdef CONFIG_X86_64
+DEFINE_STATIC_CALL(hv_hypercall, hv_std_hypercall);
+EXPORT_STATIC_CALL_TRAMP_GPL(hv_hypercall);
+#define hypercall_update(hc) static_call_update(hv_hypercall, hc)
+#endif
+#endif /* CONFIG_HYPERV */
+
+#ifndef hypercall_update
+#define hypercall_update(hc) (void)hc
+#endif
+
+static uint32_t __init ms_hyperv_platform(void)
{
u32 eax;
u32 hyp_signature[3];
if (!boot_cpu_has(X86_FEATURE_HYPERVISOR))
- return false;
+ return 0;
cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS,
&eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]);
- return eax >= HYPERV_CPUID_MIN &&
- eax <= HYPERV_CPUID_MAX &&
- !memcmp("Microsoft Hv", hyp_signature, 12);
+ if (eax < HYPERV_CPUID_MIN || eax > HYPERV_CPUID_MAX ||
+ memcmp("Microsoft Hv", hyp_signature, 12))
+ return 0;
+
+ /* HYPERCALL and VP_INDEX MSRs are mandatory for all features. */
+ eax = cpuid_eax(HYPERV_CPUID_FEATURES);
+ if (!(eax & HV_MSR_HYPERCALL_AVAILABLE)) {
+ pr_warn("x86/hyperv: HYPERCALL MSR not available.\n");
+ return 0;
+ }
+ if (!(eax & HV_MSR_VP_INDEX_AVAILABLE)) {
+ pr_warn("x86/hyperv: VP_INDEX MSR not available.\n");
+ return 0;
+ }
+
+ return HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS;
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+/*
+ * Prior to WS2016 Debug-VM sends NMIs to all CPUs which makes
+ * it difficult to process CHANNELMSG_UNLOAD in case of crash. Handle
+ * unknown NMI on the first CPU which gets it.
+ */
+static int hv_nmi_unknown(unsigned int val, struct pt_regs *regs)
+{
+ static atomic_t nmi_cpu = ATOMIC_INIT(-1);
+ unsigned int old_cpu, this_cpu;
+
+ if (!unknown_nmi_panic)
+ return NMI_DONE;
+
+ old_cpu = -1;
+ this_cpu = raw_smp_processor_id();
+ if (!atomic_try_cmpxchg(&nmi_cpu, &old_cpu, this_cpu))
+ return NMI_HANDLED;
+
+ return NMI_DONE;
+}
+#endif
+
+static unsigned long hv_get_tsc_khz(void)
+{
+ unsigned long freq;
+
+ rdmsrq(HV_X64_MSR_TSC_FREQUENCY, freq);
+
+ return freq / 1000;
+}
+
+#if defined(CONFIG_SMP) && IS_ENABLED(CONFIG_HYPERV)
+static void __init hv_smp_prepare_boot_cpu(void)
+{
+ native_smp_prepare_boot_cpu();
+#if defined(CONFIG_X86_64) && defined(CONFIG_PARAVIRT_SPINLOCKS)
+ hv_init_spinlocks();
+#endif
}
-static cycle_t read_hv_clock(struct clocksource *arg)
+static void __init hv_smp_prepare_cpus(unsigned int max_cpus)
{
- cycle_t current_tick;
+#ifdef CONFIG_X86_64
+ int i;
+ int ret;
+#endif
+
+ native_smp_prepare_cpus(max_cpus);
+
/*
- * Read the partition counter to get the current tick count. This count
- * is set to 0 when the partition is created and is incremented in
- * 100 nanosecond units.
+ * Override wakeup_secondary_cpu_64 callback for SEV-SNP
+ * enlightened guest.
*/
- rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick);
- return current_tick;
+ if (!ms_hyperv.paravisor_present && hv_isolation_type_snp()) {
+ apic->wakeup_secondary_cpu_64 = hv_snp_boot_ap;
+ return;
+ }
+
+#ifdef CONFIG_X86_64
+ for_each_present_cpu(i) {
+ if (i == 0)
+ continue;
+ ret = hv_call_add_logical_proc(numa_cpu_node(i), i, cpu_physical_id(i));
+ BUG_ON(ret);
+ }
+
+ for_each_present_cpu(i) {
+ if (i == 0)
+ continue;
+ ret = hv_call_create_vp(numa_cpu_node(i), hv_current_partition_id, i, i);
+ BUG_ON(ret);
+ }
+#endif
}
+#endif
-static struct clocksource hyperv_cs = {
- .name = "hyperv_clocksource",
- .rating = 400, /* use this when running on Hyperv*/
- .read = read_hv_clock,
- .mask = CLOCKSOURCE_MASK(64),
-};
+/*
+ * When a fully enlightened TDX VM runs on Hyper-V, the firmware sets the
+ * HW_REDUCED flag: refer to acpi_tb_create_local_fadt(). Consequently ttyS0
+ * interrupts can't work because request_irq() -> ... -> irq_to_desc() returns
+ * NULL for ttyS0. This happens because mp_config_acpi_legacy_irqs() sees a
+ * nr_legacy_irqs() of 0, so it doesn't initialize the array 'mp_irqs[]', and
+ * later setup_IO_APIC_irqs() -> find_irq_entry() fails to find the legacy irqs
+ * from the array and hence doesn't create the necessary irq description info.
+ *
+ * Clone arch/x86/kernel/acpi/boot.c: acpi_generic_reduced_hw_init() here,
+ * except don't change 'legacy_pic', which keeps its default value
+ * 'default_legacy_pic'. This way, mp_config_acpi_legacy_irqs() sees a non-zero
+ * nr_legacy_irqs() and eventually serial console interrupts works properly.
+ */
+static void __init reduced_hw_init(void)
+{
+ x86_init.timers.timer_init = x86_init_noop;
+ x86_init.irqs.pre_vector_init = x86_init_noop;
+}
+
+int hv_get_hypervisor_version(union hv_hypervisor_version_info *info)
+{
+ unsigned int hv_max_functions;
+
+ hv_max_functions = cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS);
+ if (hv_max_functions < HYPERV_CPUID_VERSION) {
+ pr_err("%s: Could not detect Hyper-V version\n", __func__);
+ return -ENODEV;
+ }
+
+ cpuid(HYPERV_CPUID_VERSION, &info->eax, &info->ebx, &info->ecx, &info->edx);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(hv_get_hypervisor_version);
static void __init ms_hyperv_init_platform(void)
{
+ int hv_max_functions_eax, eax;
+
+#ifdef CONFIG_PARAVIRT
+ pv_info.name = "Hyper-V";
+#endif
+
/*
* Extract the features and hints
*/
ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
+ ms_hyperv.priv_high = cpuid_ebx(HYPERV_CPUID_FEATURES);
+ ms_hyperv.ext_features = cpuid_ecx(HYPERV_CPUID_FEATURES);
+ ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
- printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n",
- ms_hyperv.features, ms_hyperv.hints);
+ hv_max_functions_eax = cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS);
- if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE)
- clocksource_register_hz(&hyperv_cs, NSEC_PER_SEC/100);
-}
+ pr_info("Hyper-V: privilege flags low %#x, high %#x, ext %#x, hints %#x, misc %#x\n",
+ ms_hyperv.features, ms_hyperv.priv_high,
+ ms_hyperv.ext_features, ms_hyperv.hints,
+ ms_hyperv.misc_features);
-const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
- .name = "Microsoft HyperV",
- .detect = ms_hyperv_platform,
- .init_platform = ms_hyperv_init_platform,
-};
-EXPORT_SYMBOL(x86_hyper_ms_hyperv);
+ ms_hyperv.max_vp_index = cpuid_eax(HYPERV_CPUID_IMPLEMENT_LIMITS);
+ ms_hyperv.max_lp_index = cpuid_ebx(HYPERV_CPUID_IMPLEMENT_LIMITS);
+
+ pr_debug("Hyper-V: max %u virtual processors, %u logical processors\n",
+ ms_hyperv.max_vp_index, ms_hyperv.max_lp_index);
+
+ hv_identify_partition_type();
+
+ if (cc_platform_has(CC_ATTR_SNP_SECURE_AVIC))
+ ms_hyperv.hints |= HV_DEPRECATING_AEOI_RECOMMENDED;
+
+ if (ms_hyperv.hints & HV_X64_HYPERV_NESTED) {
+ hv_nested = true;
+ pr_info("Hyper-V: running on a nested hypervisor\n");
+ }
+
+ /*
+ * There is no check against the max function for HYPERV_CPUID_VIRT_STACK_* CPUID
+ * leaves as the hypervisor doesn't handle them. Even a nested root partition (L2
+ * root) will not get them because the nested (L1) hypervisor filters them out.
+ * These are handled through intercept processing by the Windows Hyper-V stack
+ * or the paravisor.
+ */
+ eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_PROPERTIES);
+ ms_hyperv.confidential_vmbus_available =
+ eax & HYPERV_VS_PROPERTIES_EAX_CONFIDENTIAL_VMBUS_AVAILABLE;
+ ms_hyperv.msi_ext_dest_id =
+ eax & HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE;
+
+ if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS &&
+ ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
+ x86_platform.calibrate_tsc = hv_get_tsc_khz;
+ x86_platform.calibrate_cpu = hv_get_tsc_khz;
+ setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
+ }
+
+ if (ms_hyperv.priv_high & HV_ISOLATION) {
+ ms_hyperv.isolation_config_a = cpuid_eax(HYPERV_CPUID_ISOLATION_CONFIG);
+ ms_hyperv.isolation_config_b = cpuid_ebx(HYPERV_CPUID_ISOLATION_CONFIG);
+
+ if (ms_hyperv.shared_gpa_boundary_active)
+ ms_hyperv.shared_gpa_boundary =
+ BIT_ULL(ms_hyperv.shared_gpa_boundary_bits);
+
+ pr_info("Hyper-V: Isolation Config: Group A 0x%x, Group B 0x%x\n",
+ ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b);
+
+
+ if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP) {
+ static_branch_enable(&isolation_type_snp);
+ if (!ms_hyperv.paravisor_present)
+ hypercall_update(hv_snp_hypercall);
+ } else if (hv_get_isolation_type() == HV_ISOLATION_TYPE_TDX) {
+ static_branch_enable(&isolation_type_tdx);
+
+ /* A TDX VM must use x2APIC and doesn't use lazy EOI. */
+ ms_hyperv.hints &= ~HV_X64_APIC_ACCESS_RECOMMENDED;
+
+ if (!ms_hyperv.paravisor_present) {
+ hypercall_update(hv_tdx_hypercall);
+ /*
+ * Mark the Hyper-V TSC page feature as disabled
+ * in a TDX VM without paravisor so that the
+ * Invariant TSC, which is a better clocksource
+ * anyway, is used instead.
+ */
+ ms_hyperv.features &= ~HV_MSR_REFERENCE_TSC_AVAILABLE;
+
+ /*
+ * The Invariant TSC is expected to be available
+ * in a TDX VM without paravisor, but if not,
+ * print a warning message. The slower Hyper-V MSR-based
+ * Ref Counter should end up being the clocksource.
+ */
+ if (!(ms_hyperv.features & HV_ACCESS_TSC_INVARIANT))
+ pr_warn("Hyper-V: Invariant TSC is unavailable\n");
+
+ /* HV_MSR_CRASH_CTL is unsupported. */
+ ms_hyperv.misc_features &= ~HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE;
+
+ /* Don't trust Hyper-V's TLB-flushing hypercalls. */
+ ms_hyperv.hints &= ~HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED;
+
+ x86_init.acpi.reduced_hw_early_init = reduced_hw_init;
+ }
+ }
+ }
+
+ if (hv_max_functions_eax >= HYPERV_CPUID_NESTED_FEATURES) {
+ ms_hyperv.nested_features =
+ cpuid_eax(HYPERV_CPUID_NESTED_FEATURES);
+ pr_info("Hyper-V: Nested features: 0x%x\n",
+ ms_hyperv.nested_features);
+ }
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS &&
+ ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
+ /*
+ * Get the APIC frequency.
+ */
+ u64 hv_lapic_frequency;
+
+ rdmsrq(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency);
+ hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ);
+ lapic_timer_period = hv_lapic_frequency;
+ pr_info("Hyper-V: LAPIC Timer Frequency: %#x\n",
+ lapic_timer_period);
+ }
+
+ register_nmi_handler(NMI_UNKNOWN, hv_nmi_unknown, NMI_FLAG_FIRST,
+ "hv_nmi_unknown");
+#endif
+
+#ifdef CONFIG_X86_IO_APIC
+ no_timer_check = 1;
+#endif
#if IS_ENABLED(CONFIG_HYPERV)
-static int vmbus_irq = -1;
-static irq_handler_t vmbus_isr;
+ if (hv_root_partition())
+ machine_ops.power_off = hv_machine_power_off;
+#if defined(CONFIG_KEXEC_CORE)
+ machine_ops.shutdown = hv_machine_shutdown;
+#endif
+#if defined(CONFIG_CRASH_DUMP)
+ if (!hv_root_partition())
+ machine_ops.crash_shutdown = hv_guest_crash_shutdown;
+#endif
+#endif
+ /*
+ * HV_ACCESS_TSC_INVARIANT is always zero for the root partition. Root
+ * partition doesn't need to write to synthetic MSR to enable invariant
+ * TSC feature. It sees what the hardware provides.
+ */
+ if (ms_hyperv.features & HV_ACCESS_TSC_INVARIANT) {
+ /*
+ * Writing to synthetic MSR 0x40000118 updates/changes the
+ * guest visible CPUIDs. Setting bit 0 of this MSR enables
+ * guests to report invariant TSC feature through CPUID
+ * instruction, CPUID 0x800000007/EDX, bit 8. See code in
+ * early_init_intel() where this bit is examined. The
+ * setting of this MSR bit should happen before init_intel()
+ * is called.
+ */
+ wrmsrq(HV_X64_MSR_TSC_INVARIANT_CONTROL, HV_EXPOSE_INVARIANT_TSC);
+ setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+ }
-void hv_register_vmbus_handler(int irq, irq_handler_t handler)
-{
/*
- * Setup the IDT for hypervisor callback.
+ * Generation 2 instances don't support reading the NMI status from
+ * 0x61 port.
*/
- alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector);
+ if (efi_enabled(EFI_BOOT))
+ x86_platform.get_nmi_reason = hv_get_nmi_reason;
- vmbus_irq = irq;
- vmbus_isr = handler;
-}
+#if IS_ENABLED(CONFIG_HYPERV)
+ if ((hv_get_isolation_type() == HV_ISOLATION_TYPE_VBS) ||
+ ms_hyperv.paravisor_present)
+ hv_vtom_init();
+ /*
+ * Setup the hook to get control post apic initialization.
+ */
+ x86_platform.apic_post_init = hyperv_init;
+ hyperv_setup_mmu_ops();
-void hyperv_vector_handler(struct pt_regs *regs)
-{
- struct pt_regs *old_regs = set_irq_regs(regs);
- struct irq_desc *desc;
+ /* Install system interrupt handler for hypervisor callback */
+ sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_hyperv_callback);
- irq_enter();
- exit_idle();
+ /* Install system interrupt handler for reenlightenment notifications */
+ if (ms_hyperv.features & HV_ACCESS_REENLIGHTENMENT) {
+ sysvec_install(HYPERV_REENLIGHTENMENT_VECTOR, sysvec_hyperv_reenlightenment);
+ }
- desc = irq_to_desc(vmbus_irq);
+ /* Install system interrupt handler for stimer0 */
+ if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE) {
+ sysvec_install(HYPERV_STIMER0_VECTOR, sysvec_hyperv_stimer0);
+ }
- if (desc)
- generic_handle_irq_desc(vmbus_irq, desc);
+# ifdef CONFIG_SMP
+ smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu;
+ if (hv_root_partition() ||
+ (!ms_hyperv.paravisor_present && hv_isolation_type_snp()))
+ smp_ops.smp_prepare_cpus = hv_smp_prepare_cpus;
+# endif
- irq_exit();
- set_irq_regs(old_regs);
+ /*
+ * Hyper-V doesn't provide irq remapping for IO-APIC. To enable x2apic,
+ * set x2apic destination mode to physical mode when x2apic is available
+ * and Hyper-V IOMMU driver makes sure cpus assigned with IO-APIC irqs
+ * have 8-bit APIC id.
+ */
+# ifdef CONFIG_X86_X2APIC
+ if (x2apic_supported())
+ x2apic_phys = 1;
+# endif
+
+ /* Register Hyper-V specific clocksource */
+ hv_init_clocksource();
+ x86_setup_ops_for_tsc_pg_clock();
+ hv_vtl_init_platform();
+#endif
+ /*
+ * TSC should be marked as unstable only after Hyper-V
+ * clocksource has been initialized. This ensures that the
+ * stability of the sched_clock is not altered.
+ *
+ * HV_ACCESS_TSC_INVARIANT is always zero for the root partition. No
+ * need to check for it.
+ */
+ if (!hv_root_partition() &&
+ !(ms_hyperv.features & HV_ACCESS_TSC_INVARIANT))
+ mark_tsc_unstable("running on Hyper-V");
+
+ hardlockup_detector_disable();
}
-#else
-void hv_register_vmbus_handler(int irq, irq_handler_t handler)
+
+static bool __init ms_hyperv_x2apic_available(void)
{
+ return x2apic_supported();
+}
+
+/*
+ * If ms_hyperv_msi_ext_dest_id() returns true, hyperv_prepare_irq_remapping()
+ * returns -ENODEV and the Hyper-V IOMMU driver is not used; instead, the
+ * generic support of the 15-bit APIC ID is used: see __irq_msi_compose_msg().
+ *
+ * Note: for a VM on Hyper-V, the I/O-APIC is the only device which
+ * (logically) generates MSIs directly to the system APIC irq domain.
+ * There is no HPET, and PCI MSI/MSI-X interrupts are remapped by the
+ * pci-hyperv host bridge.
+ *
+ * Note: for a Hyper-V root partition, this will always return false.
+ */
+static bool __init ms_hyperv_msi_ext_dest_id(void)
+{
+ return ms_hyperv.msi_ext_dest_id;
+}
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+static void hv_sev_es_hcall_prepare(struct ghcb *ghcb, struct pt_regs *regs)
+{
+ /* RAX and CPL are already in the GHCB */
+ ghcb_set_rcx(ghcb, regs->cx);
+ ghcb_set_rdx(ghcb, regs->dx);
+ ghcb_set_r8(ghcb, regs->r8);
+}
+
+static bool hv_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs)
+{
+ /* No checking of the return state needed */
+ return true;
}
#endif
-EXPORT_SYMBOL_GPL(hv_register_vmbus_handler);
+
+const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
+ .name = "Microsoft Hyper-V",
+ .detect = ms_hyperv_platform,
+ .type = X86_HYPER_MS_HYPERV,
+ .init.x2apic_available = ms_hyperv_x2apic_available,
+ .init.msi_ext_dest_id = ms_hyperv_msi_ext_dest_id,
+ .init.init_platform = ms_hyperv_init_platform,
+ .init.guest_late_init = ms_hyperv_late_init,
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ .runtime.sev_es_hcall_prepare = hv_sev_es_hcall_prepare,
+ .runtime.sev_es_hcall_finish = hv_sev_es_hcall_finish,
+#endif
+};
diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile
index ad9e5ed81181..aee4bc5ad496 100644
--- a/arch/x86/kernel/cpu/mtrr/Makefile
+++ b/arch/x86/kernel/cpu/mtrr/Makefile
@@ -1,3 +1,4 @@
-obj-y := main.o if.o generic.o cleanup.o
-obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
+# SPDX-License-Identifier: GPL-2.0-only
+obj-y := mtrr.o if.o generic.o cleanup.o
+obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o legacy.o
diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c
index 92ba9cd31c9a..ef3e8e42b782 100644
--- a/arch/x86/kernel/cpu/mtrr/amd.c
+++ b/arch/x86/kernel/cpu/mtrr/amd.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/init.h>
#include <linux/mm.h>
#include <asm/mtrr.h>
@@ -108,17 +109,11 @@ amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type)
return 0;
}
-static const struct mtrr_ops amd_mtrr_ops = {
- .vendor = X86_VENDOR_AMD,
+const struct mtrr_ops amd_mtrr_ops = {
+ .var_regs = 2,
.set = amd_set_mtrr,
.get = amd_get_mtrr,
.get_free_region = generic_get_free_region,
.validate_add_page = amd_validate_add_page,
.have_wrcomb = positive_have_wrcomb,
};
-
-int __init amd_init_mtrr(void)
-{
- set_mtrr_ops(&amd_mtrr_ops);
- return 0;
-}
diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c
index 316fe3e60a97..6f6c3ae92943 100644
--- a/arch/x86/kernel/cpu/mtrr/centaur.c
+++ b/arch/x86/kernel/cpu/mtrr/centaur.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/init.h>
#include <linux/mm.h>
@@ -44,15 +45,6 @@ centaur_get_free_region(unsigned long base, unsigned long size, int replace_reg)
return -ENOSPC;
}
-/*
- * Report boot time MCR setups
- */
-void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi)
-{
- centaur_mcr[mcr].low = lo;
- centaur_mcr[mcr].high = hi;
-}
-
static void
centaur_get_mcr(unsigned int reg, unsigned long *base,
unsigned long *size, mtrr_type * type)
@@ -103,24 +95,18 @@ centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int t
*/
if (type != MTRR_TYPE_WRCOMB &&
(centaur_mcr_type == 0 || type != MTRR_TYPE_UNCACHABLE)) {
- pr_warning("mtrr: only write-combining%s supported\n",
+ pr_warn("mtrr: only write-combining%s supported\n",
centaur_mcr_type ? " and uncacheable are" : " is");
return -EINVAL;
}
return 0;
}
-static const struct mtrr_ops centaur_mtrr_ops = {
- .vendor = X86_VENDOR_CENTAUR,
+const struct mtrr_ops centaur_mtrr_ops = {
+ .var_regs = 8,
.set = centaur_set_mcr,
.get = centaur_get_mcr,
.get_free_region = centaur_get_free_region,
.validate_add_page = centaur_validate_add_page,
.have_wrcomb = positive_have_wrcomb,
};
-
-int __init centaur_init_mtrr(void)
-{
- set_mtrr_ops(&centaur_mtrr_ops);
- return 0;
-}
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 5f90b85ff22e..763534d77f59 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -1,23 +1,9 @@
+// SPDX-License-Identifier: LGPL-2.0+
/*
* MTRR (Memory Type Range Register) cleanup
*
* Copyright (C) 2009 Yinghai Lu
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Library General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Library General Public License for more details.
- *
- * You should have received a copy of the GNU Library General Public
- * License along with this library; if not, write to the Free
- * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
-#include <linux/module.h>
#include <linux/init.h>
#include <linux/pci.h>
#include <linux/smp.h>
@@ -28,7 +14,7 @@
#include <linux/range.h>
#include <asm/processor.h>
-#include <asm/e820.h>
+#include <asm/e820/api.h>
#include <asm/mtrr.h>
#include <asm/msr.h>
@@ -56,10 +42,7 @@ static int __initdata nr_range;
static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
-static int __initdata debug_print;
-#define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0)
-
-#define BIOS_BUG_MSG KERN_WARNING \
+#define BIOS_BUG_MSG \
"WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n"
static int __init
@@ -80,12 +63,11 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range,
nr_range = add_range_with_merge(range, RANGE_NUM, nr_range,
base, base + size);
}
- if (debug_print) {
- printk(KERN_DEBUG "After WB checking\n");
- for (i = 0; i < nr_range; i++)
- printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
- range[i].start, range[i].end);
- }
+
+ Dprintk("After WB checking\n");
+ for (i = 0; i < nr_range; i++)
+ Dprintk("MTRR MAP PFN: %016llx - %016llx\n",
+ range[i].start, range[i].end);
/* Take out UC ranges: */
for (i = 0; i < num_var_ranges; i++) {
@@ -98,9 +80,10 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range,
continue;
base = range_state[i].base_pfn;
if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed &&
- (mtrr_state.enabled & 1)) {
+ (mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) &&
+ (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) {
/* Var MTRR contains UC entry below 1M? Skip it: */
- printk(BIOS_BUG_MSG, i);
+ pr_warn(BIOS_BUG_MSG, i);
if (base + size <= (1<<(20-PAGE_SHIFT)))
continue;
size -= (1<<(20-PAGE_SHIFT)) - base;
@@ -112,24 +95,22 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range,
subtract_range(range, RANGE_NUM, extra_remove_base,
extra_remove_base + extra_remove_size);
- if (debug_print) {
- printk(KERN_DEBUG "After UC checking\n");
- for (i = 0; i < RANGE_NUM; i++) {
- if (!range[i].end)
- continue;
- printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
- range[i].start, range[i].end);
- }
+ Dprintk("After UC checking\n");
+ for (i = 0; i < RANGE_NUM; i++) {
+ if (!range[i].end)
+ continue;
+
+ Dprintk("MTRR MAP PFN: %016llx - %016llx\n",
+ range[i].start, range[i].end);
}
/* sort the ranges */
nr_range = clean_sort_range(range, RANGE_NUM);
- if (debug_print) {
- printk(KERN_DEBUG "After sorting\n");
- for (i = 0; i < nr_range; i++)
- printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
- range[i].start, range[i].end);
- }
+
+ Dprintk("After sorting\n");
+ for (i = 0; i < nr_range; i++)
+ Dprintk("MTRR MAP PFN: %016llx - %016llx\n",
+ range[i].start, range[i].end);
return nr_range;
}
@@ -164,16 +145,9 @@ static int __init enable_mtrr_cleanup_setup(char *str)
}
early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
-static int __init mtrr_cleanup_debug_setup(char *str)
-{
- debug_print = 1;
- return 0;
-}
-early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup);
-
static void __init
set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
- unsigned char type, unsigned int address_bits)
+ unsigned char type)
{
u32 base_lo, base_hi, mask_lo, mask_hi;
u64 base, mask;
@@ -183,7 +157,7 @@ set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
return;
}
- mask = (1ULL << address_bits) - 1;
+ mask = (1ULL << boot_cpu_data.x86_phys_bits) - 1;
mask &= ~((((u64)sizek) << 10) - 1);
base = ((u64)basek) << 10;
@@ -209,7 +183,7 @@ save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
range_state[reg].type = type;
}
-static void __init set_var_mtrr_all(unsigned int address_bits)
+static void __init set_var_mtrr_all(void)
{
unsigned long basek, sizek;
unsigned char type;
@@ -220,7 +194,7 @@ static void __init set_var_mtrr_all(unsigned int address_bits)
sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10);
type = range_state[reg].type;
- set_var_mtrr(reg, basek, sizek, type, address_bits);
+ set_var_mtrr(reg, basek, sizek, type);
}
}
@@ -267,7 +241,7 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk,
align = max_align;
sizek = 1UL << align;
- if (debug_print) {
+ if (mtrr_debug) {
char start_factor = 'K', size_factor = 'K';
unsigned long start_base, size_base;
@@ -296,7 +270,7 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
unsigned long sizek)
{
unsigned long hole_basek, hole_sizek;
- unsigned long second_basek, second_sizek;
+ unsigned long second_sizek;
unsigned long range0_basek, range0_sizek;
unsigned long range_basek, range_sizek;
unsigned long chunk_sizek;
@@ -304,7 +278,6 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
hole_basek = 0;
hole_sizek = 0;
- second_basek = 0;
second_sizek = 0;
chunk_sizek = state->chunk_sizek;
gran_sizek = state->gran_sizek;
@@ -435,7 +408,7 @@ set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn,
state->range_sizek = sizek - second_sizek;
}
-/* Mininum size of mtrr block that can take hole: */
+/* Minimum size of mtrr block that can take hole: */
static u64 mtrr_chunk_size __initdata = (256ULL<<20);
static int __init parse_mtrr_chunk_size_opt(char *p)
@@ -538,12 +511,12 @@ static void __init print_out_mtrr_range_state(void)
if (!size_base)
continue;
- size_base = to_size_factor(size_base, &size_factor),
+ size_base = to_size_factor(size_base, &size_factor);
start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
- start_base = to_size_factor(start_base, &start_factor),
+ start_base = to_size_factor(start_base, &start_factor);
type = range_state[i].type;
- printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
+ Dprintk("reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
i, start_base, start_factor,
size_base, size_factor,
(type == MTRR_TYPE_UNCACHABLE) ? "UC" :
@@ -592,9 +565,16 @@ mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
unsigned long x_remove_base,
unsigned long x_remove_size, int i)
{
- static struct range range_new[RANGE_NUM];
+ /*
+ * range_new should really be an automatic variable, but
+ * putting 4096 bytes on the stack is frowned upon, to put it
+ * mildly. It is safe to make it a static __initdata variable,
+ * since mtrr_calc_range_state is only called during init and
+ * there's no way it will call itself recursively.
+ */
+ static struct range range_new[RANGE_NUM] __initdata;
unsigned long range_sums_new;
- static int nr_range_new;
+ int nr_range_new;
int num_reg;
/* Convert ranges to var ranges state: */
@@ -674,7 +654,7 @@ static int __init mtrr_search_optimal_index(void)
return index_good;
}
-int __init mtrr_cleanup(unsigned address_bits)
+int __init mtrr_cleanup(void)
{
unsigned long x_remove_base, x_remove_size;
unsigned long base, size, def, dummy;
@@ -683,7 +663,10 @@ int __init mtrr_cleanup(unsigned address_bits)
int index_good;
int i;
- if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
+ if (!mtrr_enabled())
+ return 0;
+
+ if (!cpu_feature_enabled(X86_FEATURE_MTRR) || enable_mtrr_cleanup < 1)
return 0;
rdmsr(MSR_MTRRdefType, def, dummy);
@@ -705,7 +688,7 @@ int __init mtrr_cleanup(unsigned address_bits)
return 0;
/* Print original var MTRRs at first, for debugging: */
- printk(KERN_DEBUG "original variable MTRRs\n");
+ Dprintk("original variable MTRRs\n");
print_out_mtrr_range_state();
memset(range, 0, sizeof(range));
@@ -725,7 +708,7 @@ int __init mtrr_cleanup(unsigned address_bits)
x_remove_base, x_remove_size);
range_sums = sum_ranges(range, nr_range);
- printk(KERN_INFO "total RAM covered: %ldM\n",
+ pr_info("total RAM covered: %ldM\n",
range_sums >> (20 - PAGE_SHIFT));
if (mtrr_chunk_size && mtrr_gran_size) {
@@ -736,13 +719,12 @@ int __init mtrr_cleanup(unsigned address_bits)
mtrr_print_out_one_result(i);
if (!result[i].bad) {
- set_var_mtrr_all(address_bits);
- printk(KERN_DEBUG "New variable MTRRs\n");
+ set_var_mtrr_all();
+ Dprintk("New variable MTRRs\n");
print_out_mtrr_range_state();
return 1;
}
- printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
- "will find optimal one\n");
+ pr_info("invalid mtrr_gran_size or mtrr_chunk_size, will find optimal one\n");
}
i = 0;
@@ -758,9 +740,9 @@ int __init mtrr_cleanup(unsigned address_bits)
mtrr_calc_range_state(chunk_size, gran_size,
x_remove_base, x_remove_size, i);
- if (debug_print) {
+ if (mtrr_debug) {
mtrr_print_out_one_result(i);
- printk(KERN_INFO "\n");
+ pr_info("\n");
}
i++;
@@ -771,7 +753,7 @@ int __init mtrr_cleanup(unsigned address_bits)
index_good = mtrr_search_optimal_index();
if (index_good != -1) {
- printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
+ pr_info("Found optimal setting for mtrr clean up\n");
i = index_good;
mtrr_print_out_one_result(i);
@@ -781,8 +763,8 @@ int __init mtrr_cleanup(unsigned address_bits)
gran_size = result[i].gran_sizek;
gran_size <<= 10;
x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
- set_var_mtrr_all(address_bits);
- printk(KERN_DEBUG "New variable MTRRs\n");
+ set_var_mtrr_all();
+ Dprintk("New variable MTRRs\n");
print_out_mtrr_range_state();
return 1;
} else {
@@ -791,13 +773,13 @@ int __init mtrr_cleanup(unsigned address_bits)
mtrr_print_out_one_result(i);
}
- printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n");
- printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n");
+ pr_info("mtrr_cleanup: can not find optimal value\n");
+ pr_info("please specify mtrr_gran_size/mtrr_chunk_size\n");
return 0;
}
#else
-int __init mtrr_cleanup(unsigned address_bits)
+int __init mtrr_cleanup(void)
{
return 0;
}
@@ -825,12 +807,13 @@ int __init amd_special_default_mtrr(void)
{
u32 l, h;
- if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
return 0;
if (boot_cpu_data.x86 < 0xf)
return 0;
/* In case some hypervisor doesn't pass SYSCFG through: */
- if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
+ if (rdmsr_safe(MSR_AMD64_SYSCFG, &l, &h) < 0)
return 0;
/*
* Memory between 4GB and top of mem is forced WB by this magic bit.
@@ -854,7 +837,7 @@ real_trim_memory(unsigned long start_pfn, unsigned long limit_pfn)
trim_size <<= PAGE_SHIFT;
trim_size -= trim_start;
- return e820_update_range(trim_start, trim_size, E820_RAM, E820_RESERVED);
+ return e820__range_update(trim_start, trim_size, E820_TYPE_RAM, E820_TYPE_RESERVED);
}
/**
@@ -876,15 +859,18 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
/* extra one for all 0 */
int num[MTRR_NUM_TYPES + 1];
+ if (!mtrr_enabled())
+ return 0;
+
/*
* Make sure we only trim uncachable memory on machines that
* support the Intel MTRR architecture:
*/
- if (!is_cpu(INTEL) || disable_mtrr_trim)
+ if (!cpu_feature_enabled(X86_FEATURE_MTRR) || disable_mtrr_trim)
return 0;
rdmsr(MSR_MTRRdefType, def, dummy);
- def &= 0xff;
+ def &= MTRR_DEF_TYPE_TYPE;
if (def != MTRR_TYPE_UNCACHABLE)
return 0;
@@ -910,7 +896,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
/* kvm/qemu doesn't have mtrr set right, don't trim them all: */
if (!highest_pfn) {
- printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n");
+ pr_info("CPU MTRRs all blank - virtualized system.\n");
return 0;
}
@@ -965,13 +951,14 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
end_pfn);
if (total_trim_size) {
- pr_warning("WARNING: BIOS bug: CPU MTRRs don't cover all of memory, losing %lluMB of RAM.\n", total_trim_size >> 20);
+ pr_warn("WARNING: BIOS bug: CPU MTRRs don't cover all of memory, losing %lluMB of RAM.\n",
+ total_trim_size >> 20);
if (!changed_by_mtrr_cleanup)
WARN_ON(1);
pr_info("update e820 for mtrr\n");
- update_e820();
+ e820__update_table_print();
return 1;
}
diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c
index 9e451b0876b5..238dad57d4d6 100644
--- a/arch/x86/kernel/cpu/mtrr/cyrix.c
+++ b/arch/x86/kernel/cpu/mtrr/cyrix.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/init.h>
#include <linux/io.h>
#include <linux/mm.h>
@@ -97,6 +98,7 @@ cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg)
case 7:
if (size < 0x40)
break;
+ fallthrough;
case 6:
case 5:
case 4:
@@ -137,9 +139,9 @@ static void prepare_set(void)
u32 cr0;
/* Save value of CR4 and clear Page Global Enable (bit 7) */
- if (cpu_has_pge) {
- cr4 = read_cr4();
- write_cr4(cr4 & ~X86_CR4_PGE);
+ if (boot_cpu_has(X86_FEATURE_PGE)) {
+ cr4 = __read_cr4();
+ __write_cr4(cr4 & ~X86_CR4_PGE);
}
/*
@@ -170,8 +172,8 @@ static void post_set(void)
write_cr0(read_cr0() & ~X86_CR0_CD);
/* Restore value of CR4 */
- if (cpu_has_pge)
- write_cr4(cr4);
+ if (boot_cpu_has(X86_FEATURE_PGE))
+ __write_cr4(cr4);
}
static void cyrix_set_arr(unsigned int reg, unsigned long base,
@@ -232,51 +234,11 @@ static void cyrix_set_arr(unsigned int reg, unsigned long base,
post_set();
}
-typedef struct {
- unsigned long base;
- unsigned long size;
- mtrr_type type;
-} arr_state_t;
-
-static arr_state_t arr_state[8] = {
- {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL},
- {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}
-};
-
-static unsigned char ccr_state[7] = { 0, 0, 0, 0, 0, 0, 0 };
-
-static void cyrix_set_all(void)
-{
- int i;
-
- prepare_set();
-
- /* the CCRs are not contiguous */
- for (i = 0; i < 4; i++)
- setCx86(CX86_CCR0 + i, ccr_state[i]);
- for (; i < 7; i++)
- setCx86(CX86_CCR4 + i, ccr_state[i]);
-
- for (i = 0; i < 8; i++) {
- cyrix_set_arr(i, arr_state[i].base,
- arr_state[i].size, arr_state[i].type);
- }
-
- post_set();
-}
-
-static const struct mtrr_ops cyrix_mtrr_ops = {
- .vendor = X86_VENDOR_CYRIX,
- .set_all = cyrix_set_all,
+const struct mtrr_ops cyrix_mtrr_ops = {
+ .var_regs = 8,
.set = cyrix_set_arr,
.get = cyrix_get_arr,
.get_free_region = cyrix_get_free_region,
.validate_add_page = generic_validate_add_page,
.have_wrcomb = positive_have_wrcomb,
};
-
-int __init cyrix_init_mtrr(void)
-{
- set_mtrr_ops(&cyrix_mtrr_ops);
- return 0;
-}
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index d4cdfa67509e..0863733858dc 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -1,20 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* This only handles 32bit MTRR on 32bit hosts. This is strictly wrong
* because MTRRs can span up to 40 bits (36bits on most modern x86)
*/
-#define DEBUG
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/init.h>
#include <linux/io.h>
#include <linux/mm.h>
-
+#include <linux/cc_platform.h>
+#include <linux/string_choices.h>
#include <asm/processor-flags.h>
+#include <asm/cacheinfo.h>
#include <asm/cpufeature.h>
+#include <asm/cpu_device_id.h>
+#include <asm/hypervisor.h>
+#include <asm/mshyperv.h>
#include <asm/tlbflush.h>
#include <asm/mtrr.h>
#include <asm/msr.h>
-#include <asm/pat.h>
+#include <asm/memtype.h>
#include "mtrr.h"
@@ -30,19 +35,70 @@ static struct fixed_range_block fixed_range_blocks[] = {
{}
};
+struct cache_map {
+ u64 start;
+ u64 end;
+ u64 flags;
+ u64 type:8;
+ u64 fixed:1;
+};
+
+bool mtrr_debug;
+
+static int __init mtrr_param_setup(char *str)
+{
+ int rc = 0;
+
+ if (!str)
+ return -EINVAL;
+ if (!strcmp(str, "debug"))
+ mtrr_debug = true;
+ else
+ rc = -EINVAL;
+
+ return rc;
+}
+early_param("mtrr", mtrr_param_setup);
+
+/*
+ * CACHE_MAP_MAX is the maximum number of memory ranges in cache_map, where
+ * no 2 adjacent ranges have the same cache mode (those would be merged).
+ * The number is based on the worst case:
+ * - no two adjacent fixed MTRRs share the same cache mode
+ * - one variable MTRR is spanning a huge area with mode WB
+ * - 255 variable MTRRs with mode UC all overlap with the WB MTRR, creating 2
+ * additional ranges each (result like "ababababa...aba" with a = WB, b = UC),
+ * accounting for MTRR_MAX_VAR_RANGES * 2 - 1 range entries
+ * - a TOP_MEM2 area (even with overlapping an UC MTRR can't add 2 range entries
+ * to the possible maximum, as it always starts at 4GB, thus it can't be in
+ * the middle of that MTRR, unless that MTRR starts at 0, which would remove
+ * the initial "a" from the "abababa" pattern above)
+ * The map won't contain ranges with no matching MTRR (those fall back to the
+ * default cache mode).
+ */
+#define CACHE_MAP_MAX (MTRR_NUM_FIXED_RANGES + MTRR_MAX_VAR_RANGES * 2)
+
+static struct cache_map init_cache_map[CACHE_MAP_MAX] __initdata;
+static struct cache_map *cache_map __refdata = init_cache_map;
+static unsigned int cache_map_size = CACHE_MAP_MAX;
+static unsigned int cache_map_n;
+static unsigned int cache_map_fixed;
+
static unsigned long smp_changes_mask;
static int mtrr_state_set;
u64 mtrr_tom2;
struct mtrr_state_type mtrr_state;
-EXPORT_SYMBOL_GPL(mtrr_state);
+
+/* Reserved bits in the high portion of the MTRRphysBaseN MSR. */
+u32 phys_hi_rsvd;
/*
* BIOS is expected to clear MtrrFixDramModEn bit, see for example
* "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD
* Opteron Processors" (26094 Rev. 3.30 February 2006), section
* "13.2.1.2 SYSCFG Register": "The MtrrFixDramModEn bit should be set
- * to 1 during BIOS initalization of the fixed MTRRs, then cleared to
+ * to 1 during BIOS initialization of the fixed MTRRs, then cleared to
* 0 for operation."
*/
static inline void k8_check_syscfg_dram_mod_en(void)
@@ -53,13 +109,16 @@ static inline void k8_check_syscfg_dram_mod_en(void)
(boot_cpu_data.x86 >= 0x0f)))
return;
- rdmsr(MSR_K8_SYSCFG, lo, hi);
+ if (cc_platform_has(CC_ATTR_HOST_SEV_SNP))
+ return;
+
+ rdmsr(MSR_AMD64_SYSCFG, lo, hi);
if (lo & K8_MTRRFIXRANGE_DRAM_MODIFY) {
- printk(KERN_ERR FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]"
+ pr_err(FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]"
" not cleared by BIOS, clearing this bit\n",
smp_processor_id());
lo &= ~K8_MTRRFIXRANGE_DRAM_MODIFY;
- mtrr_wrmsr(MSR_K8_SYSCFG, lo, hi);
+ mtrr_wrmsr(MSR_AMD64_SYSCFG, lo, hi);
}
}
@@ -68,184 +127,429 @@ static u64 get_mtrr_size(u64 mask)
{
u64 size;
- mask >>= PAGE_SHIFT;
- mask |= size_or_mask;
+ mask |= (u64)phys_hi_rsvd << 32;
size = -mask;
- size <<= PAGE_SHIFT;
+
return size;
}
+static u8 get_var_mtrr_state(unsigned int reg, u64 *start, u64 *size)
+{
+ struct mtrr_var_range *mtrr = mtrr_state.var_ranges + reg;
+
+ if (!(mtrr->mask_lo & MTRR_PHYSMASK_V))
+ return MTRR_TYPE_INVALID;
+
+ *start = (((u64)mtrr->base_hi) << 32) + (mtrr->base_lo & PAGE_MASK);
+ *size = get_mtrr_size((((u64)mtrr->mask_hi) << 32) +
+ (mtrr->mask_lo & PAGE_MASK));
+
+ return mtrr->base_lo & MTRR_PHYSBASE_TYPE;
+}
+
+static u8 get_effective_type(u8 type1, u8 type2)
+{
+ if (type1 == MTRR_TYPE_UNCACHABLE || type2 == MTRR_TYPE_UNCACHABLE)
+ return MTRR_TYPE_UNCACHABLE;
+
+ if ((type1 == MTRR_TYPE_WRBACK && type2 == MTRR_TYPE_WRTHROUGH) ||
+ (type1 == MTRR_TYPE_WRTHROUGH && type2 == MTRR_TYPE_WRBACK))
+ return MTRR_TYPE_WRTHROUGH;
+
+ if (type1 != type2)
+ return MTRR_TYPE_UNCACHABLE;
+
+ return type1;
+}
+
+static void rm_map_entry_at(int idx)
+{
+ cache_map_n--;
+ if (cache_map_n > idx) {
+ memmove(cache_map + idx, cache_map + idx + 1,
+ sizeof(*cache_map) * (cache_map_n - idx));
+ }
+}
+
/*
- * Check and return the effective type for MTRR-MTRR type overlap.
- * Returns 1 if the effective type is UNCACHEABLE, else returns 0
+ * Add an entry into cache_map at a specific index. Merges adjacent entries if
+ * appropriate. Return the number of merges for correcting the scan index
+ * (this is needed as merging will reduce the number of entries, which will
+ * result in skipping entries in future iterations if the scan index isn't
+ * corrected).
+ * Note that the corrected index can never go below -1 (resulting in being 0 in
+ * the next scan iteration), as "2" is returned only if the current index is
+ * larger than zero.
*/
-static int check_type_overlap(u8 *prev, u8 *curr)
+static int add_map_entry_at(u64 start, u64 end, u8 type, int idx)
{
- if (*prev == MTRR_TYPE_UNCACHABLE || *curr == MTRR_TYPE_UNCACHABLE) {
- *prev = MTRR_TYPE_UNCACHABLE;
- *curr = MTRR_TYPE_UNCACHABLE;
- return 1;
+ bool merge_prev = false, merge_next = false;
+
+ if (start >= end)
+ return 0;
+
+ if (idx > 0) {
+ struct cache_map *prev = cache_map + idx - 1;
+
+ if (!prev->fixed && start == prev->end && type == prev->type)
+ merge_prev = true;
}
- if ((*prev == MTRR_TYPE_WRBACK && *curr == MTRR_TYPE_WRTHROUGH) ||
- (*prev == MTRR_TYPE_WRTHROUGH && *curr == MTRR_TYPE_WRBACK)) {
- *prev = MTRR_TYPE_WRTHROUGH;
- *curr = MTRR_TYPE_WRTHROUGH;
+ if (idx < cache_map_n) {
+ struct cache_map *next = cache_map + idx;
+
+ if (!next->fixed && end == next->start && type == next->type)
+ merge_next = true;
}
- if (*prev != *curr) {
- *prev = MTRR_TYPE_UNCACHABLE;
- *curr = MTRR_TYPE_UNCACHABLE;
+ if (merge_prev && merge_next) {
+ cache_map[idx - 1].end = cache_map[idx].end;
+ rm_map_entry_at(idx);
+ return 2;
+ }
+ if (merge_prev) {
+ cache_map[idx - 1].end = end;
+ return 1;
+ }
+ if (merge_next) {
+ cache_map[idx].start = start;
return 1;
}
+ /* Sanity check: the array should NEVER be too small! */
+ if (cache_map_n == cache_map_size) {
+ WARN(1, "MTRR cache mode memory map exhausted!\n");
+ cache_map_n = cache_map_fixed;
+ return 0;
+ }
+
+ if (cache_map_n > idx) {
+ memmove(cache_map + idx + 1, cache_map + idx,
+ sizeof(*cache_map) * (cache_map_n - idx));
+ }
+
+ cache_map[idx].start = start;
+ cache_map[idx].end = end;
+ cache_map[idx].type = type;
+ cache_map[idx].fixed = 0;
+ cache_map_n++;
+
return 0;
}
+/* Clear a part of an entry. Return 1 if start of entry is still valid. */
+static int clr_map_range_at(u64 start, u64 end, int idx)
+{
+ int ret = start != cache_map[idx].start;
+ u64 tmp;
+
+ if (start == cache_map[idx].start && end == cache_map[idx].end) {
+ rm_map_entry_at(idx);
+ } else if (start == cache_map[idx].start) {
+ cache_map[idx].start = end;
+ } else if (end == cache_map[idx].end) {
+ cache_map[idx].end = start;
+ } else {
+ tmp = cache_map[idx].end;
+ cache_map[idx].end = start;
+ add_map_entry_at(end, tmp, cache_map[idx].type, idx + 1);
+ }
+
+ return ret;
+}
+
/*
- * Error/Semi-error returns:
- * 0xFF - when MTRR is not enabled
- * *repeat == 1 implies [start:end] spanned across MTRR range and type returned
- * corresponds only to [start:*partial_end].
- * Caller has to lookup again for [*partial_end:end].
+ * Add MTRR to the map. The current map is scanned and each part of the MTRR
+ * either overlapping with an existing entry or with a hole in the map is
+ * handled separately.
*/
-static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat)
+static void add_map_entry(u64 start, u64 end, u8 type)
{
+ u8 new_type, old_type;
+ u64 tmp;
int i;
- u64 base, mask;
- u8 prev_match, curr_match;
-
- *repeat = 0;
- if (!mtrr_state_set)
- return 0xFF;
-
- if (!mtrr_state.enabled)
- return 0xFF;
-
- /* Make end inclusive end, instead of exclusive */
- end--;
-
- /* Look in fixed ranges. Just return the type as per start */
- if (mtrr_state.have_fixed && (start < 0x100000)) {
- int idx;
-
- if (start < 0x80000) {
- idx = 0;
- idx += (start >> 16);
- return mtrr_state.fixed_ranges[idx];
- } else if (start < 0xC0000) {
- idx = 1 * 8;
- idx += ((start - 0x80000) >> 14);
- return mtrr_state.fixed_ranges[idx];
- } else if (start < 0x1000000) {
- idx = 3 * 8;
- idx += ((start - 0xC0000) >> 12);
- return mtrr_state.fixed_ranges[idx];
+
+ for (i = 0; i < cache_map_n && start < end; i++) {
+ if (start >= cache_map[i].end)
+ continue;
+
+ if (start < cache_map[i].start) {
+ /* Region start has no overlap. */
+ tmp = min(end, cache_map[i].start);
+ i -= add_map_entry_at(start, tmp, type, i);
+ start = tmp;
+ continue;
+ }
+
+ new_type = get_effective_type(type, cache_map[i].type);
+ old_type = cache_map[i].type;
+
+ if (cache_map[i].fixed || new_type == old_type) {
+ /* Cut off start of new entry. */
+ start = cache_map[i].end;
+ continue;
}
+
+ /* Handle only overlapping part of region. */
+ tmp = min(end, cache_map[i].end);
+ i += clr_map_range_at(start, tmp, i);
+ i -= add_map_entry_at(start, tmp, new_type, i);
+ start = tmp;
}
+ /* Add rest of region after last map entry (rest might be empty). */
+ add_map_entry_at(start, end, type, i);
+}
+
+/* Add variable MTRRs to cache map. */
+static void map_add_var(void)
+{
+ u64 start, size;
+ unsigned int i;
+ u8 type;
+
/*
- * Look in variable ranges
- * Look of multiple ranges matching this address and pick type
- * as per MTRR precedence
+ * Add AMD TOP_MEM2 area. Can't be added in mtrr_build_map(), as it
+ * needs to be added again when rebuilding the map due to potentially
+ * having moved as a result of variable MTRRs for memory below 4GB.
*/
- if (!(mtrr_state.enabled & 2))
- return mtrr_state.def_type;
+ if (mtrr_tom2) {
+ add_map_entry(BIT_ULL(32), mtrr_tom2, MTRR_TYPE_WRBACK);
+ cache_map[cache_map_n - 1].fixed = 1;
+ }
- prev_match = 0xFF;
- for (i = 0; i < num_var_ranges; ++i) {
- unsigned short start_state, end_state;
+ for (i = 0; i < num_var_ranges; i++) {
+ type = get_var_mtrr_state(i, &start, &size);
+ if (type != MTRR_TYPE_INVALID)
+ add_map_entry(start, start + size, type);
+ }
+}
- if (!(mtrr_state.var_ranges[i].mask_lo & (1 << 11)))
- continue;
+/*
+ * Rebuild map by replacing variable entries. Needs to be called when MTRR
+ * registers are being changed after boot, as such changes could include
+ * removals of registers, which are complicated to handle without rebuild of
+ * the map.
+ */
+void generic_rebuild_map(void)
+{
+ if (mtrr_if != &generic_mtrr_ops)
+ return;
- base = (((u64)mtrr_state.var_ranges[i].base_hi) << 32) +
- (mtrr_state.var_ranges[i].base_lo & PAGE_MASK);
- mask = (((u64)mtrr_state.var_ranges[i].mask_hi) << 32) +
- (mtrr_state.var_ranges[i].mask_lo & PAGE_MASK);
-
- start_state = ((start & mask) == (base & mask));
- end_state = ((end & mask) == (base & mask));
-
- if (start_state != end_state) {
- /*
- * We have start:end spanning across an MTRR.
- * We split the region into
- * either
- * (start:mtrr_end) (mtrr_end:end)
- * or
- * (start:mtrr_start) (mtrr_start:end)
- * depending on kind of overlap.
- * Return the type for first region and a pointer to
- * the start of second region so that caller will
- * lookup again on the second region.
- * Note: This way we handle multiple overlaps as well.
- */
- if (start_state)
- *partial_end = base + get_mtrr_size(mask);
- else
- *partial_end = base;
-
- if (unlikely(*partial_end <= start)) {
- WARN_ON(1);
- *partial_end = start + PAGE_SIZE;
- }
+ cache_map_n = cache_map_fixed;
- end = *partial_end - 1; /* end is inclusive */
- *repeat = 1;
- }
+ map_add_var();
+}
- if ((start & mask) != (base & mask))
- continue;
+static unsigned int __init get_cache_map_size(void)
+{
+ return cache_map_fixed + 2 * num_var_ranges + (mtrr_tom2 != 0);
+}
- curr_match = mtrr_state.var_ranges[i].base_lo & 0xff;
- if (prev_match == 0xFF) {
- prev_match = curr_match;
- continue;
+/* Build the cache_map containing the cache modes per memory range. */
+void __init mtrr_build_map(void)
+{
+ u64 start, end, size;
+ unsigned int i;
+ u8 type;
+
+ /* Add fixed MTRRs, optimize for adjacent entries with same type. */
+ if (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED) {
+ /*
+ * Start with 64k size fixed entries, preset 1st one (hence the
+ * loop below is starting with index 1).
+ */
+ start = 0;
+ end = size = 0x10000;
+ type = mtrr_state.fixed_ranges[0];
+
+ for (i = 1; i < MTRR_NUM_FIXED_RANGES; i++) {
+ /* 8 64k entries, then 16 16k ones, rest 4k. */
+ if (i == 8 || i == 24)
+ size >>= 2;
+
+ if (mtrr_state.fixed_ranges[i] != type) {
+ add_map_entry(start, end, type);
+ start = end;
+ type = mtrr_state.fixed_ranges[i];
+ }
+ end += size;
}
+ add_map_entry(start, end, type);
+ }
+
+ /* Mark fixed, they take precedence. */
+ for (i = 0; i < cache_map_n; i++)
+ cache_map[i].fixed = 1;
+ cache_map_fixed = cache_map_n;
+
+ map_add_var();
- if (check_type_overlap(&prev_match, &curr_match))
- return curr_match;
+ pr_info("MTRR map: %u entries (%u fixed + %u variable; max %u), built from %u variable MTRRs\n",
+ cache_map_n, cache_map_fixed, cache_map_n - cache_map_fixed,
+ get_cache_map_size(), num_var_ranges + (mtrr_tom2 != 0));
+
+ if (mtrr_debug) {
+ for (i = 0; i < cache_map_n; i++) {
+ pr_info("%3u: %016llx-%016llx %s\n", i,
+ cache_map[i].start, cache_map[i].end - 1,
+ mtrr_attrib_to_str(cache_map[i].type));
+ }
}
+}
- if (mtrr_tom2) {
- if (start >= (1ULL<<32) && (end < mtrr_tom2))
- return MTRR_TYPE_WRBACK;
+/* Copy the cache_map from __initdata memory to dynamically allocated one. */
+void __init mtrr_copy_map(void)
+{
+ unsigned int new_size = get_cache_map_size();
+
+ if (!mtrr_state.enabled || !new_size) {
+ cache_map = NULL;
+ return;
}
- if (prev_match != 0xFF)
- return prev_match;
+ mutex_lock(&mtrr_mutex);
+
+ cache_map = kcalloc(new_size, sizeof(*cache_map), GFP_KERNEL);
+ if (cache_map) {
+ memmove(cache_map, init_cache_map,
+ cache_map_n * sizeof(*cache_map));
+ cache_map_size = new_size;
+ } else {
+ mtrr_state.enabled = 0;
+ pr_err("MTRRs disabled due to allocation failure for lookup map.\n");
+ }
- return mtrr_state.def_type;
+ mutex_unlock(&mtrr_mutex);
}
-/*
- * Returns the effective MTRR type for the region
- * Error return:
- * 0xFF - when MTRR is not enabled
+/**
+ * guest_force_mtrr_state - set static MTRR state for a guest
+ *
+ * Used to set MTRR state via different means (e.g. with data obtained from
+ * a hypervisor).
+ * Is allowed only for special cases when running virtualized. Must be called
+ * from the x86_init.hyper.init_platform() hook. It can be called only once.
+ * The MTRR state can't be changed afterwards. To ensure that, X86_FEATURE_MTRR
+ * is cleared.
+ *
+ * @var: MTRR variable range array to use
+ * @num_var: length of the @var array
+ * @def_type: default caching type
*/
-u8 mtrr_type_lookup(u64 start, u64 end)
+void guest_force_mtrr_state(struct mtrr_var_range *var, unsigned int num_var,
+ mtrr_type def_type)
{
- u8 type, prev_type;
- int repeat;
- u64 partial_end;
+ unsigned int i;
+
+ /* Only allowed to be called once before mtrr_bp_init(). */
+ if (WARN_ON_ONCE(mtrr_state_set))
+ return;
- type = __mtrr_type_lookup(start, end, &partial_end, &repeat);
+ /* Only allowed when running virtualized. */
+ if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR))
+ return;
/*
- * Common path is with repeat = 0.
- * However, we can have cases where [start:end] spans across some
- * MTRR range. Do repeated lookups for that case here.
+ * Only allowed for special virtualization cases:
+ * - when running as Hyper-V, SEV-SNP guest using vTOM
+ * - when running as Xen PV guest
+ * - when running as SEV-SNP or TDX guest to avoid unnecessary
+ * VMM communication/Virtualization exceptions (#VC, #VE)
*/
- while (repeat) {
- prev_type = type;
- start = partial_end;
- type = __mtrr_type_lookup(start, end, &partial_end, &repeat);
+ if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP) &&
+ !hv_is_isolation_supported() &&
+ !cpu_feature_enabled(X86_FEATURE_XENPV) &&
+ !cpu_feature_enabled(X86_FEATURE_TDX_GUEST))
+ return;
- if (check_type_overlap(&prev_type, &type))
- return type;
+ /* Disable MTRR in order to disable MTRR modifications. */
+ setup_clear_cpu_cap(X86_FEATURE_MTRR);
+
+ if (var) {
+ if (num_var > MTRR_MAX_VAR_RANGES) {
+ pr_warn("Trying to overwrite MTRR state with %u variable entries\n",
+ num_var);
+ num_var = MTRR_MAX_VAR_RANGES;
+ }
+ for (i = 0; i < num_var; i++)
+ mtrr_state.var_ranges[i] = var[i];
+ num_var_ranges = num_var;
}
+ mtrr_state.def_type = def_type;
+ mtrr_state.enabled |= MTRR_STATE_MTRR_ENABLED;
+
+ mtrr_state_set = 1;
+}
+
+static u8 type_merge(u8 type, u8 new_type, u8 *uniform)
+{
+ u8 effective_type;
+
+ if (type == MTRR_TYPE_INVALID)
+ return new_type;
+
+ effective_type = get_effective_type(type, new_type);
+ if (type != effective_type)
+ *uniform = 0;
+
+ return effective_type;
+}
+
+/**
+ * mtrr_type_lookup - look up memory type in MTRR
+ *
+ * @start: Begin of the physical address range
+ * @end: End of the physical address range
+ * @uniform: output argument:
+ * - 1: the returned MTRR type is valid for the whole region
+ * - 0: otherwise
+ *
+ * Return Values:
+ * MTRR_TYPE_(type) - The effective MTRR type for the region
+ * MTRR_TYPE_INVALID - MTRR is disabled
+ */
+u8 mtrr_type_lookup(u64 start, u64 end, u8 *uniform)
+{
+ u8 type = MTRR_TYPE_INVALID;
+ unsigned int i;
+
+ if (!mtrr_state_set) {
+ /* Uniformity is unknown. */
+ *uniform = 0;
+ return MTRR_TYPE_UNCACHABLE;
+ }
+
+ *uniform = 1;
+
+ if (!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED))
+ return MTRR_TYPE_UNCACHABLE;
+
+ for (i = 0; i < cache_map_n && start < end; i++) {
+ /* Region after current map entry? -> continue with next one. */
+ if (start >= cache_map[i].end)
+ continue;
+
+ /* Start of region not covered by current map entry? */
+ if (start < cache_map[i].start) {
+ /* At least some part of region has default type. */
+ type = type_merge(type, mtrr_state.def_type, uniform);
+ /* End of region not covered, too? -> lookup done. */
+ if (end <= cache_map[i].start)
+ return type;
+ }
+
+ /* At least part of region covered by map entry. */
+ type = type_merge(type, cache_map[i].type, uniform);
+
+ start = cache_map[i].end;
+ }
+
+ /* End of region past last entry in map? -> use default type. */
+ if (start < end)
+ type = type_merge(type, mtrr_state.def_type, uniform);
+
return type;
}
@@ -288,7 +592,7 @@ static void get_fixed_ranges(mtrr_type *frs)
void mtrr_save_fixed_ranges(void *info)
{
- if (cpu_has_mtrr)
+ if (mtrr_state.have_fixed)
get_fixed_ranges(mtrr_state.fixed_ranges);
}
@@ -301,8 +605,8 @@ static void __init print_fixed_last(void)
if (!last_fixed_end)
return;
- pr_debug(" %05X-%05X %s\n", last_fixed_start,
- last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type));
+ pr_info(" %05X-%05X %s\n", last_fixed_start,
+ last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type));
last_fixed_end = 0;
}
@@ -335,19 +639,18 @@ print_fixed(unsigned base, unsigned step, const mtrr_type *types)
}
}
-static void prepare_set(void);
-static void post_set(void);
-
static void __init print_mtrr_state(void)
{
unsigned int i;
int high_width;
- pr_debug("MTRR default type: %s\n",
- mtrr_attrib_to_str(mtrr_state.def_type));
+ pr_info("MTRR default type: %s\n",
+ mtrr_attrib_to_str(mtrr_state.def_type));
if (mtrr_state.have_fixed) {
- pr_debug("MTRR fixed ranges %sabled:\n",
- mtrr_state.enabled & 1 ? "en" : "dis");
+ pr_info("MTRR fixed ranges %s:\n",
+ str_enabled_disabled(
+ (mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) &&
+ (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)));
print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
for (i = 0; i < 2; ++i)
print_fixed(0x80000 + i * 0x20000, 0x04000,
@@ -359,40 +662,40 @@ static void __init print_mtrr_state(void)
/* tail */
print_fixed_last();
}
- pr_debug("MTRR variable ranges %sabled:\n",
- mtrr_state.enabled & 2 ? "en" : "dis");
- high_width = (__ffs64(size_or_mask) - (32 - PAGE_SHIFT) + 3) / 4;
+ pr_info("MTRR variable ranges %s:\n",
+ str_enabled_disabled(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED));
+ high_width = (boot_cpu_data.x86_phys_bits - (32 - PAGE_SHIFT) + 3) / 4;
for (i = 0; i < num_var_ranges; ++i) {
- if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
- pr_debug(" %u base %0*X%05X000 mask %0*X%05X000 %s\n",
- i,
- high_width,
- mtrr_state.var_ranges[i].base_hi,
- mtrr_state.var_ranges[i].base_lo >> 12,
- high_width,
- mtrr_state.var_ranges[i].mask_hi,
- mtrr_state.var_ranges[i].mask_lo >> 12,
- mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff));
+ if (mtrr_state.var_ranges[i].mask_lo & MTRR_PHYSMASK_V)
+ pr_info(" %u base %0*X%05X000 mask %0*X%05X000 %s\n",
+ i,
+ high_width,
+ mtrr_state.var_ranges[i].base_hi,
+ mtrr_state.var_ranges[i].base_lo >> 12,
+ high_width,
+ mtrr_state.var_ranges[i].mask_hi,
+ mtrr_state.var_ranges[i].mask_lo >> 12,
+ mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo &
+ MTRR_PHYSBASE_TYPE));
else
- pr_debug(" %u disabled\n", i);
+ pr_info(" %u disabled\n", i);
}
if (mtrr_tom2)
- pr_debug("TOM2: %016llx aka %lldM\n", mtrr_tom2, mtrr_tom2>>20);
+ pr_info("TOM2: %016llx aka %lldM\n", mtrr_tom2, mtrr_tom2>>20);
}
/* Grab all of the MTRR state for this CPU into *state */
-void __init get_mtrr_state(void)
+bool __init get_mtrr_state(void)
{
struct mtrr_var_range *vrs;
- unsigned long flags;
unsigned lo, dummy;
unsigned int i;
vrs = mtrr_state.var_ranges;
rdmsr(MSR_MTRRcap, lo, dummy);
- mtrr_state.have_fixed = (lo >> 8) & 1;
+ mtrr_state.have_fixed = lo & MTRR_CAP_FIX;
for (i = 0; i < num_var_ranges; i++)
get_mtrr_var_range(i, &vrs[i]);
@@ -400,8 +703,8 @@ void __init get_mtrr_state(void)
get_fixed_ranges(mtrr_state.fixed_ranges);
rdmsr(MSR_MTRRdefType, lo, dummy);
- mtrr_state.def_type = (lo & 0xff);
- mtrr_state.enabled = (lo & 0xc00) >> 10;
+ mtrr_state.def_type = lo & MTRR_DEF_TYPE_TYPE;
+ mtrr_state.enabled = (lo & MTRR_DEF_TYPE_ENABLE) >> MTRR_STATE_SHIFT;
if (amd_special_default_mtrr()) {
unsigned low, high;
@@ -414,18 +717,12 @@ void __init get_mtrr_state(void)
mtrr_tom2 &= 0xffffff800000ULL;
}
- print_mtrr_state();
+ if (mtrr_debug)
+ print_mtrr_state();
mtrr_state_set = 1;
- /* PAT setup for BP. We need to go through sync steps here */
- local_irq_save(flags);
- prepare_set();
-
- pat_init();
-
- post_set();
- local_irq_restore(flags);
+ return !!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED);
}
/* Some BIOS's are messed up and don't set all MTRRs the same! */
@@ -436,14 +733,14 @@ void __init mtrr_state_warn(void)
if (!mask)
return;
if (mask & MTRR_CHANGE_MASK_FIXED)
- pr_warning("mtrr: your CPUs had inconsistent fixed MTRR settings\n");
+ pr_warn("mtrr: your CPUs had inconsistent fixed MTRR settings\n");
if (mask & MTRR_CHANGE_MASK_VARIABLE)
- pr_warning("mtrr: your CPUs had inconsistent variable MTRR settings\n");
+ pr_warn("mtrr: your CPUs had inconsistent variable MTRR settings\n");
if (mask & MTRR_CHANGE_MASK_DEFTYPE)
- pr_warning("mtrr: your CPUs had inconsistent MTRRdefType settings\n");
+ pr_warn("mtrr: your CPUs had inconsistent MTRRdefType settings\n");
- printk(KERN_INFO "mtrr: probably your BIOS does not setup all CPUs.\n");
- printk(KERN_INFO "mtrr: corrected configuration.\n");
+ pr_info("mtrr: probably your BIOS does not setup all CPUs.\n");
+ pr_info("mtrr: corrected configuration.\n");
}
/*
@@ -454,8 +751,7 @@ void __init mtrr_state_warn(void)
void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b)
{
if (wrmsr_safe(msr, a, b) < 0) {
- printk(KERN_ERR
- "MTRR: CPU %u: Writing MSR %x to %x:%x failed\n",
+ pr_err("MTRR: CPU %u: Writing MSR %x to %x:%x failed\n",
smp_processor_id(), msr, a, b);
}
}
@@ -522,7 +818,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi);
- if ((mask_lo & 0x800) == 0) {
+ if (!(mask_lo & MTRR_PHYSMASK_V)) {
/* Invalid (i.e. free) range */
*base = 0;
*size = 0;
@@ -533,8 +829,8 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi);
/* Work out the shifted address mask: */
- tmp = (u64)mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT;
- mask = size_or_mask | tmp;
+ tmp = (u64)mask_hi << 32 | (mask_lo & PAGE_MASK);
+ mask = (u64)phys_hi_rsvd << 32 | tmp;
/* Expand tmp with high bits to all 1s: */
hi = fls64(tmp);
@@ -542,7 +838,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
tmp |= ~((1ULL<<(hi - 1)) - 1);
if (tmp != mask) {
- printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n");
+ pr_warn("mtrr: your BIOS has configured an incorrect mask, fixing it.\n");
add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
mask = tmp;
}
@@ -552,9 +848,9 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
* This works correctly if size is a power of two, i.e. a
* contiguous range:
*/
- *size = -mask;
+ *size = -mask >> PAGE_SHIFT;
*base = (u64)base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT;
- *type = base_lo & 0xff;
+ *type = base_lo & MTRR_PHYSBASE_TYPE;
out_put_cpu:
put_cpu();
@@ -592,9 +888,8 @@ static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr)
bool changed = false;
rdmsr(MTRRphysBase_MSR(index), lo, hi);
- if ((vr->base_lo & 0xfffff0ffUL) != (lo & 0xfffff0ffUL)
- || (vr->base_hi & (size_and_mask >> (32 - PAGE_SHIFT))) !=
- (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) {
+ if ((vr->base_lo & ~MTRR_PHYSBASE_RSVD) != (lo & ~MTRR_PHYSBASE_RSVD)
+ || (vr->base_hi & ~phys_hi_rsvd) != (hi & ~phys_hi_rsvd)) {
mtrr_wrmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi);
changed = true;
@@ -602,9 +897,8 @@ static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr)
rdmsr(MTRRphysMask_MSR(index), lo, hi);
- if ((vr->mask_lo & 0xfffff800UL) != (lo & 0xfffff800UL)
- || (vr->mask_hi & (size_and_mask >> (32 - PAGE_SHIFT))) !=
- (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) {
+ if ((vr->mask_lo & ~MTRR_PHYSMASK_RSVD) != (lo & ~MTRR_PHYSMASK_RSVD)
+ || (vr->mask_hi & ~phys_hi_rsvd) != (hi & ~phys_hi_rsvd)) {
mtrr_wrmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi);
changed = true;
}
@@ -616,7 +910,10 @@ static u32 deftype_lo, deftype_hi;
/**
* set_mtrr_state - Set the MTRR state for this CPU.
*
- * NOTE: The CPU must already be in a safe state for MTRR changes.
+ * NOTE: The CPU must already be in a safe state for MTRR changes, including
+ * measures that only a single CPU can be active in set_mtrr_state() in
+ * order to not be subject to races for usage of deftype_lo. This is
+ * accomplished by taking cache_disable_lock.
* RETURNS: 0 if no changes made, else a mask indicating what was changed.
*/
static unsigned long set_mtrr_state(void)
@@ -636,104 +933,46 @@ static unsigned long set_mtrr_state(void)
* Set_mtrr_restore restores the old value of MTRRdefType,
* so to set it we fiddle with the saved value:
*/
- if ((deftype_lo & 0xff) != mtrr_state.def_type
- || ((deftype_lo & 0xc00) >> 10) != mtrr_state.enabled) {
+ if ((deftype_lo & MTRR_DEF_TYPE_TYPE) != mtrr_state.def_type ||
+ ((deftype_lo & MTRR_DEF_TYPE_ENABLE) >> MTRR_STATE_SHIFT) != mtrr_state.enabled) {
- deftype_lo = (deftype_lo & ~0xcff) | mtrr_state.def_type |
- (mtrr_state.enabled << 10);
+ deftype_lo = (deftype_lo & MTRR_DEF_TYPE_DISABLE) |
+ mtrr_state.def_type |
+ (mtrr_state.enabled << MTRR_STATE_SHIFT);
change_mask |= MTRR_CHANGE_MASK_DEFTYPE;
}
return change_mask;
}
-
-static unsigned long cr4;
-static DEFINE_RAW_SPINLOCK(set_atomicity_lock);
-
-/*
- * Since we are disabling the cache don't allow any interrupts,
- * they would run extremely slow and would only increase the pain.
- *
- * The caller must ensure that local interrupts are disabled and
- * are reenabled after post_set() has been called.
- */
-static void prepare_set(void) __acquires(set_atomicity_lock)
+void mtrr_disable(void)
{
- unsigned long cr0;
-
- /*
- * Note that this is not ideal
- * since the cache is only flushed/disabled for this CPU while the
- * MTRRs are changed, but changing this requires more invasive
- * changes to the way the kernel boots
- */
-
- raw_spin_lock(&set_atomicity_lock);
-
- /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */
- cr0 = read_cr0() | X86_CR0_CD;
- write_cr0(cr0);
- wbinvd();
-
- /* Save value of CR4 and clear Page Global Enable (bit 7) */
- if (cpu_has_pge) {
- cr4 = read_cr4();
- write_cr4(cr4 & ~X86_CR4_PGE);
- }
-
- /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
- __flush_tlb();
-
/* Save MTRR state */
rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
/* Disable MTRRs, and set the default type to uncached */
- mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi);
- wbinvd();
+ mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & MTRR_DEF_TYPE_DISABLE, deftype_hi);
}
-static void post_set(void) __releases(set_atomicity_lock)
+void mtrr_enable(void)
{
- /* Flush TLBs (no need to flush caches - they are disabled) */
- __flush_tlb();
-
/* Intel (P6) standard MTRRs */
mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
-
- /* Enable caches */
- write_cr0(read_cr0() & ~X86_CR0_CD);
-
- /* Restore value of CR4 */
- if (cpu_has_pge)
- write_cr4(cr4);
- raw_spin_unlock(&set_atomicity_lock);
}
-static void generic_set_all(void)
+void mtrr_generic_set_state(void)
{
unsigned long mask, count;
- unsigned long flags;
-
- local_irq_save(flags);
- prepare_set();
/* Actually set the state */
mask = set_mtrr_state();
- /* also set PAT */
- pat_init();
-
- post_set();
- local_irq_restore(flags);
-
/* Use the atomic bitops to update the global mask */
- for (count = 0; count < sizeof mask * 8; ++count) {
+ for (count = 0; count < sizeof(mask) * 8; ++count) {
if (mask & 0x01)
set_bit(count, &smp_changes_mask);
mask >>= 1;
}
-
}
/**
@@ -755,7 +994,7 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base,
vr = &mtrr_state.var_ranges[reg];
local_irq_save(flags);
- prepare_set();
+ cache_disable();
if (size == 0) {
/*
@@ -766,15 +1005,15 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base,
memset(vr, 0, sizeof(struct mtrr_var_range));
} else {
vr->base_lo = base << PAGE_SHIFT | type;
- vr->base_hi = (base & size_and_mask) >> (32 - PAGE_SHIFT);
- vr->mask_lo = -size << PAGE_SHIFT | 0x800;
- vr->mask_hi = (-size & size_and_mask) >> (32 - PAGE_SHIFT);
+ vr->base_hi = (base >> (32 - PAGE_SHIFT)) & ~phys_hi_rsvd;
+ vr->mask_lo = -size << PAGE_SHIFT | MTRR_PHYSMASK_V;
+ vr->mask_hi = (-size >> (32 - PAGE_SHIFT)) & ~phys_hi_rsvd;
mtrr_wrmsr(MTRRphysBase_MSR(reg), vr->base_lo, vr->base_hi);
mtrr_wrmsr(MTRRphysMask_MSR(reg), vr->mask_lo, vr->mask_hi);
}
- post_set();
+ cache_enable();
local_irq_restore(flags);
}
@@ -787,17 +1026,16 @@ int generic_validate_add_page(unsigned long base, unsigned long size,
* For Intel PPro stepping <= 7
* must be 4 MiB aligned and not touch 0x70000000 -> 0x7003FFFF
*/
- if (is_cpu(INTEL) && boot_cpu_data.x86 == 6 &&
- boot_cpu_data.x86_model == 1 &&
- boot_cpu_data.x86_mask <= 7) {
+ if (mtrr_if == &generic_mtrr_ops && boot_cpu_data.x86_vfm == INTEL_PENTIUM_PRO &&
+ boot_cpu_data.x86_stepping <= 7) {
if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) {
- pr_warning("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base);
+ pr_warn("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base);
return -EINVAL;
}
if (!(base + size < 0x70000 || base > 0x7003F) &&
(type == MTRR_TYPE_WRCOMB
|| type == MTRR_TYPE_WRBACK)) {
- pr_warning("mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n");
+ pr_warn("mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n");
return -EINVAL;
}
}
@@ -811,7 +1049,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size,
lbase = lbase >> 1, last = last >> 1)
;
if (lbase != last) {
- pr_warning("mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", base, size);
+ pr_warn("mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", base, size);
return -EINVAL;
}
return 0;
@@ -821,7 +1059,7 @@ static int generic_have_wrcomb(void)
{
unsigned long config, dummy;
rdmsr(MSR_MTRRcap, config, dummy);
- return config & (1 << 10);
+ return config & MTRR_CAP_WC;
}
int positive_have_wrcomb(void)
@@ -833,8 +1071,6 @@ int positive_have_wrcomb(void)
* Generic structure...
*/
const struct mtrr_ops generic_mtrr_ops = {
- .use_intel_if = 1,
- .set_all = generic_set_all,
.get = generic_get_mtrr,
.get_free_region = generic_get_free_region,
.set = generic_set_mtrr,
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index a041e094b8b9..4049235b1bfe 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -1,8 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/capability.h>
#include <linux/seq_file.h>
#include <linux/uaccess.h>
#include <linux/proc_fs.h>
-#include <linux/module.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/slab.h>
@@ -43,7 +43,7 @@ mtrr_file_add(unsigned long base, unsigned long size,
max = num_var_ranges;
if (fcount == NULL) {
- fcount = kzalloc(max * sizeof *fcount, GFP_KERNEL);
+ fcount = kcalloc(max, sizeof(*fcount), GFP_KERNEL);
if (!fcount)
return -ENOMEM;
FILE_FCOUNT(file) = fcount;
@@ -99,28 +99,16 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
char *ptr;
char line[LINE_SIZE];
int length;
- size_t linelen;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
memset(line, 0, LINE_SIZE);
- length = len;
- length--;
-
- if (length > LINE_SIZE - 1)
- length = LINE_SIZE - 1;
-
+ len = min_t(size_t, len, LINE_SIZE - 1);
+ length = strncpy_from_user(line, buf, len);
if (length < 0)
- return -EINVAL;
+ return length;
- if (copy_from_user(line, buf, length))
- return -EFAULT;
-
- linelen = strlen(line);
- ptr = line + linelen - 1;
- if (linelen && *ptr == '\n')
+ ptr = line + length - 1;
+ if (length && *ptr == '\n')
*ptr = '\0';
if (!strncmp(line, "disable=", 8)) {
@@ -149,17 +137,16 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
return -EINVAL;
ptr = skip_spaces(ptr + 5);
- for (i = 0; i < MTRR_NUM_TYPES; ++i) {
- if (strcmp(ptr, mtrr_strings[i]))
- continue;
- base >>= PAGE_SHIFT;
- size >>= PAGE_SHIFT;
- err = mtrr_add_page((unsigned long)base, (unsigned long)size, i, true);
- if (err < 0)
- return err;
- return len;
- }
- return -EINVAL;
+ i = match_string(mtrr_strings, MTRR_NUM_TYPES, ptr);
+ if (i < 0)
+ return i;
+
+ base >>= PAGE_SHIFT;
+ size >>= PAGE_SHIFT;
+ err = mtrr_add_page((unsigned long)base, (unsigned long)size, i, true);
+ if (err < 0)
+ return err;
+ return len;
}
static long
@@ -173,6 +160,8 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
struct mtrr_gentry gentry;
void __user *arg = (void __user *) __arg;
+ memset(&gentry, 0, sizeof(gentry));
+
switch (cmd) {
case MTRRIOC_ADD_ENTRY:
case MTRRIOC_SET_ENTRY:
@@ -182,12 +171,12 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
case MTRRIOC_SET_PAGE_ENTRY:
case MTRRIOC_DEL_PAGE_ENTRY:
case MTRRIOC_KILL_PAGE_ENTRY:
- if (copy_from_user(&sentry, arg, sizeof sentry))
+ if (copy_from_user(&sentry, arg, sizeof(sentry)))
return -EFAULT;
break;
case MTRRIOC_GET_ENTRY:
case MTRRIOC_GET_PAGE_ENTRY:
- if (copy_from_user(&gentry, arg, sizeof gentry))
+ if (copy_from_user(&gentry, arg, sizeof(gentry)))
return -EFAULT;
break;
#ifdef CONFIG_COMPAT
@@ -232,8 +221,6 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
#ifdef CONFIG_COMPAT
case MTRRIOC32_ADD_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err =
mtrr_file_add(sentry.base, sentry.size, sentry.type, true,
file, 0);
@@ -242,24 +229,18 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
#ifdef CONFIG_COMPAT
case MTRRIOC32_SET_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err = mtrr_add(sentry.base, sentry.size, sentry.type, false);
break;
case MTRRIOC_DEL_ENTRY:
#ifdef CONFIG_COMPAT
case MTRRIOC32_DEL_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err = mtrr_file_del(sentry.base, sentry.size, file, 0);
break;
case MTRRIOC_KILL_ENTRY:
#ifdef CONFIG_COMPAT
case MTRRIOC32_KILL_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err = mtrr_del(-1, sentry.base, sentry.size);
break;
case MTRRIOC_GET_ENTRY:
@@ -285,8 +266,6 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
#ifdef CONFIG_COMPAT
case MTRRIOC32_ADD_PAGE_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err =
mtrr_file_add(sentry.base, sentry.size, sentry.type, true,
file, 1);
@@ -295,8 +274,6 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
#ifdef CONFIG_COMPAT
case MTRRIOC32_SET_PAGE_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err =
mtrr_add_page(sentry.base, sentry.size, sentry.type, false);
break;
@@ -304,16 +281,12 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
#ifdef CONFIG_COMPAT
case MTRRIOC32_DEL_PAGE_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err = mtrr_file_del(sentry.base, sentry.size, file, 1);
break;
case MTRRIOC_KILL_PAGE_ENTRY:
#ifdef CONFIG_COMPAT
case MTRRIOC32_KILL_PAGE_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err = mtrr_del_page(-1, sentry.base, sentry.size);
break;
case MTRRIOC_GET_PAGE_ENTRY:
@@ -340,7 +313,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
switch (cmd) {
case MTRRIOC_GET_ENTRY:
case MTRRIOC_GET_PAGE_ENTRY:
- if (copy_to_user(arg, &gentry, sizeof gentry))
+ if (copy_to_user(arg, &gentry, sizeof(gentry)))
err = -EFAULT;
break;
#ifdef CONFIG_COMPAT
@@ -379,36 +352,13 @@ static int mtrr_close(struct inode *ino, struct file *file)
return single_release(ino, file);
}
-static int mtrr_seq_show(struct seq_file *seq, void *offset);
-
-static int mtrr_open(struct inode *inode, struct file *file)
-{
- if (!mtrr_if)
- return -EIO;
- if (!mtrr_if->get)
- return -ENXIO;
- return single_open(file, mtrr_seq_show, NULL);
-}
-
-static const struct file_operations mtrr_fops = {
- .owner = THIS_MODULE,
- .open = mtrr_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .write = mtrr_write,
- .unlocked_ioctl = mtrr_ioctl,
- .compat_ioctl = mtrr_ioctl,
- .release = mtrr_close,
-};
-
static int mtrr_seq_show(struct seq_file *seq, void *offset)
{
char factor;
- int i, max, len;
+ int i, max;
mtrr_type type;
unsigned long base, size;
- len = 0;
max = num_var_ranges;
for (i = 0; i < max; i++) {
mtrr_if->get(i, &base, &size, &type);
@@ -425,15 +375,37 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
size >>= 20 - PAGE_SHIFT;
}
/* Base can be > 32bit */
- len += seq_printf(seq, "reg%02i: base=0x%06lx000 "
- "(%5luMB), size=%5lu%cB, count=%d: %s\n",
- i, base, base >> (20 - PAGE_SHIFT), size,
- factor, mtrr_usage_table[i],
- mtrr_attrib_to_str(type));
+ seq_printf(seq, "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n",
+ i, base, base >> (20 - PAGE_SHIFT),
+ size, factor,
+ mtrr_usage_table[i], mtrr_attrib_to_str(type));
}
return 0;
}
+static int mtrr_open(struct inode *inode, struct file *file)
+{
+ if (!mtrr_if)
+ return -EIO;
+ if (!mtrr_if->get)
+ return -ENXIO;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ return single_open(file, mtrr_seq_show, NULL);
+}
+
+static const struct proc_ops mtrr_proc_ops = {
+ .proc_open = mtrr_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_write = mtrr_write,
+ .proc_ioctl = mtrr_ioctl,
+#ifdef CONFIG_COMPAT
+ .proc_compat_ioctl = mtrr_ioctl,
+#endif
+ .proc_release = mtrr_close,
+};
+
static int __init mtrr_if_init(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
@@ -444,7 +416,7 @@ static int __init mtrr_if_init(void)
(!cpu_has(c, X86_FEATURE_CENTAUR_MCR)))
return -ENODEV;
- proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_fops);
+ proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_proc_ops);
return 0;
}
arch_initcall(mtrr_if_init);
diff --git a/arch/x86/kernel/cpu/mtrr/legacy.c b/arch/x86/kernel/cpu/mtrr/legacy.c
new file mode 100644
index 000000000000..2415ffaaf02c
--- /dev/null
+++ b/arch/x86/kernel/cpu/mtrr/legacy.c
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/syscore_ops.h>
+#include <asm/cpufeature.h>
+#include <asm/mtrr.h>
+#include <asm/processor.h>
+#include "mtrr.h"
+
+void mtrr_set_if(void)
+{
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_AMD:
+ /* Pre-Athlon (K6) AMD CPU MTRRs */
+ if (cpu_feature_enabled(X86_FEATURE_K6_MTRR))
+ mtrr_if = &amd_mtrr_ops;
+ break;
+ case X86_VENDOR_CENTAUR:
+ if (cpu_feature_enabled(X86_FEATURE_CENTAUR_MCR))
+ mtrr_if = &centaur_mtrr_ops;
+ break;
+ case X86_VENDOR_CYRIX:
+ if (cpu_feature_enabled(X86_FEATURE_CYRIX_ARR))
+ mtrr_if = &cyrix_mtrr_ops;
+ break;
+ default:
+ break;
+ }
+}
+
+/*
+ * The suspend/resume methods are only for CPUs without MTRR. CPUs using generic
+ * MTRR driver don't require this.
+ */
+struct mtrr_value {
+ mtrr_type ltype;
+ unsigned long lbase;
+ unsigned long lsize;
+};
+
+static struct mtrr_value *mtrr_value;
+
+static int mtrr_save(void *data)
+{
+ int i;
+
+ if (!mtrr_value)
+ return -ENOMEM;
+
+ for (i = 0; i < num_var_ranges; i++) {
+ mtrr_if->get(i, &mtrr_value[i].lbase,
+ &mtrr_value[i].lsize,
+ &mtrr_value[i].ltype);
+ }
+ return 0;
+}
+
+static void mtrr_restore(void *data)
+{
+ int i;
+
+ for (i = 0; i < num_var_ranges; i++) {
+ if (mtrr_value[i].lsize) {
+ mtrr_if->set(i, mtrr_value[i].lbase,
+ mtrr_value[i].lsize,
+ mtrr_value[i].ltype);
+ }
+ }
+}
+
+static const struct syscore_ops mtrr_syscore_ops = {
+ .suspend = mtrr_save,
+ .resume = mtrr_restore,
+};
+
+static struct syscore mtrr_syscore = {
+ .ops = &mtrr_syscore_ops,
+};
+
+void mtrr_register_syscore(void)
+{
+ mtrr_value = kcalloc(num_var_ranges, sizeof(*mtrr_value), GFP_KERNEL);
+
+ /*
+ * The CPU has no MTRR and seems to not support SMP. They have
+ * specific drivers, we use a tricky method to support
+ * suspend/resume for them.
+ *
+ * TBD: is there any system with such CPU which supports
+ * suspend/resume? If no, we should remove the code.
+ */
+ register_syscore(&mtrr_syscore);
+}
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/mtrr.c
index f961de9964c7..4b3d492afe17 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.c
@@ -1,22 +1,9 @@
+// SPDX-License-Identifier: LGPL-2.0+
/* Generic MTRR (Memory Type Range Register) driver.
Copyright (C) 1997-2000 Richard Gooch
Copyright (c) 2002 Patrick Mochel
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Library General Public
- License as published by the Free Software Foundation; either
- version 2 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Library General Public License for more details.
-
- You should have received a copy of the GNU Library General Public
- License along with this library; if not, write to the Free
- Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
Richard Gooch may be reached by email at rgooch@atnf.csiro.au
The postal address is:
Richard Gooch, c/o ATNF, P. O. Box 76, Epping, N.S.W., 2121, Australia.
@@ -31,14 +18,12 @@
System Programming Guide; Section 9.11. (1997 edition - PPro).
*/
-#define DEBUG
-
#include <linux/types.h> /* FIXME: kvm_para.h needs this */
#include <linux/stop_machine.h>
#include <linux/kvm_para.h>
#include <linux/uaccess.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/init.h>
#include <linux/sort.h>
@@ -46,39 +31,33 @@
#include <linux/pci.h>
#include <linux/smp.h>
#include <linux/syscore_ops.h>
+#include <linux/rcupdate.h>
-#include <asm/processor.h>
-#include <asm/e820.h>
+#include <asm/cacheinfo.h>
+#include <asm/cpufeature.h>
+#include <asm/e820/api.h>
#include <asm/mtrr.h>
#include <asm/msr.h>
-#include <asm/pat.h>
+#include <asm/memtype.h>
#include "mtrr.h"
+static_assert(X86_MEMTYPE_UC == MTRR_TYPE_UNCACHABLE);
+static_assert(X86_MEMTYPE_WC == MTRR_TYPE_WRCOMB);
+static_assert(X86_MEMTYPE_WT == MTRR_TYPE_WRTHROUGH);
+static_assert(X86_MEMTYPE_WP == MTRR_TYPE_WRPROT);
+static_assert(X86_MEMTYPE_WB == MTRR_TYPE_WRBACK);
+
/* arch_phys_wc_add returns an MTRR register index plus this offset. */
#define MTRR_TO_PHYS_WC_OFFSET 1000
u32 num_var_ranges;
unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
-static DEFINE_MUTEX(mtrr_mutex);
-
-u64 size_or_mask, size_and_mask;
-static bool mtrr_aps_delayed_init;
-
-static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM];
+DEFINE_MUTEX(mtrr_mutex);
const struct mtrr_ops *mtrr_if;
-static void set_mtrr(unsigned int reg, unsigned long base,
- unsigned long size, mtrr_type type);
-
-void set_mtrr_ops(const struct mtrr_ops *ops)
-{
- if (ops->vendor && ops->vendor < X86_VENDOR_NUM)
- mtrr_ops[ops->vendor] = ops;
-}
-
/* Returns non-zero if we have the write-combining memory type */
static int have_wrcomb(void)
{
@@ -94,7 +73,7 @@ static int have_wrcomb(void)
if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS &&
dev->device == PCI_DEVICE_ID_SERVERWORKS_LE &&
dev->revision <= 5) {
- pr_info("mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n");
+ pr_info("Serverworks LE rev < 6 detected. Write-combining disabled.\n");
pci_dev_put(dev);
return 0;
}
@@ -104,7 +83,7 @@ static int have_wrcomb(void)
*/
if (dev->vendor == PCI_VENDOR_ID_INTEL &&
dev->device == PCI_DEVICE_ID_INTEL_82451NX) {
- pr_info("mtrr: Intel 450NX MMC detected. Write-combining disabled.\n");
+ pr_info("Intel 450NX MMC detected. Write-combining disabled.\n");
pci_dev_put(dev);
return 0;
}
@@ -113,21 +92,6 @@ static int have_wrcomb(void)
return mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0;
}
-/* This function returns the number of variable MTRRs */
-static void __init set_num_var_ranges(void)
-{
- unsigned long config = 0, dummy;
-
- if (use_intel())
- rdmsr(MSR_MTRRcap, config, dummy);
- else if (is_cpu(AMD))
- config = 2;
- else if (is_cpu(CYRIX) || is_cpu(CENTAUR))
- config = 8;
-
- num_var_ranges = config & 0xff;
-}
-
static void __init init_table(void)
{
int i, max;
@@ -155,25 +119,8 @@ static int mtrr_rendezvous_handler(void *info)
{
struct set_mtrr_data *data = info;
- /*
- * We use this same function to initialize the mtrrs during boot,
- * resume, runtime cpu online and on an explicit request to set a
- * specific MTRR.
- *
- * During boot or suspend, the state of the boot cpu's mtrrs has been
- * saved, and we want to replicate that across all the cpus that come
- * online (either at the end of boot or resume or during a runtime cpu
- * online). If we're doing that, @reg is set to something special and on
- * all the cpu's we do mtrr_if->set_all() (On the logical cpu that
- * started the boot/resume sequence, this might be a duplicate
- * set_all()).
- */
- if (data->smp_reg != ~0U) {
- mtrr_if->set(data->smp_reg, data->smp_base,
- data->smp_size, data->smp_type);
- } else if (mtrr_aps_delayed_init || !cpu_online(smp_processor_id())) {
- mtrr_if->set_all();
- }
+ mtrr_if->set(data->smp_reg, data->smp_base,
+ data->smp_size, data->smp_type);
return 0;
}
@@ -219,8 +166,8 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
* Note that the mechanism is the same for UP systems, too; all the SMP stuff
* becomes nops.
*/
-static void
-set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type)
+static void set_mtrr(unsigned int reg, unsigned long base, unsigned long size,
+ mtrr_type type)
{
struct set_mtrr_data data = { .smp_reg = reg,
.smp_base = base,
@@ -228,20 +175,9 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
.smp_type = type
};
- stop_machine(mtrr_rendezvous_handler, &data, cpu_online_mask);
-}
-
-static void set_mtrr_from_inactive_cpu(unsigned int reg, unsigned long base,
- unsigned long size, mtrr_type type)
-{
- struct set_mtrr_data data = { .smp_reg = reg,
- .smp_base = base,
- .smp_size = size,
- .smp_type = type
- };
+ stop_machine_cpuslocked(mtrr_rendezvous_handler, &data, cpu_online_mask);
- stop_machine_from_inactive_cpu(mtrr_rendezvous_handler, &data,
- cpu_callout_mask);
+ generic_rebuild_map();
}
/**
@@ -286,7 +222,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
int i, replace, error;
mtrr_type ltype;
- if (!mtrr_if)
+ if (!mtrr_enabled())
return -ENXIO;
error = mtrr_if->validate_add_page(base, size, type);
@@ -294,24 +230,24 @@ int mtrr_add_page(unsigned long base, unsigned long size,
return error;
if (type >= MTRR_NUM_TYPES) {
- pr_warning("mtrr: type: %u invalid\n", type);
+ pr_warn("type: %u invalid\n", type);
return -EINVAL;
}
/* If the type is WC, check that this processor supports it */
if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) {
- pr_warning("mtrr: your processor doesn't support write-combining\n");
+ pr_warn("your processor doesn't support write-combining\n");
return -ENOSYS;
}
if (!size) {
- pr_warning("mtrr: zero sized request\n");
+ pr_warn("zero sized request\n");
return -EINVAL;
}
if ((base | (base + size - 1)) >>
(boot_cpu_data.x86_phys_bits - PAGE_SHIFT)) {
- pr_warning("mtrr: base or size exceeds the MTRR width\n");
+ pr_warn("base or size exceeds the MTRR width\n");
return -EINVAL;
}
@@ -319,7 +255,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
replace = -1;
/* No CPU hotplug when we change MTRR entries */
- get_online_cpus();
+ cpus_read_lock();
/* Search for existing MTRR */
mutex_lock(&mtrr_mutex);
@@ -342,8 +278,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
} else if (types_compatible(type, ltype))
continue;
}
- pr_warning("mtrr: 0x%lx000,0x%lx000 overlaps existing"
- " 0x%lx000,0x%lx000\n", base, size, lbase,
+ pr_warn("0x%lx000,0x%lx000 overlaps existing 0x%lx000,0x%lx000\n", base, size, lbase,
lsize);
goto out;
}
@@ -351,7 +286,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
if (ltype != type) {
if (types_compatible(type, ltype))
continue;
- pr_warning("mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n",
+ pr_warn("type mismatch for %lx000,%lx000 old: %s new: %s\n",
base, size, mtrr_attrib_to_str(ltype),
mtrr_attrib_to_str(type));
goto out;
@@ -377,20 +312,20 @@ int mtrr_add_page(unsigned long base, unsigned long size,
}
}
} else {
- pr_info("mtrr: no more MTRRs available\n");
+ pr_info("no more MTRRs available\n");
}
error = i;
out:
mutex_unlock(&mtrr_mutex);
- put_online_cpus();
+ cpus_read_unlock();
return error;
}
static int mtrr_check(unsigned long base, unsigned long size)
{
if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) {
- pr_warning("mtrr: size and base must be multiples of 4 kiB\n");
- pr_debug("mtrr: size: 0x%lx base: 0x%lx\n", size, base);
+ pr_warn("size and base must be multiples of 4 kiB\n");
+ Dprintk("size: 0x%lx base: 0x%lx\n", size, base);
dump_stack();
return -1;
}
@@ -435,12 +370,13 @@ static int mtrr_check(unsigned long base, unsigned long size)
int mtrr_add(unsigned long base, unsigned long size, unsigned int type,
bool increment)
{
+ if (!mtrr_enabled())
+ return -ENODEV;
if (mtrr_check(base, size))
return -EINVAL;
return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type,
increment);
}
-EXPORT_SYMBOL(mtrr_add);
/**
* mtrr_del_page - delete a memory type region
@@ -463,12 +399,12 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
unsigned long lbase, lsize;
int error = -EINVAL;
- if (!mtrr_if)
- return -ENXIO;
+ if (!mtrr_enabled())
+ return -ENODEV;
max = num_var_ranges;
/* No CPU hotplug when we change MTRR entries */
- get_online_cpus();
+ cpus_read_lock();
mutex_lock(&mtrr_mutex);
if (reg < 0) {
/* Search for existing MTRR */
@@ -480,22 +416,21 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
}
}
if (reg < 0) {
- pr_debug("mtrr: no MTRR for %lx000,%lx000 found\n",
- base, size);
+ Dprintk("no MTRR for %lx000,%lx000 found\n", base, size);
goto out;
}
}
if (reg >= max) {
- pr_warning("mtrr: register: %d too big\n", reg);
+ pr_warn("register: %d too big\n", reg);
goto out;
}
mtrr_if->get(reg, &lbase, &lsize, &ltype);
if (lsize < 1) {
- pr_warning("mtrr: MTRR %d not used\n", reg);
+ pr_warn("MTRR %d not used\n", reg);
goto out;
}
if (mtrr_usage_table[reg] < 1) {
- pr_warning("mtrr: reg: %d has count=0\n", reg);
+ pr_warn("reg: %d has count=0\n", reg);
goto out;
}
if (--mtrr_usage_table[reg] < 1)
@@ -503,7 +438,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
error = reg;
out:
mutex_unlock(&mtrr_mutex);
- put_online_cpus();
+ cpus_read_unlock();
return error;
}
@@ -523,11 +458,12 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
*/
int mtrr_del(int reg, unsigned long base, unsigned long size)
{
+ if (!mtrr_enabled())
+ return -ENODEV;
if (mtrr_check(base, size))
return -EINVAL;
return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT);
}
-EXPORT_SYMBOL(mtrr_del);
/**
* arch_phys_wc_add - add a WC MTRR and handle errors if PAT is unavailable
@@ -538,6 +474,9 @@ EXPORT_SYMBOL(mtrr_del);
* attempts to add a WC MTRR covering size bytes starting at base and
* logs an error if this fails.
*
+ * The called should provide a power of two size on an equivalent
+ * power of two boundary.
+ *
* Drivers must store the return value to pass to mtrr_del_wc_if_needed,
* but drivers should not try to interpret that return value.
*/
@@ -545,7 +484,7 @@ int arch_phys_wc_add(unsigned long base, unsigned long size)
{
int ret;
- if (pat_enabled)
+ if (pat_enabled() || !mtrr_enabled())
return 0; /* Success! (We don't need to do anything.) */
ret = mtrr_add(base, size, MTRR_TYPE_WRCOMB, true);
@@ -577,7 +516,7 @@ void arch_phys_wc_del(int handle)
EXPORT_SYMBOL(arch_phys_wc_del);
/*
- * phys_wc_to_mtrr_index - translates arch_phys_wc_add's return value
+ * arch_phys_wc_index - translates arch_phys_wc_add's return value
* @handle: Return value from arch_phys_wc_add
*
* This will turn the return value from arch_phys_wc_add into an mtrr
@@ -587,256 +526,108 @@ EXPORT_SYMBOL(arch_phys_wc_del);
* in printk line. Alas there is an illegitimate use in some ancient
* drm ioctls.
*/
-int phys_wc_to_mtrr_index(int handle)
+int arch_phys_wc_index(int handle)
{
if (handle < MTRR_TO_PHYS_WC_OFFSET)
return -1;
else
return handle - MTRR_TO_PHYS_WC_OFFSET;
}
-EXPORT_SYMBOL_GPL(phys_wc_to_mtrr_index);
-
-/*
- * HACK ALERT!
- * These should be called implicitly, but we can't yet until all the initcall
- * stuff is done...
- */
-static void __init init_ifs(void)
-{
-#ifndef CONFIG_X86_64
- amd_init_mtrr();
- cyrix_init_mtrr();
- centaur_init_mtrr();
-#endif
-}
-
-/* The suspend/resume methods are only for CPU without MTRR. CPU using generic
- * MTRR driver doesn't require this
- */
-struct mtrr_value {
- mtrr_type ltype;
- unsigned long lbase;
- unsigned long lsize;
-};
-
-static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES];
-
-static int mtrr_save(void)
-{
- int i;
-
- for (i = 0; i < num_var_ranges; i++) {
- mtrr_if->get(i, &mtrr_value[i].lbase,
- &mtrr_value[i].lsize,
- &mtrr_value[i].ltype);
- }
- return 0;
-}
-
-static void mtrr_restore(void)
-{
- int i;
-
- for (i = 0; i < num_var_ranges; i++) {
- if (mtrr_value[i].lsize) {
- set_mtrr(i, mtrr_value[i].lbase,
- mtrr_value[i].lsize,
- mtrr_value[i].ltype);
- }
- }
-}
-
-
-
-static struct syscore_ops mtrr_syscore_ops = {
- .suspend = mtrr_save,
- .resume = mtrr_restore,
-};
+EXPORT_SYMBOL_GPL(arch_phys_wc_index);
int __initdata changed_by_mtrr_cleanup;
-#define SIZE_OR_MASK_BITS(n) (~((1ULL << ((n) - PAGE_SHIFT)) - 1))
/**
- * mtrr_bp_init - initialize mtrrs on the boot CPU
+ * mtrr_bp_init - initialize MTRRs on the boot CPU
*
* This needs to be called early; before any of the other CPUs are
* initialized (i.e. before smp_init()).
- *
*/
void __init mtrr_bp_init(void)
{
- u32 phys_addr;
-
- init_ifs();
+ bool generic_mtrrs = cpu_feature_enabled(X86_FEATURE_MTRR);
+ const char *why = "(not available)";
+ unsigned long config, dummy;
- phys_addr = 32;
-
- if (cpu_has_mtrr) {
- mtrr_if = &generic_mtrr_ops;
- size_or_mask = SIZE_OR_MASK_BITS(36);
- size_and_mask = 0x00f00000;
- phys_addr = 36;
+ phys_hi_rsvd = GENMASK(31, boot_cpu_data.x86_phys_bits - 32);
+ if (!generic_mtrrs && mtrr_state.enabled) {
/*
- * This is an AMD specific MSR, but we assume(hope?) that
- * Intel will implement it too when they extend the address
- * bus of the Xeon.
+ * Software overwrite of MTRR state, only for generic case.
+ * Note that X86_FEATURE_MTRR has been reset in this case.
*/
- if (cpuid_eax(0x80000000) >= 0x80000008) {
- phys_addr = cpuid_eax(0x80000008) & 0xff;
- /* CPUID workaround for Intel 0F33/0F34 CPU */
- if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
- boot_cpu_data.x86 == 0xF &&
- boot_cpu_data.x86_model == 0x3 &&
- (boot_cpu_data.x86_mask == 0x3 ||
- boot_cpu_data.x86_mask == 0x4))
- phys_addr = 36;
-
- size_or_mask = SIZE_OR_MASK_BITS(phys_addr);
- size_and_mask = ~size_or_mask & 0xfffff00000ULL;
- } else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR &&
- boot_cpu_data.x86 == 6) {
- /*
- * VIA C* family have Intel style MTRRs,
- * but don't support PAE
- */
- size_or_mask = SIZE_OR_MASK_BITS(32);
- size_and_mask = 0;
- phys_addr = 32;
- }
- } else {
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_AMD:
- if (cpu_has_k6_mtrr) {
- /* Pre-Athlon (K6) AMD CPU MTRRs */
- mtrr_if = mtrr_ops[X86_VENDOR_AMD];
- size_or_mask = SIZE_OR_MASK_BITS(32);
- size_and_mask = 0;
- }
- break;
- case X86_VENDOR_CENTAUR:
- if (cpu_has_centaur_mcr) {
- mtrr_if = mtrr_ops[X86_VENDOR_CENTAUR];
- size_or_mask = SIZE_OR_MASK_BITS(32);
- size_and_mask = 0;
- }
- break;
- case X86_VENDOR_CYRIX:
- if (cpu_has_cyrix_arr) {
- mtrr_if = mtrr_ops[X86_VENDOR_CYRIX];
- size_or_mask = SIZE_OR_MASK_BITS(32);
- size_and_mask = 0;
- }
- break;
- default:
- break;
- }
+ init_table();
+ mtrr_build_map();
+ pr_info("MTRRs set to read-only\n");
+
+ return;
}
- if (mtrr_if) {
- set_num_var_ranges();
- init_table();
- if (use_intel()) {
- get_mtrr_state();
+ if (generic_mtrrs)
+ mtrr_if = &generic_mtrr_ops;
+ else
+ mtrr_set_if();
+
+ if (mtrr_enabled()) {
+ /* Get the number of variable MTRR ranges. */
+ if (mtrr_if == &generic_mtrr_ops)
+ rdmsr(MSR_MTRRcap, config, dummy);
+ else
+ config = mtrr_if->var_regs;
+ num_var_ranges = config & MTRR_CAP_VCNT;
- if (mtrr_cleanup(phys_addr)) {
- changed_by_mtrr_cleanup = 1;
- mtrr_if->set_all();
+ init_table();
+ if (mtrr_if == &generic_mtrr_ops) {
+ /* BIOS may override */
+ if (get_mtrr_state()) {
+ memory_caching_control |= CACHE_MTRR;
+ changed_by_mtrr_cleanup = mtrr_cleanup();
+ mtrr_build_map();
+ } else {
+ mtrr_if = NULL;
+ why = "by BIOS";
}
}
}
-}
-void mtrr_ap_init(void)
-{
- if (!use_intel() || mtrr_aps_delayed_init)
- return;
- /*
- * Ideally we should hold mtrr_mutex here to avoid mtrr entries
- * changed, but this routine will be called in cpu boot time,
- * holding the lock breaks it.
- *
- * This routine is called in two cases:
- *
- * 1. very earily time of software resume, when there absolutely
- * isn't mtrr entry changes;
- *
- * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug
- * lock to prevent mtrr entry changes
- */
- set_mtrr_from_inactive_cpu(~0U, 0, 0, 0);
+ if (!mtrr_enabled())
+ pr_info("MTRRs disabled %s\n", why);
}
/**
- * Save current fixed-range MTRR state of the first cpu in cpu_online_mask.
+ * mtrr_save_state - Save current fixed-range MTRR state of the first
+ * cpu in cpu_online_mask.
*/
void mtrr_save_state(void)
{
int first_cpu;
- get_online_cpus();
- first_cpu = cpumask_first(cpu_online_mask);
- smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1);
- put_online_cpus();
-}
-
-void set_mtrr_aps_delayed_init(void)
-{
- if (!use_intel())
+ if (!mtrr_enabled() || !mtrr_state.have_fixed)
return;
- mtrr_aps_delayed_init = true;
+ first_cpu = cpumask_first(cpu_online_mask);
+ smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1);
}
-/*
- * Delayed MTRR initialization for all AP's
- */
-void mtrr_aps_init(void)
+static int __init mtrr_init_finalize(void)
{
- if (!use_intel())
- return;
-
/*
- * Check if someone has requested the delay of AP MTRR initialization,
- * by doing set_mtrr_aps_delayed_init(), prior to this point. If not,
- * then we are done.
+ * Map might exist if guest_force_mtrr_state() has been called or if
+ * mtrr_enabled() returns true.
*/
- if (!mtrr_aps_delayed_init)
- return;
-
- set_mtrr(~0U, 0, 0, 0);
- mtrr_aps_delayed_init = false;
-}
-
-void mtrr_bp_restore(void)
-{
- if (!use_intel())
- return;
-
- mtrr_if->set_all();
-}
+ mtrr_copy_map();
-static int __init mtrr_init_finialize(void)
-{
- if (!mtrr_if)
+ if (!mtrr_enabled())
return 0;
- if (use_intel()) {
+ if (memory_caching_control & CACHE_MTRR) {
if (!changed_by_mtrr_cleanup)
mtrr_state_warn();
return 0;
}
- /*
- * The CPU has no MTRR and seems to not support SMP. They have
- * specific drivers, we use a tricky method to support
- * suspend/resume for them.
- *
- * TBD: is there any system with such CPU which supports
- * suspend/resume? If no, we should remove the code.
- */
- register_syscore_ops(&mtrr_syscore_ops);
+ mtrr_register_syscore();
return 0;
}
-subsys_initcall(mtrr_init_finialize);
+subsys_initcall(mtrr_init_finalize);
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index df5e41f31a27..2de3bd2f95d1 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* local MTRR defines.
*/
@@ -9,15 +10,15 @@
#define MTRR_CHANGE_MASK_VARIABLE 0x02
#define MTRR_CHANGE_MASK_DEFTYPE 0x04
+extern bool mtrr_debug;
+#define Dprintk(x...) do { if (mtrr_debug) pr_info(x); } while (0)
+
extern unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
struct mtrr_ops {
- u32 vendor;
- u32 use_intel_if;
+ u32 var_regs;
void (*set)(unsigned int reg, unsigned long base,
unsigned long size, mtrr_type type);
- void (*set_all)(void);
-
void (*get)(unsigned int reg, unsigned long *base,
unsigned long *size, mtrr_type *type);
int (*get_free_region)(unsigned long base, unsigned long size,
@@ -45,34 +46,45 @@ struct set_mtrr_context {
u32 ccr3;
};
-void set_mtrr_done(struct set_mtrr_context *ctxt);
-void set_mtrr_cache_disable(struct set_mtrr_context *ctxt);
-void set_mtrr_prepare_save(struct set_mtrr_context *ctxt);
-
void fill_mtrr_var_range(unsigned int index,
u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi);
-void get_mtrr_state(void);
+bool get_mtrr_state(void);
-extern void set_mtrr_ops(const struct mtrr_ops *ops);
-
-extern u64 size_or_mask, size_and_mask;
extern const struct mtrr_ops *mtrr_if;
-
-#define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd)
-#define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1)
+extern struct mutex mtrr_mutex;
extern unsigned int num_var_ranges;
extern u64 mtrr_tom2;
extern struct mtrr_state_type mtrr_state;
+extern u32 phys_hi_rsvd;
void mtrr_state_warn(void);
const char *mtrr_attrib_to_str(int x);
void mtrr_wrmsr(unsigned, unsigned, unsigned);
-
-/* CPU specific mtrr init functions */
-int amd_init_mtrr(void);
-int cyrix_init_mtrr(void);
-int centaur_init_mtrr(void);
+#ifdef CONFIG_X86_32
+void mtrr_set_if(void);
+void mtrr_register_syscore(void);
+#else
+static inline void mtrr_set_if(void) { }
+static inline void mtrr_register_syscore(void) { }
+#endif
+void mtrr_build_map(void);
+void mtrr_copy_map(void);
+
+/* CPU specific mtrr_ops vectors. */
+extern const struct mtrr_ops amd_mtrr_ops;
+extern const struct mtrr_ops cyrix_mtrr_ops;
+extern const struct mtrr_ops centaur_mtrr_ops;
extern int changed_by_mtrr_cleanup;
-extern int mtrr_cleanup(unsigned address_bits);
+extern int mtrr_cleanup(void);
+
+/*
+ * Must be used by code which uses mtrr_if to call platform-specific
+ * MTRR manipulation functions.
+ */
+static inline bool mtrr_enabled(void)
+{
+ return !!mtrr_if;
+}
+void generic_rebuild_map(void);
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
deleted file mode 100644
index 9e581c5cf6d0..000000000000
--- a/arch/x86/kernel/cpu/perf_event.c
+++ /dev/null
@@ -1,2132 +0,0 @@
-/*
- * Performance events x86 architecture code
- *
- * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
- * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
- * Copyright (C) 2009 Jaswinder Singh Rajput
- * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
- * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
- * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
- * Copyright (C) 2009 Google, Inc., Stephane Eranian
- *
- * For licencing details see kernel-base/COPYING
- */
-
-#include <linux/perf_event.h>
-#include <linux/capability.h>
-#include <linux/notifier.h>
-#include <linux/hardirq.h>
-#include <linux/kprobes.h>
-#include <linux/module.h>
-#include <linux/kdebug.h>
-#include <linux/sched.h>
-#include <linux/uaccess.h>
-#include <linux/slab.h>
-#include <linux/cpu.h>
-#include <linux/bitops.h>
-#include <linux/device.h>
-
-#include <asm/apic.h>
-#include <asm/stacktrace.h>
-#include <asm/nmi.h>
-#include <asm/smp.h>
-#include <asm/alternative.h>
-#include <asm/timer.h>
-#include <asm/desc.h>
-#include <asm/ldt.h>
-
-#include "perf_event.h"
-
-struct x86_pmu x86_pmu __read_mostly;
-
-DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
- .enabled = 1,
-};
-
-u64 __read_mostly hw_cache_event_ids
- [PERF_COUNT_HW_CACHE_MAX]
- [PERF_COUNT_HW_CACHE_OP_MAX]
- [PERF_COUNT_HW_CACHE_RESULT_MAX];
-u64 __read_mostly hw_cache_extra_regs
- [PERF_COUNT_HW_CACHE_MAX]
- [PERF_COUNT_HW_CACHE_OP_MAX]
- [PERF_COUNT_HW_CACHE_RESULT_MAX];
-
-/*
- * Propagate event elapsed time into the generic event.
- * Can only be executed on the CPU where the event is active.
- * Returns the delta events processed.
- */
-u64 x86_perf_event_update(struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- int shift = 64 - x86_pmu.cntval_bits;
- u64 prev_raw_count, new_raw_count;
- int idx = hwc->idx;
- s64 delta;
-
- if (idx == INTEL_PMC_IDX_FIXED_BTS)
- return 0;
-
- /*
- * Careful: an NMI might modify the previous event value.
- *
- * Our tactic to handle this is to first atomically read and
- * exchange a new raw count - then add that new-prev delta
- * count to the generic event atomically:
- */
-again:
- prev_raw_count = local64_read(&hwc->prev_count);
- rdpmcl(hwc->event_base_rdpmc, new_raw_count);
-
- if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
- new_raw_count) != prev_raw_count)
- goto again;
-
- /*
- * Now we have the new raw value and have updated the prev
- * timestamp already. We can now calculate the elapsed delta
- * (event-)time and add that to the generic event.
- *
- * Careful, not all hw sign-extends above the physical width
- * of the count.
- */
- delta = (new_raw_count << shift) - (prev_raw_count << shift);
- delta >>= shift;
-
- local64_add(delta, &event->count);
- local64_sub(delta, &hwc->period_left);
-
- return new_raw_count;
-}
-
-/*
- * Find and validate any extra registers to set up.
- */
-static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
-{
- struct hw_perf_event_extra *reg;
- struct extra_reg *er;
-
- reg = &event->hw.extra_reg;
-
- if (!x86_pmu.extra_regs)
- return 0;
-
- for (er = x86_pmu.extra_regs; er->msr; er++) {
- if (er->event != (config & er->config_mask))
- continue;
- if (event->attr.config1 & ~er->valid_mask)
- return -EINVAL;
-
- reg->idx = er->idx;
- reg->config = event->attr.config1;
- reg->reg = er->msr;
- break;
- }
- return 0;
-}
-
-static atomic_t active_events;
-static DEFINE_MUTEX(pmc_reserve_mutex);
-
-#ifdef CONFIG_X86_LOCAL_APIC
-
-static bool reserve_pmc_hardware(void)
-{
- int i;
-
- for (i = 0; i < x86_pmu.num_counters; i++) {
- if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
- goto perfctr_fail;
- }
-
- for (i = 0; i < x86_pmu.num_counters; i++) {
- if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
- goto eventsel_fail;
- }
-
- return true;
-
-eventsel_fail:
- for (i--; i >= 0; i--)
- release_evntsel_nmi(x86_pmu_config_addr(i));
-
- i = x86_pmu.num_counters;
-
-perfctr_fail:
- for (i--; i >= 0; i--)
- release_perfctr_nmi(x86_pmu_event_addr(i));
-
- return false;
-}
-
-static void release_pmc_hardware(void)
-{
- int i;
-
- for (i = 0; i < x86_pmu.num_counters; i++) {
- release_perfctr_nmi(x86_pmu_event_addr(i));
- release_evntsel_nmi(x86_pmu_config_addr(i));
- }
-}
-
-#else
-
-static bool reserve_pmc_hardware(void) { return true; }
-static void release_pmc_hardware(void) {}
-
-#endif
-
-static bool check_hw_exists(void)
-{
- u64 val, val_fail, val_new= ~0;
- int i, reg, reg_fail, ret = 0;
- int bios_fail = 0;
-
- /*
- * Check to see if the BIOS enabled any of the counters, if so
- * complain and bail.
- */
- for (i = 0; i < x86_pmu.num_counters; i++) {
- reg = x86_pmu_config_addr(i);
- ret = rdmsrl_safe(reg, &val);
- if (ret)
- goto msr_fail;
- if (val & ARCH_PERFMON_EVENTSEL_ENABLE) {
- bios_fail = 1;
- val_fail = val;
- reg_fail = reg;
- }
- }
-
- if (x86_pmu.num_counters_fixed) {
- reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
- ret = rdmsrl_safe(reg, &val);
- if (ret)
- goto msr_fail;
- for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
- if (val & (0x03 << i*4)) {
- bios_fail = 1;
- val_fail = val;
- reg_fail = reg;
- }
- }
- }
-
- /*
- * Read the current value, change it and read it back to see if it
- * matches, this is needed to detect certain hardware emulators
- * (qemu/kvm) that don't trap on the MSR access and always return 0s.
- */
- reg = x86_pmu_event_addr(0);
- if (rdmsrl_safe(reg, &val))
- goto msr_fail;
- val ^= 0xffffUL;
- ret = wrmsrl_safe(reg, val);
- ret |= rdmsrl_safe(reg, &val_new);
- if (ret || val != val_new)
- goto msr_fail;
-
- /*
- * We still allow the PMU driver to operate:
- */
- if (bios_fail) {
- printk(KERN_CONT "Broken BIOS detected, complain to your hardware vendor.\n");
- printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg_fail, val_fail);
- }
-
- return true;
-
-msr_fail:
- printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n");
- printk(KERN_ERR "Failed to access perfctr msr (MSR %x is %Lx)\n", reg, val_new);
-
- return false;
-}
-
-static void hw_perf_event_destroy(struct perf_event *event)
-{
- if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
- release_pmc_hardware();
- release_ds_buffers();
- mutex_unlock(&pmc_reserve_mutex);
- }
-}
-
-static inline int x86_pmu_initialized(void)
-{
- return x86_pmu.handle_irq != NULL;
-}
-
-static inline int
-set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
-{
- struct perf_event_attr *attr = &event->attr;
- unsigned int cache_type, cache_op, cache_result;
- u64 config, val;
-
- config = attr->config;
-
- cache_type = (config >> 0) & 0xff;
- if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
- return -EINVAL;
-
- cache_op = (config >> 8) & 0xff;
- if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
- return -EINVAL;
-
- cache_result = (config >> 16) & 0xff;
- if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
- return -EINVAL;
-
- val = hw_cache_event_ids[cache_type][cache_op][cache_result];
-
- if (val == 0)
- return -ENOENT;
-
- if (val == -1)
- return -EINVAL;
-
- hwc->config |= val;
- attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
- return x86_pmu_extra_regs(val, event);
-}
-
-int x86_setup_perfctr(struct perf_event *event)
-{
- struct perf_event_attr *attr = &event->attr;
- struct hw_perf_event *hwc = &event->hw;
- u64 config;
-
- if (!is_sampling_event(event)) {
- hwc->sample_period = x86_pmu.max_period;
- hwc->last_period = hwc->sample_period;
- local64_set(&hwc->period_left, hwc->sample_period);
- } else {
- /*
- * If we have a PMU initialized but no APIC
- * interrupts, we cannot sample hardware
- * events (user-space has to fall back and
- * sample via a hrtimer based software event):
- */
- if (!x86_pmu.apic)
- return -EOPNOTSUPP;
- }
-
- if (attr->type == PERF_TYPE_RAW)
- return x86_pmu_extra_regs(event->attr.config, event);
-
- if (attr->type == PERF_TYPE_HW_CACHE)
- return set_ext_hw_attr(hwc, event);
-
- if (attr->config >= x86_pmu.max_events)
- return -EINVAL;
-
- /*
- * The generic map:
- */
- config = x86_pmu.event_map(attr->config);
-
- if (config == 0)
- return -ENOENT;
-
- if (config == -1LL)
- return -EINVAL;
-
- /*
- * Branch tracing:
- */
- if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
- !attr->freq && hwc->sample_period == 1) {
- /* BTS is not supported by this architecture. */
- if (!x86_pmu.bts_active)
- return -EOPNOTSUPP;
-
- /* BTS is currently only allowed for user-mode. */
- if (!attr->exclude_kernel)
- return -EOPNOTSUPP;
- }
-
- hwc->config |= config;
-
- return 0;
-}
-
-/*
- * check that branch_sample_type is compatible with
- * settings needed for precise_ip > 1 which implies
- * using the LBR to capture ALL taken branches at the
- * priv levels of the measurement
- */
-static inline int precise_br_compat(struct perf_event *event)
-{
- u64 m = event->attr.branch_sample_type;
- u64 b = 0;
-
- /* must capture all branches */
- if (!(m & PERF_SAMPLE_BRANCH_ANY))
- return 0;
-
- m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER;
-
- if (!event->attr.exclude_user)
- b |= PERF_SAMPLE_BRANCH_USER;
-
- if (!event->attr.exclude_kernel)
- b |= PERF_SAMPLE_BRANCH_KERNEL;
-
- /*
- * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86
- */
-
- return m == b;
-}
-
-int x86_pmu_hw_config(struct perf_event *event)
-{
- if (event->attr.precise_ip) {
- int precise = 0;
-
- /* Support for constant skid */
- if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
- precise++;
-
- /* Support for IP fixup */
- if (x86_pmu.lbr_nr)
- precise++;
- }
-
- if (event->attr.precise_ip > precise)
- return -EOPNOTSUPP;
- /*
- * check that PEBS LBR correction does not conflict with
- * whatever the user is asking with attr->branch_sample_type
- */
- if (event->attr.precise_ip > 1 &&
- x86_pmu.intel_cap.pebs_format < 2) {
- u64 *br_type = &event->attr.branch_sample_type;
-
- if (has_branch_stack(event)) {
- if (!precise_br_compat(event))
- return -EOPNOTSUPP;
-
- /* branch_sample_type is compatible */
-
- } else {
- /*
- * user did not specify branch_sample_type
- *
- * For PEBS fixups, we capture all
- * the branches at the priv level of the
- * event.
- */
- *br_type = PERF_SAMPLE_BRANCH_ANY;
-
- if (!event->attr.exclude_user)
- *br_type |= PERF_SAMPLE_BRANCH_USER;
-
- if (!event->attr.exclude_kernel)
- *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
- }
- }
- }
-
- /*
- * Generate PMC IRQs:
- * (keep 'enabled' bit clear for now)
- */
- event->hw.config = ARCH_PERFMON_EVENTSEL_INT;
-
- /*
- * Count user and OS events unless requested not to
- */
- if (!event->attr.exclude_user)
- event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
- if (!event->attr.exclude_kernel)
- event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
-
- if (event->attr.type == PERF_TYPE_RAW)
- event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
-
- return x86_setup_perfctr(event);
-}
-
-/*
- * Setup the hardware configuration for a given attr_type
- */
-static int __x86_pmu_event_init(struct perf_event *event)
-{
- int err;
-
- if (!x86_pmu_initialized())
- return -ENODEV;
-
- err = 0;
- if (!atomic_inc_not_zero(&active_events)) {
- mutex_lock(&pmc_reserve_mutex);
- if (atomic_read(&active_events) == 0) {
- if (!reserve_pmc_hardware())
- err = -EBUSY;
- else
- reserve_ds_buffers();
- }
- if (!err)
- atomic_inc(&active_events);
- mutex_unlock(&pmc_reserve_mutex);
- }
- if (err)
- return err;
-
- event->destroy = hw_perf_event_destroy;
-
- event->hw.idx = -1;
- event->hw.last_cpu = -1;
- event->hw.last_tag = ~0ULL;
-
- /* mark unused */
- event->hw.extra_reg.idx = EXTRA_REG_NONE;
- event->hw.branch_reg.idx = EXTRA_REG_NONE;
-
- return x86_pmu.hw_config(event);
-}
-
-void x86_pmu_disable_all(void)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- int idx;
-
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
- u64 val;
-
- if (!test_bit(idx, cpuc->active_mask))
- continue;
- rdmsrl(x86_pmu_config_addr(idx), val);
- if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
- continue;
- val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
- wrmsrl(x86_pmu_config_addr(idx), val);
- }
-}
-
-static void x86_pmu_disable(struct pmu *pmu)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-
- if (!x86_pmu_initialized())
- return;
-
- if (!cpuc->enabled)
- return;
-
- cpuc->n_added = 0;
- cpuc->enabled = 0;
- barrier();
-
- x86_pmu.disable_all();
-}
-
-void x86_pmu_enable_all(int added)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- int idx;
-
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
- struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
-
- if (!test_bit(idx, cpuc->active_mask))
- continue;
-
- __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
- }
-}
-
-static struct pmu pmu;
-
-static inline int is_x86_event(struct perf_event *event)
-{
- return event->pmu == &pmu;
-}
-
-/*
- * Event scheduler state:
- *
- * Assign events iterating over all events and counters, beginning
- * with events with least weights first. Keep the current iterator
- * state in struct sched_state.
- */
-struct sched_state {
- int weight;
- int event; /* event index */
- int counter; /* counter index */
- int unassigned; /* number of events to be assigned left */
- unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
-};
-
-/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */
-#define SCHED_STATES_MAX 2
-
-struct perf_sched {
- int max_weight;
- int max_events;
- struct perf_event **events;
- struct sched_state state;
- int saved_states;
- struct sched_state saved[SCHED_STATES_MAX];
-};
-
-/*
- * Initialize interator that runs through all events and counters.
- */
-static void perf_sched_init(struct perf_sched *sched, struct perf_event **events,
- int num, int wmin, int wmax)
-{
- int idx;
-
- memset(sched, 0, sizeof(*sched));
- sched->max_events = num;
- sched->max_weight = wmax;
- sched->events = events;
-
- for (idx = 0; idx < num; idx++) {
- if (events[idx]->hw.constraint->weight == wmin)
- break;
- }
-
- sched->state.event = idx; /* start with min weight */
- sched->state.weight = wmin;
- sched->state.unassigned = num;
-}
-
-static void perf_sched_save_state(struct perf_sched *sched)
-{
- if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX))
- return;
-
- sched->saved[sched->saved_states] = sched->state;
- sched->saved_states++;
-}
-
-static bool perf_sched_restore_state(struct perf_sched *sched)
-{
- if (!sched->saved_states)
- return false;
-
- sched->saved_states--;
- sched->state = sched->saved[sched->saved_states];
-
- /* continue with next counter: */
- clear_bit(sched->state.counter++, sched->state.used);
-
- return true;
-}
-
-/*
- * Select a counter for the current event to schedule. Return true on
- * success.
- */
-static bool __perf_sched_find_counter(struct perf_sched *sched)
-{
- struct event_constraint *c;
- int idx;
-
- if (!sched->state.unassigned)
- return false;
-
- if (sched->state.event >= sched->max_events)
- return false;
-
- c = sched->events[sched->state.event]->hw.constraint;
- /* Prefer fixed purpose counters */
- if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) {
- idx = INTEL_PMC_IDX_FIXED;
- for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
- if (!__test_and_set_bit(idx, sched->state.used))
- goto done;
- }
- }
- /* Grab the first unused counter starting with idx */
- idx = sched->state.counter;
- for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
- if (!__test_and_set_bit(idx, sched->state.used))
- goto done;
- }
-
- return false;
-
-done:
- sched->state.counter = idx;
-
- if (c->overlap)
- perf_sched_save_state(sched);
-
- return true;
-}
-
-static bool perf_sched_find_counter(struct perf_sched *sched)
-{
- while (!__perf_sched_find_counter(sched)) {
- if (!perf_sched_restore_state(sched))
- return false;
- }
-
- return true;
-}
-
-/*
- * Go through all unassigned events and find the next one to schedule.
- * Take events with the least weight first. Return true on success.
- */
-static bool perf_sched_next_event(struct perf_sched *sched)
-{
- struct event_constraint *c;
-
- if (!sched->state.unassigned || !--sched->state.unassigned)
- return false;
-
- do {
- /* next event */
- sched->state.event++;
- if (sched->state.event >= sched->max_events) {
- /* next weight */
- sched->state.event = 0;
- sched->state.weight++;
- if (sched->state.weight > sched->max_weight)
- return false;
- }
- c = sched->events[sched->state.event]->hw.constraint;
- } while (c->weight != sched->state.weight);
-
- sched->state.counter = 0; /* start with first counter */
-
- return true;
-}
-
-/*
- * Assign a counter for each event.
- */
-int perf_assign_events(struct perf_event **events, int n,
- int wmin, int wmax, int *assign)
-{
- struct perf_sched sched;
-
- perf_sched_init(&sched, events, n, wmin, wmax);
-
- do {
- if (!perf_sched_find_counter(&sched))
- break; /* failed */
- if (assign)
- assign[sched.state.event] = sched.state.counter;
- } while (perf_sched_next_event(&sched));
-
- return sched.state.unassigned;
-}
-
-int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
-{
- struct event_constraint *c;
- unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
- struct perf_event *e;
- int i, wmin, wmax, num = 0;
- struct hw_perf_event *hwc;
-
- bitmap_zero(used_mask, X86_PMC_IDX_MAX);
-
- for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
- hwc = &cpuc->event_list[i]->hw;
- c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
- hwc->constraint = c;
-
- wmin = min(wmin, c->weight);
- wmax = max(wmax, c->weight);
- }
-
- /*
- * fastpath, try to reuse previous register
- */
- for (i = 0; i < n; i++) {
- hwc = &cpuc->event_list[i]->hw;
- c = hwc->constraint;
-
- /* never assigned */
- if (hwc->idx == -1)
- break;
-
- /* constraint still honored */
- if (!test_bit(hwc->idx, c->idxmsk))
- break;
-
- /* not already used */
- if (test_bit(hwc->idx, used_mask))
- break;
-
- __set_bit(hwc->idx, used_mask);
- if (assign)
- assign[i] = hwc->idx;
- }
-
- /* slow path */
- if (i != n)
- num = perf_assign_events(cpuc->event_list, n, wmin,
- wmax, assign);
-
- /*
- * Mark the event as committed, so we do not put_constraint()
- * in case new events are added and fail scheduling.
- */
- if (!num && assign) {
- for (i = 0; i < n; i++) {
- e = cpuc->event_list[i];
- e->hw.flags |= PERF_X86_EVENT_COMMITTED;
- }
- }
- /*
- * scheduling failed or is just a simulation,
- * free resources if necessary
- */
- if (!assign || num) {
- for (i = 0; i < n; i++) {
- e = cpuc->event_list[i];
- /*
- * do not put_constraint() on comitted events,
- * because they are good to go
- */
- if ((e->hw.flags & PERF_X86_EVENT_COMMITTED))
- continue;
-
- if (x86_pmu.put_event_constraints)
- x86_pmu.put_event_constraints(cpuc, e);
- }
- }
- return num ? -EINVAL : 0;
-}
-
-/*
- * dogrp: true if must collect siblings events (group)
- * returns total number of events and error code
- */
-static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
-{
- struct perf_event *event;
- int n, max_count;
-
- max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;
-
- /* current number of events already accepted */
- n = cpuc->n_events;
-
- if (is_x86_event(leader)) {
- if (n >= max_count)
- return -EINVAL;
- cpuc->event_list[n] = leader;
- n++;
- }
- if (!dogrp)
- return n;
-
- list_for_each_entry(event, &leader->sibling_list, group_entry) {
- if (!is_x86_event(event) ||
- event->state <= PERF_EVENT_STATE_OFF)
- continue;
-
- if (n >= max_count)
- return -EINVAL;
-
- cpuc->event_list[n] = event;
- n++;
- }
- return n;
-}
-
-static inline void x86_assign_hw_event(struct perf_event *event,
- struct cpu_hw_events *cpuc, int i)
-{
- struct hw_perf_event *hwc = &event->hw;
-
- hwc->idx = cpuc->assign[i];
- hwc->last_cpu = smp_processor_id();
- hwc->last_tag = ++cpuc->tags[i];
-
- if (hwc->idx == INTEL_PMC_IDX_FIXED_BTS) {
- hwc->config_base = 0;
- hwc->event_base = 0;
- } else if (hwc->idx >= INTEL_PMC_IDX_FIXED) {
- hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
- hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - INTEL_PMC_IDX_FIXED);
- hwc->event_base_rdpmc = (hwc->idx - INTEL_PMC_IDX_FIXED) | 1<<30;
- } else {
- hwc->config_base = x86_pmu_config_addr(hwc->idx);
- hwc->event_base = x86_pmu_event_addr(hwc->idx);
- hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx);
- }
-}
-
-static inline int match_prev_assignment(struct hw_perf_event *hwc,
- struct cpu_hw_events *cpuc,
- int i)
-{
- return hwc->idx == cpuc->assign[i] &&
- hwc->last_cpu == smp_processor_id() &&
- hwc->last_tag == cpuc->tags[i];
-}
-
-static void x86_pmu_start(struct perf_event *event, int flags);
-
-static void x86_pmu_enable(struct pmu *pmu)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- struct perf_event *event;
- struct hw_perf_event *hwc;
- int i, added = cpuc->n_added;
-
- if (!x86_pmu_initialized())
- return;
-
- if (cpuc->enabled)
- return;
-
- if (cpuc->n_added) {
- int n_running = cpuc->n_events - cpuc->n_added;
- /*
- * apply assignment obtained either from
- * hw_perf_group_sched_in() or x86_pmu_enable()
- *
- * step1: save events moving to new counters
- * step2: reprogram moved events into new counters
- */
- for (i = 0; i < n_running; i++) {
- event = cpuc->event_list[i];
- hwc = &event->hw;
-
- /*
- * we can avoid reprogramming counter if:
- * - assigned same counter as last time
- * - running on same CPU as last time
- * - no other event has used the counter since
- */
- if (hwc->idx == -1 ||
- match_prev_assignment(hwc, cpuc, i))
- continue;
-
- /*
- * Ensure we don't accidentally enable a stopped
- * counter simply because we rescheduled.
- */
- if (hwc->state & PERF_HES_STOPPED)
- hwc->state |= PERF_HES_ARCH;
-
- x86_pmu_stop(event, PERF_EF_UPDATE);
- }
-
- for (i = 0; i < cpuc->n_events; i++) {
- event = cpuc->event_list[i];
- hwc = &event->hw;
-
- if (!match_prev_assignment(hwc, cpuc, i))
- x86_assign_hw_event(event, cpuc, i);
- else if (i < n_running)
- continue;
-
- if (hwc->state & PERF_HES_ARCH)
- continue;
-
- x86_pmu_start(event, PERF_EF_RELOAD);
- }
- cpuc->n_added = 0;
- perf_events_lapic_init();
- }
-
- cpuc->enabled = 1;
- barrier();
-
- x86_pmu.enable_all(added);
-}
-
-static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
-
-/*
- * Set the next IRQ period, based on the hwc->period_left value.
- * To be called with the event disabled in hw:
- */
-int x86_perf_event_set_period(struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- s64 left = local64_read(&hwc->period_left);
- s64 period = hwc->sample_period;
- int ret = 0, idx = hwc->idx;
-
- if (idx == INTEL_PMC_IDX_FIXED_BTS)
- return 0;
-
- /*
- * If we are way outside a reasonable range then just skip forward:
- */
- if (unlikely(left <= -period)) {
- left = period;
- local64_set(&hwc->period_left, left);
- hwc->last_period = period;
- ret = 1;
- }
-
- if (unlikely(left <= 0)) {
- left += period;
- local64_set(&hwc->period_left, left);
- hwc->last_period = period;
- ret = 1;
- }
- /*
- * Quirk: certain CPUs dont like it if just 1 hw_event is left:
- */
- if (unlikely(left < 2))
- left = 2;
-
- if (left > x86_pmu.max_period)
- left = x86_pmu.max_period;
-
- per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
-
- /*
- * The hw event starts counting from this event offset,
- * mark it to be able to extra future deltas:
- */
- local64_set(&hwc->prev_count, (u64)-left);
-
- wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
-
- /*
- * Due to erratum on certan cpu we need
- * a second write to be sure the register
- * is updated properly
- */
- if (x86_pmu.perfctr_second_write) {
- wrmsrl(hwc->event_base,
- (u64)(-left) & x86_pmu.cntval_mask);
- }
-
- perf_event_update_userpage(event);
-
- return ret;
-}
-
-void x86_pmu_enable_event(struct perf_event *event)
-{
- if (__this_cpu_read(cpu_hw_events.enabled))
- __x86_pmu_enable_event(&event->hw,
- ARCH_PERFMON_EVENTSEL_ENABLE);
-}
-
-/*
- * Add a single event to the PMU.
- *
- * The event is added to the group of enabled events
- * but only if it can be scehduled with existing events.
- */
-static int x86_pmu_add(struct perf_event *event, int flags)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- struct hw_perf_event *hwc;
- int assign[X86_PMC_IDX_MAX];
- int n, n0, ret;
-
- hwc = &event->hw;
-
- perf_pmu_disable(event->pmu);
- n0 = cpuc->n_events;
- ret = n = collect_events(cpuc, event, false);
- if (ret < 0)
- goto out;
-
- hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
- if (!(flags & PERF_EF_START))
- hwc->state |= PERF_HES_ARCH;
-
- /*
- * If group events scheduling transaction was started,
- * skip the schedulability test here, it will be performed
- * at commit time (->commit_txn) as a whole
- */
- if (cpuc->group_flag & PERF_EVENT_TXN)
- goto done_collect;
-
- ret = x86_pmu.schedule_events(cpuc, n, assign);
- if (ret)
- goto out;
- /*
- * copy new assignment, now we know it is possible
- * will be used by hw_perf_enable()
- */
- memcpy(cpuc->assign, assign, n*sizeof(int));
-
-done_collect:
- cpuc->n_events = n;
- cpuc->n_added += n - n0;
- cpuc->n_txn += n - n0;
-
- ret = 0;
-out:
- perf_pmu_enable(event->pmu);
- return ret;
-}
-
-static void x86_pmu_start(struct perf_event *event, int flags)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- int idx = event->hw.idx;
-
- if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
- return;
-
- if (WARN_ON_ONCE(idx == -1))
- return;
-
- if (flags & PERF_EF_RELOAD) {
- WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
- x86_perf_event_set_period(event);
- }
-
- event->hw.state = 0;
-
- cpuc->events[idx] = event;
- __set_bit(idx, cpuc->active_mask);
- __set_bit(idx, cpuc->running);
- x86_pmu.enable(event);
- perf_event_update_userpage(event);
-}
-
-void perf_event_print_debug(void)
-{
- u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
- u64 pebs;
- struct cpu_hw_events *cpuc;
- unsigned long flags;
- int cpu, idx;
-
- if (!x86_pmu.num_counters)
- return;
-
- local_irq_save(flags);
-
- cpu = smp_processor_id();
- cpuc = &per_cpu(cpu_hw_events, cpu);
-
- if (x86_pmu.version >= 2) {
- rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
- rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
- rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
- rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
- rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
-
- pr_info("\n");
- pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl);
- pr_info("CPU#%d: status: %016llx\n", cpu, status);
- pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
- pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
- pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs);
- }
- pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
-
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
- rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl);
- rdmsrl(x86_pmu_event_addr(idx), pmc_count);
-
- prev_left = per_cpu(pmc_prev_left[idx], cpu);
-
- pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n",
- cpu, idx, pmc_ctrl);
- pr_info("CPU#%d: gen-PMC%d count: %016llx\n",
- cpu, idx, pmc_count);
- pr_info("CPU#%d: gen-PMC%d left: %016llx\n",
- cpu, idx, prev_left);
- }
- for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
- rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
-
- pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
- cpu, idx, pmc_count);
- }
- local_irq_restore(flags);
-}
-
-void x86_pmu_stop(struct perf_event *event, int flags)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- struct hw_perf_event *hwc = &event->hw;
-
- if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) {
- x86_pmu.disable(event);
- cpuc->events[hwc->idx] = NULL;
- WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
- hwc->state |= PERF_HES_STOPPED;
- }
-
- if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
- /*
- * Drain the remaining delta count out of a event
- * that we are disabling:
- */
- x86_perf_event_update(event);
- hwc->state |= PERF_HES_UPTODATE;
- }
-}
-
-static void x86_pmu_del(struct perf_event *event, int flags)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- int i;
-
- /*
- * event is descheduled
- */
- event->hw.flags &= ~PERF_X86_EVENT_COMMITTED;
-
- /*
- * If we're called during a txn, we don't need to do anything.
- * The events never got scheduled and ->cancel_txn will truncate
- * the event_list.
- */
- if (cpuc->group_flag & PERF_EVENT_TXN)
- return;
-
- x86_pmu_stop(event, PERF_EF_UPDATE);
-
- for (i = 0; i < cpuc->n_events; i++) {
- if (event == cpuc->event_list[i]) {
-
- if (x86_pmu.put_event_constraints)
- x86_pmu.put_event_constraints(cpuc, event);
-
- while (++i < cpuc->n_events)
- cpuc->event_list[i-1] = cpuc->event_list[i];
-
- --cpuc->n_events;
- break;
- }
- }
- perf_event_update_userpage(event);
-}
-
-int x86_pmu_handle_irq(struct pt_regs *regs)
-{
- struct perf_sample_data data;
- struct cpu_hw_events *cpuc;
- struct perf_event *event;
- int idx, handled = 0;
- u64 val;
-
- cpuc = &__get_cpu_var(cpu_hw_events);
-
- /*
- * Some chipsets need to unmask the LVTPC in a particular spot
- * inside the nmi handler. As a result, the unmasking was pushed
- * into all the nmi handlers.
- *
- * This generic handler doesn't seem to have any issues where the
- * unmasking occurs so it was left at the top.
- */
- apic_write(APIC_LVTPC, APIC_DM_NMI);
-
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
- if (!test_bit(idx, cpuc->active_mask)) {
- /*
- * Though we deactivated the counter some cpus
- * might still deliver spurious interrupts still
- * in flight. Catch them:
- */
- if (__test_and_clear_bit(idx, cpuc->running))
- handled++;
- continue;
- }
-
- event = cpuc->events[idx];
-
- val = x86_perf_event_update(event);
- if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
- continue;
-
- /*
- * event overflow
- */
- handled++;
- perf_sample_data_init(&data, 0, event->hw.last_period);
-
- if (!x86_perf_event_set_period(event))
- continue;
-
- if (perf_event_overflow(event, &data, regs))
- x86_pmu_stop(event, 0);
- }
-
- if (handled)
- inc_irq_stat(apic_perf_irqs);
-
- return handled;
-}
-
-void perf_events_lapic_init(void)
-{
- if (!x86_pmu.apic || !x86_pmu_initialized())
- return;
-
- /*
- * Always use NMI for PMU
- */
- apic_write(APIC_LVTPC, APIC_DM_NMI);
-}
-
-static int __kprobes
-perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
-{
- int ret;
- u64 start_clock;
- u64 finish_clock;
-
- if (!atomic_read(&active_events))
- return NMI_DONE;
-
- start_clock = local_clock();
- ret = x86_pmu.handle_irq(regs);
- finish_clock = local_clock();
-
- perf_sample_event_took(finish_clock - start_clock);
-
- return ret;
-}
-
-struct event_constraint emptyconstraint;
-struct event_constraint unconstrained;
-
-static int __cpuinit
-x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
-{
- unsigned int cpu = (long)hcpu;
- struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
- int ret = NOTIFY_OK;
-
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_UP_PREPARE:
- cpuc->kfree_on_online = NULL;
- if (x86_pmu.cpu_prepare)
- ret = x86_pmu.cpu_prepare(cpu);
- break;
-
- case CPU_STARTING:
- if (x86_pmu.attr_rdpmc)
- set_in_cr4(X86_CR4_PCE);
- if (x86_pmu.cpu_starting)
- x86_pmu.cpu_starting(cpu);
- break;
-
- case CPU_ONLINE:
- kfree(cpuc->kfree_on_online);
- break;
-
- case CPU_DYING:
- if (x86_pmu.cpu_dying)
- x86_pmu.cpu_dying(cpu);
- break;
-
- case CPU_UP_CANCELED:
- case CPU_DEAD:
- if (x86_pmu.cpu_dead)
- x86_pmu.cpu_dead(cpu);
- break;
-
- default:
- break;
- }
-
- return ret;
-}
-
-static void __init pmu_check_apic(void)
-{
- if (cpu_has_apic)
- return;
-
- x86_pmu.apic = 0;
- pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
- pr_info("no hardware sampling interrupt available.\n");
-}
-
-static struct attribute_group x86_pmu_format_group = {
- .name = "format",
- .attrs = NULL,
-};
-
-/*
- * Remove all undefined events (x86_pmu.event_map(id) == 0)
- * out of events_attr attributes.
- */
-static void __init filter_events(struct attribute **attrs)
-{
- struct device_attribute *d;
- struct perf_pmu_events_attr *pmu_attr;
- int i, j;
-
- for (i = 0; attrs[i]; i++) {
- d = (struct device_attribute *)attrs[i];
- pmu_attr = container_of(d, struct perf_pmu_events_attr, attr);
- /* str trumps id */
- if (pmu_attr->event_str)
- continue;
- if (x86_pmu.event_map(i))
- continue;
-
- for (j = i; attrs[j]; j++)
- attrs[j] = attrs[j + 1];
-
- /* Check the shifted attr. */
- i--;
- }
-}
-
-/* Merge two pointer arrays */
-static __init struct attribute **merge_attr(struct attribute **a, struct attribute **b)
-{
- struct attribute **new;
- int j, i;
-
- for (j = 0; a[j]; j++)
- ;
- for (i = 0; b[i]; i++)
- j++;
- j++;
-
- new = kmalloc(sizeof(struct attribute *) * j, GFP_KERNEL);
- if (!new)
- return NULL;
-
- j = 0;
- for (i = 0; a[i]; i++)
- new[j++] = a[i];
- for (i = 0; b[i]; i++)
- new[j++] = b[i];
- new[j] = NULL;
-
- return new;
-}
-
-ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
- char *page)
-{
- struct perf_pmu_events_attr *pmu_attr = \
- container_of(attr, struct perf_pmu_events_attr, attr);
- u64 config = x86_pmu.event_map(pmu_attr->id);
-
- /* string trumps id */
- if (pmu_attr->event_str)
- return sprintf(page, "%s", pmu_attr->event_str);
-
- return x86_pmu.events_sysfs_show(page, config);
-}
-
-EVENT_ATTR(cpu-cycles, CPU_CYCLES );
-EVENT_ATTR(instructions, INSTRUCTIONS );
-EVENT_ATTR(cache-references, CACHE_REFERENCES );
-EVENT_ATTR(cache-misses, CACHE_MISSES );
-EVENT_ATTR(branch-instructions, BRANCH_INSTRUCTIONS );
-EVENT_ATTR(branch-misses, BRANCH_MISSES );
-EVENT_ATTR(bus-cycles, BUS_CYCLES );
-EVENT_ATTR(stalled-cycles-frontend, STALLED_CYCLES_FRONTEND );
-EVENT_ATTR(stalled-cycles-backend, STALLED_CYCLES_BACKEND );
-EVENT_ATTR(ref-cycles, REF_CPU_CYCLES );
-
-static struct attribute *empty_attrs;
-
-static struct attribute *events_attr[] = {
- EVENT_PTR(CPU_CYCLES),
- EVENT_PTR(INSTRUCTIONS),
- EVENT_PTR(CACHE_REFERENCES),
- EVENT_PTR(CACHE_MISSES),
- EVENT_PTR(BRANCH_INSTRUCTIONS),
- EVENT_PTR(BRANCH_MISSES),
- EVENT_PTR(BUS_CYCLES),
- EVENT_PTR(STALLED_CYCLES_FRONTEND),
- EVENT_PTR(STALLED_CYCLES_BACKEND),
- EVENT_PTR(REF_CPU_CYCLES),
- NULL,
-};
-
-static struct attribute_group x86_pmu_events_group = {
- .name = "events",
- .attrs = events_attr,
-};
-
-ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event)
-{
- u64 umask = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
- u64 cmask = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24;
- bool edge = (config & ARCH_PERFMON_EVENTSEL_EDGE);
- bool pc = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL);
- bool any = (config & ARCH_PERFMON_EVENTSEL_ANY);
- bool inv = (config & ARCH_PERFMON_EVENTSEL_INV);
- ssize_t ret;
-
- /*
- * We have whole page size to spend and just little data
- * to write, so we can safely use sprintf.
- */
- ret = sprintf(page, "event=0x%02llx", event);
-
- if (umask)
- ret += sprintf(page + ret, ",umask=0x%02llx", umask);
-
- if (edge)
- ret += sprintf(page + ret, ",edge");
-
- if (pc)
- ret += sprintf(page + ret, ",pc");
-
- if (any)
- ret += sprintf(page + ret, ",any");
-
- if (inv)
- ret += sprintf(page + ret, ",inv");
-
- if (cmask)
- ret += sprintf(page + ret, ",cmask=0x%02llx", cmask);
-
- ret += sprintf(page + ret, "\n");
-
- return ret;
-}
-
-static int __init init_hw_perf_events(void)
-{
- struct x86_pmu_quirk *quirk;
- int err;
-
- pr_info("Performance Events: ");
-
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_INTEL:
- err = intel_pmu_init();
- break;
- case X86_VENDOR_AMD:
- err = amd_pmu_init();
- break;
- default:
- return 0;
- }
- if (err != 0) {
- pr_cont("no PMU driver, software events only.\n");
- return 0;
- }
-
- pmu_check_apic();
-
- /* sanity check that the hardware exists or is emulated */
- if (!check_hw_exists())
- return 0;
-
- pr_cont("%s PMU driver.\n", x86_pmu.name);
-
- for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next)
- quirk->func();
-
- if (!x86_pmu.intel_ctrl)
- x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
-
- perf_events_lapic_init();
- register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI");
-
- unconstrained = (struct event_constraint)
- __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
- 0, x86_pmu.num_counters, 0, 0);
-
- x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */
- x86_pmu_format_group.attrs = x86_pmu.format_attrs;
-
- if (x86_pmu.event_attrs)
- x86_pmu_events_group.attrs = x86_pmu.event_attrs;
-
- if (!x86_pmu.events_sysfs_show)
- x86_pmu_events_group.attrs = &empty_attrs;
- else
- filter_events(x86_pmu_events_group.attrs);
-
- if (x86_pmu.cpu_events) {
- struct attribute **tmp;
-
- tmp = merge_attr(x86_pmu_events_group.attrs, x86_pmu.cpu_events);
- if (!WARN_ON(!tmp))
- x86_pmu_events_group.attrs = tmp;
- }
-
- pr_info("... version: %d\n", x86_pmu.version);
- pr_info("... bit width: %d\n", x86_pmu.cntval_bits);
- pr_info("... generic registers: %d\n", x86_pmu.num_counters);
- pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask);
- pr_info("... max period: %016Lx\n", x86_pmu.max_period);
- pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed);
- pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl);
-
- perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
- perf_cpu_notifier(x86_pmu_notifier);
-
- return 0;
-}
-early_initcall(init_hw_perf_events);
-
-static inline void x86_pmu_read(struct perf_event *event)
-{
- x86_perf_event_update(event);
-}
-
-/*
- * Start group events scheduling transaction
- * Set the flag to make pmu::enable() not perform the
- * schedulability test, it will be performed at commit time
- */
-static void x86_pmu_start_txn(struct pmu *pmu)
-{
- perf_pmu_disable(pmu);
- __this_cpu_or(cpu_hw_events.group_flag, PERF_EVENT_TXN);
- __this_cpu_write(cpu_hw_events.n_txn, 0);
-}
-
-/*
- * Stop group events scheduling transaction
- * Clear the flag and pmu::enable() will perform the
- * schedulability test.
- */
-static void x86_pmu_cancel_txn(struct pmu *pmu)
-{
- __this_cpu_and(cpu_hw_events.group_flag, ~PERF_EVENT_TXN);
- /*
- * Truncate the collected events.
- */
- __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
- __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
- perf_pmu_enable(pmu);
-}
-
-/*
- * Commit group events scheduling transaction
- * Perform the group schedulability test as a whole
- * Return 0 if success
- */
-static int x86_pmu_commit_txn(struct pmu *pmu)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- int assign[X86_PMC_IDX_MAX];
- int n, ret;
-
- n = cpuc->n_events;
-
- if (!x86_pmu_initialized())
- return -EAGAIN;
-
- ret = x86_pmu.schedule_events(cpuc, n, assign);
- if (ret)
- return ret;
-
- /*
- * copy new assignment, now we know it is possible
- * will be used by hw_perf_enable()
- */
- memcpy(cpuc->assign, assign, n*sizeof(int));
-
- cpuc->group_flag &= ~PERF_EVENT_TXN;
- perf_pmu_enable(pmu);
- return 0;
-}
-/*
- * a fake_cpuc is used to validate event groups. Due to
- * the extra reg logic, we need to also allocate a fake
- * per_core and per_cpu structure. Otherwise, group events
- * using extra reg may conflict without the kernel being
- * able to catch this when the last event gets added to
- * the group.
- */
-static void free_fake_cpuc(struct cpu_hw_events *cpuc)
-{
- kfree(cpuc->shared_regs);
- kfree(cpuc);
-}
-
-static struct cpu_hw_events *allocate_fake_cpuc(void)
-{
- struct cpu_hw_events *cpuc;
- int cpu = raw_smp_processor_id();
-
- cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL);
- if (!cpuc)
- return ERR_PTR(-ENOMEM);
-
- /* only needed, if we have extra_regs */
- if (x86_pmu.extra_regs) {
- cpuc->shared_regs = allocate_shared_regs(cpu);
- if (!cpuc->shared_regs)
- goto error;
- }
- cpuc->is_fake = 1;
- return cpuc;
-error:
- free_fake_cpuc(cpuc);
- return ERR_PTR(-ENOMEM);
-}
-
-/*
- * validate that we can schedule this event
- */
-static int validate_event(struct perf_event *event)
-{
- struct cpu_hw_events *fake_cpuc;
- struct event_constraint *c;
- int ret = 0;
-
- fake_cpuc = allocate_fake_cpuc();
- if (IS_ERR(fake_cpuc))
- return PTR_ERR(fake_cpuc);
-
- c = x86_pmu.get_event_constraints(fake_cpuc, event);
-
- if (!c || !c->weight)
- ret = -EINVAL;
-
- if (x86_pmu.put_event_constraints)
- x86_pmu.put_event_constraints(fake_cpuc, event);
-
- free_fake_cpuc(fake_cpuc);
-
- return ret;
-}
-
-/*
- * validate a single event group
- *
- * validation include:
- * - check events are compatible which each other
- * - events do not compete for the same counter
- * - number of events <= number of counters
- *
- * validation ensures the group can be loaded onto the
- * PMU if it was the only group available.
- */
-static int validate_group(struct perf_event *event)
-{
- struct perf_event *leader = event->group_leader;
- struct cpu_hw_events *fake_cpuc;
- int ret = -EINVAL, n;
-
- fake_cpuc = allocate_fake_cpuc();
- if (IS_ERR(fake_cpuc))
- return PTR_ERR(fake_cpuc);
- /*
- * the event is not yet connected with its
- * siblings therefore we must first collect
- * existing siblings, then add the new event
- * before we can simulate the scheduling
- */
- n = collect_events(fake_cpuc, leader, true);
- if (n < 0)
- goto out;
-
- fake_cpuc->n_events = n;
- n = collect_events(fake_cpuc, event, false);
- if (n < 0)
- goto out;
-
- fake_cpuc->n_events = n;
-
- ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
-
-out:
- free_fake_cpuc(fake_cpuc);
- return ret;
-}
-
-static int x86_pmu_event_init(struct perf_event *event)
-{
- struct pmu *tmp;
- int err;
-
- switch (event->attr.type) {
- case PERF_TYPE_RAW:
- case PERF_TYPE_HARDWARE:
- case PERF_TYPE_HW_CACHE:
- break;
-
- default:
- return -ENOENT;
- }
-
- err = __x86_pmu_event_init(event);
- if (!err) {
- /*
- * we temporarily connect event to its pmu
- * such that validate_group() can classify
- * it as an x86 event using is_x86_event()
- */
- tmp = event->pmu;
- event->pmu = &pmu;
-
- if (event->group_leader != event)
- err = validate_group(event);
- else
- err = validate_event(event);
-
- event->pmu = tmp;
- }
- if (err) {
- if (event->destroy)
- event->destroy(event);
- }
-
- return err;
-}
-
-static int x86_pmu_event_idx(struct perf_event *event)
-{
- int idx = event->hw.idx;
-
- if (!x86_pmu.attr_rdpmc)
- return 0;
-
- if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) {
- idx -= INTEL_PMC_IDX_FIXED;
- idx |= 1 << 30;
- }
-
- return idx + 1;
-}
-
-static ssize_t get_attr_rdpmc(struct device *cdev,
- struct device_attribute *attr,
- char *buf)
-{
- return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc);
-}
-
-static void change_rdpmc(void *info)
-{
- bool enable = !!(unsigned long)info;
-
- if (enable)
- set_in_cr4(X86_CR4_PCE);
- else
- clear_in_cr4(X86_CR4_PCE);
-}
-
-static ssize_t set_attr_rdpmc(struct device *cdev,
- struct device_attribute *attr,
- const char *buf, size_t count)
-{
- unsigned long val;
- ssize_t ret;
-
- ret = kstrtoul(buf, 0, &val);
- if (ret)
- return ret;
-
- if (!!val != !!x86_pmu.attr_rdpmc) {
- x86_pmu.attr_rdpmc = !!val;
- smp_call_function(change_rdpmc, (void *)val, 1);
- }
-
- return count;
-}
-
-static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc);
-
-static struct attribute *x86_pmu_attrs[] = {
- &dev_attr_rdpmc.attr,
- NULL,
-};
-
-static struct attribute_group x86_pmu_attr_group = {
- .attrs = x86_pmu_attrs,
-};
-
-static const struct attribute_group *x86_pmu_attr_groups[] = {
- &x86_pmu_attr_group,
- &x86_pmu_format_group,
- &x86_pmu_events_group,
- NULL,
-};
-
-static void x86_pmu_flush_branch_stack(void)
-{
- if (x86_pmu.flush_branch_stack)
- x86_pmu.flush_branch_stack();
-}
-
-void perf_check_microcode(void)
-{
- if (x86_pmu.check_microcode)
- x86_pmu.check_microcode();
-}
-EXPORT_SYMBOL_GPL(perf_check_microcode);
-
-static struct pmu pmu = {
- .pmu_enable = x86_pmu_enable,
- .pmu_disable = x86_pmu_disable,
-
- .attr_groups = x86_pmu_attr_groups,
-
- .event_init = x86_pmu_event_init,
-
- .add = x86_pmu_add,
- .del = x86_pmu_del,
- .start = x86_pmu_start,
- .stop = x86_pmu_stop,
- .read = x86_pmu_read,
-
- .start_txn = x86_pmu_start_txn,
- .cancel_txn = x86_pmu_cancel_txn,
- .commit_txn = x86_pmu_commit_txn,
-
- .event_idx = x86_pmu_event_idx,
- .flush_branch_stack = x86_pmu_flush_branch_stack,
-};
-
-void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
-{
- userpg->cap_usr_time = 0;
- userpg->cap_usr_rdpmc = x86_pmu.attr_rdpmc;
- userpg->pmc_width = x86_pmu.cntval_bits;
-
- if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
- return;
-
- if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
- return;
-
- userpg->cap_usr_time = 1;
- userpg->time_mult = this_cpu_read(cyc2ns);
- userpg->time_shift = CYC2NS_SCALE_FACTOR;
- userpg->time_offset = this_cpu_read(cyc2ns_offset) - now;
-}
-
-/*
- * callchain support
- */
-
-static int backtrace_stack(void *data, char *name)
-{
- return 0;
-}
-
-static void backtrace_address(void *data, unsigned long addr, int reliable)
-{
- struct perf_callchain_entry *entry = data;
-
- perf_callchain_store(entry, addr);
-}
-
-static const struct stacktrace_ops backtrace_ops = {
- .stack = backtrace_stack,
- .address = backtrace_address,
- .walk_stack = print_context_stack_bp,
-};
-
-void
-perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
-{
- if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
- /* TODO: We don't support guest os callchain now */
- return;
- }
-
- perf_callchain_store(entry, regs->ip);
-
- dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
-}
-
-static inline int
-valid_user_frame(const void __user *fp, unsigned long size)
-{
- return (__range_not_ok(fp, size, TASK_SIZE) == 0);
-}
-
-static unsigned long get_segment_base(unsigned int segment)
-{
- struct desc_struct *desc;
- int idx = segment >> 3;
-
- if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
- if (idx > LDT_ENTRIES)
- return 0;
-
- if (idx > current->active_mm->context.size)
- return 0;
-
- desc = current->active_mm->context.ldt;
- } else {
- if (idx > GDT_ENTRIES)
- return 0;
-
- desc = __this_cpu_ptr(&gdt_page.gdt[0]);
- }
-
- return get_desc_base(desc + idx);
-}
-
-#ifdef CONFIG_COMPAT
-
-#include <asm/compat.h>
-
-static inline int
-perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
-{
- /* 32-bit process in 64-bit kernel. */
- unsigned long ss_base, cs_base;
- struct stack_frame_ia32 frame;
- const void __user *fp;
-
- if (!test_thread_flag(TIF_IA32))
- return 0;
-
- cs_base = get_segment_base(regs->cs);
- ss_base = get_segment_base(regs->ss);
-
- fp = compat_ptr(ss_base + regs->bp);
- while (entry->nr < PERF_MAX_STACK_DEPTH) {
- unsigned long bytes;
- frame.next_frame = 0;
- frame.return_address = 0;
-
- bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
- if (bytes != sizeof(frame))
- break;
-
- if (!valid_user_frame(fp, sizeof(frame)))
- break;
-
- perf_callchain_store(entry, cs_base + frame.return_address);
- fp = compat_ptr(ss_base + frame.next_frame);
- }
- return 1;
-}
-#else
-static inline int
-perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
-{
- return 0;
-}
-#endif
-
-void
-perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
-{
- struct stack_frame frame;
- const void __user *fp;
-
- if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
- /* TODO: We don't support guest os callchain now */
- return;
- }
-
- /*
- * We don't know what to do with VM86 stacks.. ignore them for now.
- */
- if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM))
- return;
-
- fp = (void __user *)regs->bp;
-
- perf_callchain_store(entry, regs->ip);
-
- if (!current->mm)
- return;
-
- if (perf_callchain_user32(regs, entry))
- return;
-
- while (entry->nr < PERF_MAX_STACK_DEPTH) {
- unsigned long bytes;
- frame.next_frame = NULL;
- frame.return_address = 0;
-
- bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
- if (bytes != sizeof(frame))
- break;
-
- if (!valid_user_frame(fp, sizeof(frame)))
- break;
-
- perf_callchain_store(entry, frame.return_address);
- fp = frame.next_frame;
- }
-}
-
-/*
- * Deal with code segment offsets for the various execution modes:
- *
- * VM86 - the good olde 16 bit days, where the linear address is
- * 20 bits and we use regs->ip + 0x10 * regs->cs.
- *
- * IA32 - Where we need to look at GDT/LDT segment descriptor tables
- * to figure out what the 32bit base address is.
- *
- * X32 - has TIF_X32 set, but is running in x86_64
- *
- * X86_64 - CS,DS,SS,ES are all zero based.
- */
-static unsigned long code_segment_base(struct pt_regs *regs)
-{
- /*
- * If we are in VM86 mode, add the segment offset to convert to a
- * linear address.
- */
- if (regs->flags & X86_VM_MASK)
- return 0x10 * regs->cs;
-
- /*
- * For IA32 we look at the GDT/LDT segment base to convert the
- * effective IP to a linear address.
- */
-#ifdef CONFIG_X86_32
- if (user_mode(regs) && regs->cs != __USER_CS)
- return get_segment_base(regs->cs);
-#else
- if (test_thread_flag(TIF_IA32)) {
- if (user_mode(regs) && regs->cs != __USER32_CS)
- return get_segment_base(regs->cs);
- }
-#endif
- return 0;
-}
-
-unsigned long perf_instruction_pointer(struct pt_regs *regs)
-{
- if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
- return perf_guest_cbs->get_guest_ip();
-
- return regs->ip + code_segment_base(regs);
-}
-
-unsigned long perf_misc_flags(struct pt_regs *regs)
-{
- int misc = 0;
-
- if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
- if (perf_guest_cbs->is_user_mode())
- misc |= PERF_RECORD_MISC_GUEST_USER;
- else
- misc |= PERF_RECORD_MISC_GUEST_KERNEL;
- } else {
- if (user_mode(regs))
- misc |= PERF_RECORD_MISC_USER;
- else
- misc |= PERF_RECORD_MISC_KERNEL;
- }
-
- if (regs->flags & PERF_EFLAGS_EXACT)
- misc |= PERF_RECORD_MISC_EXACT_IP;
-
- return misc;
-}
-
-void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
-{
- cap->version = x86_pmu.version;
- cap->num_counters_gp = x86_pmu.num_counters;
- cap->num_counters_fixed = x86_pmu.num_counters_fixed;
- cap->bit_width_gp = x86_pmu.cntval_bits;
- cap->bit_width_fixed = x86_pmu.cntval_bits;
- cap->events_mask = (unsigned int)x86_pmu.events_maskl;
- cap->events_mask_len = x86_pmu.events_mask_len;
-}
-EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
deleted file mode 100644
index 97e557bc4c91..000000000000
--- a/arch/x86/kernel/cpu/perf_event.h
+++ /dev/null
@@ -1,717 +0,0 @@
-/*
- * Performance events x86 architecture header
- *
- * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
- * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
- * Copyright (C) 2009 Jaswinder Singh Rajput
- * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
- * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
- * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
- * Copyright (C) 2009 Google, Inc., Stephane Eranian
- *
- * For licencing details see kernel-base/COPYING
- */
-
-#include <linux/perf_event.h>
-
-#if 0
-#undef wrmsrl
-#define wrmsrl(msr, val) \
-do { \
- unsigned int _msr = (msr); \
- u64 _val = (val); \
- trace_printk("wrmsrl(%x, %Lx)\n", (unsigned int)(_msr), \
- (unsigned long long)(_val)); \
- native_write_msr((_msr), (u32)(_val), (u32)(_val >> 32)); \
-} while (0)
-#endif
-
-/*
- * | NHM/WSM | SNB |
- * register -------------------------------
- * | HT | no HT | HT | no HT |
- *-----------------------------------------
- * offcore | core | core | cpu | core |
- * lbr_sel | core | core | cpu | core |
- * ld_lat | cpu | core | cpu | core |
- *-----------------------------------------
- *
- * Given that there is a small number of shared regs,
- * we can pre-allocate their slot in the per-cpu
- * per-core reg tables.
- */
-enum extra_reg_type {
- EXTRA_REG_NONE = -1, /* not used */
-
- EXTRA_REG_RSP_0 = 0, /* offcore_response_0 */
- EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */
- EXTRA_REG_LBR = 2, /* lbr_select */
- EXTRA_REG_LDLAT = 3, /* ld_lat_threshold */
-
- EXTRA_REG_MAX /* number of entries needed */
-};
-
-struct event_constraint {
- union {
- unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
- u64 idxmsk64;
- };
- u64 code;
- u64 cmask;
- int weight;
- int overlap;
- int flags;
-};
-/*
- * struct hw_perf_event.flags flags
- */
-#define PERF_X86_EVENT_PEBS_LDLAT 0x1 /* ld+ldlat data address sampling */
-#define PERF_X86_EVENT_PEBS_ST 0x2 /* st data address sampling */
-#define PERF_X86_EVENT_PEBS_ST_HSW 0x4 /* haswell style st data sampling */
-#define PERF_X86_EVENT_COMMITTED 0x8 /* event passed commit_txn */
-
-struct amd_nb {
- int nb_id; /* NorthBridge id */
- int refcnt; /* reference count */
- struct perf_event *owners[X86_PMC_IDX_MAX];
- struct event_constraint event_constraints[X86_PMC_IDX_MAX];
-};
-
-/* The maximal number of PEBS events: */
-#define MAX_PEBS_EVENTS 8
-
-/*
- * A debug store configuration.
- *
- * We only support architectures that use 64bit fields.
- */
-struct debug_store {
- u64 bts_buffer_base;
- u64 bts_index;
- u64 bts_absolute_maximum;
- u64 bts_interrupt_threshold;
- u64 pebs_buffer_base;
- u64 pebs_index;
- u64 pebs_absolute_maximum;
- u64 pebs_interrupt_threshold;
- u64 pebs_event_reset[MAX_PEBS_EVENTS];
-};
-
-/*
- * Per register state.
- */
-struct er_account {
- raw_spinlock_t lock; /* per-core: protect structure */
- u64 config; /* extra MSR config */
- u64 reg; /* extra MSR number */
- atomic_t ref; /* reference count */
-};
-
-/*
- * Per core/cpu state
- *
- * Used to coordinate shared registers between HT threads or
- * among events on a single PMU.
- */
-struct intel_shared_regs {
- struct er_account regs[EXTRA_REG_MAX];
- int refcnt; /* per-core: #HT threads */
- unsigned core_id; /* per-core: core id */
-};
-
-#define MAX_LBR_ENTRIES 16
-
-struct cpu_hw_events {
- /*
- * Generic x86 PMC bits
- */
- struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */
- unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
- unsigned long running[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
- int enabled;
-
- int n_events;
- int n_added;
- int n_txn;
- int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
- u64 tags[X86_PMC_IDX_MAX];
- struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
-
- unsigned int group_flag;
- int is_fake;
-
- /*
- * Intel DebugStore bits
- */
- struct debug_store *ds;
- u64 pebs_enabled;
-
- /*
- * Intel LBR bits
- */
- int lbr_users;
- void *lbr_context;
- struct perf_branch_stack lbr_stack;
- struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
- struct er_account *lbr_sel;
- u64 br_sel;
-
- /*
- * Intel host/guest exclude bits
- */
- u64 intel_ctrl_guest_mask;
- u64 intel_ctrl_host_mask;
- struct perf_guest_switch_msr guest_switch_msrs[X86_PMC_IDX_MAX];
-
- /*
- * manage shared (per-core, per-cpu) registers
- * used on Intel NHM/WSM/SNB
- */
- struct intel_shared_regs *shared_regs;
-
- /*
- * AMD specific bits
- */
- struct amd_nb *amd_nb;
- /* Inverted mask of bits to clear in the perf_ctr ctrl registers */
- u64 perf_ctr_virt_mask;
-
- void *kfree_on_online;
-};
-
-#define __EVENT_CONSTRAINT(c, n, m, w, o, f) {\
- { .idxmsk64 = (n) }, \
- .code = (c), \
- .cmask = (m), \
- .weight = (w), \
- .overlap = (o), \
- .flags = f, \
-}
-
-#define EVENT_CONSTRAINT(c, n, m) \
- __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0, 0)
-
-/*
- * The overlap flag marks event constraints with overlapping counter
- * masks. This is the case if the counter mask of such an event is not
- * a subset of any other counter mask of a constraint with an equal or
- * higher weight, e.g.:
- *
- * c_overlaps = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0);
- * c_another1 = EVENT_CONSTRAINT(0, 0x07, 0);
- * c_another2 = EVENT_CONSTRAINT(0, 0x38, 0);
- *
- * The event scheduler may not select the correct counter in the first
- * cycle because it needs to know which subsequent events will be
- * scheduled. It may fail to schedule the events then. So we set the
- * overlap flag for such constraints to give the scheduler a hint which
- * events to select for counter rescheduling.
- *
- * Care must be taken as the rescheduling algorithm is O(n!) which
- * will increase scheduling cycles for an over-commited system
- * dramatically. The number of such EVENT_CONSTRAINT_OVERLAP() macros
- * and its counter masks must be kept at a minimum.
- */
-#define EVENT_CONSTRAINT_OVERLAP(c, n, m) \
- __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 1, 0)
-
-/*
- * Constraint on the Event code.
- */
-#define INTEL_EVENT_CONSTRAINT(c, n) \
- EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT)
-
-/*
- * Constraint on the Event code + UMask + fixed-mask
- *
- * filter mask to validate fixed counter events.
- * the following filters disqualify for fixed counters:
- * - inv
- * - edge
- * - cnt-mask
- * - in_tx
- * - in_tx_checkpointed
- * The other filters are supported by fixed counters.
- * The any-thread option is supported starting with v3.
- */
-#define FIXED_EVENT_FLAGS (X86_RAW_EVENT_MASK|HSW_IN_TX|HSW_IN_TX_CHECKPOINTED)
-#define FIXED_EVENT_CONSTRAINT(c, n) \
- EVENT_CONSTRAINT(c, (1ULL << (32+n)), FIXED_EVENT_FLAGS)
-
-/*
- * Constraint on the Event code + UMask
- */
-#define INTEL_UEVENT_CONSTRAINT(c, n) \
- EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
-
-#define INTEL_PLD_CONSTRAINT(c, n) \
- __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \
- HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT)
-
-#define INTEL_PST_CONSTRAINT(c, n) \
- __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \
- HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST)
-
-/* DataLA version of store sampling without extra enable bit. */
-#define INTEL_PST_HSW_CONSTRAINT(c, n) \
- __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \
- HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW)
-
-#define EVENT_CONSTRAINT_END \
- EVENT_CONSTRAINT(0, 0, 0)
-
-#define for_each_event_constraint(e, c) \
- for ((e) = (c); (e)->weight; (e)++)
-
-/*
- * Extra registers for specific events.
- *
- * Some events need large masks and require external MSRs.
- * Those extra MSRs end up being shared for all events on
- * a PMU and sometimes between PMU of sibling HT threads.
- * In either case, the kernel needs to handle conflicting
- * accesses to those extra, shared, regs. The data structure
- * to manage those registers is stored in cpu_hw_event.
- */
-struct extra_reg {
- unsigned int event;
- unsigned int msr;
- u64 config_mask;
- u64 valid_mask;
- int idx; /* per_xxx->regs[] reg index */
-};
-
-#define EVENT_EXTRA_REG(e, ms, m, vm, i) { \
- .event = (e), \
- .msr = (ms), \
- .config_mask = (m), \
- .valid_mask = (vm), \
- .idx = EXTRA_REG_##i, \
- }
-
-#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx) \
- EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx)
-
-#define INTEL_UEVENT_EXTRA_REG(event, msr, vm, idx) \
- EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT | \
- ARCH_PERFMON_EVENTSEL_UMASK, vm, idx)
-
-#define INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(c) \
- INTEL_UEVENT_EXTRA_REG(c, \
- MSR_PEBS_LD_LAT_THRESHOLD, \
- 0xffff, \
- LDLAT)
-
-#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0)
-
-union perf_capabilities {
- struct {
- u64 lbr_format:6;
- u64 pebs_trap:1;
- u64 pebs_arch_reg:1;
- u64 pebs_format:4;
- u64 smm_freeze:1;
- /*
- * PMU supports separate counter range for writing
- * values > 32bit.
- */
- u64 full_width_write:1;
- };
- u64 capabilities;
-};
-
-struct x86_pmu_quirk {
- struct x86_pmu_quirk *next;
- void (*func)(void);
-};
-
-union x86_pmu_config {
- struct {
- u64 event:8,
- umask:8,
- usr:1,
- os:1,
- edge:1,
- pc:1,
- interrupt:1,
- __reserved1:1,
- en:1,
- inv:1,
- cmask:8,
- event2:4,
- __reserved2:4,
- go:1,
- ho:1;
- } bits;
- u64 value;
-};
-
-#define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value
-
-/*
- * struct x86_pmu - generic x86 pmu
- */
-struct x86_pmu {
- /*
- * Generic x86 PMC bits
- */
- const char *name;
- int version;
- int (*handle_irq)(struct pt_regs *);
- void (*disable_all)(void);
- void (*enable_all)(int added);
- void (*enable)(struct perf_event *);
- void (*disable)(struct perf_event *);
- int (*hw_config)(struct perf_event *event);
- int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
- unsigned eventsel;
- unsigned perfctr;
- int (*addr_offset)(int index, bool eventsel);
- int (*rdpmc_index)(int index);
- u64 (*event_map)(int);
- int max_events;
- int num_counters;
- int num_counters_fixed;
- int cntval_bits;
- u64 cntval_mask;
- union {
- unsigned long events_maskl;
- unsigned long events_mask[BITS_TO_LONGS(ARCH_PERFMON_EVENTS_COUNT)];
- };
- int events_mask_len;
- int apic;
- u64 max_period;
- struct event_constraint *
- (*get_event_constraints)(struct cpu_hw_events *cpuc,
- struct perf_event *event);
-
- void (*put_event_constraints)(struct cpu_hw_events *cpuc,
- struct perf_event *event);
- struct event_constraint *event_constraints;
- struct x86_pmu_quirk *quirks;
- int perfctr_second_write;
- bool late_ack;
-
- /*
- * sysfs attrs
- */
- int attr_rdpmc;
- struct attribute **format_attrs;
- struct attribute **event_attrs;
-
- ssize_t (*events_sysfs_show)(char *page, u64 config);
- struct attribute **cpu_events;
-
- /*
- * CPU Hotplug hooks
- */
- int (*cpu_prepare)(int cpu);
- void (*cpu_starting)(int cpu);
- void (*cpu_dying)(int cpu);
- void (*cpu_dead)(int cpu);
-
- void (*check_microcode)(void);
- void (*flush_branch_stack)(void);
-
- /*
- * Intel Arch Perfmon v2+
- */
- u64 intel_ctrl;
- union perf_capabilities intel_cap;
-
- /*
- * Intel DebugStore bits
- */
- unsigned int bts :1,
- bts_active :1,
- pebs :1,
- pebs_active :1,
- pebs_broken :1;
- int pebs_record_size;
- void (*drain_pebs)(struct pt_regs *regs);
- struct event_constraint *pebs_constraints;
- void (*pebs_aliases)(struct perf_event *event);
- int max_pebs_events;
-
- /*
- * Intel LBR
- */
- unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */
- int lbr_nr; /* hardware stack size */
- u64 lbr_sel_mask; /* LBR_SELECT valid bits */
- const int *lbr_sel_map; /* lbr_select mappings */
-
- /*
- * Extra registers for events
- */
- struct extra_reg *extra_regs;
- unsigned int er_flags;
-
- /*
- * Intel host/guest support (KVM)
- */
- struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr);
-};
-
-#define x86_add_quirk(func_) \
-do { \
- static struct x86_pmu_quirk __quirk __initdata = { \
- .func = func_, \
- }; \
- __quirk.next = x86_pmu.quirks; \
- x86_pmu.quirks = &__quirk; \
-} while (0)
-
-#define ERF_NO_HT_SHARING 1
-#define ERF_HAS_RSP_1 2
-
-#define EVENT_VAR(_id) event_attr_##_id
-#define EVENT_PTR(_id) &event_attr_##_id.attr.attr
-
-#define EVENT_ATTR(_name, _id) \
-static struct perf_pmu_events_attr EVENT_VAR(_id) = { \
- .attr = __ATTR(_name, 0444, events_sysfs_show, NULL), \
- .id = PERF_COUNT_HW_##_id, \
- .event_str = NULL, \
-};
-
-#define EVENT_ATTR_STR(_name, v, str) \
-static struct perf_pmu_events_attr event_attr_##v = { \
- .attr = __ATTR(_name, 0444, events_sysfs_show, NULL), \
- .id = 0, \
- .event_str = str, \
-};
-
-extern struct x86_pmu x86_pmu __read_mostly;
-
-DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
-
-int x86_perf_event_set_period(struct perf_event *event);
-
-/*
- * Generalized hw caching related hw_event table, filled
- * in on a per model basis. A value of 0 means
- * 'not supported', -1 means 'hw_event makes no sense on
- * this CPU', any other value means the raw hw_event
- * ID.
- */
-
-#define C(x) PERF_COUNT_HW_CACHE_##x
-
-extern u64 __read_mostly hw_cache_event_ids
- [PERF_COUNT_HW_CACHE_MAX]
- [PERF_COUNT_HW_CACHE_OP_MAX]
- [PERF_COUNT_HW_CACHE_RESULT_MAX];
-extern u64 __read_mostly hw_cache_extra_regs
- [PERF_COUNT_HW_CACHE_MAX]
- [PERF_COUNT_HW_CACHE_OP_MAX]
- [PERF_COUNT_HW_CACHE_RESULT_MAX];
-
-u64 x86_perf_event_update(struct perf_event *event);
-
-static inline unsigned int x86_pmu_config_addr(int index)
-{
- return x86_pmu.eventsel + (x86_pmu.addr_offset ?
- x86_pmu.addr_offset(index, true) : index);
-}
-
-static inline unsigned int x86_pmu_event_addr(int index)
-{
- return x86_pmu.perfctr + (x86_pmu.addr_offset ?
- x86_pmu.addr_offset(index, false) : index);
-}
-
-static inline int x86_pmu_rdpmc_index(int index)
-{
- return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index;
-}
-
-int x86_setup_perfctr(struct perf_event *event);
-
-int x86_pmu_hw_config(struct perf_event *event);
-
-void x86_pmu_disable_all(void);
-
-static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
- u64 enable_mask)
-{
- u64 disable_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask);
-
- if (hwc->extra_reg.reg)
- wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config);
- wrmsrl(hwc->config_base, (hwc->config | enable_mask) & ~disable_mask);
-}
-
-void x86_pmu_enable_all(int added);
-
-int perf_assign_events(struct perf_event **events, int n,
- int wmin, int wmax, int *assign);
-int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign);
-
-void x86_pmu_stop(struct perf_event *event, int flags);
-
-static inline void x86_pmu_disable_event(struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
-
- wrmsrl(hwc->config_base, hwc->config);
-}
-
-void x86_pmu_enable_event(struct perf_event *event);
-
-int x86_pmu_handle_irq(struct pt_regs *regs);
-
-extern struct event_constraint emptyconstraint;
-
-extern struct event_constraint unconstrained;
-
-static inline bool kernel_ip(unsigned long ip)
-{
-#ifdef CONFIG_X86_32
- return ip > PAGE_OFFSET;
-#else
- return (long)ip < 0;
-#endif
-}
-
-/*
- * Not all PMUs provide the right context information to place the reported IP
- * into full context. Specifically segment registers are typically not
- * supplied.
- *
- * Assuming the address is a linear address (it is for IBS), we fake the CS and
- * vm86 mode using the known zero-based code segment and 'fix up' the registers
- * to reflect this.
- *
- * Intel PEBS/LBR appear to typically provide the effective address, nothing
- * much we can do about that but pray and treat it like a linear address.
- */
-static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip)
-{
- regs->cs = kernel_ip(ip) ? __KERNEL_CS : __USER_CS;
- if (regs->flags & X86_VM_MASK)
- regs->flags ^= (PERF_EFLAGS_VM | X86_VM_MASK);
- regs->ip = ip;
-}
-
-ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event);
-ssize_t intel_event_sysfs_show(char *page, u64 config);
-
-#ifdef CONFIG_CPU_SUP_AMD
-
-int amd_pmu_init(void);
-
-#else /* CONFIG_CPU_SUP_AMD */
-
-static inline int amd_pmu_init(void)
-{
- return 0;
-}
-
-#endif /* CONFIG_CPU_SUP_AMD */
-
-#ifdef CONFIG_CPU_SUP_INTEL
-
-int intel_pmu_save_and_restart(struct perf_event *event);
-
-struct event_constraint *
-x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event);
-
-struct intel_shared_regs *allocate_shared_regs(int cpu);
-
-int intel_pmu_init(void);
-
-void init_debug_store_on_cpu(int cpu);
-
-void fini_debug_store_on_cpu(int cpu);
-
-void release_ds_buffers(void);
-
-void reserve_ds_buffers(void);
-
-extern struct event_constraint bts_constraint;
-
-void intel_pmu_enable_bts(u64 config);
-
-void intel_pmu_disable_bts(void);
-
-int intel_pmu_drain_bts_buffer(void);
-
-extern struct event_constraint intel_core2_pebs_event_constraints[];
-
-extern struct event_constraint intel_atom_pebs_event_constraints[];
-
-extern struct event_constraint intel_nehalem_pebs_event_constraints[];
-
-extern struct event_constraint intel_westmere_pebs_event_constraints[];
-
-extern struct event_constraint intel_snb_pebs_event_constraints[];
-
-extern struct event_constraint intel_ivb_pebs_event_constraints[];
-
-extern struct event_constraint intel_hsw_pebs_event_constraints[];
-
-struct event_constraint *intel_pebs_constraints(struct perf_event *event);
-
-void intel_pmu_pebs_enable(struct perf_event *event);
-
-void intel_pmu_pebs_disable(struct perf_event *event);
-
-void intel_pmu_pebs_enable_all(void);
-
-void intel_pmu_pebs_disable_all(void);
-
-void intel_ds_init(void);
-
-void intel_pmu_lbr_reset(void);
-
-void intel_pmu_lbr_enable(struct perf_event *event);
-
-void intel_pmu_lbr_disable(struct perf_event *event);
-
-void intel_pmu_lbr_enable_all(void);
-
-void intel_pmu_lbr_disable_all(void);
-
-void intel_pmu_lbr_read(void);
-
-void intel_pmu_lbr_init_core(void);
-
-void intel_pmu_lbr_init_nhm(void);
-
-void intel_pmu_lbr_init_atom(void);
-
-void intel_pmu_lbr_init_snb(void);
-
-int intel_pmu_setup_lbr_filter(struct perf_event *event);
-
-int p4_pmu_init(void);
-
-int p6_pmu_init(void);
-
-int knc_pmu_init(void);
-
-ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
- char *page);
-
-#else /* CONFIG_CPU_SUP_INTEL */
-
-static inline void reserve_ds_buffers(void)
-{
-}
-
-static inline void release_ds_buffers(void)
-{
-}
-
-static inline int intel_pmu_init(void)
-{
- return 0;
-}
-
-static inline struct intel_shared_regs *allocate_shared_regs(int cpu)
-{
- return NULL;
-}
-
-#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
deleted file mode 100644
index 4cbe03287b08..000000000000
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ /dev/null
@@ -1,729 +0,0 @@
-#include <linux/perf_event.h>
-#include <linux/export.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <asm/apicdef.h>
-
-#include "perf_event.h"
-
-static __initconst const u64 amd_hw_cache_event_ids
- [PERF_COUNT_HW_CACHE_MAX]
- [PERF_COUNT_HW_CACHE_OP_MAX]
- [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
- [ C(RESULT_MISS) ] = 0x0141, /* Data Cache Misses */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
- [ C(RESULT_MISS) ] = 0,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */
- [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */
- },
- },
- [ C(L1I ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */
- [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(LL ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */
- [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */
- [ C(RESULT_MISS) ] = 0,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(DTLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
- [ C(RESULT_MISS) ] = 0x0746, /* L1_DTLB_AND_L2_DLTB_MISS.ALL */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(ITLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */
- [ C(RESULT_MISS) ] = 0x0385, /* L1_ITLB_AND_L2_ITLB_MISS.ALL */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
- [ C(BPU ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */
- [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
- [ C(NODE) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0xb8e9, /* CPU Request to Memory, l+r */
- [ C(RESULT_MISS) ] = 0x98e9, /* CPU Request to Memory, r */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
-};
-
-/*
- * AMD Performance Monitor K7 and later.
- */
-static const u64 amd_perfmon_event_map[] =
-{
- [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
- [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
- [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
- [PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
- [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2,
- [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3,
- [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */
- [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x00d1, /* "Dispatch stalls" event */
-};
-
-static u64 amd_pmu_event_map(int hw_event)
-{
- return amd_perfmon_event_map[hw_event];
-}
-
-/*
- * Previously calculated offsets
- */
-static unsigned int event_offsets[X86_PMC_IDX_MAX] __read_mostly;
-static unsigned int count_offsets[X86_PMC_IDX_MAX] __read_mostly;
-
-/*
- * Legacy CPUs:
- * 4 counters starting at 0xc0010000 each offset by 1
- *
- * CPUs with core performance counter extensions:
- * 6 counters starting at 0xc0010200 each offset by 2
- */
-static inline int amd_pmu_addr_offset(int index, bool eventsel)
-{
- int offset;
-
- if (!index)
- return index;
-
- if (eventsel)
- offset = event_offsets[index];
- else
- offset = count_offsets[index];
-
- if (offset)
- return offset;
-
- if (!cpu_has_perfctr_core)
- offset = index;
- else
- offset = index << 1;
-
- if (eventsel)
- event_offsets[index] = offset;
- else
- count_offsets[index] = offset;
-
- return offset;
-}
-
-static int amd_core_hw_config(struct perf_event *event)
-{
- if (event->attr.exclude_host && event->attr.exclude_guest)
- /*
- * When HO == GO == 1 the hardware treats that as GO == HO == 0
- * and will count in both modes. We don't want to count in that
- * case so we emulate no-counting by setting US = OS = 0.
- */
- event->hw.config &= ~(ARCH_PERFMON_EVENTSEL_USR |
- ARCH_PERFMON_EVENTSEL_OS);
- else if (event->attr.exclude_host)
- event->hw.config |= AMD64_EVENTSEL_GUESTONLY;
- else if (event->attr.exclude_guest)
- event->hw.config |= AMD64_EVENTSEL_HOSTONLY;
-
- return 0;
-}
-
-/*
- * AMD64 events are detected based on their event codes.
- */
-static inline unsigned int amd_get_event_code(struct hw_perf_event *hwc)
-{
- return ((hwc->config >> 24) & 0x0f00) | (hwc->config & 0x00ff);
-}
-
-static inline int amd_is_nb_event(struct hw_perf_event *hwc)
-{
- return (hwc->config & 0xe0) == 0xe0;
-}
-
-static inline int amd_has_nb(struct cpu_hw_events *cpuc)
-{
- struct amd_nb *nb = cpuc->amd_nb;
-
- return nb && nb->nb_id != -1;
-}
-
-static int amd_pmu_hw_config(struct perf_event *event)
-{
- int ret;
-
- /* pass precise event sampling to ibs: */
- if (event->attr.precise_ip && get_ibs_caps())
- return -ENOENT;
-
- if (has_branch_stack(event))
- return -EOPNOTSUPP;
-
- ret = x86_pmu_hw_config(event);
- if (ret)
- return ret;
-
- if (event->attr.type == PERF_TYPE_RAW)
- event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK;
-
- return amd_core_hw_config(event);
-}
-
-static void __amd_put_nb_event_constraints(struct cpu_hw_events *cpuc,
- struct perf_event *event)
-{
- struct amd_nb *nb = cpuc->amd_nb;
- int i;
-
- /*
- * need to scan whole list because event may not have
- * been assigned during scheduling
- *
- * no race condition possible because event can only
- * be removed on one CPU at a time AND PMU is disabled
- * when we come here
- */
- for (i = 0; i < x86_pmu.num_counters; i++) {
- if (cmpxchg(nb->owners + i, event, NULL) == event)
- break;
- }
-}
-
- /*
- * AMD64 NorthBridge events need special treatment because
- * counter access needs to be synchronized across all cores
- * of a package. Refer to BKDG section 3.12
- *
- * NB events are events measuring L3 cache, Hypertransport
- * traffic. They are identified by an event code >= 0xe00.
- * They measure events on the NorthBride which is shared
- * by all cores on a package. NB events are counted on a
- * shared set of counters. When a NB event is programmed
- * in a counter, the data actually comes from a shared
- * counter. Thus, access to those counters needs to be
- * synchronized.
- *
- * We implement the synchronization such that no two cores
- * can be measuring NB events using the same counters. Thus,
- * we maintain a per-NB allocation table. The available slot
- * is propagated using the event_constraint structure.
- *
- * We provide only one choice for each NB event based on
- * the fact that only NB events have restrictions. Consequently,
- * if a counter is available, there is a guarantee the NB event
- * will be assigned to it. If no slot is available, an empty
- * constraint is returned and scheduling will eventually fail
- * for this event.
- *
- * Note that all cores attached the same NB compete for the same
- * counters to host NB events, this is why we use atomic ops. Some
- * multi-chip CPUs may have more than one NB.
- *
- * Given that resources are allocated (cmpxchg), they must be
- * eventually freed for others to use. This is accomplished by
- * calling __amd_put_nb_event_constraints()
- *
- * Non NB events are not impacted by this restriction.
- */
-static struct event_constraint *
-__amd_get_nb_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
- struct event_constraint *c)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct amd_nb *nb = cpuc->amd_nb;
- struct perf_event *old;
- int idx, new = -1;
-
- if (!c)
- c = &unconstrained;
-
- if (cpuc->is_fake)
- return c;
-
- /*
- * detect if already present, if so reuse
- *
- * cannot merge with actual allocation
- * because of possible holes
- *
- * event can already be present yet not assigned (in hwc->idx)
- * because of successive calls to x86_schedule_events() from
- * hw_perf_group_sched_in() without hw_perf_enable()
- */
- for_each_set_bit(idx, c->idxmsk, x86_pmu.num_counters) {
- if (new == -1 || hwc->idx == idx)
- /* assign free slot, prefer hwc->idx */
- old = cmpxchg(nb->owners + idx, NULL, event);
- else if (nb->owners[idx] == event)
- /* event already present */
- old = event;
- else
- continue;
-
- if (old && old != event)
- continue;
-
- /* reassign to this slot */
- if (new != -1)
- cmpxchg(nb->owners + new, event, NULL);
- new = idx;
-
- /* already present, reuse */
- if (old == event)
- break;
- }
-
- if (new == -1)
- return &emptyconstraint;
-
- return &nb->event_constraints[new];
-}
-
-static struct amd_nb *amd_alloc_nb(int cpu)
-{
- struct amd_nb *nb;
- int i;
-
- nb = kmalloc_node(sizeof(struct amd_nb), GFP_KERNEL | __GFP_ZERO,
- cpu_to_node(cpu));
- if (!nb)
- return NULL;
-
- nb->nb_id = -1;
-
- /*
- * initialize all possible NB constraints
- */
- for (i = 0; i < x86_pmu.num_counters; i++) {
- __set_bit(i, nb->event_constraints[i].idxmsk);
- nb->event_constraints[i].weight = 1;
- }
- return nb;
-}
-
-static int amd_pmu_cpu_prepare(int cpu)
-{
- struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
-
- WARN_ON_ONCE(cpuc->amd_nb);
-
- if (boot_cpu_data.x86_max_cores < 2)
- return NOTIFY_OK;
-
- cpuc->amd_nb = amd_alloc_nb(cpu);
- if (!cpuc->amd_nb)
- return NOTIFY_BAD;
-
- return NOTIFY_OK;
-}
-
-static void amd_pmu_cpu_starting(int cpu)
-{
- struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
- struct amd_nb *nb;
- int i, nb_id;
-
- cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY;
-
- if (boot_cpu_data.x86_max_cores < 2)
- return;
-
- nb_id = amd_get_nb_id(cpu);
- WARN_ON_ONCE(nb_id == BAD_APICID);
-
- for_each_online_cpu(i) {
- nb = per_cpu(cpu_hw_events, i).amd_nb;
- if (WARN_ON_ONCE(!nb))
- continue;
-
- if (nb->nb_id == nb_id) {
- cpuc->kfree_on_online = cpuc->amd_nb;
- cpuc->amd_nb = nb;
- break;
- }
- }
-
- cpuc->amd_nb->nb_id = nb_id;
- cpuc->amd_nb->refcnt++;
-}
-
-static void amd_pmu_cpu_dead(int cpu)
-{
- struct cpu_hw_events *cpuhw;
-
- if (boot_cpu_data.x86_max_cores < 2)
- return;
-
- cpuhw = &per_cpu(cpu_hw_events, cpu);
-
- if (cpuhw->amd_nb) {
- struct amd_nb *nb = cpuhw->amd_nb;
-
- if (nb->nb_id == -1 || --nb->refcnt == 0)
- kfree(nb);
-
- cpuhw->amd_nb = NULL;
- }
-}
-
-static struct event_constraint *
-amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
-{
- /*
- * if not NB event or no NB, then no constraints
- */
- if (!(amd_has_nb(cpuc) && amd_is_nb_event(&event->hw)))
- return &unconstrained;
-
- return __amd_get_nb_event_constraints(cpuc, event, NULL);
-}
-
-static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
- struct perf_event *event)
-{
- if (amd_has_nb(cpuc) && amd_is_nb_event(&event->hw))
- __amd_put_nb_event_constraints(cpuc, event);
-}
-
-PMU_FORMAT_ATTR(event, "config:0-7,32-35");
-PMU_FORMAT_ATTR(umask, "config:8-15" );
-PMU_FORMAT_ATTR(edge, "config:18" );
-PMU_FORMAT_ATTR(inv, "config:23" );
-PMU_FORMAT_ATTR(cmask, "config:24-31" );
-
-static struct attribute *amd_format_attr[] = {
- &format_attr_event.attr,
- &format_attr_umask.attr,
- &format_attr_edge.attr,
- &format_attr_inv.attr,
- &format_attr_cmask.attr,
- NULL,
-};
-
-/* AMD Family 15h */
-
-#define AMD_EVENT_TYPE_MASK 0x000000F0ULL
-
-#define AMD_EVENT_FP 0x00000000ULL ... 0x00000010ULL
-#define AMD_EVENT_LS 0x00000020ULL ... 0x00000030ULL
-#define AMD_EVENT_DC 0x00000040ULL ... 0x00000050ULL
-#define AMD_EVENT_CU 0x00000060ULL ... 0x00000070ULL
-#define AMD_EVENT_IC_DE 0x00000080ULL ... 0x00000090ULL
-#define AMD_EVENT_EX_LS 0x000000C0ULL
-#define AMD_EVENT_DE 0x000000D0ULL
-#define AMD_EVENT_NB 0x000000E0ULL ... 0x000000F0ULL
-
-/*
- * AMD family 15h event code/PMC mappings:
- *
- * type = event_code & 0x0F0:
- *
- * 0x000 FP PERF_CTL[5:3]
- * 0x010 FP PERF_CTL[5:3]
- * 0x020 LS PERF_CTL[5:0]
- * 0x030 LS PERF_CTL[5:0]
- * 0x040 DC PERF_CTL[5:0]
- * 0x050 DC PERF_CTL[5:0]
- * 0x060 CU PERF_CTL[2:0]
- * 0x070 CU PERF_CTL[2:0]
- * 0x080 IC/DE PERF_CTL[2:0]
- * 0x090 IC/DE PERF_CTL[2:0]
- * 0x0A0 ---
- * 0x0B0 ---
- * 0x0C0 EX/LS PERF_CTL[5:0]
- * 0x0D0 DE PERF_CTL[2:0]
- * 0x0E0 NB NB_PERF_CTL[3:0]
- * 0x0F0 NB NB_PERF_CTL[3:0]
- *
- * Exceptions:
- *
- * 0x000 FP PERF_CTL[3], PERF_CTL[5:3] (*)
- * 0x003 FP PERF_CTL[3]
- * 0x004 FP PERF_CTL[3], PERF_CTL[5:3] (*)
- * 0x00B FP PERF_CTL[3]
- * 0x00D FP PERF_CTL[3]
- * 0x023 DE PERF_CTL[2:0]
- * 0x02D LS PERF_CTL[3]
- * 0x02E LS PERF_CTL[3,0]
- * 0x031 LS PERF_CTL[2:0] (**)
- * 0x043 CU PERF_CTL[2:0]
- * 0x045 CU PERF_CTL[2:0]
- * 0x046 CU PERF_CTL[2:0]
- * 0x054 CU PERF_CTL[2:0]
- * 0x055 CU PERF_CTL[2:0]
- * 0x08F IC PERF_CTL[0]
- * 0x187 DE PERF_CTL[0]
- * 0x188 DE PERF_CTL[0]
- * 0x0DB EX PERF_CTL[5:0]
- * 0x0DC LS PERF_CTL[5:0]
- * 0x0DD LS PERF_CTL[5:0]
- * 0x0DE LS PERF_CTL[5:0]
- * 0x0DF LS PERF_CTL[5:0]
- * 0x1C0 EX PERF_CTL[5:3]
- * 0x1D6 EX PERF_CTL[5:0]
- * 0x1D8 EX PERF_CTL[5:0]
- *
- * (*) depending on the umask all FPU counters may be used
- * (**) only one unitmask enabled at a time
- */
-
-static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0);
-static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0);
-static struct event_constraint amd_f15_PMC3 = EVENT_CONSTRAINT(0, 0x08, 0);
-static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0);
-static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0);
-static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0);
-
-static struct event_constraint *
-amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- unsigned int event_code = amd_get_event_code(hwc);
-
- switch (event_code & AMD_EVENT_TYPE_MASK) {
- case AMD_EVENT_FP:
- switch (event_code) {
- case 0x000:
- if (!(hwc->config & 0x0000F000ULL))
- break;
- if (!(hwc->config & 0x00000F00ULL))
- break;
- return &amd_f15_PMC3;
- case 0x004:
- if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1)
- break;
- return &amd_f15_PMC3;
- case 0x003:
- case 0x00B:
- case 0x00D:
- return &amd_f15_PMC3;
- }
- return &amd_f15_PMC53;
- case AMD_EVENT_LS:
- case AMD_EVENT_DC:
- case AMD_EVENT_EX_LS:
- switch (event_code) {
- case 0x023:
- case 0x043:
- case 0x045:
- case 0x046:
- case 0x054:
- case 0x055:
- return &amd_f15_PMC20;
- case 0x02D:
- return &amd_f15_PMC3;
- case 0x02E:
- return &amd_f15_PMC30;
- case 0x031:
- if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1)
- return &amd_f15_PMC20;
- return &emptyconstraint;
- case 0x1C0:
- return &amd_f15_PMC53;
- default:
- return &amd_f15_PMC50;
- }
- case AMD_EVENT_CU:
- case AMD_EVENT_IC_DE:
- case AMD_EVENT_DE:
- switch (event_code) {
- case 0x08F:
- case 0x187:
- case 0x188:
- return &amd_f15_PMC0;
- case 0x0DB ... 0x0DF:
- case 0x1D6:
- case 0x1D8:
- return &amd_f15_PMC50;
- default:
- return &amd_f15_PMC20;
- }
- case AMD_EVENT_NB:
- /* moved to perf_event_amd_uncore.c */
- return &emptyconstraint;
- default:
- return &emptyconstraint;
- }
-}
-
-static ssize_t amd_event_sysfs_show(char *page, u64 config)
-{
- u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT) |
- (config & AMD64_EVENTSEL_EVENT) >> 24;
-
- return x86_event_sysfs_show(page, config, event);
-}
-
-static __initconst const struct x86_pmu amd_pmu = {
- .name = "AMD",
- .handle_irq = x86_pmu_handle_irq,
- .disable_all = x86_pmu_disable_all,
- .enable_all = x86_pmu_enable_all,
- .enable = x86_pmu_enable_event,
- .disable = x86_pmu_disable_event,
- .hw_config = amd_pmu_hw_config,
- .schedule_events = x86_schedule_events,
- .eventsel = MSR_K7_EVNTSEL0,
- .perfctr = MSR_K7_PERFCTR0,
- .addr_offset = amd_pmu_addr_offset,
- .event_map = amd_pmu_event_map,
- .max_events = ARRAY_SIZE(amd_perfmon_event_map),
- .num_counters = AMD64_NUM_COUNTERS,
- .cntval_bits = 48,
- .cntval_mask = (1ULL << 48) - 1,
- .apic = 1,
- /* use highest bit to detect overflow */
- .max_period = (1ULL << 47) - 1,
- .get_event_constraints = amd_get_event_constraints,
- .put_event_constraints = amd_put_event_constraints,
-
- .format_attrs = amd_format_attr,
- .events_sysfs_show = amd_event_sysfs_show,
-
- .cpu_prepare = amd_pmu_cpu_prepare,
- .cpu_starting = amd_pmu_cpu_starting,
- .cpu_dead = amd_pmu_cpu_dead,
-};
-
-static int __init amd_core_pmu_init(void)
-{
- if (!cpu_has_perfctr_core)
- return 0;
-
- switch (boot_cpu_data.x86) {
- case 0x15:
- pr_cont("Fam15h ");
- x86_pmu.get_event_constraints = amd_get_event_constraints_f15h;
- break;
-
- default:
- pr_err("core perfctr but no constraints; unknown hardware!\n");
- return -ENODEV;
- }
-
- /*
- * If core performance counter extensions exists, we must use
- * MSR_F15H_PERF_CTL/MSR_F15H_PERF_CTR msrs. See also
- * amd_pmu_addr_offset().
- */
- x86_pmu.eventsel = MSR_F15H_PERF_CTL;
- x86_pmu.perfctr = MSR_F15H_PERF_CTR;
- x86_pmu.num_counters = AMD64_NUM_COUNTERS_CORE;
-
- pr_cont("core perfctr, ");
- return 0;
-}
-
-__init int amd_pmu_init(void)
-{
- int ret;
-
- /* Performance-monitoring supported from K7 and later: */
- if (boot_cpu_data.x86 < 6)
- return -ENODEV;
-
- x86_pmu = amd_pmu;
-
- ret = amd_core_pmu_init();
- if (ret)
- return ret;
-
- /* Events are common for all AMDs */
- memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
- sizeof(hw_cache_event_ids));
-
- return 0;
-}
-
-void amd_pmu_enable_virt(void)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-
- cpuc->perf_ctr_virt_mask = 0;
-
- /* Reload all events */
- x86_pmu_disable_all();
- x86_pmu_enable_all(0);
-}
-EXPORT_SYMBOL_GPL(amd_pmu_enable_virt);
-
-void amd_pmu_disable_virt(void)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-
- /*
- * We only mask out the Host-only bit so that host-only counting works
- * when SVM is disabled. If someone sets up a guest-only counter when
- * SVM is disabled the Guest-only bits still gets set and the counter
- * will not count anything.
- */
- cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY;
-
- /* Reload all events */
- x86_pmu_disable_all();
- x86_pmu_enable_all(0);
-}
-EXPORT_SYMBOL_GPL(amd_pmu_disable_virt);
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
deleted file mode 100644
index 5f0581e713c2..000000000000
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ /dev/null
@@ -1,908 +0,0 @@
-/*
- * Performance events - AMD IBS
- *
- * Copyright (C) 2011 Advanced Micro Devices, Inc., Robert Richter
- *
- * For licencing details see kernel-base/COPYING
- */
-
-#include <linux/perf_event.h>
-#include <linux/module.h>
-#include <linux/pci.h>
-#include <linux/ptrace.h>
-
-#include <asm/apic.h>
-
-#include "perf_event.h"
-
-static u32 ibs_caps;
-
-#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
-
-#include <linux/kprobes.h>
-#include <linux/hardirq.h>
-
-#include <asm/nmi.h>
-
-#define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT)
-#define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT
-
-enum ibs_states {
- IBS_ENABLED = 0,
- IBS_STARTED = 1,
- IBS_STOPPING = 2,
-
- IBS_MAX_STATES,
-};
-
-struct cpu_perf_ibs {
- struct perf_event *event;
- unsigned long state[BITS_TO_LONGS(IBS_MAX_STATES)];
-};
-
-struct perf_ibs {
- struct pmu pmu;
- unsigned int msr;
- u64 config_mask;
- u64 cnt_mask;
- u64 enable_mask;
- u64 valid_mask;
- u64 max_period;
- unsigned long offset_mask[1];
- int offset_max;
- struct cpu_perf_ibs __percpu *pcpu;
-
- struct attribute **format_attrs;
- struct attribute_group format_group;
- const struct attribute_group *attr_groups[2];
-
- u64 (*get_count)(u64 config);
-};
-
-struct perf_ibs_data {
- u32 size;
- union {
- u32 data[0]; /* data buffer starts here */
- u32 caps;
- };
- u64 regs[MSR_AMD64_IBS_REG_COUNT_MAX];
-};
-
-static int
-perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period)
-{
- s64 left = local64_read(&hwc->period_left);
- s64 period = hwc->sample_period;
- int overflow = 0;
-
- /*
- * If we are way outside a reasonable range then just skip forward:
- */
- if (unlikely(left <= -period)) {
- left = period;
- local64_set(&hwc->period_left, left);
- hwc->last_period = period;
- overflow = 1;
- }
-
- if (unlikely(left < (s64)min)) {
- left += period;
- local64_set(&hwc->period_left, left);
- hwc->last_period = period;
- overflow = 1;
- }
-
- /*
- * If the hw period that triggers the sw overflow is too short
- * we might hit the irq handler. This biases the results.
- * Thus we shorten the next-to-last period and set the last
- * period to the max period.
- */
- if (left > max) {
- left -= max;
- if (left > max)
- left = max;
- else if (left < min)
- left = min;
- }
-
- *hw_period = (u64)left;
-
- return overflow;
-}
-
-static int
-perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width)
-{
- struct hw_perf_event *hwc = &event->hw;
- int shift = 64 - width;
- u64 prev_raw_count;
- u64 delta;
-
- /*
- * Careful: an NMI might modify the previous event value.
- *
- * Our tactic to handle this is to first atomically read and
- * exchange a new raw count - then add that new-prev delta
- * count to the generic event atomically:
- */
- prev_raw_count = local64_read(&hwc->prev_count);
- if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
- new_raw_count) != prev_raw_count)
- return 0;
-
- /*
- * Now we have the new raw value and have updated the prev
- * timestamp already. We can now calculate the elapsed delta
- * (event-)time and add that to the generic event.
- *
- * Careful, not all hw sign-extends above the physical width
- * of the count.
- */
- delta = (new_raw_count << shift) - (prev_raw_count << shift);
- delta >>= shift;
-
- local64_add(delta, &event->count);
- local64_sub(delta, &hwc->period_left);
-
- return 1;
-}
-
-static struct perf_ibs perf_ibs_fetch;
-static struct perf_ibs perf_ibs_op;
-
-static struct perf_ibs *get_ibs_pmu(int type)
-{
- if (perf_ibs_fetch.pmu.type == type)
- return &perf_ibs_fetch;
- if (perf_ibs_op.pmu.type == type)
- return &perf_ibs_op;
- return NULL;
-}
-
-/*
- * Use IBS for precise event sampling:
- *
- * perf record -a -e cpu-cycles:p ... # use ibs op counting cycle count
- * perf record -a -e r076:p ... # same as -e cpu-cycles:p
- * perf record -a -e r0C1:p ... # use ibs op counting micro-ops
- *
- * IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl,
- * MSRC001_1033) is used to select either cycle or micro-ops counting
- * mode.
- *
- * The rip of IBS samples has skid 0. Thus, IBS supports precise
- * levels 1 and 2 and the PERF_EFLAGS_EXACT is set. In rare cases the
- * rip is invalid when IBS was not able to record the rip correctly.
- * We clear PERF_EFLAGS_EXACT and take the rip from pt_regs then.
- *
- */
-static int perf_ibs_precise_event(struct perf_event *event, u64 *config)
-{
- switch (event->attr.precise_ip) {
- case 0:
- return -ENOENT;
- case 1:
- case 2:
- break;
- default:
- return -EOPNOTSUPP;
- }
-
- switch (event->attr.type) {
- case PERF_TYPE_HARDWARE:
- switch (event->attr.config) {
- case PERF_COUNT_HW_CPU_CYCLES:
- *config = 0;
- return 0;
- }
- break;
- case PERF_TYPE_RAW:
- switch (event->attr.config) {
- case 0x0076:
- *config = 0;
- return 0;
- case 0x00C1:
- *config = IBS_OP_CNT_CTL;
- return 0;
- }
- break;
- default:
- return -ENOENT;
- }
-
- return -EOPNOTSUPP;
-}
-
-static const struct perf_event_attr ibs_notsupp = {
- .exclude_user = 1,
- .exclude_kernel = 1,
- .exclude_hv = 1,
- .exclude_idle = 1,
- .exclude_host = 1,
- .exclude_guest = 1,
-};
-
-static int perf_ibs_init(struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct perf_ibs *perf_ibs;
- u64 max_cnt, config;
- int ret;
-
- perf_ibs = get_ibs_pmu(event->attr.type);
- if (perf_ibs) {
- config = event->attr.config;
- } else {
- perf_ibs = &perf_ibs_op;
- ret = perf_ibs_precise_event(event, &config);
- if (ret)
- return ret;
- }
-
- if (event->pmu != &perf_ibs->pmu)
- return -ENOENT;
-
- if (perf_flags(&event->attr) & perf_flags(&ibs_notsupp))
- return -EINVAL;
-
- if (config & ~perf_ibs->config_mask)
- return -EINVAL;
-
- if (hwc->sample_period) {
- if (config & perf_ibs->cnt_mask)
- /* raw max_cnt may not be set */
- return -EINVAL;
- if (!event->attr.sample_freq && hwc->sample_period & 0x0f)
- /*
- * lower 4 bits can not be set in ibs max cnt,
- * but allowing it in case we adjust the
- * sample period to set a frequency.
- */
- return -EINVAL;
- hwc->sample_period &= ~0x0FULL;
- if (!hwc->sample_period)
- hwc->sample_period = 0x10;
- } else {
- max_cnt = config & perf_ibs->cnt_mask;
- config &= ~perf_ibs->cnt_mask;
- event->attr.sample_period = max_cnt << 4;
- hwc->sample_period = event->attr.sample_period;
- }
-
- if (!hwc->sample_period)
- return -EINVAL;
-
- /*
- * If we modify hwc->sample_period, we also need to update
- * hwc->last_period and hwc->period_left.
- */
- hwc->last_period = hwc->sample_period;
- local64_set(&hwc->period_left, hwc->sample_period);
-
- hwc->config_base = perf_ibs->msr;
- hwc->config = config;
-
- return 0;
-}
-
-static int perf_ibs_set_period(struct perf_ibs *perf_ibs,
- struct hw_perf_event *hwc, u64 *period)
-{
- int overflow;
-
- /* ignore lower 4 bits in min count: */
- overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period);
- local64_set(&hwc->prev_count, 0);
-
- return overflow;
-}
-
-static u64 get_ibs_fetch_count(u64 config)
-{
- return (config & IBS_FETCH_CNT) >> 12;
-}
-
-static u64 get_ibs_op_count(u64 config)
-{
- u64 count = 0;
-
- if (config & IBS_OP_VAL)
- count += (config & IBS_OP_MAX_CNT) << 4; /* cnt rolled over */
-
- if (ibs_caps & IBS_CAPS_RDWROPCNT)
- count += (config & IBS_OP_CUR_CNT) >> 32;
-
- return count;
-}
-
-static void
-perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event,
- u64 *config)
-{
- u64 count = perf_ibs->get_count(*config);
-
- /*
- * Set width to 64 since we do not overflow on max width but
- * instead on max count. In perf_ibs_set_period() we clear
- * prev count manually on overflow.
- */
- while (!perf_event_try_update(event, count, 64)) {
- rdmsrl(event->hw.config_base, *config);
- count = perf_ibs->get_count(*config);
- }
-}
-
-static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs,
- struct hw_perf_event *hwc, u64 config)
-{
- wrmsrl(hwc->config_base, hwc->config | config | perf_ibs->enable_mask);
-}
-
-/*
- * Erratum #420 Instruction-Based Sampling Engine May Generate
- * Interrupt that Cannot Be Cleared:
- *
- * Must clear counter mask first, then clear the enable bit. See
- * Revision Guide for AMD Family 10h Processors, Publication #41322.
- */
-static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs,
- struct hw_perf_event *hwc, u64 config)
-{
- config &= ~perf_ibs->cnt_mask;
- wrmsrl(hwc->config_base, config);
- config &= ~perf_ibs->enable_mask;
- wrmsrl(hwc->config_base, config);
-}
-
-/*
- * We cannot restore the ibs pmu state, so we always needs to update
- * the event while stopping it and then reset the state when starting
- * again. Thus, ignoring PERF_EF_RELOAD and PERF_EF_UPDATE flags in
- * perf_ibs_start()/perf_ibs_stop() and instead always do it.
- */
-static void perf_ibs_start(struct perf_event *event, int flags)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
- struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
- u64 period;
-
- if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
- return;
-
- WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
- hwc->state = 0;
-
- perf_ibs_set_period(perf_ibs, hwc, &period);
- set_bit(IBS_STARTED, pcpu->state);
- perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
-
- perf_event_update_userpage(event);
-}
-
-static void perf_ibs_stop(struct perf_event *event, int flags)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
- struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
- u64 config;
- int stopping;
-
- stopping = test_and_clear_bit(IBS_STARTED, pcpu->state);
-
- if (!stopping && (hwc->state & PERF_HES_UPTODATE))
- return;
-
- rdmsrl(hwc->config_base, config);
-
- if (stopping) {
- set_bit(IBS_STOPPING, pcpu->state);
- perf_ibs_disable_event(perf_ibs, hwc, config);
- WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
- hwc->state |= PERF_HES_STOPPED;
- }
-
- if (hwc->state & PERF_HES_UPTODATE)
- return;
-
- /*
- * Clear valid bit to not count rollovers on update, rollovers
- * are only updated in the irq handler.
- */
- config &= ~perf_ibs->valid_mask;
-
- perf_ibs_event_update(perf_ibs, event, &config);
- hwc->state |= PERF_HES_UPTODATE;
-}
-
-static int perf_ibs_add(struct perf_event *event, int flags)
-{
- struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
- struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
-
- if (test_and_set_bit(IBS_ENABLED, pcpu->state))
- return -ENOSPC;
-
- event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
-
- pcpu->event = event;
-
- if (flags & PERF_EF_START)
- perf_ibs_start(event, PERF_EF_RELOAD);
-
- return 0;
-}
-
-static void perf_ibs_del(struct perf_event *event, int flags)
-{
- struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
- struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
-
- if (!test_and_clear_bit(IBS_ENABLED, pcpu->state))
- return;
-
- perf_ibs_stop(event, PERF_EF_UPDATE);
-
- pcpu->event = NULL;
-
- perf_event_update_userpage(event);
-}
-
-static void perf_ibs_read(struct perf_event *event) { }
-
-PMU_FORMAT_ATTR(rand_en, "config:57");
-PMU_FORMAT_ATTR(cnt_ctl, "config:19");
-
-static struct attribute *ibs_fetch_format_attrs[] = {
- &format_attr_rand_en.attr,
- NULL,
-};
-
-static struct attribute *ibs_op_format_attrs[] = {
- NULL, /* &format_attr_cnt_ctl.attr if IBS_CAPS_OPCNT */
- NULL,
-};
-
-static struct perf_ibs perf_ibs_fetch = {
- .pmu = {
- .task_ctx_nr = perf_invalid_context,
-
- .event_init = perf_ibs_init,
- .add = perf_ibs_add,
- .del = perf_ibs_del,
- .start = perf_ibs_start,
- .stop = perf_ibs_stop,
- .read = perf_ibs_read,
- },
- .msr = MSR_AMD64_IBSFETCHCTL,
- .config_mask = IBS_FETCH_CONFIG_MASK,
- .cnt_mask = IBS_FETCH_MAX_CNT,
- .enable_mask = IBS_FETCH_ENABLE,
- .valid_mask = IBS_FETCH_VAL,
- .max_period = IBS_FETCH_MAX_CNT << 4,
- .offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK },
- .offset_max = MSR_AMD64_IBSFETCH_REG_COUNT,
- .format_attrs = ibs_fetch_format_attrs,
-
- .get_count = get_ibs_fetch_count,
-};
-
-static struct perf_ibs perf_ibs_op = {
- .pmu = {
- .task_ctx_nr = perf_invalid_context,
-
- .event_init = perf_ibs_init,
- .add = perf_ibs_add,
- .del = perf_ibs_del,
- .start = perf_ibs_start,
- .stop = perf_ibs_stop,
- .read = perf_ibs_read,
- },
- .msr = MSR_AMD64_IBSOPCTL,
- .config_mask = IBS_OP_CONFIG_MASK,
- .cnt_mask = IBS_OP_MAX_CNT,
- .enable_mask = IBS_OP_ENABLE,
- .valid_mask = IBS_OP_VAL,
- .max_period = IBS_OP_MAX_CNT << 4,
- .offset_mask = { MSR_AMD64_IBSOP_REG_MASK },
- .offset_max = MSR_AMD64_IBSOP_REG_COUNT,
- .format_attrs = ibs_op_format_attrs,
-
- .get_count = get_ibs_op_count,
-};
-
-static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
-{
- struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
- struct perf_event *event = pcpu->event;
- struct hw_perf_event *hwc = &event->hw;
- struct perf_sample_data data;
- struct perf_raw_record raw;
- struct pt_regs regs;
- struct perf_ibs_data ibs_data;
- int offset, size, check_rip, offset_max, throttle = 0;
- unsigned int msr;
- u64 *buf, *config, period;
-
- if (!test_bit(IBS_STARTED, pcpu->state)) {
- /*
- * Catch spurious interrupts after stopping IBS: After
- * disabling IBS there could be still incoming NMIs
- * with samples that even have the valid bit cleared.
- * Mark all this NMIs as handled.
- */
- return test_and_clear_bit(IBS_STOPPING, pcpu->state) ? 1 : 0;
- }
-
- msr = hwc->config_base;
- buf = ibs_data.regs;
- rdmsrl(msr, *buf);
- if (!(*buf++ & perf_ibs->valid_mask))
- return 0;
-
- config = &ibs_data.regs[0];
- perf_ibs_event_update(perf_ibs, event, config);
- perf_sample_data_init(&data, 0, hwc->last_period);
- if (!perf_ibs_set_period(perf_ibs, hwc, &period))
- goto out; /* no sw counter overflow */
-
- ibs_data.caps = ibs_caps;
- size = 1;
- offset = 1;
- check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK));
- if (event->attr.sample_type & PERF_SAMPLE_RAW)
- offset_max = perf_ibs->offset_max;
- else if (check_rip)
- offset_max = 2;
- else
- offset_max = 1;
- do {
- rdmsrl(msr + offset, *buf++);
- size++;
- offset = find_next_bit(perf_ibs->offset_mask,
- perf_ibs->offset_max,
- offset + 1);
- } while (offset < offset_max);
- ibs_data.size = sizeof(u64) * size;
-
- regs = *iregs;
- if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) {
- regs.flags &= ~PERF_EFLAGS_EXACT;
- } else {
- set_linear_ip(&regs, ibs_data.regs[1]);
- regs.flags |= PERF_EFLAGS_EXACT;
- }
-
- if (event->attr.sample_type & PERF_SAMPLE_RAW) {
- raw.size = sizeof(u32) + ibs_data.size;
- raw.data = ibs_data.data;
- data.raw = &raw;
- }
-
- throttle = perf_event_overflow(event, &data, &regs);
-out:
- if (throttle)
- perf_ibs_disable_event(perf_ibs, hwc, *config);
- else
- perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
-
- perf_event_update_userpage(event);
-
- return 1;
-}
-
-static int __kprobes
-perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)
-{
- int handled = 0;
-
- handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs);
- handled += perf_ibs_handle_irq(&perf_ibs_op, regs);
-
- if (handled)
- inc_irq_stat(apic_perf_irqs);
-
- return handled;
-}
-
-static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name)
-{
- struct cpu_perf_ibs __percpu *pcpu;
- int ret;
-
- pcpu = alloc_percpu(struct cpu_perf_ibs);
- if (!pcpu)
- return -ENOMEM;
-
- perf_ibs->pcpu = pcpu;
-
- /* register attributes */
- if (perf_ibs->format_attrs[0]) {
- memset(&perf_ibs->format_group, 0, sizeof(perf_ibs->format_group));
- perf_ibs->format_group.name = "format";
- perf_ibs->format_group.attrs = perf_ibs->format_attrs;
-
- memset(&perf_ibs->attr_groups, 0, sizeof(perf_ibs->attr_groups));
- perf_ibs->attr_groups[0] = &perf_ibs->format_group;
- perf_ibs->pmu.attr_groups = perf_ibs->attr_groups;
- }
-
- ret = perf_pmu_register(&perf_ibs->pmu, name, -1);
- if (ret) {
- perf_ibs->pcpu = NULL;
- free_percpu(pcpu);
- }
-
- return ret;
-}
-
-static __init int perf_event_ibs_init(void)
-{
- struct attribute **attr = ibs_op_format_attrs;
-
- if (!ibs_caps)
- return -ENODEV; /* ibs not supported by the cpu */
-
- perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch");
-
- if (ibs_caps & IBS_CAPS_OPCNT) {
- perf_ibs_op.config_mask |= IBS_OP_CNT_CTL;
- *attr++ = &format_attr_cnt_ctl.attr;
- }
- perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");
-
- register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs");
- printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps);
-
- return 0;
-}
-
-#else /* defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) */
-
-static __init int perf_event_ibs_init(void) { return 0; }
-
-#endif
-
-/* IBS - apic initialization, for perf and oprofile */
-
-static __init u32 __get_ibs_caps(void)
-{
- u32 caps;
- unsigned int max_level;
-
- if (!boot_cpu_has(X86_FEATURE_IBS))
- return 0;
-
- /* check IBS cpuid feature flags */
- max_level = cpuid_eax(0x80000000);
- if (max_level < IBS_CPUID_FEATURES)
- return IBS_CAPS_DEFAULT;
-
- caps = cpuid_eax(IBS_CPUID_FEATURES);
- if (!(caps & IBS_CAPS_AVAIL))
- /* cpuid flags not valid */
- return IBS_CAPS_DEFAULT;
-
- return caps;
-}
-
-u32 get_ibs_caps(void)
-{
- return ibs_caps;
-}
-
-EXPORT_SYMBOL(get_ibs_caps);
-
-static inline int get_eilvt(int offset)
-{
- return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1);
-}
-
-static inline int put_eilvt(int offset)
-{
- return !setup_APIC_eilvt(offset, 0, 0, 1);
-}
-
-/*
- * Check and reserve APIC extended interrupt LVT offset for IBS if available.
- */
-static inline int ibs_eilvt_valid(void)
-{
- int offset;
- u64 val;
- int valid = 0;
-
- preempt_disable();
-
- rdmsrl(MSR_AMD64_IBSCTL, val);
- offset = val & IBSCTL_LVT_OFFSET_MASK;
-
- if (!(val & IBSCTL_LVT_OFFSET_VALID)) {
- pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n",
- smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
- goto out;
- }
-
- if (!get_eilvt(offset)) {
- pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n",
- smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
- goto out;
- }
-
- valid = 1;
-out:
- preempt_enable();
-
- return valid;
-}
-
-static int setup_ibs_ctl(int ibs_eilvt_off)
-{
- struct pci_dev *cpu_cfg;
- int nodes;
- u32 value = 0;
-
- nodes = 0;
- cpu_cfg = NULL;
- do {
- cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD,
- PCI_DEVICE_ID_AMD_10H_NB_MISC,
- cpu_cfg);
- if (!cpu_cfg)
- break;
- ++nodes;
- pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off
- | IBSCTL_LVT_OFFSET_VALID);
- pci_read_config_dword(cpu_cfg, IBSCTL, &value);
- if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) {
- pci_dev_put(cpu_cfg);
- printk(KERN_DEBUG "Failed to setup IBS LVT offset, "
- "IBSCTL = 0x%08x\n", value);
- return -EINVAL;
- }
- } while (1);
-
- if (!nodes) {
- printk(KERN_DEBUG "No CPU node configured for IBS\n");
- return -ENODEV;
- }
-
- return 0;
-}
-
-/*
- * This runs only on the current cpu. We try to find an LVT offset and
- * setup the local APIC. For this we must disable preemption. On
- * success we initialize all nodes with this offset. This updates then
- * the offset in the IBS_CTL per-node msr. The per-core APIC setup of
- * the IBS interrupt vector is handled by perf_ibs_cpu_notifier that
- * is using the new offset.
- */
-static int force_ibs_eilvt_setup(void)
-{
- int offset;
- int ret;
-
- preempt_disable();
- /* find the next free available EILVT entry, skip offset 0 */
- for (offset = 1; offset < APIC_EILVT_NR_MAX; offset++) {
- if (get_eilvt(offset))
- break;
- }
- preempt_enable();
-
- if (offset == APIC_EILVT_NR_MAX) {
- printk(KERN_DEBUG "No EILVT entry available\n");
- return -EBUSY;
- }
-
- ret = setup_ibs_ctl(offset);
- if (ret)
- goto out;
-
- if (!ibs_eilvt_valid()) {
- ret = -EFAULT;
- goto out;
- }
-
- pr_info("IBS: LVT offset %d assigned\n", offset);
-
- return 0;
-out:
- preempt_disable();
- put_eilvt(offset);
- preempt_enable();
- return ret;
-}
-
-static inline int get_ibs_lvt_offset(void)
-{
- u64 val;
-
- rdmsrl(MSR_AMD64_IBSCTL, val);
- if (!(val & IBSCTL_LVT_OFFSET_VALID))
- return -EINVAL;
-
- return val & IBSCTL_LVT_OFFSET_MASK;
-}
-
-static void setup_APIC_ibs(void *dummy)
-{
- int offset;
-
- offset = get_ibs_lvt_offset();
- if (offset < 0)
- goto failed;
-
- if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0))
- return;
-failed:
- pr_warn("perf: IBS APIC setup failed on cpu #%d\n",
- smp_processor_id());
-}
-
-static void clear_APIC_ibs(void *dummy)
-{
- int offset;
-
- offset = get_ibs_lvt_offset();
- if (offset >= 0)
- setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1);
-}
-
-static int __cpuinit
-perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
-{
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_STARTING:
- setup_APIC_ibs(NULL);
- break;
- case CPU_DYING:
- clear_APIC_ibs(NULL);
- break;
- default:
- break;
- }
-
- return NOTIFY_OK;
-}
-
-static __init int amd_ibs_init(void)
-{
- u32 caps;
- int ret = -EINVAL;
-
- caps = __get_ibs_caps();
- if (!caps)
- return -ENODEV; /* ibs not supported by the cpu */
-
- /*
- * Force LVT offset assignment for family 10h: The offsets are
- * not assigned by the BIOS for this family, so the OS is
- * responsible for doing it. If the OS assignment fails, fall
- * back to BIOS settings and try to setup this.
- */
- if (boot_cpu_data.x86 == 0x10)
- force_ibs_eilvt_setup();
-
- if (!ibs_eilvt_valid())
- goto out;
-
- get_online_cpus();
- ibs_caps = caps;
- /* make ibs_caps visible to other cpus: */
- smp_mb();
- perf_cpu_notifier(perf_ibs_cpu_notifier);
- smp_call_function(setup_APIC_ibs, NULL, 1);
- put_online_cpus();
-
- ret = perf_event_ibs_init();
-out:
- if (ret)
- pr_err("Failed to setup IBS, %d\n", ret);
- return ret;
-}
-
-/* Since we need the pci subsystem to init ibs we can't do this earlier: */
-device_initcall(amd_ibs_init);
diff --git a/arch/x86/kernel/cpu/perf_event_amd_iommu.c b/arch/x86/kernel/cpu/perf_event_amd_iommu.c
deleted file mode 100644
index 639d1289b1ba..000000000000
--- a/arch/x86/kernel/cpu/perf_event_amd_iommu.c
+++ /dev/null
@@ -1,502 +0,0 @@
-/*
- * Copyright (C) 2013 Advanced Micro Devices, Inc.
- *
- * Author: Steven Kinney <Steven.Kinney@amd.com>
- * Author: Suravee Suthikulpanit <Suraveee.Suthikulpanit@amd.com>
- *
- * Perf: amd_iommu - AMD IOMMU Performance Counter PMU implementation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/perf_event.h>
-#include <linux/module.h>
-#include <linux/cpumask.h>
-#include <linux/slab.h>
-
-#include "perf_event.h"
-#include "perf_event_amd_iommu.h"
-
-#define COUNTER_SHIFT 16
-
-#define _GET_BANK(ev) ((u8)(ev->hw.extra_reg.reg >> 8))
-#define _GET_CNTR(ev) ((u8)(ev->hw.extra_reg.reg))
-
-/* iommu pmu config masks */
-#define _GET_CSOURCE(ev) ((ev->hw.config & 0xFFULL))
-#define _GET_DEVID(ev) ((ev->hw.config >> 8) & 0xFFFFULL)
-#define _GET_PASID(ev) ((ev->hw.config >> 24) & 0xFFFFULL)
-#define _GET_DOMID(ev) ((ev->hw.config >> 40) & 0xFFFFULL)
-#define _GET_DEVID_MASK(ev) ((ev->hw.extra_reg.config) & 0xFFFFULL)
-#define _GET_PASID_MASK(ev) ((ev->hw.extra_reg.config >> 16) & 0xFFFFULL)
-#define _GET_DOMID_MASK(ev) ((ev->hw.extra_reg.config >> 32) & 0xFFFFULL)
-
-static struct perf_amd_iommu __perf_iommu;
-
-struct perf_amd_iommu {
- struct pmu pmu;
- u8 max_banks;
- u8 max_counters;
- u64 cntr_assign_mask;
- raw_spinlock_t lock;
- const struct attribute_group *attr_groups[4];
-};
-
-#define format_group attr_groups[0]
-#define cpumask_group attr_groups[1]
-#define events_group attr_groups[2]
-#define null_group attr_groups[3]
-
-/*---------------------------------------------
- * sysfs format attributes
- *---------------------------------------------*/
-PMU_FORMAT_ATTR(csource, "config:0-7");
-PMU_FORMAT_ATTR(devid, "config:8-23");
-PMU_FORMAT_ATTR(pasid, "config:24-39");
-PMU_FORMAT_ATTR(domid, "config:40-55");
-PMU_FORMAT_ATTR(devid_mask, "config1:0-15");
-PMU_FORMAT_ATTR(pasid_mask, "config1:16-31");
-PMU_FORMAT_ATTR(domid_mask, "config1:32-47");
-
-static struct attribute *iommu_format_attrs[] = {
- &format_attr_csource.attr,
- &format_attr_devid.attr,
- &format_attr_pasid.attr,
- &format_attr_domid.attr,
- &format_attr_devid_mask.attr,
- &format_attr_pasid_mask.attr,
- &format_attr_domid_mask.attr,
- NULL,
-};
-
-static struct attribute_group amd_iommu_format_group = {
- .name = "format",
- .attrs = iommu_format_attrs,
-};
-
-/*---------------------------------------------
- * sysfs events attributes
- *---------------------------------------------*/
-struct amd_iommu_event_desc {
- struct kobj_attribute attr;
- const char *event;
-};
-
-static ssize_t _iommu_event_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- struct amd_iommu_event_desc *event =
- container_of(attr, struct amd_iommu_event_desc, attr);
- return sprintf(buf, "%s\n", event->event);
-}
-
-#define AMD_IOMMU_EVENT_DESC(_name, _event) \
-{ \
- .attr = __ATTR(_name, 0444, _iommu_event_show, NULL), \
- .event = _event, \
-}
-
-static struct amd_iommu_event_desc amd_iommu_v2_event_descs[] = {
- AMD_IOMMU_EVENT_DESC(mem_pass_untrans, "csource=0x01"),
- AMD_IOMMU_EVENT_DESC(mem_pass_pretrans, "csource=0x02"),
- AMD_IOMMU_EVENT_DESC(mem_pass_excl, "csource=0x03"),
- AMD_IOMMU_EVENT_DESC(mem_target_abort, "csource=0x04"),
- AMD_IOMMU_EVENT_DESC(mem_trans_total, "csource=0x05"),
- AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pte_hit, "csource=0x06"),
- AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pte_mis, "csource=0x07"),
- AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pde_hit, "csource=0x08"),
- AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pde_mis, "csource=0x09"),
- AMD_IOMMU_EVENT_DESC(mem_dte_hit, "csource=0x0a"),
- AMD_IOMMU_EVENT_DESC(mem_dte_mis, "csource=0x0b"),
- AMD_IOMMU_EVENT_DESC(page_tbl_read_tot, "csource=0x0c"),
- AMD_IOMMU_EVENT_DESC(page_tbl_read_nst, "csource=0x0d"),
- AMD_IOMMU_EVENT_DESC(page_tbl_read_gst, "csource=0x0e"),
- AMD_IOMMU_EVENT_DESC(int_dte_hit, "csource=0x0f"),
- AMD_IOMMU_EVENT_DESC(int_dte_mis, "csource=0x10"),
- AMD_IOMMU_EVENT_DESC(cmd_processed, "csource=0x11"),
- AMD_IOMMU_EVENT_DESC(cmd_processed_inv, "csource=0x12"),
- AMD_IOMMU_EVENT_DESC(tlb_inv, "csource=0x13"),
- { /* end: all zeroes */ },
-};
-
-/*---------------------------------------------
- * sysfs cpumask attributes
- *---------------------------------------------*/
-static cpumask_t iommu_cpumask;
-
-static ssize_t _iommu_cpumask_show(struct device *dev,
- struct device_attribute *attr,
- char *buf)
-{
- int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &iommu_cpumask);
- buf[n++] = '\n';
- buf[n] = '\0';
- return n;
-}
-static DEVICE_ATTR(cpumask, S_IRUGO, _iommu_cpumask_show, NULL);
-
-static struct attribute *iommu_cpumask_attrs[] = {
- &dev_attr_cpumask.attr,
- NULL,
-};
-
-static struct attribute_group amd_iommu_cpumask_group = {
- .attrs = iommu_cpumask_attrs,
-};
-
-/*---------------------------------------------*/
-
-static int get_next_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu)
-{
- unsigned long flags;
- int shift, bank, cntr, retval;
- int max_banks = perf_iommu->max_banks;
- int max_cntrs = perf_iommu->max_counters;
-
- raw_spin_lock_irqsave(&perf_iommu->lock, flags);
-
- for (bank = 0, shift = 0; bank < max_banks; bank++) {
- for (cntr = 0; cntr < max_cntrs; cntr++) {
- shift = bank + (bank*3) + cntr;
- if (perf_iommu->cntr_assign_mask & (1ULL<<shift)) {
- continue;
- } else {
- perf_iommu->cntr_assign_mask |= (1ULL<<shift);
- retval = ((u16)((u16)bank<<8) | (u8)(cntr));
- goto out;
- }
- }
- }
- retval = -ENOSPC;
-out:
- raw_spin_unlock_irqrestore(&perf_iommu->lock, flags);
- return retval;
-}
-
-static int clear_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu,
- u8 bank, u8 cntr)
-{
- unsigned long flags;
- int max_banks, max_cntrs;
- int shift = 0;
-
- max_banks = perf_iommu->max_banks;
- max_cntrs = perf_iommu->max_counters;
-
- if ((bank > max_banks) || (cntr > max_cntrs))
- return -EINVAL;
-
- shift = bank + cntr + (bank*3);
-
- raw_spin_lock_irqsave(&perf_iommu->lock, flags);
- perf_iommu->cntr_assign_mask &= ~(1ULL<<shift);
- raw_spin_unlock_irqrestore(&perf_iommu->lock, flags);
-
- return 0;
-}
-
-static int perf_iommu_event_init(struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct perf_amd_iommu *perf_iommu;
- u64 config, config1;
-
- /* test the event attr type check for PMU enumeration */
- if (event->attr.type != event->pmu->type)
- return -ENOENT;
-
- /*
- * IOMMU counters are shared across all cores.
- * Therefore, it does not support per-process mode.
- * Also, it does not support event sampling mode.
- */
- if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
- return -EINVAL;
-
- /* IOMMU counters do not have usr/os/guest/host bits */
- if (event->attr.exclude_user || event->attr.exclude_kernel ||
- event->attr.exclude_host || event->attr.exclude_guest)
- return -EINVAL;
-
- if (event->cpu < 0)
- return -EINVAL;
-
- perf_iommu = &__perf_iommu;
-
- if (event->pmu != &perf_iommu->pmu)
- return -ENOENT;
-
- if (perf_iommu) {
- config = event->attr.config;
- config1 = event->attr.config1;
- } else {
- return -EINVAL;
- }
-
- /* integrate with iommu base devid (0000), assume one iommu */
- perf_iommu->max_banks =
- amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID);
- perf_iommu->max_counters =
- amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID);
- if ((perf_iommu->max_banks == 0) || (perf_iommu->max_counters == 0))
- return -EINVAL;
-
- /* update the hw_perf_event struct with the iommu config data */
- hwc->config = config;
- hwc->extra_reg.config = config1;
-
- return 0;
-}
-
-static void perf_iommu_enable_event(struct perf_event *ev)
-{
- u8 csource = _GET_CSOURCE(ev);
- u16 devid = _GET_DEVID(ev);
- u64 reg = 0ULL;
-
- reg = csource;
- amd_iommu_pc_get_set_reg_val(devid,
- _GET_BANK(ev), _GET_CNTR(ev) ,
- IOMMU_PC_COUNTER_SRC_REG, &reg, true);
-
- reg = 0ULL | devid | (_GET_DEVID_MASK(ev) << 32);
- if (reg)
- reg |= (1UL << 31);
- amd_iommu_pc_get_set_reg_val(devid,
- _GET_BANK(ev), _GET_CNTR(ev) ,
- IOMMU_PC_DEVID_MATCH_REG, &reg, true);
-
- reg = 0ULL | _GET_PASID(ev) | (_GET_PASID_MASK(ev) << 32);
- if (reg)
- reg |= (1UL << 31);
- amd_iommu_pc_get_set_reg_val(devid,
- _GET_BANK(ev), _GET_CNTR(ev) ,
- IOMMU_PC_PASID_MATCH_REG, &reg, true);
-
- reg = 0ULL | _GET_DOMID(ev) | (_GET_DOMID_MASK(ev) << 32);
- if (reg)
- reg |= (1UL << 31);
- amd_iommu_pc_get_set_reg_val(devid,
- _GET_BANK(ev), _GET_CNTR(ev) ,
- IOMMU_PC_DOMID_MATCH_REG, &reg, true);
-}
-
-static void perf_iommu_disable_event(struct perf_event *event)
-{
- u64 reg = 0ULL;
-
- amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
- _GET_BANK(event), _GET_CNTR(event),
- IOMMU_PC_COUNTER_SRC_REG, &reg, true);
-}
-
-static void perf_iommu_start(struct perf_event *event, int flags)
-{
- struct hw_perf_event *hwc = &event->hw;
-
- pr_debug("perf: amd_iommu:perf_iommu_start\n");
- if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
- return;
-
- WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
- hwc->state = 0;
-
- if (flags & PERF_EF_RELOAD) {
- u64 prev_raw_count = local64_read(&hwc->prev_count);
- amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
- _GET_BANK(event), _GET_CNTR(event),
- IOMMU_PC_COUNTER_REG, &prev_raw_count, true);
- }
-
- perf_iommu_enable_event(event);
- perf_event_update_userpage(event);
-
-}
-
-static void perf_iommu_read(struct perf_event *event)
-{
- u64 count = 0ULL;
- u64 prev_raw_count = 0ULL;
- u64 delta = 0ULL;
- struct hw_perf_event *hwc = &event->hw;
- pr_debug("perf: amd_iommu:perf_iommu_read\n");
-
- amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
- _GET_BANK(event), _GET_CNTR(event),
- IOMMU_PC_COUNTER_REG, &count, false);
-
- /* IOMMU pc counter register is only 48 bits */
- count &= 0xFFFFFFFFFFFFULL;
-
- prev_raw_count = local64_read(&hwc->prev_count);
- if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
- count) != prev_raw_count)
- return;
-
- /* Handling 48-bit counter overflowing */
- delta = (count << COUNTER_SHIFT) - (prev_raw_count << COUNTER_SHIFT);
- delta >>= COUNTER_SHIFT;
- local64_add(delta, &event->count);
-
-}
-
-static void perf_iommu_stop(struct perf_event *event, int flags)
-{
- struct hw_perf_event *hwc = &event->hw;
- u64 config;
-
- pr_debug("perf: amd_iommu:perf_iommu_stop\n");
-
- if (hwc->state & PERF_HES_UPTODATE)
- return;
-
- perf_iommu_disable_event(event);
- WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
- hwc->state |= PERF_HES_STOPPED;
-
- if (hwc->state & PERF_HES_UPTODATE)
- return;
-
- config = hwc->config;
- perf_iommu_read(event);
- hwc->state |= PERF_HES_UPTODATE;
-}
-
-static int perf_iommu_add(struct perf_event *event, int flags)
-{
- int retval;
- struct perf_amd_iommu *perf_iommu =
- container_of(event->pmu, struct perf_amd_iommu, pmu);
-
- pr_debug("perf: amd_iommu:perf_iommu_add\n");
- event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
-
- /* request an iommu bank/counter */
- retval = get_next_avail_iommu_bnk_cntr(perf_iommu);
- if (retval != -ENOSPC)
- event->hw.extra_reg.reg = (u16)retval;
- else
- return retval;
-
- if (flags & PERF_EF_START)
- perf_iommu_start(event, PERF_EF_RELOAD);
-
- return 0;
-}
-
-static void perf_iommu_del(struct perf_event *event, int flags)
-{
- struct perf_amd_iommu *perf_iommu =
- container_of(event->pmu, struct perf_amd_iommu, pmu);
-
- pr_debug("perf: amd_iommu:perf_iommu_del\n");
- perf_iommu_stop(event, PERF_EF_UPDATE);
-
- /* clear the assigned iommu bank/counter */
- clear_avail_iommu_bnk_cntr(perf_iommu,
- _GET_BANK(event),
- _GET_CNTR(event));
-
- perf_event_update_userpage(event);
-}
-
-static __init int _init_events_attrs(struct perf_amd_iommu *perf_iommu)
-{
- struct attribute **attrs;
- struct attribute_group *attr_group;
- int i = 0, j;
-
- while (amd_iommu_v2_event_descs[i].attr.attr.name)
- i++;
-
- attr_group = kzalloc(sizeof(struct attribute *)
- * (i + 1) + sizeof(*attr_group), GFP_KERNEL);
- if (!attr_group)
- return -ENOMEM;
-
- attrs = (struct attribute **)(attr_group + 1);
- for (j = 0; j < i; j++)
- attrs[j] = &amd_iommu_v2_event_descs[j].attr.attr;
-
- attr_group->name = "events";
- attr_group->attrs = attrs;
- perf_iommu->events_group = attr_group;
-
- return 0;
-}
-
-static __init void amd_iommu_pc_exit(void)
-{
- if (__perf_iommu.events_group != NULL) {
- kfree(__perf_iommu.events_group);
- __perf_iommu.events_group = NULL;
- }
-}
-
-static __init int _init_perf_amd_iommu(
- struct perf_amd_iommu *perf_iommu, char *name)
-{
- int ret;
-
- raw_spin_lock_init(&perf_iommu->lock);
-
- /* Init format attributes */
- perf_iommu->format_group = &amd_iommu_format_group;
-
- /* Init cpumask attributes to only core 0 */
- cpumask_set_cpu(0, &iommu_cpumask);
- perf_iommu->cpumask_group = &amd_iommu_cpumask_group;
-
- /* Init events attributes */
- if (_init_events_attrs(perf_iommu) != 0)
- pr_err("perf: amd_iommu: Only support raw events.\n");
-
- /* Init null attributes */
- perf_iommu->null_group = NULL;
- perf_iommu->pmu.attr_groups = perf_iommu->attr_groups;
-
- ret = perf_pmu_register(&perf_iommu->pmu, name, -1);
- if (ret) {
- pr_err("perf: amd_iommu: Failed to initialized.\n");
- amd_iommu_pc_exit();
- } else {
- pr_info("perf: amd_iommu: Detected. (%d banks, %d counters/bank)\n",
- amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID),
- amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID));
- }
-
- return ret;
-}
-
-static struct perf_amd_iommu __perf_iommu = {
- .pmu = {
- .event_init = perf_iommu_event_init,
- .add = perf_iommu_add,
- .del = perf_iommu_del,
- .start = perf_iommu_start,
- .stop = perf_iommu_stop,
- .read = perf_iommu_read,
- },
- .max_banks = 0x00,
- .max_counters = 0x00,
- .cntr_assign_mask = 0ULL,
- .format_group = NULL,
- .cpumask_group = NULL,
- .events_group = NULL,
- .null_group = NULL,
-};
-
-static __init int amd_iommu_pc_init(void)
-{
- /* Make sure the IOMMU PC resource is available */
- if (!amd_iommu_pc_supported())
- return -ENODEV;
-
- _init_perf_amd_iommu(&__perf_iommu, "amd_iommu");
-
- return 0;
-}
-
-device_initcall(amd_iommu_pc_init);
diff --git a/arch/x86/kernel/cpu/perf_event_amd_iommu.h b/arch/x86/kernel/cpu/perf_event_amd_iommu.h
deleted file mode 100644
index 845d173278e3..000000000000
--- a/arch/x86/kernel/cpu/perf_event_amd_iommu.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (C) 2013 Advanced Micro Devices, Inc.
- *
- * Author: Steven Kinney <Steven.Kinney@amd.com>
- * Author: Suravee Suthikulpanit <Suraveee.Suthikulpanit@amd.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#ifndef _PERF_EVENT_AMD_IOMMU_H_
-#define _PERF_EVENT_AMD_IOMMU_H_
-
-/* iommu pc mmio region register indexes */
-#define IOMMU_PC_COUNTER_REG 0x00
-#define IOMMU_PC_COUNTER_SRC_REG 0x08
-#define IOMMU_PC_PASID_MATCH_REG 0x10
-#define IOMMU_PC_DOMID_MATCH_REG 0x18
-#define IOMMU_PC_DEVID_MATCH_REG 0x20
-#define IOMMU_PC_COUNTER_REPORT_REG 0x28
-
-/* maximun specified bank/counters */
-#define PC_MAX_SPEC_BNKS 64
-#define PC_MAX_SPEC_CNTRS 16
-
-/* iommu pc reg masks*/
-#define IOMMU_BASE_DEVID 0x0000
-
-/* amd_iommu_init.c external support functions */
-extern bool amd_iommu_pc_supported(void);
-
-extern u8 amd_iommu_pc_get_max_banks(u16 devid);
-
-extern u8 amd_iommu_pc_get_max_counters(u16 devid);
-
-extern int amd_iommu_pc_get_set_reg_val(u16 devid, u8 bank, u8 cntr,
- u8 fxn, u64 *value, bool is_write);
-
-#endif /*_PERF_EVENT_AMD_IOMMU_H_*/
diff --git a/arch/x86/kernel/cpu/perf_event_amd_uncore.c b/arch/x86/kernel/cpu/perf_event_amd_uncore.c
deleted file mode 100644
index c0c661adf03e..000000000000
--- a/arch/x86/kernel/cpu/perf_event_amd_uncore.c
+++ /dev/null
@@ -1,547 +0,0 @@
-/*
- * Copyright (C) 2013 Advanced Micro Devices, Inc.
- *
- * Author: Jacob Shin <jacob.shin@amd.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/perf_event.h>
-#include <linux/percpu.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/cpu.h>
-#include <linux/cpumask.h>
-
-#include <asm/cpufeature.h>
-#include <asm/perf_event.h>
-#include <asm/msr.h>
-
-#define NUM_COUNTERS_NB 4
-#define NUM_COUNTERS_L2 4
-#define MAX_COUNTERS NUM_COUNTERS_NB
-
-#define RDPMC_BASE_NB 6
-#define RDPMC_BASE_L2 10
-
-#define COUNTER_SHIFT 16
-
-struct amd_uncore {
- int id;
- int refcnt;
- int cpu;
- int num_counters;
- int rdpmc_base;
- u32 msr_base;
- cpumask_t *active_mask;
- struct pmu *pmu;
- struct perf_event *events[MAX_COUNTERS];
- struct amd_uncore *free_when_cpu_online;
-};
-
-static struct amd_uncore * __percpu *amd_uncore_nb;
-static struct amd_uncore * __percpu *amd_uncore_l2;
-
-static struct pmu amd_nb_pmu;
-static struct pmu amd_l2_pmu;
-
-static cpumask_t amd_nb_active_mask;
-static cpumask_t amd_l2_active_mask;
-
-static bool is_nb_event(struct perf_event *event)
-{
- return event->pmu->type == amd_nb_pmu.type;
-}
-
-static bool is_l2_event(struct perf_event *event)
-{
- return event->pmu->type == amd_l2_pmu.type;
-}
-
-static struct amd_uncore *event_to_amd_uncore(struct perf_event *event)
-{
- if (is_nb_event(event) && amd_uncore_nb)
- return *per_cpu_ptr(amd_uncore_nb, event->cpu);
- else if (is_l2_event(event) && amd_uncore_l2)
- return *per_cpu_ptr(amd_uncore_l2, event->cpu);
-
- return NULL;
-}
-
-static void amd_uncore_read(struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- u64 prev, new;
- s64 delta;
-
- /*
- * since we do not enable counter overflow interrupts,
- * we do not have to worry about prev_count changing on us
- */
-
- prev = local64_read(&hwc->prev_count);
- rdpmcl(hwc->event_base_rdpmc, new);
- local64_set(&hwc->prev_count, new);
- delta = (new << COUNTER_SHIFT) - (prev << COUNTER_SHIFT);
- delta >>= COUNTER_SHIFT;
- local64_add(delta, &event->count);
-}
-
-static void amd_uncore_start(struct perf_event *event, int flags)
-{
- struct hw_perf_event *hwc = &event->hw;
-
- if (flags & PERF_EF_RELOAD)
- wrmsrl(hwc->event_base, (u64)local64_read(&hwc->prev_count));
-
- hwc->state = 0;
- wrmsrl(hwc->config_base, (hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE));
- perf_event_update_userpage(event);
-}
-
-static void amd_uncore_stop(struct perf_event *event, int flags)
-{
- struct hw_perf_event *hwc = &event->hw;
-
- wrmsrl(hwc->config_base, hwc->config);
- hwc->state |= PERF_HES_STOPPED;
-
- if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
- amd_uncore_read(event);
- hwc->state |= PERF_HES_UPTODATE;
- }
-}
-
-static int amd_uncore_add(struct perf_event *event, int flags)
-{
- int i;
- struct amd_uncore *uncore = event_to_amd_uncore(event);
- struct hw_perf_event *hwc = &event->hw;
-
- /* are we already assigned? */
- if (hwc->idx != -1 && uncore->events[hwc->idx] == event)
- goto out;
-
- for (i = 0; i < uncore->num_counters; i++) {
- if (uncore->events[i] == event) {
- hwc->idx = i;
- goto out;
- }
- }
-
- /* if not, take the first available counter */
- hwc->idx = -1;
- for (i = 0; i < uncore->num_counters; i++) {
- if (cmpxchg(&uncore->events[i], NULL, event) == NULL) {
- hwc->idx = i;
- break;
- }
- }
-
-out:
- if (hwc->idx == -1)
- return -EBUSY;
-
- hwc->config_base = uncore->msr_base + (2 * hwc->idx);
- hwc->event_base = uncore->msr_base + 1 + (2 * hwc->idx);
- hwc->event_base_rdpmc = uncore->rdpmc_base + hwc->idx;
- hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
-
- if (flags & PERF_EF_START)
- amd_uncore_start(event, PERF_EF_RELOAD);
-
- return 0;
-}
-
-static void amd_uncore_del(struct perf_event *event, int flags)
-{
- int i;
- struct amd_uncore *uncore = event_to_amd_uncore(event);
- struct hw_perf_event *hwc = &event->hw;
-
- amd_uncore_stop(event, PERF_EF_UPDATE);
-
- for (i = 0; i < uncore->num_counters; i++) {
- if (cmpxchg(&uncore->events[i], event, NULL) == event)
- break;
- }
-
- hwc->idx = -1;
-}
-
-static int amd_uncore_event_init(struct perf_event *event)
-{
- struct amd_uncore *uncore;
- struct hw_perf_event *hwc = &event->hw;
-
- if (event->attr.type != event->pmu->type)
- return -ENOENT;
-
- /*
- * NB and L2 counters (MSRs) are shared across all cores that share the
- * same NB / L2 cache. Interrupts can be directed to a single target
- * core, however, event counts generated by processes running on other
- * cores cannot be masked out. So we do not support sampling and
- * per-thread events.
- */
- if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
- return -EINVAL;
-
- /* NB and L2 counters do not have usr/os/guest/host bits */
- if (event->attr.exclude_user || event->attr.exclude_kernel ||
- event->attr.exclude_host || event->attr.exclude_guest)
- return -EINVAL;
-
- /* and we do not enable counter overflow interrupts */
- hwc->config = event->attr.config & AMD64_RAW_EVENT_MASK_NB;
- hwc->idx = -1;
-
- if (event->cpu < 0)
- return -EINVAL;
-
- uncore = event_to_amd_uncore(event);
- if (!uncore)
- return -ENODEV;
-
- /*
- * since request can come in to any of the shared cores, we will remap
- * to a single common cpu.
- */
- event->cpu = uncore->cpu;
-
- return 0;
-}
-
-static ssize_t amd_uncore_attr_show_cpumask(struct device *dev,
- struct device_attribute *attr,
- char *buf)
-{
- int n;
- cpumask_t *active_mask;
- struct pmu *pmu = dev_get_drvdata(dev);
-
- if (pmu->type == amd_nb_pmu.type)
- active_mask = &amd_nb_active_mask;
- else if (pmu->type == amd_l2_pmu.type)
- active_mask = &amd_l2_active_mask;
- else
- return 0;
-
- n = cpulist_scnprintf(buf, PAGE_SIZE - 2, active_mask);
- buf[n++] = '\n';
- buf[n] = '\0';
- return n;
-}
-static DEVICE_ATTR(cpumask, S_IRUGO, amd_uncore_attr_show_cpumask, NULL);
-
-static struct attribute *amd_uncore_attrs[] = {
- &dev_attr_cpumask.attr,
- NULL,
-};
-
-static struct attribute_group amd_uncore_attr_group = {
- .attrs = amd_uncore_attrs,
-};
-
-PMU_FORMAT_ATTR(event, "config:0-7,32-35");
-PMU_FORMAT_ATTR(umask, "config:8-15");
-
-static struct attribute *amd_uncore_format_attr[] = {
- &format_attr_event.attr,
- &format_attr_umask.attr,
- NULL,
-};
-
-static struct attribute_group amd_uncore_format_group = {
- .name = "format",
- .attrs = amd_uncore_format_attr,
-};
-
-static const struct attribute_group *amd_uncore_attr_groups[] = {
- &amd_uncore_attr_group,
- &amd_uncore_format_group,
- NULL,
-};
-
-static struct pmu amd_nb_pmu = {
- .attr_groups = amd_uncore_attr_groups,
- .name = "amd_nb",
- .event_init = amd_uncore_event_init,
- .add = amd_uncore_add,
- .del = amd_uncore_del,
- .start = amd_uncore_start,
- .stop = amd_uncore_stop,
- .read = amd_uncore_read,
-};
-
-static struct pmu amd_l2_pmu = {
- .attr_groups = amd_uncore_attr_groups,
- .name = "amd_l2",
- .event_init = amd_uncore_event_init,
- .add = amd_uncore_add,
- .del = amd_uncore_del,
- .start = amd_uncore_start,
- .stop = amd_uncore_stop,
- .read = amd_uncore_read,
-};
-
-static struct amd_uncore * __cpuinit amd_uncore_alloc(unsigned int cpu)
-{
- return kzalloc_node(sizeof(struct amd_uncore), GFP_KERNEL,
- cpu_to_node(cpu));
-}
-
-static void __cpuinit amd_uncore_cpu_up_prepare(unsigned int cpu)
-{
- struct amd_uncore *uncore;
-
- if (amd_uncore_nb) {
- uncore = amd_uncore_alloc(cpu);
- uncore->cpu = cpu;
- uncore->num_counters = NUM_COUNTERS_NB;
- uncore->rdpmc_base = RDPMC_BASE_NB;
- uncore->msr_base = MSR_F15H_NB_PERF_CTL;
- uncore->active_mask = &amd_nb_active_mask;
- uncore->pmu = &amd_nb_pmu;
- *per_cpu_ptr(amd_uncore_nb, cpu) = uncore;
- }
-
- if (amd_uncore_l2) {
- uncore = amd_uncore_alloc(cpu);
- uncore->cpu = cpu;
- uncore->num_counters = NUM_COUNTERS_L2;
- uncore->rdpmc_base = RDPMC_BASE_L2;
- uncore->msr_base = MSR_F16H_L2I_PERF_CTL;
- uncore->active_mask = &amd_l2_active_mask;
- uncore->pmu = &amd_l2_pmu;
- *per_cpu_ptr(amd_uncore_l2, cpu) = uncore;
- }
-}
-
-static struct amd_uncore *
-__cpuinit amd_uncore_find_online_sibling(struct amd_uncore *this,
- struct amd_uncore * __percpu *uncores)
-{
- unsigned int cpu;
- struct amd_uncore *that;
-
- for_each_online_cpu(cpu) {
- that = *per_cpu_ptr(uncores, cpu);
-
- if (!that)
- continue;
-
- if (this == that)
- continue;
-
- if (this->id == that->id) {
- that->free_when_cpu_online = this;
- this = that;
- break;
- }
- }
-
- this->refcnt++;
- return this;
-}
-
-static void __cpuinit amd_uncore_cpu_starting(unsigned int cpu)
-{
- unsigned int eax, ebx, ecx, edx;
- struct amd_uncore *uncore;
-
- if (amd_uncore_nb) {
- uncore = *per_cpu_ptr(amd_uncore_nb, cpu);
- cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
- uncore->id = ecx & 0xff;
-
- uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_nb);
- *per_cpu_ptr(amd_uncore_nb, cpu) = uncore;
- }
-
- if (amd_uncore_l2) {
- unsigned int apicid = cpu_data(cpu).apicid;
- unsigned int nshared;
-
- uncore = *per_cpu_ptr(amd_uncore_l2, cpu);
- cpuid_count(0x8000001d, 2, &eax, &ebx, &ecx, &edx);
- nshared = ((eax >> 14) & 0xfff) + 1;
- uncore->id = apicid - (apicid % nshared);
-
- uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_l2);
- *per_cpu_ptr(amd_uncore_l2, cpu) = uncore;
- }
-}
-
-static void __cpuinit uncore_online(unsigned int cpu,
- struct amd_uncore * __percpu *uncores)
-{
- struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu);
-
- kfree(uncore->free_when_cpu_online);
- uncore->free_when_cpu_online = NULL;
-
- if (cpu == uncore->cpu)
- cpumask_set_cpu(cpu, uncore->active_mask);
-}
-
-static void __cpuinit amd_uncore_cpu_online(unsigned int cpu)
-{
- if (amd_uncore_nb)
- uncore_online(cpu, amd_uncore_nb);
-
- if (amd_uncore_l2)
- uncore_online(cpu, amd_uncore_l2);
-}
-
-static void __cpuinit uncore_down_prepare(unsigned int cpu,
- struct amd_uncore * __percpu *uncores)
-{
- unsigned int i;
- struct amd_uncore *this = *per_cpu_ptr(uncores, cpu);
-
- if (this->cpu != cpu)
- return;
-
- /* this cpu is going down, migrate to a shared sibling if possible */
- for_each_online_cpu(i) {
- struct amd_uncore *that = *per_cpu_ptr(uncores, i);
-
- if (cpu == i)
- continue;
-
- if (this == that) {
- perf_pmu_migrate_context(this->pmu, cpu, i);
- cpumask_clear_cpu(cpu, that->active_mask);
- cpumask_set_cpu(i, that->active_mask);
- that->cpu = i;
- break;
- }
- }
-}
-
-static void __cpuinit amd_uncore_cpu_down_prepare(unsigned int cpu)
-{
- if (amd_uncore_nb)
- uncore_down_prepare(cpu, amd_uncore_nb);
-
- if (amd_uncore_l2)
- uncore_down_prepare(cpu, amd_uncore_l2);
-}
-
-static void __cpuinit uncore_dead(unsigned int cpu,
- struct amd_uncore * __percpu *uncores)
-{
- struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu);
-
- if (cpu == uncore->cpu)
- cpumask_clear_cpu(cpu, uncore->active_mask);
-
- if (!--uncore->refcnt)
- kfree(uncore);
- *per_cpu_ptr(amd_uncore_nb, cpu) = NULL;
-}
-
-static void __cpuinit amd_uncore_cpu_dead(unsigned int cpu)
-{
- if (amd_uncore_nb)
- uncore_dead(cpu, amd_uncore_nb);
-
- if (amd_uncore_l2)
- uncore_dead(cpu, amd_uncore_l2);
-}
-
-static int __cpuinit
-amd_uncore_cpu_notifier(struct notifier_block *self, unsigned long action,
- void *hcpu)
-{
- unsigned int cpu = (long)hcpu;
-
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_UP_PREPARE:
- amd_uncore_cpu_up_prepare(cpu);
- break;
-
- case CPU_STARTING:
- amd_uncore_cpu_starting(cpu);
- break;
-
- case CPU_ONLINE:
- amd_uncore_cpu_online(cpu);
- break;
-
- case CPU_DOWN_PREPARE:
- amd_uncore_cpu_down_prepare(cpu);
- break;
-
- case CPU_UP_CANCELED:
- case CPU_DEAD:
- amd_uncore_cpu_dead(cpu);
- break;
-
- default:
- break;
- }
-
- return NOTIFY_OK;
-}
-
-static struct notifier_block amd_uncore_cpu_notifier_block __cpuinitdata = {
- .notifier_call = amd_uncore_cpu_notifier,
- .priority = CPU_PRI_PERF + 1,
-};
-
-static void __init init_cpu_already_online(void *dummy)
-{
- unsigned int cpu = smp_processor_id();
-
- amd_uncore_cpu_starting(cpu);
- amd_uncore_cpu_online(cpu);
-}
-
-static int __init amd_uncore_init(void)
-{
- unsigned int cpu;
- int ret = -ENODEV;
-
- if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
- return -ENODEV;
-
- if (!cpu_has_topoext)
- return -ENODEV;
-
- if (cpu_has_perfctr_nb) {
- amd_uncore_nb = alloc_percpu(struct amd_uncore *);
- perf_pmu_register(&amd_nb_pmu, amd_nb_pmu.name, -1);
-
- printk(KERN_INFO "perf: AMD NB counters detected\n");
- ret = 0;
- }
-
- if (cpu_has_perfctr_l2) {
- amd_uncore_l2 = alloc_percpu(struct amd_uncore *);
- perf_pmu_register(&amd_l2_pmu, amd_l2_pmu.name, -1);
-
- printk(KERN_INFO "perf: AMD L2I counters detected\n");
- ret = 0;
- }
-
- if (ret)
- return -ENODEV;
-
- get_online_cpus();
- /* init cpus already online before registering for hotplug notifier */
- for_each_online_cpu(cpu) {
- amd_uncore_cpu_up_prepare(cpu);
- smp_call_function_single(cpu, init_cpu_already_online, NULL, 1);
- }
-
- register_cpu_notifier(&amd_uncore_cpu_notifier_block);
- put_online_cpus();
-
- return 0;
-}
-device_initcall(amd_uncore_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
deleted file mode 100644
index fbc9210b45bc..000000000000
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ /dev/null
@@ -1,2349 +0,0 @@
-/*
- * Per core/cpu state
- *
- * Used to coordinate shared registers between HT threads or
- * among events on a single PMU.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/stddef.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/export.h>
-
-#include <asm/cpufeature.h>
-#include <asm/hardirq.h>
-#include <asm/apic.h>
-
-#include "perf_event.h"
-
-/*
- * Intel PerfMon, used on Core and later.
- */
-static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly =
-{
- [PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
- [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
- [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e,
- [PERF_COUNT_HW_CACHE_MISSES] = 0x412e,
- [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
- [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
- [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
- [PERF_COUNT_HW_REF_CPU_CYCLES] = 0x0300, /* pseudo-encoding */
-};
-
-static struct event_constraint intel_core_event_constraints[] __read_mostly =
-{
- INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
- INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
- INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
- INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
- INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
- INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FP_COMP_INSTR_RET */
- EVENT_CONSTRAINT_END
-};
-
-static struct event_constraint intel_core2_event_constraints[] __read_mostly =
-{
- FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
- FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
- FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
- INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
- INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
- INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
- INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
- INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
- INTEL_EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */
- INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
- INTEL_EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */
- INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* ITLB_MISS_RETIRED (T30-9) */
- INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */
- EVENT_CONSTRAINT_END
-};
-
-static struct event_constraint intel_nehalem_event_constraints[] __read_mostly =
-{
- FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
- FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
- FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
- INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
- INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
- INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
- INTEL_EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */
- INTEL_EVENT_CONSTRAINT(0x48, 0x3), /* L1D_PEND_MISS */
- INTEL_EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */
- INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
- INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
- EVENT_CONSTRAINT_END
-};
-
-static struct extra_reg intel_nehalem_extra_regs[] __read_mostly =
-{
- INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
- INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b),
- EVENT_EXTRA_END
-};
-
-static struct event_constraint intel_westmere_event_constraints[] __read_mostly =
-{
- FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
- FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
- FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
- INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
- INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */
- INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
- INTEL_EVENT_CONSTRAINT(0xb3, 0x1), /* SNOOPQ_REQUEST_OUTSTANDING */
- EVENT_CONSTRAINT_END
-};
-
-static struct event_constraint intel_snb_event_constraints[] __read_mostly =
-{
- FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
- FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
- FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
- INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */
- INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */
- INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
- INTEL_UEVENT_CONSTRAINT(0x06a3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
- INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */
- INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
- INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
- INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */
- INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
- EVENT_CONSTRAINT_END
-};
-
-static struct event_constraint intel_ivb_event_constraints[] __read_mostly =
-{
- FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
- FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
- FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
- INTEL_UEVENT_CONSTRAINT(0x0148, 0x4), /* L1D_PEND_MISS.PENDING */
- INTEL_UEVENT_CONSTRAINT(0x0279, 0xf), /* IDQ.EMTPY */
- INTEL_UEVENT_CONSTRAINT(0x019c, 0xf), /* IDQ_UOPS_NOT_DELIVERED.CORE */
- INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */
- INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */
- INTEL_UEVENT_CONSTRAINT(0x06a3, 0xf), /* CYCLE_ACTIVITY.STALLS_LDM_PENDING */
- INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
- INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
- INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
- /*
- * Errata BV98 -- MEM_*_RETIRED events can leak between counters of SMT
- * siblings; disable these events because they can corrupt unrelated
- * counters.
- */
- INTEL_EVENT_CONSTRAINT(0xd0, 0x0), /* MEM_UOPS_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xd1, 0x0), /* MEM_LOAD_UOPS_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xd2, 0x0), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xd3, 0x0), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
- EVENT_CONSTRAINT_END
-};
-
-static struct extra_reg intel_westmere_extra_regs[] __read_mostly =
-{
- INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
- INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1),
- INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b),
- EVENT_EXTRA_END
-};
-
-static struct event_constraint intel_v1_event_constraints[] __read_mostly =
-{
- EVENT_CONSTRAINT_END
-};
-
-static struct event_constraint intel_gen_event_constraints[] __read_mostly =
-{
- FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
- FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
- FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
- EVENT_CONSTRAINT_END
-};
-
-static struct extra_reg intel_snb_extra_regs[] __read_mostly = {
- INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0),
- INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0x3f807f8fffull, RSP_1),
- INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
- EVENT_EXTRA_END
-};
-
-static struct extra_reg intel_snbep_extra_regs[] __read_mostly = {
- INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0),
- INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1),
- INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
- EVENT_EXTRA_END
-};
-
-EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3");
-EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3");
-EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2");
-
-struct attribute *nhm_events_attrs[] = {
- EVENT_PTR(mem_ld_nhm),
- NULL,
-};
-
-struct attribute *snb_events_attrs[] = {
- EVENT_PTR(mem_ld_snb),
- EVENT_PTR(mem_st_snb),
- NULL,
-};
-
-static struct event_constraint intel_hsw_event_constraints[] = {
- FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
- FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
- FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
- INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.* */
- INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
- INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
- /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
- INTEL_EVENT_CONSTRAINT(0x08a3, 0x4),
- /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
- INTEL_EVENT_CONSTRAINT(0x0ca3, 0x4),
- /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */
- INTEL_EVENT_CONSTRAINT(0x04a3, 0xf),
- EVENT_CONSTRAINT_END
-};
-
-static u64 intel_pmu_event_map(int hw_event)
-{
- return intel_perfmon_event_map[hw_event];
-}
-
-#define SNB_DMND_DATA_RD (1ULL << 0)
-#define SNB_DMND_RFO (1ULL << 1)
-#define SNB_DMND_IFETCH (1ULL << 2)
-#define SNB_DMND_WB (1ULL << 3)
-#define SNB_PF_DATA_RD (1ULL << 4)
-#define SNB_PF_RFO (1ULL << 5)
-#define SNB_PF_IFETCH (1ULL << 6)
-#define SNB_LLC_DATA_RD (1ULL << 7)
-#define SNB_LLC_RFO (1ULL << 8)
-#define SNB_LLC_IFETCH (1ULL << 9)
-#define SNB_BUS_LOCKS (1ULL << 10)
-#define SNB_STRM_ST (1ULL << 11)
-#define SNB_OTHER (1ULL << 15)
-#define SNB_RESP_ANY (1ULL << 16)
-#define SNB_NO_SUPP (1ULL << 17)
-#define SNB_LLC_HITM (1ULL << 18)
-#define SNB_LLC_HITE (1ULL << 19)
-#define SNB_LLC_HITS (1ULL << 20)
-#define SNB_LLC_HITF (1ULL << 21)
-#define SNB_LOCAL (1ULL << 22)
-#define SNB_REMOTE (0xffULL << 23)
-#define SNB_SNP_NONE (1ULL << 31)
-#define SNB_SNP_NOT_NEEDED (1ULL << 32)
-#define SNB_SNP_MISS (1ULL << 33)
-#define SNB_NO_FWD (1ULL << 34)
-#define SNB_SNP_FWD (1ULL << 35)
-#define SNB_HITM (1ULL << 36)
-#define SNB_NON_DRAM (1ULL << 37)
-
-#define SNB_DMND_READ (SNB_DMND_DATA_RD|SNB_LLC_DATA_RD)
-#define SNB_DMND_WRITE (SNB_DMND_RFO|SNB_LLC_RFO)
-#define SNB_DMND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO)
-
-#define SNB_SNP_ANY (SNB_SNP_NONE|SNB_SNP_NOT_NEEDED| \
- SNB_SNP_MISS|SNB_NO_FWD|SNB_SNP_FWD| \
- SNB_HITM)
-
-#define SNB_DRAM_ANY (SNB_LOCAL|SNB_REMOTE|SNB_SNP_ANY)
-#define SNB_DRAM_REMOTE (SNB_REMOTE|SNB_SNP_ANY)
-
-#define SNB_L3_ACCESS SNB_RESP_ANY
-#define SNB_L3_MISS (SNB_DRAM_ANY|SNB_NON_DRAM)
-
-static __initconst const u64 snb_hw_cache_extra_regs
- [PERF_COUNT_HW_CACHE_MAX]
- [PERF_COUNT_HW_CACHE_OP_MAX]
- [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(LL ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_L3_ACCESS,
- [ C(RESULT_MISS) ] = SNB_DMND_READ|SNB_L3_MISS,
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_L3_ACCESS,
- [ C(RESULT_MISS) ] = SNB_DMND_WRITE|SNB_L3_MISS,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_L3_ACCESS,
- [ C(RESULT_MISS) ] = SNB_DMND_PREFETCH|SNB_L3_MISS,
- },
- },
- [ C(NODE) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_DRAM_ANY,
- [ C(RESULT_MISS) ] = SNB_DMND_READ|SNB_DRAM_REMOTE,
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_DRAM_ANY,
- [ C(RESULT_MISS) ] = SNB_DMND_WRITE|SNB_DRAM_REMOTE,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_DRAM_ANY,
- [ C(RESULT_MISS) ] = SNB_DMND_PREFETCH|SNB_DRAM_REMOTE,
- },
- },
-};
-
-static __initconst const u64 snb_hw_cache_event_ids
- [PERF_COUNT_HW_CACHE_MAX]
- [PERF_COUNT_HW_CACHE_OP_MAX]
- [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0xf1d0, /* MEM_UOP_RETIRED.LOADS */
- [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPLACEMENT */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0xf2d0, /* MEM_UOP_RETIRED.STORES */
- [ C(RESULT_MISS) ] = 0x0851, /* L1D.ALL_M_REPLACEMENT */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x0,
- [ C(RESULT_MISS) ] = 0x024e, /* HW_PRE_REQ.DL1_MISS */
- },
- },
- [ C(L1I ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0,
- [ C(RESULT_MISS) ] = 0x0280, /* ICACHE.MISSES */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x0,
- [ C(RESULT_MISS) ] = 0x0,
- },
- },
- [ C(LL ) ] = {
- [ C(OP_READ) ] = {
- /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
- [ C(RESULT_ACCESS) ] = 0x01b7,
- /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
- [ C(RESULT_MISS) ] = 0x01b7,
- },
- [ C(OP_WRITE) ] = {
- /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
- [ C(RESULT_ACCESS) ] = 0x01b7,
- /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
- [ C(RESULT_MISS) ] = 0x01b7,
- },
- [ C(OP_PREFETCH) ] = {
- /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
- [ C(RESULT_ACCESS) ] = 0x01b7,
- /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
- [ C(RESULT_MISS) ] = 0x01b7,
- },
- },
- [ C(DTLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOP_RETIRED.ALL_LOADS */
- [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.CAUSES_A_WALK */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOP_RETIRED.ALL_STORES */
- [ C(RESULT_MISS) ] = 0x0149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x0,
- [ C(RESULT_MISS) ] = 0x0,
- },
- },
- [ C(ITLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x1085, /* ITLB_MISSES.STLB_HIT */
- [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.CAUSES_A_WALK */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
- [ C(BPU ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
- [ C(RESULT_MISS) ] = 0x00c5, /* BR_MISP_RETIRED.ALL_BRANCHES */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
- [ C(NODE) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x01b7,
- [ C(RESULT_MISS) ] = 0x01b7,
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x01b7,
- [ C(RESULT_MISS) ] = 0x01b7,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x01b7,
- [ C(RESULT_MISS) ] = 0x01b7,
- },
- },
-
-};
-
-static __initconst const u64 westmere_hw_cache_event_ids
- [PERF_COUNT_HW_CACHE_MAX]
- [PERF_COUNT_HW_CACHE_OP_MAX]
- [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
- [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */
- [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
- [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
- },
- },
- [ C(L1I ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
- [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x0,
- [ C(RESULT_MISS) ] = 0x0,
- },
- },
- [ C(LL ) ] = {
- [ C(OP_READ) ] = {
- /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
- [ C(RESULT_ACCESS) ] = 0x01b7,
- /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
- [ C(RESULT_MISS) ] = 0x01b7,
- },
- /*
- * Use RFO, not WRITEBACK, because a write miss would typically occur
- * on RFO.
- */
- [ C(OP_WRITE) ] = {
- /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
- [ C(RESULT_ACCESS) ] = 0x01b7,
- /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
- [ C(RESULT_MISS) ] = 0x01b7,
- },
- [ C(OP_PREFETCH) ] = {
- /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
- [ C(RESULT_ACCESS) ] = 0x01b7,
- /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
- [ C(RESULT_MISS) ] = 0x01b7,
- },
- },
- [ C(DTLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
- [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */
- [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x0,
- [ C(RESULT_MISS) ] = 0x0,
- },
- },
- [ C(ITLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
- [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.ANY */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
- [ C(BPU ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
- [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
- [ C(NODE) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x01b7,
- [ C(RESULT_MISS) ] = 0x01b7,
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x01b7,
- [ C(RESULT_MISS) ] = 0x01b7,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x01b7,
- [ C(RESULT_MISS) ] = 0x01b7,
- },
- },
-};
-
-/*
- * Nehalem/Westmere MSR_OFFCORE_RESPONSE bits;
- * See IA32 SDM Vol 3B 30.6.1.3
- */
-
-#define NHM_DMND_DATA_RD (1 << 0)
-#define NHM_DMND_RFO (1 << 1)
-#define NHM_DMND_IFETCH (1 << 2)
-#define NHM_DMND_WB (1 << 3)
-#define NHM_PF_DATA_RD (1 << 4)
-#define NHM_PF_DATA_RFO (1 << 5)
-#define NHM_PF_IFETCH (1 << 6)
-#define NHM_OFFCORE_OTHER (1 << 7)
-#define NHM_UNCORE_HIT (1 << 8)
-#define NHM_OTHER_CORE_HIT_SNP (1 << 9)
-#define NHM_OTHER_CORE_HITM (1 << 10)
- /* reserved */
-#define NHM_REMOTE_CACHE_FWD (1 << 12)
-#define NHM_REMOTE_DRAM (1 << 13)
-#define NHM_LOCAL_DRAM (1 << 14)
-#define NHM_NON_DRAM (1 << 15)
-
-#define NHM_LOCAL (NHM_LOCAL_DRAM|NHM_REMOTE_CACHE_FWD)
-#define NHM_REMOTE (NHM_REMOTE_DRAM)
-
-#define NHM_DMND_READ (NHM_DMND_DATA_RD)
-#define NHM_DMND_WRITE (NHM_DMND_RFO|NHM_DMND_WB)
-#define NHM_DMND_PREFETCH (NHM_PF_DATA_RD|NHM_PF_DATA_RFO)
-
-#define NHM_L3_HIT (NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM)
-#define NHM_L3_MISS (NHM_NON_DRAM|NHM_LOCAL_DRAM|NHM_REMOTE_DRAM|NHM_REMOTE_CACHE_FWD)
-#define NHM_L3_ACCESS (NHM_L3_HIT|NHM_L3_MISS)
-
-static __initconst const u64 nehalem_hw_cache_extra_regs
- [PERF_COUNT_HW_CACHE_MAX]
- [PERF_COUNT_HW_CACHE_OP_MAX]
- [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(LL ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_L3_ACCESS,
- [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_L3_MISS,
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_L3_ACCESS,
- [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_L3_MISS,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS,
- [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_L3_MISS,
- },
- },
- [ C(NODE) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_LOCAL|NHM_REMOTE,
- [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_REMOTE,
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_LOCAL|NHM_REMOTE,
- [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_REMOTE,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_LOCAL|NHM_REMOTE,
- [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_REMOTE,
- },
- },
-};
-
-static __initconst const u64 nehalem_hw_cache_event_ids
- [PERF_COUNT_HW_CACHE_MAX]
- [PERF_COUNT_HW_CACHE_OP_MAX]
- [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
- [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */
- [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
- [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
- },
- },
- [ C(L1I ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
- [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x0,
- [ C(RESULT_MISS) ] = 0x0,
- },
- },
- [ C(LL ) ] = {
- [ C(OP_READ) ] = {
- /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
- [ C(RESULT_ACCESS) ] = 0x01b7,
- /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
- [ C(RESULT_MISS) ] = 0x01b7,
- },
- /*
- * Use RFO, not WRITEBACK, because a write miss would typically occur
- * on RFO.
- */
- [ C(OP_WRITE) ] = {
- /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
- [ C(RESULT_ACCESS) ] = 0x01b7,
- /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
- [ C(RESULT_MISS) ] = 0x01b7,
- },
- [ C(OP_PREFETCH) ] = {
- /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
- [ C(RESULT_ACCESS) ] = 0x01b7,
- /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
- [ C(RESULT_MISS) ] = 0x01b7,
- },
- },
- [ C(DTLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
- [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
- [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x0,
- [ C(RESULT_MISS) ] = 0x0,
- },
- },
- [ C(ITLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
- [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
- [ C(BPU ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
- [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
- [ C(NODE) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x01b7,
- [ C(RESULT_MISS) ] = 0x01b7,
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x01b7,
- [ C(RESULT_MISS) ] = 0x01b7,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x01b7,
- [ C(RESULT_MISS) ] = 0x01b7,
- },
- },
-};
-
-static __initconst const u64 core2_hw_cache_event_ids
- [PERF_COUNT_HW_CACHE_MAX]
- [PERF_COUNT_HW_CACHE_OP_MAX]
- [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
- [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
- [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(L1I ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */
- [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(LL ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
- [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
- [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(DTLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
- [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
- [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(ITLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
- [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
- [ C(BPU ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
- [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
-};
-
-static __initconst const u64 atom_hw_cache_event_ids
- [PERF_COUNT_HW_CACHE_MAX]
- [PERF_COUNT_HW_CACHE_OP_MAX]
- [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */
- [ C(RESULT_MISS) ] = 0,
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */
- [ C(RESULT_MISS) ] = 0,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(L1I ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
- [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(LL ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
- [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
- [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(DTLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */
- [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */
- [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(ITLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
- [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
- [ C(BPU ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
- [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
-};
-
-static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event)
-{
- /* user explicitly requested branch sampling */
- if (has_branch_stack(event))
- return true;
-
- /* implicit branch sampling to correct PEBS skid */
- if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 &&
- x86_pmu.intel_cap.pebs_format < 2)
- return true;
-
- return false;
-}
-
-static void intel_pmu_disable_all(void)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-
- wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
-
- if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
- intel_pmu_disable_bts();
-
- intel_pmu_pebs_disable_all();
- intel_pmu_lbr_disable_all();
-}
-
-static void intel_pmu_enable_all(int added)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-
- intel_pmu_pebs_enable_all();
- intel_pmu_lbr_enable_all();
- wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL,
- x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask);
-
- if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
- struct perf_event *event =
- cpuc->events[INTEL_PMC_IDX_FIXED_BTS];
-
- if (WARN_ON_ONCE(!event))
- return;
-
- intel_pmu_enable_bts(event->hw.config);
- }
-}
-
-/*
- * Workaround for:
- * Intel Errata AAK100 (model 26)
- * Intel Errata AAP53 (model 30)
- * Intel Errata BD53 (model 44)
- *
- * The official story:
- * These chips need to be 'reset' when adding counters by programming the
- * magic three (non-counting) events 0x4300B5, 0x4300D2, and 0x4300B1 either
- * in sequence on the same PMC or on different PMCs.
- *
- * In practise it appears some of these events do in fact count, and
- * we need to programm all 4 events.
- */
-static void intel_pmu_nhm_workaround(void)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- static const unsigned long nhm_magic[4] = {
- 0x4300B5,
- 0x4300D2,
- 0x4300B1,
- 0x4300B1
- };
- struct perf_event *event;
- int i;
-
- /*
- * The Errata requires below steps:
- * 1) Clear MSR_IA32_PEBS_ENABLE and MSR_CORE_PERF_GLOBAL_CTRL;
- * 2) Configure 4 PERFEVTSELx with the magic events and clear
- * the corresponding PMCx;
- * 3) set bit0~bit3 of MSR_CORE_PERF_GLOBAL_CTRL;
- * 4) Clear MSR_CORE_PERF_GLOBAL_CTRL;
- * 5) Clear 4 pairs of ERFEVTSELx and PMCx;
- */
-
- /*
- * The real steps we choose are a little different from above.
- * A) To reduce MSR operations, we don't run step 1) as they
- * are already cleared before this function is called;
- * B) Call x86_perf_event_update to save PMCx before configuring
- * PERFEVTSELx with magic number;
- * C) With step 5), we do clear only when the PERFEVTSELx is
- * not used currently.
- * D) Call x86_perf_event_set_period to restore PMCx;
- */
-
- /* We always operate 4 pairs of PERF Counters */
- for (i = 0; i < 4; i++) {
- event = cpuc->events[i];
- if (event)
- x86_perf_event_update(event);
- }
-
- for (i = 0; i < 4; i++) {
- wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, nhm_magic[i]);
- wrmsrl(MSR_ARCH_PERFMON_PERFCTR0 + i, 0x0);
- }
-
- wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0xf);
- wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0);
-
- for (i = 0; i < 4; i++) {
- event = cpuc->events[i];
-
- if (event) {
- x86_perf_event_set_period(event);
- __x86_pmu_enable_event(&event->hw,
- ARCH_PERFMON_EVENTSEL_ENABLE);
- } else
- wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0x0);
- }
-}
-
-static void intel_pmu_nhm_enable_all(int added)
-{
- if (added)
- intel_pmu_nhm_workaround();
- intel_pmu_enable_all(added);
-}
-
-static inline u64 intel_pmu_get_status(void)
-{
- u64 status;
-
- rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
-
- return status;
-}
-
-static inline void intel_pmu_ack_status(u64 ack)
-{
- wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
-}
-
-static void intel_pmu_disable_fixed(struct hw_perf_event *hwc)
-{
- int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
- u64 ctrl_val, mask;
-
- mask = 0xfULL << (idx * 4);
-
- rdmsrl(hwc->config_base, ctrl_val);
- ctrl_val &= ~mask;
- wrmsrl(hwc->config_base, ctrl_val);
-}
-
-static void intel_pmu_disable_event(struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-
- if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) {
- intel_pmu_disable_bts();
- intel_pmu_drain_bts_buffer();
- return;
- }
-
- cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx);
- cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx);
-
- /*
- * must disable before any actual event
- * because any event may be combined with LBR
- */
- if (intel_pmu_needs_lbr_smpl(event))
- intel_pmu_lbr_disable(event);
-
- if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
- intel_pmu_disable_fixed(hwc);
- return;
- }
-
- x86_pmu_disable_event(event);
-
- if (unlikely(event->attr.precise_ip))
- intel_pmu_pebs_disable(event);
-}
-
-static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
-{
- int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
- u64 ctrl_val, bits, mask;
-
- /*
- * Enable IRQ generation (0x8),
- * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
- * if requested:
- */
- bits = 0x8ULL;
- if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
- bits |= 0x2;
- if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
- bits |= 0x1;
-
- /*
- * ANY bit is supported in v3 and up
- */
- if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY)
- bits |= 0x4;
-
- bits <<= (idx * 4);
- mask = 0xfULL << (idx * 4);
-
- rdmsrl(hwc->config_base, ctrl_val);
- ctrl_val &= ~mask;
- ctrl_val |= bits;
- wrmsrl(hwc->config_base, ctrl_val);
-}
-
-static void intel_pmu_enable_event(struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-
- if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) {
- if (!__this_cpu_read(cpu_hw_events.enabled))
- return;
-
- intel_pmu_enable_bts(hwc->config);
- return;
- }
- /*
- * must enabled before any actual event
- * because any event may be combined with LBR
- */
- if (intel_pmu_needs_lbr_smpl(event))
- intel_pmu_lbr_enable(event);
-
- if (event->attr.exclude_host)
- cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx);
- if (event->attr.exclude_guest)
- cpuc->intel_ctrl_host_mask |= (1ull << hwc->idx);
-
- if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
- intel_pmu_enable_fixed(hwc);
- return;
- }
-
- if (unlikely(event->attr.precise_ip))
- intel_pmu_pebs_enable(event);
-
- __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
-}
-
-/*
- * Save and restart an expired event. Called by NMI contexts,
- * so it has to be careful about preempting normal event ops:
- */
-int intel_pmu_save_and_restart(struct perf_event *event)
-{
- x86_perf_event_update(event);
- return x86_perf_event_set_period(event);
-}
-
-static void intel_pmu_reset(void)
-{
- struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds);
- unsigned long flags;
- int idx;
-
- if (!x86_pmu.num_counters)
- return;
-
- local_irq_save(flags);
-
- pr_info("clearing PMU state on CPU#%d\n", smp_processor_id());
-
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
- wrmsrl_safe(x86_pmu_config_addr(idx), 0ull);
- wrmsrl_safe(x86_pmu_event_addr(idx), 0ull);
- }
- for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++)
- wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
-
- if (ds)
- ds->bts_index = ds->bts_buffer_base;
-
- local_irq_restore(flags);
-}
-
-/*
- * This handler is triggered by the local APIC, so the APIC IRQ handling
- * rules apply:
- */
-static int intel_pmu_handle_irq(struct pt_regs *regs)
-{
- struct perf_sample_data data;
- struct cpu_hw_events *cpuc;
- int bit, loops;
- u64 status;
- int handled;
-
- cpuc = &__get_cpu_var(cpu_hw_events);
-
- /*
- * No known reason to not always do late ACK,
- * but just in case do it opt-in.
- */
- if (!x86_pmu.late_ack)
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- intel_pmu_disable_all();
- handled = intel_pmu_drain_bts_buffer();
- status = intel_pmu_get_status();
- if (!status) {
- intel_pmu_enable_all(0);
- return handled;
- }
-
- loops = 0;
-again:
- intel_pmu_ack_status(status);
- if (++loops > 100) {
- static bool warned = false;
- if (!warned) {
- WARN(1, "perfevents: irq loop stuck!\n");
- perf_event_print_debug();
- warned = true;
- }
- intel_pmu_reset();
- goto done;
- }
-
- inc_irq_stat(apic_perf_irqs);
-
- intel_pmu_lbr_read();
-
- /*
- * PEBS overflow sets bit 62 in the global status register
- */
- if (__test_and_clear_bit(62, (unsigned long *)&status)) {
- handled++;
- x86_pmu.drain_pebs(regs);
- }
-
- for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
- struct perf_event *event = cpuc->events[bit];
-
- handled++;
-
- if (!test_bit(bit, cpuc->active_mask))
- continue;
-
- if (!intel_pmu_save_and_restart(event))
- continue;
-
- perf_sample_data_init(&data, 0, event->hw.last_period);
-
- if (has_branch_stack(event))
- data.br_stack = &cpuc->lbr_stack;
-
- if (perf_event_overflow(event, &data, regs))
- x86_pmu_stop(event, 0);
- }
-
- /*
- * Repeat if there is more work to be done:
- */
- status = intel_pmu_get_status();
- if (status)
- goto again;
-
-done:
- intel_pmu_enable_all(0);
- /*
- * Only unmask the NMI after the overflow counters
- * have been reset. This avoids spurious NMIs on
- * Haswell CPUs.
- */
- if (x86_pmu.late_ack)
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- return handled;
-}
-
-static struct event_constraint *
-intel_bts_constraints(struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- unsigned int hw_event, bts_event;
-
- if (event->attr.freq)
- return NULL;
-
- hw_event = hwc->config & INTEL_ARCH_EVENT_MASK;
- bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
-
- if (unlikely(hw_event == bts_event && hwc->sample_period == 1))
- return &bts_constraint;
-
- return NULL;
-}
-
-static int intel_alt_er(int idx)
-{
- if (!(x86_pmu.er_flags & ERF_HAS_RSP_1))
- return idx;
-
- if (idx == EXTRA_REG_RSP_0)
- return EXTRA_REG_RSP_1;
-
- if (idx == EXTRA_REG_RSP_1)
- return EXTRA_REG_RSP_0;
-
- return idx;
-}
-
-static void intel_fixup_er(struct perf_event *event, int idx)
-{
- event->hw.extra_reg.idx = idx;
-
- if (idx == EXTRA_REG_RSP_0) {
- event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
- event->hw.config |= 0x01b7;
- event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0;
- } else if (idx == EXTRA_REG_RSP_1) {
- event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
- event->hw.config |= 0x01bb;
- event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1;
- }
-}
-
-/*
- * manage allocation of shared extra msr for certain events
- *
- * sharing can be:
- * per-cpu: to be shared between the various events on a single PMU
- * per-core: per-cpu + shared by HT threads
- */
-static struct event_constraint *
-__intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc,
- struct perf_event *event,
- struct hw_perf_event_extra *reg)
-{
- struct event_constraint *c = &emptyconstraint;
- struct er_account *era;
- unsigned long flags;
- int idx = reg->idx;
-
- /*
- * reg->alloc can be set due to existing state, so for fake cpuc we
- * need to ignore this, otherwise we might fail to allocate proper fake
- * state for this extra reg constraint. Also see the comment below.
- */
- if (reg->alloc && !cpuc->is_fake)
- return NULL; /* call x86_get_event_constraint() */
-
-again:
- era = &cpuc->shared_regs->regs[idx];
- /*
- * we use spin_lock_irqsave() to avoid lockdep issues when
- * passing a fake cpuc
- */
- raw_spin_lock_irqsave(&era->lock, flags);
-
- if (!atomic_read(&era->ref) || era->config == reg->config) {
-
- /*
- * If its a fake cpuc -- as per validate_{group,event}() we
- * shouldn't touch event state and we can avoid doing so
- * since both will only call get_event_constraints() once
- * on each event, this avoids the need for reg->alloc.
- *
- * Not doing the ER fixup will only result in era->reg being
- * wrong, but since we won't actually try and program hardware
- * this isn't a problem either.
- */
- if (!cpuc->is_fake) {
- if (idx != reg->idx)
- intel_fixup_er(event, idx);
-
- /*
- * x86_schedule_events() can call get_event_constraints()
- * multiple times on events in the case of incremental
- * scheduling(). reg->alloc ensures we only do the ER
- * allocation once.
- */
- reg->alloc = 1;
- }
-
- /* lock in msr value */
- era->config = reg->config;
- era->reg = reg->reg;
-
- /* one more user */
- atomic_inc(&era->ref);
-
- /*
- * need to call x86_get_event_constraint()
- * to check if associated event has constraints
- */
- c = NULL;
- } else {
- idx = intel_alt_er(idx);
- if (idx != reg->idx) {
- raw_spin_unlock_irqrestore(&era->lock, flags);
- goto again;
- }
- }
- raw_spin_unlock_irqrestore(&era->lock, flags);
-
- return c;
-}
-
-static void
-__intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc,
- struct hw_perf_event_extra *reg)
-{
- struct er_account *era;
-
- /*
- * Only put constraint if extra reg was actually allocated. Also takes
- * care of event which do not use an extra shared reg.
- *
- * Also, if this is a fake cpuc we shouldn't touch any event state
- * (reg->alloc) and we don't care about leaving inconsistent cpuc state
- * either since it'll be thrown out.
- */
- if (!reg->alloc || cpuc->is_fake)
- return;
-
- era = &cpuc->shared_regs->regs[reg->idx];
-
- /* one fewer user */
- atomic_dec(&era->ref);
-
- /* allocate again next time */
- reg->alloc = 0;
-}
-
-static struct event_constraint *
-intel_shared_regs_constraints(struct cpu_hw_events *cpuc,
- struct perf_event *event)
-{
- struct event_constraint *c = NULL, *d;
- struct hw_perf_event_extra *xreg, *breg;
-
- xreg = &event->hw.extra_reg;
- if (xreg->idx != EXTRA_REG_NONE) {
- c = __intel_shared_reg_get_constraints(cpuc, event, xreg);
- if (c == &emptyconstraint)
- return c;
- }
- breg = &event->hw.branch_reg;
- if (breg->idx != EXTRA_REG_NONE) {
- d = __intel_shared_reg_get_constraints(cpuc, event, breg);
- if (d == &emptyconstraint) {
- __intel_shared_reg_put_constraints(cpuc, xreg);
- c = d;
- }
- }
- return c;
-}
-
-struct event_constraint *
-x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
-{
- struct event_constraint *c;
-
- if (x86_pmu.event_constraints) {
- for_each_event_constraint(c, x86_pmu.event_constraints) {
- if ((event->hw.config & c->cmask) == c->code) {
- event->hw.flags |= c->flags;
- return c;
- }
- }
- }
-
- return &unconstrained;
-}
-
-static struct event_constraint *
-intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
-{
- struct event_constraint *c;
-
- c = intel_bts_constraints(event);
- if (c)
- return c;
-
- c = intel_pebs_constraints(event);
- if (c)
- return c;
-
- c = intel_shared_regs_constraints(cpuc, event);
- if (c)
- return c;
-
- return x86_get_event_constraints(cpuc, event);
-}
-
-static void
-intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
- struct perf_event *event)
-{
- struct hw_perf_event_extra *reg;
-
- reg = &event->hw.extra_reg;
- if (reg->idx != EXTRA_REG_NONE)
- __intel_shared_reg_put_constraints(cpuc, reg);
-
- reg = &event->hw.branch_reg;
- if (reg->idx != EXTRA_REG_NONE)
- __intel_shared_reg_put_constraints(cpuc, reg);
-}
-
-static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
- struct perf_event *event)
-{
- intel_put_shared_regs_event_constraints(cpuc, event);
-}
-
-static void intel_pebs_aliases_core2(struct perf_event *event)
-{
- if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
- /*
- * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
- * (0x003c) so that we can use it with PEBS.
- *
- * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
- * PEBS capable. However we can use INST_RETIRED.ANY_P
- * (0x00c0), which is a PEBS capable event, to get the same
- * count.
- *
- * INST_RETIRED.ANY_P counts the number of cycles that retires
- * CNTMASK instructions. By setting CNTMASK to a value (16)
- * larger than the maximum number of instructions that can be
- * retired per cycle (4) and then inverting the condition, we
- * count all cycles that retire 16 or less instructions, which
- * is every cycle.
- *
- * Thereby we gain a PEBS capable cycle counter.
- */
- u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16);
-
- alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
- event->hw.config = alt_config;
- }
-}
-
-static void intel_pebs_aliases_snb(struct perf_event *event)
-{
- if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
- /*
- * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
- * (0x003c) so that we can use it with PEBS.
- *
- * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
- * PEBS capable. However we can use UOPS_RETIRED.ALL
- * (0x01c2), which is a PEBS capable event, to get the same
- * count.
- *
- * UOPS_RETIRED.ALL counts the number of cycles that retires
- * CNTMASK micro-ops. By setting CNTMASK to a value (16)
- * larger than the maximum number of micro-ops that can be
- * retired per cycle (4) and then inverting the condition, we
- * count all cycles that retire 16 or less micro-ops, which
- * is every cycle.
- *
- * Thereby we gain a PEBS capable cycle counter.
- */
- u64 alt_config = X86_CONFIG(.event=0xc2, .umask=0x01, .inv=1, .cmask=16);
-
- alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
- event->hw.config = alt_config;
- }
-}
-
-static int intel_pmu_hw_config(struct perf_event *event)
-{
- int ret = x86_pmu_hw_config(event);
-
- if (ret)
- return ret;
-
- if (event->attr.precise_ip && x86_pmu.pebs_aliases)
- x86_pmu.pebs_aliases(event);
-
- if (intel_pmu_needs_lbr_smpl(event)) {
- ret = intel_pmu_setup_lbr_filter(event);
- if (ret)
- return ret;
- }
-
- if (event->attr.type != PERF_TYPE_RAW)
- return 0;
-
- if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY))
- return 0;
-
- if (x86_pmu.version < 3)
- return -EINVAL;
-
- if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
- return -EACCES;
-
- event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY;
-
- return 0;
-}
-
-struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
-{
- if (x86_pmu.guest_get_msrs)
- return x86_pmu.guest_get_msrs(nr);
- *nr = 0;
- return NULL;
-}
-EXPORT_SYMBOL_GPL(perf_guest_get_msrs);
-
-static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs;
-
- arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL;
- arr[0].host = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask;
- arr[0].guest = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_host_mask;
- /*
- * If PMU counter has PEBS enabled it is not enough to disable counter
- * on a guest entry since PEBS memory write can overshoot guest entry
- * and corrupt guest memory. Disabling PEBS solves the problem.
- */
- arr[1].msr = MSR_IA32_PEBS_ENABLE;
- arr[1].host = cpuc->pebs_enabled;
- arr[1].guest = 0;
-
- *nr = 2;
- return arr;
-}
-
-static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs;
- int idx;
-
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
- struct perf_event *event = cpuc->events[idx];
-
- arr[idx].msr = x86_pmu_config_addr(idx);
- arr[idx].host = arr[idx].guest = 0;
-
- if (!test_bit(idx, cpuc->active_mask))
- continue;
-
- arr[idx].host = arr[idx].guest =
- event->hw.config | ARCH_PERFMON_EVENTSEL_ENABLE;
-
- if (event->attr.exclude_host)
- arr[idx].host &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
- else if (event->attr.exclude_guest)
- arr[idx].guest &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
- }
-
- *nr = x86_pmu.num_counters;
- return arr;
-}
-
-static void core_pmu_enable_event(struct perf_event *event)
-{
- if (!event->attr.exclude_host)
- x86_pmu_enable_event(event);
-}
-
-static void core_pmu_enable_all(int added)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- int idx;
-
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
- struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
-
- if (!test_bit(idx, cpuc->active_mask) ||
- cpuc->events[idx]->attr.exclude_host)
- continue;
-
- __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
- }
-}
-
-static int hsw_hw_config(struct perf_event *event)
-{
- int ret = intel_pmu_hw_config(event);
-
- if (ret)
- return ret;
- if (!boot_cpu_has(X86_FEATURE_RTM) && !boot_cpu_has(X86_FEATURE_HLE))
- return 0;
- event->hw.config |= event->attr.config & (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED);
-
- /*
- * IN_TX/IN_TX-CP filters are not supported by the Haswell PMU with
- * PEBS or in ANY thread mode. Since the results are non-sensical forbid
- * this combination.
- */
- if ((event->hw.config & (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED)) &&
- ((event->hw.config & ARCH_PERFMON_EVENTSEL_ANY) ||
- event->attr.precise_ip > 0))
- return -EOPNOTSUPP;
-
- return 0;
-}
-
-static struct event_constraint counter2_constraint =
- EVENT_CONSTRAINT(0, 0x4, 0);
-
-static struct event_constraint *
-hsw_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
-{
- struct event_constraint *c = intel_get_event_constraints(cpuc, event);
-
- /* Handle special quirk on in_tx_checkpointed only in counter 2 */
- if (event->hw.config & HSW_IN_TX_CHECKPOINTED) {
- if (c->idxmsk64 & (1U << 2))
- return &counter2_constraint;
- return &emptyconstraint;
- }
-
- return c;
-}
-
-PMU_FORMAT_ATTR(event, "config:0-7" );
-PMU_FORMAT_ATTR(umask, "config:8-15" );
-PMU_FORMAT_ATTR(edge, "config:18" );
-PMU_FORMAT_ATTR(pc, "config:19" );
-PMU_FORMAT_ATTR(any, "config:21" ); /* v3 + */
-PMU_FORMAT_ATTR(inv, "config:23" );
-PMU_FORMAT_ATTR(cmask, "config:24-31" );
-PMU_FORMAT_ATTR(in_tx, "config:32");
-PMU_FORMAT_ATTR(in_tx_cp, "config:33");
-
-static struct attribute *intel_arch_formats_attr[] = {
- &format_attr_event.attr,
- &format_attr_umask.attr,
- &format_attr_edge.attr,
- &format_attr_pc.attr,
- &format_attr_inv.attr,
- &format_attr_cmask.attr,
- NULL,
-};
-
-ssize_t intel_event_sysfs_show(char *page, u64 config)
-{
- u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT);
-
- return x86_event_sysfs_show(page, config, event);
-}
-
-static __initconst const struct x86_pmu core_pmu = {
- .name = "core",
- .handle_irq = x86_pmu_handle_irq,
- .disable_all = x86_pmu_disable_all,
- .enable_all = core_pmu_enable_all,
- .enable = core_pmu_enable_event,
- .disable = x86_pmu_disable_event,
- .hw_config = x86_pmu_hw_config,
- .schedule_events = x86_schedule_events,
- .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
- .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
- .event_map = intel_pmu_event_map,
- .max_events = ARRAY_SIZE(intel_perfmon_event_map),
- .apic = 1,
- /*
- * Intel PMCs cannot be accessed sanely above 32 bit width,
- * so we install an artificial 1<<31 period regardless of
- * the generic event period:
- */
- .max_period = (1ULL << 31) - 1,
- .get_event_constraints = intel_get_event_constraints,
- .put_event_constraints = intel_put_event_constraints,
- .event_constraints = intel_core_event_constraints,
- .guest_get_msrs = core_guest_get_msrs,
- .format_attrs = intel_arch_formats_attr,
- .events_sysfs_show = intel_event_sysfs_show,
-};
-
-struct intel_shared_regs *allocate_shared_regs(int cpu)
-{
- struct intel_shared_regs *regs;
- int i;
-
- regs = kzalloc_node(sizeof(struct intel_shared_regs),
- GFP_KERNEL, cpu_to_node(cpu));
- if (regs) {
- /*
- * initialize the locks to keep lockdep happy
- */
- for (i = 0; i < EXTRA_REG_MAX; i++)
- raw_spin_lock_init(&regs->regs[i].lock);
-
- regs->core_id = -1;
- }
- return regs;
-}
-
-static int intel_pmu_cpu_prepare(int cpu)
-{
- struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
-
- if (!(x86_pmu.extra_regs || x86_pmu.lbr_sel_map))
- return NOTIFY_OK;
-
- cpuc->shared_regs = allocate_shared_regs(cpu);
- if (!cpuc->shared_regs)
- return NOTIFY_BAD;
-
- return NOTIFY_OK;
-}
-
-static void intel_pmu_cpu_starting(int cpu)
-{
- struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
- int core_id = topology_core_id(cpu);
- int i;
-
- init_debug_store_on_cpu(cpu);
- /*
- * Deal with CPUs that don't clear their LBRs on power-up.
- */
- intel_pmu_lbr_reset();
-
- cpuc->lbr_sel = NULL;
-
- if (!cpuc->shared_regs)
- return;
-
- if (!(x86_pmu.er_flags & ERF_NO_HT_SHARING)) {
- for_each_cpu(i, topology_thread_cpumask(cpu)) {
- struct intel_shared_regs *pc;
-
- pc = per_cpu(cpu_hw_events, i).shared_regs;
- if (pc && pc->core_id == core_id) {
- cpuc->kfree_on_online = cpuc->shared_regs;
- cpuc->shared_regs = pc;
- break;
- }
- }
- cpuc->shared_regs->core_id = core_id;
- cpuc->shared_regs->refcnt++;
- }
-
- if (x86_pmu.lbr_sel_map)
- cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];
-}
-
-static void intel_pmu_cpu_dying(int cpu)
-{
- struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
- struct intel_shared_regs *pc;
-
- pc = cpuc->shared_regs;
- if (pc) {
- if (pc->core_id == -1 || --pc->refcnt == 0)
- kfree(pc);
- cpuc->shared_regs = NULL;
- }
-
- fini_debug_store_on_cpu(cpu);
-}
-
-static void intel_pmu_flush_branch_stack(void)
-{
- /*
- * Intel LBR does not tag entries with the
- * PID of the current task, then we need to
- * flush it on ctxsw
- * For now, we simply reset it
- */
- if (x86_pmu.lbr_nr)
- intel_pmu_lbr_reset();
-}
-
-PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
-
-PMU_FORMAT_ATTR(ldlat, "config1:0-15");
-
-static struct attribute *intel_arch3_formats_attr[] = {
- &format_attr_event.attr,
- &format_attr_umask.attr,
- &format_attr_edge.attr,
- &format_attr_pc.attr,
- &format_attr_any.attr,
- &format_attr_inv.attr,
- &format_attr_cmask.attr,
- &format_attr_in_tx.attr,
- &format_attr_in_tx_cp.attr,
-
- &format_attr_offcore_rsp.attr, /* XXX do NHM/WSM + SNB breakout */
- &format_attr_ldlat.attr, /* PEBS load latency */
- NULL,
-};
-
-static __initconst const struct x86_pmu intel_pmu = {
- .name = "Intel",
- .handle_irq = intel_pmu_handle_irq,
- .disable_all = intel_pmu_disable_all,
- .enable_all = intel_pmu_enable_all,
- .enable = intel_pmu_enable_event,
- .disable = intel_pmu_disable_event,
- .hw_config = intel_pmu_hw_config,
- .schedule_events = x86_schedule_events,
- .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
- .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
- .event_map = intel_pmu_event_map,
- .max_events = ARRAY_SIZE(intel_perfmon_event_map),
- .apic = 1,
- /*
- * Intel PMCs cannot be accessed sanely above 32 bit width,
- * so we install an artificial 1<<31 period regardless of
- * the generic event period:
- */
- .max_period = (1ULL << 31) - 1,
- .get_event_constraints = intel_get_event_constraints,
- .put_event_constraints = intel_put_event_constraints,
- .pebs_aliases = intel_pebs_aliases_core2,
-
- .format_attrs = intel_arch3_formats_attr,
- .events_sysfs_show = intel_event_sysfs_show,
-
- .cpu_prepare = intel_pmu_cpu_prepare,
- .cpu_starting = intel_pmu_cpu_starting,
- .cpu_dying = intel_pmu_cpu_dying,
- .guest_get_msrs = intel_guest_get_msrs,
- .flush_branch_stack = intel_pmu_flush_branch_stack,
-};
-
-static __init void intel_clovertown_quirk(void)
-{
- /*
- * PEBS is unreliable due to:
- *
- * AJ67 - PEBS may experience CPL leaks
- * AJ68 - PEBS PMI may be delayed by one event
- * AJ69 - GLOBAL_STATUS[62] will only be set when DEBUGCTL[12]
- * AJ106 - FREEZE_LBRS_ON_PMI doesn't work in combination with PEBS
- *
- * AJ67 could be worked around by restricting the OS/USR flags.
- * AJ69 could be worked around by setting PMU_FREEZE_ON_PMI.
- *
- * AJ106 could possibly be worked around by not allowing LBR
- * usage from PEBS, including the fixup.
- * AJ68 could possibly be worked around by always programming
- * a pebs_event_reset[0] value and coping with the lost events.
- *
- * But taken together it might just make sense to not enable PEBS on
- * these chips.
- */
- pr_warn("PEBS disabled due to CPU errata\n");
- x86_pmu.pebs = 0;
- x86_pmu.pebs_constraints = NULL;
-}
-
-static int intel_snb_pebs_broken(int cpu)
-{
- u32 rev = UINT_MAX; /* default to broken for unknown models */
-
- switch (cpu_data(cpu).x86_model) {
- case 42: /* SNB */
- rev = 0x28;
- break;
-
- case 45: /* SNB-EP */
- switch (cpu_data(cpu).x86_mask) {
- case 6: rev = 0x618; break;
- case 7: rev = 0x70c; break;
- }
- }
-
- return (cpu_data(cpu).microcode < rev);
-}
-
-static void intel_snb_check_microcode(void)
-{
- int pebs_broken = 0;
- int cpu;
-
- get_online_cpus();
- for_each_online_cpu(cpu) {
- if ((pebs_broken = intel_snb_pebs_broken(cpu)))
- break;
- }
- put_online_cpus();
-
- if (pebs_broken == x86_pmu.pebs_broken)
- return;
-
- /*
- * Serialized by the microcode lock..
- */
- if (x86_pmu.pebs_broken) {
- pr_info("PEBS enabled due to microcode update\n");
- x86_pmu.pebs_broken = 0;
- } else {
- pr_info("PEBS disabled due to CPU errata, please upgrade microcode\n");
- x86_pmu.pebs_broken = 1;
- }
-}
-
-static __init void intel_sandybridge_quirk(void)
-{
- x86_pmu.check_microcode = intel_snb_check_microcode;
- intel_snb_check_microcode();
-}
-
-static const struct { int id; char *name; } intel_arch_events_map[] __initconst = {
- { PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" },
- { PERF_COUNT_HW_INSTRUCTIONS, "instructions" },
- { PERF_COUNT_HW_BUS_CYCLES, "bus cycles" },
- { PERF_COUNT_HW_CACHE_REFERENCES, "cache references" },
- { PERF_COUNT_HW_CACHE_MISSES, "cache misses" },
- { PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" },
- { PERF_COUNT_HW_BRANCH_MISSES, "branch misses" },
-};
-
-static __init void intel_arch_events_quirk(void)
-{
- int bit;
-
- /* disable event that reported as not presend by cpuid */
- for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) {
- intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0;
- pr_warn("CPUID marked event: \'%s\' unavailable\n",
- intel_arch_events_map[bit].name);
- }
-}
-
-static __init void intel_nehalem_quirk(void)
-{
- union cpuid10_ebx ebx;
-
- ebx.full = x86_pmu.events_maskl;
- if (ebx.split.no_branch_misses_retired) {
- /*
- * Erratum AAJ80 detected, we work it around by using
- * the BR_MISP_EXEC.ANY event. This will over-count
- * branch-misses, but it's still much better than the
- * architectural event which is often completely bogus:
- */
- intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;
- ebx.split.no_branch_misses_retired = 0;
- x86_pmu.events_maskl = ebx.full;
- pr_info("CPU erratum AAJ80 worked around\n");
- }
-}
-
-EVENT_ATTR_STR(mem-loads, mem_ld_hsw, "event=0xcd,umask=0x1,ldlat=3");
-EVENT_ATTR_STR(mem-stores, mem_st_hsw, "event=0xd0,umask=0x82")
-
-static struct attribute *hsw_events_attrs[] = {
- EVENT_PTR(mem_ld_hsw),
- EVENT_PTR(mem_st_hsw),
- NULL
-};
-
-__init int intel_pmu_init(void)
-{
- union cpuid10_edx edx;
- union cpuid10_eax eax;
- union cpuid10_ebx ebx;
- struct event_constraint *c;
- unsigned int unused;
- int version;
-
- if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
- switch (boot_cpu_data.x86) {
- case 0x6:
- return p6_pmu_init();
- case 0xb:
- return knc_pmu_init();
- case 0xf:
- return p4_pmu_init();
- }
- return -ENODEV;
- }
-
- /*
- * Check whether the Architectural PerfMon supports
- * Branch Misses Retired hw_event or not.
- */
- cpuid(10, &eax.full, &ebx.full, &unused, &edx.full);
- if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT)
- return -ENODEV;
-
- version = eax.split.version_id;
- if (version < 2)
- x86_pmu = core_pmu;
- else
- x86_pmu = intel_pmu;
-
- x86_pmu.version = version;
- x86_pmu.num_counters = eax.split.num_counters;
- x86_pmu.cntval_bits = eax.split.bit_width;
- x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1;
-
- x86_pmu.events_maskl = ebx.full;
- x86_pmu.events_mask_len = eax.split.mask_length;
-
- x86_pmu.max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters);
-
- /*
- * Quirk: v2 perfmon does not report fixed-purpose events, so
- * assume at least 3 events:
- */
- if (version > 1)
- x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
-
- /*
- * v2 and above have a perf capabilities MSR
- */
- if (version > 1) {
- u64 capabilities;
-
- rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
- x86_pmu.intel_cap.capabilities = capabilities;
- }
-
- intel_ds_init();
-
- x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */
-
- /*
- * Install the hw-cache-events table:
- */
- switch (boot_cpu_data.x86_model) {
- case 14: /* 65 nm core solo/duo, "Yonah" */
- pr_cont("Core events, ");
- break;
-
- case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
- x86_add_quirk(intel_clovertown_quirk);
- case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
- case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
- case 29: /* six-core 45 nm xeon "Dunnington" */
- memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
- sizeof(hw_cache_event_ids));
-
- intel_pmu_lbr_init_core();
-
- x86_pmu.event_constraints = intel_core2_event_constraints;
- x86_pmu.pebs_constraints = intel_core2_pebs_event_constraints;
- pr_cont("Core2 events, ");
- break;
-
- case 26: /* 45 nm nehalem, "Bloomfield" */
- case 30: /* 45 nm nehalem, "Lynnfield" */
- case 46: /* 45 nm nehalem-ex, "Beckton" */
- memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
- sizeof(hw_cache_event_ids));
- memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
- sizeof(hw_cache_extra_regs));
-
- intel_pmu_lbr_init_nhm();
-
- x86_pmu.event_constraints = intel_nehalem_event_constraints;
- x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints;
- x86_pmu.enable_all = intel_pmu_nhm_enable_all;
- x86_pmu.extra_regs = intel_nehalem_extra_regs;
-
- x86_pmu.cpu_events = nhm_events_attrs;
-
- /* UOPS_ISSUED.STALLED_CYCLES */
- intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
- X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
- /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
- intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
- X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
-
- x86_add_quirk(intel_nehalem_quirk);
-
- pr_cont("Nehalem events, ");
- break;
-
- case 28: /* Atom */
- case 38: /* Lincroft */
- case 39: /* Penwell */
- case 53: /* Cloverview */
- case 54: /* Cedarview */
- memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
- sizeof(hw_cache_event_ids));
-
- intel_pmu_lbr_init_atom();
-
- x86_pmu.event_constraints = intel_gen_event_constraints;
- x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints;
- pr_cont("Atom events, ");
- break;
-
- case 37: /* 32 nm nehalem, "Clarkdale" */
- case 44: /* 32 nm nehalem, "Gulftown" */
- case 47: /* 32 nm Xeon E7 */
- memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
- sizeof(hw_cache_event_ids));
- memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
- sizeof(hw_cache_extra_regs));
-
- intel_pmu_lbr_init_nhm();
-
- x86_pmu.event_constraints = intel_westmere_event_constraints;
- x86_pmu.enable_all = intel_pmu_nhm_enable_all;
- x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints;
- x86_pmu.extra_regs = intel_westmere_extra_regs;
- x86_pmu.er_flags |= ERF_HAS_RSP_1;
-
- x86_pmu.cpu_events = nhm_events_attrs;
-
- /* UOPS_ISSUED.STALLED_CYCLES */
- intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
- X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
- /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
- intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
- X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
-
- pr_cont("Westmere events, ");
- break;
-
- case 42: /* SandyBridge */
- case 45: /* SandyBridge, "Romely-EP" */
- x86_add_quirk(intel_sandybridge_quirk);
- memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
- sizeof(hw_cache_event_ids));
- memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
- sizeof(hw_cache_extra_regs));
-
- intel_pmu_lbr_init_snb();
-
- x86_pmu.event_constraints = intel_snb_event_constraints;
- x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints;
- x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
- if (boot_cpu_data.x86_model == 45)
- x86_pmu.extra_regs = intel_snbep_extra_regs;
- else
- x86_pmu.extra_regs = intel_snb_extra_regs;
- /* all extra regs are per-cpu when HT is on */
- x86_pmu.er_flags |= ERF_HAS_RSP_1;
- x86_pmu.er_flags |= ERF_NO_HT_SHARING;
-
- x86_pmu.cpu_events = snb_events_attrs;
-
- /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
- intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
- X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
- /* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/
- intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
- X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1);
-
- pr_cont("SandyBridge events, ");
- break;
- case 58: /* IvyBridge */
- case 62: /* IvyBridge EP */
- memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
- sizeof(hw_cache_event_ids));
- memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
- sizeof(hw_cache_extra_regs));
-
- intel_pmu_lbr_init_snb();
-
- x86_pmu.event_constraints = intel_ivb_event_constraints;
- x86_pmu.pebs_constraints = intel_ivb_pebs_event_constraints;
- x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
- if (boot_cpu_data.x86_model == 62)
- x86_pmu.extra_regs = intel_snbep_extra_regs;
- else
- x86_pmu.extra_regs = intel_snb_extra_regs;
- /* all extra regs are per-cpu when HT is on */
- x86_pmu.er_flags |= ERF_HAS_RSP_1;
- x86_pmu.er_flags |= ERF_NO_HT_SHARING;
-
- x86_pmu.cpu_events = snb_events_attrs;
-
- /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
- intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
- X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
-
- pr_cont("IvyBridge events, ");
- break;
-
-
- case 60: /* Haswell Client */
- case 70:
- case 71:
- case 63:
- x86_pmu.late_ack = true;
- memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids));
- memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
-
- intel_pmu_lbr_init_snb();
-
- x86_pmu.event_constraints = intel_hsw_event_constraints;
- x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
- x86_pmu.extra_regs = intel_snb_extra_regs;
- x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
- /* all extra regs are per-cpu when HT is on */
- x86_pmu.er_flags |= ERF_HAS_RSP_1;
- x86_pmu.er_flags |= ERF_NO_HT_SHARING;
-
- x86_pmu.hw_config = hsw_hw_config;
- x86_pmu.get_event_constraints = hsw_get_event_constraints;
- x86_pmu.cpu_events = hsw_events_attrs;
- pr_cont("Haswell events, ");
- break;
-
- default:
- switch (x86_pmu.version) {
- case 1:
- x86_pmu.event_constraints = intel_v1_event_constraints;
- pr_cont("generic architected perfmon v1, ");
- break;
- default:
- /*
- * default constraints for v2 and up
- */
- x86_pmu.event_constraints = intel_gen_event_constraints;
- pr_cont("generic architected perfmon, ");
- break;
- }
- }
-
- if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) {
- WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
- x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC);
- x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC;
- }
- x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
-
- if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) {
- WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
- x86_pmu.num_counters_fixed, INTEL_PMC_MAX_FIXED);
- x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED;
- }
-
- x86_pmu.intel_ctrl |=
- ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED;
-
- if (x86_pmu.event_constraints) {
- /*
- * event on fixed counter2 (REF_CYCLES) only works on this
- * counter, so do not extend mask to generic counters
- */
- for_each_event_constraint(c, x86_pmu.event_constraints) {
- if (c->cmask != FIXED_EVENT_FLAGS
- || c->idxmsk64 == INTEL_PMC_MSK_FIXED_REF_CYCLES) {
- continue;
- }
-
- c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
- c->weight += x86_pmu.num_counters;
- }
- }
-
- /* Support full width counters using alternative MSR range */
- if (x86_pmu.intel_cap.full_width_write) {
- x86_pmu.max_period = x86_pmu.cntval_mask;
- x86_pmu.perfctr = MSR_IA32_PMC0;
- pr_cont("full-width counters, ");
- }
-
- return 0;
-}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
deleted file mode 100644
index 3065c57a63c1..000000000000
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ /dev/null
@@ -1,1034 +0,0 @@
-#include <linux/bitops.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-
-#include <asm/perf_event.h>
-#include <asm/insn.h>
-
-#include "perf_event.h"
-
-/* The size of a BTS record in bytes: */
-#define BTS_RECORD_SIZE 24
-
-#define BTS_BUFFER_SIZE (PAGE_SIZE << 4)
-#define PEBS_BUFFER_SIZE PAGE_SIZE
-
-/*
- * pebs_record_32 for p4 and core not supported
-
-struct pebs_record_32 {
- u32 flags, ip;
- u32 ax, bc, cx, dx;
- u32 si, di, bp, sp;
-};
-
- */
-
-union intel_x86_pebs_dse {
- u64 val;
- struct {
- unsigned int ld_dse:4;
- unsigned int ld_stlb_miss:1;
- unsigned int ld_locked:1;
- unsigned int ld_reserved:26;
- };
- struct {
- unsigned int st_l1d_hit:1;
- unsigned int st_reserved1:3;
- unsigned int st_stlb_miss:1;
- unsigned int st_locked:1;
- unsigned int st_reserved2:26;
- };
-};
-
-
-/*
- * Map PEBS Load Latency Data Source encodings to generic
- * memory data source information
- */
-#define P(a, b) PERF_MEM_S(a, b)
-#define OP_LH (P(OP, LOAD) | P(LVL, HIT))
-#define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS))
-
-static const u64 pebs_data_source[] = {
- P(OP, LOAD) | P(LVL, MISS) | P(LVL, L3) | P(SNOOP, NA),/* 0x00:ukn L3 */
- OP_LH | P(LVL, L1) | P(SNOOP, NONE), /* 0x01: L1 local */
- OP_LH | P(LVL, LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */
- OP_LH | P(LVL, L2) | P(SNOOP, NONE), /* 0x03: L2 hit */
- OP_LH | P(LVL, L3) | P(SNOOP, NONE), /* 0x04: L3 hit */
- OP_LH | P(LVL, L3) | P(SNOOP, MISS), /* 0x05: L3 hit, snoop miss */
- OP_LH | P(LVL, L3) | P(SNOOP, HIT), /* 0x06: L3 hit, snoop hit */
- OP_LH | P(LVL, L3) | P(SNOOP, HITM), /* 0x07: L3 hit, snoop hitm */
- OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HIT), /* 0x08: L3 miss snoop hit */
- OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/
- OP_LH | P(LVL, LOC_RAM) | P(SNOOP, HIT), /* 0x0a: L3 miss, shared */
- OP_LH | P(LVL, REM_RAM1) | P(SNOOP, HIT), /* 0x0b: L3 miss, shared */
- OP_LH | P(LVL, LOC_RAM) | SNOOP_NONE_MISS,/* 0x0c: L3 miss, excl */
- OP_LH | P(LVL, REM_RAM1) | SNOOP_NONE_MISS,/* 0x0d: L3 miss, excl */
- OP_LH | P(LVL, IO) | P(SNOOP, NONE), /* 0x0e: I/O */
- OP_LH | P(LVL, UNC) | P(SNOOP, NONE), /* 0x0f: uncached */
-};
-
-static u64 precise_store_data(u64 status)
-{
- union intel_x86_pebs_dse dse;
- u64 val = P(OP, STORE) | P(SNOOP, NA) | P(LVL, L1) | P(TLB, L2);
-
- dse.val = status;
-
- /*
- * bit 4: TLB access
- * 1 = stored missed 2nd level TLB
- *
- * so it either hit the walker or the OS
- * otherwise hit 2nd level TLB
- */
- if (dse.st_stlb_miss)
- val |= P(TLB, MISS);
- else
- val |= P(TLB, HIT);
-
- /*
- * bit 0: hit L1 data cache
- * if not set, then all we know is that
- * it missed L1D
- */
- if (dse.st_l1d_hit)
- val |= P(LVL, HIT);
- else
- val |= P(LVL, MISS);
-
- /*
- * bit 5: Locked prefix
- */
- if (dse.st_locked)
- val |= P(LOCK, LOCKED);
-
- return val;
-}
-
-static u64 precise_store_data_hsw(u64 status)
-{
- union perf_mem_data_src dse;
-
- dse.val = 0;
- dse.mem_op = PERF_MEM_OP_STORE;
- dse.mem_lvl = PERF_MEM_LVL_NA;
- if (status & 1)
- dse.mem_lvl = PERF_MEM_LVL_L1;
- /* Nothing else supported. Sorry. */
- return dse.val;
-}
-
-static u64 load_latency_data(u64 status)
-{
- union intel_x86_pebs_dse dse;
- u64 val;
- int model = boot_cpu_data.x86_model;
- int fam = boot_cpu_data.x86;
-
- dse.val = status;
-
- /*
- * use the mapping table for bit 0-3
- */
- val = pebs_data_source[dse.ld_dse];
-
- /*
- * Nehalem models do not support TLB, Lock infos
- */
- if (fam == 0x6 && (model == 26 || model == 30
- || model == 31 || model == 46)) {
- val |= P(TLB, NA) | P(LOCK, NA);
- return val;
- }
- /*
- * bit 4: TLB access
- * 0 = did not miss 2nd level TLB
- * 1 = missed 2nd level TLB
- */
- if (dse.ld_stlb_miss)
- val |= P(TLB, MISS) | P(TLB, L2);
- else
- val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2);
-
- /*
- * bit 5: locked prefix
- */
- if (dse.ld_locked)
- val |= P(LOCK, LOCKED);
-
- return val;
-}
-
-struct pebs_record_core {
- u64 flags, ip;
- u64 ax, bx, cx, dx;
- u64 si, di, bp, sp;
- u64 r8, r9, r10, r11;
- u64 r12, r13, r14, r15;
-};
-
-struct pebs_record_nhm {
- u64 flags, ip;
- u64 ax, bx, cx, dx;
- u64 si, di, bp, sp;
- u64 r8, r9, r10, r11;
- u64 r12, r13, r14, r15;
- u64 status, dla, dse, lat;
-};
-
-/*
- * Same as pebs_record_nhm, with two additional fields.
- */
-struct pebs_record_hsw {
- struct pebs_record_nhm nhm;
- /*
- * Real IP of the event. In the Intel documentation this
- * is called eventingrip.
- */
- u64 real_ip;
- /*
- * TSX tuning information field: abort cycles and abort flags.
- */
- u64 tsx_tuning;
-};
-
-void init_debug_store_on_cpu(int cpu)
-{
- struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-
- if (!ds)
- return;
-
- wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
- (u32)((u64)(unsigned long)ds),
- (u32)((u64)(unsigned long)ds >> 32));
-}
-
-void fini_debug_store_on_cpu(int cpu)
-{
- if (!per_cpu(cpu_hw_events, cpu).ds)
- return;
-
- wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
-}
-
-static int alloc_pebs_buffer(int cpu)
-{
- struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
- int node = cpu_to_node(cpu);
- int max, thresh = 1; /* always use a single PEBS record */
- void *buffer;
-
- if (!x86_pmu.pebs)
- return 0;
-
- buffer = kmalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node);
- if (unlikely(!buffer))
- return -ENOMEM;
-
- max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size;
-
- ds->pebs_buffer_base = (u64)(unsigned long)buffer;
- ds->pebs_index = ds->pebs_buffer_base;
- ds->pebs_absolute_maximum = ds->pebs_buffer_base +
- max * x86_pmu.pebs_record_size;
-
- ds->pebs_interrupt_threshold = ds->pebs_buffer_base +
- thresh * x86_pmu.pebs_record_size;
-
- return 0;
-}
-
-static void release_pebs_buffer(int cpu)
-{
- struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-
- if (!ds || !x86_pmu.pebs)
- return;
-
- kfree((void *)(unsigned long)ds->pebs_buffer_base);
- ds->pebs_buffer_base = 0;
-}
-
-static int alloc_bts_buffer(int cpu)
-{
- struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
- int node = cpu_to_node(cpu);
- int max, thresh;
- void *buffer;
-
- if (!x86_pmu.bts)
- return 0;
-
- buffer = kmalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node);
- if (unlikely(!buffer))
- return -ENOMEM;
-
- max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
- thresh = max / 16;
-
- ds->bts_buffer_base = (u64)(unsigned long)buffer;
- ds->bts_index = ds->bts_buffer_base;
- ds->bts_absolute_maximum = ds->bts_buffer_base +
- max * BTS_RECORD_SIZE;
- ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
- thresh * BTS_RECORD_SIZE;
-
- return 0;
-}
-
-static void release_bts_buffer(int cpu)
-{
- struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-
- if (!ds || !x86_pmu.bts)
- return;
-
- kfree((void *)(unsigned long)ds->bts_buffer_base);
- ds->bts_buffer_base = 0;
-}
-
-static int alloc_ds_buffer(int cpu)
-{
- int node = cpu_to_node(cpu);
- struct debug_store *ds;
-
- ds = kmalloc_node(sizeof(*ds), GFP_KERNEL | __GFP_ZERO, node);
- if (unlikely(!ds))
- return -ENOMEM;
-
- per_cpu(cpu_hw_events, cpu).ds = ds;
-
- return 0;
-}
-
-static void release_ds_buffer(int cpu)
-{
- struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-
- if (!ds)
- return;
-
- per_cpu(cpu_hw_events, cpu).ds = NULL;
- kfree(ds);
-}
-
-void release_ds_buffers(void)
-{
- int cpu;
-
- if (!x86_pmu.bts && !x86_pmu.pebs)
- return;
-
- get_online_cpus();
- for_each_online_cpu(cpu)
- fini_debug_store_on_cpu(cpu);
-
- for_each_possible_cpu(cpu) {
- release_pebs_buffer(cpu);
- release_bts_buffer(cpu);
- release_ds_buffer(cpu);
- }
- put_online_cpus();
-}
-
-void reserve_ds_buffers(void)
-{
- int bts_err = 0, pebs_err = 0;
- int cpu;
-
- x86_pmu.bts_active = 0;
- x86_pmu.pebs_active = 0;
-
- if (!x86_pmu.bts && !x86_pmu.pebs)
- return;
-
- if (!x86_pmu.bts)
- bts_err = 1;
-
- if (!x86_pmu.pebs)
- pebs_err = 1;
-
- get_online_cpus();
-
- for_each_possible_cpu(cpu) {
- if (alloc_ds_buffer(cpu)) {
- bts_err = 1;
- pebs_err = 1;
- }
-
- if (!bts_err && alloc_bts_buffer(cpu))
- bts_err = 1;
-
- if (!pebs_err && alloc_pebs_buffer(cpu))
- pebs_err = 1;
-
- if (bts_err && pebs_err)
- break;
- }
-
- if (bts_err) {
- for_each_possible_cpu(cpu)
- release_bts_buffer(cpu);
- }
-
- if (pebs_err) {
- for_each_possible_cpu(cpu)
- release_pebs_buffer(cpu);
- }
-
- if (bts_err && pebs_err) {
- for_each_possible_cpu(cpu)
- release_ds_buffer(cpu);
- } else {
- if (x86_pmu.bts && !bts_err)
- x86_pmu.bts_active = 1;
-
- if (x86_pmu.pebs && !pebs_err)
- x86_pmu.pebs_active = 1;
-
- for_each_online_cpu(cpu)
- init_debug_store_on_cpu(cpu);
- }
-
- put_online_cpus();
-}
-
-/*
- * BTS
- */
-
-struct event_constraint bts_constraint =
- EVENT_CONSTRAINT(0, 1ULL << INTEL_PMC_IDX_FIXED_BTS, 0);
-
-void intel_pmu_enable_bts(u64 config)
-{
- unsigned long debugctlmsr;
-
- debugctlmsr = get_debugctlmsr();
-
- debugctlmsr |= DEBUGCTLMSR_TR;
- debugctlmsr |= DEBUGCTLMSR_BTS;
- debugctlmsr |= DEBUGCTLMSR_BTINT;
-
- if (!(config & ARCH_PERFMON_EVENTSEL_OS))
- debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS;
-
- if (!(config & ARCH_PERFMON_EVENTSEL_USR))
- debugctlmsr |= DEBUGCTLMSR_BTS_OFF_USR;
-
- update_debugctlmsr(debugctlmsr);
-}
-
-void intel_pmu_disable_bts(void)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- unsigned long debugctlmsr;
-
- if (!cpuc->ds)
- return;
-
- debugctlmsr = get_debugctlmsr();
-
- debugctlmsr &=
- ~(DEBUGCTLMSR_TR | DEBUGCTLMSR_BTS | DEBUGCTLMSR_BTINT |
- DEBUGCTLMSR_BTS_OFF_OS | DEBUGCTLMSR_BTS_OFF_USR);
-
- update_debugctlmsr(debugctlmsr);
-}
-
-int intel_pmu_drain_bts_buffer(void)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- struct debug_store *ds = cpuc->ds;
- struct bts_record {
- u64 from;
- u64 to;
- u64 flags;
- };
- struct perf_event *event = cpuc->events[INTEL_PMC_IDX_FIXED_BTS];
- struct bts_record *at, *top;
- struct perf_output_handle handle;
- struct perf_event_header header;
- struct perf_sample_data data;
- struct pt_regs regs;
-
- if (!event)
- return 0;
-
- if (!x86_pmu.bts_active)
- return 0;
-
- at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
- top = (struct bts_record *)(unsigned long)ds->bts_index;
-
- if (top <= at)
- return 0;
-
- memset(&regs, 0, sizeof(regs));
-
- ds->bts_index = ds->bts_buffer_base;
-
- perf_sample_data_init(&data, 0, event->hw.last_period);
-
- /*
- * Prepare a generic sample, i.e. fill in the invariant fields.
- * We will overwrite the from and to address before we output
- * the sample.
- */
- perf_prepare_sample(&header, &data, event, &regs);
-
- if (perf_output_begin(&handle, event, header.size * (top - at)))
- return 1;
-
- for (; at < top; at++) {
- data.ip = at->from;
- data.addr = at->to;
-
- perf_output_sample(&handle, &header, &data, event);
- }
-
- perf_output_end(&handle);
-
- /* There's new data available. */
- event->hw.interrupts++;
- event->pending_kill = POLL_IN;
- return 1;
-}
-
-/*
- * PEBS
- */
-struct event_constraint intel_core2_pebs_event_constraints[] = {
- INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
- INTEL_UEVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */
- INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */
- INTEL_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */
- INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */
- EVENT_CONSTRAINT_END
-};
-
-struct event_constraint intel_atom_pebs_event_constraints[] = {
- INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
- INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */
- INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */
- EVENT_CONSTRAINT_END
-};
-
-struct event_constraint intel_nehalem_pebs_event_constraints[] = {
- INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */
- INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
- INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INST_RETIRED.ANY */
- INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
- INTEL_UEVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */
- INTEL_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */
- INTEL_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
- INTEL_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */
- EVENT_CONSTRAINT_END
-};
-
-struct event_constraint intel_westmere_pebs_event_constraints[] = {
- INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */
- INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
- INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INSTR_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */
- INTEL_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
- INTEL_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */
- EVENT_CONSTRAINT_END
-};
-
-struct event_constraint intel_snb_pebs_event_constraints[] = {
- INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
- INTEL_UEVENT_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
- INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */
- INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */
- INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */
- INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */
- INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
- INTEL_UEVENT_CONSTRAINT(0x02d4, 0xf), /* MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS */
- EVENT_CONSTRAINT_END
-};
-
-struct event_constraint intel_ivb_pebs_event_constraints[] = {
- INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
- INTEL_UEVENT_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
- INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */
- INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */
- INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */
- INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */
- INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
- EVENT_CONSTRAINT_END
-};
-
-struct event_constraint intel_hsw_pebs_event_constraints[] = {
- INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
- INTEL_PST_HSW_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
- INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */
- INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
- INTEL_UEVENT_CONSTRAINT(0x01c5, 0xf), /* BR_MISP_RETIRED.CONDITIONAL */
- INTEL_UEVENT_CONSTRAINT(0x04c5, 0xf), /* BR_MISP_RETIRED.ALL_BRANCHES */
- INTEL_UEVENT_CONSTRAINT(0x20c5, 0xf), /* BR_MISP_RETIRED.NEAR_TAKEN */
- INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.* */
- /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
- INTEL_UEVENT_CONSTRAINT(0x11d0, 0xf),
- /* MEM_UOPS_RETIRED.STLB_MISS_STORES */
- INTEL_UEVENT_CONSTRAINT(0x12d0, 0xf),
- INTEL_UEVENT_CONSTRAINT(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
- INTEL_UEVENT_CONSTRAINT(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */
- /* MEM_UOPS_RETIRED.SPLIT_STORES */
- INTEL_UEVENT_CONSTRAINT(0x42d0, 0xf),
- INTEL_UEVENT_CONSTRAINT(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */
- INTEL_PST_HSW_CONSTRAINT(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */
- INTEL_UEVENT_CONSTRAINT(0x01d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.L1_HIT */
- INTEL_UEVENT_CONSTRAINT(0x02d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.L2_HIT */
- INTEL_UEVENT_CONSTRAINT(0x04d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.L3_HIT */
- /* MEM_LOAD_UOPS_RETIRED.HIT_LFB */
- INTEL_UEVENT_CONSTRAINT(0x40d1, 0xf),
- /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS */
- INTEL_UEVENT_CONSTRAINT(0x01d2, 0xf),
- /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT */
- INTEL_UEVENT_CONSTRAINT(0x02d2, 0xf),
- /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM */
- INTEL_UEVENT_CONSTRAINT(0x01d3, 0xf),
- INTEL_UEVENT_CONSTRAINT(0x04c8, 0xf), /* HLE_RETIRED.Abort */
- INTEL_UEVENT_CONSTRAINT(0x04c9, 0xf), /* RTM_RETIRED.Abort */
-
- EVENT_CONSTRAINT_END
-};
-
-struct event_constraint *intel_pebs_constraints(struct perf_event *event)
-{
- struct event_constraint *c;
-
- if (!event->attr.precise_ip)
- return NULL;
-
- if (x86_pmu.pebs_constraints) {
- for_each_event_constraint(c, x86_pmu.pebs_constraints) {
- if ((event->hw.config & c->cmask) == c->code) {
- event->hw.flags |= c->flags;
- return c;
- }
- }
- }
-
- return &emptyconstraint;
-}
-
-void intel_pmu_pebs_enable(struct perf_event *event)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- struct hw_perf_event *hwc = &event->hw;
-
- hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
-
- cpuc->pebs_enabled |= 1ULL << hwc->idx;
-
- if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT)
- cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32);
- else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
- cpuc->pebs_enabled |= 1ULL << 63;
-}
-
-void intel_pmu_pebs_disable(struct perf_event *event)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- struct hw_perf_event *hwc = &event->hw;
-
- cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
-
- if (event->hw.constraint->flags & PERF_X86_EVENT_PEBS_LDLAT)
- cpuc->pebs_enabled &= ~(1ULL << (hwc->idx + 32));
- else if (event->hw.constraint->flags & PERF_X86_EVENT_PEBS_ST)
- cpuc->pebs_enabled &= ~(1ULL << 63);
-
- if (cpuc->enabled)
- wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
-
- hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
-}
-
-void intel_pmu_pebs_enable_all(void)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-
- if (cpuc->pebs_enabled)
- wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
-}
-
-void intel_pmu_pebs_disable_all(void)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-
- if (cpuc->pebs_enabled)
- wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
-}
-
-static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- unsigned long from = cpuc->lbr_entries[0].from;
- unsigned long old_to, to = cpuc->lbr_entries[0].to;
- unsigned long ip = regs->ip;
- int is_64bit = 0;
-
- /*
- * We don't need to fixup if the PEBS assist is fault like
- */
- if (!x86_pmu.intel_cap.pebs_trap)
- return 1;
-
- /*
- * No LBR entry, no basic block, no rewinding
- */
- if (!cpuc->lbr_stack.nr || !from || !to)
- return 0;
-
- /*
- * Basic blocks should never cross user/kernel boundaries
- */
- if (kernel_ip(ip) != kernel_ip(to))
- return 0;
-
- /*
- * unsigned math, either ip is before the start (impossible) or
- * the basic block is larger than 1 page (sanity)
- */
- if ((ip - to) > PAGE_SIZE)
- return 0;
-
- /*
- * We sampled a branch insn, rewind using the LBR stack
- */
- if (ip == to) {
- set_linear_ip(regs, from);
- return 1;
- }
-
- do {
- struct insn insn;
- u8 buf[MAX_INSN_SIZE];
- void *kaddr;
-
- old_to = to;
- if (!kernel_ip(ip)) {
- int bytes, size = MAX_INSN_SIZE;
-
- bytes = copy_from_user_nmi(buf, (void __user *)to, size);
- if (bytes != size)
- return 0;
-
- kaddr = buf;
- } else
- kaddr = (void *)to;
-
-#ifdef CONFIG_X86_64
- is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32);
-#endif
- insn_init(&insn, kaddr, is_64bit);
- insn_get_length(&insn);
- to += insn.length;
- } while (to < ip);
-
- if (to == ip) {
- set_linear_ip(regs, old_to);
- return 1;
- }
-
- /*
- * Even though we decoded the basic block, the instruction stream
- * never matched the given IP, either the TO or the IP got corrupted.
- */
- return 0;
-}
-
-static void __intel_pmu_pebs_event(struct perf_event *event,
- struct pt_regs *iregs, void *__pebs)
-{
- /*
- * We cast to pebs_record_nhm to get the load latency data
- * if extra_reg MSR_PEBS_LD_LAT_THRESHOLD used
- */
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- struct pebs_record_nhm *pebs = __pebs;
- struct pebs_record_hsw *pebs_hsw = __pebs;
- struct perf_sample_data data;
- struct pt_regs regs;
- u64 sample_type;
- int fll, fst;
-
- if (!intel_pmu_save_and_restart(event))
- return;
-
- fll = event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT;
- fst = event->hw.flags & (PERF_X86_EVENT_PEBS_ST |
- PERF_X86_EVENT_PEBS_ST_HSW);
-
- perf_sample_data_init(&data, 0, event->hw.last_period);
-
- data.period = event->hw.last_period;
- sample_type = event->attr.sample_type;
-
- /*
- * if PEBS-LL or PreciseStore
- */
- if (fll || fst) {
- /*
- * Use latency for weight (only avail with PEBS-LL)
- */
- if (fll && (sample_type & PERF_SAMPLE_WEIGHT))
- data.weight = pebs->lat;
-
- /*
- * data.data_src encodes the data source
- */
- if (sample_type & PERF_SAMPLE_DATA_SRC) {
- if (fll)
- data.data_src.val = load_latency_data(pebs->dse);
- else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW)
- data.data_src.val =
- precise_store_data_hsw(pebs->dse);
- else
- data.data_src.val = precise_store_data(pebs->dse);
- }
- }
-
- /*
- * We use the interrupt regs as a base because the PEBS record
- * does not contain a full regs set, specifically it seems to
- * lack segment descriptors, which get used by things like
- * user_mode().
- *
- * In the simple case fix up only the IP and BP,SP regs, for
- * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly.
- * A possible PERF_SAMPLE_REGS will have to transfer all regs.
- */
- regs = *iregs;
- regs.flags = pebs->flags;
- set_linear_ip(&regs, pebs->ip);
- regs.bp = pebs->bp;
- regs.sp = pebs->sp;
-
- if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format >= 2) {
- regs.ip = pebs_hsw->real_ip;
- regs.flags |= PERF_EFLAGS_EXACT;
- } else if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(&regs))
- regs.flags |= PERF_EFLAGS_EXACT;
- else
- regs.flags &= ~PERF_EFLAGS_EXACT;
-
- if ((event->attr.sample_type & PERF_SAMPLE_ADDR) &&
- x86_pmu.intel_cap.pebs_format >= 1)
- data.addr = pebs->dla;
-
- if (has_branch_stack(event))
- data.br_stack = &cpuc->lbr_stack;
-
- if (perf_event_overflow(event, &data, &regs))
- x86_pmu_stop(event, 0);
-}
-
-static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- struct debug_store *ds = cpuc->ds;
- struct perf_event *event = cpuc->events[0]; /* PMC0 only */
- struct pebs_record_core *at, *top;
- int n;
-
- if (!x86_pmu.pebs_active)
- return;
-
- at = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base;
- top = (struct pebs_record_core *)(unsigned long)ds->pebs_index;
-
- /*
- * Whatever else happens, drain the thing
- */
- ds->pebs_index = ds->pebs_buffer_base;
-
- if (!test_bit(0, cpuc->active_mask))
- return;
-
- WARN_ON_ONCE(!event);
-
- if (!event->attr.precise_ip)
- return;
-
- n = top - at;
- if (n <= 0)
- return;
-
- /*
- * Should not happen, we program the threshold at 1 and do not
- * set a reset value.
- */
- WARN_ONCE(n > 1, "bad leftover pebs %d\n", n);
- at += n - 1;
-
- __intel_pmu_pebs_event(event, iregs, at);
-}
-
-static void __intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, void *at,
- void *top)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- struct debug_store *ds = cpuc->ds;
- struct perf_event *event = NULL;
- u64 status = 0;
- int bit;
-
- ds->pebs_index = ds->pebs_buffer_base;
-
- for (; at < top; at += x86_pmu.pebs_record_size) {
- struct pebs_record_nhm *p = at;
-
- for_each_set_bit(bit, (unsigned long *)&p->status,
- x86_pmu.max_pebs_events) {
- event = cpuc->events[bit];
- if (!test_bit(bit, cpuc->active_mask))
- continue;
-
- WARN_ON_ONCE(!event);
-
- if (!event->attr.precise_ip)
- continue;
-
- if (__test_and_set_bit(bit, (unsigned long *)&status))
- continue;
-
- break;
- }
-
- if (!event || bit >= x86_pmu.max_pebs_events)
- continue;
-
- __intel_pmu_pebs_event(event, iregs, at);
- }
-}
-
-static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- struct debug_store *ds = cpuc->ds;
- struct pebs_record_nhm *at, *top;
- int n;
-
- if (!x86_pmu.pebs_active)
- return;
-
- at = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
- top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
-
- ds->pebs_index = ds->pebs_buffer_base;
-
- n = top - at;
- if (n <= 0)
- return;
-
- /*
- * Should not happen, we program the threshold at 1 and do not
- * set a reset value.
- */
- WARN_ONCE(n > x86_pmu.max_pebs_events,
- "Unexpected number of pebs records %d\n", n);
-
- return __intel_pmu_drain_pebs_nhm(iregs, at, top);
-}
-
-static void intel_pmu_drain_pebs_hsw(struct pt_regs *iregs)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- struct debug_store *ds = cpuc->ds;
- struct pebs_record_hsw *at, *top;
- int n;
-
- if (!x86_pmu.pebs_active)
- return;
-
- at = (struct pebs_record_hsw *)(unsigned long)ds->pebs_buffer_base;
- top = (struct pebs_record_hsw *)(unsigned long)ds->pebs_index;
-
- n = top - at;
- if (n <= 0)
- return;
- /*
- * Should not happen, we program the threshold at 1 and do not
- * set a reset value.
- */
- WARN_ONCE(n > x86_pmu.max_pebs_events,
- "Unexpected number of pebs records %d\n", n);
-
- return __intel_pmu_drain_pebs_nhm(iregs, at, top);
-}
-
-/*
- * BTS, PEBS probe and setup
- */
-
-void intel_ds_init(void)
-{
- /*
- * No support for 32bit formats
- */
- if (!boot_cpu_has(X86_FEATURE_DTES64))
- return;
-
- x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS);
- x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS);
- if (x86_pmu.pebs) {
- char pebs_type = x86_pmu.intel_cap.pebs_trap ? '+' : '-';
- int format = x86_pmu.intel_cap.pebs_format;
-
- switch (format) {
- case 0:
- printk(KERN_CONT "PEBS fmt0%c, ", pebs_type);
- x86_pmu.pebs_record_size = sizeof(struct pebs_record_core);
- x86_pmu.drain_pebs = intel_pmu_drain_pebs_core;
- break;
-
- case 1:
- printk(KERN_CONT "PEBS fmt1%c, ", pebs_type);
- x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm);
- x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
- break;
-
- case 2:
- pr_cont("PEBS fmt2%c, ", pebs_type);
- x86_pmu.pebs_record_size = sizeof(struct pebs_record_hsw);
- x86_pmu.drain_pebs = intel_pmu_drain_pebs_hsw;
- break;
-
- default:
- printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type);
- x86_pmu.pebs = 0;
- }
- }
-}
-
-void perf_restore_debug_store(void)
-{
- struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds);
-
- if (!x86_pmu.bts && !x86_pmu.pebs)
- return;
-
- wrmsrl(MSR_IA32_DS_AREA, (unsigned long)ds);
-}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
deleted file mode 100644
index d5be06a5005e..000000000000
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ /dev/null
@@ -1,761 +0,0 @@
-#include <linux/perf_event.h>
-#include <linux/types.h>
-
-#include <asm/perf_event.h>
-#include <asm/msr.h>
-#include <asm/insn.h>
-
-#include "perf_event.h"
-
-enum {
- LBR_FORMAT_32 = 0x00,
- LBR_FORMAT_LIP = 0x01,
- LBR_FORMAT_EIP = 0x02,
- LBR_FORMAT_EIP_FLAGS = 0x03,
- LBR_FORMAT_EIP_FLAGS2 = 0x04,
- LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_EIP_FLAGS2,
-};
-
-static enum {
- LBR_EIP_FLAGS = 1,
- LBR_TSX = 2,
-} lbr_desc[LBR_FORMAT_MAX_KNOWN + 1] = {
- [LBR_FORMAT_EIP_FLAGS] = LBR_EIP_FLAGS,
- [LBR_FORMAT_EIP_FLAGS2] = LBR_EIP_FLAGS | LBR_TSX,
-};
-
-/*
- * Intel LBR_SELECT bits
- * Intel Vol3a, April 2011, Section 16.7 Table 16-10
- *
- * Hardware branch filter (not available on all CPUs)
- */
-#define LBR_KERNEL_BIT 0 /* do not capture at ring0 */
-#define LBR_USER_BIT 1 /* do not capture at ring > 0 */
-#define LBR_JCC_BIT 2 /* do not capture conditional branches */
-#define LBR_REL_CALL_BIT 3 /* do not capture relative calls */
-#define LBR_IND_CALL_BIT 4 /* do not capture indirect calls */
-#define LBR_RETURN_BIT 5 /* do not capture near returns */
-#define LBR_IND_JMP_BIT 6 /* do not capture indirect jumps */
-#define LBR_REL_JMP_BIT 7 /* do not capture relative jumps */
-#define LBR_FAR_BIT 8 /* do not capture far branches */
-
-#define LBR_KERNEL (1 << LBR_KERNEL_BIT)
-#define LBR_USER (1 << LBR_USER_BIT)
-#define LBR_JCC (1 << LBR_JCC_BIT)
-#define LBR_REL_CALL (1 << LBR_REL_CALL_BIT)
-#define LBR_IND_CALL (1 << LBR_IND_CALL_BIT)
-#define LBR_RETURN (1 << LBR_RETURN_BIT)
-#define LBR_REL_JMP (1 << LBR_REL_JMP_BIT)
-#define LBR_IND_JMP (1 << LBR_IND_JMP_BIT)
-#define LBR_FAR (1 << LBR_FAR_BIT)
-
-#define LBR_PLM (LBR_KERNEL | LBR_USER)
-
-#define LBR_SEL_MASK 0x1ff /* valid bits in LBR_SELECT */
-#define LBR_NOT_SUPP -1 /* LBR filter not supported */
-#define LBR_IGN 0 /* ignored */
-
-#define LBR_ANY \
- (LBR_JCC |\
- LBR_REL_CALL |\
- LBR_IND_CALL |\
- LBR_RETURN |\
- LBR_REL_JMP |\
- LBR_IND_JMP |\
- LBR_FAR)
-
-#define LBR_FROM_FLAG_MISPRED (1ULL << 63)
-#define LBR_FROM_FLAG_IN_TX (1ULL << 62)
-#define LBR_FROM_FLAG_ABORT (1ULL << 61)
-
-#define for_each_branch_sample_type(x) \
- for ((x) = PERF_SAMPLE_BRANCH_USER; \
- (x) < PERF_SAMPLE_BRANCH_MAX; (x) <<= 1)
-
-/*
- * x86control flow change classification
- * x86control flow changes include branches, interrupts, traps, faults
- */
-enum {
- X86_BR_NONE = 0, /* unknown */
-
- X86_BR_USER = 1 << 0, /* branch target is user */
- X86_BR_KERNEL = 1 << 1, /* branch target is kernel */
-
- X86_BR_CALL = 1 << 2, /* call */
- X86_BR_RET = 1 << 3, /* return */
- X86_BR_SYSCALL = 1 << 4, /* syscall */
- X86_BR_SYSRET = 1 << 5, /* syscall return */
- X86_BR_INT = 1 << 6, /* sw interrupt */
- X86_BR_IRET = 1 << 7, /* return from interrupt */
- X86_BR_JCC = 1 << 8, /* conditional */
- X86_BR_JMP = 1 << 9, /* jump */
- X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */
- X86_BR_IND_CALL = 1 << 11,/* indirect calls */
- X86_BR_ABORT = 1 << 12,/* transaction abort */
- X86_BR_IN_TX = 1 << 13,/* in transaction */
- X86_BR_NO_TX = 1 << 14,/* not in transaction */
-};
-
-#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
-#define X86_BR_ANYTX (X86_BR_NO_TX | X86_BR_IN_TX)
-
-#define X86_BR_ANY \
- (X86_BR_CALL |\
- X86_BR_RET |\
- X86_BR_SYSCALL |\
- X86_BR_SYSRET |\
- X86_BR_INT |\
- X86_BR_IRET |\
- X86_BR_JCC |\
- X86_BR_JMP |\
- X86_BR_IRQ |\
- X86_BR_ABORT |\
- X86_BR_IND_CALL)
-
-#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY)
-
-#define X86_BR_ANY_CALL \
- (X86_BR_CALL |\
- X86_BR_IND_CALL |\
- X86_BR_SYSCALL |\
- X86_BR_IRQ |\
- X86_BR_INT)
-
-static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);
-
-/*
- * We only support LBR implementations that have FREEZE_LBRS_ON_PMI
- * otherwise it becomes near impossible to get a reliable stack.
- */
-
-static void __intel_pmu_lbr_enable(void)
-{
- u64 debugctl;
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-
- if (cpuc->lbr_sel)
- wrmsrl(MSR_LBR_SELECT, cpuc->lbr_sel->config);
-
- rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
- debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
- wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
-}
-
-static void __intel_pmu_lbr_disable(void)
-{
- u64 debugctl;
-
- rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
- debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
- wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
-}
-
-static void intel_pmu_lbr_reset_32(void)
-{
- int i;
-
- for (i = 0; i < x86_pmu.lbr_nr; i++)
- wrmsrl(x86_pmu.lbr_from + i, 0);
-}
-
-static void intel_pmu_lbr_reset_64(void)
-{
- int i;
-
- for (i = 0; i < x86_pmu.lbr_nr; i++) {
- wrmsrl(x86_pmu.lbr_from + i, 0);
- wrmsrl(x86_pmu.lbr_to + i, 0);
- }
-}
-
-void intel_pmu_lbr_reset(void)
-{
- if (!x86_pmu.lbr_nr)
- return;
-
- if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
- intel_pmu_lbr_reset_32();
- else
- intel_pmu_lbr_reset_64();
-}
-
-void intel_pmu_lbr_enable(struct perf_event *event)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-
- if (!x86_pmu.lbr_nr)
- return;
-
- /*
- * Reset the LBR stack if we changed task context to
- * avoid data leaks.
- */
- if (event->ctx->task && cpuc->lbr_context != event->ctx) {
- intel_pmu_lbr_reset();
- cpuc->lbr_context = event->ctx;
- }
- cpuc->br_sel = event->hw.branch_reg.reg;
-
- cpuc->lbr_users++;
-}
-
-void intel_pmu_lbr_disable(struct perf_event *event)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-
- if (!x86_pmu.lbr_nr)
- return;
-
- cpuc->lbr_users--;
- WARN_ON_ONCE(cpuc->lbr_users < 0);
-
- if (cpuc->enabled && !cpuc->lbr_users) {
- __intel_pmu_lbr_disable();
- /* avoid stale pointer */
- cpuc->lbr_context = NULL;
- }
-}
-
-void intel_pmu_lbr_enable_all(void)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-
- if (cpuc->lbr_users)
- __intel_pmu_lbr_enable();
-}
-
-void intel_pmu_lbr_disable_all(void)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-
- if (cpuc->lbr_users)
- __intel_pmu_lbr_disable();
-}
-
-/*
- * TOS = most recently recorded branch
- */
-static inline u64 intel_pmu_lbr_tos(void)
-{
- u64 tos;
-
- rdmsrl(x86_pmu.lbr_tos, tos);
-
- return tos;
-}
-
-static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
-{
- unsigned long mask = x86_pmu.lbr_nr - 1;
- u64 tos = intel_pmu_lbr_tos();
- int i;
-
- for (i = 0; i < x86_pmu.lbr_nr; i++) {
- unsigned long lbr_idx = (tos - i) & mask;
- union {
- struct {
- u32 from;
- u32 to;
- };
- u64 lbr;
- } msr_lastbranch;
-
- rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr);
-
- cpuc->lbr_entries[i].from = msr_lastbranch.from;
- cpuc->lbr_entries[i].to = msr_lastbranch.to;
- cpuc->lbr_entries[i].mispred = 0;
- cpuc->lbr_entries[i].predicted = 0;
- cpuc->lbr_entries[i].reserved = 0;
- }
- cpuc->lbr_stack.nr = i;
-}
-
-/*
- * Due to lack of segmentation in Linux the effective address (offset)
- * is the same as the linear address, allowing us to merge the LIP and EIP
- * LBR formats.
- */
-static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
-{
- unsigned long mask = x86_pmu.lbr_nr - 1;
- int lbr_format = x86_pmu.intel_cap.lbr_format;
- u64 tos = intel_pmu_lbr_tos();
- int i;
-
- for (i = 0; i < x86_pmu.lbr_nr; i++) {
- unsigned long lbr_idx = (tos - i) & mask;
- u64 from, to, mis = 0, pred = 0, in_tx = 0, abort = 0;
- int skip = 0;
- int lbr_flags = lbr_desc[lbr_format];
-
- rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
- rdmsrl(x86_pmu.lbr_to + lbr_idx, to);
-
- if (lbr_flags & LBR_EIP_FLAGS) {
- mis = !!(from & LBR_FROM_FLAG_MISPRED);
- pred = !mis;
- skip = 1;
- }
- if (lbr_flags & LBR_TSX) {
- in_tx = !!(from & LBR_FROM_FLAG_IN_TX);
- abort = !!(from & LBR_FROM_FLAG_ABORT);
- skip = 3;
- }
- from = (u64)((((s64)from) << skip) >> skip);
-
- cpuc->lbr_entries[i].from = from;
- cpuc->lbr_entries[i].to = to;
- cpuc->lbr_entries[i].mispred = mis;
- cpuc->lbr_entries[i].predicted = pred;
- cpuc->lbr_entries[i].in_tx = in_tx;
- cpuc->lbr_entries[i].abort = abort;
- cpuc->lbr_entries[i].reserved = 0;
- }
- cpuc->lbr_stack.nr = i;
-}
-
-void intel_pmu_lbr_read(void)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-
- if (!cpuc->lbr_users)
- return;
-
- if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
- intel_pmu_lbr_read_32(cpuc);
- else
- intel_pmu_lbr_read_64(cpuc);
-
- intel_pmu_lbr_filter(cpuc);
-}
-
-/*
- * SW filter is used:
- * - in case there is no HW filter
- * - in case the HW filter has errata or limitations
- */
-static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
-{
- u64 br_type = event->attr.branch_sample_type;
- int mask = 0;
-
- if (br_type & PERF_SAMPLE_BRANCH_USER)
- mask |= X86_BR_USER;
-
- if (br_type & PERF_SAMPLE_BRANCH_KERNEL)
- mask |= X86_BR_KERNEL;
-
- /* we ignore BRANCH_HV here */
-
- if (br_type & PERF_SAMPLE_BRANCH_ANY)
- mask |= X86_BR_ANY;
-
- if (br_type & PERF_SAMPLE_BRANCH_ANY_CALL)
- mask |= X86_BR_ANY_CALL;
-
- if (br_type & PERF_SAMPLE_BRANCH_ANY_RETURN)
- mask |= X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET;
-
- if (br_type & PERF_SAMPLE_BRANCH_IND_CALL)
- mask |= X86_BR_IND_CALL;
-
- if (br_type & PERF_SAMPLE_BRANCH_ABORT_TX)
- mask |= X86_BR_ABORT;
-
- if (br_type & PERF_SAMPLE_BRANCH_IN_TX)
- mask |= X86_BR_IN_TX;
-
- if (br_type & PERF_SAMPLE_BRANCH_NO_TX)
- mask |= X86_BR_NO_TX;
-
- /*
- * stash actual user request into reg, it may
- * be used by fixup code for some CPU
- */
- event->hw.branch_reg.reg = mask;
-}
-
-/*
- * setup the HW LBR filter
- * Used only when available, may not be enough to disambiguate
- * all branches, may need the help of the SW filter
- */
-static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
-{
- struct hw_perf_event_extra *reg;
- u64 br_type = event->attr.branch_sample_type;
- u64 mask = 0, m;
- u64 v;
-
- for_each_branch_sample_type(m) {
- if (!(br_type & m))
- continue;
-
- v = x86_pmu.lbr_sel_map[m];
- if (v == LBR_NOT_SUPP)
- return -EOPNOTSUPP;
-
- if (v != LBR_IGN)
- mask |= v;
- }
- reg = &event->hw.branch_reg;
- reg->idx = EXTRA_REG_LBR;
-
- /* LBR_SELECT operates in suppress mode so invert mask */
- reg->config = ~mask & x86_pmu.lbr_sel_mask;
-
- return 0;
-}
-
-int intel_pmu_setup_lbr_filter(struct perf_event *event)
-{
- int ret = 0;
-
- /*
- * no LBR on this PMU
- */
- if (!x86_pmu.lbr_nr)
- return -EOPNOTSUPP;
-
- /*
- * setup SW LBR filter
- */
- intel_pmu_setup_sw_lbr_filter(event);
-
- /*
- * setup HW LBR filter, if any
- */
- if (x86_pmu.lbr_sel_map)
- ret = intel_pmu_setup_hw_lbr_filter(event);
-
- return ret;
-}
-
-/*
- * return the type of control flow change at address "from"
- * intruction is not necessarily a branch (in case of interrupt).
- *
- * The branch type returned also includes the priv level of the
- * target of the control flow change (X86_BR_USER, X86_BR_KERNEL).
- *
- * If a branch type is unknown OR the instruction cannot be
- * decoded (e.g., text page not present), then X86_BR_NONE is
- * returned.
- */
-static int branch_type(unsigned long from, unsigned long to, int abort)
-{
- struct insn insn;
- void *addr;
- int bytes, size = MAX_INSN_SIZE;
- int ret = X86_BR_NONE;
- int ext, to_plm, from_plm;
- u8 buf[MAX_INSN_SIZE];
- int is64 = 0;
-
- to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER;
- from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER;
-
- /*
- * maybe zero if lbr did not fill up after a reset by the time
- * we get a PMU interrupt
- */
- if (from == 0 || to == 0)
- return X86_BR_NONE;
-
- if (abort)
- return X86_BR_ABORT | to_plm;
-
- if (from_plm == X86_BR_USER) {
- /*
- * can happen if measuring at the user level only
- * and we interrupt in a kernel thread, e.g., idle.
- */
- if (!current->mm)
- return X86_BR_NONE;
-
- /* may fail if text not present */
- bytes = copy_from_user_nmi(buf, (void __user *)from, size);
- if (bytes != size)
- return X86_BR_NONE;
-
- addr = buf;
- } else {
- /*
- * The LBR logs any address in the IP, even if the IP just
- * faulted. This means userspace can control the from address.
- * Ensure we don't blindy read any address by validating it is
- * a known text address.
- */
- if (kernel_text_address(from))
- addr = (void *)from;
- else
- return X86_BR_NONE;
- }
-
- /*
- * decoder needs to know the ABI especially
- * on 64-bit systems running 32-bit apps
- */
-#ifdef CONFIG_X86_64
- is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32);
-#endif
- insn_init(&insn, addr, is64);
- insn_get_opcode(&insn);
-
- switch (insn.opcode.bytes[0]) {
- case 0xf:
- switch (insn.opcode.bytes[1]) {
- case 0x05: /* syscall */
- case 0x34: /* sysenter */
- ret = X86_BR_SYSCALL;
- break;
- case 0x07: /* sysret */
- case 0x35: /* sysexit */
- ret = X86_BR_SYSRET;
- break;
- case 0x80 ... 0x8f: /* conditional */
- ret = X86_BR_JCC;
- break;
- default:
- ret = X86_BR_NONE;
- }
- break;
- case 0x70 ... 0x7f: /* conditional */
- ret = X86_BR_JCC;
- break;
- case 0xc2: /* near ret */
- case 0xc3: /* near ret */
- case 0xca: /* far ret */
- case 0xcb: /* far ret */
- ret = X86_BR_RET;
- break;
- case 0xcf: /* iret */
- ret = X86_BR_IRET;
- break;
- case 0xcc ... 0xce: /* int */
- ret = X86_BR_INT;
- break;
- case 0xe8: /* call near rel */
- case 0x9a: /* call far absolute */
- ret = X86_BR_CALL;
- break;
- case 0xe0 ... 0xe3: /* loop jmp */
- ret = X86_BR_JCC;
- break;
- case 0xe9 ... 0xeb: /* jmp */
- ret = X86_BR_JMP;
- break;
- case 0xff: /* call near absolute, call far absolute ind */
- insn_get_modrm(&insn);
- ext = (insn.modrm.bytes[0] >> 3) & 0x7;
- switch (ext) {
- case 2: /* near ind call */
- case 3: /* far ind call */
- ret = X86_BR_IND_CALL;
- break;
- case 4:
- case 5:
- ret = X86_BR_JMP;
- break;
- }
- break;
- default:
- ret = X86_BR_NONE;
- }
- /*
- * interrupts, traps, faults (and thus ring transition) may
- * occur on any instructions. Thus, to classify them correctly,
- * we need to first look at the from and to priv levels. If they
- * are different and to is in the kernel, then it indicates
- * a ring transition. If the from instruction is not a ring
- * transition instr (syscall, systenter, int), then it means
- * it was a irq, trap or fault.
- *
- * we have no way of detecting kernel to kernel faults.
- */
- if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL
- && ret != X86_BR_SYSCALL && ret != X86_BR_INT)
- ret = X86_BR_IRQ;
-
- /*
- * branch priv level determined by target as
- * is done by HW when LBR_SELECT is implemented
- */
- if (ret != X86_BR_NONE)
- ret |= to_plm;
-
- return ret;
-}
-
-/*
- * implement actual branch filter based on user demand.
- * Hardware may not exactly satisfy that request, thus
- * we need to inspect opcodes. Mismatched branches are
- * discarded. Therefore, the number of branches returned
- * in PERF_SAMPLE_BRANCH_STACK sample may vary.
- */
-static void
-intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
-{
- u64 from, to;
- int br_sel = cpuc->br_sel;
- int i, j, type;
- bool compress = false;
-
- /* if sampling all branches, then nothing to filter */
- if ((br_sel & X86_BR_ALL) == X86_BR_ALL)
- return;
-
- for (i = 0; i < cpuc->lbr_stack.nr; i++) {
-
- from = cpuc->lbr_entries[i].from;
- to = cpuc->lbr_entries[i].to;
-
- type = branch_type(from, to, cpuc->lbr_entries[i].abort);
- if (type != X86_BR_NONE && (br_sel & X86_BR_ANYTX)) {
- if (cpuc->lbr_entries[i].in_tx)
- type |= X86_BR_IN_TX;
- else
- type |= X86_BR_NO_TX;
- }
-
- /* if type does not correspond, then discard */
- if (type == X86_BR_NONE || (br_sel & type) != type) {
- cpuc->lbr_entries[i].from = 0;
- compress = true;
- }
- }
-
- if (!compress)
- return;
-
- /* remove all entries with from=0 */
- for (i = 0; i < cpuc->lbr_stack.nr; ) {
- if (!cpuc->lbr_entries[i].from) {
- j = i;
- while (++j < cpuc->lbr_stack.nr)
- cpuc->lbr_entries[j-1] = cpuc->lbr_entries[j];
- cpuc->lbr_stack.nr--;
- if (!cpuc->lbr_entries[i].from)
- continue;
- }
- i++;
- }
-}
-
-/*
- * Map interface branch filters onto LBR filters
- */
-static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
- [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY,
- [PERF_SAMPLE_BRANCH_USER] = LBR_USER,
- [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL,
- [PERF_SAMPLE_BRANCH_HV] = LBR_IGN,
- [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_REL_JMP
- | LBR_IND_JMP | LBR_FAR,
- /*
- * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches
- */
- [PERF_SAMPLE_BRANCH_ANY_CALL] =
- LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR,
- /*
- * NHM/WSM erratum: must include IND_JMP to capture IND_CALL
- */
- [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL | LBR_IND_JMP,
-};
-
-static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
- [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY,
- [PERF_SAMPLE_BRANCH_USER] = LBR_USER,
- [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL,
- [PERF_SAMPLE_BRANCH_HV] = LBR_IGN,
- [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_FAR,
- [PERF_SAMPLE_BRANCH_ANY_CALL] = LBR_REL_CALL | LBR_IND_CALL
- | LBR_FAR,
- [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL,
-};
-
-/* core */
-void intel_pmu_lbr_init_core(void)
-{
- x86_pmu.lbr_nr = 4;
- x86_pmu.lbr_tos = MSR_LBR_TOS;
- x86_pmu.lbr_from = MSR_LBR_CORE_FROM;
- x86_pmu.lbr_to = MSR_LBR_CORE_TO;
-
- /*
- * SW branch filter usage:
- * - compensate for lack of HW filter
- */
- pr_cont("4-deep LBR, ");
-}
-
-/* nehalem/westmere */
-void intel_pmu_lbr_init_nhm(void)
-{
- x86_pmu.lbr_nr = 16;
- x86_pmu.lbr_tos = MSR_LBR_TOS;
- x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
- x86_pmu.lbr_to = MSR_LBR_NHM_TO;
-
- x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
- x86_pmu.lbr_sel_map = nhm_lbr_sel_map;
-
- /*
- * SW branch filter usage:
- * - workaround LBR_SEL errata (see above)
- * - support syscall, sysret capture.
- * That requires LBR_FAR but that means far
- * jmp need to be filtered out
- */
- pr_cont("16-deep LBR, ");
-}
-
-/* sandy bridge */
-void intel_pmu_lbr_init_snb(void)
-{
- x86_pmu.lbr_nr = 16;
- x86_pmu.lbr_tos = MSR_LBR_TOS;
- x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
- x86_pmu.lbr_to = MSR_LBR_NHM_TO;
-
- x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
- x86_pmu.lbr_sel_map = snb_lbr_sel_map;
-
- /*
- * SW branch filter usage:
- * - support syscall, sysret capture.
- * That requires LBR_FAR but that means far
- * jmp need to be filtered out
- */
- pr_cont("16-deep LBR, ");
-}
-
-/* atom */
-void intel_pmu_lbr_init_atom(void)
-{
- /*
- * only models starting at stepping 10 seems
- * to have an operational LBR which can freeze
- * on PMU interrupt
- */
- if (boot_cpu_data.x86_model == 28
- && boot_cpu_data.x86_mask < 10) {
- pr_cont("LBR disabled due to erratum");
- return;
- }
-
- x86_pmu.lbr_nr = 8;
- x86_pmu.lbr_tos = MSR_LBR_TOS;
- x86_pmu.lbr_from = MSR_LBR_CORE_FROM;
- x86_pmu.lbr_to = MSR_LBR_CORE_TO;
-
- /*
- * SW branch filter usage:
- * - compensate for lack of HW filter
- */
- pr_cont("8-deep LBR, ");
-}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
deleted file mode 100644
index 9dd99751ccf9..000000000000
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ /dev/null
@@ -1,3658 +0,0 @@
-#include "perf_event_intel_uncore.h"
-
-static struct intel_uncore_type *empty_uncore[] = { NULL, };
-static struct intel_uncore_type **msr_uncores = empty_uncore;
-static struct intel_uncore_type **pci_uncores = empty_uncore;
-/* pci bus to socket mapping */
-static int pcibus_to_physid[256] = { [0 ... 255] = -1, };
-
-static DEFINE_RAW_SPINLOCK(uncore_box_lock);
-
-/* mask of cpus that collect uncore events */
-static cpumask_t uncore_cpu_mask;
-
-/* constraint for the fixed counter */
-static struct event_constraint constraint_fixed =
- EVENT_CONSTRAINT(~0ULL, 1 << UNCORE_PMC_IDX_FIXED, ~0ULL);
-static struct event_constraint constraint_empty =
- EVENT_CONSTRAINT(0, 0, 0);
-
-#define __BITS_VALUE(x, i, n) ((typeof(x))(((x) >> ((i) * (n))) & \
- ((1ULL << (n)) - 1)))
-
-DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
-DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21");
-DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
-DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
-DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19");
-DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
-DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28");
-DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31");
-DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31");
-DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28");
-DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15");
-DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30");
-DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51");
-DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4");
-DEFINE_UNCORE_FORMAT_ATTR(filter_link, filter_link, "config1:5-8");
-DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17");
-DEFINE_UNCORE_FORMAT_ATTR(filter_nid2, filter_nid, "config1:32-47");
-DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22");
-DEFINE_UNCORE_FORMAT_ATTR(filter_state2, filter_state, "config1:17-22");
-DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31");
-DEFINE_UNCORE_FORMAT_ATTR(filter_opc2, filter_opc, "config1:52-60");
-DEFINE_UNCORE_FORMAT_ATTR(filter_band0, filter_band0, "config1:0-7");
-DEFINE_UNCORE_FORMAT_ATTR(filter_band1, filter_band1, "config1:8-15");
-DEFINE_UNCORE_FORMAT_ATTR(filter_band2, filter_band2, "config1:16-23");
-DEFINE_UNCORE_FORMAT_ATTR(filter_band3, filter_band3, "config1:24-31");
-
-static u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event)
-{
- u64 count;
-
- rdmsrl(event->hw.event_base, count);
-
- return count;
-}
-
-/*
- * generic get constraint function for shared match/mask registers.
- */
-static struct event_constraint *
-uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct intel_uncore_extra_reg *er;
- struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
- struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
- unsigned long flags;
- bool ok = false;
-
- /*
- * reg->alloc can be set due to existing state, so for fake box we
- * need to ignore this, otherwise we might fail to allocate proper
- * fake state for this extra reg constraint.
- */
- if (reg1->idx == EXTRA_REG_NONE ||
- (!uncore_box_is_fake(box) && reg1->alloc))
- return NULL;
-
- er = &box->shared_regs[reg1->idx];
- raw_spin_lock_irqsave(&er->lock, flags);
- if (!atomic_read(&er->ref) ||
- (er->config1 == reg1->config && er->config2 == reg2->config)) {
- atomic_inc(&er->ref);
- er->config1 = reg1->config;
- er->config2 = reg2->config;
- ok = true;
- }
- raw_spin_unlock_irqrestore(&er->lock, flags);
-
- if (ok) {
- if (!uncore_box_is_fake(box))
- reg1->alloc = 1;
- return NULL;
- }
-
- return &constraint_empty;
-}
-
-static void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct intel_uncore_extra_reg *er;
- struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-
- /*
- * Only put constraint if extra reg was actually allocated. Also
- * takes care of event which do not use an extra shared reg.
- *
- * Also, if this is a fake box we shouldn't touch any event state
- * (reg->alloc) and we don't care about leaving inconsistent box
- * state either since it will be thrown out.
- */
- if (uncore_box_is_fake(box) || !reg1->alloc)
- return;
-
- er = &box->shared_regs[reg1->idx];
- atomic_dec(&er->ref);
- reg1->alloc = 0;
-}
-
-static u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx)
-{
- struct intel_uncore_extra_reg *er;
- unsigned long flags;
- u64 config;
-
- er = &box->shared_regs[idx];
-
- raw_spin_lock_irqsave(&er->lock, flags);
- config = er->config;
- raw_spin_unlock_irqrestore(&er->lock, flags);
-
- return config;
-}
-
-/* Sandy Bridge-EP uncore support */
-static struct intel_uncore_type snbep_uncore_cbox;
-static struct intel_uncore_type snbep_uncore_pcu;
-
-static void snbep_uncore_pci_disable_box(struct intel_uncore_box *box)
-{
- struct pci_dev *pdev = box->pci_dev;
- int box_ctl = uncore_pci_box_ctl(box);
- u32 config = 0;
-
- if (!pci_read_config_dword(pdev, box_ctl, &config)) {
- config |= SNBEP_PMON_BOX_CTL_FRZ;
- pci_write_config_dword(pdev, box_ctl, config);
- }
-}
-
-static void snbep_uncore_pci_enable_box(struct intel_uncore_box *box)
-{
- struct pci_dev *pdev = box->pci_dev;
- int box_ctl = uncore_pci_box_ctl(box);
- u32 config = 0;
-
- if (!pci_read_config_dword(pdev, box_ctl, &config)) {
- config &= ~SNBEP_PMON_BOX_CTL_FRZ;
- pci_write_config_dword(pdev, box_ctl, config);
- }
-}
-
-static void snbep_uncore_pci_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct pci_dev *pdev = box->pci_dev;
- struct hw_perf_event *hwc = &event->hw;
-
- pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
-}
-
-static void snbep_uncore_pci_disable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct pci_dev *pdev = box->pci_dev;
- struct hw_perf_event *hwc = &event->hw;
-
- pci_write_config_dword(pdev, hwc->config_base, hwc->config);
-}
-
-static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct pci_dev *pdev = box->pci_dev;
- struct hw_perf_event *hwc = &event->hw;
- u64 count = 0;
-
- pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count);
- pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1);
-
- return count;
-}
-
-static void snbep_uncore_pci_init_box(struct intel_uncore_box *box)
-{
- struct pci_dev *pdev = box->pci_dev;
-
- pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, SNBEP_PMON_BOX_CTL_INT);
-}
-
-static void snbep_uncore_msr_disable_box(struct intel_uncore_box *box)
-{
- u64 config;
- unsigned msr;
-
- msr = uncore_msr_box_ctl(box);
- if (msr) {
- rdmsrl(msr, config);
- config |= SNBEP_PMON_BOX_CTL_FRZ;
- wrmsrl(msr, config);
- }
-}
-
-static void snbep_uncore_msr_enable_box(struct intel_uncore_box *box)
-{
- u64 config;
- unsigned msr;
-
- msr = uncore_msr_box_ctl(box);
- if (msr) {
- rdmsrl(msr, config);
- config &= ~SNBEP_PMON_BOX_CTL_FRZ;
- wrmsrl(msr, config);
- }
-}
-
-static void snbep_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-
- if (reg1->idx != EXTRA_REG_NONE)
- wrmsrl(reg1->reg, uncore_shared_reg_config(box, 0));
-
- wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
-}
-
-static void snbep_uncore_msr_disable_event(struct intel_uncore_box *box,
- struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
-
- wrmsrl(hwc->config_base, hwc->config);
-}
-
-static void snbep_uncore_msr_init_box(struct intel_uncore_box *box)
-{
- unsigned msr = uncore_msr_box_ctl(box);
-
- if (msr)
- wrmsrl(msr, SNBEP_PMON_BOX_CTL_INT);
-}
-
-static struct attribute *snbep_uncore_formats_attr[] = {
- &format_attr_event.attr,
- &format_attr_umask.attr,
- &format_attr_edge.attr,
- &format_attr_inv.attr,
- &format_attr_thresh8.attr,
- NULL,
-};
-
-static struct attribute *snbep_uncore_ubox_formats_attr[] = {
- &format_attr_event.attr,
- &format_attr_umask.attr,
- &format_attr_edge.attr,
- &format_attr_inv.attr,
- &format_attr_thresh5.attr,
- NULL,
-};
-
-static struct attribute *snbep_uncore_cbox_formats_attr[] = {
- &format_attr_event.attr,
- &format_attr_umask.attr,
- &format_attr_edge.attr,
- &format_attr_tid_en.attr,
- &format_attr_inv.attr,
- &format_attr_thresh8.attr,
- &format_attr_filter_tid.attr,
- &format_attr_filter_nid.attr,
- &format_attr_filter_state.attr,
- &format_attr_filter_opc.attr,
- NULL,
-};
-
-static struct attribute *snbep_uncore_pcu_formats_attr[] = {
- &format_attr_event.attr,
- &format_attr_occ_sel.attr,
- &format_attr_edge.attr,
- &format_attr_inv.attr,
- &format_attr_thresh5.attr,
- &format_attr_occ_invert.attr,
- &format_attr_occ_edge.attr,
- &format_attr_filter_band0.attr,
- &format_attr_filter_band1.attr,
- &format_attr_filter_band2.attr,
- &format_attr_filter_band3.attr,
- NULL,
-};
-
-static struct attribute *snbep_uncore_qpi_formats_attr[] = {
- &format_attr_event_ext.attr,
- &format_attr_umask.attr,
- &format_attr_edge.attr,
- &format_attr_inv.attr,
- &format_attr_thresh8.attr,
- NULL,
-};
-
-static struct uncore_event_desc snbep_uncore_imc_events[] = {
- INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
- INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"),
- INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"),
- { /* end: all zeroes */ },
-};
-
-static struct uncore_event_desc snbep_uncore_qpi_events[] = {
- INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x14"),
- INTEL_UNCORE_EVENT_DESC(txl_flits_active, "event=0x00,umask=0x06"),
- INTEL_UNCORE_EVENT_DESC(drs_data, "event=0x02,umask=0x08"),
- INTEL_UNCORE_EVENT_DESC(ncb_data, "event=0x03,umask=0x04"),
- { /* end: all zeroes */ },
-};
-
-static struct attribute_group snbep_uncore_format_group = {
- .name = "format",
- .attrs = snbep_uncore_formats_attr,
-};
-
-static struct attribute_group snbep_uncore_ubox_format_group = {
- .name = "format",
- .attrs = snbep_uncore_ubox_formats_attr,
-};
-
-static struct attribute_group snbep_uncore_cbox_format_group = {
- .name = "format",
- .attrs = snbep_uncore_cbox_formats_attr,
-};
-
-static struct attribute_group snbep_uncore_pcu_format_group = {
- .name = "format",
- .attrs = snbep_uncore_pcu_formats_attr,
-};
-
-static struct attribute_group snbep_uncore_qpi_format_group = {
- .name = "format",
- .attrs = snbep_uncore_qpi_formats_attr,
-};
-
-#define SNBEP_UNCORE_MSR_OPS_COMMON_INIT() \
- .init_box = snbep_uncore_msr_init_box, \
- .disable_box = snbep_uncore_msr_disable_box, \
- .enable_box = snbep_uncore_msr_enable_box, \
- .disable_event = snbep_uncore_msr_disable_event, \
- .enable_event = snbep_uncore_msr_enable_event, \
- .read_counter = uncore_msr_read_counter
-
-static struct intel_uncore_ops snbep_uncore_msr_ops = {
- SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
-};
-
-static struct intel_uncore_ops snbep_uncore_pci_ops = {
- .init_box = snbep_uncore_pci_init_box,
- .disable_box = snbep_uncore_pci_disable_box,
- .enable_box = snbep_uncore_pci_enable_box,
- .disable_event = snbep_uncore_pci_disable_event,
- .enable_event = snbep_uncore_pci_enable_event,
- .read_counter = snbep_uncore_pci_read_counter,
-};
-
-static struct event_constraint snbep_uncore_cbox_constraints[] = {
- UNCORE_EVENT_CONSTRAINT(0x01, 0x1),
- UNCORE_EVENT_CONSTRAINT(0x02, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x04, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x05, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x07, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x09, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
- UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x13, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x1b, 0xc),
- UNCORE_EVENT_CONSTRAINT(0x1c, 0xc),
- UNCORE_EVENT_CONSTRAINT(0x1d, 0xc),
- UNCORE_EVENT_CONSTRAINT(0x1e, 0xc),
- EVENT_CONSTRAINT_OVERLAP(0x1f, 0xe, 0xff),
- UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x35, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
- UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x3b, 0x1),
- EVENT_CONSTRAINT_END
-};
-
-static struct event_constraint snbep_uncore_r2pcie_constraints[] = {
- UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x12, 0x1),
- UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x24, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
- EVENT_CONSTRAINT_END
-};
-
-static struct event_constraint snbep_uncore_r3qpi_constraints[] = {
- UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
- UNCORE_EVENT_CONSTRAINT(0x20, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x22, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x24, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x2a, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x2b, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x2e, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x2f, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x30, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x36, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
- EVENT_CONSTRAINT_END
-};
-
-static struct intel_uncore_type snbep_uncore_ubox = {
- .name = "ubox",
- .num_counters = 2,
- .num_boxes = 1,
- .perf_ctr_bits = 44,
- .fixed_ctr_bits = 48,
- .perf_ctr = SNBEP_U_MSR_PMON_CTR0,
- .event_ctl = SNBEP_U_MSR_PMON_CTL0,
- .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
- .fixed_ctr = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR,
- .fixed_ctl = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL,
- .ops = &snbep_uncore_msr_ops,
- .format_group = &snbep_uncore_ubox_format_group,
-};
-
-static struct extra_reg snbep_uncore_cbox_extra_regs[] = {
- SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
- SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
- SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
- SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
- SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x6),
- SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0xc),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0xc),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x2),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x2),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x2),
- SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x2),
- SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0xc),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0xc),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x2),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x2),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x2),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x2),
- EVENT_EXTRA_END
-};
-
-static void snbep_cbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
- struct intel_uncore_extra_reg *er = &box->shared_regs[0];
- int i;
-
- if (uncore_box_is_fake(box))
- return;
-
- for (i = 0; i < 5; i++) {
- if (reg1->alloc & (0x1 << i))
- atomic_sub(1 << (i * 6), &er->ref);
- }
- reg1->alloc = 0;
-}
-
-static struct event_constraint *
-__snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event,
- u64 (*cbox_filter_mask)(int fields))
-{
- struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
- struct intel_uncore_extra_reg *er = &box->shared_regs[0];
- int i, alloc = 0;
- unsigned long flags;
- u64 mask;
-
- if (reg1->idx == EXTRA_REG_NONE)
- return NULL;
-
- raw_spin_lock_irqsave(&er->lock, flags);
- for (i = 0; i < 5; i++) {
- if (!(reg1->idx & (0x1 << i)))
- continue;
- if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i)))
- continue;
-
- mask = cbox_filter_mask(0x1 << i);
- if (!__BITS_VALUE(atomic_read(&er->ref), i, 6) ||
- !((reg1->config ^ er->config) & mask)) {
- atomic_add(1 << (i * 6), &er->ref);
- er->config &= ~mask;
- er->config |= reg1->config & mask;
- alloc |= (0x1 << i);
- } else {
- break;
- }
- }
- raw_spin_unlock_irqrestore(&er->lock, flags);
- if (i < 5)
- goto fail;
-
- if (!uncore_box_is_fake(box))
- reg1->alloc |= alloc;
-
- return NULL;
-fail:
- for (; i >= 0; i--) {
- if (alloc & (0x1 << i))
- atomic_sub(1 << (i * 6), &er->ref);
- }
- return &constraint_empty;
-}
-
-static u64 snbep_cbox_filter_mask(int fields)
-{
- u64 mask = 0;
-
- if (fields & 0x1)
- mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_TID;
- if (fields & 0x2)
- mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_NID;
- if (fields & 0x4)
- mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE;
- if (fields & 0x8)
- mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC;
-
- return mask;
-}
-
-static struct event_constraint *
-snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
- return __snbep_cbox_get_constraint(box, event, snbep_cbox_filter_mask);
-}
-
-static int snbep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
- struct extra_reg *er;
- int idx = 0;
-
- for (er = snbep_uncore_cbox_extra_regs; er->msr; er++) {
- if (er->event != (event->hw.config & er->config_mask))
- continue;
- idx |= er->idx;
- }
-
- if (idx) {
- reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER +
- SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
- reg1->config = event->attr.config1 & snbep_cbox_filter_mask(idx);
- reg1->idx = idx;
- }
- return 0;
-}
-
-static struct intel_uncore_ops snbep_uncore_cbox_ops = {
- SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
- .hw_config = snbep_cbox_hw_config,
- .get_constraint = snbep_cbox_get_constraint,
- .put_constraint = snbep_cbox_put_constraint,
-};
-
-static struct intel_uncore_type snbep_uncore_cbox = {
- .name = "cbox",
- .num_counters = 4,
- .num_boxes = 8,
- .perf_ctr_bits = 44,
- .event_ctl = SNBEP_C0_MSR_PMON_CTL0,
- .perf_ctr = SNBEP_C0_MSR_PMON_CTR0,
- .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
- .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL,
- .msr_offset = SNBEP_CBO_MSR_OFFSET,
- .num_shared_regs = 1,
- .constraints = snbep_uncore_cbox_constraints,
- .ops = &snbep_uncore_cbox_ops,
- .format_group = &snbep_uncore_cbox_format_group,
-};
-
-static u64 snbep_pcu_alter_er(struct perf_event *event, int new_idx, bool modify)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
- u64 config = reg1->config;
-
- if (new_idx > reg1->idx)
- config <<= 8 * (new_idx - reg1->idx);
- else
- config >>= 8 * (reg1->idx - new_idx);
-
- if (modify) {
- hwc->config += new_idx - reg1->idx;
- reg1->config = config;
- reg1->idx = new_idx;
- }
- return config;
-}
-
-static struct event_constraint *
-snbep_pcu_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
- struct intel_uncore_extra_reg *er = &box->shared_regs[0];
- unsigned long flags;
- int idx = reg1->idx;
- u64 mask, config1 = reg1->config;
- bool ok = false;
-
- if (reg1->idx == EXTRA_REG_NONE ||
- (!uncore_box_is_fake(box) && reg1->alloc))
- return NULL;
-again:
- mask = 0xffULL << (idx * 8);
- raw_spin_lock_irqsave(&er->lock, flags);
- if (!__BITS_VALUE(atomic_read(&er->ref), idx, 8) ||
- !((config1 ^ er->config) & mask)) {
- atomic_add(1 << (idx * 8), &er->ref);
- er->config &= ~mask;
- er->config |= config1 & mask;
- ok = true;
- }
- raw_spin_unlock_irqrestore(&er->lock, flags);
-
- if (!ok) {
- idx = (idx + 1) % 4;
- if (idx != reg1->idx) {
- config1 = snbep_pcu_alter_er(event, idx, false);
- goto again;
- }
- return &constraint_empty;
- }
-
- if (!uncore_box_is_fake(box)) {
- if (idx != reg1->idx)
- snbep_pcu_alter_er(event, idx, true);
- reg1->alloc = 1;
- }
- return NULL;
-}
-
-static void snbep_pcu_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
- struct intel_uncore_extra_reg *er = &box->shared_regs[0];
-
- if (uncore_box_is_fake(box) || !reg1->alloc)
- return;
-
- atomic_sub(1 << (reg1->idx * 8), &er->ref);
- reg1->alloc = 0;
-}
-
-static int snbep_pcu_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
- int ev_sel = hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK;
-
- if (ev_sel >= 0xb && ev_sel <= 0xe) {
- reg1->reg = SNBEP_PCU_MSR_PMON_BOX_FILTER;
- reg1->idx = ev_sel - 0xb;
- reg1->config = event->attr.config1 & (0xff << reg1->idx);
- }
- return 0;
-}
-
-static struct intel_uncore_ops snbep_uncore_pcu_ops = {
- SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
- .hw_config = snbep_pcu_hw_config,
- .get_constraint = snbep_pcu_get_constraint,
- .put_constraint = snbep_pcu_put_constraint,
-};
-
-static struct intel_uncore_type snbep_uncore_pcu = {
- .name = "pcu",
- .num_counters = 4,
- .num_boxes = 1,
- .perf_ctr_bits = 48,
- .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0,
- .event_ctl = SNBEP_PCU_MSR_PMON_CTL0,
- .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
- .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL,
- .num_shared_regs = 1,
- .ops = &snbep_uncore_pcu_ops,
- .format_group = &snbep_uncore_pcu_format_group,
-};
-
-static struct intel_uncore_type *snbep_msr_uncores[] = {
- &snbep_uncore_ubox,
- &snbep_uncore_cbox,
- &snbep_uncore_pcu,
- NULL,
-};
-
-#define SNBEP_UNCORE_PCI_COMMON_INIT() \
- .perf_ctr = SNBEP_PCI_PMON_CTR0, \
- .event_ctl = SNBEP_PCI_PMON_CTL0, \
- .event_mask = SNBEP_PMON_RAW_EVENT_MASK, \
- .box_ctl = SNBEP_PCI_PMON_BOX_CTL, \
- .ops = &snbep_uncore_pci_ops, \
- .format_group = &snbep_uncore_format_group
-
-static struct intel_uncore_type snbep_uncore_ha = {
- .name = "ha",
- .num_counters = 4,
- .num_boxes = 1,
- .perf_ctr_bits = 48,
- SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct intel_uncore_type snbep_uncore_imc = {
- .name = "imc",
- .num_counters = 4,
- .num_boxes = 4,
- .perf_ctr_bits = 48,
- .fixed_ctr_bits = 48,
- .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
- .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
- .event_descs = snbep_uncore_imc_events,
- SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct intel_uncore_type snbep_uncore_qpi = {
- .name = "qpi",
- .num_counters = 4,
- .num_boxes = 2,
- .perf_ctr_bits = 48,
- .perf_ctr = SNBEP_PCI_PMON_CTR0,
- .event_ctl = SNBEP_PCI_PMON_CTL0,
- .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
- .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
- .ops = &snbep_uncore_pci_ops,
- .event_descs = snbep_uncore_qpi_events,
- .format_group = &snbep_uncore_qpi_format_group,
-};
-
-
-static struct intel_uncore_type snbep_uncore_r2pcie = {
- .name = "r2pcie",
- .num_counters = 4,
- .num_boxes = 1,
- .perf_ctr_bits = 44,
- .constraints = snbep_uncore_r2pcie_constraints,
- SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct intel_uncore_type snbep_uncore_r3qpi = {
- .name = "r3qpi",
- .num_counters = 3,
- .num_boxes = 2,
- .perf_ctr_bits = 44,
- .constraints = snbep_uncore_r3qpi_constraints,
- SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-enum {
- SNBEP_PCI_UNCORE_HA,
- SNBEP_PCI_UNCORE_IMC,
- SNBEP_PCI_UNCORE_QPI,
- SNBEP_PCI_UNCORE_R2PCIE,
- SNBEP_PCI_UNCORE_R3QPI,
-};
-
-static struct intel_uncore_type *snbep_pci_uncores[] = {
- [SNBEP_PCI_UNCORE_HA] = &snbep_uncore_ha,
- [SNBEP_PCI_UNCORE_IMC] = &snbep_uncore_imc,
- [SNBEP_PCI_UNCORE_QPI] = &snbep_uncore_qpi,
- [SNBEP_PCI_UNCORE_R2PCIE] = &snbep_uncore_r2pcie,
- [SNBEP_PCI_UNCORE_R3QPI] = &snbep_uncore_r3qpi,
- NULL,
-};
-
-static DEFINE_PCI_DEVICE_TABLE(snbep_uncore_pci_ids) = {
- { /* Home Agent */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA),
- .driver_data = SNBEP_PCI_UNCORE_HA,
- },
- { /* MC Channel 0 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC0),
- .driver_data = SNBEP_PCI_UNCORE_IMC,
- },
- { /* MC Channel 1 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC1),
- .driver_data = SNBEP_PCI_UNCORE_IMC,
- },
- { /* MC Channel 2 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC2),
- .driver_data = SNBEP_PCI_UNCORE_IMC,
- },
- { /* MC Channel 3 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC3),
- .driver_data = SNBEP_PCI_UNCORE_IMC,
- },
- { /* QPI Port 0 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI0),
- .driver_data = SNBEP_PCI_UNCORE_QPI,
- },
- { /* QPI Port 1 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI1),
- .driver_data = SNBEP_PCI_UNCORE_QPI,
- },
- { /* R2PCIe */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R2PCIE),
- .driver_data = SNBEP_PCI_UNCORE_R2PCIE,
- },
- { /* R3QPI Link 0 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI0),
- .driver_data = SNBEP_PCI_UNCORE_R3QPI,
- },
- { /* R3QPI Link 1 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI1),
- .driver_data = SNBEP_PCI_UNCORE_R3QPI,
- },
- { /* end: all zeroes */ }
-};
-
-static struct pci_driver snbep_uncore_pci_driver = {
- .name = "snbep_uncore",
- .id_table = snbep_uncore_pci_ids,
-};
-
-/*
- * build pci bus to socket mapping
- */
-static int snbep_pci2phy_map_init(int devid)
-{
- struct pci_dev *ubox_dev = NULL;
- int i, bus, nodeid;
- int err = 0;
- u32 config = 0;
-
- while (1) {
- /* find the UBOX device */
- ubox_dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, ubox_dev);
- if (!ubox_dev)
- break;
- bus = ubox_dev->bus->number;
- /* get the Node ID of the local register */
- err = pci_read_config_dword(ubox_dev, 0x40, &config);
- if (err)
- break;
- nodeid = config;
- /* get the Node ID mapping */
- err = pci_read_config_dword(ubox_dev, 0x54, &config);
- if (err)
- break;
- /*
- * every three bits in the Node ID mapping register maps
- * to a particular node.
- */
- for (i = 0; i < 8; i++) {
- if (nodeid == ((config >> (3 * i)) & 0x7)) {
- pcibus_to_physid[bus] = i;
- break;
- }
- }
- }
-
- if (ubox_dev)
- pci_dev_put(ubox_dev);
-
- return err ? pcibios_err_to_errno(err) : 0;
-}
-/* end of Sandy Bridge-EP uncore support */
-
-/* IvyTown uncore support */
-static void ivt_uncore_msr_init_box(struct intel_uncore_box *box)
-{
- unsigned msr = uncore_msr_box_ctl(box);
- if (msr)
- wrmsrl(msr, IVT_PMON_BOX_CTL_INT);
-}
-
-static void ivt_uncore_pci_init_box(struct intel_uncore_box *box)
-{
- struct pci_dev *pdev = box->pci_dev;
-
- pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, IVT_PMON_BOX_CTL_INT);
-}
-
-#define IVT_UNCORE_MSR_OPS_COMMON_INIT() \
- .init_box = ivt_uncore_msr_init_box, \
- .disable_box = snbep_uncore_msr_disable_box, \
- .enable_box = snbep_uncore_msr_enable_box, \
- .disable_event = snbep_uncore_msr_disable_event, \
- .enable_event = snbep_uncore_msr_enable_event, \
- .read_counter = uncore_msr_read_counter
-
-static struct intel_uncore_ops ivt_uncore_msr_ops = {
- IVT_UNCORE_MSR_OPS_COMMON_INIT(),
-};
-
-static struct intel_uncore_ops ivt_uncore_pci_ops = {
- .init_box = ivt_uncore_pci_init_box,
- .disable_box = snbep_uncore_pci_disable_box,
- .enable_box = snbep_uncore_pci_enable_box,
- .disable_event = snbep_uncore_pci_disable_event,
- .enable_event = snbep_uncore_pci_enable_event,
- .read_counter = snbep_uncore_pci_read_counter,
-};
-
-#define IVT_UNCORE_PCI_COMMON_INIT() \
- .perf_ctr = SNBEP_PCI_PMON_CTR0, \
- .event_ctl = SNBEP_PCI_PMON_CTL0, \
- .event_mask = IVT_PMON_RAW_EVENT_MASK, \
- .box_ctl = SNBEP_PCI_PMON_BOX_CTL, \
- .ops = &ivt_uncore_pci_ops, \
- .format_group = &ivt_uncore_format_group
-
-static struct attribute *ivt_uncore_formats_attr[] = {
- &format_attr_event.attr,
- &format_attr_umask.attr,
- &format_attr_edge.attr,
- &format_attr_inv.attr,
- &format_attr_thresh8.attr,
- NULL,
-};
-
-static struct attribute *ivt_uncore_ubox_formats_attr[] = {
- &format_attr_event.attr,
- &format_attr_umask.attr,
- &format_attr_edge.attr,
- &format_attr_inv.attr,
- &format_attr_thresh5.attr,
- NULL,
-};
-
-static struct attribute *ivt_uncore_cbox_formats_attr[] = {
- &format_attr_event.attr,
- &format_attr_umask.attr,
- &format_attr_edge.attr,
- &format_attr_tid_en.attr,
- &format_attr_thresh8.attr,
- &format_attr_filter_tid.attr,
- &format_attr_filter_link.attr,
- &format_attr_filter_state2.attr,
- &format_attr_filter_nid2.attr,
- &format_attr_filter_opc2.attr,
- NULL,
-};
-
-static struct attribute *ivt_uncore_pcu_formats_attr[] = {
- &format_attr_event_ext.attr,
- &format_attr_occ_sel.attr,
- &format_attr_edge.attr,
- &format_attr_thresh5.attr,
- &format_attr_occ_invert.attr,
- &format_attr_occ_edge.attr,
- &format_attr_filter_band0.attr,
- &format_attr_filter_band1.attr,
- &format_attr_filter_band2.attr,
- &format_attr_filter_band3.attr,
- NULL,
-};
-
-static struct attribute *ivt_uncore_qpi_formats_attr[] = {
- &format_attr_event_ext.attr,
- &format_attr_umask.attr,
- &format_attr_edge.attr,
- &format_attr_thresh8.attr,
- NULL,
-};
-
-static struct attribute_group ivt_uncore_format_group = {
- .name = "format",
- .attrs = ivt_uncore_formats_attr,
-};
-
-static struct attribute_group ivt_uncore_ubox_format_group = {
- .name = "format",
- .attrs = ivt_uncore_ubox_formats_attr,
-};
-
-static struct attribute_group ivt_uncore_cbox_format_group = {
- .name = "format",
- .attrs = ivt_uncore_cbox_formats_attr,
-};
-
-static struct attribute_group ivt_uncore_pcu_format_group = {
- .name = "format",
- .attrs = ivt_uncore_pcu_formats_attr,
-};
-
-static struct attribute_group ivt_uncore_qpi_format_group = {
- .name = "format",
- .attrs = ivt_uncore_qpi_formats_attr,
-};
-
-static struct intel_uncore_type ivt_uncore_ubox = {
- .name = "ubox",
- .num_counters = 2,
- .num_boxes = 1,
- .perf_ctr_bits = 44,
- .fixed_ctr_bits = 48,
- .perf_ctr = SNBEP_U_MSR_PMON_CTR0,
- .event_ctl = SNBEP_U_MSR_PMON_CTL0,
- .event_mask = IVT_U_MSR_PMON_RAW_EVENT_MASK,
- .fixed_ctr = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR,
- .fixed_ctl = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL,
- .ops = &ivt_uncore_msr_ops,
- .format_group = &ivt_uncore_ubox_format_group,
-};
-
-static struct extra_reg ivt_uncore_cbox_extra_regs[] = {
- SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
- SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
- SNBEP_CBO_EVENT_EXTRA_REG(0x1031, 0x10ff, 0x2),
- SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
- SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
- SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0xc),
- SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x10),
- SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10),
- SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10),
- SNBEP_CBO_EVENT_EXTRA_REG(0x2335, 0xffff, 0x10),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0x18),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0x18),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x8135, 0xffff, 0x10),
- SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10),
- SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10),
- SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10),
- SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10),
- SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x5036, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x8136, 0xffff, 0x10),
- SNBEP_CBO_EVENT_EXTRA_REG(0x8336, 0xffff, 0x10),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x8),
- EVENT_EXTRA_END
-};
-
-static u64 ivt_cbox_filter_mask(int fields)
-{
- u64 mask = 0;
-
- if (fields & 0x1)
- mask |= IVT_CB0_MSR_PMON_BOX_FILTER_TID;
- if (fields & 0x2)
- mask |= IVT_CB0_MSR_PMON_BOX_FILTER_LINK;
- if (fields & 0x4)
- mask |= IVT_CB0_MSR_PMON_BOX_FILTER_STATE;
- if (fields & 0x8)
- mask |= IVT_CB0_MSR_PMON_BOX_FILTER_NID;
- if (fields & 0x10)
- mask |= IVT_CB0_MSR_PMON_BOX_FILTER_OPC;
-
- return mask;
-}
-
-static struct event_constraint *
-ivt_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
- return __snbep_cbox_get_constraint(box, event, ivt_cbox_filter_mask);
-}
-
-static int ivt_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
- struct extra_reg *er;
- int idx = 0;
-
- for (er = ivt_uncore_cbox_extra_regs; er->msr; er++) {
- if (er->event != (event->hw.config & er->config_mask))
- continue;
- idx |= er->idx;
- }
-
- if (idx) {
- reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER +
- SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
- reg1->config = event->attr.config1 & ivt_cbox_filter_mask(idx);
- reg1->idx = idx;
- }
- return 0;
-}
-
-static void ivt_cbox_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-
- if (reg1->idx != EXTRA_REG_NONE) {
- u64 filter = uncore_shared_reg_config(box, 0);
- wrmsrl(reg1->reg, filter & 0xffffffff);
- wrmsrl(reg1->reg + 6, filter >> 32);
- }
-
- wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
-}
-
-static struct intel_uncore_ops ivt_uncore_cbox_ops = {
- .init_box = ivt_uncore_msr_init_box,
- .disable_box = snbep_uncore_msr_disable_box,
- .enable_box = snbep_uncore_msr_enable_box,
- .disable_event = snbep_uncore_msr_disable_event,
- .enable_event = ivt_cbox_enable_event,
- .read_counter = uncore_msr_read_counter,
- .hw_config = ivt_cbox_hw_config,
- .get_constraint = ivt_cbox_get_constraint,
- .put_constraint = snbep_cbox_put_constraint,
-};
-
-static struct intel_uncore_type ivt_uncore_cbox = {
- .name = "cbox",
- .num_counters = 4,
- .num_boxes = 15,
- .perf_ctr_bits = 44,
- .event_ctl = SNBEP_C0_MSR_PMON_CTL0,
- .perf_ctr = SNBEP_C0_MSR_PMON_CTR0,
- .event_mask = IVT_CBO_MSR_PMON_RAW_EVENT_MASK,
- .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL,
- .msr_offset = SNBEP_CBO_MSR_OFFSET,
- .num_shared_regs = 1,
- .constraints = snbep_uncore_cbox_constraints,
- .ops = &ivt_uncore_cbox_ops,
- .format_group = &ivt_uncore_cbox_format_group,
-};
-
-static struct intel_uncore_ops ivt_uncore_pcu_ops = {
- IVT_UNCORE_MSR_OPS_COMMON_INIT(),
- .hw_config = snbep_pcu_hw_config,
- .get_constraint = snbep_pcu_get_constraint,
- .put_constraint = snbep_pcu_put_constraint,
-};
-
-static struct intel_uncore_type ivt_uncore_pcu = {
- .name = "pcu",
- .num_counters = 4,
- .num_boxes = 1,
- .perf_ctr_bits = 48,
- .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0,
- .event_ctl = SNBEP_PCU_MSR_PMON_CTL0,
- .event_mask = IVT_PCU_MSR_PMON_RAW_EVENT_MASK,
- .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL,
- .num_shared_regs = 1,
- .ops = &ivt_uncore_pcu_ops,
- .format_group = &ivt_uncore_pcu_format_group,
-};
-
-static struct intel_uncore_type *ivt_msr_uncores[] = {
- &ivt_uncore_ubox,
- &ivt_uncore_cbox,
- &ivt_uncore_pcu,
- NULL,
-};
-
-static struct intel_uncore_type ivt_uncore_ha = {
- .name = "ha",
- .num_counters = 4,
- .num_boxes = 2,
- .perf_ctr_bits = 48,
- IVT_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct intel_uncore_type ivt_uncore_imc = {
- .name = "imc",
- .num_counters = 4,
- .num_boxes = 8,
- .perf_ctr_bits = 48,
- .fixed_ctr_bits = 48,
- .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
- .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
- IVT_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct intel_uncore_type ivt_uncore_qpi = {
- .name = "qpi",
- .num_counters = 4,
- .num_boxes = 3,
- .perf_ctr_bits = 48,
- .perf_ctr = SNBEP_PCI_PMON_CTR0,
- .event_ctl = SNBEP_PCI_PMON_CTL0,
- .event_mask = IVT_QPI_PCI_PMON_RAW_EVENT_MASK,
- .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
- .ops = &ivt_uncore_pci_ops,
- .format_group = &ivt_uncore_qpi_format_group,
-};
-
-static struct intel_uncore_type ivt_uncore_r2pcie = {
- .name = "r2pcie",
- .num_counters = 4,
- .num_boxes = 1,
- .perf_ctr_bits = 44,
- .constraints = snbep_uncore_r2pcie_constraints,
- IVT_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct intel_uncore_type ivt_uncore_r3qpi = {
- .name = "r3qpi",
- .num_counters = 3,
- .num_boxes = 2,
- .perf_ctr_bits = 44,
- .constraints = snbep_uncore_r3qpi_constraints,
- IVT_UNCORE_PCI_COMMON_INIT(),
-};
-
-enum {
- IVT_PCI_UNCORE_HA,
- IVT_PCI_UNCORE_IMC,
- IVT_PCI_UNCORE_QPI,
- IVT_PCI_UNCORE_R2PCIE,
- IVT_PCI_UNCORE_R3QPI,
-};
-
-static struct intel_uncore_type *ivt_pci_uncores[] = {
- [IVT_PCI_UNCORE_HA] = &ivt_uncore_ha,
- [IVT_PCI_UNCORE_IMC] = &ivt_uncore_imc,
- [IVT_PCI_UNCORE_QPI] = &ivt_uncore_qpi,
- [IVT_PCI_UNCORE_R2PCIE] = &ivt_uncore_r2pcie,
- [IVT_PCI_UNCORE_R3QPI] = &ivt_uncore_r3qpi,
- NULL,
-};
-
-static DEFINE_PCI_DEVICE_TABLE(ivt_uncore_pci_ids) = {
- { /* Home Agent 0 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe30),
- .driver_data = IVT_PCI_UNCORE_HA,
- },
- { /* Home Agent 1 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe38),
- .driver_data = IVT_PCI_UNCORE_HA,
- },
- { /* MC0 Channel 0 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb4),
- .driver_data = IVT_PCI_UNCORE_IMC,
- },
- { /* MC0 Channel 1 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb5),
- .driver_data = IVT_PCI_UNCORE_IMC,
- },
- { /* MC0 Channel 3 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb0),
- .driver_data = IVT_PCI_UNCORE_IMC,
- },
- { /* MC0 Channel 4 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb1),
- .driver_data = IVT_PCI_UNCORE_IMC,
- },
- { /* MC1 Channel 0 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef4),
- .driver_data = IVT_PCI_UNCORE_IMC,
- },
- { /* MC1 Channel 1 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef5),
- .driver_data = IVT_PCI_UNCORE_IMC,
- },
- { /* MC1 Channel 3 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef0),
- .driver_data = IVT_PCI_UNCORE_IMC,
- },
- { /* MC1 Channel 4 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef1),
- .driver_data = IVT_PCI_UNCORE_IMC,
- },
- { /* QPI0 Port 0 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe32),
- .driver_data = IVT_PCI_UNCORE_QPI,
- },
- { /* QPI0 Port 1 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe33),
- .driver_data = IVT_PCI_UNCORE_QPI,
- },
- { /* QPI1 Port 2 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3a),
- .driver_data = IVT_PCI_UNCORE_QPI,
- },
- { /* R2PCIe */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe34),
- .driver_data = IVT_PCI_UNCORE_R2PCIE,
- },
- { /* R3QPI0 Link 0 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe36),
- .driver_data = IVT_PCI_UNCORE_R3QPI,
- },
- { /* R3QPI0 Link 1 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe37),
- .driver_data = IVT_PCI_UNCORE_R3QPI,
- },
- { /* R3QPI1 Link 2 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3e),
- .driver_data = IVT_PCI_UNCORE_R3QPI,
- },
- { /* end: all zeroes */ }
-};
-
-static struct pci_driver ivt_uncore_pci_driver = {
- .name = "ivt_uncore",
- .id_table = ivt_uncore_pci_ids,
-};
-/* end of IvyTown uncore support */
-
-/* Sandy Bridge uncore support */
-static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
-
- if (hwc->idx < UNCORE_PMC_IDX_FIXED)
- wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN);
- else
- wrmsrl(hwc->config_base, SNB_UNC_CTL_EN);
-}
-
-static void snb_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
- wrmsrl(event->hw.config_base, 0);
-}
-
-static void snb_uncore_msr_init_box(struct intel_uncore_box *box)
-{
- if (box->pmu->pmu_idx == 0) {
- wrmsrl(SNB_UNC_PERF_GLOBAL_CTL,
- SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL);
- }
-}
-
-static struct uncore_event_desc snb_uncore_events[] = {
- INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
- { /* end: all zeroes */ },
-};
-
-static struct attribute *snb_uncore_formats_attr[] = {
- &format_attr_event.attr,
- &format_attr_umask.attr,
- &format_attr_edge.attr,
- &format_attr_inv.attr,
- &format_attr_cmask5.attr,
- NULL,
-};
-
-static struct attribute_group snb_uncore_format_group = {
- .name = "format",
- .attrs = snb_uncore_formats_attr,
-};
-
-static struct intel_uncore_ops snb_uncore_msr_ops = {
- .init_box = snb_uncore_msr_init_box,
- .disable_event = snb_uncore_msr_disable_event,
- .enable_event = snb_uncore_msr_enable_event,
- .read_counter = uncore_msr_read_counter,
-};
-
-static struct event_constraint snb_uncore_cbox_constraints[] = {
- UNCORE_EVENT_CONSTRAINT(0x80, 0x1),
- UNCORE_EVENT_CONSTRAINT(0x83, 0x1),
- EVENT_CONSTRAINT_END
-};
-
-static struct intel_uncore_type snb_uncore_cbox = {
- .name = "cbox",
- .num_counters = 2,
- .num_boxes = 4,
- .perf_ctr_bits = 44,
- .fixed_ctr_bits = 48,
- .perf_ctr = SNB_UNC_CBO_0_PER_CTR0,
- .event_ctl = SNB_UNC_CBO_0_PERFEVTSEL0,
- .fixed_ctr = SNB_UNC_FIXED_CTR,
- .fixed_ctl = SNB_UNC_FIXED_CTR_CTRL,
- .single_fixed = 1,
- .event_mask = SNB_UNC_RAW_EVENT_MASK,
- .msr_offset = SNB_UNC_CBO_MSR_OFFSET,
- .constraints = snb_uncore_cbox_constraints,
- .ops = &snb_uncore_msr_ops,
- .format_group = &snb_uncore_format_group,
- .event_descs = snb_uncore_events,
-};
-
-static struct intel_uncore_type *snb_msr_uncores[] = {
- &snb_uncore_cbox,
- NULL,
-};
-/* end of Sandy Bridge uncore support */
-
-/* Nehalem uncore support */
-static void nhm_uncore_msr_disable_box(struct intel_uncore_box *box)
-{
- wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, 0);
-}
-
-static void nhm_uncore_msr_enable_box(struct intel_uncore_box *box)
-{
- wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, NHM_UNC_GLOBAL_CTL_EN_PC_ALL | NHM_UNC_GLOBAL_CTL_EN_FC);
-}
-
-static void nhm_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
-
- if (hwc->idx < UNCORE_PMC_IDX_FIXED)
- wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN);
- else
- wrmsrl(hwc->config_base, NHM_UNC_FIXED_CTR_CTL_EN);
-}
-
-static struct attribute *nhm_uncore_formats_attr[] = {
- &format_attr_event.attr,
- &format_attr_umask.attr,
- &format_attr_edge.attr,
- &format_attr_inv.attr,
- &format_attr_cmask8.attr,
- NULL,
-};
-
-static struct attribute_group nhm_uncore_format_group = {
- .name = "format",
- .attrs = nhm_uncore_formats_attr,
-};
-
-static struct uncore_event_desc nhm_uncore_events[] = {
- INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
- INTEL_UNCORE_EVENT_DESC(qmc_writes_full_any, "event=0x2f,umask=0x0f"),
- INTEL_UNCORE_EVENT_DESC(qmc_normal_reads_any, "event=0x2c,umask=0x0f"),
- INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_reads, "event=0x20,umask=0x01"),
- INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_writes, "event=0x20,umask=0x02"),
- INTEL_UNCORE_EVENT_DESC(qhl_request_remote_reads, "event=0x20,umask=0x04"),
- INTEL_UNCORE_EVENT_DESC(qhl_request_remote_writes, "event=0x20,umask=0x08"),
- INTEL_UNCORE_EVENT_DESC(qhl_request_local_reads, "event=0x20,umask=0x10"),
- INTEL_UNCORE_EVENT_DESC(qhl_request_local_writes, "event=0x20,umask=0x20"),
- { /* end: all zeroes */ },
-};
-
-static struct intel_uncore_ops nhm_uncore_msr_ops = {
- .disable_box = nhm_uncore_msr_disable_box,
- .enable_box = nhm_uncore_msr_enable_box,
- .disable_event = snb_uncore_msr_disable_event,
- .enable_event = nhm_uncore_msr_enable_event,
- .read_counter = uncore_msr_read_counter,
-};
-
-static struct intel_uncore_type nhm_uncore = {
- .name = "",
- .num_counters = 8,
- .num_boxes = 1,
- .perf_ctr_bits = 48,
- .fixed_ctr_bits = 48,
- .event_ctl = NHM_UNC_PERFEVTSEL0,
- .perf_ctr = NHM_UNC_UNCORE_PMC0,
- .fixed_ctr = NHM_UNC_FIXED_CTR,
- .fixed_ctl = NHM_UNC_FIXED_CTR_CTRL,
- .event_mask = NHM_UNC_RAW_EVENT_MASK,
- .event_descs = nhm_uncore_events,
- .ops = &nhm_uncore_msr_ops,
- .format_group = &nhm_uncore_format_group,
-};
-
-static struct intel_uncore_type *nhm_msr_uncores[] = {
- &nhm_uncore,
- NULL,
-};
-/* end of Nehalem uncore support */
-
-/* Nehalem-EX uncore support */
-DEFINE_UNCORE_FORMAT_ATTR(event5, event, "config:1-5");
-DEFINE_UNCORE_FORMAT_ATTR(counter, counter, "config:6-7");
-DEFINE_UNCORE_FORMAT_ATTR(match, match, "config1:0-63");
-DEFINE_UNCORE_FORMAT_ATTR(mask, mask, "config2:0-63");
-
-static void nhmex_uncore_msr_init_box(struct intel_uncore_box *box)
-{
- wrmsrl(NHMEX_U_MSR_PMON_GLOBAL_CTL, NHMEX_U_PMON_GLOBAL_EN_ALL);
-}
-
-static void nhmex_uncore_msr_disable_box(struct intel_uncore_box *box)
-{
- unsigned msr = uncore_msr_box_ctl(box);
- u64 config;
-
- if (msr) {
- rdmsrl(msr, config);
- config &= ~((1ULL << uncore_num_counters(box)) - 1);
- /* WBox has a fixed counter */
- if (uncore_msr_fixed_ctl(box))
- config &= ~NHMEX_W_PMON_GLOBAL_FIXED_EN;
- wrmsrl(msr, config);
- }
-}
-
-static void nhmex_uncore_msr_enable_box(struct intel_uncore_box *box)
-{
- unsigned msr = uncore_msr_box_ctl(box);
- u64 config;
-
- if (msr) {
- rdmsrl(msr, config);
- config |= (1ULL << uncore_num_counters(box)) - 1;
- /* WBox has a fixed counter */
- if (uncore_msr_fixed_ctl(box))
- config |= NHMEX_W_PMON_GLOBAL_FIXED_EN;
- wrmsrl(msr, config);
- }
-}
-
-static void nhmex_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
- wrmsrl(event->hw.config_base, 0);
-}
-
-static void nhmex_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
-
- if (hwc->idx >= UNCORE_PMC_IDX_FIXED)
- wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0);
- else if (box->pmu->type->event_mask & NHMEX_PMON_CTL_EN_BIT0)
- wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22);
- else
- wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0);
-}
-
-#define NHMEX_UNCORE_OPS_COMMON_INIT() \
- .init_box = nhmex_uncore_msr_init_box, \
- .disable_box = nhmex_uncore_msr_disable_box, \
- .enable_box = nhmex_uncore_msr_enable_box, \
- .disable_event = nhmex_uncore_msr_disable_event, \
- .read_counter = uncore_msr_read_counter
-
-static struct intel_uncore_ops nhmex_uncore_ops = {
- NHMEX_UNCORE_OPS_COMMON_INIT(),
- .enable_event = nhmex_uncore_msr_enable_event,
-};
-
-static struct attribute *nhmex_uncore_ubox_formats_attr[] = {
- &format_attr_event.attr,
- &format_attr_edge.attr,
- NULL,
-};
-
-static struct attribute_group nhmex_uncore_ubox_format_group = {
- .name = "format",
- .attrs = nhmex_uncore_ubox_formats_attr,
-};
-
-static struct intel_uncore_type nhmex_uncore_ubox = {
- .name = "ubox",
- .num_counters = 1,
- .num_boxes = 1,
- .perf_ctr_bits = 48,
- .event_ctl = NHMEX_U_MSR_PMON_EV_SEL,
- .perf_ctr = NHMEX_U_MSR_PMON_CTR,
- .event_mask = NHMEX_U_PMON_RAW_EVENT_MASK,
- .box_ctl = NHMEX_U_MSR_PMON_GLOBAL_CTL,
- .ops = &nhmex_uncore_ops,
- .format_group = &nhmex_uncore_ubox_format_group
-};
-
-static struct attribute *nhmex_uncore_cbox_formats_attr[] = {
- &format_attr_event.attr,
- &format_attr_umask.attr,
- &format_attr_edge.attr,
- &format_attr_inv.attr,
- &format_attr_thresh8.attr,
- NULL,
-};
-
-static struct attribute_group nhmex_uncore_cbox_format_group = {
- .name = "format",
- .attrs = nhmex_uncore_cbox_formats_attr,
-};
-
-/* msr offset for each instance of cbox */
-static unsigned nhmex_cbox_msr_offsets[] = {
- 0x0, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x240, 0x2c0,
-};
-
-static struct intel_uncore_type nhmex_uncore_cbox = {
- .name = "cbox",
- .num_counters = 6,
- .num_boxes = 10,
- .perf_ctr_bits = 48,
- .event_ctl = NHMEX_C0_MSR_PMON_EV_SEL0,
- .perf_ctr = NHMEX_C0_MSR_PMON_CTR0,
- .event_mask = NHMEX_PMON_RAW_EVENT_MASK,
- .box_ctl = NHMEX_C0_MSR_PMON_GLOBAL_CTL,
- .msr_offsets = nhmex_cbox_msr_offsets,
- .pair_ctr_ctl = 1,
- .ops = &nhmex_uncore_ops,
- .format_group = &nhmex_uncore_cbox_format_group
-};
-
-static struct uncore_event_desc nhmex_uncore_wbox_events[] = {
- INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0"),
- { /* end: all zeroes */ },
-};
-
-static struct intel_uncore_type nhmex_uncore_wbox = {
- .name = "wbox",
- .num_counters = 4,
- .num_boxes = 1,
- .perf_ctr_bits = 48,
- .event_ctl = NHMEX_W_MSR_PMON_CNT0,
- .perf_ctr = NHMEX_W_MSR_PMON_EVT_SEL0,
- .fixed_ctr = NHMEX_W_MSR_PMON_FIXED_CTR,
- .fixed_ctl = NHMEX_W_MSR_PMON_FIXED_CTL,
- .event_mask = NHMEX_PMON_RAW_EVENT_MASK,
- .box_ctl = NHMEX_W_MSR_GLOBAL_CTL,
- .pair_ctr_ctl = 1,
- .event_descs = nhmex_uncore_wbox_events,
- .ops = &nhmex_uncore_ops,
- .format_group = &nhmex_uncore_cbox_format_group
-};
-
-static int nhmex_bbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
- struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
- int ctr, ev_sel;
-
- ctr = (hwc->config & NHMEX_B_PMON_CTR_MASK) >>
- NHMEX_B_PMON_CTR_SHIFT;
- ev_sel = (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK) >>
- NHMEX_B_PMON_CTL_EV_SEL_SHIFT;
-
- /* events that do not use the match/mask registers */
- if ((ctr == 0 && ev_sel > 0x3) || (ctr == 1 && ev_sel > 0x6) ||
- (ctr == 2 && ev_sel != 0x4) || ctr == 3)
- return 0;
-
- if (box->pmu->pmu_idx == 0)
- reg1->reg = NHMEX_B0_MSR_MATCH;
- else
- reg1->reg = NHMEX_B1_MSR_MATCH;
- reg1->idx = 0;
- reg1->config = event->attr.config1;
- reg2->config = event->attr.config2;
- return 0;
-}
-
-static void nhmex_bbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
- struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-
- if (reg1->idx != EXTRA_REG_NONE) {
- wrmsrl(reg1->reg, reg1->config);
- wrmsrl(reg1->reg + 1, reg2->config);
- }
- wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 |
- (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK));
-}
-
-/*
- * The Bbox has 4 counters, but each counter monitors different events.
- * Use bits 6-7 in the event config to select counter.
- */
-static struct event_constraint nhmex_uncore_bbox_constraints[] = {
- EVENT_CONSTRAINT(0 , 1, 0xc0),
- EVENT_CONSTRAINT(0x40, 2, 0xc0),
- EVENT_CONSTRAINT(0x80, 4, 0xc0),
- EVENT_CONSTRAINT(0xc0, 8, 0xc0),
- EVENT_CONSTRAINT_END,
-};
-
-static struct attribute *nhmex_uncore_bbox_formats_attr[] = {
- &format_attr_event5.attr,
- &format_attr_counter.attr,
- &format_attr_match.attr,
- &format_attr_mask.attr,
- NULL,
-};
-
-static struct attribute_group nhmex_uncore_bbox_format_group = {
- .name = "format",
- .attrs = nhmex_uncore_bbox_formats_attr,
-};
-
-static struct intel_uncore_ops nhmex_uncore_bbox_ops = {
- NHMEX_UNCORE_OPS_COMMON_INIT(),
- .enable_event = nhmex_bbox_msr_enable_event,
- .hw_config = nhmex_bbox_hw_config,
- .get_constraint = uncore_get_constraint,
- .put_constraint = uncore_put_constraint,
-};
-
-static struct intel_uncore_type nhmex_uncore_bbox = {
- .name = "bbox",
- .num_counters = 4,
- .num_boxes = 2,
- .perf_ctr_bits = 48,
- .event_ctl = NHMEX_B0_MSR_PMON_CTL0,
- .perf_ctr = NHMEX_B0_MSR_PMON_CTR0,
- .event_mask = NHMEX_B_PMON_RAW_EVENT_MASK,
- .box_ctl = NHMEX_B0_MSR_PMON_GLOBAL_CTL,
- .msr_offset = NHMEX_B_MSR_OFFSET,
- .pair_ctr_ctl = 1,
- .num_shared_regs = 1,
- .constraints = nhmex_uncore_bbox_constraints,
- .ops = &nhmex_uncore_bbox_ops,
- .format_group = &nhmex_uncore_bbox_format_group
-};
-
-static int nhmex_sbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
- struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-
- /* only TO_R_PROG_EV event uses the match/mask register */
- if ((hwc->config & NHMEX_PMON_CTL_EV_SEL_MASK) !=
- NHMEX_S_EVENT_TO_R_PROG_EV)
- return 0;
-
- if (box->pmu->pmu_idx == 0)
- reg1->reg = NHMEX_S0_MSR_MM_CFG;
- else
- reg1->reg = NHMEX_S1_MSR_MM_CFG;
- reg1->idx = 0;
- reg1->config = event->attr.config1;
- reg2->config = event->attr.config2;
- return 0;
-}
-
-static void nhmex_sbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
- struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-
- if (reg1->idx != EXTRA_REG_NONE) {
- wrmsrl(reg1->reg, 0);
- wrmsrl(reg1->reg + 1, reg1->config);
- wrmsrl(reg1->reg + 2, reg2->config);
- wrmsrl(reg1->reg, NHMEX_S_PMON_MM_CFG_EN);
- }
- wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22);
-}
-
-static struct attribute *nhmex_uncore_sbox_formats_attr[] = {
- &format_attr_event.attr,
- &format_attr_umask.attr,
- &format_attr_edge.attr,
- &format_attr_inv.attr,
- &format_attr_thresh8.attr,
- &format_attr_match.attr,
- &format_attr_mask.attr,
- NULL,
-};
-
-static struct attribute_group nhmex_uncore_sbox_format_group = {
- .name = "format",
- .attrs = nhmex_uncore_sbox_formats_attr,
-};
-
-static struct intel_uncore_ops nhmex_uncore_sbox_ops = {
- NHMEX_UNCORE_OPS_COMMON_INIT(),
- .enable_event = nhmex_sbox_msr_enable_event,
- .hw_config = nhmex_sbox_hw_config,
- .get_constraint = uncore_get_constraint,
- .put_constraint = uncore_put_constraint,
-};
-
-static struct intel_uncore_type nhmex_uncore_sbox = {
- .name = "sbox",
- .num_counters = 4,
- .num_boxes = 2,
- .perf_ctr_bits = 48,
- .event_ctl = NHMEX_S0_MSR_PMON_CTL0,
- .perf_ctr = NHMEX_S0_MSR_PMON_CTR0,
- .event_mask = NHMEX_PMON_RAW_EVENT_MASK,
- .box_ctl = NHMEX_S0_MSR_PMON_GLOBAL_CTL,
- .msr_offset = NHMEX_S_MSR_OFFSET,
- .pair_ctr_ctl = 1,
- .num_shared_regs = 1,
- .ops = &nhmex_uncore_sbox_ops,
- .format_group = &nhmex_uncore_sbox_format_group
-};
-
-enum {
- EXTRA_REG_NHMEX_M_FILTER,
- EXTRA_REG_NHMEX_M_DSP,
- EXTRA_REG_NHMEX_M_ISS,
- EXTRA_REG_NHMEX_M_MAP,
- EXTRA_REG_NHMEX_M_MSC_THR,
- EXTRA_REG_NHMEX_M_PGT,
- EXTRA_REG_NHMEX_M_PLD,
- EXTRA_REG_NHMEX_M_ZDP_CTL_FVC,
-};
-
-static struct extra_reg nhmex_uncore_mbox_extra_regs[] = {
- MBOX_INC_SEL_EXTAR_REG(0x0, DSP),
- MBOX_INC_SEL_EXTAR_REG(0x4, MSC_THR),
- MBOX_INC_SEL_EXTAR_REG(0x5, MSC_THR),
- MBOX_INC_SEL_EXTAR_REG(0x9, ISS),
- /* event 0xa uses two extra registers */
- MBOX_INC_SEL_EXTAR_REG(0xa, ISS),
- MBOX_INC_SEL_EXTAR_REG(0xa, PLD),
- MBOX_INC_SEL_EXTAR_REG(0xb, PLD),
- /* events 0xd ~ 0x10 use the same extra register */
- MBOX_INC_SEL_EXTAR_REG(0xd, ZDP_CTL_FVC),
- MBOX_INC_SEL_EXTAR_REG(0xe, ZDP_CTL_FVC),
- MBOX_INC_SEL_EXTAR_REG(0xf, ZDP_CTL_FVC),
- MBOX_INC_SEL_EXTAR_REG(0x10, ZDP_CTL_FVC),
- MBOX_INC_SEL_EXTAR_REG(0x16, PGT),
- MBOX_SET_FLAG_SEL_EXTRA_REG(0x0, DSP),
- MBOX_SET_FLAG_SEL_EXTRA_REG(0x1, ISS),
- MBOX_SET_FLAG_SEL_EXTRA_REG(0x5, PGT),
- MBOX_SET_FLAG_SEL_EXTRA_REG(0x6, MAP),
- EVENT_EXTRA_END
-};
-
-/* Nehalem-EX or Westmere-EX ? */
-static bool uncore_nhmex;
-
-static bool nhmex_mbox_get_shared_reg(struct intel_uncore_box *box, int idx, u64 config)
-{
- struct intel_uncore_extra_reg *er;
- unsigned long flags;
- bool ret = false;
- u64 mask;
-
- if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
- er = &box->shared_regs[idx];
- raw_spin_lock_irqsave(&er->lock, flags);
- if (!atomic_read(&er->ref) || er->config == config) {
- atomic_inc(&er->ref);
- er->config = config;
- ret = true;
- }
- raw_spin_unlock_irqrestore(&er->lock, flags);
-
- return ret;
- }
- /*
- * The ZDP_CTL_FVC MSR has 4 fields which are used to control
- * events 0xd ~ 0x10. Besides these 4 fields, there are additional
- * fields which are shared.
- */
- idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
- if (WARN_ON_ONCE(idx >= 4))
- return false;
-
- /* mask of the shared fields */
- if (uncore_nhmex)
- mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK;
- else
- mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK;
- er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
-
- raw_spin_lock_irqsave(&er->lock, flags);
- /* add mask of the non-shared field if it's in use */
- if (__BITS_VALUE(atomic_read(&er->ref), idx, 8)) {
- if (uncore_nhmex)
- mask |= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
- else
- mask |= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
- }
-
- if (!atomic_read(&er->ref) || !((er->config ^ config) & mask)) {
- atomic_add(1 << (idx * 8), &er->ref);
- if (uncore_nhmex)
- mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK |
- NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
- else
- mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK |
- WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
- er->config &= ~mask;
- er->config |= (config & mask);
- ret = true;
- }
- raw_spin_unlock_irqrestore(&er->lock, flags);
-
- return ret;
-}
-
-static void nhmex_mbox_put_shared_reg(struct intel_uncore_box *box, int idx)
-{
- struct intel_uncore_extra_reg *er;
-
- if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
- er = &box->shared_regs[idx];
- atomic_dec(&er->ref);
- return;
- }
-
- idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
- er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
- atomic_sub(1 << (idx * 8), &er->ref);
-}
-
-static u64 nhmex_mbox_alter_er(struct perf_event *event, int new_idx, bool modify)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
- u64 idx, orig_idx = __BITS_VALUE(reg1->idx, 0, 8);
- u64 config = reg1->config;
-
- /* get the non-shared control bits and shift them */
- idx = orig_idx - EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
- if (uncore_nhmex)
- config &= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
- else
- config &= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
- if (new_idx > orig_idx) {
- idx = new_idx - orig_idx;
- config <<= 3 * idx;
- } else {
- idx = orig_idx - new_idx;
- config >>= 3 * idx;
- }
-
- /* add the shared control bits back */
- if (uncore_nhmex)
- config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
- else
- config |= WSMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
- config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
- if (modify) {
- /* adjust the main event selector */
- if (new_idx > orig_idx)
- hwc->config += idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT;
- else
- hwc->config -= idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT;
- reg1->config = config;
- reg1->idx = ~0xff | new_idx;
- }
- return config;
-}
-
-static struct event_constraint *
-nhmex_mbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
- struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
- int i, idx[2], alloc = 0;
- u64 config1 = reg1->config;
-
- idx[0] = __BITS_VALUE(reg1->idx, 0, 8);
- idx[1] = __BITS_VALUE(reg1->idx, 1, 8);
-again:
- for (i = 0; i < 2; i++) {
- if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i)))
- idx[i] = 0xff;
-
- if (idx[i] == 0xff)
- continue;
-
- if (!nhmex_mbox_get_shared_reg(box, idx[i],
- __BITS_VALUE(config1, i, 32)))
- goto fail;
- alloc |= (0x1 << i);
- }
-
- /* for the match/mask registers */
- if (reg2->idx != EXTRA_REG_NONE &&
- (uncore_box_is_fake(box) || !reg2->alloc) &&
- !nhmex_mbox_get_shared_reg(box, reg2->idx, reg2->config))
- goto fail;
-
- /*
- * If it's a fake box -- as per validate_{group,event}() we
- * shouldn't touch event state and we can avoid doing so
- * since both will only call get_event_constraints() once
- * on each event, this avoids the need for reg->alloc.
- */
- if (!uncore_box_is_fake(box)) {
- if (idx[0] != 0xff && idx[0] != __BITS_VALUE(reg1->idx, 0, 8))
- nhmex_mbox_alter_er(event, idx[0], true);
- reg1->alloc |= alloc;
- if (reg2->idx != EXTRA_REG_NONE)
- reg2->alloc = 1;
- }
- return NULL;
-fail:
- if (idx[0] != 0xff && !(alloc & 0x1) &&
- idx[0] >= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
- /*
- * events 0xd ~ 0x10 are functional identical, but are
- * controlled by different fields in the ZDP_CTL_FVC
- * register. If we failed to take one field, try the
- * rest 3 choices.
- */
- BUG_ON(__BITS_VALUE(reg1->idx, 1, 8) != 0xff);
- idx[0] -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
- idx[0] = (idx[0] + 1) % 4;
- idx[0] += EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
- if (idx[0] != __BITS_VALUE(reg1->idx, 0, 8)) {
- config1 = nhmex_mbox_alter_er(event, idx[0], false);
- goto again;
- }
- }
-
- if (alloc & 0x1)
- nhmex_mbox_put_shared_reg(box, idx[0]);
- if (alloc & 0x2)
- nhmex_mbox_put_shared_reg(box, idx[1]);
- return &constraint_empty;
-}
-
-static void nhmex_mbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
- struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
-
- if (uncore_box_is_fake(box))
- return;
-
- if (reg1->alloc & 0x1)
- nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 0, 8));
- if (reg1->alloc & 0x2)
- nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 1, 8));
- reg1->alloc = 0;
-
- if (reg2->alloc) {
- nhmex_mbox_put_shared_reg(box, reg2->idx);
- reg2->alloc = 0;
- }
-}
-
-static int nhmex_mbox_extra_reg_idx(struct extra_reg *er)
-{
- if (er->idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC)
- return er->idx;
- return er->idx + (er->event >> NHMEX_M_PMON_CTL_INC_SEL_SHIFT) - 0xd;
-}
-
-static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct intel_uncore_type *type = box->pmu->type;
- struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
- struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
- struct extra_reg *er;
- unsigned msr;
- int reg_idx = 0;
- /*
- * The mbox events may require 2 extra MSRs at the most. But only
- * the lower 32 bits in these MSRs are significant, so we can use
- * config1 to pass two MSRs' config.
- */
- for (er = nhmex_uncore_mbox_extra_regs; er->msr; er++) {
- if (er->event != (event->hw.config & er->config_mask))
- continue;
- if (event->attr.config1 & ~er->valid_mask)
- return -EINVAL;
-
- msr = er->msr + type->msr_offset * box->pmu->pmu_idx;
- if (WARN_ON_ONCE(msr >= 0xffff || er->idx >= 0xff))
- return -EINVAL;
-
- /* always use the 32~63 bits to pass the PLD config */
- if (er->idx == EXTRA_REG_NHMEX_M_PLD)
- reg_idx = 1;
- else if (WARN_ON_ONCE(reg_idx > 0))
- return -EINVAL;
-
- reg1->idx &= ~(0xff << (reg_idx * 8));
- reg1->reg &= ~(0xffff << (reg_idx * 16));
- reg1->idx |= nhmex_mbox_extra_reg_idx(er) << (reg_idx * 8);
- reg1->reg |= msr << (reg_idx * 16);
- reg1->config = event->attr.config1;
- reg_idx++;
- }
- /*
- * The mbox only provides ability to perform address matching
- * for the PLD events.
- */
- if (reg_idx == 2) {
- reg2->idx = EXTRA_REG_NHMEX_M_FILTER;
- if (event->attr.config2 & NHMEX_M_PMON_MM_CFG_EN)
- reg2->config = event->attr.config2;
- else
- reg2->config = ~0ULL;
- if (box->pmu->pmu_idx == 0)
- reg2->reg = NHMEX_M0_MSR_PMU_MM_CFG;
- else
- reg2->reg = NHMEX_M1_MSR_PMU_MM_CFG;
- }
- return 0;
-}
-
-static u64 nhmex_mbox_shared_reg_config(struct intel_uncore_box *box, int idx)
-{
- struct intel_uncore_extra_reg *er;
- unsigned long flags;
- u64 config;
-
- if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC)
- return box->shared_regs[idx].config;
-
- er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
- raw_spin_lock_irqsave(&er->lock, flags);
- config = er->config;
- raw_spin_unlock_irqrestore(&er->lock, flags);
- return config;
-}
-
-static void nhmex_mbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
- struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
- int idx;
-
- idx = __BITS_VALUE(reg1->idx, 0, 8);
- if (idx != 0xff)
- wrmsrl(__BITS_VALUE(reg1->reg, 0, 16),
- nhmex_mbox_shared_reg_config(box, idx));
- idx = __BITS_VALUE(reg1->idx, 1, 8);
- if (idx != 0xff)
- wrmsrl(__BITS_VALUE(reg1->reg, 1, 16),
- nhmex_mbox_shared_reg_config(box, idx));
-
- if (reg2->idx != EXTRA_REG_NONE) {
- wrmsrl(reg2->reg, 0);
- if (reg2->config != ~0ULL) {
- wrmsrl(reg2->reg + 1,
- reg2->config & NHMEX_M_PMON_ADDR_MATCH_MASK);
- wrmsrl(reg2->reg + 2, NHMEX_M_PMON_ADDR_MASK_MASK &
- (reg2->config >> NHMEX_M_PMON_ADDR_MASK_SHIFT));
- wrmsrl(reg2->reg, NHMEX_M_PMON_MM_CFG_EN);
- }
- }
-
- wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0);
-}
-
-DEFINE_UNCORE_FORMAT_ATTR(count_mode, count_mode, "config:2-3");
-DEFINE_UNCORE_FORMAT_ATTR(storage_mode, storage_mode, "config:4-5");
-DEFINE_UNCORE_FORMAT_ATTR(wrap_mode, wrap_mode, "config:6");
-DEFINE_UNCORE_FORMAT_ATTR(flag_mode, flag_mode, "config:7");
-DEFINE_UNCORE_FORMAT_ATTR(inc_sel, inc_sel, "config:9-13");
-DEFINE_UNCORE_FORMAT_ATTR(set_flag_sel, set_flag_sel, "config:19-21");
-DEFINE_UNCORE_FORMAT_ATTR(filter_cfg_en, filter_cfg_en, "config2:63");
-DEFINE_UNCORE_FORMAT_ATTR(filter_match, filter_match, "config2:0-33");
-DEFINE_UNCORE_FORMAT_ATTR(filter_mask, filter_mask, "config2:34-61");
-DEFINE_UNCORE_FORMAT_ATTR(dsp, dsp, "config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(thr, thr, "config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(fvc, fvc, "config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(pgt, pgt, "config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(map, map, "config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(iss, iss, "config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(pld, pld, "config1:32-63");
-
-static struct attribute *nhmex_uncore_mbox_formats_attr[] = {
- &format_attr_count_mode.attr,
- &format_attr_storage_mode.attr,
- &format_attr_wrap_mode.attr,
- &format_attr_flag_mode.attr,
- &format_attr_inc_sel.attr,
- &format_attr_set_flag_sel.attr,
- &format_attr_filter_cfg_en.attr,
- &format_attr_filter_match.attr,
- &format_attr_filter_mask.attr,
- &format_attr_dsp.attr,
- &format_attr_thr.attr,
- &format_attr_fvc.attr,
- &format_attr_pgt.attr,
- &format_attr_map.attr,
- &format_attr_iss.attr,
- &format_attr_pld.attr,
- NULL,
-};
-
-static struct attribute_group nhmex_uncore_mbox_format_group = {
- .name = "format",
- .attrs = nhmex_uncore_mbox_formats_attr,
-};
-
-static struct uncore_event_desc nhmex_uncore_mbox_events[] = {
- INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x2800"),
- INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x2820"),
- { /* end: all zeroes */ },
-};
-
-static struct uncore_event_desc wsmex_uncore_mbox_events[] = {
- INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x5000"),
- INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x5040"),
- { /* end: all zeroes */ },
-};
-
-static struct intel_uncore_ops nhmex_uncore_mbox_ops = {
- NHMEX_UNCORE_OPS_COMMON_INIT(),
- .enable_event = nhmex_mbox_msr_enable_event,
- .hw_config = nhmex_mbox_hw_config,
- .get_constraint = nhmex_mbox_get_constraint,
- .put_constraint = nhmex_mbox_put_constraint,
-};
-
-static struct intel_uncore_type nhmex_uncore_mbox = {
- .name = "mbox",
- .num_counters = 6,
- .num_boxes = 2,
- .perf_ctr_bits = 48,
- .event_ctl = NHMEX_M0_MSR_PMU_CTL0,
- .perf_ctr = NHMEX_M0_MSR_PMU_CNT0,
- .event_mask = NHMEX_M_PMON_RAW_EVENT_MASK,
- .box_ctl = NHMEX_M0_MSR_GLOBAL_CTL,
- .msr_offset = NHMEX_M_MSR_OFFSET,
- .pair_ctr_ctl = 1,
- .num_shared_regs = 8,
- .event_descs = nhmex_uncore_mbox_events,
- .ops = &nhmex_uncore_mbox_ops,
- .format_group = &nhmex_uncore_mbox_format_group,
-};
-
-static void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-
- /* adjust the main event selector and extra register index */
- if (reg1->idx % 2) {
- reg1->idx--;
- hwc->config -= 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
- } else {
- reg1->idx++;
- hwc->config += 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
- }
-
- /* adjust extra register config */
- switch (reg1->idx % 6) {
- case 2:
- /* shift the 8~15 bits to the 0~7 bits */
- reg1->config >>= 8;
- break;
- case 3:
- /* shift the 0~7 bits to the 8~15 bits */
- reg1->config <<= 8;
- break;
- };
-}
-
-/*
- * Each rbox has 4 event set which monitor PQI port 0~3 or 4~7.
- * An event set consists of 6 events, the 3rd and 4th events in
- * an event set use the same extra register. So an event set uses
- * 5 extra registers.
- */
-static struct event_constraint *
-nhmex_rbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
- struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
- struct intel_uncore_extra_reg *er;
- unsigned long flags;
- int idx, er_idx;
- u64 config1;
- bool ok = false;
-
- if (!uncore_box_is_fake(box) && reg1->alloc)
- return NULL;
-
- idx = reg1->idx % 6;
- config1 = reg1->config;
-again:
- er_idx = idx;
- /* the 3rd and 4th events use the same extra register */
- if (er_idx > 2)
- er_idx--;
- er_idx += (reg1->idx / 6) * 5;
-
- er = &box->shared_regs[er_idx];
- raw_spin_lock_irqsave(&er->lock, flags);
- if (idx < 2) {
- if (!atomic_read(&er->ref) || er->config == reg1->config) {
- atomic_inc(&er->ref);
- er->config = reg1->config;
- ok = true;
- }
- } else if (idx == 2 || idx == 3) {
- /*
- * these two events use different fields in a extra register,
- * the 0~7 bits and the 8~15 bits respectively.
- */
- u64 mask = 0xff << ((idx - 2) * 8);
- if (!__BITS_VALUE(atomic_read(&er->ref), idx - 2, 8) ||
- !((er->config ^ config1) & mask)) {
- atomic_add(1 << ((idx - 2) * 8), &er->ref);
- er->config &= ~mask;
- er->config |= config1 & mask;
- ok = true;
- }
- } else {
- if (!atomic_read(&er->ref) ||
- (er->config == (hwc->config >> 32) &&
- er->config1 == reg1->config &&
- er->config2 == reg2->config)) {
- atomic_inc(&er->ref);
- er->config = (hwc->config >> 32);
- er->config1 = reg1->config;
- er->config2 = reg2->config;
- ok = true;
- }
- }
- raw_spin_unlock_irqrestore(&er->lock, flags);
-
- if (!ok) {
- /*
- * The Rbox events are always in pairs. The paired
- * events are functional identical, but use different
- * extra registers. If we failed to take an extra
- * register, try the alternative.
- */
- if (idx % 2)
- idx--;
- else
- idx++;
- if (idx != reg1->idx % 6) {
- if (idx == 2)
- config1 >>= 8;
- else if (idx == 3)
- config1 <<= 8;
- goto again;
- }
- } else {
- if (!uncore_box_is_fake(box)) {
- if (idx != reg1->idx % 6)
- nhmex_rbox_alter_er(box, event);
- reg1->alloc = 1;
- }
- return NULL;
- }
- return &constraint_empty;
-}
-
-static void nhmex_rbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct intel_uncore_extra_reg *er;
- struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
- int idx, er_idx;
-
- if (uncore_box_is_fake(box) || !reg1->alloc)
- return;
-
- idx = reg1->idx % 6;
- er_idx = idx;
- if (er_idx > 2)
- er_idx--;
- er_idx += (reg1->idx / 6) * 5;
-
- er = &box->shared_regs[er_idx];
- if (idx == 2 || idx == 3)
- atomic_sub(1 << ((idx - 2) * 8), &er->ref);
- else
- atomic_dec(&er->ref);
-
- reg1->alloc = 0;
-}
-
-static int nhmex_rbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
- struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
- int idx;
-
- idx = (event->hw.config & NHMEX_R_PMON_CTL_EV_SEL_MASK) >>
- NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
- if (idx >= 0x18)
- return -EINVAL;
-
- reg1->idx = idx;
- reg1->config = event->attr.config1;
-
- switch (idx % 6) {
- case 4:
- case 5:
- hwc->config |= event->attr.config & (~0ULL << 32);
- reg2->config = event->attr.config2;
- break;
- };
- return 0;
-}
-
-static void nhmex_rbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
- struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
- int idx, port;
-
- idx = reg1->idx;
- port = idx / 6 + box->pmu->pmu_idx * 4;
-
- switch (idx % 6) {
- case 0:
- wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG0(port), reg1->config);
- break;
- case 1:
- wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG1(port), reg1->config);
- break;
- case 2:
- case 3:
- wrmsrl(NHMEX_R_MSR_PORTN_QLX_CFG(port),
- uncore_shared_reg_config(box, 2 + (idx / 6) * 5));
- break;
- case 4:
- wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port),
- hwc->config >> 32);
- wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(port), reg1->config);
- wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MASK(port), reg2->config);
- break;
- case 5:
- wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port),
- hwc->config >> 32);
- wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(port), reg1->config);
- wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MASK(port), reg2->config);
- break;
- };
-
- wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 |
- (hwc->config & NHMEX_R_PMON_CTL_EV_SEL_MASK));
-}
-
-DEFINE_UNCORE_FORMAT_ATTR(xbr_mm_cfg, xbr_mm_cfg, "config:32-63");
-DEFINE_UNCORE_FORMAT_ATTR(xbr_match, xbr_match, "config1:0-63");
-DEFINE_UNCORE_FORMAT_ATTR(xbr_mask, xbr_mask, "config2:0-63");
-DEFINE_UNCORE_FORMAT_ATTR(qlx_cfg, qlx_cfg, "config1:0-15");
-DEFINE_UNCORE_FORMAT_ATTR(iperf_cfg, iperf_cfg, "config1:0-31");
-
-static struct attribute *nhmex_uncore_rbox_formats_attr[] = {
- &format_attr_event5.attr,
- &format_attr_xbr_mm_cfg.attr,
- &format_attr_xbr_match.attr,
- &format_attr_xbr_mask.attr,
- &format_attr_qlx_cfg.attr,
- &format_attr_iperf_cfg.attr,
- NULL,
-};
-
-static struct attribute_group nhmex_uncore_rbox_format_group = {
- .name = "format",
- .attrs = nhmex_uncore_rbox_formats_attr,
-};
-
-static struct uncore_event_desc nhmex_uncore_rbox_events[] = {
- INTEL_UNCORE_EVENT_DESC(qpi0_flit_send, "event=0x0,iperf_cfg=0x80000000"),
- INTEL_UNCORE_EVENT_DESC(qpi1_filt_send, "event=0x6,iperf_cfg=0x80000000"),
- INTEL_UNCORE_EVENT_DESC(qpi0_idle_filt, "event=0x0,iperf_cfg=0x40000000"),
- INTEL_UNCORE_EVENT_DESC(qpi1_idle_filt, "event=0x6,iperf_cfg=0x40000000"),
- INTEL_UNCORE_EVENT_DESC(qpi0_date_response, "event=0x0,iperf_cfg=0xc4"),
- INTEL_UNCORE_EVENT_DESC(qpi1_date_response, "event=0x6,iperf_cfg=0xc4"),
- { /* end: all zeroes */ },
-};
-
-static struct intel_uncore_ops nhmex_uncore_rbox_ops = {
- NHMEX_UNCORE_OPS_COMMON_INIT(),
- .enable_event = nhmex_rbox_msr_enable_event,
- .hw_config = nhmex_rbox_hw_config,
- .get_constraint = nhmex_rbox_get_constraint,
- .put_constraint = nhmex_rbox_put_constraint,
-};
-
-static struct intel_uncore_type nhmex_uncore_rbox = {
- .name = "rbox",
- .num_counters = 8,
- .num_boxes = 2,
- .perf_ctr_bits = 48,
- .event_ctl = NHMEX_R_MSR_PMON_CTL0,
- .perf_ctr = NHMEX_R_MSR_PMON_CNT0,
- .event_mask = NHMEX_R_PMON_RAW_EVENT_MASK,
- .box_ctl = NHMEX_R_MSR_GLOBAL_CTL,
- .msr_offset = NHMEX_R_MSR_OFFSET,
- .pair_ctr_ctl = 1,
- .num_shared_regs = 20,
- .event_descs = nhmex_uncore_rbox_events,
- .ops = &nhmex_uncore_rbox_ops,
- .format_group = &nhmex_uncore_rbox_format_group
-};
-
-static struct intel_uncore_type *nhmex_msr_uncores[] = {
- &nhmex_uncore_ubox,
- &nhmex_uncore_cbox,
- &nhmex_uncore_bbox,
- &nhmex_uncore_sbox,
- &nhmex_uncore_mbox,
- &nhmex_uncore_rbox,
- &nhmex_uncore_wbox,
- NULL,
-};
-/* end of Nehalem-EX uncore support */
-
-static void uncore_assign_hw_event(struct intel_uncore_box *box, struct perf_event *event, int idx)
-{
- struct hw_perf_event *hwc = &event->hw;
-
- hwc->idx = idx;
- hwc->last_tag = ++box->tags[idx];
-
- if (hwc->idx == UNCORE_PMC_IDX_FIXED) {
- hwc->event_base = uncore_fixed_ctr(box);
- hwc->config_base = uncore_fixed_ctl(box);
- return;
- }
-
- hwc->config_base = uncore_event_ctl(box, hwc->idx);
- hwc->event_base = uncore_perf_ctr(box, hwc->idx);
-}
-
-static void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event)
-{
- u64 prev_count, new_count, delta;
- int shift;
-
- if (event->hw.idx >= UNCORE_PMC_IDX_FIXED)
- shift = 64 - uncore_fixed_ctr_bits(box);
- else
- shift = 64 - uncore_perf_ctr_bits(box);
-
- /* the hrtimer might modify the previous event value */
-again:
- prev_count = local64_read(&event->hw.prev_count);
- new_count = uncore_read_counter(box, event);
- if (local64_xchg(&event->hw.prev_count, new_count) != prev_count)
- goto again;
-
- delta = (new_count << shift) - (prev_count << shift);
- delta >>= shift;
-
- local64_add(delta, &event->count);
-}
-
-/*
- * The overflow interrupt is unavailable for SandyBridge-EP, is broken
- * for SandyBridge. So we use hrtimer to periodically poll the counter
- * to avoid overflow.
- */
-static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer)
-{
- struct intel_uncore_box *box;
- unsigned long flags;
- int bit;
-
- box = container_of(hrtimer, struct intel_uncore_box, hrtimer);
- if (!box->n_active || box->cpu != smp_processor_id())
- return HRTIMER_NORESTART;
- /*
- * disable local interrupt to prevent uncore_pmu_event_start/stop
- * to interrupt the update process
- */
- local_irq_save(flags);
-
- for_each_set_bit(bit, box->active_mask, UNCORE_PMC_IDX_MAX)
- uncore_perf_event_update(box, box->events[bit]);
-
- local_irq_restore(flags);
-
- hrtimer_forward_now(hrtimer, ns_to_ktime(UNCORE_PMU_HRTIMER_INTERVAL));
- return HRTIMER_RESTART;
-}
-
-static void uncore_pmu_start_hrtimer(struct intel_uncore_box *box)
-{
- __hrtimer_start_range_ns(&box->hrtimer,
- ns_to_ktime(UNCORE_PMU_HRTIMER_INTERVAL), 0,
- HRTIMER_MODE_REL_PINNED, 0);
-}
-
-static void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box)
-{
- hrtimer_cancel(&box->hrtimer);
-}
-
-static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box)
-{
- hrtimer_init(&box->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- box->hrtimer.function = uncore_pmu_hrtimer;
-}
-
-struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, int cpu)
-{
- struct intel_uncore_box *box;
- int i, size;
-
- size = sizeof(*box) + type->num_shared_regs * sizeof(struct intel_uncore_extra_reg);
-
- box = kmalloc_node(size, GFP_KERNEL | __GFP_ZERO, cpu_to_node(cpu));
- if (!box)
- return NULL;
-
- for (i = 0; i < type->num_shared_regs; i++)
- raw_spin_lock_init(&box->shared_regs[i].lock);
-
- uncore_pmu_init_hrtimer(box);
- atomic_set(&box->refcnt, 1);
- box->cpu = -1;
- box->phys_id = -1;
-
- return box;
-}
-
-static struct intel_uncore_box *
-uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu)
-{
- struct intel_uncore_box *box;
-
- box = *per_cpu_ptr(pmu->box, cpu);
- if (box)
- return box;
-
- raw_spin_lock(&uncore_box_lock);
- list_for_each_entry(box, &pmu->box_list, list) {
- if (box->phys_id == topology_physical_package_id(cpu)) {
- atomic_inc(&box->refcnt);
- *per_cpu_ptr(pmu->box, cpu) = box;
- break;
- }
- }
- raw_spin_unlock(&uncore_box_lock);
-
- return *per_cpu_ptr(pmu->box, cpu);
-}
-
-static struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event)
-{
- return container_of(event->pmu, struct intel_uncore_pmu, pmu);
-}
-
-static struct intel_uncore_box *uncore_event_to_box(struct perf_event *event)
-{
- /*
- * perf core schedules event on the basis of cpu, uncore events are
- * collected by one of the cpus inside a physical package.
- */
- return uncore_pmu_to_box(uncore_event_to_pmu(event), smp_processor_id());
-}
-
-static int
-uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader, bool dogrp)
-{
- struct perf_event *event;
- int n, max_count;
-
- max_count = box->pmu->type->num_counters;
- if (box->pmu->type->fixed_ctl)
- max_count++;
-
- if (box->n_events >= max_count)
- return -EINVAL;
-
- n = box->n_events;
- box->event_list[n] = leader;
- n++;
- if (!dogrp)
- return n;
-
- list_for_each_entry(event, &leader->sibling_list, group_entry) {
- if (event->state <= PERF_EVENT_STATE_OFF)
- continue;
-
- if (n >= max_count)
- return -EINVAL;
-
- box->event_list[n] = event;
- n++;
- }
- return n;
-}
-
-static struct event_constraint *
-uncore_get_event_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct intel_uncore_type *type = box->pmu->type;
- struct event_constraint *c;
-
- if (type->ops->get_constraint) {
- c = type->ops->get_constraint(box, event);
- if (c)
- return c;
- }
-
- if (event->hw.config == ~0ULL)
- return &constraint_fixed;
-
- if (type->constraints) {
- for_each_event_constraint(c, type->constraints) {
- if ((event->hw.config & c->cmask) == c->code)
- return c;
- }
- }
-
- return &type->unconstrainted;
-}
-
-static void uncore_put_event_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
- if (box->pmu->type->ops->put_constraint)
- box->pmu->type->ops->put_constraint(box, event);
-}
-
-static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int n)
-{
- unsigned long used_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)];
- struct event_constraint *c;
- int i, wmin, wmax, ret = 0;
- struct hw_perf_event *hwc;
-
- bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX);
-
- for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) {
- hwc = &box->event_list[i]->hw;
- c = uncore_get_event_constraint(box, box->event_list[i]);
- hwc->constraint = c;
- wmin = min(wmin, c->weight);
- wmax = max(wmax, c->weight);
- }
-
- /* fastpath, try to reuse previous register */
- for (i = 0; i < n; i++) {
- hwc = &box->event_list[i]->hw;
- c = hwc->constraint;
-
- /* never assigned */
- if (hwc->idx == -1)
- break;
-
- /* constraint still honored */
- if (!test_bit(hwc->idx, c->idxmsk))
- break;
-
- /* not already used */
- if (test_bit(hwc->idx, used_mask))
- break;
-
- __set_bit(hwc->idx, used_mask);
- if (assign)
- assign[i] = hwc->idx;
- }
- /* slow path */
- if (i != n)
- ret = perf_assign_events(box->event_list, n,
- wmin, wmax, assign);
-
- if (!assign || ret) {
- for (i = 0; i < n; i++)
- uncore_put_event_constraint(box, box->event_list[i]);
- }
- return ret ? -EINVAL : 0;
-}
-
-static void uncore_pmu_event_start(struct perf_event *event, int flags)
-{
- struct intel_uncore_box *box = uncore_event_to_box(event);
- int idx = event->hw.idx;
-
- if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
- return;
-
- if (WARN_ON_ONCE(idx == -1 || idx >= UNCORE_PMC_IDX_MAX))
- return;
-
- event->hw.state = 0;
- box->events[idx] = event;
- box->n_active++;
- __set_bit(idx, box->active_mask);
-
- local64_set(&event->hw.prev_count, uncore_read_counter(box, event));
- uncore_enable_event(box, event);
-
- if (box->n_active == 1) {
- uncore_enable_box(box);
- uncore_pmu_start_hrtimer(box);
- }
-}
-
-static void uncore_pmu_event_stop(struct perf_event *event, int flags)
-{
- struct intel_uncore_box *box = uncore_event_to_box(event);
- struct hw_perf_event *hwc = &event->hw;
-
- if (__test_and_clear_bit(hwc->idx, box->active_mask)) {
- uncore_disable_event(box, event);
- box->n_active--;
- box->events[hwc->idx] = NULL;
- WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
- hwc->state |= PERF_HES_STOPPED;
-
- if (box->n_active == 0) {
- uncore_disable_box(box);
- uncore_pmu_cancel_hrtimer(box);
- }
- }
-
- if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
- /*
- * Drain the remaining delta count out of a event
- * that we are disabling:
- */
- uncore_perf_event_update(box, event);
- hwc->state |= PERF_HES_UPTODATE;
- }
-}
-
-static int uncore_pmu_event_add(struct perf_event *event, int flags)
-{
- struct intel_uncore_box *box = uncore_event_to_box(event);
- struct hw_perf_event *hwc = &event->hw;
- int assign[UNCORE_PMC_IDX_MAX];
- int i, n, ret;
-
- if (!box)
- return -ENODEV;
-
- ret = n = uncore_collect_events(box, event, false);
- if (ret < 0)
- return ret;
-
- hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
- if (!(flags & PERF_EF_START))
- hwc->state |= PERF_HES_ARCH;
-
- ret = uncore_assign_events(box, assign, n);
- if (ret)
- return ret;
-
- /* save events moving to new counters */
- for (i = 0; i < box->n_events; i++) {
- event = box->event_list[i];
- hwc = &event->hw;
-
- if (hwc->idx == assign[i] &&
- hwc->last_tag == box->tags[assign[i]])
- continue;
- /*
- * Ensure we don't accidentally enable a stopped
- * counter simply because we rescheduled.
- */
- if (hwc->state & PERF_HES_STOPPED)
- hwc->state |= PERF_HES_ARCH;
-
- uncore_pmu_event_stop(event, PERF_EF_UPDATE);
- }
-
- /* reprogram moved events into new counters */
- for (i = 0; i < n; i++) {
- event = box->event_list[i];
- hwc = &event->hw;
-
- if (hwc->idx != assign[i] ||
- hwc->last_tag != box->tags[assign[i]])
- uncore_assign_hw_event(box, event, assign[i]);
- else if (i < box->n_events)
- continue;
-
- if (hwc->state & PERF_HES_ARCH)
- continue;
-
- uncore_pmu_event_start(event, 0);
- }
- box->n_events = n;
-
- return 0;
-}
-
-static void uncore_pmu_event_del(struct perf_event *event, int flags)
-{
- struct intel_uncore_box *box = uncore_event_to_box(event);
- int i;
-
- uncore_pmu_event_stop(event, PERF_EF_UPDATE);
-
- for (i = 0; i < box->n_events; i++) {
- if (event == box->event_list[i]) {
- uncore_put_event_constraint(box, event);
-
- while (++i < box->n_events)
- box->event_list[i - 1] = box->event_list[i];
-
- --box->n_events;
- break;
- }
- }
-
- event->hw.idx = -1;
- event->hw.last_tag = ~0ULL;
-}
-
-static void uncore_pmu_event_read(struct perf_event *event)
-{
- struct intel_uncore_box *box = uncore_event_to_box(event);
- uncore_perf_event_update(box, event);
-}
-
-/*
- * validation ensures the group can be loaded onto the
- * PMU if it was the only group available.
- */
-static int uncore_validate_group(struct intel_uncore_pmu *pmu,
- struct perf_event *event)
-{
- struct perf_event *leader = event->group_leader;
- struct intel_uncore_box *fake_box;
- int ret = -EINVAL, n;
-
- fake_box = uncore_alloc_box(pmu->type, smp_processor_id());
- if (!fake_box)
- return -ENOMEM;
-
- fake_box->pmu = pmu;
- /*
- * the event is not yet connected with its
- * siblings therefore we must first collect
- * existing siblings, then add the new event
- * before we can simulate the scheduling
- */
- n = uncore_collect_events(fake_box, leader, true);
- if (n < 0)
- goto out;
-
- fake_box->n_events = n;
- n = uncore_collect_events(fake_box, event, false);
- if (n < 0)
- goto out;
-
- fake_box->n_events = n;
-
- ret = uncore_assign_events(fake_box, NULL, n);
-out:
- kfree(fake_box);
- return ret;
-}
-
-static int uncore_pmu_event_init(struct perf_event *event)
-{
- struct intel_uncore_pmu *pmu;
- struct intel_uncore_box *box;
- struct hw_perf_event *hwc = &event->hw;
- int ret;
-
- if (event->attr.type != event->pmu->type)
- return -ENOENT;
-
- pmu = uncore_event_to_pmu(event);
- /* no device found for this pmu */
- if (pmu->func_id < 0)
- return -ENOENT;
-
- /*
- * Uncore PMU does measure at all privilege level all the time.
- * So it doesn't make sense to specify any exclude bits.
- */
- if (event->attr.exclude_user || event->attr.exclude_kernel ||
- event->attr.exclude_hv || event->attr.exclude_idle)
- return -EINVAL;
-
- /* Sampling not supported yet */
- if (hwc->sample_period)
- return -EINVAL;
-
- /*
- * Place all uncore events for a particular physical package
- * onto a single cpu
- */
- if (event->cpu < 0)
- return -EINVAL;
- box = uncore_pmu_to_box(pmu, event->cpu);
- if (!box || box->cpu < 0)
- return -EINVAL;
- event->cpu = box->cpu;
-
- event->hw.idx = -1;
- event->hw.last_tag = ~0ULL;
- event->hw.extra_reg.idx = EXTRA_REG_NONE;
- event->hw.branch_reg.idx = EXTRA_REG_NONE;
-
- if (event->attr.config == UNCORE_FIXED_EVENT) {
- /* no fixed counter */
- if (!pmu->type->fixed_ctl)
- return -EINVAL;
- /*
- * if there is only one fixed counter, only the first pmu
- * can access the fixed counter
- */
- if (pmu->type->single_fixed && pmu->pmu_idx > 0)
- return -EINVAL;
- hwc->config = ~0ULL;
- } else {
- hwc->config = event->attr.config & pmu->type->event_mask;
- if (pmu->type->ops->hw_config) {
- ret = pmu->type->ops->hw_config(box, event);
- if (ret)
- return ret;
- }
- }
-
- if (event->group_leader != event)
- ret = uncore_validate_group(pmu, event);
- else
- ret = 0;
-
- return ret;
-}
-
-static ssize_t uncore_get_attr_cpumask(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &uncore_cpu_mask);
-
- buf[n++] = '\n';
- buf[n] = '\0';
- return n;
-}
-
-static DEVICE_ATTR(cpumask, S_IRUGO, uncore_get_attr_cpumask, NULL);
-
-static struct attribute *uncore_pmu_attrs[] = {
- &dev_attr_cpumask.attr,
- NULL,
-};
-
-static struct attribute_group uncore_pmu_attr_group = {
- .attrs = uncore_pmu_attrs,
-};
-
-static int __init uncore_pmu_register(struct intel_uncore_pmu *pmu)
-{
- int ret;
-
- pmu->pmu = (struct pmu) {
- .attr_groups = pmu->type->attr_groups,
- .task_ctx_nr = perf_invalid_context,
- .event_init = uncore_pmu_event_init,
- .add = uncore_pmu_event_add,
- .del = uncore_pmu_event_del,
- .start = uncore_pmu_event_start,
- .stop = uncore_pmu_event_stop,
- .read = uncore_pmu_event_read,
- };
-
- if (pmu->type->num_boxes == 1) {
- if (strlen(pmu->type->name) > 0)
- sprintf(pmu->name, "uncore_%s", pmu->type->name);
- else
- sprintf(pmu->name, "uncore");
- } else {
- sprintf(pmu->name, "uncore_%s_%d", pmu->type->name,
- pmu->pmu_idx);
- }
-
- ret = perf_pmu_register(&pmu->pmu, pmu->name, -1);
- return ret;
-}
-
-static void __init uncore_type_exit(struct intel_uncore_type *type)
-{
- int i;
-
- for (i = 0; i < type->num_boxes; i++)
- free_percpu(type->pmus[i].box);
- kfree(type->pmus);
- type->pmus = NULL;
- kfree(type->events_group);
- type->events_group = NULL;
-}
-
-static void __init uncore_types_exit(struct intel_uncore_type **types)
-{
- int i;
- for (i = 0; types[i]; i++)
- uncore_type_exit(types[i]);
-}
-
-static int __init uncore_type_init(struct intel_uncore_type *type)
-{
- struct intel_uncore_pmu *pmus;
- struct attribute_group *attr_group;
- struct attribute **attrs;
- int i, j;
-
- pmus = kzalloc(sizeof(*pmus) * type->num_boxes, GFP_KERNEL);
- if (!pmus)
- return -ENOMEM;
-
- type->unconstrainted = (struct event_constraint)
- __EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1,
- 0, type->num_counters, 0, 0);
-
- for (i = 0; i < type->num_boxes; i++) {
- pmus[i].func_id = -1;
- pmus[i].pmu_idx = i;
- pmus[i].type = type;
- INIT_LIST_HEAD(&pmus[i].box_list);
- pmus[i].box = alloc_percpu(struct intel_uncore_box *);
- if (!pmus[i].box)
- goto fail;
- }
-
- if (type->event_descs) {
- i = 0;
- while (type->event_descs[i].attr.attr.name)
- i++;
-
- attr_group = kzalloc(sizeof(struct attribute *) * (i + 1) +
- sizeof(*attr_group), GFP_KERNEL);
- if (!attr_group)
- goto fail;
-
- attrs = (struct attribute **)(attr_group + 1);
- attr_group->name = "events";
- attr_group->attrs = attrs;
-
- for (j = 0; j < i; j++)
- attrs[j] = &type->event_descs[j].attr.attr;
-
- type->events_group = attr_group;
- }
-
- type->pmu_group = &uncore_pmu_attr_group;
- type->pmus = pmus;
- return 0;
-fail:
- uncore_type_exit(type);
- return -ENOMEM;
-}
-
-static int __init uncore_types_init(struct intel_uncore_type **types)
-{
- int i, ret;
-
- for (i = 0; types[i]; i++) {
- ret = uncore_type_init(types[i]);
- if (ret)
- goto fail;
- }
- return 0;
-fail:
- while (--i >= 0)
- uncore_type_exit(types[i]);
- return ret;
-}
-
-static struct pci_driver *uncore_pci_driver;
-static bool pcidrv_registered;
-
-/*
- * add a pci uncore device
- */
-static int uncore_pci_add(struct intel_uncore_type *type, struct pci_dev *pdev)
-{
- struct intel_uncore_pmu *pmu;
- struct intel_uncore_box *box;
- int i, phys_id;
-
- phys_id = pcibus_to_physid[pdev->bus->number];
- if (phys_id < 0)
- return -ENODEV;
-
- box = uncore_alloc_box(type, 0);
- if (!box)
- return -ENOMEM;
-
- /*
- * for performance monitoring unit with multiple boxes,
- * each box has a different function id.
- */
- for (i = 0; i < type->num_boxes; i++) {
- pmu = &type->pmus[i];
- if (pmu->func_id == pdev->devfn)
- break;
- if (pmu->func_id < 0) {
- pmu->func_id = pdev->devfn;
- break;
- }
- pmu = NULL;
- }
-
- if (!pmu) {
- kfree(box);
- return -EINVAL;
- }
-
- box->phys_id = phys_id;
- box->pci_dev = pdev;
- box->pmu = pmu;
- uncore_box_init(box);
- pci_set_drvdata(pdev, box);
-
- raw_spin_lock(&uncore_box_lock);
- list_add_tail(&box->list, &pmu->box_list);
- raw_spin_unlock(&uncore_box_lock);
-
- return 0;
-}
-
-static void uncore_pci_remove(struct pci_dev *pdev)
-{
- struct intel_uncore_box *box = pci_get_drvdata(pdev);
- struct intel_uncore_pmu *pmu = box->pmu;
- int cpu, phys_id = pcibus_to_physid[pdev->bus->number];
-
- if (WARN_ON_ONCE(phys_id != box->phys_id))
- return;
-
- pci_set_drvdata(pdev, NULL);
-
- raw_spin_lock(&uncore_box_lock);
- list_del(&box->list);
- raw_spin_unlock(&uncore_box_lock);
-
- for_each_possible_cpu(cpu) {
- if (*per_cpu_ptr(pmu->box, cpu) == box) {
- *per_cpu_ptr(pmu->box, cpu) = NULL;
- atomic_dec(&box->refcnt);
- }
- }
-
- WARN_ON_ONCE(atomic_read(&box->refcnt) != 1);
- kfree(box);
-}
-
-static int uncore_pci_probe(struct pci_dev *pdev,
- const struct pci_device_id *id)
-{
- return uncore_pci_add(pci_uncores[id->driver_data], pdev);
-}
-
-static int __init uncore_pci_init(void)
-{
- int ret;
-
- switch (boot_cpu_data.x86_model) {
- case 45: /* Sandy Bridge-EP */
- ret = snbep_pci2phy_map_init(0x3ce0);
- if (ret)
- return ret;
- pci_uncores = snbep_pci_uncores;
- uncore_pci_driver = &snbep_uncore_pci_driver;
- break;
- case 62: /* IvyTown */
- ret = snbep_pci2phy_map_init(0x0e1e);
- if (ret)
- return ret;
- pci_uncores = ivt_pci_uncores;
- uncore_pci_driver = &ivt_uncore_pci_driver;
- break;
- default:
- return 0;
- }
-
- ret = uncore_types_init(pci_uncores);
- if (ret)
- return ret;
-
- uncore_pci_driver->probe = uncore_pci_probe;
- uncore_pci_driver->remove = uncore_pci_remove;
-
- ret = pci_register_driver(uncore_pci_driver);
- if (ret == 0)
- pcidrv_registered = true;
- else
- uncore_types_exit(pci_uncores);
-
- return ret;
-}
-
-static void __init uncore_pci_exit(void)
-{
- if (pcidrv_registered) {
- pcidrv_registered = false;
- pci_unregister_driver(uncore_pci_driver);
- uncore_types_exit(pci_uncores);
- }
-}
-
-/* CPU hot plug/unplug are serialized by cpu_add_remove_lock mutex */
-static LIST_HEAD(boxes_to_free);
-
-static void __cpuinit uncore_kfree_boxes(void)
-{
- struct intel_uncore_box *box;
-
- while (!list_empty(&boxes_to_free)) {
- box = list_entry(boxes_to_free.next,
- struct intel_uncore_box, list);
- list_del(&box->list);
- kfree(box);
- }
-}
-
-static void __cpuinit uncore_cpu_dying(int cpu)
-{
- struct intel_uncore_type *type;
- struct intel_uncore_pmu *pmu;
- struct intel_uncore_box *box;
- int i, j;
-
- for (i = 0; msr_uncores[i]; i++) {
- type = msr_uncores[i];
- for (j = 0; j < type->num_boxes; j++) {
- pmu = &type->pmus[j];
- box = *per_cpu_ptr(pmu->box, cpu);
- *per_cpu_ptr(pmu->box, cpu) = NULL;
- if (box && atomic_dec_and_test(&box->refcnt))
- list_add(&box->list, &boxes_to_free);
- }
- }
-}
-
-static int __cpuinit uncore_cpu_starting(int cpu)
-{
- struct intel_uncore_type *type;
- struct intel_uncore_pmu *pmu;
- struct intel_uncore_box *box, *exist;
- int i, j, k, phys_id;
-
- phys_id = topology_physical_package_id(cpu);
-
- for (i = 0; msr_uncores[i]; i++) {
- type = msr_uncores[i];
- for (j = 0; j < type->num_boxes; j++) {
- pmu = &type->pmus[j];
- box = *per_cpu_ptr(pmu->box, cpu);
- /* called by uncore_cpu_init? */
- if (box && box->phys_id >= 0) {
- uncore_box_init(box);
- continue;
- }
-
- for_each_online_cpu(k) {
- exist = *per_cpu_ptr(pmu->box, k);
- if (exist && exist->phys_id == phys_id) {
- atomic_inc(&exist->refcnt);
- *per_cpu_ptr(pmu->box, cpu) = exist;
- if (box) {
- list_add(&box->list,
- &boxes_to_free);
- box = NULL;
- }
- break;
- }
- }
-
- if (box) {
- box->phys_id = phys_id;
- uncore_box_init(box);
- }
- }
- }
- return 0;
-}
-
-static int __cpuinit uncore_cpu_prepare(int cpu, int phys_id)
-{
- struct intel_uncore_type *type;
- struct intel_uncore_pmu *pmu;
- struct intel_uncore_box *box;
- int i, j;
-
- for (i = 0; msr_uncores[i]; i++) {
- type = msr_uncores[i];
- for (j = 0; j < type->num_boxes; j++) {
- pmu = &type->pmus[j];
- if (pmu->func_id < 0)
- pmu->func_id = j;
-
- box = uncore_alloc_box(type, cpu);
- if (!box)
- return -ENOMEM;
-
- box->pmu = pmu;
- box->phys_id = phys_id;
- *per_cpu_ptr(pmu->box, cpu) = box;
- }
- }
- return 0;
-}
-
-static void __cpuinit
-uncore_change_context(struct intel_uncore_type **uncores, int old_cpu, int new_cpu)
-{
- struct intel_uncore_type *type;
- struct intel_uncore_pmu *pmu;
- struct intel_uncore_box *box;
- int i, j;
-
- for (i = 0; uncores[i]; i++) {
- type = uncores[i];
- for (j = 0; j < type->num_boxes; j++) {
- pmu = &type->pmus[j];
- if (old_cpu < 0)
- box = uncore_pmu_to_box(pmu, new_cpu);
- else
- box = uncore_pmu_to_box(pmu, old_cpu);
- if (!box)
- continue;
-
- if (old_cpu < 0) {
- WARN_ON_ONCE(box->cpu != -1);
- box->cpu = new_cpu;
- continue;
- }
-
- WARN_ON_ONCE(box->cpu != old_cpu);
- if (new_cpu >= 0) {
- uncore_pmu_cancel_hrtimer(box);
- perf_pmu_migrate_context(&pmu->pmu,
- old_cpu, new_cpu);
- box->cpu = new_cpu;
- } else {
- box->cpu = -1;
- }
- }
- }
-}
-
-static void __cpuinit uncore_event_exit_cpu(int cpu)
-{
- int i, phys_id, target;
-
- /* if exiting cpu is used for collecting uncore events */
- if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask))
- return;
-
- /* find a new cpu to collect uncore events */
- phys_id = topology_physical_package_id(cpu);
- target = -1;
- for_each_online_cpu(i) {
- if (i == cpu)
- continue;
- if (phys_id == topology_physical_package_id(i)) {
- target = i;
- break;
- }
- }
-
- /* migrate uncore events to the new cpu */
- if (target >= 0)
- cpumask_set_cpu(target, &uncore_cpu_mask);
-
- uncore_change_context(msr_uncores, cpu, target);
- uncore_change_context(pci_uncores, cpu, target);
-}
-
-static void __cpuinit uncore_event_init_cpu(int cpu)
-{
- int i, phys_id;
-
- phys_id = topology_physical_package_id(cpu);
- for_each_cpu(i, &uncore_cpu_mask) {
- if (phys_id == topology_physical_package_id(i))
- return;
- }
-
- cpumask_set_cpu(cpu, &uncore_cpu_mask);
-
- uncore_change_context(msr_uncores, -1, cpu);
- uncore_change_context(pci_uncores, -1, cpu);
-}
-
-static int
- __cpuinit uncore_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
-{
- unsigned int cpu = (long)hcpu;
-
- /* allocate/free data structure for uncore box */
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_UP_PREPARE:
- uncore_cpu_prepare(cpu, -1);
- break;
- case CPU_STARTING:
- uncore_cpu_starting(cpu);
- break;
- case CPU_UP_CANCELED:
- case CPU_DYING:
- uncore_cpu_dying(cpu);
- break;
- case CPU_ONLINE:
- case CPU_DEAD:
- uncore_kfree_boxes();
- break;
- default:
- break;
- }
-
- /* select the cpu that collects uncore events */
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_DOWN_FAILED:
- case CPU_STARTING:
- uncore_event_init_cpu(cpu);
- break;
- case CPU_DOWN_PREPARE:
- uncore_event_exit_cpu(cpu);
- break;
- default:
- break;
- }
-
- return NOTIFY_OK;
-}
-
-static struct notifier_block uncore_cpu_nb __cpuinitdata = {
- .notifier_call = uncore_cpu_notifier,
- /*
- * to migrate uncore events, our notifier should be executed
- * before perf core's notifier.
- */
- .priority = CPU_PRI_PERF + 1,
-};
-
-static void __init uncore_cpu_setup(void *dummy)
-{
- uncore_cpu_starting(smp_processor_id());
-}
-
-static int __init uncore_cpu_init(void)
-{
- int ret, cpu, max_cores;
-
- max_cores = boot_cpu_data.x86_max_cores;
- switch (boot_cpu_data.x86_model) {
- case 26: /* Nehalem */
- case 30:
- case 37: /* Westmere */
- case 44:
- msr_uncores = nhm_msr_uncores;
- break;
- case 42: /* Sandy Bridge */
- case 58: /* Ivy Bridge */
- if (snb_uncore_cbox.num_boxes > max_cores)
- snb_uncore_cbox.num_boxes = max_cores;
- msr_uncores = snb_msr_uncores;
- break;
- case 45: /* Sandy Bridge-EP */
- if (snbep_uncore_cbox.num_boxes > max_cores)
- snbep_uncore_cbox.num_boxes = max_cores;
- msr_uncores = snbep_msr_uncores;
- break;
- case 46: /* Nehalem-EX */
- uncore_nhmex = true;
- case 47: /* Westmere-EX aka. Xeon E7 */
- if (!uncore_nhmex)
- nhmex_uncore_mbox.event_descs = wsmex_uncore_mbox_events;
- if (nhmex_uncore_cbox.num_boxes > max_cores)
- nhmex_uncore_cbox.num_boxes = max_cores;
- msr_uncores = nhmex_msr_uncores;
- break;
- case 62: /* IvyTown */
- if (ivt_uncore_cbox.num_boxes > max_cores)
- ivt_uncore_cbox.num_boxes = max_cores;
- msr_uncores = ivt_msr_uncores;
- break;
-
- default:
- return 0;
- }
-
- ret = uncore_types_init(msr_uncores);
- if (ret)
- return ret;
-
- get_online_cpus();
-
- for_each_online_cpu(cpu) {
- int i, phys_id = topology_physical_package_id(cpu);
-
- for_each_cpu(i, &uncore_cpu_mask) {
- if (phys_id == topology_physical_package_id(i)) {
- phys_id = -1;
- break;
- }
- }
- if (phys_id < 0)
- continue;
-
- uncore_cpu_prepare(cpu, phys_id);
- uncore_event_init_cpu(cpu);
- }
- on_each_cpu(uncore_cpu_setup, NULL, 1);
-
- register_cpu_notifier(&uncore_cpu_nb);
-
- put_online_cpus();
-
- return 0;
-}
-
-static int __init uncore_pmus_register(void)
-{
- struct intel_uncore_pmu *pmu;
- struct intel_uncore_type *type;
- int i, j;
-
- for (i = 0; msr_uncores[i]; i++) {
- type = msr_uncores[i];
- for (j = 0; j < type->num_boxes; j++) {
- pmu = &type->pmus[j];
- uncore_pmu_register(pmu);
- }
- }
-
- for (i = 0; pci_uncores[i]; i++) {
- type = pci_uncores[i];
- for (j = 0; j < type->num_boxes; j++) {
- pmu = &type->pmus[j];
- uncore_pmu_register(pmu);
- }
- }
-
- return 0;
-}
-
-static int __init intel_uncore_init(void)
-{
- int ret;
-
- if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
- return -ENODEV;
-
- if (cpu_has_hypervisor)
- return -ENODEV;
-
- ret = uncore_pci_init();
- if (ret)
- goto fail;
- ret = uncore_cpu_init();
- if (ret) {
- uncore_pci_exit();
- goto fail;
- }
-
- uncore_pmus_register();
- return 0;
-fail:
- return ret;
-}
-device_initcall(intel_uncore_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
deleted file mode 100644
index 47b3d00c9d89..000000000000
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ /dev/null
@@ -1,681 +0,0 @@
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/pci.h>
-#include <linux/perf_event.h>
-#include "perf_event.h"
-
-#define UNCORE_PMU_NAME_LEN 32
-#define UNCORE_PMU_HRTIMER_INTERVAL (60LL * NSEC_PER_SEC)
-
-#define UNCORE_FIXED_EVENT 0xff
-#define UNCORE_PMC_IDX_MAX_GENERIC 8
-#define UNCORE_PMC_IDX_FIXED UNCORE_PMC_IDX_MAX_GENERIC
-#define UNCORE_PMC_IDX_MAX (UNCORE_PMC_IDX_FIXED + 1)
-
-#define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff)
-
-/* SNB event control */
-#define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff
-#define SNB_UNC_CTL_UMASK_MASK 0x0000ff00
-#define SNB_UNC_CTL_EDGE_DET (1 << 18)
-#define SNB_UNC_CTL_EN (1 << 22)
-#define SNB_UNC_CTL_INVERT (1 << 23)
-#define SNB_UNC_CTL_CMASK_MASK 0x1f000000
-#define NHM_UNC_CTL_CMASK_MASK 0xff000000
-#define NHM_UNC_FIXED_CTR_CTL_EN (1 << 0)
-
-#define SNB_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \
- SNB_UNC_CTL_UMASK_MASK | \
- SNB_UNC_CTL_EDGE_DET | \
- SNB_UNC_CTL_INVERT | \
- SNB_UNC_CTL_CMASK_MASK)
-
-#define NHM_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \
- SNB_UNC_CTL_UMASK_MASK | \
- SNB_UNC_CTL_EDGE_DET | \
- SNB_UNC_CTL_INVERT | \
- NHM_UNC_CTL_CMASK_MASK)
-
-/* SNB global control register */
-#define SNB_UNC_PERF_GLOBAL_CTL 0x391
-#define SNB_UNC_FIXED_CTR_CTRL 0x394
-#define SNB_UNC_FIXED_CTR 0x395
-
-/* SNB uncore global control */
-#define SNB_UNC_GLOBAL_CTL_CORE_ALL ((1 << 4) - 1)
-#define SNB_UNC_GLOBAL_CTL_EN (1 << 29)
-
-/* SNB Cbo register */
-#define SNB_UNC_CBO_0_PERFEVTSEL0 0x700
-#define SNB_UNC_CBO_0_PER_CTR0 0x706
-#define SNB_UNC_CBO_MSR_OFFSET 0x10
-
-/* NHM global control register */
-#define NHM_UNC_PERF_GLOBAL_CTL 0x391
-#define NHM_UNC_FIXED_CTR 0x394
-#define NHM_UNC_FIXED_CTR_CTRL 0x395
-
-/* NHM uncore global control */
-#define NHM_UNC_GLOBAL_CTL_EN_PC_ALL ((1ULL << 8) - 1)
-#define NHM_UNC_GLOBAL_CTL_EN_FC (1ULL << 32)
-
-/* NHM uncore register */
-#define NHM_UNC_PERFEVTSEL0 0x3c0
-#define NHM_UNC_UNCORE_PMC0 0x3b0
-
-/* SNB-EP Box level control */
-#define SNBEP_PMON_BOX_CTL_RST_CTRL (1 << 0)
-#define SNBEP_PMON_BOX_CTL_RST_CTRS (1 << 1)
-#define SNBEP_PMON_BOX_CTL_FRZ (1 << 8)
-#define SNBEP_PMON_BOX_CTL_FRZ_EN (1 << 16)
-#define SNBEP_PMON_BOX_CTL_INT (SNBEP_PMON_BOX_CTL_RST_CTRL | \
- SNBEP_PMON_BOX_CTL_RST_CTRS | \
- SNBEP_PMON_BOX_CTL_FRZ_EN)
-/* SNB-EP event control */
-#define SNBEP_PMON_CTL_EV_SEL_MASK 0x000000ff
-#define SNBEP_PMON_CTL_UMASK_MASK 0x0000ff00
-#define SNBEP_PMON_CTL_RST (1 << 17)
-#define SNBEP_PMON_CTL_EDGE_DET (1 << 18)
-#define SNBEP_PMON_CTL_EV_SEL_EXT (1 << 21)
-#define SNBEP_PMON_CTL_EN (1 << 22)
-#define SNBEP_PMON_CTL_INVERT (1 << 23)
-#define SNBEP_PMON_CTL_TRESH_MASK 0xff000000
-#define SNBEP_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \
- SNBEP_PMON_CTL_UMASK_MASK | \
- SNBEP_PMON_CTL_EDGE_DET | \
- SNBEP_PMON_CTL_INVERT | \
- SNBEP_PMON_CTL_TRESH_MASK)
-
-/* SNB-EP Ubox event control */
-#define SNBEP_U_MSR_PMON_CTL_TRESH_MASK 0x1f000000
-#define SNBEP_U_MSR_PMON_RAW_EVENT_MASK \
- (SNBEP_PMON_CTL_EV_SEL_MASK | \
- SNBEP_PMON_CTL_UMASK_MASK | \
- SNBEP_PMON_CTL_EDGE_DET | \
- SNBEP_PMON_CTL_INVERT | \
- SNBEP_U_MSR_PMON_CTL_TRESH_MASK)
-
-#define SNBEP_CBO_PMON_CTL_TID_EN (1 << 19)
-#define SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \
- SNBEP_CBO_PMON_CTL_TID_EN)
-
-/* SNB-EP PCU event control */
-#define SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK 0x0000c000
-#define SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK 0x1f000000
-#define SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT (1 << 30)
-#define SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET (1 << 31)
-#define SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK \
- (SNBEP_PMON_CTL_EV_SEL_MASK | \
- SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
- SNBEP_PMON_CTL_EDGE_DET | \
- SNBEP_PMON_CTL_INVERT | \
- SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
- SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
- SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
-
-#define SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK \
- (SNBEP_PMON_RAW_EVENT_MASK | \
- SNBEP_PMON_CTL_EV_SEL_EXT)
-
-/* SNB-EP pci control register */
-#define SNBEP_PCI_PMON_BOX_CTL 0xf4
-#define SNBEP_PCI_PMON_CTL0 0xd8
-/* SNB-EP pci counter register */
-#define SNBEP_PCI_PMON_CTR0 0xa0
-
-/* SNB-EP home agent register */
-#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH0 0x40
-#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH1 0x44
-#define SNBEP_HA_PCI_PMON_BOX_OPCODEMATCH 0x48
-/* SNB-EP memory controller register */
-#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTL 0xf0
-#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTR 0xd0
-/* SNB-EP QPI register */
-#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH0 0x228
-#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH1 0x22c
-#define SNBEP_Q_Py_PCI_PMON_PKT_MASK0 0x238
-#define SNBEP_Q_Py_PCI_PMON_PKT_MASK1 0x23c
-
-/* SNB-EP Ubox register */
-#define SNBEP_U_MSR_PMON_CTR0 0xc16
-#define SNBEP_U_MSR_PMON_CTL0 0xc10
-
-#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTL 0xc08
-#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTR 0xc09
-
-/* SNB-EP Cbo register */
-#define SNBEP_C0_MSR_PMON_CTR0 0xd16
-#define SNBEP_C0_MSR_PMON_CTL0 0xd10
-#define SNBEP_C0_MSR_PMON_BOX_CTL 0xd04
-#define SNBEP_C0_MSR_PMON_BOX_FILTER 0xd14
-#define SNBEP_CBO_MSR_OFFSET 0x20
-
-#define SNBEP_CB0_MSR_PMON_BOX_FILTER_TID 0x1f
-#define SNBEP_CB0_MSR_PMON_BOX_FILTER_NID 0x3fc00
-#define SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE 0x7c0000
-#define SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC 0xff800000
-
-#define SNBEP_CBO_EVENT_EXTRA_REG(e, m, i) { \
- .event = (e), \
- .msr = SNBEP_C0_MSR_PMON_BOX_FILTER, \
- .config_mask = (m), \
- .idx = (i) \
-}
-
-/* SNB-EP PCU register */
-#define SNBEP_PCU_MSR_PMON_CTR0 0xc36
-#define SNBEP_PCU_MSR_PMON_CTL0 0xc30
-#define SNBEP_PCU_MSR_PMON_BOX_CTL 0xc24
-#define SNBEP_PCU_MSR_PMON_BOX_FILTER 0xc34
-#define SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK 0xffffffff
-#define SNBEP_PCU_MSR_CORE_C3_CTR 0x3fc
-#define SNBEP_PCU_MSR_CORE_C6_CTR 0x3fd
-
-/* IVT event control */
-#define IVT_PMON_BOX_CTL_INT (SNBEP_PMON_BOX_CTL_RST_CTRL | \
- SNBEP_PMON_BOX_CTL_RST_CTRS)
-#define IVT_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \
- SNBEP_PMON_CTL_UMASK_MASK | \
- SNBEP_PMON_CTL_EDGE_DET | \
- SNBEP_PMON_CTL_TRESH_MASK)
-/* IVT Ubox */
-#define IVT_U_MSR_PMON_GLOBAL_CTL 0xc00
-#define IVT_U_PMON_GLOBAL_FRZ_ALL (1 << 31)
-#define IVT_U_PMON_GLOBAL_UNFRZ_ALL (1 << 29)
-
-#define IVT_U_MSR_PMON_RAW_EVENT_MASK \
- (SNBEP_PMON_CTL_EV_SEL_MASK | \
- SNBEP_PMON_CTL_UMASK_MASK | \
- SNBEP_PMON_CTL_EDGE_DET | \
- SNBEP_U_MSR_PMON_CTL_TRESH_MASK)
-/* IVT Cbo */
-#define IVT_CBO_MSR_PMON_RAW_EVENT_MASK (IVT_PMON_RAW_EVENT_MASK | \
- SNBEP_CBO_PMON_CTL_TID_EN)
-
-#define IVT_CB0_MSR_PMON_BOX_FILTER_TID (0x1fULL << 0)
-#define IVT_CB0_MSR_PMON_BOX_FILTER_LINK (0xfULL << 5)
-#define IVT_CB0_MSR_PMON_BOX_FILTER_STATE (0x3fULL << 17)
-#define IVT_CB0_MSR_PMON_BOX_FILTER_NID (0xffffULL << 32)
-#define IVT_CB0_MSR_PMON_BOX_FILTER_OPC (0x1ffULL << 52)
-#define IVT_CB0_MSR_PMON_BOX_FILTER_C6 (0x1ULL << 61)
-#define IVT_CB0_MSR_PMON_BOX_FILTER_NC (0x1ULL << 62)
-#define IVT_CB0_MSR_PMON_BOX_FILTER_IOSC (0x1ULL << 63)
-
-/* IVT home agent */
-#define IVT_HA_PCI_PMON_CTL_Q_OCC_RST (1 << 16)
-#define IVT_HA_PCI_PMON_RAW_EVENT_MASK \
- (IVT_PMON_RAW_EVENT_MASK | \
- IVT_HA_PCI_PMON_CTL_Q_OCC_RST)
-/* IVT PCU */
-#define IVT_PCU_MSR_PMON_RAW_EVENT_MASK \
- (SNBEP_PMON_CTL_EV_SEL_MASK | \
- SNBEP_PMON_CTL_EV_SEL_EXT | \
- SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
- SNBEP_PMON_CTL_EDGE_DET | \
- SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
- SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
- SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
-/* IVT QPI */
-#define IVT_QPI_PCI_PMON_RAW_EVENT_MASK \
- (IVT_PMON_RAW_EVENT_MASK | \
- SNBEP_PMON_CTL_EV_SEL_EXT)
-
-/* NHM-EX event control */
-#define NHMEX_PMON_CTL_EV_SEL_MASK 0x000000ff
-#define NHMEX_PMON_CTL_UMASK_MASK 0x0000ff00
-#define NHMEX_PMON_CTL_EN_BIT0 (1 << 0)
-#define NHMEX_PMON_CTL_EDGE_DET (1 << 18)
-#define NHMEX_PMON_CTL_PMI_EN (1 << 20)
-#define NHMEX_PMON_CTL_EN_BIT22 (1 << 22)
-#define NHMEX_PMON_CTL_INVERT (1 << 23)
-#define NHMEX_PMON_CTL_TRESH_MASK 0xff000000
-#define NHMEX_PMON_RAW_EVENT_MASK (NHMEX_PMON_CTL_EV_SEL_MASK | \
- NHMEX_PMON_CTL_UMASK_MASK | \
- NHMEX_PMON_CTL_EDGE_DET | \
- NHMEX_PMON_CTL_INVERT | \
- NHMEX_PMON_CTL_TRESH_MASK)
-
-/* NHM-EX Ubox */
-#define NHMEX_U_MSR_PMON_GLOBAL_CTL 0xc00
-#define NHMEX_U_MSR_PMON_CTR 0xc11
-#define NHMEX_U_MSR_PMON_EV_SEL 0xc10
-
-#define NHMEX_U_PMON_GLOBAL_EN (1 << 0)
-#define NHMEX_U_PMON_GLOBAL_PMI_CORE_SEL 0x0000001e
-#define NHMEX_U_PMON_GLOBAL_EN_ALL (1 << 28)
-#define NHMEX_U_PMON_GLOBAL_RST_ALL (1 << 29)
-#define NHMEX_U_PMON_GLOBAL_FRZ_ALL (1 << 31)
-
-#define NHMEX_U_PMON_RAW_EVENT_MASK \
- (NHMEX_PMON_CTL_EV_SEL_MASK | \
- NHMEX_PMON_CTL_EDGE_DET)
-
-/* NHM-EX Cbox */
-#define NHMEX_C0_MSR_PMON_GLOBAL_CTL 0xd00
-#define NHMEX_C0_MSR_PMON_CTR0 0xd11
-#define NHMEX_C0_MSR_PMON_EV_SEL0 0xd10
-#define NHMEX_C_MSR_OFFSET 0x20
-
-/* NHM-EX Bbox */
-#define NHMEX_B0_MSR_PMON_GLOBAL_CTL 0xc20
-#define NHMEX_B0_MSR_PMON_CTR0 0xc31
-#define NHMEX_B0_MSR_PMON_CTL0 0xc30
-#define NHMEX_B_MSR_OFFSET 0x40
-#define NHMEX_B0_MSR_MATCH 0xe45
-#define NHMEX_B0_MSR_MASK 0xe46
-#define NHMEX_B1_MSR_MATCH 0xe4d
-#define NHMEX_B1_MSR_MASK 0xe4e
-
-#define NHMEX_B_PMON_CTL_EN (1 << 0)
-#define NHMEX_B_PMON_CTL_EV_SEL_SHIFT 1
-#define NHMEX_B_PMON_CTL_EV_SEL_MASK \
- (0x1f << NHMEX_B_PMON_CTL_EV_SEL_SHIFT)
-#define NHMEX_B_PMON_CTR_SHIFT 6
-#define NHMEX_B_PMON_CTR_MASK \
- (0x3 << NHMEX_B_PMON_CTR_SHIFT)
-#define NHMEX_B_PMON_RAW_EVENT_MASK \
- (NHMEX_B_PMON_CTL_EV_SEL_MASK | \
- NHMEX_B_PMON_CTR_MASK)
-
-/* NHM-EX Sbox */
-#define NHMEX_S0_MSR_PMON_GLOBAL_CTL 0xc40
-#define NHMEX_S0_MSR_PMON_CTR0 0xc51
-#define NHMEX_S0_MSR_PMON_CTL0 0xc50
-#define NHMEX_S_MSR_OFFSET 0x80
-#define NHMEX_S0_MSR_MM_CFG 0xe48
-#define NHMEX_S0_MSR_MATCH 0xe49
-#define NHMEX_S0_MSR_MASK 0xe4a
-#define NHMEX_S1_MSR_MM_CFG 0xe58
-#define NHMEX_S1_MSR_MATCH 0xe59
-#define NHMEX_S1_MSR_MASK 0xe5a
-
-#define NHMEX_S_PMON_MM_CFG_EN (0x1ULL << 63)
-#define NHMEX_S_EVENT_TO_R_PROG_EV 0
-
-/* NHM-EX Mbox */
-#define NHMEX_M0_MSR_GLOBAL_CTL 0xca0
-#define NHMEX_M0_MSR_PMU_DSP 0xca5
-#define NHMEX_M0_MSR_PMU_ISS 0xca6
-#define NHMEX_M0_MSR_PMU_MAP 0xca7
-#define NHMEX_M0_MSR_PMU_MSC_THR 0xca8
-#define NHMEX_M0_MSR_PMU_PGT 0xca9
-#define NHMEX_M0_MSR_PMU_PLD 0xcaa
-#define NHMEX_M0_MSR_PMU_ZDP_CTL_FVC 0xcab
-#define NHMEX_M0_MSR_PMU_CTL0 0xcb0
-#define NHMEX_M0_MSR_PMU_CNT0 0xcb1
-#define NHMEX_M_MSR_OFFSET 0x40
-#define NHMEX_M0_MSR_PMU_MM_CFG 0xe54
-#define NHMEX_M1_MSR_PMU_MM_CFG 0xe5c
-
-#define NHMEX_M_PMON_MM_CFG_EN (1ULL << 63)
-#define NHMEX_M_PMON_ADDR_MATCH_MASK 0x3ffffffffULL
-#define NHMEX_M_PMON_ADDR_MASK_MASK 0x7ffffffULL
-#define NHMEX_M_PMON_ADDR_MASK_SHIFT 34
-
-#define NHMEX_M_PMON_CTL_EN (1 << 0)
-#define NHMEX_M_PMON_CTL_PMI_EN (1 << 1)
-#define NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT 2
-#define NHMEX_M_PMON_CTL_COUNT_MODE_MASK \
- (0x3 << NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT)
-#define NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT 4
-#define NHMEX_M_PMON_CTL_STORAGE_MODE_MASK \
- (0x3 << NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT)
-#define NHMEX_M_PMON_CTL_WRAP_MODE (1 << 6)
-#define NHMEX_M_PMON_CTL_FLAG_MODE (1 << 7)
-#define NHMEX_M_PMON_CTL_INC_SEL_SHIFT 9
-#define NHMEX_M_PMON_CTL_INC_SEL_MASK \
- (0x1f << NHMEX_M_PMON_CTL_INC_SEL_SHIFT)
-#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT 19
-#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK \
- (0x7 << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT)
-#define NHMEX_M_PMON_RAW_EVENT_MASK \
- (NHMEX_M_PMON_CTL_COUNT_MODE_MASK | \
- NHMEX_M_PMON_CTL_STORAGE_MODE_MASK | \
- NHMEX_M_PMON_CTL_WRAP_MODE | \
- NHMEX_M_PMON_CTL_FLAG_MODE | \
- NHMEX_M_PMON_CTL_INC_SEL_MASK | \
- NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK)
-
-#define NHMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 11) - 1) | (1 << 23))
-#define NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (11 + 3 * (n)))
-
-#define WSMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 12) - 1) | (1 << 24))
-#define WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (12 + 3 * (n)))
-
-/*
- * use the 9~13 bits to select event If the 7th bit is not set,
- * otherwise use the 19~21 bits to select event.
- */
-#define MBOX_INC_SEL(x) ((x) << NHMEX_M_PMON_CTL_INC_SEL_SHIFT)
-#define MBOX_SET_FLAG_SEL(x) (((x) << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT) | \
- NHMEX_M_PMON_CTL_FLAG_MODE)
-#define MBOX_INC_SEL_MASK (NHMEX_M_PMON_CTL_INC_SEL_MASK | \
- NHMEX_M_PMON_CTL_FLAG_MODE)
-#define MBOX_SET_FLAG_SEL_MASK (NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK | \
- NHMEX_M_PMON_CTL_FLAG_MODE)
-#define MBOX_INC_SEL_EXTAR_REG(c, r) \
- EVENT_EXTRA_REG(MBOX_INC_SEL(c), NHMEX_M0_MSR_PMU_##r, \
- MBOX_INC_SEL_MASK, (u64)-1, NHMEX_M_##r)
-#define MBOX_SET_FLAG_SEL_EXTRA_REG(c, r) \
- EVENT_EXTRA_REG(MBOX_SET_FLAG_SEL(c), NHMEX_M0_MSR_PMU_##r, \
- MBOX_SET_FLAG_SEL_MASK, \
- (u64)-1, NHMEX_M_##r)
-
-/* NHM-EX Rbox */
-#define NHMEX_R_MSR_GLOBAL_CTL 0xe00
-#define NHMEX_R_MSR_PMON_CTL0 0xe10
-#define NHMEX_R_MSR_PMON_CNT0 0xe11
-#define NHMEX_R_MSR_OFFSET 0x20
-
-#define NHMEX_R_MSR_PORTN_QLX_CFG(n) \
- ((n) < 4 ? (0xe0c + (n)) : (0xe2c + (n) - 4))
-#define NHMEX_R_MSR_PORTN_IPERF_CFG0(n) (0xe04 + (n))
-#define NHMEX_R_MSR_PORTN_IPERF_CFG1(n) (0xe24 + (n))
-#define NHMEX_R_MSR_PORTN_XBR_OFFSET(n) \
- (((n) < 4 ? 0 : 0x10) + (n) * 4)
-#define NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) \
- (0xe60 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n))
-#define NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(n) \
- (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 1)
-#define NHMEX_R_MSR_PORTN_XBR_SET1_MASK(n) \
- (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 2)
-#define NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) \
- (0xe70 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n))
-#define NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(n) \
- (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 1)
-#define NHMEX_R_MSR_PORTN_XBR_SET2_MASK(n) \
- (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 2)
-
-#define NHMEX_R_PMON_CTL_EN (1 << 0)
-#define NHMEX_R_PMON_CTL_EV_SEL_SHIFT 1
-#define NHMEX_R_PMON_CTL_EV_SEL_MASK \
- (0x1f << NHMEX_R_PMON_CTL_EV_SEL_SHIFT)
-#define NHMEX_R_PMON_CTL_PMI_EN (1 << 6)
-#define NHMEX_R_PMON_RAW_EVENT_MASK NHMEX_R_PMON_CTL_EV_SEL_MASK
-
-/* NHM-EX Wbox */
-#define NHMEX_W_MSR_GLOBAL_CTL 0xc80
-#define NHMEX_W_MSR_PMON_CNT0 0xc90
-#define NHMEX_W_MSR_PMON_EVT_SEL0 0xc91
-#define NHMEX_W_MSR_PMON_FIXED_CTR 0x394
-#define NHMEX_W_MSR_PMON_FIXED_CTL 0x395
-
-#define NHMEX_W_PMON_GLOBAL_FIXED_EN (1ULL << 31)
-
-struct intel_uncore_ops;
-struct intel_uncore_pmu;
-struct intel_uncore_box;
-struct uncore_event_desc;
-
-struct intel_uncore_type {
- const char *name;
- int num_counters;
- int num_boxes;
- int perf_ctr_bits;
- int fixed_ctr_bits;
- unsigned perf_ctr;
- unsigned event_ctl;
- unsigned event_mask;
- unsigned fixed_ctr;
- unsigned fixed_ctl;
- unsigned box_ctl;
- unsigned msr_offset;
- unsigned num_shared_regs:8;
- unsigned single_fixed:1;
- unsigned pair_ctr_ctl:1;
- unsigned *msr_offsets;
- struct event_constraint unconstrainted;
- struct event_constraint *constraints;
- struct intel_uncore_pmu *pmus;
- struct intel_uncore_ops *ops;
- struct uncore_event_desc *event_descs;
- const struct attribute_group *attr_groups[4];
-};
-
-#define pmu_group attr_groups[0]
-#define format_group attr_groups[1]
-#define events_group attr_groups[2]
-
-struct intel_uncore_ops {
- void (*init_box)(struct intel_uncore_box *);
- void (*disable_box)(struct intel_uncore_box *);
- void (*enable_box)(struct intel_uncore_box *);
- void (*disable_event)(struct intel_uncore_box *, struct perf_event *);
- void (*enable_event)(struct intel_uncore_box *, struct perf_event *);
- u64 (*read_counter)(struct intel_uncore_box *, struct perf_event *);
- int (*hw_config)(struct intel_uncore_box *, struct perf_event *);
- struct event_constraint *(*get_constraint)(struct intel_uncore_box *,
- struct perf_event *);
- void (*put_constraint)(struct intel_uncore_box *, struct perf_event *);
-};
-
-struct intel_uncore_pmu {
- struct pmu pmu;
- char name[UNCORE_PMU_NAME_LEN];
- int pmu_idx;
- int func_id;
- struct intel_uncore_type *type;
- struct intel_uncore_box ** __percpu box;
- struct list_head box_list;
-};
-
-struct intel_uncore_extra_reg {
- raw_spinlock_t lock;
- u64 config, config1, config2;
- atomic_t ref;
-};
-
-struct intel_uncore_box {
- int phys_id;
- int n_active; /* number of active events */
- int n_events;
- int cpu; /* cpu to collect events */
- unsigned long flags;
- atomic_t refcnt;
- struct perf_event *events[UNCORE_PMC_IDX_MAX];
- struct perf_event *event_list[UNCORE_PMC_IDX_MAX];
- unsigned long active_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)];
- u64 tags[UNCORE_PMC_IDX_MAX];
- struct pci_dev *pci_dev;
- struct intel_uncore_pmu *pmu;
- struct hrtimer hrtimer;
- struct list_head list;
- struct intel_uncore_extra_reg shared_regs[0];
-};
-
-#define UNCORE_BOX_FLAG_INITIATED 0
-
-struct uncore_event_desc {
- struct kobj_attribute attr;
- const char *config;
-};
-
-#define INTEL_UNCORE_EVENT_DESC(_name, _config) \
-{ \
- .attr = __ATTR(_name, 0444, uncore_event_show, NULL), \
- .config = _config, \
-}
-
-#define DEFINE_UNCORE_FORMAT_ATTR(_var, _name, _format) \
-static ssize_t __uncore_##_var##_show(struct kobject *kobj, \
- struct kobj_attribute *attr, \
- char *page) \
-{ \
- BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \
- return sprintf(page, _format "\n"); \
-} \
-static struct kobj_attribute format_attr_##_var = \
- __ATTR(_name, 0444, __uncore_##_var##_show, NULL)
-
-
-static ssize_t uncore_event_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- struct uncore_event_desc *event =
- container_of(attr, struct uncore_event_desc, attr);
- return sprintf(buf, "%s", event->config);
-}
-
-static inline unsigned uncore_pci_box_ctl(struct intel_uncore_box *box)
-{
- return box->pmu->type->box_ctl;
-}
-
-static inline unsigned uncore_pci_fixed_ctl(struct intel_uncore_box *box)
-{
- return box->pmu->type->fixed_ctl;
-}
-
-static inline unsigned uncore_pci_fixed_ctr(struct intel_uncore_box *box)
-{
- return box->pmu->type->fixed_ctr;
-}
-
-static inline
-unsigned uncore_pci_event_ctl(struct intel_uncore_box *box, int idx)
-{
- return idx * 4 + box->pmu->type->event_ctl;
-}
-
-static inline
-unsigned uncore_pci_perf_ctr(struct intel_uncore_box *box, int idx)
-{
- return idx * 8 + box->pmu->type->perf_ctr;
-}
-
-static inline unsigned uncore_msr_box_offset(struct intel_uncore_box *box)
-{
- struct intel_uncore_pmu *pmu = box->pmu;
- return pmu->type->msr_offsets ?
- pmu->type->msr_offsets[pmu->pmu_idx] :
- pmu->type->msr_offset * pmu->pmu_idx;
-}
-
-static inline unsigned uncore_msr_box_ctl(struct intel_uncore_box *box)
-{
- if (!box->pmu->type->box_ctl)
- return 0;
- return box->pmu->type->box_ctl + uncore_msr_box_offset(box);
-}
-
-static inline unsigned uncore_msr_fixed_ctl(struct intel_uncore_box *box)
-{
- if (!box->pmu->type->fixed_ctl)
- return 0;
- return box->pmu->type->fixed_ctl + uncore_msr_box_offset(box);
-}
-
-static inline unsigned uncore_msr_fixed_ctr(struct intel_uncore_box *box)
-{
- return box->pmu->type->fixed_ctr + uncore_msr_box_offset(box);
-}
-
-static inline
-unsigned uncore_msr_event_ctl(struct intel_uncore_box *box, int idx)
-{
- return box->pmu->type->event_ctl +
- (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) +
- uncore_msr_box_offset(box);
-}
-
-static inline
-unsigned uncore_msr_perf_ctr(struct intel_uncore_box *box, int idx)
-{
- return box->pmu->type->perf_ctr +
- (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) +
- uncore_msr_box_offset(box);
-}
-
-static inline
-unsigned uncore_fixed_ctl(struct intel_uncore_box *box)
-{
- if (box->pci_dev)
- return uncore_pci_fixed_ctl(box);
- else
- return uncore_msr_fixed_ctl(box);
-}
-
-static inline
-unsigned uncore_fixed_ctr(struct intel_uncore_box *box)
-{
- if (box->pci_dev)
- return uncore_pci_fixed_ctr(box);
- else
- return uncore_msr_fixed_ctr(box);
-}
-
-static inline
-unsigned uncore_event_ctl(struct intel_uncore_box *box, int idx)
-{
- if (box->pci_dev)
- return uncore_pci_event_ctl(box, idx);
- else
- return uncore_msr_event_ctl(box, idx);
-}
-
-static inline
-unsigned uncore_perf_ctr(struct intel_uncore_box *box, int idx)
-{
- if (box->pci_dev)
- return uncore_pci_perf_ctr(box, idx);
- else
- return uncore_msr_perf_ctr(box, idx);
-}
-
-static inline int uncore_perf_ctr_bits(struct intel_uncore_box *box)
-{
- return box->pmu->type->perf_ctr_bits;
-}
-
-static inline int uncore_fixed_ctr_bits(struct intel_uncore_box *box)
-{
- return box->pmu->type->fixed_ctr_bits;
-}
-
-static inline int uncore_num_counters(struct intel_uncore_box *box)
-{
- return box->pmu->type->num_counters;
-}
-
-static inline void uncore_disable_box(struct intel_uncore_box *box)
-{
- if (box->pmu->type->ops->disable_box)
- box->pmu->type->ops->disable_box(box);
-}
-
-static inline void uncore_enable_box(struct intel_uncore_box *box)
-{
- if (box->pmu->type->ops->enable_box)
- box->pmu->type->ops->enable_box(box);
-}
-
-static inline void uncore_disable_event(struct intel_uncore_box *box,
- struct perf_event *event)
-{
- box->pmu->type->ops->disable_event(box, event);
-}
-
-static inline void uncore_enable_event(struct intel_uncore_box *box,
- struct perf_event *event)
-{
- box->pmu->type->ops->enable_event(box, event);
-}
-
-static inline u64 uncore_read_counter(struct intel_uncore_box *box,
- struct perf_event *event)
-{
- return box->pmu->type->ops->read_counter(box, event);
-}
-
-static inline void uncore_box_init(struct intel_uncore_box *box)
-{
- if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) {
- if (box->pmu->type->ops->init_box)
- box->pmu->type->ops->init_box(box);
- }
-}
-
-static inline bool uncore_box_is_fake(struct intel_uncore_box *box)
-{
- return (box->phys_id < 0);
-}
diff --git a/arch/x86/kernel/cpu/perf_event_knc.c b/arch/x86/kernel/cpu/perf_event_knc.c
deleted file mode 100644
index 838fa8772c62..000000000000
--- a/arch/x86/kernel/cpu/perf_event_knc.c
+++ /dev/null
@@ -1,319 +0,0 @@
-/* Driver for Intel Xeon Phi "Knights Corner" PMU */
-
-#include <linux/perf_event.h>
-#include <linux/types.h>
-
-#include <asm/hardirq.h>
-
-#include "perf_event.h"
-
-static const u64 knc_perfmon_event_map[] =
-{
- [PERF_COUNT_HW_CPU_CYCLES] = 0x002a,
- [PERF_COUNT_HW_INSTRUCTIONS] = 0x0016,
- [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0028,
- [PERF_COUNT_HW_CACHE_MISSES] = 0x0029,
- [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0012,
- [PERF_COUNT_HW_BRANCH_MISSES] = 0x002b,
-};
-
-static const u64 __initconst knc_hw_cache_event_ids
- [PERF_COUNT_HW_CACHE_MAX]
- [PERF_COUNT_HW_CACHE_OP_MAX]
- [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
- [ C(OP_READ) ] = {
- /* On Xeon Phi event "0" is a valid DATA_READ */
- /* (L1 Data Cache Reads) Instruction. */
- /* We code this as ARCH_PERFMON_EVENTSEL_INT as this */
- /* bit will always be set in x86_pmu_hw_config(). */
- [ C(RESULT_ACCESS) ] = ARCH_PERFMON_EVENTSEL_INT,
- /* DATA_READ */
- [ C(RESULT_MISS) ] = 0x0003, /* DATA_READ_MISS */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x0001, /* DATA_WRITE */
- [ C(RESULT_MISS) ] = 0x0004, /* DATA_WRITE_MISS */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x0011, /* L1_DATA_PF1 */
- [ C(RESULT_MISS) ] = 0x001c, /* L1_DATA_PF1_MISS */
- },
- },
- [ C(L1I ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x000c, /* CODE_READ */
- [ C(RESULT_MISS) ] = 0x000e, /* CODE_CACHE_MISS */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x0,
- [ C(RESULT_MISS) ] = 0x0,
- },
- },
- [ C(LL ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0x10cb, /* L2_READ_MISS */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x10cc, /* L2_WRITE_HIT */
- [ C(RESULT_MISS) ] = 0,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x10fc, /* L2_DATA_PF2 */
- [ C(RESULT_MISS) ] = 0x10fe, /* L2_DATA_PF2_MISS */
- },
- },
- [ C(DTLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = ARCH_PERFMON_EVENTSEL_INT,
- /* DATA_READ */
- /* see note on L1 OP_READ */
- [ C(RESULT_MISS) ] = 0x0002, /* DATA_PAGE_WALK */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x0001, /* DATA_WRITE */
- [ C(RESULT_MISS) ] = 0x0002, /* DATA_PAGE_WALK */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0x0,
- [ C(RESULT_MISS) ] = 0x0,
- },
- },
- [ C(ITLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x000c, /* CODE_READ */
- [ C(RESULT_MISS) ] = 0x000d, /* CODE_PAGE_WALK */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
- [ C(BPU ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0012, /* BRANCHES */
- [ C(RESULT_MISS) ] = 0x002b, /* BRANCHES_MISPREDICTED */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
-};
-
-
-static u64 knc_pmu_event_map(int hw_event)
-{
- return knc_perfmon_event_map[hw_event];
-}
-
-static struct event_constraint knc_event_constraints[] =
-{
- INTEL_EVENT_CONSTRAINT(0xc3, 0x1), /* HWP_L2HIT */
- INTEL_EVENT_CONSTRAINT(0xc4, 0x1), /* HWP_L2MISS */
- INTEL_EVENT_CONSTRAINT(0xc8, 0x1), /* L2_READ_HIT_E */
- INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* L2_READ_HIT_M */
- INTEL_EVENT_CONSTRAINT(0xca, 0x1), /* L2_READ_HIT_S */
- INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* L2_READ_MISS */
- INTEL_EVENT_CONSTRAINT(0xcc, 0x1), /* L2_WRITE_HIT */
- INTEL_EVENT_CONSTRAINT(0xce, 0x1), /* L2_STRONGLY_ORDERED_STREAMING_VSTORES_MISS */
- INTEL_EVENT_CONSTRAINT(0xcf, 0x1), /* L2_WEAKLY_ORDERED_STREAMING_VSTORE_MISS */
- INTEL_EVENT_CONSTRAINT(0xd7, 0x1), /* L2_VICTIM_REQ_WITH_DATA */
- INTEL_EVENT_CONSTRAINT(0xe3, 0x1), /* SNP_HITM_BUNIT */
- INTEL_EVENT_CONSTRAINT(0xe6, 0x1), /* SNP_HIT_L2 */
- INTEL_EVENT_CONSTRAINT(0xe7, 0x1), /* SNP_HITM_L2 */
- INTEL_EVENT_CONSTRAINT(0xf1, 0x1), /* L2_DATA_READ_MISS_CACHE_FILL */
- INTEL_EVENT_CONSTRAINT(0xf2, 0x1), /* L2_DATA_WRITE_MISS_CACHE_FILL */
- INTEL_EVENT_CONSTRAINT(0xf6, 0x1), /* L2_DATA_READ_MISS_MEM_FILL */
- INTEL_EVENT_CONSTRAINT(0xf7, 0x1), /* L2_DATA_WRITE_MISS_MEM_FILL */
- INTEL_EVENT_CONSTRAINT(0xfc, 0x1), /* L2_DATA_PF2 */
- INTEL_EVENT_CONSTRAINT(0xfd, 0x1), /* L2_DATA_PF2_DROP */
- INTEL_EVENT_CONSTRAINT(0xfe, 0x1), /* L2_DATA_PF2_MISS */
- INTEL_EVENT_CONSTRAINT(0xff, 0x1), /* L2_DATA_HIT_INFLIGHT_PF2 */
- EVENT_CONSTRAINT_END
-};
-
-#define MSR_KNC_IA32_PERF_GLOBAL_STATUS 0x0000002d
-#define MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL 0x0000002e
-#define MSR_KNC_IA32_PERF_GLOBAL_CTRL 0x0000002f
-
-#define KNC_ENABLE_COUNTER0 0x00000001
-#define KNC_ENABLE_COUNTER1 0x00000002
-
-static void knc_pmu_disable_all(void)
-{
- u64 val;
-
- rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
- val &= ~(KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1);
- wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
-}
-
-static void knc_pmu_enable_all(int added)
-{
- u64 val;
-
- rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
- val |= (KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1);
- wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
-}
-
-static inline void
-knc_pmu_disable_event(struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- u64 val;
-
- val = hwc->config;
- val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
-
- (void)wrmsrl_safe(hwc->config_base + hwc->idx, val);
-}
-
-static void knc_pmu_enable_event(struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- u64 val;
-
- val = hwc->config;
- val |= ARCH_PERFMON_EVENTSEL_ENABLE;
-
- (void)wrmsrl_safe(hwc->config_base + hwc->idx, val);
-}
-
-static inline u64 knc_pmu_get_status(void)
-{
- u64 status;
-
- rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_STATUS, status);
-
- return status;
-}
-
-static inline void knc_pmu_ack_status(u64 ack)
-{
- wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL, ack);
-}
-
-static int knc_pmu_handle_irq(struct pt_regs *regs)
-{
- struct perf_sample_data data;
- struct cpu_hw_events *cpuc;
- int handled = 0;
- int bit, loops;
- u64 status;
-
- cpuc = &__get_cpu_var(cpu_hw_events);
-
- knc_pmu_disable_all();
-
- status = knc_pmu_get_status();
- if (!status) {
- knc_pmu_enable_all(0);
- return handled;
- }
-
- loops = 0;
-again:
- knc_pmu_ack_status(status);
- if (++loops > 100) {
- WARN_ONCE(1, "perf: irq loop stuck!\n");
- perf_event_print_debug();
- goto done;
- }
-
- inc_irq_stat(apic_perf_irqs);
-
- for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
- struct perf_event *event = cpuc->events[bit];
-
- handled++;
-
- if (!test_bit(bit, cpuc->active_mask))
- continue;
-
- if (!intel_pmu_save_and_restart(event))
- continue;
-
- perf_sample_data_init(&data, 0, event->hw.last_period);
-
- if (perf_event_overflow(event, &data, regs))
- x86_pmu_stop(event, 0);
- }
-
- /*
- * Repeat if there is more work to be done:
- */
- status = knc_pmu_get_status();
- if (status)
- goto again;
-
-done:
- knc_pmu_enable_all(0);
-
- return handled;
-}
-
-
-PMU_FORMAT_ATTR(event, "config:0-7" );
-PMU_FORMAT_ATTR(umask, "config:8-15" );
-PMU_FORMAT_ATTR(edge, "config:18" );
-PMU_FORMAT_ATTR(inv, "config:23" );
-PMU_FORMAT_ATTR(cmask, "config:24-31" );
-
-static struct attribute *intel_knc_formats_attr[] = {
- &format_attr_event.attr,
- &format_attr_umask.attr,
- &format_attr_edge.attr,
- &format_attr_inv.attr,
- &format_attr_cmask.attr,
- NULL,
-};
-
-static const struct x86_pmu knc_pmu __initconst = {
- .name = "knc",
- .handle_irq = knc_pmu_handle_irq,
- .disable_all = knc_pmu_disable_all,
- .enable_all = knc_pmu_enable_all,
- .enable = knc_pmu_enable_event,
- .disable = knc_pmu_disable_event,
- .hw_config = x86_pmu_hw_config,
- .schedule_events = x86_schedule_events,
- .eventsel = MSR_KNC_EVNTSEL0,
- .perfctr = MSR_KNC_PERFCTR0,
- .event_map = knc_pmu_event_map,
- .max_events = ARRAY_SIZE(knc_perfmon_event_map),
- .apic = 1,
- .max_period = (1ULL << 39) - 1,
- .version = 0,
- .num_counters = 2,
- .cntval_bits = 40,
- .cntval_mask = (1ULL << 40) - 1,
- .get_event_constraints = x86_get_event_constraints,
- .event_constraints = knc_event_constraints,
- .format_attrs = intel_knc_formats_attr,
-};
-
-__init int knc_pmu_init(void)
-{
- x86_pmu = knc_pmu;
-
- memcpy(hw_cache_event_ids, knc_hw_cache_event_ids,
- sizeof(hw_cache_event_ids));
-
- return 0;
-}
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
deleted file mode 100644
index 3486e6660357..000000000000
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ /dev/null
@@ -1,1344 +0,0 @@
-/*
- * Netburst Performance Events (P4, old Xeon)
- *
- * Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov <gorcunov@openvz.org>
- * Copyright (C) 2010 Intel Corporation, Lin Ming <ming.m.lin@intel.com>
- *
- * For licencing details see kernel-base/COPYING
- */
-
-#include <linux/perf_event.h>
-
-#include <asm/perf_event_p4.h>
-#include <asm/hardirq.h>
-#include <asm/apic.h>
-
-#include "perf_event.h"
-
-#define P4_CNTR_LIMIT 3
-/*
- * array indices: 0,1 - HT threads, used with HT enabled cpu
- */
-struct p4_event_bind {
- unsigned int opcode; /* Event code and ESCR selector */
- unsigned int escr_msr[2]; /* ESCR MSR for this event */
- unsigned int escr_emask; /* valid ESCR EventMask bits */
- unsigned int shared; /* event is shared across threads */
- char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on abscence */
-};
-
-struct p4_pebs_bind {
- unsigned int metric_pebs;
- unsigned int metric_vert;
-};
-
-/* it sets P4_PEBS_ENABLE_UOP_TAG as well */
-#define P4_GEN_PEBS_BIND(name, pebs, vert) \
- [P4_PEBS_METRIC__##name] = { \
- .metric_pebs = pebs | P4_PEBS_ENABLE_UOP_TAG, \
- .metric_vert = vert, \
- }
-
-/*
- * note we have P4_PEBS_ENABLE_UOP_TAG always set here
- *
- * it's needed for mapping P4_PEBS_CONFIG_METRIC_MASK bits of
- * event configuration to find out which values are to be
- * written into MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT
- * resgisters
- */
-static struct p4_pebs_bind p4_pebs_bind_map[] = {
- P4_GEN_PEBS_BIND(1stl_cache_load_miss_retired, 0x0000001, 0x0000001),
- P4_GEN_PEBS_BIND(2ndl_cache_load_miss_retired, 0x0000002, 0x0000001),
- P4_GEN_PEBS_BIND(dtlb_load_miss_retired, 0x0000004, 0x0000001),
- P4_GEN_PEBS_BIND(dtlb_store_miss_retired, 0x0000004, 0x0000002),
- P4_GEN_PEBS_BIND(dtlb_all_miss_retired, 0x0000004, 0x0000003),
- P4_GEN_PEBS_BIND(tagged_mispred_branch, 0x0018000, 0x0000010),
- P4_GEN_PEBS_BIND(mob_load_replay_retired, 0x0000200, 0x0000001),
- P4_GEN_PEBS_BIND(split_load_retired, 0x0000400, 0x0000001),
- P4_GEN_PEBS_BIND(split_store_retired, 0x0000400, 0x0000002),
-};
-
-/*
- * Note that we don't use CCCR1 here, there is an
- * exception for P4_BSQ_ALLOCATION but we just have
- * no workaround
- *
- * consider this binding as resources which particular
- * event may borrow, it doesn't contain EventMask,
- * Tags and friends -- they are left to a caller
- */
-static struct p4_event_bind p4_event_bind_map[] = {
- [P4_EVENT_TC_DELIVER_MODE] = {
- .opcode = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE),
- .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
- .escr_emask =
- P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DD) |
- P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DB) |
- P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DI) |
- P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BD) |
- P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BB) |
- P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BI) |
- P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, ID),
- .shared = 1,
- .cntr = { {4, 5, -1}, {6, 7, -1} },
- },
- [P4_EVENT_BPU_FETCH_REQUEST] = {
- .opcode = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST),
- .escr_msr = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 },
- .escr_emask =
- P4_ESCR_EMASK_BIT(P4_EVENT_BPU_FETCH_REQUEST, TCMISS),
- .cntr = { {0, -1, -1}, {2, -1, -1} },
- },
- [P4_EVENT_ITLB_REFERENCE] = {
- .opcode = P4_OPCODE(P4_EVENT_ITLB_REFERENCE),
- .escr_msr = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
- .escr_emask =
- P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT) |
- P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, MISS) |
- P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT_UK),
- .cntr = { {0, -1, -1}, {2, -1, -1} },
- },
- [P4_EVENT_MEMORY_CANCEL] = {
- .opcode = P4_OPCODE(P4_EVENT_MEMORY_CANCEL),
- .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
- .escr_emask =
- P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, ST_RB_FULL) |
- P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, 64K_CONF),
- .cntr = { {8, 9, -1}, {10, 11, -1} },
- },
- [P4_EVENT_MEMORY_COMPLETE] = {
- .opcode = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE),
- .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
- .escr_emask =
- P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, LSC) |
- P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, SSC),
- .cntr = { {8, 9, -1}, {10, 11, -1} },
- },
- [P4_EVENT_LOAD_PORT_REPLAY] = {
- .opcode = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY),
- .escr_msr = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 },
- .escr_emask =
- P4_ESCR_EMASK_BIT(P4_EVENT_LOAD_PORT_REPLAY, SPLIT_LD),
- .cntr = { {8, 9, -1}, {10, 11, -1} },
- },
- [P4_EVENT_STORE_PORT_REPLAY] = {
- .opcode = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY),
- .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
- .escr_emask =
- P4_ESCR_EMASK_BIT(P4_EVENT_STORE_PORT_REPLAY, SPLIT_ST),
- .cntr = { {8, 9, -1}, {10, 11, -1} },
- },
- [P4_EVENT_MOB_LOAD_REPLAY] = {
- .opcode = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY),
- .escr_msr = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 },
- .escr_emask =
- P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STA) |
- P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STD) |
- P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, PARTIAL_DATA) |
- P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, UNALGN_ADDR),
- .cntr = { {0, -1, -1}, {2, -1, -1} },
- },
- [P4_EVENT_PAGE_WALK_TYPE] = {
- .opcode = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE),
- .escr_msr = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 },
- .escr_emask =
- P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, DTMISS) |
- P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, ITMISS),
- .shared = 1,
- .cntr = { {0, -1, -1}, {2, -1, -1} },
- },
- [P4_EVENT_BSQ_CACHE_REFERENCE] = {
- .opcode = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE),
- .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
- .escr_emask =
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS),
- .cntr = { {0, -1, -1}, {2, -1, -1} },
- },
- [P4_EVENT_IOQ_ALLOCATION] = {
- .opcode = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION),
- .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
- .escr_emask =
- P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, DEFAULT) |
- P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_READ) |
- P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_WRITE) |
- P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_UC) |
- P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WC) |
- P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WT) |
- P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WP) |
- P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WB) |
- P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OWN) |
- P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OTHER) |
- P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, PREFETCH),
- .cntr = { {0, -1, -1}, {2, -1, -1} },
- },
- [P4_EVENT_IOQ_ACTIVE_ENTRIES] = { /* shared ESCR */
- .opcode = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES),
- .escr_msr = { MSR_P4_FSB_ESCR1, MSR_P4_FSB_ESCR1 },
- .escr_emask =
- P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, DEFAULT) |
- P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_READ) |
- P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_WRITE) |
- P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_UC) |
- P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WC) |
- P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WT) |
- P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WP) |
- P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WB) |
- P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OWN) |
- P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OTHER) |
- P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, PREFETCH),
- .cntr = { {2, -1, -1}, {3, -1, -1} },
- },
- [P4_EVENT_FSB_DATA_ACTIVITY] = {
- .opcode = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY),
- .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
- .escr_emask =
- P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) |
- P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN) |
- P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OTHER) |
- P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_DRV) |
- P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OWN) |
- P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OTHER),
- .shared = 1,
- .cntr = { {0, -1, -1}, {2, -1, -1} },
- },
- [P4_EVENT_BSQ_ALLOCATION] = { /* shared ESCR, broken CCCR1 */
- .opcode = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION),
- .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 },
- .escr_emask =
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE0) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE1) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN0) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN1) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_IO_TYPE) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LOCK_TYPE) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_CACHE_TYPE) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_SPLIT_TYPE) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_DEM_TYPE) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_ORD_TYPE) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE0) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE1) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE2),
- .cntr = { {0, -1, -1}, {1, -1, -1} },
- },
- [P4_EVENT_BSQ_ACTIVE_ENTRIES] = { /* shared ESCR */
- .opcode = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES),
- .escr_msr = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 },
- .escr_emask =
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE0) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE1) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN0) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN1) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE0) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE1) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE2),
- .cntr = { {2, -1, -1}, {3, -1, -1} },
- },
- [P4_EVENT_SSE_INPUT_ASSIST] = {
- .opcode = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST),
- .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
- .escr_emask =
- P4_ESCR_EMASK_BIT(P4_EVENT_SSE_INPUT_ASSIST, ALL),
- .shared = 1,
- .cntr = { {8, 9, -1}, {10, 11, -1} },
- },
- [P4_EVENT_PACKED_SP_UOP] = {
- .opcode = P4_OPCODE(P4_EVENT_PACKED_SP_UOP),
- .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
- .escr_emask =
- P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_SP_UOP, ALL),
- .shared = 1,
- .cntr = { {8, 9, -1}, {10, 11, -1} },
- },
- [P4_EVENT_PACKED_DP_UOP] = {
- .opcode = P4_OPCODE(P4_EVENT_PACKED_DP_UOP),
- .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
- .escr_emask =
- P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_DP_UOP, ALL),
- .shared = 1,
- .cntr = { {8, 9, -1}, {10, 11, -1} },
- },
- [P4_EVENT_SCALAR_SP_UOP] = {
- .opcode = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP),
- .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
- .escr_emask =
- P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_SP_UOP, ALL),
- .shared = 1,
- .cntr = { {8, 9, -1}, {10, 11, -1} },
- },
- [P4_EVENT_SCALAR_DP_UOP] = {
- .opcode = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP),
- .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
- .escr_emask =
- P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_DP_UOP, ALL),
- .shared = 1,
- .cntr = { {8, 9, -1}, {10, 11, -1} },
- },
- [P4_EVENT_64BIT_MMX_UOP] = {
- .opcode = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP),
- .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
- .escr_emask =
- P4_ESCR_EMASK_BIT(P4_EVENT_64BIT_MMX_UOP, ALL),
- .shared = 1,
- .cntr = { {8, 9, -1}, {10, 11, -1} },
- },
- [P4_EVENT_128BIT_MMX_UOP] = {
- .opcode = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP),
- .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
- .escr_emask =
- P4_ESCR_EMASK_BIT(P4_EVENT_128BIT_MMX_UOP, ALL),
- .shared = 1,
- .cntr = { {8, 9, -1}, {10, 11, -1} },
- },
- [P4_EVENT_X87_FP_UOP] = {
- .opcode = P4_OPCODE(P4_EVENT_X87_FP_UOP),
- .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
- .escr_emask =
- P4_ESCR_EMASK_BIT(P4_EVENT_X87_FP_UOP, ALL),
- .shared = 1,
- .cntr = { {8, 9, -1}, {10, 11, -1} },
- },
- [P4_EVENT_TC_MISC] = {
- .opcode = P4_OPCODE(P4_EVENT_TC_MISC),
- .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
- .escr_emask =
- P4_ESCR_EMASK_BIT(P4_EVENT_TC_MISC, FLUSH),
- .cntr = { {4, 5, -1}, {6, 7, -1} },
- },
- [P4_EVENT_GLOBAL_POWER_EVENTS] = {
- .opcode = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS),
- .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
- .escr_emask =
- P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING),
- .cntr = { {0, -1, -1}, {2, -1, -1} },
- },
- [P4_EVENT_TC_MS_XFER] = {
- .opcode = P4_OPCODE(P4_EVENT_TC_MS_XFER),
- .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
- .escr_emask =
- P4_ESCR_EMASK_BIT(P4_EVENT_TC_MS_XFER, CISC),
- .cntr = { {4, 5, -1}, {6, 7, -1} },
- },
- [P4_EVENT_UOP_QUEUE_WRITES] = {
- .opcode = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES),
- .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
- .escr_emask =
- P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_BUILD) |
- P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_DELIVER) |
- P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_ROM),
- .cntr = { {4, 5, -1}, {6, 7, -1} },
- },
- [P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = {
- .opcode = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE),
- .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 },
- .escr_emask =
- P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL) |
- P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CALL) |
- P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, RETURN) |
- P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT),
- .cntr = { {4, 5, -1}, {6, 7, -1} },
- },
- [P4_EVENT_RETIRED_BRANCH_TYPE] = {
- .opcode = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE),
- .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 },
- .escr_emask =
- P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL) |
- P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL) |
- P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN) |
- P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT),
- .cntr = { {4, 5, -1}, {6, 7, -1} },
- },
- [P4_EVENT_RESOURCE_STALL] = {
- .opcode = P4_OPCODE(P4_EVENT_RESOURCE_STALL),
- .escr_msr = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 },
- .escr_emask =
- P4_ESCR_EMASK_BIT(P4_EVENT_RESOURCE_STALL, SBFULL),
- .cntr = { {12, 13, 16}, {14, 15, 17} },
- },
- [P4_EVENT_WC_BUFFER] = {
- .opcode = P4_OPCODE(P4_EVENT_WC_BUFFER),
- .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
- .escr_emask =
- P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_EVICTS) |
- P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_FULL_EVICTS),
- .shared = 1,
- .cntr = { {8, 9, -1}, {10, 11, -1} },
- },
- [P4_EVENT_B2B_CYCLES] = {
- .opcode = P4_OPCODE(P4_EVENT_B2B_CYCLES),
- .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
- .escr_emask = 0,
- .cntr = { {0, -1, -1}, {2, -1, -1} },
- },
- [P4_EVENT_BNR] = {
- .opcode = P4_OPCODE(P4_EVENT_BNR),
- .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
- .escr_emask = 0,
- .cntr = { {0, -1, -1}, {2, -1, -1} },
- },
- [P4_EVENT_SNOOP] = {
- .opcode = P4_OPCODE(P4_EVENT_SNOOP),
- .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
- .escr_emask = 0,
- .cntr = { {0, -1, -1}, {2, -1, -1} },
- },
- [P4_EVENT_RESPONSE] = {
- .opcode = P4_OPCODE(P4_EVENT_RESPONSE),
- .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
- .escr_emask = 0,
- .cntr = { {0, -1, -1}, {2, -1, -1} },
- },
- [P4_EVENT_FRONT_END_EVENT] = {
- .opcode = P4_OPCODE(P4_EVENT_FRONT_END_EVENT),
- .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
- .escr_emask =
- P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, NBOGUS) |
- P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, BOGUS),
- .cntr = { {12, 13, 16}, {14, 15, 17} },
- },
- [P4_EVENT_EXECUTION_EVENT] = {
- .opcode = P4_OPCODE(P4_EVENT_EXECUTION_EVENT),
- .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
- .escr_emask =
- P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0) |
- P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1) |
- P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2) |
- P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3) |
- P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) |
- P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) |
- P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) |
- P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3),
- .cntr = { {12, 13, 16}, {14, 15, 17} },
- },
- [P4_EVENT_REPLAY_EVENT] = {
- .opcode = P4_OPCODE(P4_EVENT_REPLAY_EVENT),
- .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
- .escr_emask =
- P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, NBOGUS) |
- P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, BOGUS),
- .cntr = { {12, 13, 16}, {14, 15, 17} },
- },
- [P4_EVENT_INSTR_RETIRED] = {
- .opcode = P4_OPCODE(P4_EVENT_INSTR_RETIRED),
- .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
- .escr_emask =
- P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG) |
- P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSTAG) |
- P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG) |
- P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSTAG),
- .cntr = { {12, 13, 16}, {14, 15, 17} },
- },
- [P4_EVENT_UOPS_RETIRED] = {
- .opcode = P4_OPCODE(P4_EVENT_UOPS_RETIRED),
- .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
- .escr_emask =
- P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, NBOGUS) |
- P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, BOGUS),
- .cntr = { {12, 13, 16}, {14, 15, 17} },
- },
- [P4_EVENT_UOP_TYPE] = {
- .opcode = P4_OPCODE(P4_EVENT_UOP_TYPE),
- .escr_msr = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 },
- .escr_emask =
- P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGLOADS) |
- P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGSTORES),
- .cntr = { {12, 13, 16}, {14, 15, 17} },
- },
- [P4_EVENT_BRANCH_RETIRED] = {
- .opcode = P4_OPCODE(P4_EVENT_BRANCH_RETIRED),
- .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
- .escr_emask =
- P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNP) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNM) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTP) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTM),
- .cntr = { {12, 13, 16}, {14, 15, 17} },
- },
- [P4_EVENT_MISPRED_BRANCH_RETIRED] = {
- .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED),
- .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
- .escr_emask =
- P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS),
- .cntr = { {12, 13, 16}, {14, 15, 17} },
- },
- [P4_EVENT_X87_ASSIST] = {
- .opcode = P4_OPCODE(P4_EVENT_X87_ASSIST),
- .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
- .escr_emask =
- P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSU) |
- P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSO) |
- P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAO) |
- P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAU) |
- P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, PREA),
- .cntr = { {12, 13, 16}, {14, 15, 17} },
- },
- [P4_EVENT_MACHINE_CLEAR] = {
- .opcode = P4_OPCODE(P4_EVENT_MACHINE_CLEAR),
- .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
- .escr_emask =
- P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, CLEAR) |
- P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, MOCLEAR) |
- P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, SMCLEAR),
- .cntr = { {12, 13, 16}, {14, 15, 17} },
- },
- [P4_EVENT_INSTR_COMPLETED] = {
- .opcode = P4_OPCODE(P4_EVENT_INSTR_COMPLETED),
- .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
- .escr_emask =
- P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, NBOGUS) |
- P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, BOGUS),
- .cntr = { {12, 13, 16}, {14, 15, 17} },
- },
-};
-
-#define P4_GEN_CACHE_EVENT(event, bit, metric) \
- p4_config_pack_escr(P4_ESCR_EVENT(event) | \
- P4_ESCR_EMASK_BIT(event, bit)) | \
- p4_config_pack_cccr(metric | \
- P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event))))
-
-static __initconst const u64 p4_hw_cache_event_ids
- [PERF_COUNT_HW_CACHE_MAX]
- [PERF_COUNT_HW_CACHE_OP_MAX]
- [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0,
- [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
- P4_PEBS_METRIC__1stl_cache_load_miss_retired),
- },
- },
- [ C(LL ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0,
- [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
- P4_PEBS_METRIC__2ndl_cache_load_miss_retired),
- },
-},
- [ C(DTLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0,
- [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
- P4_PEBS_METRIC__dtlb_load_miss_retired),
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x0,
- [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
- P4_PEBS_METRIC__dtlb_store_miss_retired),
- },
- },
- [ C(ITLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT,
- P4_PEBS_METRIC__none),
- [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS,
- P4_PEBS_METRIC__none),
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
- [ C(NODE) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
-};
-
-/*
- * Because of Netburst being quite restricted in how many
- * identical events may run simultaneously, we introduce event aliases,
- * ie the different events which have the same functionality but
- * utilize non-intersected resources (ESCR/CCCR/counter registers).
- *
- * This allow us to relax restrictions a bit and run two or more
- * identical events together.
- *
- * Never set any custom internal bits such as P4_CONFIG_HT,
- * P4_CONFIG_ALIASABLE or bits for P4_PEBS_METRIC, they are
- * either up to date automatically or not applicable at all.
- */
-struct p4_event_alias {
- u64 original;
- u64 alternative;
-} p4_event_aliases[] = {
- {
- /*
- * Non-halted cycles can be substituted with non-sleeping cycles (see
- * Intel SDM Vol3b for details). We need this alias to be able
- * to run nmi-watchdog and 'perf top' (or any other user space tool
- * which is interested in running PERF_COUNT_HW_CPU_CYCLES)
- * simultaneously.
- */
- .original =
- p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) |
- P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)),
- .alternative =
- p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) |
- P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)|
- P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)|
- P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2)|
- P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3)|
- P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) |
- P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) |
- P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) |
- P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3))|
- p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT |
- P4_CCCR_COMPARE),
- },
-};
-
-static u64 p4_get_alias_event(u64 config)
-{
- u64 config_match;
- int i;
-
- /*
- * Only event with special mark is allowed,
- * we're to be sure it didn't come as malformed
- * RAW event.
- */
- if (!(config & P4_CONFIG_ALIASABLE))
- return 0;
-
- config_match = config & P4_CONFIG_EVENT_ALIAS_MASK;
-
- for (i = 0; i < ARRAY_SIZE(p4_event_aliases); i++) {
- if (config_match == p4_event_aliases[i].original) {
- config_match = p4_event_aliases[i].alternative;
- break;
- } else if (config_match == p4_event_aliases[i].alternative) {
- config_match = p4_event_aliases[i].original;
- break;
- }
- }
-
- if (i >= ARRAY_SIZE(p4_event_aliases))
- return 0;
-
- return config_match | (config & P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS);
-}
-
-static u64 p4_general_events[PERF_COUNT_HW_MAX] = {
- /* non-halted CPU clocks */
- [PERF_COUNT_HW_CPU_CYCLES] =
- p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) |
- P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)) |
- P4_CONFIG_ALIASABLE,
-
- /*
- * retired instructions
- * in a sake of simplicity we don't use the FSB tagging
- */
- [PERF_COUNT_HW_INSTRUCTIONS] =
- p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_INSTR_RETIRED) |
- P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG) |
- P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG)),
-
- /* cache hits */
- [PERF_COUNT_HW_CACHE_REFERENCES] =
- p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM)),
-
- /* cache misses */
- [PERF_COUNT_HW_CACHE_MISSES] =
- p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS) |
- P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS)),
-
- /* branch instructions retired */
- [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] =
- p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_RETIRED_BRANCH_TYPE) |
- P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL) |
- P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL) |
- P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN) |
- P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT)),
-
- /* mispredicted branches retired */
- [PERF_COUNT_HW_BRANCH_MISSES] =
- p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_MISPRED_BRANCH_RETIRED) |
- P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS)),
-
- /* bus ready clocks (cpu is driving #DRDY_DRV\#DRDY_OWN): */
- [PERF_COUNT_HW_BUS_CYCLES] =
- p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_FSB_DATA_ACTIVITY) |
- P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) |
- P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN)) |
- p4_config_pack_cccr(P4_CCCR_EDGE | P4_CCCR_COMPARE),
-};
-
-static struct p4_event_bind *p4_config_get_bind(u64 config)
-{
- unsigned int evnt = p4_config_unpack_event(config);
- struct p4_event_bind *bind = NULL;
-
- if (evnt < ARRAY_SIZE(p4_event_bind_map))
- bind = &p4_event_bind_map[evnt];
-
- return bind;
-}
-
-static u64 p4_pmu_event_map(int hw_event)
-{
- struct p4_event_bind *bind;
- unsigned int esel;
- u64 config;
-
- config = p4_general_events[hw_event];
- bind = p4_config_get_bind(config);
- esel = P4_OPCODE_ESEL(bind->opcode);
- config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel));
-
- return config;
-}
-
-/* check cpu model specifics */
-static bool p4_event_match_cpu_model(unsigned int event_idx)
-{
- /* INSTR_COMPLETED event only exist for model 3, 4, 6 (Prescott) */
- if (event_idx == P4_EVENT_INSTR_COMPLETED) {
- if (boot_cpu_data.x86_model != 3 &&
- boot_cpu_data.x86_model != 4 &&
- boot_cpu_data.x86_model != 6)
- return false;
- }
-
- /*
- * For info
- * - IQ_ESCR0, IQ_ESCR1 only for models 1 and 2
- */
-
- return true;
-}
-
-static int p4_validate_raw_event(struct perf_event *event)
-{
- unsigned int v, emask;
-
- /* User data may have out-of-bound event index */
- v = p4_config_unpack_event(event->attr.config);
- if (v >= ARRAY_SIZE(p4_event_bind_map))
- return -EINVAL;
-
- /* It may be unsupported: */
- if (!p4_event_match_cpu_model(v))
- return -EINVAL;
-
- /*
- * NOTE: P4_CCCR_THREAD_ANY has not the same meaning as
- * in Architectural Performance Monitoring, it means not
- * on _which_ logical cpu to count but rather _when_, ie it
- * depends on logical cpu state -- count event if one cpu active,
- * none, both or any, so we just allow user to pass any value
- * desired.
- *
- * In turn we always set Tx_OS/Tx_USR bits bound to logical
- * cpu without their propagation to another cpu
- */
-
- /*
- * if an event is shared across the logical threads
- * the user needs special permissions to be able to use it
- */
- if (p4_ht_active() && p4_event_bind_map[v].shared) {
- if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
- return -EACCES;
- }
-
- /* ESCR EventMask bits may be invalid */
- emask = p4_config_unpack_escr(event->attr.config) & P4_ESCR_EVENTMASK_MASK;
- if (emask & ~p4_event_bind_map[v].escr_emask)
- return -EINVAL;
-
- /*
- * it may have some invalid PEBS bits
- */
- if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE))
- return -EINVAL;
-
- v = p4_config_unpack_metric(event->attr.config);
- if (v >= ARRAY_SIZE(p4_pebs_bind_map))
- return -EINVAL;
-
- return 0;
-}
-
-static int p4_hw_config(struct perf_event *event)
-{
- int cpu = get_cpu();
- int rc = 0;
- u32 escr, cccr;
-
- /*
- * the reason we use cpu that early is that: if we get scheduled
- * first time on the same cpu -- we will not need swap thread
- * specific flags in config (and will save some cpu cycles)
- */
-
- cccr = p4_default_cccr_conf(cpu);
- escr = p4_default_escr_conf(cpu, event->attr.exclude_kernel,
- event->attr.exclude_user);
- event->hw.config = p4_config_pack_escr(escr) |
- p4_config_pack_cccr(cccr);
-
- if (p4_ht_active() && p4_ht_thread(cpu))
- event->hw.config = p4_set_ht_bit(event->hw.config);
-
- if (event->attr.type == PERF_TYPE_RAW) {
- struct p4_event_bind *bind;
- unsigned int esel;
- /*
- * Clear bits we reserve to be managed by kernel itself
- * and never allowed from a user space
- */
- event->attr.config &= P4_CONFIG_MASK;
-
- rc = p4_validate_raw_event(event);
- if (rc)
- goto out;
-
- /*
- * Note that for RAW events we allow user to use P4_CCCR_RESERVED
- * bits since we keep additional info here (for cache events and etc)
- */
- event->hw.config |= event->attr.config;
- bind = p4_config_get_bind(event->attr.config);
- if (!bind) {
- rc = -EINVAL;
- goto out;
- }
- esel = P4_OPCODE_ESEL(bind->opcode);
- event->hw.config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel));
- }
-
- rc = x86_setup_perfctr(event);
-out:
- put_cpu();
- return rc;
-}
-
-static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
-{
- u64 v;
-
- /* an official way for overflow indication */
- rdmsrl(hwc->config_base, v);
- if (v & P4_CCCR_OVF) {
- wrmsrl(hwc->config_base, v & ~P4_CCCR_OVF);
- return 1;
- }
-
- /*
- * In some circumstances the overflow might issue an NMI but did
- * not set P4_CCCR_OVF bit. Because a counter holds a negative value
- * we simply check for high bit being set, if it's cleared it means
- * the counter has reached zero value and continued counting before
- * real NMI signal was received:
- */
- rdmsrl(hwc->event_base, v);
- if (!(v & ARCH_P4_UNFLAGGED_BIT))
- return 1;
-
- return 0;
-}
-
-static void p4_pmu_disable_pebs(void)
-{
- /*
- * FIXME
- *
- * It's still allowed that two threads setup same cache
- * events so we can't simply clear metrics until we knew
- * no one is depending on us, so we need kind of counter
- * for "ReplayEvent" users.
- *
- * What is more complex -- RAW events, if user (for some
- * reason) will pass some cache event metric with improper
- * event opcode -- it's fine from hardware point of view
- * but completely nonsense from "meaning" of such action.
- *
- * So at moment let leave metrics turned on forever -- it's
- * ok for now but need to be revisited!
- *
- * (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, 0);
- * (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, 0);
- */
-}
-
-static inline void p4_pmu_disable_event(struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
-
- /*
- * If event gets disabled while counter is in overflowed
- * state we need to clear P4_CCCR_OVF, otherwise interrupt get
- * asserted again and again
- */
- (void)wrmsrl_safe(hwc->config_base,
- p4_config_unpack_cccr(hwc->config) & ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED);
-}
-
-static void p4_pmu_disable_all(void)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- int idx;
-
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
- struct perf_event *event = cpuc->events[idx];
- if (!test_bit(idx, cpuc->active_mask))
- continue;
- p4_pmu_disable_event(event);
- }
-
- p4_pmu_disable_pebs();
-}
-
-/* configuration must be valid */
-static void p4_pmu_enable_pebs(u64 config)
-{
- struct p4_pebs_bind *bind;
- unsigned int idx;
-
- BUILD_BUG_ON(P4_PEBS_METRIC__max > P4_PEBS_CONFIG_METRIC_MASK);
-
- idx = p4_config_unpack_metric(config);
- if (idx == P4_PEBS_METRIC__none)
- return;
-
- bind = &p4_pebs_bind_map[idx];
-
- (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs);
- (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert);
-}
-
-static void p4_pmu_enable_event(struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- int thread = p4_ht_config_thread(hwc->config);
- u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config));
- unsigned int idx = p4_config_unpack_event(hwc->config);
- struct p4_event_bind *bind;
- u64 escr_addr, cccr;
-
- bind = &p4_event_bind_map[idx];
- escr_addr = bind->escr_msr[thread];
-
- /*
- * - we dont support cascaded counters yet
- * - and counter 1 is broken (erratum)
- */
- WARN_ON_ONCE(p4_is_event_cascaded(hwc->config));
- WARN_ON_ONCE(hwc->idx == 1);
-
- /* we need a real Event value */
- escr_conf &= ~P4_ESCR_EVENT_MASK;
- escr_conf |= P4_ESCR_EVENT(P4_OPCODE_EVNT(bind->opcode));
-
- cccr = p4_config_unpack_cccr(hwc->config);
-
- /*
- * it could be Cache event so we need to write metrics
- * into additional MSRs
- */
- p4_pmu_enable_pebs(hwc->config);
-
- (void)wrmsrl_safe(escr_addr, escr_conf);
- (void)wrmsrl_safe(hwc->config_base,
- (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE);
-}
-
-static void p4_pmu_enable_all(int added)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- int idx;
-
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
- struct perf_event *event = cpuc->events[idx];
- if (!test_bit(idx, cpuc->active_mask))
- continue;
- p4_pmu_enable_event(event);
- }
-}
-
-static int p4_pmu_handle_irq(struct pt_regs *regs)
-{
- struct perf_sample_data data;
- struct cpu_hw_events *cpuc;
- struct perf_event *event;
- struct hw_perf_event *hwc;
- int idx, handled = 0;
- u64 val;
-
- cpuc = &__get_cpu_var(cpu_hw_events);
-
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
- int overflow;
-
- if (!test_bit(idx, cpuc->active_mask)) {
- /* catch in-flight IRQs */
- if (__test_and_clear_bit(idx, cpuc->running))
- handled++;
- continue;
- }
-
- event = cpuc->events[idx];
- hwc = &event->hw;
-
- WARN_ON_ONCE(hwc->idx != idx);
-
- /* it might be unflagged overflow */
- overflow = p4_pmu_clear_cccr_ovf(hwc);
-
- val = x86_perf_event_update(event);
- if (!overflow && (val & (1ULL << (x86_pmu.cntval_bits - 1))))
- continue;
-
- handled += overflow;
-
- /* event overflow for sure */
- perf_sample_data_init(&data, 0, hwc->last_period);
-
- if (!x86_perf_event_set_period(event))
- continue;
-
-
- if (perf_event_overflow(event, &data, regs))
- x86_pmu_stop(event, 0);
- }
-
- if (handled)
- inc_irq_stat(apic_perf_irqs);
-
- /*
- * When dealing with the unmasking of the LVTPC on P4 perf hw, it has
- * been observed that the OVF bit flag has to be cleared first _before_
- * the LVTPC can be unmasked.
- *
- * The reason is the NMI line will continue to be asserted while the OVF
- * bit is set. This causes a second NMI to generate if the LVTPC is
- * unmasked before the OVF bit is cleared, leading to unknown NMI
- * messages.
- */
- apic_write(APIC_LVTPC, APIC_DM_NMI);
-
- return handled;
-}
-
-/*
- * swap thread specific fields according to a thread
- * we are going to run on
- */
-static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu)
-{
- u32 escr, cccr;
-
- /*
- * we either lucky and continue on same cpu or no HT support
- */
- if (!p4_should_swap_ts(hwc->config, cpu))
- return;
-
- /*
- * the event is migrated from an another logical
- * cpu, so we need to swap thread specific flags
- */
-
- escr = p4_config_unpack_escr(hwc->config);
- cccr = p4_config_unpack_cccr(hwc->config);
-
- if (p4_ht_thread(cpu)) {
- cccr &= ~P4_CCCR_OVF_PMI_T0;
- cccr |= P4_CCCR_OVF_PMI_T1;
- if (escr & P4_ESCR_T0_OS) {
- escr &= ~P4_ESCR_T0_OS;
- escr |= P4_ESCR_T1_OS;
- }
- if (escr & P4_ESCR_T0_USR) {
- escr &= ~P4_ESCR_T0_USR;
- escr |= P4_ESCR_T1_USR;
- }
- hwc->config = p4_config_pack_escr(escr);
- hwc->config |= p4_config_pack_cccr(cccr);
- hwc->config |= P4_CONFIG_HT;
- } else {
- cccr &= ~P4_CCCR_OVF_PMI_T1;
- cccr |= P4_CCCR_OVF_PMI_T0;
- if (escr & P4_ESCR_T1_OS) {
- escr &= ~P4_ESCR_T1_OS;
- escr |= P4_ESCR_T0_OS;
- }
- if (escr & P4_ESCR_T1_USR) {
- escr &= ~P4_ESCR_T1_USR;
- escr |= P4_ESCR_T0_USR;
- }
- hwc->config = p4_config_pack_escr(escr);
- hwc->config |= p4_config_pack_cccr(cccr);
- hwc->config &= ~P4_CONFIG_HT;
- }
-}
-
-/*
- * ESCR address hashing is tricky, ESCRs are not sequential
- * in memory but all starts from MSR_P4_BSU_ESCR0 (0x03a0) and
- * the metric between any ESCRs is laid in range [0xa0,0xe1]
- *
- * so we make ~70% filled hashtable
- */
-
-#define P4_ESCR_MSR_BASE 0x000003a0
-#define P4_ESCR_MSR_MAX 0x000003e1
-#define P4_ESCR_MSR_TABLE_SIZE (P4_ESCR_MSR_MAX - P4_ESCR_MSR_BASE + 1)
-#define P4_ESCR_MSR_IDX(msr) (msr - P4_ESCR_MSR_BASE)
-#define P4_ESCR_MSR_TABLE_ENTRY(msr) [P4_ESCR_MSR_IDX(msr)] = msr
-
-static const unsigned int p4_escr_table[P4_ESCR_MSR_TABLE_SIZE] = {
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR0),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR1),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR0),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR1),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR0),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR1),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR0),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR1),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR2),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR3),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR4),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR5),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR0),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR1),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR0),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR1),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR0),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR1),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR0),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR1),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR0),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR1),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR0),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR1),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR0),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR1),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR0),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR1),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR0),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR1),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR0),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR1),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR0),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR1),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR0),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR1),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR0),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR1),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR0),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR1),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR0),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR1),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR0),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR1),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR0),
- P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR1),
-};
-
-static int p4_get_escr_idx(unsigned int addr)
-{
- unsigned int idx = P4_ESCR_MSR_IDX(addr);
-
- if (unlikely(idx >= P4_ESCR_MSR_TABLE_SIZE ||
- !p4_escr_table[idx] ||
- p4_escr_table[idx] != addr)) {
- WARN_ONCE(1, "P4 PMU: Wrong address passed: %x\n", addr);
- return -1;
- }
-
- return idx;
-}
-
-static int p4_next_cntr(int thread, unsigned long *used_mask,
- struct p4_event_bind *bind)
-{
- int i, j;
-
- for (i = 0; i < P4_CNTR_LIMIT; i++) {
- j = bind->cntr[thread][i];
- if (j != -1 && !test_bit(j, used_mask))
- return j;
- }
-
- return -1;
-}
-
-static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
-{
- unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
- unsigned long escr_mask[BITS_TO_LONGS(P4_ESCR_MSR_TABLE_SIZE)];
- int cpu = smp_processor_id();
- struct hw_perf_event *hwc;
- struct p4_event_bind *bind;
- unsigned int i, thread, num;
- int cntr_idx, escr_idx;
- u64 config_alias;
- int pass;
-
- bitmap_zero(used_mask, X86_PMC_IDX_MAX);
- bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE);
-
- for (i = 0, num = n; i < n; i++, num--) {
-
- hwc = &cpuc->event_list[i]->hw;
- thread = p4_ht_thread(cpu);
- pass = 0;
-
-again:
- /*
- * It's possible to hit a circular lock
- * between original and alternative events
- * if both are scheduled already.
- */
- if (pass > 2)
- goto done;
-
- bind = p4_config_get_bind(hwc->config);
- escr_idx = p4_get_escr_idx(bind->escr_msr[thread]);
- if (unlikely(escr_idx == -1))
- goto done;
-
- if (hwc->idx != -1 && !p4_should_swap_ts(hwc->config, cpu)) {
- cntr_idx = hwc->idx;
- if (assign)
- assign[i] = hwc->idx;
- goto reserve;
- }
-
- cntr_idx = p4_next_cntr(thread, used_mask, bind);
- if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) {
- /*
- * Check whether an event alias is still available.
- */
- config_alias = p4_get_alias_event(hwc->config);
- if (!config_alias)
- goto done;
- hwc->config = config_alias;
- pass++;
- goto again;
- }
-
- p4_pmu_swap_config_ts(hwc, cpu);
- if (assign)
- assign[i] = cntr_idx;
-reserve:
- set_bit(cntr_idx, used_mask);
- set_bit(escr_idx, escr_mask);
- }
-
-done:
- return num ? -EINVAL : 0;
-}
-
-PMU_FORMAT_ATTR(cccr, "config:0-31" );
-PMU_FORMAT_ATTR(escr, "config:32-62");
-PMU_FORMAT_ATTR(ht, "config:63" );
-
-static struct attribute *intel_p4_formats_attr[] = {
- &format_attr_cccr.attr,
- &format_attr_escr.attr,
- &format_attr_ht.attr,
- NULL,
-};
-
-static __initconst const struct x86_pmu p4_pmu = {
- .name = "Netburst P4/Xeon",
- .handle_irq = p4_pmu_handle_irq,
- .disable_all = p4_pmu_disable_all,
- .enable_all = p4_pmu_enable_all,
- .enable = p4_pmu_enable_event,
- .disable = p4_pmu_disable_event,
- .eventsel = MSR_P4_BPU_CCCR0,
- .perfctr = MSR_P4_BPU_PERFCTR0,
- .event_map = p4_pmu_event_map,
- .max_events = ARRAY_SIZE(p4_general_events),
- .get_event_constraints = x86_get_event_constraints,
- /*
- * IF HT disabled we may need to use all
- * ARCH_P4_MAX_CCCR counters simulaneously
- * though leave it restricted at moment assuming
- * HT is on
- */
- .num_counters = ARCH_P4_MAX_CCCR,
- .apic = 1,
- .cntval_bits = ARCH_P4_CNTRVAL_BITS,
- .cntval_mask = ARCH_P4_CNTRVAL_MASK,
- .max_period = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1,
- .hw_config = p4_hw_config,
- .schedule_events = p4_pmu_schedule_events,
- /*
- * This handles erratum N15 in intel doc 249199-029,
- * the counter may not be updated correctly on write
- * so we need a second write operation to do the trick
- * (the official workaround didn't work)
- *
- * the former idea is taken from OProfile code
- */
- .perfctr_second_write = 1,
-
- .format_attrs = intel_p4_formats_attr,
-};
-
-__init int p4_pmu_init(void)
-{
- unsigned int low, high;
-
- /* If we get stripped -- indexing fails */
- BUILD_BUG_ON(ARCH_P4_MAX_CCCR > INTEL_PMC_MAX_GENERIC);
-
- rdmsr(MSR_IA32_MISC_ENABLE, low, high);
- if (!(low & (1 << 7))) {
- pr_cont("unsupported Netburst CPU model %d ",
- boot_cpu_data.x86_model);
- return -ENODEV;
- }
-
- memcpy(hw_cache_event_ids, p4_hw_cache_event_ids,
- sizeof(hw_cache_event_ids));
-
- pr_cont("Netburst events, ");
-
- x86_pmu = p4_pmu;
-
- return 0;
-}
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
deleted file mode 100644
index b1e2fe115323..000000000000
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ /dev/null
@@ -1,261 +0,0 @@
-#include <linux/perf_event.h>
-#include <linux/types.h>
-
-#include "perf_event.h"
-
-/*
- * Not sure about some of these
- */
-static const u64 p6_perfmon_event_map[] =
-{
- [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, /* CPU_CLK_UNHALTED */
- [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, /* INST_RETIRED */
- [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, /* L2_RQSTS:M:E:S:I */
- [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, /* L2_RQSTS:I */
- [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, /* BR_INST_RETIRED */
- [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, /* BR_MISS_PRED_RETIRED */
- [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, /* BUS_DRDY_CLOCKS */
- [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00a2, /* RESOURCE_STALLS */
-
-};
-
-static const u64 __initconst p6_hw_cache_event_ids
- [PERF_COUNT_HW_CACHE_MAX]
- [PERF_COUNT_HW_CACHE_OP_MAX]
- [PERF_COUNT_HW_CACHE_RESULT_MAX] =
-{
- [ C(L1D) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0043, /* DATA_MEM_REFS */
- [ C(RESULT_MISS) ] = 0x0045, /* DCU_LINES_IN */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0x0f29, /* L2_LD:M:E:S:I */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(L1I ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0080, /* IFU_IFETCH */
- [ C(RESULT_MISS) ] = 0x0f28, /* L2_IFETCH:M:E:S:I */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(LL ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0x0025, /* L2_M_LINES_INM */
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(DTLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0043, /* DATA_MEM_REFS */
- [ C(RESULT_MISS) ] = 0,
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = 0,
- [ C(RESULT_MISS) ] = 0,
- },
- },
- [ C(ITLB) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0080, /* IFU_IFETCH */
- [ C(RESULT_MISS) ] = 0x0085, /* ITLB_MISS */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
- [ C(BPU ) ] = {
- [ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED */
- [ C(RESULT_MISS) ] = 0x00c5, /* BR_MISS_PRED_RETIRED */
- },
- [ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- [ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = -1,
- [ C(RESULT_MISS) ] = -1,
- },
- },
-};
-
-static u64 p6_pmu_event_map(int hw_event)
-{
- return p6_perfmon_event_map[hw_event];
-}
-
-/*
- * Event setting that is specified not to count anything.
- * We use this to effectively disable a counter.
- *
- * L2_RQSTS with 0 MESI unit mask.
- */
-#define P6_NOP_EVENT 0x0000002EULL
-
-static struct event_constraint p6_event_constraints[] =
-{
- INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */
- INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
- INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
- INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
- INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
- INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
- EVENT_CONSTRAINT_END
-};
-
-static void p6_pmu_disable_all(void)
-{
- u64 val;
-
- /* p6 only has one enable register */
- rdmsrl(MSR_P6_EVNTSEL0, val);
- val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
- wrmsrl(MSR_P6_EVNTSEL0, val);
-}
-
-static void p6_pmu_enable_all(int added)
-{
- unsigned long val;
-
- /* p6 only has one enable register */
- rdmsrl(MSR_P6_EVNTSEL0, val);
- val |= ARCH_PERFMON_EVENTSEL_ENABLE;
- wrmsrl(MSR_P6_EVNTSEL0, val);
-}
-
-static inline void
-p6_pmu_disable_event(struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- u64 val = P6_NOP_EVENT;
-
- (void)wrmsrl_safe(hwc->config_base, val);
-}
-
-static void p6_pmu_enable_event(struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- u64 val;
-
- val = hwc->config;
-
- /*
- * p6 only has a global event enable, set on PerfEvtSel0
- * We "disable" events by programming P6_NOP_EVENT
- * and we rely on p6_pmu_enable_all() being called
- * to actually enable the events.
- */
-
- (void)wrmsrl_safe(hwc->config_base, val);
-}
-
-PMU_FORMAT_ATTR(event, "config:0-7" );
-PMU_FORMAT_ATTR(umask, "config:8-15" );
-PMU_FORMAT_ATTR(edge, "config:18" );
-PMU_FORMAT_ATTR(pc, "config:19" );
-PMU_FORMAT_ATTR(inv, "config:23" );
-PMU_FORMAT_ATTR(cmask, "config:24-31" );
-
-static struct attribute *intel_p6_formats_attr[] = {
- &format_attr_event.attr,
- &format_attr_umask.attr,
- &format_attr_edge.attr,
- &format_attr_pc.attr,
- &format_attr_inv.attr,
- &format_attr_cmask.attr,
- NULL,
-};
-
-static __initconst const struct x86_pmu p6_pmu = {
- .name = "p6",
- .handle_irq = x86_pmu_handle_irq,
- .disable_all = p6_pmu_disable_all,
- .enable_all = p6_pmu_enable_all,
- .enable = p6_pmu_enable_event,
- .disable = p6_pmu_disable_event,
- .hw_config = x86_pmu_hw_config,
- .schedule_events = x86_schedule_events,
- .eventsel = MSR_P6_EVNTSEL0,
- .perfctr = MSR_P6_PERFCTR0,
- .event_map = p6_pmu_event_map,
- .max_events = ARRAY_SIZE(p6_perfmon_event_map),
- .apic = 1,
- .max_period = (1ULL << 31) - 1,
- .version = 0,
- .num_counters = 2,
- /*
- * Events have 40 bits implemented. However they are designed such
- * that bits [32-39] are sign extensions of bit 31. As such the
- * effective width of a event for P6-like PMU is 32 bits only.
- *
- * See IA-32 Intel Architecture Software developer manual Vol 3B
- */
- .cntval_bits = 32,
- .cntval_mask = (1ULL << 32) - 1,
- .get_event_constraints = x86_get_event_constraints,
- .event_constraints = p6_event_constraints,
-
- .format_attrs = intel_p6_formats_attr,
- .events_sysfs_show = intel_event_sysfs_show,
-
-};
-
-__init int p6_pmu_init(void)
-{
- switch (boot_cpu_data.x86_model) {
- case 1:
- case 3: /* Pentium Pro */
- case 5:
- case 6: /* Pentium II */
- case 7:
- case 8:
- case 11: /* Pentium III */
- case 9:
- case 13:
- /* Pentium M */
- break;
- default:
- pr_cont("unsupported p6 CPU model %d ",
- boot_cpu_data.x86_model);
- return -ENODEV;
- }
-
- x86_pmu = p6_pmu;
-
- memcpy(hw_cache_event_ids, p6_hw_cache_event_ids,
- sizeof(hw_cache_event_ids));
-
-
- return 0;
-}
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 2e8caf03f593..7aecb2fc3186 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -1,8 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* local apic based NMI watchdog for various CPUs.
*
* This file also handles reservation of performance counters for coordination
- * with other users (like oprofile).
+ * with other users.
*
* Note that these events normally don't tick when the CPU idles. This means
* the frequency varies with CPU load.
@@ -12,7 +13,7 @@
*/
#include <linux/percpu.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/bitops.h>
#include <linux/smp.h>
@@ -45,6 +46,7 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
{
/* returns the bit offset of the performance counter register */
switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_HYGON:
case X86_VENDOR_AMD:
if (msr >= MSR_F15H_PERF_CTR)
return (msr - MSR_F15H_PERF_CTR) >> 1;
@@ -61,6 +63,10 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
case 15:
return msr - MSR_P4_BPU_PERFCTR0;
}
+ break;
+ case X86_VENDOR_ZHAOXIN:
+ case X86_VENDOR_CENTAUR:
+ return msr - MSR_ARCH_PERFMON_PERFCTR0;
}
return 0;
}
@@ -73,6 +79,7 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
{
/* returns the bit offset of the event selection register */
switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_HYGON:
case X86_VENDOR_AMD:
if (msr >= MSR_F15H_PERF_CTL)
return (msr - MSR_F15H_PERF_CTL) >> 1;
@@ -89,20 +96,15 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
case 15:
return msr - MSR_P4_BSU_ESCR0;
}
+ break;
+ case X86_VENDOR_ZHAOXIN:
+ case X86_VENDOR_CENTAUR:
+ return msr - MSR_ARCH_PERFMON_EVENTSEL0;
}
return 0;
}
-/* checks for a bit availability (hack for oprofile) */
-int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
-{
- BUG_ON(counter > NMI_MAX_COUNTER_BITS);
-
- return !test_bit(counter, perfctr_nmi_owner);
-}
-EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
-
int reserve_perfctr_nmi(unsigned int msr)
{
unsigned int counter;
diff --git a/arch/x86/kernel/cpu/powerflags.c b/arch/x86/kernel/cpu/powerflags.c
index 31f0f335ed22..fd6ec2aa0303 100644
--- a/arch/x86/kernel/cpu/powerflags.c
+++ b/arch/x86/kernel/cpu/powerflags.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Strings for the various x86 power flags
*
@@ -18,4 +19,6 @@ const char *const x86_power_flags[32] = {
"", /* tsc invariant mapped to constant_tsc */
"cpb", /* core performance boost */
"eff_freq_ro", /* Readonly aperf/mperf */
+ "proc_feedback", /* processor feedback interface */
+ "acc_power", /* accumulated power mechanism */
};
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index aee6317b902f..6571d432cbe3 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -1,8 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/smp.h>
#include <linux/timex.h>
#include <linux/string.h>
#include <linux/seq_file.h>
#include <linux/cpufreq.h>
+#include <asm/prctl.h>
+#include <linux/proc_fs.h>
+
+#include "cpu.h"
+
+#ifdef CONFIG_X86_VMX_FEATURE_NAMES
+extern const char * const x86_vmx_flags[NVMXINTS*32];
+#endif
/*
* Get CPU information for use by the procfs.
@@ -11,15 +20,13 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
unsigned int cpu)
{
#ifdef CONFIG_SMP
- if (c->x86_max_cores * smp_num_siblings > 1) {
- seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
- seq_printf(m, "siblings\t: %d\n",
- cpumask_weight(cpu_core_mask(cpu)));
- seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
- seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
- seq_printf(m, "apicid\t\t: %d\n", c->apicid);
- seq_printf(m, "initial apicid\t: %d\n", c->initial_apicid);
- }
+ seq_printf(m, "physical id\t: %d\n", c->topo.pkg_id);
+ seq_printf(m, "siblings\t: %d\n",
+ cpumask_weight(topology_core_cpumask(cpu)));
+ seq_printf(m, "core id\t\t: %d\n", c->topo.core_id);
+ seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
+ seq_printf(m, "apicid\t\t: %d\n", c->topo.apicid);
+ seq_printf(m, "initial apicid\t: %d\n", c->topo.initial_apicid);
#endif
}
@@ -33,14 +40,13 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
"fpu\t\t: %s\n"
"fpu_exception\t: %s\n"
"cpuid level\t: %d\n"
- "wp\t\t: %s\n",
- static_cpu_has_bug(X86_BUG_FDIV) ? "yes" : "no",
- static_cpu_has_bug(X86_BUG_F00F) ? "yes" : "no",
- static_cpu_has_bug(X86_BUG_COMA) ? "yes" : "no",
- static_cpu_has(X86_FEATURE_FPU) ? "yes" : "no",
- static_cpu_has(X86_FEATURE_FPU) ? "yes" : "no",
- c->cpuid_level,
- c->wp_works_ok ? "yes" : "no");
+ "wp\t\t: yes\n",
+ str_yes_no(boot_cpu_has_bug(X86_BUG_FDIV)),
+ str_yes_no(boot_cpu_has_bug(X86_BUG_F00F)),
+ str_yes_no(boot_cpu_has_bug(X86_BUG_COMA)),
+ str_yes_no(boot_cpu_has(X86_FEATURE_FPU)),
+ str_yes_no(boot_cpu_has(X86_FEATURE_FPU)),
+ c->cpuid_level);
}
#else
static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
@@ -72,34 +78,53 @@ static int show_cpuinfo(struct seq_file *m, void *v)
c->x86_model,
c->x86_model_id[0] ? c->x86_model_id : "unknown");
- if (c->x86_mask || c->cpuid_level >= 0)
- seq_printf(m, "stepping\t: %d\n", c->x86_mask);
+ if (c->x86_stepping || c->cpuid_level >= 0)
+ seq_printf(m, "stepping\t: %d\n", c->x86_stepping);
else
- seq_printf(m, "stepping\t: unknown\n");
+ seq_puts(m, "stepping\t: unknown\n");
if (c->microcode)
seq_printf(m, "microcode\t: 0x%x\n", c->microcode);
if (cpu_has(c, X86_FEATURE_TSC)) {
- unsigned int freq = cpufreq_quick_get(cpu);
+ int freq = arch_freq_get_on_cpu(cpu);
- if (!freq)
- freq = cpu_khz;
- seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
- freq / 1000, (freq % 1000));
+ if (freq < 0)
+ seq_puts(m, "cpu MHz\t\t: Unknown\n");
+ else
+ seq_printf(m, "cpu MHz\t\t: %u.%03u\n", freq / 1000, (freq % 1000));
}
/* Cache size */
- if (c->x86_cache_size >= 0)
- seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
+ if (c->x86_cache_size)
+ seq_printf(m, "cache size\t: %u KB\n", c->x86_cache_size);
show_cpuinfo_core(m, c, cpu);
show_cpuinfo_misc(m, c);
- seq_printf(m, "flags\t\t:");
+ seq_puts(m, "flags\t\t:");
for (i = 0; i < 32*NCAPINTS; i++)
if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
seq_printf(m, " %s", x86_cap_flags[i]);
+#ifdef CONFIG_X86_VMX_FEATURE_NAMES
+ if (cpu_has(c, X86_FEATURE_VMX) && c->vmx_capability[0]) {
+ seq_puts(m, "\nvmx flags\t:");
+ for (i = 0; i < 32*NVMXINTS; i++) {
+ if (test_bit(i, (unsigned long *)c->vmx_capability) &&
+ x86_vmx_flags[i] != NULL)
+ seq_printf(m, " %s", x86_vmx_flags[i]);
+ }
+ }
+#endif
+
+ seq_puts(m, "\nbugs\t\t:");
+ for (i = 0; i < 32*NBUGINTS; i++) {
+ unsigned int bug_bit = 32*NCAPINTS + i;
+
+ if (cpu_has_bug(c, bug_bit) && x86_bug_flags[i])
+ seq_printf(m, " %s", x86_bug_flags[i]);
+ }
+
seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
c->loops_per_jiffy/(500000/HZ),
(c->loops_per_jiffy/(5000/HZ)) % 100);
@@ -113,7 +138,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
c->x86_phys_bits, c->x86_virt_bits);
- seq_printf(m, "power management:");
+ seq_puts(m, "power management:");
for (i = 0; i < 32; i++) {
if (c->x86_power & (1 << i)) {
if (i < ARRAY_SIZE(x86_power_flags) &&
@@ -126,7 +151,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
}
}
- seq_printf(m, "\n\n");
+ seq_puts(m, "\n\n");
return 0;
}
@@ -155,3 +180,24 @@ const struct seq_operations cpuinfo_op = {
.stop = c_stop,
.show = show_cpuinfo,
};
+
+#ifdef CONFIG_X86_USER_SHADOW_STACK
+static void dump_x86_features(struct seq_file *m, unsigned long features)
+{
+ if (features & ARCH_SHSTK_SHSTK)
+ seq_puts(m, "shstk ");
+ if (features & ARCH_SHSTK_WRSS)
+ seq_puts(m, "wrss ");
+}
+
+void arch_proc_pid_thread_features(struct seq_file *m, struct task_struct *task)
+{
+ seq_puts(m, "x86_Thread_features:\t");
+ dump_x86_features(m, task->thread.features);
+ seq_putc(m, '\n');
+
+ seq_puts(m, "x86_Thread_features_locked:\t");
+ dump_x86_features(m, task->thread.features_locked);
+ seq_putc(m, '\n');
+}
+#endif /* CONFIG_X86_USER_SHADOW_STACK */
diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c
index feca286c2bb4..eeac00d20926 100644
--- a/arch/x86/kernel/cpu/rdrand.c
+++ b/arch/x86/kernel/cpu/rdrand.c
@@ -1,73 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* This file is part of the Linux kernel.
*
* Copyright (c) 2011, Intel Corporation
* Authors: Fenghua Yu <fenghua.yu@intel.com>,
* H. Peter Anvin <hpa@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
- *
*/
+#include <linux/printk.h>
#include <asm/processor.h>
#include <asm/archrandom.h>
#include <asm/sections.h>
-static int __init x86_rdrand_setup(char *s)
-{
- setup_clear_cpu_cap(X86_FEATURE_RDRAND);
- return 1;
-}
-__setup("nordrand", x86_rdrand_setup);
-
-/* We can't use arch_get_random_long() here since alternatives haven't run */
-static inline int rdrand_long(unsigned long *v)
-{
- int ok;
- asm volatile("1: " RDRAND_LONG "\n\t"
- "jc 2f\n\t"
- "decl %0\n\t"
- "jnz 1b\n\t"
- "2:"
- : "=r" (ok), "=a" (*v)
- : "0" (RDRAND_RETRY_LOOPS));
- return ok;
-}
-
/*
- * Force a reseed cycle; we are architecturally guaranteed a reseed
- * after no more than 512 128-bit chunks of random data. This also
- * acts as a test of the CPU capability.
+ * RDRAND has Built-In-Self-Test (BIST) that runs on every invocation.
+ * Run the instruction a few times as a sanity check. Also make sure
+ * it's not outputting the same value over and over, which has happened
+ * as a result of past CPU bugs.
+ *
+ * If it fails, it is simple to disable RDRAND and RDSEED here.
*/
-#define RESEED_LOOP ((512*128)/sizeof(unsigned long))
-void __cpuinit x86_init_rdrand(struct cpuinfo_x86 *c)
+void x86_init_rdrand(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_ARCH_RANDOM
- unsigned long tmp;
- int i, count, ok;
+ enum { SAMPLES = 8, MIN_CHANGE = 5 };
+ unsigned long sample, prev;
+ bool failure = false;
+ size_t i, changed;
if (!cpu_has(c, X86_FEATURE_RDRAND))
- return; /* Nothing to do */
-
- for (count = i = 0; i < RESEED_LOOP; i++) {
- ok = rdrand_long(&tmp);
- if (ok)
- count++;
+ return;
+
+ for (changed = 0, i = 0; i < SAMPLES; ++i) {
+ if (!rdrand_long(&sample)) {
+ failure = true;
+ break;
+ }
+ changed += i && sample != prev;
+ prev = sample;
}
+ if (changed < MIN_CHANGE)
+ failure = true;
- if (count != RESEED_LOOP)
+ if (failure) {
clear_cpu_cap(c, X86_FEATURE_RDRAND);
-#endif
+ clear_cpu_cap(c, X86_FEATURE_RDSEED);
+ pr_emerg("RDRAND is not reliable on this platform; disabling.\n");
+ }
}
diff --git a/arch/x86/kernel/cpu/resctrl/Makefile b/arch/x86/kernel/cpu/resctrl/Makefile
new file mode 100644
index 000000000000..d8a04b195da2
--- /dev/null
+++ b/arch/x86/kernel/cpu/resctrl/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_X86_CPU_RESCTRL) += core.o rdtgroup.o monitor.o
+obj-$(CONFIG_X86_CPU_RESCTRL) += ctrlmondata.o
+obj-$(CONFIG_RESCTRL_FS_PSEUDO_LOCK) += pseudo_lock.o
+
+# To allow define_trace.h's recursive include:
+CFLAGS_pseudo_lock.o = -I$(src)
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
new file mode 100644
index 000000000000..3792ab4819dc
--- /dev/null
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -0,0 +1,1079 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Resource Director Technology(RDT)
+ * - Cache Allocation code.
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Authors:
+ * Fenghua Yu <fenghua.yu@intel.com>
+ * Tony Luck <tony.luck@intel.com>
+ * Vikas Shivappa <vikas.shivappa@intel.com>
+ *
+ * More information about RDT be found in the Intel (R) x86 Architecture
+ * Software Developer Manual June 2016, volume 3, section 17.17.
+ */
+
+#define pr_fmt(fmt) "resctrl: " fmt
+
+#include <linux/cpu.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/cpuhotplug.h>
+
+#include <asm/cpu_device_id.h>
+#include <asm/msr.h>
+#include <asm/resctrl.h>
+#include "internal.h"
+
+/*
+ * rdt_domain structures are kfree()d when their last CPU goes offline,
+ * and allocated when the first CPU in a new domain comes online.
+ * The rdt_resource's domain list is updated when this happens. Readers of
+ * the domain list must either take cpus_read_lock(), or rely on an RCU
+ * read-side critical section, to avoid observing concurrent modification.
+ * All writers take this mutex:
+ */
+static DEFINE_MUTEX(domain_list_lock);
+
+/*
+ * The cached resctrl_pqr_state is strictly per CPU and can never be
+ * updated from a remote CPU. Functions which modify the state
+ * are called with interrupts disabled and no preemption, which
+ * is sufficient for the protection.
+ */
+DEFINE_PER_CPU(struct resctrl_pqr_state, pqr_state);
+
+/*
+ * Global boolean for rdt_alloc which is true if any
+ * resource allocation is enabled.
+ */
+bool rdt_alloc_capable;
+
+static void mba_wrmsr_intel(struct msr_param *m);
+static void cat_wrmsr(struct msr_param *m);
+static void mba_wrmsr_amd(struct msr_param *m);
+
+#define ctrl_domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.ctrl_domains)
+#define mon_domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.mon_domains)
+
+struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = {
+ [RDT_RESOURCE_L3] =
+ {
+ .r_resctrl = {
+ .name = "L3",
+ .ctrl_scope = RESCTRL_L3_CACHE,
+ .mon_scope = RESCTRL_L3_CACHE,
+ .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_L3),
+ .mon_domains = mon_domain_init(RDT_RESOURCE_L3),
+ .schema_fmt = RESCTRL_SCHEMA_BITMAP,
+ },
+ .msr_base = MSR_IA32_L3_CBM_BASE,
+ .msr_update = cat_wrmsr,
+ },
+ [RDT_RESOURCE_L2] =
+ {
+ .r_resctrl = {
+ .name = "L2",
+ .ctrl_scope = RESCTRL_L2_CACHE,
+ .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_L2),
+ .schema_fmt = RESCTRL_SCHEMA_BITMAP,
+ },
+ .msr_base = MSR_IA32_L2_CBM_BASE,
+ .msr_update = cat_wrmsr,
+ },
+ [RDT_RESOURCE_MBA] =
+ {
+ .r_resctrl = {
+ .name = "MB",
+ .ctrl_scope = RESCTRL_L3_CACHE,
+ .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_MBA),
+ .schema_fmt = RESCTRL_SCHEMA_RANGE,
+ },
+ },
+ [RDT_RESOURCE_SMBA] =
+ {
+ .r_resctrl = {
+ .name = "SMBA",
+ .ctrl_scope = RESCTRL_L3_CACHE,
+ .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_SMBA),
+ .schema_fmt = RESCTRL_SCHEMA_RANGE,
+ },
+ },
+};
+
+u32 resctrl_arch_system_num_rmid_idx(void)
+{
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+
+ /* RMID are independent numbers for x86. num_rmid_idx == num_rmid */
+ return r->mon.num_rmid;
+}
+
+struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l)
+{
+ if (l >= RDT_NUM_RESOURCES)
+ return NULL;
+
+ return &rdt_resources_all[l].r_resctrl;
+}
+
+/*
+ * cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs
+ * as they do not have CPUID enumeration support for Cache allocation.
+ * The check for Vendor/Family/Model is not enough to guarantee that
+ * the MSRs won't #GP fault because only the following SKUs support
+ * CAT:
+ * Intel(R) Xeon(R) CPU E5-2658 v3 @ 2.20GHz
+ * Intel(R) Xeon(R) CPU E5-2648L v3 @ 1.80GHz
+ * Intel(R) Xeon(R) CPU E5-2628L v3 @ 2.00GHz
+ * Intel(R) Xeon(R) CPU E5-2618L v3 @ 2.30GHz
+ * Intel(R) Xeon(R) CPU E5-2608L v3 @ 2.00GHz
+ * Intel(R) Xeon(R) CPU E5-2658A v3 @ 2.20GHz
+ *
+ * Probe by trying to write the first of the L3 cache mask registers
+ * and checking that the bits stick. Max CLOSids is always 4 and max cbm length
+ * is always 20 on hsw server parts. The minimum cache bitmask length
+ * allowed for HSW server is always 2 bits. Hardcode all of them.
+ */
+static inline void cache_alloc_hsw_probe(void)
+{
+ struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_L3];
+ struct rdt_resource *r = &hw_res->r_resctrl;
+ u64 max_cbm = BIT_ULL_MASK(20) - 1, l3_cbm_0;
+
+ if (wrmsrq_safe(MSR_IA32_L3_CBM_BASE, max_cbm))
+ return;
+
+ rdmsrq(MSR_IA32_L3_CBM_BASE, l3_cbm_0);
+
+ /* If all the bits were set in MSR, return success */
+ if (l3_cbm_0 != max_cbm)
+ return;
+
+ hw_res->num_closid = 4;
+ r->cache.cbm_len = 20;
+ r->cache.shareable_bits = 0xc0000;
+ r->cache.min_cbm_bits = 2;
+ r->cache.arch_has_sparse_bitmasks = false;
+ r->alloc_capable = true;
+
+ rdt_alloc_capable = true;
+}
+
+/*
+ * rdt_get_mb_table() - get a mapping of bandwidth(b/w) percentage values
+ * exposed to user interface and the h/w understandable delay values.
+ *
+ * The non-linear delay values have the granularity of power of two
+ * and also the h/w does not guarantee a curve for configured delay
+ * values vs. actual b/w enforced.
+ * Hence we need a mapping that is pre calibrated so the user can
+ * express the memory b/w as a percentage value.
+ */
+static inline bool rdt_get_mb_table(struct rdt_resource *r)
+{
+ /*
+ * There are no Intel SKUs as of now to support non-linear delay.
+ */
+ pr_info("MBA b/w map not implemented for cpu:%d, model:%d",
+ boot_cpu_data.x86, boot_cpu_data.x86_model);
+
+ return false;
+}
+
+static __init bool __get_mem_config_intel(struct rdt_resource *r)
+{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+ union cpuid_0x10_3_eax eax;
+ union cpuid_0x10_x_edx edx;
+ u32 ebx, ecx, max_delay;
+
+ cpuid_count(0x00000010, 3, &eax.full, &ebx, &ecx, &edx.full);
+ hw_res->num_closid = edx.split.cos_max + 1;
+ max_delay = eax.split.max_delay + 1;
+ r->membw.max_bw = MAX_MBA_BW;
+ r->membw.arch_needs_linear = true;
+ if (ecx & MBA_IS_LINEAR) {
+ r->membw.delay_linear = true;
+ r->membw.min_bw = MAX_MBA_BW - max_delay;
+ r->membw.bw_gran = MAX_MBA_BW - max_delay;
+ } else {
+ if (!rdt_get_mb_table(r))
+ return false;
+ r->membw.arch_needs_linear = false;
+ }
+
+ if (boot_cpu_has(X86_FEATURE_PER_THREAD_MBA))
+ r->membw.throttle_mode = THREAD_THROTTLE_PER_THREAD;
+ else
+ r->membw.throttle_mode = THREAD_THROTTLE_MAX;
+
+ r->alloc_capable = true;
+
+ return true;
+}
+
+static __init bool __rdt_get_mem_config_amd(struct rdt_resource *r)
+{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+ u32 eax, ebx, ecx, edx, subleaf;
+
+ /*
+ * Query CPUID_Fn80000020_EDX_x01 for MBA and
+ * CPUID_Fn80000020_EDX_x02 for SMBA
+ */
+ subleaf = (r->rid == RDT_RESOURCE_SMBA) ? 2 : 1;
+
+ cpuid_count(0x80000020, subleaf, &eax, &ebx, &ecx, &edx);
+ hw_res->num_closid = edx + 1;
+ r->membw.max_bw = 1 << eax;
+
+ /* AMD does not use delay */
+ r->membw.delay_linear = false;
+ r->membw.arch_needs_linear = false;
+
+ /*
+ * AMD does not use memory delay throttle model to control
+ * the allocation like Intel does.
+ */
+ r->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED;
+ r->membw.min_bw = 0;
+ r->membw.bw_gran = 1;
+
+ r->alloc_capable = true;
+
+ return true;
+}
+
+static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r)
+{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+ union cpuid_0x10_1_eax eax;
+ union cpuid_0x10_x_ecx ecx;
+ union cpuid_0x10_x_edx edx;
+ u32 ebx, default_ctrl;
+
+ cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx.full, &edx.full);
+ hw_res->num_closid = edx.split.cos_max + 1;
+ r->cache.cbm_len = eax.split.cbm_len + 1;
+ default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1;
+ r->cache.shareable_bits = ebx & default_ctrl;
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ r->cache.arch_has_sparse_bitmasks = ecx.split.noncont;
+ r->alloc_capable = true;
+}
+
+static void rdt_get_cdp_config(int level)
+{
+ /*
+ * By default, CDP is disabled. CDP can be enabled by mount parameter
+ * "cdp" during resctrl file system mount time.
+ */
+ rdt_resources_all[level].cdp_enabled = false;
+ rdt_resources_all[level].r_resctrl.cdp_capable = true;
+}
+
+static void rdt_set_io_alloc_capable(struct rdt_resource *r)
+{
+ r->cache.io_alloc_capable = true;
+}
+
+static void rdt_get_cdp_l3_config(void)
+{
+ rdt_get_cdp_config(RDT_RESOURCE_L3);
+}
+
+static void rdt_get_cdp_l2_config(void)
+{
+ rdt_get_cdp_config(RDT_RESOURCE_L2);
+}
+
+static void mba_wrmsr_amd(struct msr_param *m)
+{
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(m->dom);
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
+ unsigned int i;
+
+ for (i = m->low; i < m->high; i++)
+ wrmsrq(hw_res->msr_base + i, hw_dom->ctrl_val[i]);
+}
+
+/*
+ * Map the memory b/w percentage value to delay values
+ * that can be written to QOS_MSRs.
+ * There are currently no SKUs which support non linear delay values.
+ */
+static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r)
+{
+ if (r->membw.delay_linear)
+ return MAX_MBA_BW - bw;
+
+ pr_warn_once("Non Linear delay-bw map not supported but queried\n");
+ return MAX_MBA_BW;
+}
+
+static void mba_wrmsr_intel(struct msr_param *m)
+{
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(m->dom);
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
+ unsigned int i;
+
+ /* Write the delay values for mba. */
+ for (i = m->low; i < m->high; i++)
+ wrmsrq(hw_res->msr_base + i, delay_bw_map(hw_dom->ctrl_val[i], m->res));
+}
+
+static void cat_wrmsr(struct msr_param *m)
+{
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(m->dom);
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
+ unsigned int i;
+
+ for (i = m->low; i < m->high; i++)
+ wrmsrq(hw_res->msr_base + i, hw_dom->ctrl_val[i]);
+}
+
+u32 resctrl_arch_get_num_closid(struct rdt_resource *r)
+{
+ return resctrl_to_arch_res(r)->num_closid;
+}
+
+void rdt_ctrl_update(void *arg)
+{
+ struct rdt_hw_resource *hw_res;
+ struct msr_param *m = arg;
+
+ hw_res = resctrl_to_arch_res(m->res);
+ hw_res->msr_update(m);
+}
+
+static void setup_default_ctrlval(struct rdt_resource *r, u32 *dc)
+{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+ int i;
+
+ /*
+ * Initialize the Control MSRs to having no control.
+ * For Cache Allocation: Set all bits in cbm
+ * For Memory Allocation: Set b/w requested to 100%
+ */
+ for (i = 0; i < hw_res->num_closid; i++, dc++)
+ *dc = resctrl_get_default_ctrl(r);
+}
+
+static void ctrl_domain_free(struct rdt_hw_ctrl_domain *hw_dom)
+{
+ kfree(hw_dom->ctrl_val);
+ kfree(hw_dom);
+}
+
+static void mon_domain_free(struct rdt_hw_mon_domain *hw_dom)
+{
+ int idx;
+
+ for_each_mbm_idx(idx)
+ kfree(hw_dom->arch_mbm_states[idx]);
+ kfree(hw_dom);
+}
+
+static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_ctrl_domain *d)
+{
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d);
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+ struct msr_param m;
+ u32 *dc;
+
+ dc = kmalloc_array(hw_res->num_closid, sizeof(*hw_dom->ctrl_val),
+ GFP_KERNEL);
+ if (!dc)
+ return -ENOMEM;
+
+ hw_dom->ctrl_val = dc;
+ setup_default_ctrlval(r, dc);
+
+ m.res = r;
+ m.dom = d;
+ m.low = 0;
+ m.high = hw_res->num_closid;
+ hw_res->msr_update(&m);
+ return 0;
+}
+
+/**
+ * arch_domain_mbm_alloc() - Allocate arch private storage for the MBM counters
+ * @num_rmid: The size of the MBM counter array
+ * @hw_dom: The domain that owns the allocated arrays
+ */
+static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_mon_domain *hw_dom)
+{
+ size_t tsize = sizeof(*hw_dom->arch_mbm_states[0]);
+ enum resctrl_event_id eventid;
+ int idx;
+
+ for_each_mbm_event_id(eventid) {
+ if (!resctrl_is_mon_event_enabled(eventid))
+ continue;
+ idx = MBM_STATE_IDX(eventid);
+ hw_dom->arch_mbm_states[idx] = kcalloc(num_rmid, tsize, GFP_KERNEL);
+ if (!hw_dom->arch_mbm_states[idx])
+ goto cleanup;
+ }
+
+ return 0;
+cleanup:
+ for_each_mbm_idx(idx) {
+ kfree(hw_dom->arch_mbm_states[idx]);
+ hw_dom->arch_mbm_states[idx] = NULL;
+ }
+
+ return -ENOMEM;
+}
+
+static int get_domain_id_from_scope(int cpu, enum resctrl_scope scope)
+{
+ switch (scope) {
+ case RESCTRL_L2_CACHE:
+ case RESCTRL_L3_CACHE:
+ return get_cpu_cacheinfo_id(cpu, scope);
+ case RESCTRL_L3_NODE:
+ return cpu_to_node(cpu);
+ default:
+ break;
+ }
+
+ return -EINVAL;
+}
+
+static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r)
+{
+ int id = get_domain_id_from_scope(cpu, r->ctrl_scope);
+ struct rdt_hw_ctrl_domain *hw_dom;
+ struct list_head *add_pos = NULL;
+ struct rdt_domain_hdr *hdr;
+ struct rdt_ctrl_domain *d;
+ int err;
+
+ lockdep_assert_held(&domain_list_lock);
+
+ if (id < 0) {
+ pr_warn_once("Can't find control domain id for CPU:%d scope:%d for resource %s\n",
+ cpu, r->ctrl_scope, r->name);
+ return;
+ }
+
+ hdr = resctrl_find_domain(&r->ctrl_domains, id, &add_pos);
+ if (hdr) {
+ if (WARN_ON_ONCE(hdr->type != RESCTRL_CTRL_DOMAIN))
+ return;
+ d = container_of(hdr, struct rdt_ctrl_domain, hdr);
+
+ cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
+ if (r->cache.arch_has_per_cpu_cfg)
+ rdt_domain_reconfigure_cdp(r);
+ return;
+ }
+
+ hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu));
+ if (!hw_dom)
+ return;
+
+ d = &hw_dom->d_resctrl;
+ d->hdr.id = id;
+ d->hdr.type = RESCTRL_CTRL_DOMAIN;
+ cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
+
+ rdt_domain_reconfigure_cdp(r);
+
+ if (domain_setup_ctrlval(r, d)) {
+ ctrl_domain_free(hw_dom);
+ return;
+ }
+
+ list_add_tail_rcu(&d->hdr.list, add_pos);
+
+ err = resctrl_online_ctrl_domain(r, d);
+ if (err) {
+ list_del_rcu(&d->hdr.list);
+ synchronize_rcu();
+ ctrl_domain_free(hw_dom);
+ }
+}
+
+static void domain_add_cpu_mon(int cpu, struct rdt_resource *r)
+{
+ int id = get_domain_id_from_scope(cpu, r->mon_scope);
+ struct list_head *add_pos = NULL;
+ struct rdt_hw_mon_domain *hw_dom;
+ struct rdt_domain_hdr *hdr;
+ struct rdt_mon_domain *d;
+ struct cacheinfo *ci;
+ int err;
+
+ lockdep_assert_held(&domain_list_lock);
+
+ if (id < 0) {
+ pr_warn_once("Can't find monitor domain id for CPU:%d scope:%d for resource %s\n",
+ cpu, r->mon_scope, r->name);
+ return;
+ }
+
+ hdr = resctrl_find_domain(&r->mon_domains, id, &add_pos);
+ if (hdr) {
+ if (WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN))
+ return;
+ d = container_of(hdr, struct rdt_mon_domain, hdr);
+
+ cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
+ /* Update the mbm_assign_mode state for the CPU if supported */
+ if (r->mon.mbm_cntr_assignable)
+ resctrl_arch_mbm_cntr_assign_set_one(r);
+ return;
+ }
+
+ hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu));
+ if (!hw_dom)
+ return;
+
+ d = &hw_dom->d_resctrl;
+ d->hdr.id = id;
+ d->hdr.type = RESCTRL_MON_DOMAIN;
+ ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
+ if (!ci) {
+ pr_warn_once("Can't find L3 cache for CPU:%d resource %s\n", cpu, r->name);
+ mon_domain_free(hw_dom);
+ return;
+ }
+ d->ci_id = ci->id;
+ cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
+
+ /* Update the mbm_assign_mode state for the CPU if supported */
+ if (r->mon.mbm_cntr_assignable)
+ resctrl_arch_mbm_cntr_assign_set_one(r);
+
+ arch_mon_domain_online(r, d);
+
+ if (arch_domain_mbm_alloc(r->mon.num_rmid, hw_dom)) {
+ mon_domain_free(hw_dom);
+ return;
+ }
+
+ list_add_tail_rcu(&d->hdr.list, add_pos);
+
+ err = resctrl_online_mon_domain(r, d);
+ if (err) {
+ list_del_rcu(&d->hdr.list);
+ synchronize_rcu();
+ mon_domain_free(hw_dom);
+ }
+}
+
+static void domain_add_cpu(int cpu, struct rdt_resource *r)
+{
+ if (r->alloc_capable)
+ domain_add_cpu_ctrl(cpu, r);
+ if (r->mon_capable)
+ domain_add_cpu_mon(cpu, r);
+}
+
+static void domain_remove_cpu_ctrl(int cpu, struct rdt_resource *r)
+{
+ int id = get_domain_id_from_scope(cpu, r->ctrl_scope);
+ struct rdt_hw_ctrl_domain *hw_dom;
+ struct rdt_domain_hdr *hdr;
+ struct rdt_ctrl_domain *d;
+
+ lockdep_assert_held(&domain_list_lock);
+
+ if (id < 0) {
+ pr_warn_once("Can't find control domain id for CPU:%d scope:%d for resource %s\n",
+ cpu, r->ctrl_scope, r->name);
+ return;
+ }
+
+ hdr = resctrl_find_domain(&r->ctrl_domains, id, NULL);
+ if (!hdr) {
+ pr_warn("Can't find control domain for id=%d for CPU %d for resource %s\n",
+ id, cpu, r->name);
+ return;
+ }
+
+ if (WARN_ON_ONCE(hdr->type != RESCTRL_CTRL_DOMAIN))
+ return;
+
+ d = container_of(hdr, struct rdt_ctrl_domain, hdr);
+ hw_dom = resctrl_to_arch_ctrl_dom(d);
+
+ cpumask_clear_cpu(cpu, &d->hdr.cpu_mask);
+ if (cpumask_empty(&d->hdr.cpu_mask)) {
+ resctrl_offline_ctrl_domain(r, d);
+ list_del_rcu(&d->hdr.list);
+ synchronize_rcu();
+
+ /*
+ * rdt_ctrl_domain "d" is going to be freed below, so clear
+ * its pointer from pseudo_lock_region struct.
+ */
+ if (d->plr)
+ d->plr->d = NULL;
+ ctrl_domain_free(hw_dom);
+
+ return;
+ }
+}
+
+static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r)
+{
+ int id = get_domain_id_from_scope(cpu, r->mon_scope);
+ struct rdt_hw_mon_domain *hw_dom;
+ struct rdt_domain_hdr *hdr;
+ struct rdt_mon_domain *d;
+
+ lockdep_assert_held(&domain_list_lock);
+
+ if (id < 0) {
+ pr_warn_once("Can't find monitor domain id for CPU:%d scope:%d for resource %s\n",
+ cpu, r->mon_scope, r->name);
+ return;
+ }
+
+ hdr = resctrl_find_domain(&r->mon_domains, id, NULL);
+ if (!hdr) {
+ pr_warn("Can't find monitor domain for id=%d for CPU %d for resource %s\n",
+ id, cpu, r->name);
+ return;
+ }
+
+ if (WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN))
+ return;
+
+ d = container_of(hdr, struct rdt_mon_domain, hdr);
+ hw_dom = resctrl_to_arch_mon_dom(d);
+
+ cpumask_clear_cpu(cpu, &d->hdr.cpu_mask);
+ if (cpumask_empty(&d->hdr.cpu_mask)) {
+ resctrl_offline_mon_domain(r, d);
+ list_del_rcu(&d->hdr.list);
+ synchronize_rcu();
+ mon_domain_free(hw_dom);
+
+ return;
+ }
+}
+
+static void domain_remove_cpu(int cpu, struct rdt_resource *r)
+{
+ if (r->alloc_capable)
+ domain_remove_cpu_ctrl(cpu, r);
+ if (r->mon_capable)
+ domain_remove_cpu_mon(cpu, r);
+}
+
+static void clear_closid_rmid(int cpu)
+{
+ struct resctrl_pqr_state *state = this_cpu_ptr(&pqr_state);
+
+ state->default_closid = RESCTRL_RESERVED_CLOSID;
+ state->default_rmid = RESCTRL_RESERVED_RMID;
+ state->cur_closid = RESCTRL_RESERVED_CLOSID;
+ state->cur_rmid = RESCTRL_RESERVED_RMID;
+ wrmsr(MSR_IA32_PQR_ASSOC, RESCTRL_RESERVED_RMID,
+ RESCTRL_RESERVED_CLOSID);
+}
+
+static int resctrl_arch_online_cpu(unsigned int cpu)
+{
+ struct rdt_resource *r;
+
+ mutex_lock(&domain_list_lock);
+ for_each_capable_rdt_resource(r)
+ domain_add_cpu(cpu, r);
+ mutex_unlock(&domain_list_lock);
+
+ clear_closid_rmid(cpu);
+ resctrl_online_cpu(cpu);
+
+ return 0;
+}
+
+static int resctrl_arch_offline_cpu(unsigned int cpu)
+{
+ struct rdt_resource *r;
+
+ resctrl_offline_cpu(cpu);
+
+ mutex_lock(&domain_list_lock);
+ for_each_capable_rdt_resource(r)
+ domain_remove_cpu(cpu, r);
+ mutex_unlock(&domain_list_lock);
+
+ clear_closid_rmid(cpu);
+
+ return 0;
+}
+
+enum {
+ RDT_FLAG_CMT,
+ RDT_FLAG_MBM_TOTAL,
+ RDT_FLAG_MBM_LOCAL,
+ RDT_FLAG_L3_CAT,
+ RDT_FLAG_L3_CDP,
+ RDT_FLAG_L2_CAT,
+ RDT_FLAG_L2_CDP,
+ RDT_FLAG_MBA,
+ RDT_FLAG_SMBA,
+ RDT_FLAG_BMEC,
+ RDT_FLAG_ABMC,
+ RDT_FLAG_SDCIAE,
+};
+
+#define RDT_OPT(idx, n, f) \
+[idx] = { \
+ .name = n, \
+ .flag = f \
+}
+
+struct rdt_options {
+ char *name;
+ int flag;
+ bool force_off, force_on;
+};
+
+static struct rdt_options rdt_options[] __ro_after_init = {
+ RDT_OPT(RDT_FLAG_CMT, "cmt", X86_FEATURE_CQM_OCCUP_LLC),
+ RDT_OPT(RDT_FLAG_MBM_TOTAL, "mbmtotal", X86_FEATURE_CQM_MBM_TOTAL),
+ RDT_OPT(RDT_FLAG_MBM_LOCAL, "mbmlocal", X86_FEATURE_CQM_MBM_LOCAL),
+ RDT_OPT(RDT_FLAG_L3_CAT, "l3cat", X86_FEATURE_CAT_L3),
+ RDT_OPT(RDT_FLAG_L3_CDP, "l3cdp", X86_FEATURE_CDP_L3),
+ RDT_OPT(RDT_FLAG_L2_CAT, "l2cat", X86_FEATURE_CAT_L2),
+ RDT_OPT(RDT_FLAG_L2_CDP, "l2cdp", X86_FEATURE_CDP_L2),
+ RDT_OPT(RDT_FLAG_MBA, "mba", X86_FEATURE_MBA),
+ RDT_OPT(RDT_FLAG_SMBA, "smba", X86_FEATURE_SMBA),
+ RDT_OPT(RDT_FLAG_BMEC, "bmec", X86_FEATURE_BMEC),
+ RDT_OPT(RDT_FLAG_ABMC, "abmc", X86_FEATURE_ABMC),
+ RDT_OPT(RDT_FLAG_SDCIAE, "sdciae", X86_FEATURE_SDCIAE),
+};
+#define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options)
+
+static int __init set_rdt_options(char *str)
+{
+ struct rdt_options *o;
+ bool force_off;
+ char *tok;
+
+ if (*str == '=')
+ str++;
+ while ((tok = strsep(&str, ",")) != NULL) {
+ force_off = *tok == '!';
+ if (force_off)
+ tok++;
+ for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) {
+ if (strcmp(tok, o->name) == 0) {
+ if (force_off)
+ o->force_off = true;
+ else
+ o->force_on = true;
+ break;
+ }
+ }
+ }
+ return 1;
+}
+__setup("rdt", set_rdt_options);
+
+bool rdt_cpu_has(int flag)
+{
+ bool ret = boot_cpu_has(flag);
+ struct rdt_options *o;
+
+ if (!ret)
+ return ret;
+
+ for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) {
+ if (flag == o->flag) {
+ if (o->force_off)
+ ret = false;
+ if (o->force_on)
+ ret = true;
+ break;
+ }
+ }
+ return ret;
+}
+
+bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt)
+{
+ if (!rdt_cpu_has(X86_FEATURE_BMEC))
+ return false;
+
+ switch (evt) {
+ case QOS_L3_MBM_TOTAL_EVENT_ID:
+ return rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL);
+ case QOS_L3_MBM_LOCAL_EVENT_ID:
+ return rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL);
+ default:
+ return false;
+ }
+}
+
+static __init bool get_mem_config(void)
+{
+ struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_MBA];
+
+ if (!rdt_cpu_has(X86_FEATURE_MBA))
+ return false;
+
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ return __get_mem_config_intel(&hw_res->r_resctrl);
+ else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ return __rdt_get_mem_config_amd(&hw_res->r_resctrl);
+
+ return false;
+}
+
+static __init bool get_slow_mem_config(void)
+{
+ struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_SMBA];
+
+ if (!rdt_cpu_has(X86_FEATURE_SMBA))
+ return false;
+
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ return __rdt_get_mem_config_amd(&hw_res->r_resctrl);
+
+ return false;
+}
+
+static __init bool get_rdt_alloc_resources(void)
+{
+ struct rdt_resource *r;
+ bool ret = false;
+
+ if (rdt_alloc_capable)
+ return true;
+
+ if (!boot_cpu_has(X86_FEATURE_RDT_A))
+ return false;
+
+ if (rdt_cpu_has(X86_FEATURE_CAT_L3)) {
+ r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+ rdt_get_cache_alloc_cfg(1, r);
+ if (rdt_cpu_has(X86_FEATURE_CDP_L3))
+ rdt_get_cdp_l3_config();
+ if (rdt_cpu_has(X86_FEATURE_SDCIAE))
+ rdt_set_io_alloc_capable(r);
+ ret = true;
+ }
+ if (rdt_cpu_has(X86_FEATURE_CAT_L2)) {
+ /* CPUID 0x10.2 fields are same format at 0x10.1 */
+ r = &rdt_resources_all[RDT_RESOURCE_L2].r_resctrl;
+ rdt_get_cache_alloc_cfg(2, r);
+ if (rdt_cpu_has(X86_FEATURE_CDP_L2))
+ rdt_get_cdp_l2_config();
+ ret = true;
+ }
+
+ if (get_mem_config())
+ ret = true;
+
+ if (get_slow_mem_config())
+ ret = true;
+
+ return ret;
+}
+
+static __init bool get_rdt_mon_resources(void)
+{
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+ bool ret = false;
+
+ if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) {
+ resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID);
+ ret = true;
+ }
+ if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) {
+ resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID);
+ ret = true;
+ }
+ if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) {
+ resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID);
+ ret = true;
+ }
+ if (rdt_cpu_has(X86_FEATURE_ABMC))
+ ret = true;
+
+ if (!ret)
+ return false;
+
+ return !rdt_get_mon_l3_config(r);
+}
+
+static __init void __check_quirks_intel(void)
+{
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_HASWELL_X:
+ if (!rdt_options[RDT_FLAG_L3_CAT].force_off)
+ cache_alloc_hsw_probe();
+ break;
+ case INTEL_SKYLAKE_X:
+ if (boot_cpu_data.x86_stepping <= 4)
+ set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat");
+ else
+ set_rdt_options("!l3cat");
+ fallthrough;
+ case INTEL_BROADWELL_X:
+ intel_rdt_mbm_apply_quirk();
+ break;
+ }
+}
+
+static __init void check_quirks(void)
+{
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ __check_quirks_intel();
+}
+
+static __init bool get_rdt_resources(void)
+{
+ rdt_alloc_capable = get_rdt_alloc_resources();
+ rdt_mon_capable = get_rdt_mon_resources();
+
+ return (rdt_mon_capable || rdt_alloc_capable);
+}
+
+static __init void rdt_init_res_defs_intel(void)
+{
+ struct rdt_hw_resource *hw_res;
+ struct rdt_resource *r;
+
+ for_each_rdt_resource(r) {
+ hw_res = resctrl_to_arch_res(r);
+
+ if (r->rid == RDT_RESOURCE_L3 ||
+ r->rid == RDT_RESOURCE_L2) {
+ r->cache.arch_has_per_cpu_cfg = false;
+ r->cache.min_cbm_bits = 1;
+ } else if (r->rid == RDT_RESOURCE_MBA) {
+ hw_res->msr_base = MSR_IA32_MBA_THRTL_BASE;
+ hw_res->msr_update = mba_wrmsr_intel;
+ }
+ }
+}
+
+static __init void rdt_init_res_defs_amd(void)
+{
+ struct rdt_hw_resource *hw_res;
+ struct rdt_resource *r;
+
+ for_each_rdt_resource(r) {
+ hw_res = resctrl_to_arch_res(r);
+
+ if (r->rid == RDT_RESOURCE_L3 ||
+ r->rid == RDT_RESOURCE_L2) {
+ r->cache.arch_has_sparse_bitmasks = true;
+ r->cache.arch_has_per_cpu_cfg = true;
+ r->cache.min_cbm_bits = 0;
+ } else if (r->rid == RDT_RESOURCE_MBA) {
+ hw_res->msr_base = MSR_IA32_MBA_BW_BASE;
+ hw_res->msr_update = mba_wrmsr_amd;
+ } else if (r->rid == RDT_RESOURCE_SMBA) {
+ hw_res->msr_base = MSR_IA32_SMBA_BW_BASE;
+ hw_res->msr_update = mba_wrmsr_amd;
+ }
+ }
+}
+
+static __init void rdt_init_res_defs(void)
+{
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ rdt_init_res_defs_intel();
+ else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ rdt_init_res_defs_amd();
+}
+
+static enum cpuhp_state rdt_online;
+
+/* Runs once on the BSP during boot. */
+void resctrl_cpu_detect(struct cpuinfo_x86 *c)
+{
+ if (!cpu_has(c, X86_FEATURE_CQM_LLC) && !cpu_has(c, X86_FEATURE_ABMC)) {
+ c->x86_cache_max_rmid = -1;
+ c->x86_cache_occ_scale = -1;
+ c->x86_cache_mbm_width_offset = -1;
+ return;
+ }
+
+ /* will be overridden if occupancy monitoring exists */
+ c->x86_cache_max_rmid = cpuid_ebx(0xf);
+
+ if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) ||
+ cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) ||
+ cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL) ||
+ cpu_has(c, X86_FEATURE_ABMC)) {
+ u32 eax, ebx, ecx, edx;
+
+ /* QoS sub-leaf, EAX=0Fh, ECX=1 */
+ cpuid_count(0xf, 1, &eax, &ebx, &ecx, &edx);
+
+ c->x86_cache_max_rmid = ecx;
+ c->x86_cache_occ_scale = ebx;
+ c->x86_cache_mbm_width_offset = eax & 0xff;
+
+ if (c->x86_vendor == X86_VENDOR_AMD && !c->x86_cache_mbm_width_offset)
+ c->x86_cache_mbm_width_offset = MBM_CNTR_WIDTH_OFFSET_AMD;
+ }
+}
+
+static int __init resctrl_arch_late_init(void)
+{
+ struct rdt_resource *r;
+ int state, ret, i;
+
+ /* for_each_rdt_resource() requires all rid to be initialised. */
+ for (i = 0; i < RDT_NUM_RESOURCES; i++)
+ rdt_resources_all[i].r_resctrl.rid = i;
+
+ /*
+ * Initialize functions(or definitions) that are different
+ * between vendors here.
+ */
+ rdt_init_res_defs();
+
+ check_quirks();
+
+ if (!get_rdt_resources())
+ return -ENODEV;
+
+ state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
+ "x86/resctrl/cat:online:",
+ resctrl_arch_online_cpu,
+ resctrl_arch_offline_cpu);
+ if (state < 0)
+ return state;
+
+ ret = resctrl_init();
+ if (ret) {
+ cpuhp_remove_state(state);
+ return ret;
+ }
+ rdt_online = state;
+
+ for_each_alloc_capable_rdt_resource(r)
+ pr_info("%s allocation detected\n", r->name);
+
+ for_each_mon_capable_rdt_resource(r)
+ pr_info("%s monitoring detected\n", r->name);
+
+ return 0;
+}
+
+late_initcall(resctrl_arch_late_init);
+
+static void __exit resctrl_arch_exit(void)
+{
+ cpuhp_remove_state(rdt_online);
+
+ resctrl_exit();
+}
+
+__exitcall(resctrl_arch_exit);
diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
new file mode 100644
index 000000000000..b20e705606b8
--- /dev/null
+++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
@@ -0,0 +1,133 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Resource Director Technology(RDT)
+ * - Cache Allocation code.
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Authors:
+ * Fenghua Yu <fenghua.yu@intel.com>
+ * Tony Luck <tony.luck@intel.com>
+ *
+ * More information about RDT be found in the Intel (R) x86 Architecture
+ * Software Developer Manual June 2016, volume 3, section 17.17.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cpu.h>
+
+#include "internal.h"
+
+int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d,
+ u32 closid, enum resctrl_conf_type t, u32 cfg_val)
+{
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d);
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+ u32 idx = resctrl_get_config_index(closid, t);
+ struct msr_param msr_param;
+
+ if (!cpumask_test_cpu(smp_processor_id(), &d->hdr.cpu_mask))
+ return -EINVAL;
+
+ hw_dom->ctrl_val[idx] = cfg_val;
+
+ msr_param.res = r;
+ msr_param.dom = d;
+ msr_param.low = idx;
+ msr_param.high = idx + 1;
+ hw_res->msr_update(&msr_param);
+
+ return 0;
+}
+
+int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid)
+{
+ struct resctrl_staged_config *cfg;
+ struct rdt_hw_ctrl_domain *hw_dom;
+ struct msr_param msr_param;
+ struct rdt_ctrl_domain *d;
+ enum resctrl_conf_type t;
+ u32 idx;
+
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
+ hw_dom = resctrl_to_arch_ctrl_dom(d);
+ msr_param.res = NULL;
+ for (t = 0; t < CDP_NUM_TYPES; t++) {
+ cfg = &hw_dom->d_resctrl.staged_config[t];
+ if (!cfg->have_new_ctrl)
+ continue;
+
+ idx = resctrl_get_config_index(closid, t);
+ if (cfg->new_ctrl == hw_dom->ctrl_val[idx])
+ continue;
+ hw_dom->ctrl_val[idx] = cfg->new_ctrl;
+
+ if (!msr_param.res) {
+ msr_param.low = idx;
+ msr_param.high = msr_param.low + 1;
+ msr_param.res = r;
+ msr_param.dom = d;
+ } else {
+ msr_param.low = min(msr_param.low, idx);
+ msr_param.high = max(msr_param.high, idx + 1);
+ }
+ }
+ if (msr_param.res)
+ smp_call_function_any(&d->hdr.cpu_mask, rdt_ctrl_update, &msr_param, 1);
+ }
+
+ return 0;
+}
+
+u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d,
+ u32 closid, enum resctrl_conf_type type)
+{
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d);
+ u32 idx = resctrl_get_config_index(closid, type);
+
+ return hw_dom->ctrl_val[idx];
+}
+
+bool resctrl_arch_get_io_alloc_enabled(struct rdt_resource *r)
+{
+ return resctrl_to_arch_res(r)->sdciae_enabled;
+}
+
+static void resctrl_sdciae_set_one_amd(void *arg)
+{
+ bool *enable = arg;
+
+ if (*enable)
+ msr_set_bit(MSR_IA32_L3_QOS_EXT_CFG, SDCIAE_ENABLE_BIT);
+ else
+ msr_clear_bit(MSR_IA32_L3_QOS_EXT_CFG, SDCIAE_ENABLE_BIT);
+}
+
+static void _resctrl_sdciae_enable(struct rdt_resource *r, bool enable)
+{
+ struct rdt_ctrl_domain *d;
+
+ /* Walking r->ctrl_domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
+ /* Update MSR_IA32_L3_QOS_EXT_CFG MSR on all the CPUs in all domains */
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list)
+ on_each_cpu_mask(&d->hdr.cpu_mask, resctrl_sdciae_set_one_amd, &enable, 1);
+}
+
+int resctrl_arch_io_alloc_enable(struct rdt_resource *r, bool enable)
+{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+
+ if (hw_res->r_resctrl.cache.io_alloc_capable &&
+ hw_res->sdciae_enabled != enable) {
+ _resctrl_sdciae_enable(r, enable);
+ hw_res->sdciae_enabled = enable;
+ }
+
+ return 0;
+}
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
new file mode 100644
index 000000000000..4a916c84a322
--- /dev/null
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -0,0 +1,225 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_RESCTRL_INTERNAL_H
+#define _ASM_X86_RESCTRL_INTERNAL_H
+
+#include <linux/resctrl.h>
+
+#define L3_QOS_CDP_ENABLE 0x01ULL
+
+#define L2_QOS_CDP_ENABLE 0x01ULL
+
+#define MBM_CNTR_WIDTH_BASE 24
+
+#define MBA_IS_LINEAR 0x4
+
+#define MBM_CNTR_WIDTH_OFFSET_AMD 20
+
+#define RMID_VAL_ERROR BIT_ULL(63)
+
+#define RMID_VAL_UNAVAIL BIT_ULL(62)
+
+/*
+ * With the above fields in use 62 bits remain in MSR_IA32_QM_CTR for
+ * data to be returned. The counter width is discovered from the hardware
+ * as an offset from MBM_CNTR_WIDTH_BASE.
+ */
+#define MBM_CNTR_WIDTH_OFFSET_MAX (62 - MBM_CNTR_WIDTH_BASE)
+
+/**
+ * struct arch_mbm_state - values used to compute resctrl_arch_rmid_read()s
+ * return value.
+ * @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes)
+ * @prev_msr: Value of IA32_QM_CTR last time it was read for the RMID used to
+ * find this struct.
+ */
+struct arch_mbm_state {
+ u64 chunks;
+ u64 prev_msr;
+};
+
+/* Setting bit 0 in L3_QOS_EXT_CFG enables the ABMC feature. */
+#define ABMC_ENABLE_BIT 0
+
+/*
+ * Qos Event Identifiers.
+ */
+#define ABMC_EXTENDED_EVT_ID BIT(31)
+#define ABMC_EVT_ID BIT(0)
+
+/* Setting bit 1 in MSR_IA32_L3_QOS_EXT_CFG enables the SDCIAE feature. */
+#define SDCIAE_ENABLE_BIT 1
+
+/**
+ * struct rdt_hw_ctrl_domain - Arch private attributes of a set of CPUs that share
+ * a resource for a control function
+ * @d_resctrl: Properties exposed to the resctrl file system
+ * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID)
+ *
+ * Members of this structure are accessed via helpers that provide abstraction.
+ */
+struct rdt_hw_ctrl_domain {
+ struct rdt_ctrl_domain d_resctrl;
+ u32 *ctrl_val;
+};
+
+/**
+ * struct rdt_hw_mon_domain - Arch private attributes of a set of CPUs that share
+ * a resource for a monitor function
+ * @d_resctrl: Properties exposed to the resctrl file system
+ * @arch_mbm_states: Per-event pointer to the MBM event's saved state.
+ * An MBM event's state is an array of struct arch_mbm_state
+ * indexed by RMID on x86.
+ *
+ * Members of this structure are accessed via helpers that provide abstraction.
+ */
+struct rdt_hw_mon_domain {
+ struct rdt_mon_domain d_resctrl;
+ struct arch_mbm_state *arch_mbm_states[QOS_NUM_L3_MBM_EVENTS];
+};
+
+static inline struct rdt_hw_ctrl_domain *resctrl_to_arch_ctrl_dom(struct rdt_ctrl_domain *r)
+{
+ return container_of(r, struct rdt_hw_ctrl_domain, d_resctrl);
+}
+
+static inline struct rdt_hw_mon_domain *resctrl_to_arch_mon_dom(struct rdt_mon_domain *r)
+{
+ return container_of(r, struct rdt_hw_mon_domain, d_resctrl);
+}
+
+/**
+ * struct msr_param - set a range of MSRs from a domain
+ * @res: The resource to use
+ * @dom: The domain to update
+ * @low: Beginning index from base MSR
+ * @high: End index
+ */
+struct msr_param {
+ struct rdt_resource *res;
+ struct rdt_ctrl_domain *dom;
+ u32 low;
+ u32 high;
+};
+
+/**
+ * struct rdt_hw_resource - arch private attributes of a resctrl resource
+ * @r_resctrl: Attributes of the resource used directly by resctrl.
+ * @num_closid: Maximum number of closid this hardware can support,
+ * regardless of CDP. This is exposed via
+ * resctrl_arch_get_num_closid() to avoid confusion
+ * with struct resctrl_schema's property of the same name,
+ * which has been corrected for features like CDP.
+ * @msr_base: Base MSR address for CBMs
+ * @msr_update: Function pointer to update QOS MSRs
+ * @mon_scale: cqm counter * mon_scale = occupancy in bytes
+ * @mbm_width: Monitor width, to detect and correct for overflow.
+ * @cdp_enabled: CDP state of this resource
+ * @mbm_cntr_assign_enabled: ABMC feature is enabled
+ * @sdciae_enabled: SDCIAE feature (backing "io_alloc") is enabled.
+ *
+ * Members of this structure are either private to the architecture
+ * e.g. mbm_width, or accessed via helpers that provide abstraction. e.g.
+ * msr_update and msr_base.
+ */
+struct rdt_hw_resource {
+ struct rdt_resource r_resctrl;
+ u32 num_closid;
+ unsigned int msr_base;
+ void (*msr_update)(struct msr_param *m);
+ unsigned int mon_scale;
+ unsigned int mbm_width;
+ bool cdp_enabled;
+ bool mbm_cntr_assign_enabled;
+ bool sdciae_enabled;
+};
+
+static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r)
+{
+ return container_of(r, struct rdt_hw_resource, r_resctrl);
+}
+
+extern struct rdt_hw_resource rdt_resources_all[];
+
+void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d);
+
+/* CPUID.(EAX=10H, ECX=ResID=1).EAX */
+union cpuid_0x10_1_eax {
+ struct {
+ unsigned int cbm_len:5;
+ } split;
+ unsigned int full;
+};
+
+/* CPUID.(EAX=10H, ECX=ResID=3).EAX */
+union cpuid_0x10_3_eax {
+ struct {
+ unsigned int max_delay:12;
+ } split;
+ unsigned int full;
+};
+
+/* CPUID.(EAX=10H, ECX=ResID).ECX */
+union cpuid_0x10_x_ecx {
+ struct {
+ unsigned int reserved:3;
+ unsigned int noncont:1;
+ } split;
+ unsigned int full;
+};
+
+/* CPUID.(EAX=10H, ECX=ResID).EDX */
+union cpuid_0x10_x_edx {
+ struct {
+ unsigned int cos_max:16;
+ } split;
+ unsigned int full;
+};
+
+/*
+ * ABMC counters are configured by writing to MSR_IA32_L3_QOS_ABMC_CFG.
+ *
+ * @bw_type : Event configuration that represents the memory
+ * transactions being tracked by the @cntr_id.
+ * @bw_src : Bandwidth source (RMID or CLOSID).
+ * @reserved1 : Reserved.
+ * @is_clos : @bw_src field is a CLOSID (not an RMID).
+ * @cntr_id : Counter identifier.
+ * @reserved : Reserved.
+ * @cntr_en : Counting enable bit.
+ * @cfg_en : Configuration enable bit.
+ *
+ * Configuration and counting:
+ * Counter can be configured across multiple writes to MSR. Configuration
+ * is applied only when @cfg_en = 1. Counter @cntr_id is reset when the
+ * configuration is applied.
+ * @cfg_en = 1, @cntr_en = 0 : Apply @cntr_id configuration but do not
+ * count events.
+ * @cfg_en = 1, @cntr_en = 1 : Apply @cntr_id configuration and start
+ * counting events.
+ */
+union l3_qos_abmc_cfg {
+ struct {
+ unsigned long bw_type :32,
+ bw_src :12,
+ reserved1: 3,
+ is_clos : 1,
+ cntr_id : 5,
+ reserved : 9,
+ cntr_en : 1,
+ cfg_en : 1;
+ } split;
+ unsigned long full;
+};
+
+void rdt_ctrl_update(void *arg);
+
+int rdt_get_mon_l3_config(struct rdt_resource *r);
+
+bool rdt_cpu_has(int flag);
+
+void __init intel_rdt_mbm_apply_quirk(void);
+
+void rdt_domain_reconfigure_cdp(struct rdt_resource *r);
+void resctrl_arch_mbm_cntr_assign_set_one(struct rdt_resource *r);
+
+#endif /* _ASM_X86_RESCTRL_INTERNAL_H */
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
new file mode 100644
index 000000000000..dffcc8307500
--- /dev/null
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -0,0 +1,583 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Resource Director Technology(RDT)
+ * - Monitoring code
+ *
+ * Copyright (C) 2017 Intel Corporation
+ *
+ * Author:
+ * Vikas Shivappa <vikas.shivappa@intel.com>
+ *
+ * This replaces the cqm.c based on perf but we reuse a lot of
+ * code and datastructures originally from Peter Zijlstra and Matt Fleming.
+ *
+ * More information about RDT be found in the Intel (R) x86 Architecture
+ * Software Developer Manual June 2016, volume 3, section 17.17.
+ */
+
+#define pr_fmt(fmt) "resctrl: " fmt
+
+#include <linux/cpu.h>
+#include <linux/resctrl.h>
+
+#include <asm/cpu_device_id.h>
+#include <asm/msr.h>
+
+#include "internal.h"
+
+/*
+ * Global boolean for rdt_monitor which is true if any
+ * resource monitoring is enabled.
+ */
+bool rdt_mon_capable;
+
+#define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5))
+
+static int snc_nodes_per_l3_cache = 1;
+
+/*
+ * The correction factor table is documented in Documentation/filesystems/resctrl.rst.
+ * If rmid > rmid threshold, MBM total and local values should be multiplied
+ * by the correction factor.
+ *
+ * The original table is modified for better code:
+ *
+ * 1. The threshold 0 is changed to rmid count - 1 so don't do correction
+ * for the case.
+ * 2. MBM total and local correction table indexed by core counter which is
+ * equal to (x86_cache_max_rmid + 1) / 8 - 1 and is from 0 up to 27.
+ * 3. The correction factor is normalized to 2^20 (1048576) so it's faster
+ * to calculate corrected value by shifting:
+ * corrected_value = (original_value * correction_factor) >> 20
+ */
+static const struct mbm_correction_factor_table {
+ u32 rmidthreshold;
+ u64 cf;
+} mbm_cf_table[] __initconst = {
+ {7, CF(1.000000)},
+ {15, CF(1.000000)},
+ {15, CF(0.969650)},
+ {31, CF(1.000000)},
+ {31, CF(1.066667)},
+ {31, CF(0.969650)},
+ {47, CF(1.142857)},
+ {63, CF(1.000000)},
+ {63, CF(1.185115)},
+ {63, CF(1.066553)},
+ {79, CF(1.454545)},
+ {95, CF(1.000000)},
+ {95, CF(1.230769)},
+ {95, CF(1.142857)},
+ {95, CF(1.066667)},
+ {127, CF(1.000000)},
+ {127, CF(1.254863)},
+ {127, CF(1.185255)},
+ {151, CF(1.000000)},
+ {127, CF(1.066667)},
+ {167, CF(1.000000)},
+ {159, CF(1.454334)},
+ {183, CF(1.000000)},
+ {127, CF(0.969744)},
+ {191, CF(1.280246)},
+ {191, CF(1.230921)},
+ {215, CF(1.000000)},
+ {191, CF(1.143118)},
+};
+
+static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX;
+
+static u64 mbm_cf __read_mostly;
+
+static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val)
+{
+ /* Correct MBM value. */
+ if (rmid > mbm_cf_rmidthreshold)
+ val = (val * mbm_cf) >> 20;
+
+ return val;
+}
+
+/*
+ * When Sub-NUMA Cluster (SNC) mode is not enabled (as indicated by
+ * "snc_nodes_per_l3_cache == 1") no translation of the RMID value is
+ * needed. The physical RMID is the same as the logical RMID.
+ *
+ * On a platform with SNC mode enabled, Linux enables RMID sharing mode
+ * via MSR 0xCA0 (see the "RMID Sharing Mode" section in the "Intel
+ * Resource Director Technology Architecture Specification" for a full
+ * description of RMID sharing mode).
+ *
+ * In RMID sharing mode there are fewer "logical RMID" values available
+ * to accumulate data ("physical RMIDs" are divided evenly between SNC
+ * nodes that share an L3 cache). Linux creates an rdt_mon_domain for
+ * each SNC node.
+ *
+ * The value loaded into IA32_PQR_ASSOC is the "logical RMID".
+ *
+ * Data is collected independently on each SNC node and can be retrieved
+ * using the "physical RMID" value computed by this function and loaded
+ * into IA32_QM_EVTSEL. @cpu can be any CPU in the SNC node.
+ *
+ * The scope of the IA32_QM_EVTSEL and IA32_QM_CTR MSRs is at the L3
+ * cache. So a "physical RMID" may be read from any CPU that shares
+ * the L3 cache with the desired SNC node, not just from a CPU in
+ * the specific SNC node.
+ */
+static int logical_rmid_to_physical_rmid(int cpu, int lrmid)
+{
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+
+ if (snc_nodes_per_l3_cache == 1)
+ return lrmid;
+
+ return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->mon.num_rmid;
+}
+
+static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val)
+{
+ u64 msr_val;
+
+ /*
+ * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
+ * with a valid event code for supported resource type and the bits
+ * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID,
+ * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
+ * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
+ * are error bits.
+ */
+ wrmsr(MSR_IA32_QM_EVTSEL, eventid, prmid);
+ rdmsrq(MSR_IA32_QM_CTR, msr_val);
+
+ if (msr_val & RMID_VAL_ERROR)
+ return -EIO;
+ if (msr_val & RMID_VAL_UNAVAIL)
+ return -EINVAL;
+
+ *val = msr_val;
+ return 0;
+}
+
+static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_dom,
+ u32 rmid,
+ enum resctrl_event_id eventid)
+{
+ struct arch_mbm_state *state;
+
+ if (!resctrl_is_mbm_event(eventid))
+ return NULL;
+
+ state = hw_dom->arch_mbm_states[MBM_STATE_IDX(eventid)];
+
+ return state ? &state[rmid] : NULL;
+}
+
+void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d,
+ u32 unused, u32 rmid,
+ enum resctrl_event_id eventid)
+{
+ struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
+ int cpu = cpumask_any(&d->hdr.cpu_mask);
+ struct arch_mbm_state *am;
+ u32 prmid;
+
+ am = get_arch_mbm_state(hw_dom, rmid, eventid);
+ if (am) {
+ memset(am, 0, sizeof(*am));
+
+ prmid = logical_rmid_to_physical_rmid(cpu, rmid);
+ /* Record any initial, non-zero count value. */
+ __rmid_read_phys(prmid, eventid, &am->prev_msr);
+ }
+}
+
+/*
+ * Assumes that hardware counters are also reset and thus that there is
+ * no need to record initial non-zero counts.
+ */
+void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d)
+{
+ struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
+ enum resctrl_event_id eventid;
+ int idx;
+
+ for_each_mbm_event_id(eventid) {
+ if (!resctrl_is_mon_event_enabled(eventid))
+ continue;
+ idx = MBM_STATE_IDX(eventid);
+ memset(hw_dom->arch_mbm_states[idx], 0,
+ sizeof(*hw_dom->arch_mbm_states[0]) * r->mon.num_rmid);
+ }
+}
+
+static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
+{
+ u64 shift = 64 - width, chunks;
+
+ chunks = (cur_msr << shift) - (prev_msr << shift);
+ return chunks >> shift;
+}
+
+static u64 get_corrected_val(struct rdt_resource *r, struct rdt_mon_domain *d,
+ u32 rmid, enum resctrl_event_id eventid, u64 msr_val)
+{
+ struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+ struct arch_mbm_state *am;
+ u64 chunks;
+
+ am = get_arch_mbm_state(hw_dom, rmid, eventid);
+ if (am) {
+ am->chunks += mbm_overflow_count(am->prev_msr, msr_val,
+ hw_res->mbm_width);
+ chunks = get_corrected_mbm_count(rmid, am->chunks);
+ am->prev_msr = msr_val;
+ } else {
+ chunks = msr_val;
+ }
+
+ return chunks * hw_res->mon_scale;
+}
+
+int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d,
+ u32 unused, u32 rmid, enum resctrl_event_id eventid,
+ u64 *val, void *ignored)
+{
+ struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
+ int cpu = cpumask_any(&d->hdr.cpu_mask);
+ struct arch_mbm_state *am;
+ u64 msr_val;
+ u32 prmid;
+ int ret;
+
+ resctrl_arch_rmid_read_context_check();
+
+ prmid = logical_rmid_to_physical_rmid(cpu, rmid);
+ ret = __rmid_read_phys(prmid, eventid, &msr_val);
+
+ if (!ret) {
+ *val = get_corrected_val(r, d, rmid, eventid, msr_val);
+ } else if (ret == -EINVAL) {
+ am = get_arch_mbm_state(hw_dom, rmid, eventid);
+ if (am)
+ am->prev_msr = 0;
+ }
+
+ return ret;
+}
+
+static int __cntr_id_read(u32 cntr_id, u64 *val)
+{
+ u64 msr_val;
+
+ /*
+ * QM_EVTSEL Register definition:
+ * =======================================================
+ * Bits Mnemonic Description
+ * =======================================================
+ * 63:44 -- Reserved
+ * 43:32 RMID RMID or counter ID in ABMC mode
+ * when reading an MBM event
+ * 31 ExtendedEvtID Extended Event Identifier
+ * 30:8 -- Reserved
+ * 7:0 EvtID Event Identifier
+ * =======================================================
+ * The contents of a specific counter can be read by setting the
+ * following fields in QM_EVTSEL.ExtendedEvtID(=1) and
+ * QM_EVTSEL.EvtID = L3CacheABMC (=1) and setting QM_EVTSEL.RMID
+ * to the desired counter ID. Reading the QM_CTR then returns the
+ * contents of the specified counter. The RMID_VAL_ERROR bit is set
+ * if the counter configuration is invalid, or if an invalid counter
+ * ID is set in the QM_EVTSEL.RMID field. The RMID_VAL_UNAVAIL bit
+ * is set if the counter data is unavailable.
+ */
+ wrmsr(MSR_IA32_QM_EVTSEL, ABMC_EXTENDED_EVT_ID | ABMC_EVT_ID, cntr_id);
+ rdmsrl(MSR_IA32_QM_CTR, msr_val);
+
+ if (msr_val & RMID_VAL_ERROR)
+ return -EIO;
+ if (msr_val & RMID_VAL_UNAVAIL)
+ return -EINVAL;
+
+ *val = msr_val;
+ return 0;
+}
+
+void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d,
+ u32 unused, u32 rmid, int cntr_id,
+ enum resctrl_event_id eventid)
+{
+ struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
+ struct arch_mbm_state *am;
+
+ am = get_arch_mbm_state(hw_dom, rmid, eventid);
+ if (am) {
+ memset(am, 0, sizeof(*am));
+
+ /* Record any initial, non-zero count value. */
+ __cntr_id_read(cntr_id, &am->prev_msr);
+ }
+}
+
+int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d,
+ u32 unused, u32 rmid, int cntr_id,
+ enum resctrl_event_id eventid, u64 *val)
+{
+ u64 msr_val;
+ int ret;
+
+ ret = __cntr_id_read(cntr_id, &msr_val);
+ if (ret)
+ return ret;
+
+ *val = get_corrected_val(r, d, rmid, eventid, msr_val);
+
+ return 0;
+}
+
+/*
+ * The power-on reset value of MSR_RMID_SNC_CONFIG is 0x1
+ * which indicates that RMIDs are configured in legacy mode.
+ * This mode is incompatible with Linux resctrl semantics
+ * as RMIDs are partitioned between SNC nodes, which requires
+ * a user to know which RMID is allocated to a task.
+ * Clearing bit 0 reconfigures the RMID counters for use
+ * in RMID sharing mode. This mode is better for Linux.
+ * The RMID space is divided between all SNC nodes with the
+ * RMIDs renumbered to start from zero in each node when
+ * counting operations from tasks. Code to read the counters
+ * must adjust RMID counter numbers based on SNC node. See
+ * logical_rmid_to_physical_rmid() for code that does this.
+ */
+void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d)
+{
+ if (snc_nodes_per_l3_cache > 1)
+ msr_clear_bit(MSR_RMID_SNC_CONFIG, 0);
+}
+
+/* CPU models that support MSR_RMID_SNC_CONFIG */
+static const struct x86_cpu_id snc_cpu_ids[] __initconst = {
+ X86_MATCH_VFM(INTEL_ICELAKE_X, 0),
+ X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, 0),
+ X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, 0),
+ X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, 0),
+ X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, 0),
+ X86_MATCH_VFM(INTEL_ATOM_DARKMONT_X, 0),
+ {}
+};
+
+/*
+ * There isn't a simple hardware bit that indicates whether a CPU is running
+ * in Sub-NUMA Cluster (SNC) mode. Infer the state by comparing the
+ * number of CPUs sharing the L3 cache with CPU0 to the number of CPUs in
+ * the same NUMA node as CPU0.
+ * It is not possible to accurately determine SNC state if the system is
+ * booted with a maxcpus=N parameter. That distorts the ratio of SNC nodes
+ * to L3 caches. It will be OK if system is booted with hyperthreading
+ * disabled (since this doesn't affect the ratio).
+ */
+static __init int snc_get_config(void)
+{
+ struct cacheinfo *ci = get_cpu_cacheinfo_level(0, RESCTRL_L3_CACHE);
+ const cpumask_t *node0_cpumask;
+ int cpus_per_node, cpus_per_l3;
+ int ret;
+
+ if (!x86_match_cpu(snc_cpu_ids) || !ci)
+ return 1;
+
+ cpus_read_lock();
+ if (num_online_cpus() != num_present_cpus())
+ pr_warn("Some CPUs offline, SNC detection may be incorrect\n");
+ cpus_read_unlock();
+
+ node0_cpumask = cpumask_of_node(cpu_to_node(0));
+
+ cpus_per_node = cpumask_weight(node0_cpumask);
+ cpus_per_l3 = cpumask_weight(&ci->shared_cpu_map);
+
+ if (!cpus_per_node || !cpus_per_l3)
+ return 1;
+
+ ret = cpus_per_l3 / cpus_per_node;
+
+ /* sanity check: Only valid results are 1, 2, 3, 4, 6 */
+ switch (ret) {
+ case 1:
+ break;
+ case 2 ... 4:
+ case 6:
+ pr_info("Sub-NUMA Cluster mode detected with %d nodes per L3 cache\n", ret);
+ rdt_resources_all[RDT_RESOURCE_L3].r_resctrl.mon_scope = RESCTRL_L3_NODE;
+ break;
+ default:
+ pr_warn("Ignore improbable SNC node count %d\n", ret);
+ ret = 1;
+ break;
+ }
+
+ return ret;
+}
+
+int __init rdt_get_mon_l3_config(struct rdt_resource *r)
+{
+ unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset;
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+ unsigned int threshold;
+ u32 eax, ebx, ecx, edx;
+
+ snc_nodes_per_l3_cache = snc_get_config();
+
+ resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024;
+ hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale / snc_nodes_per_l3_cache;
+ r->mon.num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache;
+ hw_res->mbm_width = MBM_CNTR_WIDTH_BASE;
+
+ if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX)
+ hw_res->mbm_width += mbm_offset;
+ else if (mbm_offset > MBM_CNTR_WIDTH_OFFSET_MAX)
+ pr_warn("Ignoring impossible MBM counter offset\n");
+
+ /*
+ * A reasonable upper limit on the max threshold is the number
+ * of lines tagged per RMID if all RMIDs have the same number of
+ * lines tagged in the LLC.
+ *
+ * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
+ */
+ threshold = resctrl_rmid_realloc_limit / r->mon.num_rmid;
+
+ /*
+ * Because num_rmid may not be a power of two, round the value
+ * to the nearest multiple of hw_res->mon_scale so it matches a
+ * value the hardware will measure. mon_scale may not be a power of 2.
+ */
+ resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold);
+
+ if (rdt_cpu_has(X86_FEATURE_BMEC) || rdt_cpu_has(X86_FEATURE_ABMC)) {
+ /* Detect list of bandwidth sources that can be tracked */
+ cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx);
+ r->mon.mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS;
+ }
+
+ /*
+ * resctrl assumes a system that supports assignable counters can
+ * switch to "default" mode. Ensure that there is a "default" mode
+ * to switch to. This enforces a dependency between the independent
+ * X86_FEATURE_ABMC and X86_FEATURE_CQM_MBM_TOTAL/X86_FEATURE_CQM_MBM_LOCAL
+ * hardware features.
+ */
+ if (rdt_cpu_has(X86_FEATURE_ABMC) &&
+ (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL) ||
+ rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL))) {
+ r->mon.mbm_cntr_assignable = true;
+ cpuid_count(0x80000020, 5, &eax, &ebx, &ecx, &edx);
+ r->mon.num_mbm_cntrs = (ebx & GENMASK(15, 0)) + 1;
+ hw_res->mbm_cntr_assign_enabled = true;
+ }
+
+ r->mon_capable = true;
+
+ return 0;
+}
+
+void __init intel_rdt_mbm_apply_quirk(void)
+{
+ int cf_index;
+
+ cf_index = (boot_cpu_data.x86_cache_max_rmid + 1) / 8 - 1;
+ if (cf_index >= ARRAY_SIZE(mbm_cf_table)) {
+ pr_info("No MBM correction factor available\n");
+ return;
+ }
+
+ mbm_cf_rmidthreshold = mbm_cf_table[cf_index].rmidthreshold;
+ mbm_cf = mbm_cf_table[cf_index].cf;
+}
+
+static void resctrl_abmc_set_one_amd(void *arg)
+{
+ bool *enable = arg;
+
+ if (*enable)
+ msr_set_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT);
+ else
+ msr_clear_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT);
+}
+
+/*
+ * ABMC enable/disable requires update of L3_QOS_EXT_CFG MSR on all the CPUs
+ * associated with all monitor domains.
+ */
+static void _resctrl_abmc_enable(struct rdt_resource *r, bool enable)
+{
+ struct rdt_mon_domain *d;
+
+ lockdep_assert_cpus_held();
+
+ list_for_each_entry(d, &r->mon_domains, hdr.list) {
+ on_each_cpu_mask(&d->hdr.cpu_mask, resctrl_abmc_set_one_amd,
+ &enable, 1);
+ resctrl_arch_reset_rmid_all(r, d);
+ }
+}
+
+int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable)
+{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+
+ if (r->mon.mbm_cntr_assignable &&
+ hw_res->mbm_cntr_assign_enabled != enable) {
+ _resctrl_abmc_enable(r, enable);
+ hw_res->mbm_cntr_assign_enabled = enable;
+ }
+
+ return 0;
+}
+
+bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r)
+{
+ return resctrl_to_arch_res(r)->mbm_cntr_assign_enabled;
+}
+
+static void resctrl_abmc_config_one_amd(void *info)
+{
+ union l3_qos_abmc_cfg *abmc_cfg = info;
+
+ wrmsrl(MSR_IA32_L3_QOS_ABMC_CFG, abmc_cfg->full);
+}
+
+/*
+ * Send an IPI to the domain to assign the counter to RMID, event pair.
+ */
+void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d,
+ enum resctrl_event_id evtid, u32 rmid, u32 closid,
+ u32 cntr_id, bool assign)
+{
+ struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
+ union l3_qos_abmc_cfg abmc_cfg = { 0 };
+ struct arch_mbm_state *am;
+
+ abmc_cfg.split.cfg_en = 1;
+ abmc_cfg.split.cntr_en = assign ? 1 : 0;
+ abmc_cfg.split.cntr_id = cntr_id;
+ abmc_cfg.split.bw_src = rmid;
+ if (assign)
+ abmc_cfg.split.bw_type = resctrl_get_mon_evt_cfg(evtid);
+
+ smp_call_function_any(&d->hdr.cpu_mask, resctrl_abmc_config_one_amd, &abmc_cfg, 1);
+
+ /*
+ * The hardware counter is reset (because cfg_en == 1) so there is no
+ * need to record initial non-zero counts.
+ */
+ am = get_arch_mbm_state(hw_dom, rmid, evtid);
+ if (am)
+ memset(am, 0, sizeof(*am));
+}
+
+void resctrl_arch_mbm_cntr_assign_set_one(struct rdt_resource *r)
+{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+
+ resctrl_abmc_set_one_amd(&hw_res->mbm_cntr_assign_enabled);
+}
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
new file mode 100644
index 000000000000..de580eca3363
--- /dev/null
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
@@ -0,0 +1,517 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Resource Director Technology (RDT)
+ *
+ * Pseudo-locking support built on top of Cache Allocation Technology (CAT)
+ *
+ * Copyright (C) 2018 Intel Corporation
+ *
+ * Author: Reinette Chatre <reinette.chatre@intel.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cacheflush.h>
+#include <linux/cpu.h>
+#include <linux/perf_event.h>
+#include <linux/pm_qos.h>
+#include <linux/resctrl.h>
+
+#include <asm/cpu_device_id.h>
+#include <asm/perf_event.h>
+#include <asm/msr.h>
+
+#include "../../events/perf_event.h" /* For X86_CONFIG() */
+#include "internal.h"
+
+#define CREATE_TRACE_POINTS
+
+#include "pseudo_lock_trace.h"
+
+/*
+ * The bits needed to disable hardware prefetching varies based on the
+ * platform. During initialization we will discover which bits to use.
+ */
+static u64 prefetch_disable_bits;
+
+/**
+ * resctrl_arch_get_prefetch_disable_bits - prefetch disable bits of supported
+ * platforms
+ * @void: It takes no parameters.
+ *
+ * Capture the list of platforms that have been validated to support
+ * pseudo-locking. This includes testing to ensure pseudo-locked regions
+ * with low cache miss rates can be created under variety of load conditions
+ * as well as that these pseudo-locked regions can maintain their low cache
+ * miss rates under variety of load conditions for significant lengths of time.
+ *
+ * After a platform has been validated to support pseudo-locking its
+ * hardware prefetch disable bits are included here as they are documented
+ * in the SDM.
+ *
+ * When adding a platform here also add support for its cache events to
+ * resctrl_arch_measure_l*_residency()
+ *
+ * Return:
+ * If platform is supported, the bits to disable hardware prefetchers, 0
+ * if platform is not supported.
+ */
+u64 resctrl_arch_get_prefetch_disable_bits(void)
+{
+ prefetch_disable_bits = 0;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+ boot_cpu_data.x86 != 6)
+ return 0;
+
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_BROADWELL_X:
+ /*
+ * SDM defines bits of MSR_MISC_FEATURE_CONTROL register
+ * as:
+ * 0 L2 Hardware Prefetcher Disable (R/W)
+ * 1 L2 Adjacent Cache Line Prefetcher Disable (R/W)
+ * 2 DCU Hardware Prefetcher Disable (R/W)
+ * 3 DCU IP Prefetcher Disable (R/W)
+ * 63:4 Reserved
+ */
+ prefetch_disable_bits = 0xF;
+ break;
+ case INTEL_ATOM_GOLDMONT:
+ case INTEL_ATOM_GOLDMONT_PLUS:
+ /*
+ * SDM defines bits of MSR_MISC_FEATURE_CONTROL register
+ * as:
+ * 0 L2 Hardware Prefetcher Disable (R/W)
+ * 1 Reserved
+ * 2 DCU Hardware Prefetcher Disable (R/W)
+ * 63:3 Reserved
+ */
+ prefetch_disable_bits = 0x5;
+ break;
+ }
+
+ return prefetch_disable_bits;
+}
+
+/**
+ * resctrl_arch_pseudo_lock_fn - Load kernel memory into cache
+ * @_plr: the pseudo-lock region descriptor
+ *
+ * This is the core pseudo-locking flow.
+ *
+ * First we ensure that the kernel memory cannot be found in the cache.
+ * Then, while taking care that there will be as little interference as
+ * possible, the memory to be loaded is accessed while core is running
+ * with class of service set to the bitmask of the pseudo-locked region.
+ * After this is complete no future CAT allocations will be allowed to
+ * overlap with this bitmask.
+ *
+ * Local register variables are utilized to ensure that the memory region
+ * to be locked is the only memory access made during the critical locking
+ * loop.
+ *
+ * Return: 0. Waiter on waitqueue will be woken on completion.
+ */
+int resctrl_arch_pseudo_lock_fn(void *_plr)
+{
+ struct pseudo_lock_region *plr = _plr;
+ u32 rmid_p, closid_p;
+ unsigned long i;
+ u64 saved_msr;
+#ifdef CONFIG_KASAN
+ /*
+ * The registers used for local register variables are also used
+ * when KASAN is active. When KASAN is active we use a regular
+ * variable to ensure we always use a valid pointer, but the cost
+ * is that this variable will enter the cache through evicting the
+ * memory we are trying to lock into the cache. Thus expect lower
+ * pseudo-locking success rate when KASAN is active.
+ */
+ unsigned int line_size;
+ unsigned int size;
+ void *mem_r;
+#else
+ register unsigned int line_size asm("esi");
+ register unsigned int size asm("edi");
+ register void *mem_r asm(_ASM_BX);
+#endif /* CONFIG_KASAN */
+
+ /*
+ * Make sure none of the allocated memory is cached. If it is we
+ * will get a cache hit in below loop from outside of pseudo-locked
+ * region.
+ * wbinvd (as opposed to clflush/clflushopt) is required to
+ * increase likelihood that allocated cache portion will be filled
+ * with associated memory.
+ */
+ wbinvd();
+
+ /*
+ * Always called with interrupts enabled. By disabling interrupts
+ * ensure that we will not be preempted during this critical section.
+ */
+ local_irq_disable();
+
+ /*
+ * Call wrmsr and rdmsr as directly as possible to avoid tracing
+ * clobbering local register variables or affecting cache accesses.
+ *
+ * Disable the hardware prefetcher so that when the end of the memory
+ * being pseudo-locked is reached the hardware will not read beyond
+ * the buffer and evict pseudo-locked memory read earlier from the
+ * cache.
+ */
+ saved_msr = native_rdmsrq(MSR_MISC_FEATURE_CONTROL);
+ native_wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits);
+ closid_p = this_cpu_read(pqr_state.cur_closid);
+ rmid_p = this_cpu_read(pqr_state.cur_rmid);
+ mem_r = plr->kmem;
+ size = plr->size;
+ line_size = plr->line_size;
+ /*
+ * Critical section begin: start by writing the closid associated
+ * with the capacity bitmask of the cache region being
+ * pseudo-locked followed by reading of kernel memory to load it
+ * into the cache.
+ */
+ native_wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, plr->closid);
+
+ /*
+ * Cache was flushed earlier. Now access kernel memory to read it
+ * into cache region associated with just activated plr->closid.
+ * Loop over data twice:
+ * - In first loop the cache region is shared with the page walker
+ * as it populates the paging structure caches (including TLB).
+ * - In the second loop the paging structure caches are used and
+ * cache region is populated with the memory being referenced.
+ */
+ for (i = 0; i < size; i += PAGE_SIZE) {
+ /*
+ * Add a barrier to prevent speculative execution of this
+ * loop reading beyond the end of the buffer.
+ */
+ rmb();
+ asm volatile("mov (%0,%1,1), %%eax\n\t"
+ :
+ : "r" (mem_r), "r" (i)
+ : "%eax", "memory");
+ }
+ for (i = 0; i < size; i += line_size) {
+ /*
+ * Add a barrier to prevent speculative execution of this
+ * loop reading beyond the end of the buffer.
+ */
+ rmb();
+ asm volatile("mov (%0,%1,1), %%eax\n\t"
+ :
+ : "r" (mem_r), "r" (i)
+ : "%eax", "memory");
+ }
+ /*
+ * Critical section end: restore closid with capacity bitmask that
+ * does not overlap with pseudo-locked region.
+ */
+ native_wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, closid_p);
+
+ /* Re-enable the hardware prefetcher(s) */
+ wrmsrq(MSR_MISC_FEATURE_CONTROL, saved_msr);
+ local_irq_enable();
+
+ plr->thread_done = 1;
+ wake_up_interruptible(&plr->lock_thread_wq);
+ return 0;
+}
+
+/**
+ * resctrl_arch_measure_cycles_lat_fn - Measure cycle latency to read
+ * pseudo-locked memory
+ * @_plr: pseudo-lock region to measure
+ *
+ * There is no deterministic way to test if a memory region is cached. One
+ * way is to measure how long it takes to read the memory, the speed of
+ * access is a good way to learn how close to the cpu the data was. Even
+ * more, if the prefetcher is disabled and the memory is read at a stride
+ * of half the cache line, then a cache miss will be easy to spot since the
+ * read of the first half would be significantly slower than the read of
+ * the second half.
+ *
+ * Return: 0. Waiter on waitqueue will be woken on completion.
+ */
+int resctrl_arch_measure_cycles_lat_fn(void *_plr)
+{
+ struct pseudo_lock_region *plr = _plr;
+ u32 saved_low, saved_high;
+ unsigned long i;
+ u64 start, end;
+ void *mem_r;
+
+ local_irq_disable();
+ /*
+ * Disable hardware prefetchers.
+ */
+ rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
+ wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits);
+ mem_r = READ_ONCE(plr->kmem);
+ /*
+ * Dummy execute of the time measurement to load the needed
+ * instructions into the L1 instruction cache.
+ */
+ start = rdtsc_ordered();
+ for (i = 0; i < plr->size; i += 32) {
+ start = rdtsc_ordered();
+ asm volatile("mov (%0,%1,1), %%eax\n\t"
+ :
+ : "r" (mem_r), "r" (i)
+ : "%eax", "memory");
+ end = rdtsc_ordered();
+ trace_pseudo_lock_mem_latency((u32)(end - start));
+ }
+ wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
+ local_irq_enable();
+ plr->thread_done = 1;
+ wake_up_interruptible(&plr->lock_thread_wq);
+ return 0;
+}
+
+/*
+ * Create a perf_event_attr for the hit and miss perf events that will
+ * be used during the performance measurement. A perf_event maintains
+ * a pointer to its perf_event_attr so a unique attribute structure is
+ * created for each perf_event.
+ *
+ * The actual configuration of the event is set right before use in order
+ * to use the X86_CONFIG macro.
+ */
+static struct perf_event_attr perf_miss_attr = {
+ .type = PERF_TYPE_RAW,
+ .size = sizeof(struct perf_event_attr),
+ .pinned = 1,
+ .disabled = 0,
+ .exclude_user = 1,
+};
+
+static struct perf_event_attr perf_hit_attr = {
+ .type = PERF_TYPE_RAW,
+ .size = sizeof(struct perf_event_attr),
+ .pinned = 1,
+ .disabled = 0,
+ .exclude_user = 1,
+};
+
+struct residency_counts {
+ u64 miss_before, hits_before;
+ u64 miss_after, hits_after;
+};
+
+static int measure_residency_fn(struct perf_event_attr *miss_attr,
+ struct perf_event_attr *hit_attr,
+ struct pseudo_lock_region *plr,
+ struct residency_counts *counts)
+{
+ u64 hits_before = 0, hits_after = 0, miss_before = 0, miss_after = 0;
+ struct perf_event *miss_event, *hit_event;
+ int hit_pmcnum, miss_pmcnum;
+ u32 saved_low, saved_high;
+ unsigned int line_size;
+ unsigned int size;
+ unsigned long i;
+ void *mem_r;
+ u64 tmp;
+
+ miss_event = perf_event_create_kernel_counter(miss_attr, plr->cpu,
+ NULL, NULL, NULL);
+ if (IS_ERR(miss_event))
+ goto out;
+
+ hit_event = perf_event_create_kernel_counter(hit_attr, plr->cpu,
+ NULL, NULL, NULL);
+ if (IS_ERR(hit_event))
+ goto out_miss;
+
+ local_irq_disable();
+ /*
+ * Check any possible error state of events used by performing
+ * one local read.
+ */
+ if (perf_event_read_local(miss_event, &tmp, NULL, NULL)) {
+ local_irq_enable();
+ goto out_hit;
+ }
+ if (perf_event_read_local(hit_event, &tmp, NULL, NULL)) {
+ local_irq_enable();
+ goto out_hit;
+ }
+
+ /*
+ * Disable hardware prefetchers.
+ */
+ rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
+ wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits);
+
+ /* Initialize rest of local variables */
+ /*
+ * Performance event has been validated right before this with
+ * interrupts disabled - it is thus safe to read the counter index.
+ */
+ miss_pmcnum = x86_perf_rdpmc_index(miss_event);
+ hit_pmcnum = x86_perf_rdpmc_index(hit_event);
+ line_size = READ_ONCE(plr->line_size);
+ mem_r = READ_ONCE(plr->kmem);
+ size = READ_ONCE(plr->size);
+
+ /*
+ * Read counter variables twice - first to load the instructions
+ * used in L1 cache, second to capture accurate value that does not
+ * include cache misses incurred because of instruction loads.
+ */
+ hits_before = rdpmc(hit_pmcnum);
+ miss_before = rdpmc(miss_pmcnum);
+ /*
+ * From SDM: Performing back-to-back fast reads are not guaranteed
+ * to be monotonic.
+ * Use LFENCE to ensure all previous instructions are retired
+ * before proceeding.
+ */
+ rmb();
+ hits_before = rdpmc(hit_pmcnum);
+ miss_before = rdpmc(miss_pmcnum);
+ /*
+ * Use LFENCE to ensure all previous instructions are retired
+ * before proceeding.
+ */
+ rmb();
+ for (i = 0; i < size; i += line_size) {
+ /*
+ * Add a barrier to prevent speculative execution of this
+ * loop reading beyond the end of the buffer.
+ */
+ rmb();
+ asm volatile("mov (%0,%1,1), %%eax\n\t"
+ :
+ : "r" (mem_r), "r" (i)
+ : "%eax", "memory");
+ }
+ /*
+ * Use LFENCE to ensure all previous instructions are retired
+ * before proceeding.
+ */
+ rmb();
+ hits_after = rdpmc(hit_pmcnum);
+ miss_after = rdpmc(miss_pmcnum);
+ /*
+ * Use LFENCE to ensure all previous instructions are retired
+ * before proceeding.
+ */
+ rmb();
+ /* Re-enable hardware prefetchers */
+ wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
+ local_irq_enable();
+out_hit:
+ perf_event_release_kernel(hit_event);
+out_miss:
+ perf_event_release_kernel(miss_event);
+out:
+ /*
+ * All counts will be zero on failure.
+ */
+ counts->miss_before = miss_before;
+ counts->hits_before = hits_before;
+ counts->miss_after = miss_after;
+ counts->hits_after = hits_after;
+ return 0;
+}
+
+int resctrl_arch_measure_l2_residency(void *_plr)
+{
+ struct pseudo_lock_region *plr = _plr;
+ struct residency_counts counts = {0};
+
+ /*
+ * Non-architectural event for the Goldmont Microarchitecture
+ * from Intel x86 Architecture Software Developer Manual (SDM):
+ * MEM_LOAD_UOPS_RETIRED D1H (event number)
+ * Umask values:
+ * L2_HIT 02H
+ * L2_MISS 10H
+ */
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_ATOM_GOLDMONT:
+ case INTEL_ATOM_GOLDMONT_PLUS:
+ perf_miss_attr.config = X86_CONFIG(.event = 0xd1,
+ .umask = 0x10);
+ perf_hit_attr.config = X86_CONFIG(.event = 0xd1,
+ .umask = 0x2);
+ break;
+ default:
+ goto out;
+ }
+
+ measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts);
+ /*
+ * If a failure prevented the measurements from succeeding
+ * tracepoints will still be written and all counts will be zero.
+ */
+ trace_pseudo_lock_l2(counts.hits_after - counts.hits_before,
+ counts.miss_after - counts.miss_before);
+out:
+ plr->thread_done = 1;
+ wake_up_interruptible(&plr->lock_thread_wq);
+ return 0;
+}
+
+int resctrl_arch_measure_l3_residency(void *_plr)
+{
+ struct pseudo_lock_region *plr = _plr;
+ struct residency_counts counts = {0};
+
+ /*
+ * On Broadwell Microarchitecture the MEM_LOAD_UOPS_RETIRED event
+ * has two "no fix" errata associated with it: BDM35 and BDM100. On
+ * this platform the following events are used instead:
+ * LONGEST_LAT_CACHE 2EH (Documented in SDM)
+ * REFERENCE 4FH
+ * MISS 41H
+ */
+
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_BROADWELL_X:
+ /* On BDW the hit event counts references, not hits */
+ perf_hit_attr.config = X86_CONFIG(.event = 0x2e,
+ .umask = 0x4f);
+ perf_miss_attr.config = X86_CONFIG(.event = 0x2e,
+ .umask = 0x41);
+ break;
+ default:
+ goto out;
+ }
+
+ measure_residency_fn(&perf_miss_attr, &perf_hit_attr, plr, &counts);
+ /*
+ * If a failure prevented the measurements from succeeding
+ * tracepoints will still be written and all counts will be zero.
+ */
+
+ counts.miss_after -= counts.miss_before;
+ if (boot_cpu_data.x86_vfm == INTEL_BROADWELL_X) {
+ /*
+ * On BDW references and misses are counted, need to adjust.
+ * Sometimes the "hits" counter is a bit more than the
+ * references, for example, x references but x + 1 hits.
+ * To not report invalid hit values in this case we treat
+ * that as misses equal to references.
+ */
+ /* First compute the number of cache references measured */
+ counts.hits_after -= counts.hits_before;
+ /* Next convert references to cache hits */
+ counts.hits_after -= min(counts.miss_after, counts.hits_after);
+ } else {
+ counts.hits_after -= counts.hits_before;
+ }
+
+ trace_pseudo_lock_l3(counts.hits_after, counts.miss_after);
+out:
+ plr->thread_done = 1;
+ wake_up_interruptible(&plr->lock_thread_wq);
+ return 0;
+}
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock_trace.h b/arch/x86/kernel/cpu/resctrl/pseudo_lock_trace.h
new file mode 100644
index 000000000000..7c8aef08010f
--- /dev/null
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock_trace.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM resctrl
+
+#if !defined(_X86_RESCTRL_PSEUDO_LOCK_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _X86_RESCTRL_PSEUDO_LOCK_TRACE_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(pseudo_lock_mem_latency,
+ TP_PROTO(u32 latency),
+ TP_ARGS(latency),
+ TP_STRUCT__entry(__field(u32, latency)),
+ TP_fast_assign(__entry->latency = latency),
+ TP_printk("latency=%u", __entry->latency)
+ );
+
+TRACE_EVENT(pseudo_lock_l2,
+ TP_PROTO(u64 l2_hits, u64 l2_miss),
+ TP_ARGS(l2_hits, l2_miss),
+ TP_STRUCT__entry(__field(u64, l2_hits)
+ __field(u64, l2_miss)),
+ TP_fast_assign(__entry->l2_hits = l2_hits;
+ __entry->l2_miss = l2_miss;),
+ TP_printk("hits=%llu miss=%llu",
+ __entry->l2_hits, __entry->l2_miss));
+
+TRACE_EVENT(pseudo_lock_l3,
+ TP_PROTO(u64 l3_hits, u64 l3_miss),
+ TP_ARGS(l3_hits, l3_miss),
+ TP_STRUCT__entry(__field(u64, l3_hits)
+ __field(u64, l3_miss)),
+ TP_fast_assign(__entry->l3_hits = l3_hits;
+ __entry->l3_miss = l3_miss;),
+ TP_printk("hits=%llu miss=%llu",
+ __entry->l3_hits, __entry->l3_miss));
+
+#endif /* _X86_RESCTRL_PSEUDO_LOCK_TRACE_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+
+#define TRACE_INCLUDE_FILE pseudo_lock_trace
+
+#include <trace/define_trace.h>
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
new file mode 100644
index 000000000000..885026468440
--- /dev/null
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -0,0 +1,262 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * User interface for Resource Allocation in Resource Director Technology(RDT)
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Author: Fenghua Yu <fenghua.yu@intel.com>
+ *
+ * More information about RDT be found in the Intel (R) x86 Architecture
+ * Software Developer Manual.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cpu.h>
+#include <linux/debugfs.h>
+#include <linux/fs.h>
+#include <linux/fs_parser.h>
+#include <linux/sysfs.h>
+#include <linux/kernfs.h>
+#include <linux/resctrl.h>
+#include <linux/seq_buf.h>
+#include <linux/seq_file.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
+#include <linux/slab.h>
+#include <linux/task_work.h>
+#include <linux/user_namespace.h>
+
+#include <uapi/linux/magic.h>
+
+#include <asm/msr.h>
+#include "internal.h"
+
+DEFINE_STATIC_KEY_FALSE(rdt_enable_key);
+
+DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key);
+
+DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
+
+/*
+ * This is safe against resctrl_arch_sched_in() called from __switch_to()
+ * because __switch_to() is executed with interrupts disabled. A local call
+ * from update_closid_rmid() is protected against __switch_to() because
+ * preemption is disabled.
+ */
+void resctrl_arch_sync_cpu_closid_rmid(void *info)
+{
+ struct resctrl_cpu_defaults *r = info;
+
+ if (r) {
+ this_cpu_write(pqr_state.default_closid, r->closid);
+ this_cpu_write(pqr_state.default_rmid, r->rmid);
+ }
+
+ /*
+ * We cannot unconditionally write the MSR because the current
+ * executing task might have its own closid selected. Just reuse
+ * the context switch code.
+ */
+ resctrl_arch_sched_in(current);
+}
+
+#define INVALID_CONFIG_INDEX UINT_MAX
+
+/**
+ * mon_event_config_index_get - get the hardware index for the
+ * configurable event
+ * @evtid: event id.
+ *
+ * Return: 0 for evtid == QOS_L3_MBM_TOTAL_EVENT_ID
+ * 1 for evtid == QOS_L3_MBM_LOCAL_EVENT_ID
+ * INVALID_CONFIG_INDEX for invalid evtid
+ */
+static inline unsigned int mon_event_config_index_get(u32 evtid)
+{
+ switch (evtid) {
+ case QOS_L3_MBM_TOTAL_EVENT_ID:
+ return 0;
+ case QOS_L3_MBM_LOCAL_EVENT_ID:
+ return 1;
+ default:
+ /* Should never reach here */
+ return INVALID_CONFIG_INDEX;
+ }
+}
+
+void resctrl_arch_mon_event_config_read(void *_config_info)
+{
+ struct resctrl_mon_config_info *config_info = _config_info;
+ unsigned int index;
+ u64 msrval;
+
+ index = mon_event_config_index_get(config_info->evtid);
+ if (index == INVALID_CONFIG_INDEX) {
+ pr_warn_once("Invalid event id %d\n", config_info->evtid);
+ return;
+ }
+ rdmsrq(MSR_IA32_EVT_CFG_BASE + index, msrval);
+
+ /* Report only the valid event configuration bits */
+ config_info->mon_config = msrval & MAX_EVT_CONFIG_BITS;
+}
+
+void resctrl_arch_mon_event_config_write(void *_config_info)
+{
+ struct resctrl_mon_config_info *config_info = _config_info;
+ unsigned int index;
+
+ index = mon_event_config_index_get(config_info->evtid);
+ if (index == INVALID_CONFIG_INDEX) {
+ pr_warn_once("Invalid event id %d\n", config_info->evtid);
+ return;
+ }
+ wrmsrq(MSR_IA32_EVT_CFG_BASE + index, config_info->mon_config);
+}
+
+static void l3_qos_cfg_update(void *arg)
+{
+ bool *enable = arg;
+
+ wrmsrq(MSR_IA32_L3_QOS_CFG, *enable ? L3_QOS_CDP_ENABLE : 0ULL);
+}
+
+static void l2_qos_cfg_update(void *arg)
+{
+ bool *enable = arg;
+
+ wrmsrq(MSR_IA32_L2_QOS_CFG, *enable ? L2_QOS_CDP_ENABLE : 0ULL);
+}
+
+static int set_cache_qos_cfg(int level, bool enable)
+{
+ void (*update)(void *arg);
+ struct rdt_ctrl_domain *d;
+ struct rdt_resource *r_l;
+ cpumask_var_t cpu_mask;
+ int cpu;
+
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
+ if (level == RDT_RESOURCE_L3)
+ update = l3_qos_cfg_update;
+ else if (level == RDT_RESOURCE_L2)
+ update = l2_qos_cfg_update;
+ else
+ return -EINVAL;
+
+ if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ r_l = &rdt_resources_all[level].r_resctrl;
+ list_for_each_entry(d, &r_l->ctrl_domains, hdr.list) {
+ if (r_l->cache.arch_has_per_cpu_cfg)
+ /* Pick all the CPUs in the domain instance */
+ for_each_cpu(cpu, &d->hdr.cpu_mask)
+ cpumask_set_cpu(cpu, cpu_mask);
+ else
+ /* Pick one CPU from each domain instance to update MSR */
+ cpumask_set_cpu(cpumask_any(&d->hdr.cpu_mask), cpu_mask);
+ }
+
+ /* Update QOS_CFG MSR on all the CPUs in cpu_mask */
+ on_each_cpu_mask(cpu_mask, update, &enable, 1);
+
+ free_cpumask_var(cpu_mask);
+
+ return 0;
+}
+
+/* Restore the qos cfg state when a domain comes online */
+void rdt_domain_reconfigure_cdp(struct rdt_resource *r)
+{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+
+ if (!r->cdp_capable)
+ return;
+
+ if (r->rid == RDT_RESOURCE_L2)
+ l2_qos_cfg_update(&hw_res->cdp_enabled);
+
+ if (r->rid == RDT_RESOURCE_L3)
+ l3_qos_cfg_update(&hw_res->cdp_enabled);
+}
+
+static int cdp_enable(int level)
+{
+ struct rdt_resource *r_l = &rdt_resources_all[level].r_resctrl;
+ int ret;
+
+ if (!r_l->alloc_capable)
+ return -EINVAL;
+
+ ret = set_cache_qos_cfg(level, true);
+ if (!ret)
+ rdt_resources_all[level].cdp_enabled = true;
+
+ return ret;
+}
+
+static void cdp_disable(int level)
+{
+ struct rdt_hw_resource *r_hw = &rdt_resources_all[level];
+
+ if (r_hw->cdp_enabled) {
+ set_cache_qos_cfg(level, false);
+ r_hw->cdp_enabled = false;
+ }
+}
+
+int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable)
+{
+ struct rdt_hw_resource *hw_res = &rdt_resources_all[l];
+
+ if (!hw_res->r_resctrl.cdp_capable)
+ return -EINVAL;
+
+ if (enable)
+ return cdp_enable(l);
+
+ cdp_disable(l);
+
+ return 0;
+}
+
+bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l)
+{
+ return rdt_resources_all[l].cdp_enabled;
+}
+
+void resctrl_arch_reset_all_ctrls(struct rdt_resource *r)
+{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+ struct rdt_hw_ctrl_domain *hw_dom;
+ struct msr_param msr_param;
+ struct rdt_ctrl_domain *d;
+ int i;
+
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
+ msr_param.res = r;
+ msr_param.low = 0;
+ msr_param.high = hw_res->num_closid;
+
+ /*
+ * Disable resource control for this resource by setting all
+ * CBMs in all ctrl_domains to the maximum mask value. Pick one CPU
+ * from each domain to update the MSRs below.
+ */
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
+ hw_dom = resctrl_to_arch_ctrl_dom(d);
+
+ for (i = 0; i < hw_res->num_closid; i++)
+ hw_dom->ctrl_val[i] = resctrl_get_default_ctrl(r);
+ msr_param.dom = d;
+ smp_call_function_any(&d->hdr.cpu_mask, rdt_ctrl_update, &msr_param, 1);
+ }
+
+ return;
+}
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index d92b5dad15dd..42c7eac0c387 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -1,13 +1,14 @@
/*
- * Routines to indentify additional cpu features that are scattered in
+ * Routines to identify additional cpu features that are scattered in
* cpuid space.
*/
#include <linux/cpu.h>
-#include <asm/pat.h>
+#include <asm/memtype.h>
+#include <asm/apic.h>
#include <asm/processor.h>
-#include <asm/apic.h>
+#include "cpu.h"
struct cpuid_bit {
u16 feature;
@@ -17,44 +18,63 @@ struct cpuid_bit {
u32 sub_leaf;
};
-enum cpuid_regs {
- CR_EAX = 0,
- CR_ECX,
- CR_EDX,
- CR_EBX
+/*
+ * Please keep the leaf sorted by cpuid_bit.level for faster search.
+ * X86_FEATURE_MBA is supported by both Intel and AMD. But the CPUID
+ * levels are different and there is a separate entry for each.
+ */
+static const struct cpuid_bit cpuid_bits[] = {
+ { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 },
+ { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 },
+ { X86_FEATURE_INTEL_PPIN, CPUID_EBX, 0, 0x00000007, 1 },
+ { X86_FEATURE_MSR_IMM, CPUID_ECX, 5, 0x00000007, 1 },
+ { X86_FEATURE_APX, CPUID_EDX, 21, 0x00000007, 1 },
+ { X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 },
+ { X86_FEATURE_BHI_CTRL, CPUID_EDX, 4, 0x00000007, 2 },
+ { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 },
+ { X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 },
+ { X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 },
+ { X86_FEATURE_CQM_MBM_LOCAL, CPUID_EDX, 2, 0x0000000f, 1 },
+ { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 },
+ { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 },
+ { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 },
+ { X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 },
+ { X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 },
+ { X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 },
+ { X86_FEATURE_SGX1, CPUID_EAX, 0, 0x00000012, 0 },
+ { X86_FEATURE_SGX2, CPUID_EAX, 1, 0x00000012, 0 },
+ { X86_FEATURE_SGX_EUPDATESVN, CPUID_EAX, 10, 0x00000012, 0 },
+ { X86_FEATURE_SGX_EDECCSSA, CPUID_EAX, 11, 0x00000012, 0 },
+ { X86_FEATURE_OVERFLOW_RECOV, CPUID_EBX, 0, 0x80000007, 0 },
+ { X86_FEATURE_SUCCOR, CPUID_EBX, 1, 0x80000007, 0 },
+ { X86_FEATURE_SMCA, CPUID_EBX, 3, 0x80000007, 0 },
+ { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 },
+ { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
+ { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
+ { X86_FEATURE_AMD_FAST_CPPC, CPUID_EDX, 15, 0x80000007, 0 },
+ { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 },
+ { X86_FEATURE_X2AVIC_EXT, CPUID_ECX, 6, 0x8000000a, 0 },
+ { X86_FEATURE_COHERENCY_SFW_NO, CPUID_EBX, 31, 0x8000001f, 0 },
+ { X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 },
+ { X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 },
+ { X86_FEATURE_ABMC, CPUID_EBX, 5, 0x80000020, 0 },
+ { X86_FEATURE_SDCIAE, CPUID_EBX, 6, 0x80000020, 0 },
+ { X86_FEATURE_TSA_SQ_NO, CPUID_ECX, 1, 0x80000021, 0 },
+ { X86_FEATURE_TSA_L1_NO, CPUID_ECX, 2, 0x80000021, 0 },
+ { X86_FEATURE_AMD_WORKLOAD_CLASS, CPUID_EAX, 22, 0x80000021, 0 },
+ { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 },
+ { X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 },
+ { X86_FEATURE_AMD_LBR_PMC_FREEZE, CPUID_EAX, 2, 0x80000022, 0 },
+ { X86_FEATURE_AMD_HTR_CORES, CPUID_EAX, 30, 0x80000026, 0 },
+ { 0, 0, 0, 0, 0 }
};
-void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
+void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
{
u32 max_level;
u32 regs[4];
const struct cpuid_bit *cb;
- static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
- { X86_FEATURE_DTHERM, CR_EAX, 0, 0x00000006, 0 },
- { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 },
- { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 },
- { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 },
- { X86_FEATURE_PTS, CR_EAX, 6, 0x00000006, 0 },
- { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 },
- { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 },
- { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 },
- { X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 },
- { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 },
- { X86_FEATURE_PROC_FEEDBACK, CR_EDX,11, 0x80000007, 0 },
- { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 },
- { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 },
- { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 },
- { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 },
- { X86_FEATURE_TSCRATEMSR, CR_EDX, 4, 0x8000000a, 0 },
- { X86_FEATURE_VMCBCLEAN, CR_EDX, 5, 0x8000000a, 0 },
- { X86_FEATURE_FLUSHBYASID, CR_EDX, 6, 0x8000000a, 0 },
- { X86_FEATURE_DECODEASSISTS, CR_EDX, 7, 0x8000000a, 0 },
- { X86_FEATURE_PAUSEFILTER, CR_EDX,10, 0x8000000a, 0 },
- { X86_FEATURE_PFTHRESHOLD, CR_EDX,12, 0x8000000a, 0 },
- { 0, 0, 0, 0, 0 }
- };
-
for (cb = cpuid_bits; cb->feature; cb++) {
/* Verify that the level is valid */
@@ -63,8 +83,9 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
max_level > (cb->level | 0xffff))
continue;
- cpuid_count(cb->level, cb->sub_leaf, &regs[CR_EAX],
- &regs[CR_EBX], &regs[CR_ECX], &regs[CR_EDX]);
+ cpuid_count(cb->level, cb->sub_leaf, &regs[CPUID_EAX],
+ &regs[CPUID_EBX], &regs[CPUID_ECX],
+ &regs[CPUID_EDX]);
if (regs[cb->reg] & (1 << cb->bit))
set_cpu_cap(c, cb->feature);
diff --git a/arch/x86/kernel/cpu/sgx/Makefile b/arch/x86/kernel/cpu/sgx/Makefile
new file mode 100644
index 000000000000..9c1656779b2a
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/Makefile
@@ -0,0 +1,6 @@
+obj-y += \
+ driver.o \
+ encl.o \
+ ioctl.o \
+ main.o
+obj-$(CONFIG_X86_SGX_KVM) += virt.o
diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c
new file mode 100644
index 000000000000..a42c7180900b
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/driver.c
@@ -0,0 +1,201 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2016-20 Intel Corporation. */
+
+#include <linux/acpi.h>
+#include <linux/miscdevice.h>
+#include <linux/mman.h>
+#include <linux/security.h>
+#include <linux/suspend.h>
+#include <asm/traps.h>
+#include "driver.h"
+#include "encl.h"
+
+u64 sgx_attributes_reserved_mask;
+u64 sgx_xfrm_reserved_mask = ~0x3;
+u32 sgx_misc_reserved_mask;
+
+static int __sgx_open(struct inode *inode, struct file *file)
+{
+ struct sgx_encl *encl;
+ int ret;
+
+ encl = kzalloc(sizeof(*encl), GFP_KERNEL);
+ if (!encl)
+ return -ENOMEM;
+
+ kref_init(&encl->refcount);
+ xa_init(&encl->page_array);
+ mutex_init(&encl->lock);
+ INIT_LIST_HEAD(&encl->va_pages);
+ INIT_LIST_HEAD(&encl->mm_list);
+ spin_lock_init(&encl->mm_lock);
+
+ ret = init_srcu_struct(&encl->srcu);
+ if (ret) {
+ kfree(encl);
+ return ret;
+ }
+
+ file->private_data = encl;
+
+ return 0;
+}
+
+static int sgx_open(struct inode *inode, struct file *file)
+{
+ int ret;
+
+ ret = sgx_inc_usage_count();
+ if (ret)
+ return ret;
+
+ ret = __sgx_open(inode, file);
+ if (ret) {
+ sgx_dec_usage_count();
+ return ret;
+ }
+
+ return 0;
+}
+
+static int sgx_release(struct inode *inode, struct file *file)
+{
+ struct sgx_encl *encl = file->private_data;
+ struct sgx_encl_mm *encl_mm;
+
+ /*
+ * Drain the remaining mm_list entries. At this point the list contains
+ * entries for processes, which have closed the enclave file but have
+ * not exited yet. The processes, which have exited, are gone from the
+ * list by sgx_mmu_notifier_release().
+ */
+ for ( ; ; ) {
+ spin_lock(&encl->mm_lock);
+
+ if (list_empty(&encl->mm_list)) {
+ encl_mm = NULL;
+ } else {
+ encl_mm = list_first_entry(&encl->mm_list,
+ struct sgx_encl_mm, list);
+ list_del_rcu(&encl_mm->list);
+ }
+
+ spin_unlock(&encl->mm_lock);
+
+ /* The enclave is no longer mapped by any mm. */
+ if (!encl_mm)
+ break;
+
+ synchronize_srcu(&encl->srcu);
+ mmu_notifier_unregister(&encl_mm->mmu_notifier, encl_mm->mm);
+ kfree(encl_mm);
+
+ /* 'encl_mm' is gone, put encl_mm->encl reference: */
+ kref_put(&encl->refcount, sgx_encl_release);
+ }
+
+ kref_put(&encl->refcount, sgx_encl_release);
+ return 0;
+}
+
+static int sgx_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct sgx_encl *encl = file->private_data;
+ int ret;
+
+ ret = sgx_encl_may_map(encl, vma->vm_start, vma->vm_end, vma->vm_flags);
+ if (ret)
+ return ret;
+
+ ret = sgx_encl_mm_add(encl, vma->vm_mm);
+ if (ret)
+ return ret;
+
+ vma->vm_ops = &sgx_vm_ops;
+ vm_flags_set(vma, VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO);
+ vma->vm_private_data = encl;
+
+ return 0;
+}
+
+static unsigned long sgx_get_unmapped_area(struct file *file,
+ unsigned long addr,
+ unsigned long len,
+ unsigned long pgoff,
+ unsigned long flags)
+{
+ if ((flags & MAP_TYPE) == MAP_PRIVATE)
+ return -EINVAL;
+
+ if (flags & MAP_FIXED)
+ return addr;
+
+ return mm_get_unmapped_area(file, addr, len, pgoff, flags);
+}
+
+#ifdef CONFIG_COMPAT
+static long sgx_compat_ioctl(struct file *filep, unsigned int cmd,
+ unsigned long arg)
+{
+ return sgx_ioctl(filep, cmd, arg);
+}
+#endif
+
+static const struct file_operations sgx_encl_fops = {
+ .owner = THIS_MODULE,
+ .open = sgx_open,
+ .release = sgx_release,
+ .unlocked_ioctl = sgx_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = sgx_compat_ioctl,
+#endif
+ .mmap = sgx_mmap,
+ .get_unmapped_area = sgx_get_unmapped_area,
+};
+
+static struct miscdevice sgx_dev_enclave = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "sgx_enclave",
+ .nodename = "sgx_enclave",
+ .fops = &sgx_encl_fops,
+};
+
+int __init sgx_drv_init(void)
+{
+ unsigned int eax, ebx, ecx, edx;
+ u64 attr_mask;
+ u64 xfrm_mask;
+ int ret;
+
+ if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) {
+ pr_info("SGX disabled: SGX launch control CPU feature is not available, /dev/sgx_enclave disabled.\n");
+ return -ENODEV;
+ }
+
+ cpuid_count(SGX_CPUID, 0, &eax, &ebx, &ecx, &edx);
+
+ if (!(eax & 1)) {
+ pr_info("SGX disabled: SGX1 instruction support not available, /dev/sgx_enclave disabled.\n");
+ return -ENODEV;
+ }
+
+ sgx_misc_reserved_mask = ~ebx | SGX_MISC_RESERVED_MASK;
+
+ cpuid_count(SGX_CPUID, 1, &eax, &ebx, &ecx, &edx);
+
+ attr_mask = (((u64)ebx) << 32) + (u64)eax;
+ sgx_attributes_reserved_mask = ~attr_mask | SGX_ATTR_RESERVED_MASK;
+
+ if (cpu_feature_enabled(X86_FEATURE_OSXSAVE)) {
+ xfrm_mask = (((u64)edx) << 32) + (u64)ecx;
+ sgx_xfrm_reserved_mask = ~xfrm_mask;
+ }
+
+ ret = misc_register(&sgx_dev_enclave);
+ if (ret) {
+ pr_info("SGX disabled: Unable to register the /dev/sgx_enclave driver (%d).\n", ret);
+ return ret;
+ }
+
+ return 0;
+}
diff --git a/arch/x86/kernel/cpu/sgx/driver.h b/arch/x86/kernel/cpu/sgx/driver.h
new file mode 100644
index 000000000000..30f39f92c98f
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/driver.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ARCH_SGX_DRIVER_H__
+#define __ARCH_SGX_DRIVER_H__
+
+#include <linux/kref.h>
+#include <linux/mmu_notifier.h>
+#include <linux/radix-tree.h>
+#include <linux/rwsem.h>
+#include <linux/sched.h>
+#include <linux/workqueue.h>
+#include <uapi/asm/sgx.h>
+#include "sgx.h"
+
+#define SGX_EINIT_SPIN_COUNT 20
+#define SGX_EINIT_SLEEP_COUNT 50
+#define SGX_EINIT_SLEEP_TIME 20
+
+extern u64 sgx_attributes_reserved_mask;
+extern u64 sgx_xfrm_reserved_mask;
+extern u32 sgx_misc_reserved_mask;
+
+extern const struct file_operations sgx_provision_fops;
+
+long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
+
+int sgx_drv_init(void);
+
+#endif /* __ARCH_X86_SGX_DRIVER_H__ */
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
new file mode 100644
index 000000000000..cf149b9f4916
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -0,0 +1,1326 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2016-20 Intel Corporation. */
+
+#include <linux/lockdep.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/shmem_fs.h>
+#include <linux/suspend.h>
+#include <linux/sched/mm.h>
+#include <asm/sgx.h>
+#include "encl.h"
+#include "encls.h"
+#include "sgx.h"
+
+static int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index,
+ struct sgx_backing *backing);
+
+#define PCMDS_PER_PAGE (PAGE_SIZE / sizeof(struct sgx_pcmd))
+/*
+ * 32 PCMD entries share a PCMD page. PCMD_FIRST_MASK is used to
+ * determine the page index associated with the first PCMD entry
+ * within a PCMD page.
+ */
+#define PCMD_FIRST_MASK GENMASK(4, 0)
+
+/**
+ * reclaimer_writing_to_pcmd() - Query if any enclave page associated with
+ * a PCMD page is in process of being reclaimed.
+ * @encl: Enclave to which PCMD page belongs
+ * @start_addr: Address of enclave page using first entry within the PCMD page
+ *
+ * When an enclave page is reclaimed some Paging Crypto MetaData (PCMD) is
+ * stored. The PCMD data of a reclaimed enclave page contains enough
+ * information for the processor to verify the page at the time
+ * it is loaded back into the Enclave Page Cache (EPC).
+ *
+ * The backing storage to which enclave pages are reclaimed is laid out as
+ * follows:
+ * Encrypted enclave pages:SECS page:PCMD pages
+ *
+ * Each PCMD page contains the PCMD metadata of
+ * PAGE_SIZE/sizeof(struct sgx_pcmd) enclave pages.
+ *
+ * A PCMD page can only be truncated if it is (a) empty, and (b) not in the
+ * process of getting data (and thus soon being non-empty). (b) is tested with
+ * a check if an enclave page sharing the PCMD page is in the process of being
+ * reclaimed.
+ *
+ * The reclaimer sets the SGX_ENCL_PAGE_BEING_RECLAIMED flag when it
+ * intends to reclaim that enclave page - it means that the PCMD page
+ * associated with that enclave page is about to get some data and thus
+ * even if the PCMD page is empty, it should not be truncated.
+ *
+ * Context: Enclave mutex (&sgx_encl->lock) must be held.
+ * Return: 1 if the reclaimer is about to write to the PCMD page
+ * 0 if the reclaimer has no intention to write to the PCMD page
+ */
+static int reclaimer_writing_to_pcmd(struct sgx_encl *encl,
+ unsigned long start_addr)
+{
+ int reclaimed = 0;
+ int i;
+
+ /*
+ * PCMD_FIRST_MASK is based on number of PCMD entries within
+ * PCMD page being 32.
+ */
+ BUILD_BUG_ON(PCMDS_PER_PAGE != 32);
+
+ for (i = 0; i < PCMDS_PER_PAGE; i++) {
+ struct sgx_encl_page *entry;
+ unsigned long addr;
+
+ addr = start_addr + i * PAGE_SIZE;
+
+ /*
+ * Stop when reaching the SECS page - it does not
+ * have a page_array entry and its reclaim is
+ * started and completed with enclave mutex held so
+ * it does not use the SGX_ENCL_PAGE_BEING_RECLAIMED
+ * flag.
+ */
+ if (addr == encl->base + encl->size)
+ break;
+
+ entry = xa_load(&encl->page_array, PFN_DOWN(addr));
+ if (!entry)
+ continue;
+
+ /*
+ * VA page slot ID uses same bit as the flag so it is important
+ * to ensure that the page is not already in backing store.
+ */
+ if (entry->epc_page &&
+ (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED)) {
+ reclaimed = 1;
+ break;
+ }
+ }
+
+ return reclaimed;
+}
+
+/*
+ * Calculate byte offset of a PCMD struct associated with an enclave page. PCMD's
+ * follow right after the EPC data in the backing storage. In addition to the
+ * visible enclave pages, there's one extra page slot for SECS, before PCMD
+ * structs.
+ */
+static inline pgoff_t sgx_encl_get_backing_page_pcmd_offset(struct sgx_encl *encl,
+ unsigned long page_index)
+{
+ pgoff_t epc_end_off = encl->size + sizeof(struct sgx_secs);
+
+ return epc_end_off + page_index * sizeof(struct sgx_pcmd);
+}
+
+/*
+ * Free a page from the backing storage in the given page index.
+ */
+static inline void sgx_encl_truncate_backing_page(struct sgx_encl *encl, unsigned long page_index)
+{
+ struct inode *inode = file_inode(encl->backing);
+
+ shmem_truncate_range(inode, PFN_PHYS(page_index), PFN_PHYS(page_index) + PAGE_SIZE - 1);
+}
+
+/*
+ * ELDU: Load an EPC page as unblocked. For more info, see "OS Management of EPC
+ * Pages" in the SDM.
+ */
+static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
+ struct sgx_epc_page *epc_page,
+ struct sgx_epc_page *secs_page)
+{
+ unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK;
+ struct sgx_encl *encl = encl_page->encl;
+ pgoff_t page_index, page_pcmd_off;
+ unsigned long pcmd_first_page;
+ struct sgx_pageinfo pginfo;
+ struct sgx_backing b;
+ bool pcmd_page_empty;
+ u8 *pcmd_page;
+ int ret;
+
+ if (secs_page)
+ page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base);
+ else
+ page_index = PFN_DOWN(encl->size);
+
+ /*
+ * Address of enclave page using the first entry within the PCMD page.
+ */
+ pcmd_first_page = PFN_PHYS(page_index & ~PCMD_FIRST_MASK) + encl->base;
+
+ page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);
+
+ ret = sgx_encl_lookup_backing(encl, page_index, &b);
+ if (ret)
+ return ret;
+
+ pginfo.addr = encl_page->desc & PAGE_MASK;
+ pginfo.contents = (unsigned long)kmap_local_page(b.contents);
+ pcmd_page = kmap_local_page(b.pcmd);
+ pginfo.metadata = (unsigned long)pcmd_page + b.pcmd_offset;
+
+ if (secs_page)
+ pginfo.secs = (u64)sgx_get_epc_virt_addr(secs_page);
+ else
+ pginfo.secs = 0;
+
+ ret = __eldu(&pginfo, sgx_get_epc_virt_addr(epc_page),
+ sgx_get_epc_virt_addr(encl_page->va_page->epc_page) + va_offset);
+ if (ret) {
+ if (encls_failed(ret))
+ ENCLS_WARN(ret, "ELDU");
+
+ ret = -EFAULT;
+ }
+
+ memset(pcmd_page + b.pcmd_offset, 0, sizeof(struct sgx_pcmd));
+ set_page_dirty(b.pcmd);
+
+ /*
+ * The area for the PCMD in the page was zeroed above. Check if the
+ * whole page is now empty meaning that all PCMD's have been zeroed:
+ */
+ pcmd_page_empty = !memchr_inv(pcmd_page, 0, PAGE_SIZE);
+
+ kunmap_local(pcmd_page);
+ kunmap_local((void *)(unsigned long)pginfo.contents);
+
+ get_page(b.pcmd);
+ sgx_encl_put_backing(&b);
+
+ sgx_encl_truncate_backing_page(encl, page_index);
+
+ if (pcmd_page_empty && !reclaimer_writing_to_pcmd(encl, pcmd_first_page)) {
+ sgx_encl_truncate_backing_page(encl, PFN_DOWN(page_pcmd_off));
+ pcmd_page = kmap_local_page(b.pcmd);
+ if (memchr_inv(pcmd_page, 0, PAGE_SIZE))
+ pr_warn("PCMD page not empty after truncate.\n");
+ kunmap_local(pcmd_page);
+ }
+
+ put_page(b.pcmd);
+
+ return ret;
+}
+
+static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page,
+ struct sgx_epc_page *secs_page)
+{
+
+ unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK;
+ struct sgx_encl *encl = encl_page->encl;
+ struct sgx_epc_page *epc_page;
+ int ret;
+
+ epc_page = sgx_alloc_epc_page(encl_page, false);
+ if (IS_ERR(epc_page))
+ return epc_page;
+
+ ret = __sgx_encl_eldu(encl_page, epc_page, secs_page);
+ if (ret) {
+ sgx_encl_free_epc_page(epc_page);
+ return ERR_PTR(ret);
+ }
+
+ sgx_free_va_slot(encl_page->va_page, va_offset);
+ list_move(&encl_page->va_page->list, &encl->va_pages);
+ encl_page->desc &= ~SGX_ENCL_PAGE_VA_OFFSET_MASK;
+ encl_page->epc_page = epc_page;
+
+ return epc_page;
+}
+
+/*
+ * Ensure the SECS page is not swapped out. Must be called with encl->lock
+ * to protect the enclave states including SECS and ensure the SECS page is
+ * not swapped out again while being used.
+ */
+static struct sgx_epc_page *sgx_encl_load_secs(struct sgx_encl *encl)
+{
+ struct sgx_epc_page *epc_page = encl->secs.epc_page;
+
+ if (!epc_page)
+ epc_page = sgx_encl_eldu(&encl->secs, NULL);
+
+ return epc_page;
+}
+
+static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl,
+ struct sgx_encl_page *entry)
+{
+ struct sgx_epc_page *epc_page;
+
+ /* Entry successfully located. */
+ if (entry->epc_page) {
+ if (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED)
+ return ERR_PTR(-EBUSY);
+
+ return entry;
+ }
+
+ epc_page = sgx_encl_load_secs(encl);
+ if (IS_ERR(epc_page))
+ return ERR_CAST(epc_page);
+
+ epc_page = sgx_encl_eldu(entry, encl->secs.epc_page);
+ if (IS_ERR(epc_page))
+ return ERR_CAST(epc_page);
+
+ encl->secs_child_cnt++;
+ sgx_mark_page_reclaimable(entry->epc_page);
+
+ return entry;
+}
+
+static struct sgx_encl_page *sgx_encl_load_page_in_vma(struct sgx_encl *encl,
+ unsigned long addr,
+ vm_flags_t vm_flags)
+{
+ unsigned long vm_prot_bits = vm_flags & VM_ACCESS_FLAGS;
+ struct sgx_encl_page *entry;
+
+ entry = xa_load(&encl->page_array, PFN_DOWN(addr));
+ if (!entry)
+ return ERR_PTR(-EFAULT);
+
+ /*
+ * Verify that the page has equal or higher build time
+ * permissions than the VMA permissions (i.e. the subset of {VM_READ,
+ * VM_WRITE, VM_EXECUTE} in vma->vm_flags).
+ */
+ if ((entry->vm_max_prot_bits & vm_prot_bits) != vm_prot_bits)
+ return ERR_PTR(-EFAULT);
+
+ return __sgx_encl_load_page(encl, entry);
+}
+
+struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl,
+ unsigned long addr)
+{
+ struct sgx_encl_page *entry;
+
+ entry = xa_load(&encl->page_array, PFN_DOWN(addr));
+ if (!entry)
+ return ERR_PTR(-EFAULT);
+
+ return __sgx_encl_load_page(encl, entry);
+}
+
+/**
+ * sgx_encl_eaug_page() - Dynamically add page to initialized enclave
+ * @vma: VMA obtained from fault info from where page is accessed
+ * @encl: enclave accessing the page
+ * @addr: address that triggered the page fault
+ *
+ * When an initialized enclave accesses a page with no backing EPC page
+ * on a SGX2 system then the EPC can be added dynamically via the SGX2
+ * ENCLS[EAUG] instruction.
+ *
+ * Returns: Appropriate vm_fault_t: VM_FAULT_NOPAGE when PTE was installed
+ * successfully, VM_FAULT_SIGBUS or VM_FAULT_OOM as error otherwise.
+ */
+static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma,
+ struct sgx_encl *encl, unsigned long addr)
+{
+ vm_fault_t vmret = VM_FAULT_SIGBUS;
+ struct sgx_pageinfo pginfo = {0};
+ struct sgx_encl_page *encl_page;
+ struct sgx_epc_page *epc_page;
+ struct sgx_va_page *va_page;
+ unsigned long phys_addr;
+ u64 secinfo_flags;
+ int ret;
+
+ if (!test_bit(SGX_ENCL_INITIALIZED, &encl->flags))
+ return VM_FAULT_SIGBUS;
+
+ /*
+ * Ignore internal permission checking for dynamically added pages.
+ * They matter only for data added during the pre-initialization
+ * phase. The enclave decides the permissions by the means of
+ * EACCEPT, EACCEPTCOPY and EMODPE.
+ */
+ secinfo_flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_X;
+ encl_page = sgx_encl_page_alloc(encl, addr - encl->base, secinfo_flags);
+ if (IS_ERR(encl_page))
+ return VM_FAULT_OOM;
+
+ mutex_lock(&encl->lock);
+
+ epc_page = sgx_encl_load_secs(encl);
+ if (IS_ERR(epc_page)) {
+ if (PTR_ERR(epc_page) == -EBUSY)
+ vmret = VM_FAULT_NOPAGE;
+ goto err_out_unlock;
+ }
+
+ epc_page = sgx_alloc_epc_page(encl_page, false);
+ if (IS_ERR(epc_page)) {
+ if (PTR_ERR(epc_page) == -EBUSY)
+ vmret = VM_FAULT_NOPAGE;
+ goto err_out_unlock;
+ }
+
+ va_page = sgx_encl_grow(encl, false);
+ if (IS_ERR(va_page)) {
+ if (PTR_ERR(va_page) == -EBUSY)
+ vmret = VM_FAULT_NOPAGE;
+ goto err_out_epc;
+ }
+
+ if (va_page)
+ list_add(&va_page->list, &encl->va_pages);
+
+ ret = xa_insert(&encl->page_array, PFN_DOWN(encl_page->desc),
+ encl_page, GFP_KERNEL);
+ /*
+ * If ret == -EBUSY then page was created in another flow while
+ * running without encl->lock
+ */
+ if (ret)
+ goto err_out_shrink;
+
+ pginfo.secs = (unsigned long)sgx_get_epc_virt_addr(encl->secs.epc_page);
+ pginfo.addr = encl_page->desc & PAGE_MASK;
+ pginfo.metadata = 0;
+
+ ret = __eaug(&pginfo, sgx_get_epc_virt_addr(epc_page));
+ if (ret)
+ goto err_out;
+
+ encl_page->encl = encl;
+ encl_page->epc_page = epc_page;
+ encl_page->type = SGX_PAGE_TYPE_REG;
+ encl->secs_child_cnt++;
+
+ sgx_mark_page_reclaimable(encl_page->epc_page);
+
+ phys_addr = sgx_get_epc_phys_addr(epc_page);
+ /*
+ * Do not undo everything when creating PTE entry fails - next #PF
+ * would find page ready for a PTE.
+ */
+ vmret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr));
+ if (vmret != VM_FAULT_NOPAGE) {
+ mutex_unlock(&encl->lock);
+ return VM_FAULT_SIGBUS;
+ }
+ mutex_unlock(&encl->lock);
+ return VM_FAULT_NOPAGE;
+
+err_out:
+ xa_erase(&encl->page_array, PFN_DOWN(encl_page->desc));
+
+err_out_shrink:
+ sgx_encl_shrink(encl, va_page);
+err_out_epc:
+ sgx_encl_free_epc_page(epc_page);
+err_out_unlock:
+ mutex_unlock(&encl->lock);
+ kfree(encl_page);
+
+ return vmret;
+}
+
+static vm_fault_t sgx_vma_fault(struct vm_fault *vmf)
+{
+ unsigned long addr = (unsigned long)vmf->address;
+ struct vm_area_struct *vma = vmf->vma;
+ struct sgx_encl_page *entry;
+ unsigned long phys_addr;
+ struct sgx_encl *encl;
+ vm_fault_t ret;
+
+ encl = vma->vm_private_data;
+
+ /*
+ * It's very unlikely but possible that allocating memory for the
+ * mm_list entry of a forked process failed in sgx_vma_open(). When
+ * this happens, vm_private_data is set to NULL.
+ */
+ if (unlikely(!encl))
+ return VM_FAULT_SIGBUS;
+
+ /*
+ * The page_array keeps track of all enclave pages, whether they
+ * are swapped out or not. If there is no entry for this page and
+ * the system supports SGX2 then it is possible to dynamically add
+ * a new enclave page. This is only possible for an initialized
+ * enclave that will be checked for right away.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_SGX2) &&
+ (!xa_load(&encl->page_array, PFN_DOWN(addr))))
+ return sgx_encl_eaug_page(vma, encl, addr);
+
+ mutex_lock(&encl->lock);
+
+ entry = sgx_encl_load_page_in_vma(encl, addr, vma->vm_flags);
+ if (IS_ERR(entry)) {
+ mutex_unlock(&encl->lock);
+
+ if (PTR_ERR(entry) == -EBUSY)
+ return VM_FAULT_NOPAGE;
+
+ return VM_FAULT_SIGBUS;
+ }
+
+ phys_addr = sgx_get_epc_phys_addr(entry->epc_page);
+
+ ret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr));
+ if (ret != VM_FAULT_NOPAGE) {
+ mutex_unlock(&encl->lock);
+
+ return VM_FAULT_SIGBUS;
+ }
+
+ sgx_encl_test_and_clear_young(vma->vm_mm, entry);
+ mutex_unlock(&encl->lock);
+
+ return VM_FAULT_NOPAGE;
+}
+
+static void sgx_vma_open(struct vm_area_struct *vma)
+{
+ struct sgx_encl *encl = vma->vm_private_data;
+
+ /*
+ * It's possible but unlikely that vm_private_data is NULL. This can
+ * happen in a grandchild of a process, when sgx_encl_mm_add() had
+ * failed to allocate memory in this callback.
+ */
+ if (unlikely(!encl))
+ return;
+
+ if (sgx_encl_mm_add(encl, vma->vm_mm))
+ vma->vm_private_data = NULL;
+}
+
+
+/**
+ * sgx_encl_may_map() - Check if a requested VMA mapping is allowed
+ * @encl: an enclave pointer
+ * @start: lower bound of the address range, inclusive
+ * @end: upper bound of the address range, exclusive
+ * @vm_flags: VMA flags
+ *
+ * Iterate through the enclave pages contained within [@start, @end) to verify
+ * that the permissions requested by a subset of {VM_READ, VM_WRITE, VM_EXEC}
+ * do not contain any permissions that are not contained in the build time
+ * permissions of any of the enclave pages within the given address range.
+ *
+ * An enclave creator must declare the strongest permissions that will be
+ * needed for each enclave page. This ensures that mappings have the identical
+ * or weaker permissions than the earlier declared permissions.
+ *
+ * Return: 0 on success, -EACCES otherwise
+ */
+int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
+ unsigned long end, vm_flags_t vm_flags)
+{
+ vm_flags_t vm_prot_bits = vm_flags & VM_ACCESS_FLAGS;
+ struct sgx_encl_page *page;
+ unsigned long count = 0;
+ int ret = 0;
+
+ XA_STATE(xas, &encl->page_array, PFN_DOWN(start));
+
+ /* Disallow mapping outside enclave's address range. */
+ if (test_bit(SGX_ENCL_INITIALIZED, &encl->flags) &&
+ (start < encl->base || end > encl->base + encl->size))
+ return -EACCES;
+
+ /*
+ * Disallow READ_IMPLIES_EXEC tasks as their VMA permissions might
+ * conflict with the enclave page permissions.
+ */
+ if (current->personality & READ_IMPLIES_EXEC)
+ return -EACCES;
+
+ mutex_lock(&encl->lock);
+ xas_lock(&xas);
+ xas_for_each(&xas, page, PFN_DOWN(end - 1)) {
+ if (~page->vm_max_prot_bits & vm_prot_bits) {
+ ret = -EACCES;
+ break;
+ }
+
+ /* Reschedule on every XA_CHECK_SCHED iteration. */
+ if (!(++count % XA_CHECK_SCHED)) {
+ xas_pause(&xas);
+ xas_unlock(&xas);
+ mutex_unlock(&encl->lock);
+
+ cond_resched();
+
+ mutex_lock(&encl->lock);
+ xas_lock(&xas);
+ }
+ }
+ xas_unlock(&xas);
+ mutex_unlock(&encl->lock);
+
+ return ret;
+}
+
+static int sgx_vma_mprotect(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, unsigned long newflags)
+{
+ return sgx_encl_may_map(vma->vm_private_data, start, end, newflags);
+}
+
+static int sgx_encl_debug_read(struct sgx_encl *encl, struct sgx_encl_page *page,
+ unsigned long addr, void *data)
+{
+ unsigned long offset = addr & ~PAGE_MASK;
+ int ret;
+
+
+ ret = __edbgrd(sgx_get_epc_virt_addr(page->epc_page) + offset, data);
+ if (ret)
+ return -EIO;
+
+ return 0;
+}
+
+static int sgx_encl_debug_write(struct sgx_encl *encl, struct sgx_encl_page *page,
+ unsigned long addr, void *data)
+{
+ unsigned long offset = addr & ~PAGE_MASK;
+ int ret;
+
+ ret = __edbgwr(sgx_get_epc_virt_addr(page->epc_page) + offset, data);
+ if (ret)
+ return -EIO;
+
+ return 0;
+}
+
+/*
+ * Load an enclave page to EPC if required, and take encl->lock.
+ */
+static struct sgx_encl_page *sgx_encl_reserve_page(struct sgx_encl *encl,
+ unsigned long addr,
+ vm_flags_t vm_flags)
+{
+ struct sgx_encl_page *entry;
+
+ for ( ; ; ) {
+ mutex_lock(&encl->lock);
+
+ entry = sgx_encl_load_page_in_vma(encl, addr, vm_flags);
+ if (PTR_ERR(entry) != -EBUSY)
+ break;
+
+ mutex_unlock(&encl->lock);
+ }
+
+ if (IS_ERR(entry))
+ mutex_unlock(&encl->lock);
+
+ return entry;
+}
+
+static int sgx_vma_access(struct vm_area_struct *vma, unsigned long addr,
+ void *buf, int len, int write)
+{
+ struct sgx_encl *encl = vma->vm_private_data;
+ struct sgx_encl_page *entry = NULL;
+ char data[sizeof(unsigned long)];
+ unsigned long align;
+ int offset;
+ int cnt;
+ int ret = 0;
+ int i;
+
+ /*
+ * If process was forked, VMA is still there but vm_private_data is set
+ * to NULL.
+ */
+ if (!encl)
+ return -EFAULT;
+
+ if (!test_bit(SGX_ENCL_DEBUG, &encl->flags))
+ return -EFAULT;
+
+ for (i = 0; i < len; i += cnt) {
+ entry = sgx_encl_reserve_page(encl, (addr + i) & PAGE_MASK,
+ vma->vm_flags);
+ if (IS_ERR(entry)) {
+ ret = PTR_ERR(entry);
+ break;
+ }
+
+ align = ALIGN_DOWN(addr + i, sizeof(unsigned long));
+ offset = (addr + i) & (sizeof(unsigned long) - 1);
+ cnt = sizeof(unsigned long) - offset;
+ cnt = min(cnt, len - i);
+
+ ret = sgx_encl_debug_read(encl, entry, align, data);
+ if (ret)
+ goto out;
+
+ if (write) {
+ memcpy(data + offset, buf + i, cnt);
+ ret = sgx_encl_debug_write(encl, entry, align, data);
+ if (ret)
+ goto out;
+ } else {
+ memcpy(buf + i, data + offset, cnt);
+ }
+
+out:
+ mutex_unlock(&encl->lock);
+
+ if (ret)
+ break;
+ }
+
+ return ret < 0 ? ret : i;
+}
+
+const struct vm_operations_struct sgx_vm_ops = {
+ .fault = sgx_vma_fault,
+ .mprotect = sgx_vma_mprotect,
+ .open = sgx_vma_open,
+ .access = sgx_vma_access,
+};
+
+/**
+ * sgx_encl_release - Destroy an enclave instance
+ * @ref: address of a kref inside &sgx_encl
+ *
+ * Used together with kref_put(). Frees all the resources associated with the
+ * enclave and the instance itself.
+ */
+void sgx_encl_release(struct kref *ref)
+{
+ struct sgx_encl *encl = container_of(ref, struct sgx_encl, refcount);
+ unsigned long max_page_index = PFN_DOWN(encl->base + encl->size - 1);
+ struct sgx_va_page *va_page;
+ struct sgx_encl_page *entry;
+ unsigned long count = 0;
+
+ XA_STATE(xas, &encl->page_array, PFN_DOWN(encl->base));
+
+ xas_lock(&xas);
+ xas_for_each(&xas, entry, max_page_index) {
+ if (entry->epc_page) {
+ /*
+ * The page and its radix tree entry cannot be freed
+ * if the page is being held by the reclaimer.
+ */
+ if (sgx_unmark_page_reclaimable(entry->epc_page))
+ continue;
+
+ sgx_encl_free_epc_page(entry->epc_page);
+ encl->secs_child_cnt--;
+ entry->epc_page = NULL;
+ }
+
+ kfree(entry);
+ /*
+ * Invoke scheduler on every XA_CHECK_SCHED iteration
+ * to prevent soft lockups.
+ */
+ if (!(++count % XA_CHECK_SCHED)) {
+ xas_pause(&xas);
+ xas_unlock(&xas);
+
+ cond_resched();
+
+ xas_lock(&xas);
+ }
+ }
+ xas_unlock(&xas);
+
+ xa_destroy(&encl->page_array);
+
+ if (!encl->secs_child_cnt && encl->secs.epc_page) {
+ sgx_encl_free_epc_page(encl->secs.epc_page);
+ encl->secs.epc_page = NULL;
+ }
+
+ while (!list_empty(&encl->va_pages)) {
+ va_page = list_first_entry(&encl->va_pages, struct sgx_va_page,
+ list);
+ list_del(&va_page->list);
+ sgx_encl_free_epc_page(va_page->epc_page);
+ kfree(va_page);
+ }
+
+ if (encl->backing)
+ fput(encl->backing);
+
+ cleanup_srcu_struct(&encl->srcu);
+
+ WARN_ON_ONCE(!list_empty(&encl->mm_list));
+
+ /* Detect EPC page leak's. */
+ WARN_ON_ONCE(encl->secs_child_cnt);
+ WARN_ON_ONCE(encl->secs.epc_page);
+
+ kfree(encl);
+ sgx_dec_usage_count();
+}
+
+/*
+ * 'mm' is exiting and no longer needs mmu notifications.
+ */
+static void sgx_mmu_notifier_release(struct mmu_notifier *mn,
+ struct mm_struct *mm)
+{
+ struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier);
+ struct sgx_encl_mm *tmp = NULL;
+ bool found = false;
+
+ /*
+ * The enclave itself can remove encl_mm. Note, objects can't be moved
+ * off an RCU protected list, but deletion is ok.
+ */
+ spin_lock(&encl_mm->encl->mm_lock);
+ list_for_each_entry(tmp, &encl_mm->encl->mm_list, list) {
+ if (tmp == encl_mm) {
+ list_del_rcu(&encl_mm->list);
+ found = true;
+ break;
+ }
+ }
+ spin_unlock(&encl_mm->encl->mm_lock);
+
+ if (found) {
+ synchronize_srcu(&encl_mm->encl->srcu);
+ mmu_notifier_put(mn);
+ }
+}
+
+static void sgx_mmu_notifier_free(struct mmu_notifier *mn)
+{
+ struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier);
+
+ /* 'encl_mm' is going away, put encl_mm->encl reference: */
+ kref_put(&encl_mm->encl->refcount, sgx_encl_release);
+
+ kfree(encl_mm);
+}
+
+static const struct mmu_notifier_ops sgx_mmu_notifier_ops = {
+ .release = sgx_mmu_notifier_release,
+ .free_notifier = sgx_mmu_notifier_free,
+};
+
+static struct sgx_encl_mm *sgx_encl_find_mm(struct sgx_encl *encl,
+ struct mm_struct *mm)
+{
+ struct sgx_encl_mm *encl_mm = NULL;
+ struct sgx_encl_mm *tmp;
+ int idx;
+
+ idx = srcu_read_lock(&encl->srcu);
+
+ list_for_each_entry_rcu(tmp, &encl->mm_list, list) {
+ if (tmp->mm == mm) {
+ encl_mm = tmp;
+ break;
+ }
+ }
+
+ srcu_read_unlock(&encl->srcu, idx);
+
+ return encl_mm;
+}
+
+int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm)
+{
+ struct sgx_encl_mm *encl_mm;
+ int ret;
+
+ /*
+ * Even though a single enclave may be mapped into an mm more than once,
+ * each 'mm' only appears once on encl->mm_list. This is guaranteed by
+ * holding the mm's mmap lock for write before an mm can be added or
+ * remove to an encl->mm_list.
+ */
+ mmap_assert_write_locked(mm);
+
+ /*
+ * It's possible that an entry already exists in the mm_list, because it
+ * is removed only on VFS release or process exit.
+ */
+ if (sgx_encl_find_mm(encl, mm))
+ return 0;
+
+ encl_mm = kzalloc(sizeof(*encl_mm), GFP_KERNEL);
+ if (!encl_mm)
+ return -ENOMEM;
+
+ /* Grab a refcount for the encl_mm->encl reference: */
+ kref_get(&encl->refcount);
+ encl_mm->encl = encl;
+ encl_mm->mm = mm;
+ encl_mm->mmu_notifier.ops = &sgx_mmu_notifier_ops;
+
+ ret = __mmu_notifier_register(&encl_mm->mmu_notifier, mm);
+ if (ret) {
+ kfree(encl_mm);
+ return ret;
+ }
+
+ spin_lock(&encl->mm_lock);
+ list_add_rcu(&encl_mm->list, &encl->mm_list);
+ /* Pairs with smp_rmb() in sgx_zap_enclave_ptes(). */
+ smp_wmb();
+ encl->mm_list_version++;
+ spin_unlock(&encl->mm_lock);
+
+ return 0;
+}
+
+/**
+ * sgx_encl_cpumask() - Query which CPUs might be accessing the enclave
+ * @encl: the enclave
+ *
+ * Some SGX functions require that no cached linear-to-physical address
+ * mappings are present before they can succeed. For example, ENCLS[EWB]
+ * copies a page from the enclave page cache to regular main memory but
+ * it fails if it cannot ensure that there are no cached
+ * linear-to-physical address mappings referring to the page.
+ *
+ * SGX hardware flushes all cached linear-to-physical mappings on a CPU
+ * when an enclave is exited via ENCLU[EEXIT] or an Asynchronous Enclave
+ * Exit (AEX). Exiting an enclave will thus ensure cached linear-to-physical
+ * address mappings are cleared but coordination with the tracking done within
+ * the SGX hardware is needed to support the SGX functions that depend on this
+ * cache clearing.
+ *
+ * When the ENCLS[ETRACK] function is issued on an enclave the hardware
+ * tracks threads operating inside the enclave at that time. The SGX
+ * hardware tracking require that all the identified threads must have
+ * exited the enclave in order to flush the mappings before a function such
+ * as ENCLS[EWB] will be permitted
+ *
+ * The following flow is used to support SGX functions that require that
+ * no cached linear-to-physical address mappings are present:
+ * 1) Execute ENCLS[ETRACK] to initiate hardware tracking.
+ * 2) Use this function (sgx_encl_cpumask()) to query which CPUs might be
+ * accessing the enclave.
+ * 3) Send IPI to identified CPUs, kicking them out of the enclave and
+ * thus flushing all locally cached linear-to-physical address mappings.
+ * 4) Execute SGX function.
+ *
+ * Context: It is required to call this function after ENCLS[ETRACK].
+ * This will ensure that if any new mm appears (racing with
+ * sgx_encl_mm_add()) then the new mm will enter into the
+ * enclave with fresh linear-to-physical address mappings.
+ *
+ * It is required that all IPIs are completed before a new
+ * ENCLS[ETRACK] is issued so be sure to protect steps 1 to 3
+ * of the above flow with the enclave's mutex.
+ *
+ * Return: cpumask of CPUs that might be accessing @encl
+ */
+const cpumask_t *sgx_encl_cpumask(struct sgx_encl *encl)
+{
+ cpumask_t *cpumask = &encl->cpumask;
+ struct sgx_encl_mm *encl_mm;
+ int idx;
+
+ cpumask_clear(cpumask);
+
+ idx = srcu_read_lock(&encl->srcu);
+
+ list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
+ if (!mmget_not_zero(encl_mm->mm))
+ continue;
+
+ cpumask_or(cpumask, cpumask, mm_cpumask(encl_mm->mm));
+
+ mmput_async(encl_mm->mm);
+ }
+
+ srcu_read_unlock(&encl->srcu, idx);
+
+ return cpumask;
+}
+
+static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl,
+ pgoff_t index)
+{
+ struct address_space *mapping = encl->backing->f_mapping;
+ gfp_t gfpmask = mapping_gfp_mask(mapping);
+
+ return shmem_read_mapping_page_gfp(mapping, index, gfpmask);
+}
+
+/**
+ * __sgx_encl_get_backing() - Pin the backing storage
+ * @encl: an enclave pointer
+ * @page_index: enclave page index
+ * @backing: data for accessing backing storage for the page
+ *
+ * Pin the backing storage pages for storing the encrypted contents and Paging
+ * Crypto MetaData (PCMD) of an enclave page.
+ *
+ * Return:
+ * 0 on success,
+ * -errno otherwise.
+ */
+static int __sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
+ struct sgx_backing *backing)
+{
+ pgoff_t page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);
+ struct page *contents;
+ struct page *pcmd;
+
+ contents = sgx_encl_get_backing_page(encl, page_index);
+ if (IS_ERR(contents))
+ return PTR_ERR(contents);
+
+ pcmd = sgx_encl_get_backing_page(encl, PFN_DOWN(page_pcmd_off));
+ if (IS_ERR(pcmd)) {
+ put_page(contents);
+ return PTR_ERR(pcmd);
+ }
+
+ backing->contents = contents;
+ backing->pcmd = pcmd;
+ backing->pcmd_offset = page_pcmd_off & (PAGE_SIZE - 1);
+
+ return 0;
+}
+
+/*
+ * When called from ksgxd, returns the mem_cgroup of a struct mm stored
+ * in the enclave's mm_list. When not called from ksgxd, just returns
+ * the mem_cgroup of the current task.
+ */
+static struct mem_cgroup *sgx_encl_get_mem_cgroup(struct sgx_encl *encl)
+{
+ struct mem_cgroup *memcg = NULL;
+ struct sgx_encl_mm *encl_mm;
+ int idx;
+
+ /*
+ * If called from normal task context, return the mem_cgroup
+ * of the current task's mm. The remainder of the handling is for
+ * ksgxd.
+ */
+ if (!current_is_ksgxd())
+ return get_mem_cgroup_from_mm(current->mm);
+
+ /*
+ * Search the enclave's mm_list to find an mm associated with
+ * this enclave to charge the allocation to.
+ */
+ idx = srcu_read_lock(&encl->srcu);
+
+ list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
+ if (!mmget_not_zero(encl_mm->mm))
+ continue;
+
+ memcg = get_mem_cgroup_from_mm(encl_mm->mm);
+
+ mmput_async(encl_mm->mm);
+
+ break;
+ }
+
+ srcu_read_unlock(&encl->srcu, idx);
+
+ /*
+ * In the rare case that there isn't an mm associated with
+ * the enclave, set memcg to the current active mem_cgroup.
+ * This will be the root mem_cgroup if there is no active
+ * mem_cgroup.
+ */
+ if (!memcg)
+ return get_mem_cgroup_from_mm(NULL);
+
+ return memcg;
+}
+
+/**
+ * sgx_encl_alloc_backing() - create a new backing storage page
+ * @encl: an enclave pointer
+ * @page_index: enclave page index
+ * @backing: data for accessing backing storage for the page
+ *
+ * When called from ksgxd, sets the active memcg from one of the
+ * mms in the enclave's mm_list prior to any backing page allocation,
+ * in order to ensure that shmem page allocations are charged to the
+ * enclave. Create a backing page for loading data back into an EPC page with
+ * ELDU. This function takes a reference on a new backing page which
+ * must be dropped with a corresponding call to sgx_encl_put_backing().
+ *
+ * Return:
+ * 0 on success,
+ * -errno otherwise.
+ */
+int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index,
+ struct sgx_backing *backing)
+{
+ struct mem_cgroup *encl_memcg = sgx_encl_get_mem_cgroup(encl);
+ struct mem_cgroup *memcg = set_active_memcg(encl_memcg);
+ int ret;
+
+ ret = __sgx_encl_get_backing(encl, page_index, backing);
+
+ set_active_memcg(memcg);
+ mem_cgroup_put(encl_memcg);
+
+ return ret;
+}
+
+/**
+ * sgx_encl_lookup_backing() - retrieve an existing backing storage page
+ * @encl: an enclave pointer
+ * @page_index: enclave page index
+ * @backing: data for accessing backing storage for the page
+ *
+ * Retrieve a backing page for loading data back into an EPC page with ELDU.
+ * It is the caller's responsibility to ensure that it is appropriate to use
+ * sgx_encl_lookup_backing() rather than sgx_encl_alloc_backing(). If lookup is
+ * not used correctly, this will cause an allocation which is not accounted for.
+ * This function takes a reference on an existing backing page which must be
+ * dropped with a corresponding call to sgx_encl_put_backing().
+ *
+ * Return:
+ * 0 on success,
+ * -errno otherwise.
+ */
+static int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index,
+ struct sgx_backing *backing)
+{
+ return __sgx_encl_get_backing(encl, page_index, backing);
+}
+
+/**
+ * sgx_encl_put_backing() - Unpin the backing storage
+ * @backing: data for accessing backing storage for the page
+ */
+void sgx_encl_put_backing(struct sgx_backing *backing)
+{
+ put_page(backing->pcmd);
+ put_page(backing->contents);
+}
+
+static int sgx_encl_test_and_clear_young_cb(pte_t *ptep, unsigned long addr,
+ void *data)
+{
+ pte_t pte;
+ int ret;
+
+ ret = pte_young(*ptep);
+ if (ret) {
+ pte = pte_mkold(*ptep);
+ set_pte_at((struct mm_struct *)data, addr, ptep, pte);
+ }
+
+ return ret;
+}
+
+/**
+ * sgx_encl_test_and_clear_young() - Test and reset the accessed bit
+ * @mm: mm_struct that is checked
+ * @page: enclave page to be tested for recent access
+ *
+ * Checks the Access (A) bit from the PTE corresponding to the enclave page and
+ * clears it.
+ *
+ * Return: 1 if the page has been recently accessed and 0 if not.
+ */
+int sgx_encl_test_and_clear_young(struct mm_struct *mm,
+ struct sgx_encl_page *page)
+{
+ unsigned long addr = page->desc & PAGE_MASK;
+ struct sgx_encl *encl = page->encl;
+ struct vm_area_struct *vma;
+ int ret;
+
+ ret = sgx_encl_find(mm, addr, &vma);
+ if (ret)
+ return 0;
+
+ if (encl != vma->vm_private_data)
+ return 0;
+
+ ret = apply_to_page_range(vma->vm_mm, addr, PAGE_SIZE,
+ sgx_encl_test_and_clear_young_cb, vma->vm_mm);
+ if (ret < 0)
+ return 0;
+
+ return ret;
+}
+
+struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl,
+ unsigned long offset,
+ u64 secinfo_flags)
+{
+ struct sgx_encl_page *encl_page;
+ unsigned long prot;
+
+ encl_page = kzalloc(sizeof(*encl_page), GFP_KERNEL);
+ if (!encl_page)
+ return ERR_PTR(-ENOMEM);
+
+ encl_page->desc = encl->base + offset;
+ encl_page->encl = encl;
+
+ prot = _calc_vm_trans(secinfo_flags, SGX_SECINFO_R, PROT_READ) |
+ _calc_vm_trans(secinfo_flags, SGX_SECINFO_W, PROT_WRITE) |
+ _calc_vm_trans(secinfo_flags, SGX_SECINFO_X, PROT_EXEC);
+
+ /*
+ * TCS pages must always RW set for CPU access while the SECINFO
+ * permissions are *always* zero - the CPU ignores the user provided
+ * values and silently overwrites them with zero permissions.
+ */
+ if ((secinfo_flags & SGX_SECINFO_PAGE_TYPE_MASK) == SGX_SECINFO_TCS)
+ prot |= PROT_READ | PROT_WRITE;
+
+ /* Calculate maximum of the VM flags for the page. */
+ encl_page->vm_max_prot_bits = calc_vm_prot_bits(prot, 0);
+
+ return encl_page;
+}
+
+/**
+ * sgx_zap_enclave_ptes() - remove PTEs mapping the address from enclave
+ * @encl: the enclave
+ * @addr: page aligned pointer to single page for which PTEs will be removed
+ *
+ * Multiple VMAs may have an enclave page mapped. Remove the PTE mapping
+ * @addr from each VMA. Ensure that page fault handler is ready to handle
+ * new mappings of @addr before calling this function.
+ */
+void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr)
+{
+ unsigned long mm_list_version;
+ struct sgx_encl_mm *encl_mm;
+ struct vm_area_struct *vma;
+ int idx, ret;
+
+ do {
+ mm_list_version = encl->mm_list_version;
+
+ /* Pairs with smp_wmb() in sgx_encl_mm_add(). */
+ smp_rmb();
+
+ idx = srcu_read_lock(&encl->srcu);
+
+ list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
+ if (!mmget_not_zero(encl_mm->mm))
+ continue;
+
+ mmap_read_lock(encl_mm->mm);
+
+ ret = sgx_encl_find(encl_mm->mm, addr, &vma);
+ if (!ret && encl == vma->vm_private_data)
+ zap_vma_ptes(vma, addr, PAGE_SIZE);
+
+ mmap_read_unlock(encl_mm->mm);
+
+ mmput_async(encl_mm->mm);
+ }
+
+ srcu_read_unlock(&encl->srcu, idx);
+ } while (unlikely(encl->mm_list_version != mm_list_version));
+}
+
+/**
+ * sgx_alloc_va_page() - Allocate a Version Array (VA) page
+ * @reclaim: Reclaim EPC pages directly if none available. Enclave
+ * mutex should not be held if this is set.
+ *
+ * Allocate a free EPC page and convert it to a Version Array (VA) page.
+ *
+ * Return:
+ * a VA page,
+ * -errno otherwise
+ */
+struct sgx_epc_page *sgx_alloc_va_page(bool reclaim)
+{
+ struct sgx_epc_page *epc_page;
+ int ret;
+
+ epc_page = sgx_alloc_epc_page(NULL, reclaim);
+ if (IS_ERR(epc_page))
+ return ERR_CAST(epc_page);
+
+ ret = __epa(sgx_get_epc_virt_addr(epc_page));
+ if (ret) {
+ WARN_ONCE(1, "EPA returned %d (0x%x)", ret, ret);
+ sgx_encl_free_epc_page(epc_page);
+ return ERR_PTR(-EFAULT);
+ }
+
+ return epc_page;
+}
+
+/**
+ * sgx_alloc_va_slot - allocate a VA slot
+ * @va_page: a &struct sgx_va_page instance
+ *
+ * Allocates a slot from a &struct sgx_va_page instance.
+ *
+ * Return: offset of the slot inside the VA page
+ */
+unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page)
+{
+ int slot = find_first_zero_bit(va_page->slots, SGX_VA_SLOT_COUNT);
+
+ if (slot < SGX_VA_SLOT_COUNT)
+ set_bit(slot, va_page->slots);
+
+ return slot << 3;
+}
+
+/**
+ * sgx_free_va_slot - free a VA slot
+ * @va_page: a &struct sgx_va_page instance
+ * @offset: offset of the slot inside the VA page
+ *
+ * Frees a slot from a &struct sgx_va_page instance.
+ */
+void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset)
+{
+ clear_bit(offset >> 3, va_page->slots);
+}
+
+/**
+ * sgx_va_page_full - is the VA page full?
+ * @va_page: a &struct sgx_va_page instance
+ *
+ * Return: true if all slots have been taken
+ */
+bool sgx_va_page_full(struct sgx_va_page *va_page)
+{
+ int slot = find_first_zero_bit(va_page->slots, SGX_VA_SLOT_COUNT);
+
+ return slot == SGX_VA_SLOT_COUNT;
+}
+
+/**
+ * sgx_encl_free_epc_page - free an EPC page assigned to an enclave
+ * @page: EPC page to be freed
+ *
+ * Free an EPC page assigned to an enclave. It does EREMOVE for the page, and
+ * only upon success, it puts the page back to free page list. Otherwise, it
+ * gives a WARNING to indicate page is leaked.
+ */
+void sgx_encl_free_epc_page(struct sgx_epc_page *page)
+{
+ int ret;
+
+ WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED);
+
+ ret = __eremove(sgx_get_epc_virt_addr(page));
+ if (WARN_ONCE(ret, EREMOVE_ERROR_MESSAGE, ret, ret))
+ return;
+
+ sgx_free_epc_page(page);
+}
diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h
new file mode 100644
index 000000000000..8ff47f6652b9
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/encl.h
@@ -0,0 +1,129 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/**
+ * Copyright(c) 2016-20 Intel Corporation.
+ *
+ * Contains the software defined data structures for enclaves.
+ */
+#ifndef _X86_ENCL_H
+#define _X86_ENCL_H
+
+#include <linux/cpumask.h>
+#include <linux/kref.h>
+#include <linux/list.h>
+#include <linux/mm_types.h>
+#include <linux/mmu_notifier.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
+#include <linux/srcu.h>
+#include <linux/workqueue.h>
+#include <linux/xarray.h>
+#include "sgx.h"
+
+/* 'desc' bits holding the offset in the VA (version array) page. */
+#define SGX_ENCL_PAGE_VA_OFFSET_MASK GENMASK_ULL(11, 3)
+
+/* 'desc' bit marking that the page is being reclaimed. */
+#define SGX_ENCL_PAGE_BEING_RECLAIMED BIT(3)
+
+struct sgx_encl_page {
+ unsigned long desc;
+ unsigned long vm_max_prot_bits:8;
+ enum sgx_page_type type:16;
+ struct sgx_epc_page *epc_page;
+ struct sgx_encl *encl;
+ struct sgx_va_page *va_page;
+};
+
+enum sgx_encl_flags {
+ SGX_ENCL_IOCTL = BIT(0),
+ SGX_ENCL_DEBUG = BIT(1),
+ SGX_ENCL_CREATED = BIT(2),
+ SGX_ENCL_INITIALIZED = BIT(3),
+};
+
+struct sgx_encl_mm {
+ struct sgx_encl *encl;
+ struct mm_struct *mm;
+ struct list_head list;
+ struct mmu_notifier mmu_notifier;
+};
+
+struct sgx_encl {
+ unsigned long base;
+ unsigned long size;
+ unsigned long flags;
+ unsigned int page_cnt;
+ unsigned int secs_child_cnt;
+ struct mutex lock;
+ struct xarray page_array;
+ struct sgx_encl_page secs;
+ unsigned long attributes;
+ unsigned long attributes_mask;
+
+ cpumask_t cpumask;
+ struct file *backing;
+ struct kref refcount;
+ struct list_head va_pages;
+ unsigned long mm_list_version;
+ struct list_head mm_list;
+ spinlock_t mm_lock;
+ struct srcu_struct srcu;
+};
+
+#define SGX_VA_SLOT_COUNT 512
+
+struct sgx_va_page {
+ struct sgx_epc_page *epc_page;
+ DECLARE_BITMAP(slots, SGX_VA_SLOT_COUNT);
+ struct list_head list;
+};
+
+struct sgx_backing {
+ struct page *contents;
+ struct page *pcmd;
+ unsigned long pcmd_offset;
+};
+
+extern const struct vm_operations_struct sgx_vm_ops;
+
+static inline int sgx_encl_find(struct mm_struct *mm, unsigned long addr,
+ struct vm_area_struct **vma)
+{
+ struct vm_area_struct *result;
+
+ result = vma_lookup(mm, addr);
+ if (!result || result->vm_ops != &sgx_vm_ops)
+ return -EINVAL;
+
+ *vma = result;
+
+ return 0;
+}
+
+int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
+ unsigned long end, vm_flags_t vm_flags);
+
+bool current_is_ksgxd(void);
+void sgx_encl_release(struct kref *ref);
+int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm);
+const cpumask_t *sgx_encl_cpumask(struct sgx_encl *encl);
+int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index,
+ struct sgx_backing *backing);
+void sgx_encl_put_backing(struct sgx_backing *backing);
+int sgx_encl_test_and_clear_young(struct mm_struct *mm,
+ struct sgx_encl_page *page);
+struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl,
+ unsigned long offset,
+ u64 secinfo_flags);
+void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr);
+struct sgx_epc_page *sgx_alloc_va_page(bool reclaim);
+unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page);
+void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset);
+bool sgx_va_page_full(struct sgx_va_page *va_page);
+void sgx_encl_free_epc_page(struct sgx_epc_page *page);
+struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl,
+ unsigned long addr);
+struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl, bool reclaim);
+void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page);
+
+#endif /* _X86_ENCL_H */
diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h
new file mode 100644
index 000000000000..74be751199a4
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/encls.h
@@ -0,0 +1,241 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _X86_ENCLS_H
+#define _X86_ENCLS_H
+
+#include <linux/bitops.h>
+#include <linux/err.h>
+#include <linux/io.h>
+#include <linux/rwsem.h>
+#include <linux/types.h>
+#include <asm/asm.h>
+#include <asm/traps.h>
+#include "sgx.h"
+
+/* Retrieve the encoded trapnr from the specified return code. */
+#define ENCLS_TRAPNR(r) ((r) & ~SGX_ENCLS_FAULT_FLAG)
+
+/* Issue a WARN() about an ENCLS function. */
+#define ENCLS_WARN(r, name) { \
+ do { \
+ int _r = (r); \
+ WARN_ONCE(_r, "%s returned %d (0x%x)\n", (name), _r, _r); \
+ } while (0); \
+}
+
+/*
+ * encls_faulted() - Check if an ENCLS leaf faulted given an error code
+ * @ret: the return value of an ENCLS leaf function call
+ *
+ * Return:
+ * - true: ENCLS leaf faulted.
+ * - false: Otherwise.
+ */
+static inline bool encls_faulted(int ret)
+{
+ return ret & SGX_ENCLS_FAULT_FLAG;
+}
+
+/**
+ * encls_failed() - Check if an ENCLS function failed
+ * @ret: the return value of an ENCLS function call
+ *
+ * Check if an ENCLS function failed. This happens when the function causes a
+ * fault that is not caused by an EPCM conflict or when the function returns a
+ * non-zero value.
+ */
+static inline bool encls_failed(int ret)
+{
+ if (encls_faulted(ret))
+ return ENCLS_TRAPNR(ret) != X86_TRAP_PF;
+
+ return !!ret;
+}
+
+/**
+ * __encls_ret_N - encode an ENCLS function that returns an error code in EAX
+ * @rax: function number
+ * @inputs: asm inputs for the function
+ *
+ * Emit assembly for an ENCLS function that returns an error code, e.g. EREMOVE.
+ * And because SGX isn't complex enough as it is, function that return an error
+ * code also modify flags.
+ *
+ * Return:
+ * 0 on success,
+ * SGX error code on failure
+ */
+#define __encls_ret_N(rax, inputs...) \
+ ({ \
+ int ret; \
+ asm volatile( \
+ "1: encls\n" \
+ "2:\n" \
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_SGX) \
+ : "=a"(ret) \
+ : "a"(rax), inputs \
+ : "memory", "cc"); \
+ ret; \
+ })
+
+#define __encls_ret_1(rax, rcx) \
+ ({ \
+ __encls_ret_N(rax, "c"(rcx)); \
+ })
+
+#define __encls_ret_2(rax, rbx, rcx) \
+ ({ \
+ __encls_ret_N(rax, "b"(rbx), "c"(rcx)); \
+ })
+
+#define __encls_ret_3(rax, rbx, rcx, rdx) \
+ ({ \
+ __encls_ret_N(rax, "b"(rbx), "c"(rcx), "d"(rdx)); \
+ })
+
+/**
+ * __encls_N - encode an ENCLS function that doesn't return an error code
+ * @rax: function number
+ * @rbx_out: optional output variable
+ * @inputs: asm inputs for the function
+ *
+ * Emit assembly for an ENCLS function that does not return an error code, e.g.
+ * ECREATE. Leaves without error codes either succeed or fault. @rbx_out is an
+ * optional parameter for use by EDGBRD, which returns the requested value in
+ * RBX.
+ *
+ * Return:
+ * 0 on success,
+ * trapnr with SGX_ENCLS_FAULT_FLAG set on fault
+ */
+#define __encls_N(rax, rbx_out, inputs...) \
+ ({ \
+ int ret; \
+ asm volatile( \
+ "1: encls\n\t" \
+ "xor %%eax,%%eax\n" \
+ "2:\n" \
+ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_SGX) \
+ : "=a"(ret), "=b"(rbx_out) \
+ : "a"(rax), inputs \
+ : "memory"); \
+ ret; \
+ })
+
+#define __encls_2(rax, rbx, rcx) \
+ ({ \
+ unsigned long ign_rbx_out; \
+ __encls_N(rax, ign_rbx_out, "b"(rbx), "c"(rcx)); \
+ })
+
+#define __encls_1_1(rax, data, rcx) \
+ ({ \
+ unsigned long rbx_out; \
+ int ret = __encls_N(rax, rbx_out, "c"(rcx)); \
+ if (!ret) \
+ data = rbx_out; \
+ ret; \
+ })
+
+/* Initialize an EPC page into an SGX Enclave Control Structure (SECS) page. */
+static inline int __ecreate(struct sgx_pageinfo *pginfo, void *secs)
+{
+ return __encls_2(ECREATE, pginfo, secs);
+}
+
+/* Hash a 256 byte region of an enclave page to SECS:MRENCLAVE. */
+static inline int __eextend(void *secs, void *addr)
+{
+ return __encls_2(EEXTEND, secs, addr);
+}
+
+/*
+ * Associate an EPC page to an enclave either as a REG or TCS page
+ * populated with the provided data.
+ */
+static inline int __eadd(struct sgx_pageinfo *pginfo, void *addr)
+{
+ return __encls_2(EADD, pginfo, addr);
+}
+
+/* Finalize enclave build, initialize enclave for user code execution. */
+static inline int __einit(void *sigstruct, void *token, void *secs)
+{
+ return __encls_ret_3(EINIT, sigstruct, secs, token);
+}
+
+/* Disassociate EPC page from its enclave and mark it as unused. */
+static inline int __eremove(void *addr)
+{
+ return __encls_ret_1(EREMOVE, addr);
+}
+
+/* Copy data to an EPC page belonging to a debug enclave. */
+static inline int __edbgwr(void *addr, unsigned long *data)
+{
+ return __encls_2(EDGBWR, *data, addr);
+}
+
+/* Copy data from an EPC page belonging to a debug enclave. */
+static inline int __edbgrd(void *addr, unsigned long *data)
+{
+ return __encls_1_1(EDGBRD, *data, addr);
+}
+
+/* Track that software has completed the required TLB address clears. */
+static inline int __etrack(void *addr)
+{
+ return __encls_ret_1(ETRACK, addr);
+}
+
+/* Load, verify, and unblock an EPC page. */
+static inline int __eldu(struct sgx_pageinfo *pginfo, void *addr,
+ void *va)
+{
+ return __encls_ret_3(ELDU, pginfo, addr, va);
+}
+
+/* Make EPC page inaccessible to enclave, ready to be written to memory. */
+static inline int __eblock(void *addr)
+{
+ return __encls_ret_1(EBLOCK, addr);
+}
+
+/* Initialize an EPC page into a Version Array (VA) page. */
+static inline int __epa(void *addr)
+{
+ unsigned long rbx = SGX_PAGE_TYPE_VA;
+
+ return __encls_2(EPA, rbx, addr);
+}
+
+/* Invalidate an EPC page and write it out to main memory. */
+static inline int __ewb(struct sgx_pageinfo *pginfo, void *addr,
+ void *va)
+{
+ return __encls_ret_3(EWB, pginfo, addr, va);
+}
+
+/* Restrict the EPCM permissions of an EPC page. */
+static inline int __emodpr(struct sgx_secinfo *secinfo, void *addr)
+{
+ return __encls_ret_2(EMODPR, secinfo, addr);
+}
+
+/* Change the type of an EPC page. */
+static inline int __emodt(struct sgx_secinfo *secinfo, void *addr)
+{
+ return __encls_ret_2(EMODT, secinfo, addr);
+}
+
+/* Zero a page of EPC memory and add it to an initialized enclave. */
+static inline int __eaug(struct sgx_pageinfo *pginfo, void *addr)
+{
+ return __encls_2(EAUG, pginfo, addr);
+}
+
+/* Attempt to update CPUSVN at runtime. */
+static inline int __eupdatesvn(void)
+{
+ return __encls_ret_1(EUPDATESVN, "");
+}
+#endif /* _X86_ENCLS_H */
diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
new file mode 100644
index 000000000000..66f1efa16fbb
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/ioctl.c
@@ -0,0 +1,1244 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2016-20 Intel Corporation. */
+
+#include <asm/mman.h>
+#include <asm/sgx.h>
+#include <crypto/sha2.h>
+#include <linux/mman.h>
+#include <linux/delay.h>
+#include <linux/file.h>
+#include <linux/hashtable.h>
+#include <linux/highmem.h>
+#include <linux/ratelimit.h>
+#include <linux/sched/signal.h>
+#include <linux/shmem_fs.h>
+#include <linux/slab.h>
+#include <linux/suspend.h>
+#include "driver.h"
+#include "encl.h"
+#include "encls.h"
+
+struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl, bool reclaim)
+{
+ struct sgx_va_page *va_page = NULL;
+ void *err;
+
+ BUILD_BUG_ON(SGX_VA_SLOT_COUNT !=
+ (SGX_ENCL_PAGE_VA_OFFSET_MASK >> 3) + 1);
+
+ if (!(encl->page_cnt % SGX_VA_SLOT_COUNT)) {
+ va_page = kzalloc(sizeof(*va_page), GFP_KERNEL);
+ if (!va_page)
+ return ERR_PTR(-ENOMEM);
+
+ va_page->epc_page = sgx_alloc_va_page(reclaim);
+ if (IS_ERR(va_page->epc_page)) {
+ err = ERR_CAST(va_page->epc_page);
+ kfree(va_page);
+ return err;
+ }
+
+ WARN_ON_ONCE(encl->page_cnt % SGX_VA_SLOT_COUNT);
+ }
+ encl->page_cnt++;
+ return va_page;
+}
+
+void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page)
+{
+ encl->page_cnt--;
+
+ if (va_page) {
+ sgx_encl_free_epc_page(va_page->epc_page);
+ list_del(&va_page->list);
+ kfree(va_page);
+ }
+}
+
+static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs)
+{
+ struct sgx_epc_page *secs_epc;
+ struct sgx_va_page *va_page;
+ struct sgx_pageinfo pginfo;
+ struct sgx_secinfo secinfo;
+ unsigned long encl_size;
+ struct file *backing;
+ long ret;
+
+ /*
+ * ECREATE would detect this too, but checking here also ensures
+ * that the 'encl_size' calculations below can never overflow.
+ */
+ if (!is_power_of_2(secs->size))
+ return -EINVAL;
+
+ va_page = sgx_encl_grow(encl, true);
+ if (IS_ERR(va_page))
+ return PTR_ERR(va_page);
+ else if (va_page)
+ list_add(&va_page->list, &encl->va_pages);
+ /* else the tail page of the VA page list had free slots. */
+
+ /* The extra page goes to SECS. */
+ encl_size = secs->size + PAGE_SIZE;
+
+ backing = shmem_file_setup("SGX backing", encl_size + (encl_size >> 5),
+ VM_NORESERVE);
+ if (IS_ERR(backing)) {
+ ret = PTR_ERR(backing);
+ goto err_out_shrink;
+ }
+
+ encl->backing = backing;
+
+ secs_epc = sgx_alloc_epc_page(&encl->secs, true);
+ if (IS_ERR(secs_epc)) {
+ ret = PTR_ERR(secs_epc);
+ goto err_out_backing;
+ }
+
+ encl->secs.epc_page = secs_epc;
+
+ pginfo.addr = 0;
+ pginfo.contents = (unsigned long)secs;
+ pginfo.metadata = (unsigned long)&secinfo;
+ pginfo.secs = 0;
+ memset(&secinfo, 0, sizeof(secinfo));
+
+ ret = __ecreate((void *)&pginfo, sgx_get_epc_virt_addr(secs_epc));
+ if (ret) {
+ ret = -EIO;
+ goto err_out;
+ }
+
+ if (secs->attributes & SGX_ATTR_DEBUG)
+ set_bit(SGX_ENCL_DEBUG, &encl->flags);
+
+ encl->secs.encl = encl;
+ encl->secs.type = SGX_PAGE_TYPE_SECS;
+ encl->base = secs->base;
+ encl->size = secs->size;
+ encl->attributes = secs->attributes;
+ encl->attributes_mask = SGX_ATTR_UNPRIV_MASK;
+
+ /* Set only after completion, as encl->lock has not been taken. */
+ set_bit(SGX_ENCL_CREATED, &encl->flags);
+
+ return 0;
+
+err_out:
+ sgx_encl_free_epc_page(encl->secs.epc_page);
+ encl->secs.epc_page = NULL;
+
+err_out_backing:
+ fput(encl->backing);
+ encl->backing = NULL;
+
+err_out_shrink:
+ sgx_encl_shrink(encl, va_page);
+
+ return ret;
+}
+
+/**
+ * sgx_ioc_enclave_create() - handler for %SGX_IOC_ENCLAVE_CREATE
+ * @encl: An enclave pointer.
+ * @arg: The ioctl argument.
+ *
+ * Allocate kernel data structures for the enclave and invoke ECREATE.
+ *
+ * Return:
+ * - 0: Success.
+ * - -EIO: ECREATE failed.
+ * - -errno: POSIX error.
+ */
+static long sgx_ioc_enclave_create(struct sgx_encl *encl, void __user *arg)
+{
+ struct sgx_enclave_create create_arg;
+ void *secs;
+ int ret;
+
+ if (test_bit(SGX_ENCL_CREATED, &encl->flags))
+ return -EINVAL;
+
+ if (copy_from_user(&create_arg, arg, sizeof(create_arg)))
+ return -EFAULT;
+
+ secs = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!secs)
+ return -ENOMEM;
+
+ if (copy_from_user(secs, (void __user *)create_arg.src, PAGE_SIZE))
+ ret = -EFAULT;
+ else
+ ret = sgx_encl_create(encl, secs);
+
+ kfree(secs);
+ return ret;
+}
+
+static int sgx_validate_secinfo(struct sgx_secinfo *secinfo)
+{
+ u64 perm = secinfo->flags & SGX_SECINFO_PERMISSION_MASK;
+ u64 pt = secinfo->flags & SGX_SECINFO_PAGE_TYPE_MASK;
+
+ if (pt != SGX_SECINFO_REG && pt != SGX_SECINFO_TCS)
+ return -EINVAL;
+
+ if ((perm & SGX_SECINFO_W) && !(perm & SGX_SECINFO_R))
+ return -EINVAL;
+
+ /*
+ * CPU will silently overwrite the permissions as zero, which means
+ * that we need to validate it ourselves.
+ */
+ if (pt == SGX_SECINFO_TCS && perm)
+ return -EINVAL;
+
+ if (secinfo->flags & SGX_SECINFO_RESERVED_MASK)
+ return -EINVAL;
+
+ if (memchr_inv(secinfo->reserved, 0, sizeof(secinfo->reserved)))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int __sgx_encl_add_page(struct sgx_encl *encl,
+ struct sgx_encl_page *encl_page,
+ struct sgx_epc_page *epc_page,
+ struct sgx_secinfo *secinfo, unsigned long src)
+{
+ struct sgx_pageinfo pginfo;
+ struct vm_area_struct *vma;
+ struct page *src_page;
+ int ret;
+
+ /* Deny noexec. */
+ vma = find_vma(current->mm, src);
+ if (!vma)
+ return -EFAULT;
+
+ if (!(vma->vm_flags & VM_MAYEXEC))
+ return -EACCES;
+
+ ret = get_user_pages(src, 1, 0, &src_page);
+ if (ret < 1)
+ return -EFAULT;
+
+ pginfo.secs = (unsigned long)sgx_get_epc_virt_addr(encl->secs.epc_page);
+ pginfo.addr = encl_page->desc & PAGE_MASK;
+ pginfo.metadata = (unsigned long)secinfo;
+ pginfo.contents = (unsigned long)kmap_local_page(src_page);
+
+ ret = __eadd(&pginfo, sgx_get_epc_virt_addr(epc_page));
+
+ kunmap_local((void *)pginfo.contents);
+ put_page(src_page);
+
+ return ret ? -EIO : 0;
+}
+
+/*
+ * If the caller requires measurement of the page as a proof for the content,
+ * use EEXTEND to add a measurement for 256 bytes of the page. Repeat this
+ * operation until the entire page is measured."
+ */
+static int __sgx_encl_extend(struct sgx_encl *encl,
+ struct sgx_epc_page *epc_page)
+{
+ unsigned long offset;
+ int ret;
+
+ for (offset = 0; offset < PAGE_SIZE; offset += SGX_EEXTEND_BLOCK_SIZE) {
+ ret = __eextend(sgx_get_epc_virt_addr(encl->secs.epc_page),
+ sgx_get_epc_virt_addr(epc_page) + offset);
+ if (ret) {
+ if (encls_failed(ret))
+ ENCLS_WARN(ret, "EEXTEND");
+
+ return -EIO;
+ }
+ }
+
+ return 0;
+}
+
+static int sgx_encl_add_page(struct sgx_encl *encl, unsigned long src,
+ unsigned long offset, struct sgx_secinfo *secinfo,
+ unsigned long flags)
+{
+ struct sgx_encl_page *encl_page;
+ struct sgx_epc_page *epc_page;
+ struct sgx_va_page *va_page;
+ int ret;
+
+ encl_page = sgx_encl_page_alloc(encl, offset, secinfo->flags);
+ if (IS_ERR(encl_page))
+ return PTR_ERR(encl_page);
+
+ epc_page = sgx_alloc_epc_page(encl_page, true);
+ if (IS_ERR(epc_page)) {
+ kfree(encl_page);
+ return PTR_ERR(epc_page);
+ }
+
+ va_page = sgx_encl_grow(encl, true);
+ if (IS_ERR(va_page)) {
+ ret = PTR_ERR(va_page);
+ goto err_out_free;
+ }
+
+ mmap_read_lock(current->mm);
+ mutex_lock(&encl->lock);
+
+ /*
+ * Adding to encl->va_pages must be done under encl->lock. Ditto for
+ * deleting (via sgx_encl_shrink()) in the error path.
+ */
+ if (va_page)
+ list_add(&va_page->list, &encl->va_pages);
+
+ /*
+ * Insert prior to EADD in case of OOM. EADD modifies MRENCLAVE, i.e.
+ * can't be gracefully unwound, while failure on EADD/EXTEND is limited
+ * to userspace errors (or kernel/hardware bugs).
+ */
+ ret = xa_insert(&encl->page_array, PFN_DOWN(encl_page->desc),
+ encl_page, GFP_KERNEL);
+ if (ret)
+ goto err_out_unlock;
+
+ ret = __sgx_encl_add_page(encl, encl_page, epc_page, secinfo,
+ src);
+ if (ret)
+ goto err_out;
+
+ /*
+ * Complete the "add" before doing the "extend" so that the "add"
+ * isn't in a half-baked state in the extremely unlikely scenario
+ * the enclave will be destroyed in response to EEXTEND failure.
+ */
+ encl_page->encl = encl;
+ encl_page->epc_page = epc_page;
+ encl_page->type = (secinfo->flags & SGX_SECINFO_PAGE_TYPE_MASK) >> 8;
+ encl->secs_child_cnt++;
+
+ if (flags & SGX_PAGE_MEASURE) {
+ ret = __sgx_encl_extend(encl, epc_page);
+ if (ret)
+ goto err_out;
+ }
+
+ sgx_mark_page_reclaimable(encl_page->epc_page);
+ mutex_unlock(&encl->lock);
+ mmap_read_unlock(current->mm);
+ return ret;
+
+err_out:
+ xa_erase(&encl->page_array, PFN_DOWN(encl_page->desc));
+
+err_out_unlock:
+ sgx_encl_shrink(encl, va_page);
+ mutex_unlock(&encl->lock);
+ mmap_read_unlock(current->mm);
+
+err_out_free:
+ sgx_encl_free_epc_page(epc_page);
+ kfree(encl_page);
+
+ return ret;
+}
+
+/*
+ * Ensure user provided offset and length values are valid for
+ * an enclave.
+ */
+static int sgx_validate_offset_length(struct sgx_encl *encl,
+ unsigned long offset,
+ unsigned long length)
+{
+ if (!IS_ALIGNED(offset, PAGE_SIZE))
+ return -EINVAL;
+
+ if (!length || !IS_ALIGNED(length, PAGE_SIZE))
+ return -EINVAL;
+
+ if (offset + length < offset)
+ return -EINVAL;
+
+ if (offset + length - PAGE_SIZE >= encl->size)
+ return -EINVAL;
+
+ return 0;
+}
+
+/**
+ * sgx_ioc_enclave_add_pages() - The handler for %SGX_IOC_ENCLAVE_ADD_PAGES
+ * @encl: an enclave pointer
+ * @arg: a user pointer to a struct sgx_enclave_add_pages instance
+ *
+ * Add one or more pages to an uninitialized enclave, and optionally extend the
+ * measurement with the contents of the page. The SECINFO and measurement mask
+ * are applied to all pages.
+ *
+ * A SECINFO for a TCS is required to always contain zero permissions because
+ * CPU silently zeros them. Allowing anything else would cause a mismatch in
+ * the measurement.
+ *
+ * mmap()'s protection bits are capped by the page permissions. For each page
+ * address, the maximum protection bits are computed with the following
+ * heuristics:
+ *
+ * 1. A regular page: PROT_R, PROT_W and PROT_X match the SECINFO permissions.
+ * 2. A TCS page: PROT_R | PROT_W.
+ *
+ * mmap() is not allowed to surpass the minimum of the maximum protection bits
+ * within the given address range.
+ *
+ * The function deinitializes kernel data structures for enclave and returns
+ * -EIO in any of the following conditions:
+ *
+ * - Enclave Page Cache (EPC), the physical memory holding enclaves, has
+ * been invalidated. This will cause EADD and EEXTEND to fail.
+ * - If the source address is corrupted somehow when executing EADD.
+ *
+ * Return:
+ * - 0: Success.
+ * - -EACCES: The source page is located in a noexec partition.
+ * - -ENOMEM: Out of EPC pages.
+ * - -EINTR: The call was interrupted before data was processed.
+ * - -EIO: Either EADD or EEXTEND failed because invalid source address
+ * or power cycle.
+ * - -errno: POSIX error.
+ */
+static long sgx_ioc_enclave_add_pages(struct sgx_encl *encl, void __user *arg)
+{
+ struct sgx_enclave_add_pages add_arg;
+ struct sgx_secinfo secinfo;
+ unsigned long c;
+ int ret;
+
+ if (!test_bit(SGX_ENCL_CREATED, &encl->flags) ||
+ test_bit(SGX_ENCL_INITIALIZED, &encl->flags))
+ return -EINVAL;
+
+ if (copy_from_user(&add_arg, arg, sizeof(add_arg)))
+ return -EFAULT;
+
+ if (!IS_ALIGNED(add_arg.src, PAGE_SIZE))
+ return -EINVAL;
+
+ if (sgx_validate_offset_length(encl, add_arg.offset, add_arg.length))
+ return -EINVAL;
+
+ if (copy_from_user(&secinfo, (void __user *)add_arg.secinfo,
+ sizeof(secinfo)))
+ return -EFAULT;
+
+ if (sgx_validate_secinfo(&secinfo))
+ return -EINVAL;
+
+ for (c = 0 ; c < add_arg.length; c += PAGE_SIZE) {
+ if (signal_pending(current)) {
+ if (!c)
+ ret = -ERESTARTSYS;
+
+ break;
+ }
+
+ if (need_resched())
+ cond_resched();
+
+ ret = sgx_encl_add_page(encl, add_arg.src + c, add_arg.offset + c,
+ &secinfo, add_arg.flags);
+ if (ret)
+ break;
+ }
+
+ add_arg.count = c;
+
+ if (copy_to_user(arg, &add_arg, sizeof(add_arg)))
+ return -EFAULT;
+
+ return ret;
+}
+
+static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
+ void *token)
+{
+ u64 mrsigner[4];
+ int i, j;
+ void *addr;
+ int ret;
+
+ /*
+ * Deny initializing enclaves with attributes (namely provisioning)
+ * that have not been explicitly allowed.
+ */
+ if (encl->attributes & ~encl->attributes_mask)
+ return -EACCES;
+
+ /*
+ * Attributes should not be enforced *only* against what's available on
+ * platform (done in sgx_encl_create) but checked and enforced against
+ * the mask for enforcement in sigstruct. For example an enclave could
+ * opt to sign with AVX bit in xfrm, but still be loadable on a platform
+ * without it if the sigstruct->body.attributes_mask does not turn that
+ * bit on.
+ */
+ if (sigstruct->body.attributes & sigstruct->body.attributes_mask &
+ sgx_attributes_reserved_mask)
+ return -EINVAL;
+
+ if (sigstruct->body.miscselect & sigstruct->body.misc_mask &
+ sgx_misc_reserved_mask)
+ return -EINVAL;
+
+ if (sigstruct->body.xfrm & sigstruct->body.xfrm_mask &
+ sgx_xfrm_reserved_mask)
+ return -EINVAL;
+
+ sha256(sigstruct->modulus, SGX_MODULUS_SIZE, (u8 *)mrsigner);
+
+ mutex_lock(&encl->lock);
+
+ /*
+ * ENCLS[EINIT] is interruptible because it has such a high latency,
+ * e.g. 50k+ cycles on success. If an IRQ/NMI/SMI becomes pending,
+ * EINIT may fail with SGX_UNMASKED_EVENT so that the event can be
+ * serviced.
+ */
+ for (i = 0; i < SGX_EINIT_SLEEP_COUNT; i++) {
+ for (j = 0; j < SGX_EINIT_SPIN_COUNT; j++) {
+ addr = sgx_get_epc_virt_addr(encl->secs.epc_page);
+
+ preempt_disable();
+
+ sgx_update_lepubkeyhash(mrsigner);
+
+ ret = __einit(sigstruct, token, addr);
+
+ preempt_enable();
+
+ if (ret == SGX_UNMASKED_EVENT)
+ continue;
+ else
+ break;
+ }
+
+ if (ret != SGX_UNMASKED_EVENT)
+ break;
+
+ msleep_interruptible(SGX_EINIT_SLEEP_TIME);
+
+ if (signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ goto err_out;
+ }
+ }
+
+ if (encls_faulted(ret)) {
+ if (encls_failed(ret))
+ ENCLS_WARN(ret, "EINIT");
+
+ ret = -EIO;
+ } else if (ret) {
+ pr_debug("EINIT returned %d\n", ret);
+ ret = -EPERM;
+ } else {
+ set_bit(SGX_ENCL_INITIALIZED, &encl->flags);
+ }
+
+err_out:
+ mutex_unlock(&encl->lock);
+ return ret;
+}
+
+/**
+ * sgx_ioc_enclave_init() - handler for %SGX_IOC_ENCLAVE_INIT
+ * @encl: an enclave pointer
+ * @arg: userspace pointer to a struct sgx_enclave_init instance
+ *
+ * Flush any outstanding enqueued EADD operations and perform EINIT. The
+ * Launch Enclave Public Key Hash MSRs are rewritten as necessary to match
+ * the enclave's MRSIGNER, which is calculated from the provided sigstruct.
+ *
+ * Return:
+ * - 0: Success.
+ * - -EPERM: Invalid SIGSTRUCT.
+ * - -EIO: EINIT failed because of a power cycle.
+ * - -errno: POSIX error.
+ */
+static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg)
+{
+ struct sgx_sigstruct *sigstruct;
+ struct sgx_enclave_init init_arg;
+ void *token;
+ int ret;
+
+ if (!test_bit(SGX_ENCL_CREATED, &encl->flags) ||
+ test_bit(SGX_ENCL_INITIALIZED, &encl->flags))
+ return -EINVAL;
+
+ if (copy_from_user(&init_arg, arg, sizeof(init_arg)))
+ return -EFAULT;
+
+ /*
+ * 'sigstruct' must be on a page boundary and 'token' on a 512 byte
+ * boundary. kmalloc() will give this alignment when allocating
+ * PAGE_SIZE bytes.
+ */
+ sigstruct = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!sigstruct)
+ return -ENOMEM;
+
+ token = (void *)((unsigned long)sigstruct + PAGE_SIZE / 2);
+ memset(token, 0, SGX_LAUNCH_TOKEN_SIZE);
+
+ if (copy_from_user(sigstruct, (void __user *)init_arg.sigstruct,
+ sizeof(*sigstruct))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ /*
+ * A legacy field used with Intel signed enclaves. These used to mean
+ * regular and architectural enclaves. The CPU only accepts these values
+ * but they do not have any other meaning.
+ *
+ * Thus, reject any other values.
+ */
+ if (sigstruct->header.vendor != 0x0000 &&
+ sigstruct->header.vendor != 0x8086) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = sgx_encl_init(encl, sigstruct, token);
+
+out:
+ kfree(sigstruct);
+ return ret;
+}
+
+/**
+ * sgx_ioc_enclave_provision() - handler for %SGX_IOC_ENCLAVE_PROVISION
+ * @encl: an enclave pointer
+ * @arg: userspace pointer to a struct sgx_enclave_provision instance
+ *
+ * Allow ATTRIBUTE.PROVISION_KEY for an enclave by providing a file handle to
+ * /dev/sgx_provision.
+ *
+ * Return:
+ * - 0: Success.
+ * - -errno: Otherwise.
+ */
+static long sgx_ioc_enclave_provision(struct sgx_encl *encl, void __user *arg)
+{
+ struct sgx_enclave_provision params;
+
+ if (copy_from_user(&params, arg, sizeof(params)))
+ return -EFAULT;
+
+ return sgx_set_attribute(&encl->attributes_mask, params.fd);
+}
+
+/*
+ * Ensure enclave is ready for SGX2 functions. Readiness is checked
+ * by ensuring the hardware supports SGX2 and the enclave is initialized
+ * and thus able to handle requests to modify pages within it.
+ */
+static int sgx_ioc_sgx2_ready(struct sgx_encl *encl)
+{
+ if (!(cpu_feature_enabled(X86_FEATURE_SGX2)))
+ return -ENODEV;
+
+ if (!test_bit(SGX_ENCL_INITIALIZED, &encl->flags))
+ return -EINVAL;
+
+ return 0;
+}
+
+/*
+ * Some SGX functions require that no cached linear-to-physical address
+ * mappings are present before they can succeed. Collaborate with
+ * hardware via ENCLS[ETRACK] to ensure that all cached
+ * linear-to-physical address mappings belonging to all threads of
+ * the enclave are cleared. See sgx_encl_cpumask() for details.
+ *
+ * Must be called with enclave's mutex held from the time the
+ * SGX function requiring that no cached linear-to-physical mappings
+ * are present is executed until this ETRACK flow is complete.
+ */
+static int sgx_enclave_etrack(struct sgx_encl *encl)
+{
+ void *epc_virt;
+ int ret;
+
+ epc_virt = sgx_get_epc_virt_addr(encl->secs.epc_page);
+ ret = __etrack(epc_virt);
+ if (ret) {
+ /*
+ * ETRACK only fails when there is an OS issue. For
+ * example, two consecutive ETRACK was sent without
+ * completed IPI between.
+ */
+ pr_err_once("ETRACK returned %d (0x%x)", ret, ret);
+ /*
+ * Send IPIs to kick CPUs out of the enclave and
+ * try ETRACK again.
+ */
+ on_each_cpu_mask(sgx_encl_cpumask(encl), sgx_ipi_cb, NULL, 1);
+ ret = __etrack(epc_virt);
+ if (ret) {
+ pr_err_once("ETRACK repeat returned %d (0x%x)",
+ ret, ret);
+ return -EFAULT;
+ }
+ }
+ on_each_cpu_mask(sgx_encl_cpumask(encl), sgx_ipi_cb, NULL, 1);
+
+ return 0;
+}
+
+/**
+ * sgx_enclave_restrict_permissions() - Restrict EPCM permissions
+ * @encl: Enclave to which the pages belong.
+ * @modp: Checked parameters from user on which pages need modifying and
+ * their new permissions.
+ *
+ * Return:
+ * - 0: Success.
+ * - -errno: Otherwise.
+ */
+static long
+sgx_enclave_restrict_permissions(struct sgx_encl *encl,
+ struct sgx_enclave_restrict_permissions *modp)
+{
+ struct sgx_encl_page *entry;
+ struct sgx_secinfo secinfo;
+ unsigned long addr;
+ unsigned long c;
+ void *epc_virt;
+ int ret;
+
+ memset(&secinfo, 0, sizeof(secinfo));
+ secinfo.flags = modp->permissions & SGX_SECINFO_PERMISSION_MASK;
+
+ for (c = 0 ; c < modp->length; c += PAGE_SIZE) {
+ addr = encl->base + modp->offset + c;
+
+ sgx_reclaim_direct();
+
+ mutex_lock(&encl->lock);
+
+ entry = sgx_encl_load_page(encl, addr);
+ if (IS_ERR(entry)) {
+ ret = PTR_ERR(entry) == -EBUSY ? -EAGAIN : -EFAULT;
+ goto out_unlock;
+ }
+
+ /*
+ * Changing EPCM permissions is only supported on regular
+ * SGX pages. Attempting this change on other pages will
+ * result in #PF.
+ */
+ if (entry->type != SGX_PAGE_TYPE_REG) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ /*
+ * Apart from ensuring that read-access remains, do not verify
+ * the permission bits requested. Kernel has no control over
+ * how EPCM permissions can be relaxed from within the enclave.
+ * ENCLS[EMODPR] can only remove existing EPCM permissions,
+ * attempting to set new permissions will be ignored by the
+ * hardware.
+ */
+
+ /* Change EPCM permissions. */
+ epc_virt = sgx_get_epc_virt_addr(entry->epc_page);
+ ret = __emodpr(&secinfo, epc_virt);
+ if (encls_faulted(ret)) {
+ /*
+ * All possible faults should be avoidable:
+ * parameters have been checked, will only change
+ * permissions of a regular page, and no concurrent
+ * SGX1/SGX2 ENCLS instructions since these
+ * are protected with mutex.
+ */
+ pr_err_once("EMODPR encountered exception %d\n",
+ ENCLS_TRAPNR(ret));
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+ if (encls_failed(ret)) {
+ modp->result = ret;
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+
+ ret = sgx_enclave_etrack(encl);
+ if (ret) {
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+
+ mutex_unlock(&encl->lock);
+ }
+
+ ret = 0;
+ goto out;
+
+out_unlock:
+ mutex_unlock(&encl->lock);
+out:
+ modp->count = c;
+
+ return ret;
+}
+
+/**
+ * sgx_ioc_enclave_restrict_permissions() - handler for
+ * %SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS
+ * @encl: an enclave pointer
+ * @arg: userspace pointer to a &struct sgx_enclave_restrict_permissions
+ * instance
+ *
+ * SGX2 distinguishes between relaxing and restricting the enclave page
+ * permissions maintained by the hardware (EPCM permissions) of pages
+ * belonging to an initialized enclave (after SGX_IOC_ENCLAVE_INIT).
+ *
+ * EPCM permissions cannot be restricted from within the enclave, the enclave
+ * requires the kernel to run the privileged level 0 instructions ENCLS[EMODPR]
+ * and ENCLS[ETRACK]. An attempt to relax EPCM permissions with this call
+ * will be ignored by the hardware.
+ *
+ * Return:
+ * - 0: Success
+ * - -errno: Otherwise
+ */
+static long sgx_ioc_enclave_restrict_permissions(struct sgx_encl *encl,
+ void __user *arg)
+{
+ struct sgx_enclave_restrict_permissions params;
+ long ret;
+
+ ret = sgx_ioc_sgx2_ready(encl);
+ if (ret)
+ return ret;
+
+ if (copy_from_user(&params, arg, sizeof(params)))
+ return -EFAULT;
+
+ if (sgx_validate_offset_length(encl, params.offset, params.length))
+ return -EINVAL;
+
+ if (params.permissions & ~SGX_SECINFO_PERMISSION_MASK)
+ return -EINVAL;
+
+ /*
+ * Fail early if invalid permissions requested to prevent ENCLS[EMODPR]
+ * from faulting later when the CPU does the same check.
+ */
+ if ((params.permissions & SGX_SECINFO_W) &&
+ !(params.permissions & SGX_SECINFO_R))
+ return -EINVAL;
+
+ if (params.result || params.count)
+ return -EINVAL;
+
+ ret = sgx_enclave_restrict_permissions(encl, &params);
+
+ if (copy_to_user(arg, &params, sizeof(params)))
+ return -EFAULT;
+
+ return ret;
+}
+
+/**
+ * sgx_enclave_modify_types() - Modify type of SGX enclave pages
+ * @encl: Enclave to which the pages belong.
+ * @modt: Checked parameters from user about which pages need modifying
+ * and their new page type.
+ *
+ * Return:
+ * - 0: Success
+ * - -errno: Otherwise
+ */
+static long sgx_enclave_modify_types(struct sgx_encl *encl,
+ struct sgx_enclave_modify_types *modt)
+{
+ unsigned long max_prot_restore;
+ enum sgx_page_type page_type;
+ struct sgx_encl_page *entry;
+ struct sgx_secinfo secinfo;
+ unsigned long prot;
+ unsigned long addr;
+ unsigned long c;
+ void *epc_virt;
+ int ret;
+
+ page_type = modt->page_type & SGX_PAGE_TYPE_MASK;
+
+ /*
+ * The only new page types allowed by hardware are PT_TCS and PT_TRIM.
+ */
+ if (page_type != SGX_PAGE_TYPE_TCS && page_type != SGX_PAGE_TYPE_TRIM)
+ return -EINVAL;
+
+ memset(&secinfo, 0, sizeof(secinfo));
+
+ secinfo.flags = page_type << 8;
+
+ for (c = 0 ; c < modt->length; c += PAGE_SIZE) {
+ addr = encl->base + modt->offset + c;
+
+ sgx_reclaim_direct();
+
+ mutex_lock(&encl->lock);
+
+ entry = sgx_encl_load_page(encl, addr);
+ if (IS_ERR(entry)) {
+ ret = PTR_ERR(entry) == -EBUSY ? -EAGAIN : -EFAULT;
+ goto out_unlock;
+ }
+
+ /*
+ * Borrow the logic from the Intel SDM. Regular pages
+ * (SGX_PAGE_TYPE_REG) can change type to SGX_PAGE_TYPE_TCS
+ * or SGX_PAGE_TYPE_TRIM but TCS pages can only be trimmed.
+ * CET pages not supported yet.
+ */
+ if (!(entry->type == SGX_PAGE_TYPE_REG ||
+ (entry->type == SGX_PAGE_TYPE_TCS &&
+ page_type == SGX_PAGE_TYPE_TRIM))) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ max_prot_restore = entry->vm_max_prot_bits;
+
+ /*
+ * Once a regular page becomes a TCS page it cannot be
+ * changed back. So the maximum allowed protection reflects
+ * the TCS page that is always RW from kernel perspective but
+ * will be inaccessible from within enclave. Before doing
+ * so, do make sure that the new page type continues to
+ * respect the originally vetted page permissions.
+ */
+ if (entry->type == SGX_PAGE_TYPE_REG &&
+ page_type == SGX_PAGE_TYPE_TCS) {
+ if (~entry->vm_max_prot_bits & (VM_READ | VM_WRITE)) {
+ ret = -EPERM;
+ goto out_unlock;
+ }
+ prot = PROT_READ | PROT_WRITE;
+ entry->vm_max_prot_bits = calc_vm_prot_bits(prot, 0);
+
+ /*
+ * Prevent page from being reclaimed while mutex
+ * is released.
+ */
+ if (sgx_unmark_page_reclaimable(entry->epc_page)) {
+ ret = -EAGAIN;
+ goto out_entry_changed;
+ }
+
+ /*
+ * Do not keep encl->lock because of dependency on
+ * mmap_lock acquired in sgx_zap_enclave_ptes().
+ */
+ mutex_unlock(&encl->lock);
+
+ sgx_zap_enclave_ptes(encl, addr);
+
+ mutex_lock(&encl->lock);
+
+ sgx_mark_page_reclaimable(entry->epc_page);
+ }
+
+ /* Change EPC type */
+ epc_virt = sgx_get_epc_virt_addr(entry->epc_page);
+ ret = __emodt(&secinfo, epc_virt);
+ if (encls_faulted(ret)) {
+ /*
+ * All possible faults should be avoidable:
+ * parameters have been checked, will only change
+ * valid page types, and no concurrent
+ * SGX1/SGX2 ENCLS instructions since these are
+ * protected with mutex.
+ */
+ pr_err_once("EMODT encountered exception %d\n",
+ ENCLS_TRAPNR(ret));
+ ret = -EFAULT;
+ goto out_entry_changed;
+ }
+ if (encls_failed(ret)) {
+ modt->result = ret;
+ ret = -EFAULT;
+ goto out_entry_changed;
+ }
+
+ ret = sgx_enclave_etrack(encl);
+ if (ret) {
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+
+ entry->type = page_type;
+
+ mutex_unlock(&encl->lock);
+ }
+
+ ret = 0;
+ goto out;
+
+out_entry_changed:
+ entry->vm_max_prot_bits = max_prot_restore;
+out_unlock:
+ mutex_unlock(&encl->lock);
+out:
+ modt->count = c;
+
+ return ret;
+}
+
+/**
+ * sgx_ioc_enclave_modify_types() - handler for %SGX_IOC_ENCLAVE_MODIFY_TYPES
+ * @encl: an enclave pointer
+ * @arg: userspace pointer to a &struct sgx_enclave_modify_types instance
+ *
+ * Ability to change the enclave page type supports the following use cases:
+ *
+ * * It is possible to add TCS pages to an enclave by changing the type of
+ * regular pages (%SGX_PAGE_TYPE_REG) to TCS (%SGX_PAGE_TYPE_TCS) pages.
+ * With this support the number of threads supported by an initialized
+ * enclave can be increased dynamically.
+ *
+ * * Regular or TCS pages can dynamically be removed from an initialized
+ * enclave by changing the page type to %SGX_PAGE_TYPE_TRIM. Changing the
+ * page type to %SGX_PAGE_TYPE_TRIM marks the page for removal with actual
+ * removal done by handler of %SGX_IOC_ENCLAVE_REMOVE_PAGES ioctl() called
+ * after ENCLU[EACCEPT] is run on %SGX_PAGE_TYPE_TRIM page from within the
+ * enclave.
+ *
+ * Return:
+ * - 0: Success
+ * - -errno: Otherwise
+ */
+static long sgx_ioc_enclave_modify_types(struct sgx_encl *encl,
+ void __user *arg)
+{
+ struct sgx_enclave_modify_types params;
+ long ret;
+
+ ret = sgx_ioc_sgx2_ready(encl);
+ if (ret)
+ return ret;
+
+ if (copy_from_user(&params, arg, sizeof(params)))
+ return -EFAULT;
+
+ if (sgx_validate_offset_length(encl, params.offset, params.length))
+ return -EINVAL;
+
+ if (params.page_type & ~SGX_PAGE_TYPE_MASK)
+ return -EINVAL;
+
+ if (params.result || params.count)
+ return -EINVAL;
+
+ ret = sgx_enclave_modify_types(encl, &params);
+
+ if (copy_to_user(arg, &params, sizeof(params)))
+ return -EFAULT;
+
+ return ret;
+}
+
+/**
+ * sgx_encl_remove_pages() - Remove trimmed pages from SGX enclave
+ * @encl: Enclave to which the pages belong
+ * @params: Checked parameters from user on which pages need to be removed
+ *
+ * Return:
+ * - 0: Success.
+ * - -errno: Otherwise.
+ */
+static long sgx_encl_remove_pages(struct sgx_encl *encl,
+ struct sgx_enclave_remove_pages *params)
+{
+ struct sgx_encl_page *entry;
+ struct sgx_secinfo secinfo;
+ unsigned long addr;
+ unsigned long c;
+ void *epc_virt;
+ int ret;
+
+ memset(&secinfo, 0, sizeof(secinfo));
+ secinfo.flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_X;
+
+ for (c = 0 ; c < params->length; c += PAGE_SIZE) {
+ addr = encl->base + params->offset + c;
+
+ sgx_reclaim_direct();
+
+ mutex_lock(&encl->lock);
+
+ entry = sgx_encl_load_page(encl, addr);
+ if (IS_ERR(entry)) {
+ ret = PTR_ERR(entry) == -EBUSY ? -EAGAIN : -EFAULT;
+ goto out_unlock;
+ }
+
+ if (entry->type != SGX_PAGE_TYPE_TRIM) {
+ ret = -EPERM;
+ goto out_unlock;
+ }
+
+ /*
+ * ENCLS[EMODPR] is a no-op instruction used to inform if
+ * ENCLU[EACCEPT] was run from within the enclave. If
+ * ENCLS[EMODPR] is run with RWX on a trimmed page that is
+ * not yet accepted then it will return
+ * %SGX_PAGE_NOT_MODIFIABLE, after the trimmed page is
+ * accepted the instruction will encounter a page fault.
+ */
+ epc_virt = sgx_get_epc_virt_addr(entry->epc_page);
+ ret = __emodpr(&secinfo, epc_virt);
+ if (!encls_faulted(ret) || ENCLS_TRAPNR(ret) != X86_TRAP_PF) {
+ ret = -EPERM;
+ goto out_unlock;
+ }
+
+ if (sgx_unmark_page_reclaimable(entry->epc_page)) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
+ /*
+ * Do not keep encl->lock because of dependency on
+ * mmap_lock acquired in sgx_zap_enclave_ptes().
+ */
+ mutex_unlock(&encl->lock);
+
+ sgx_zap_enclave_ptes(encl, addr);
+
+ mutex_lock(&encl->lock);
+
+ sgx_encl_free_epc_page(entry->epc_page);
+ encl->secs_child_cnt--;
+ entry->epc_page = NULL;
+ xa_erase(&encl->page_array, PFN_DOWN(entry->desc));
+ sgx_encl_shrink(encl, NULL);
+ kfree(entry);
+
+ mutex_unlock(&encl->lock);
+ }
+
+ ret = 0;
+ goto out;
+
+out_unlock:
+ mutex_unlock(&encl->lock);
+out:
+ params->count = c;
+
+ return ret;
+}
+
+/**
+ * sgx_ioc_enclave_remove_pages() - handler for %SGX_IOC_ENCLAVE_REMOVE_PAGES
+ * @encl: an enclave pointer
+ * @arg: userspace pointer to &struct sgx_enclave_remove_pages instance
+ *
+ * Final step of the flow removing pages from an initialized enclave. The
+ * complete flow is:
+ *
+ * 1) User changes the type of the pages to be removed to %SGX_PAGE_TYPE_TRIM
+ * using the %SGX_IOC_ENCLAVE_MODIFY_TYPES ioctl().
+ * 2) User approves the page removal by running ENCLU[EACCEPT] from within
+ * the enclave.
+ * 3) User initiates actual page removal using the
+ * %SGX_IOC_ENCLAVE_REMOVE_PAGES ioctl() that is handled here.
+ *
+ * First remove any page table entries pointing to the page and then proceed
+ * with the actual removal of the enclave page and data in support of it.
+ *
+ * VA pages are not affected by this removal. It is thus possible that the
+ * enclave may end up with more VA pages than needed to support all its
+ * pages.
+ *
+ * Return:
+ * - 0: Success
+ * - -errno: Otherwise
+ */
+static long sgx_ioc_enclave_remove_pages(struct sgx_encl *encl,
+ void __user *arg)
+{
+ struct sgx_enclave_remove_pages params;
+ long ret;
+
+ ret = sgx_ioc_sgx2_ready(encl);
+ if (ret)
+ return ret;
+
+ if (copy_from_user(&params, arg, sizeof(params)))
+ return -EFAULT;
+
+ if (sgx_validate_offset_length(encl, params.offset, params.length))
+ return -EINVAL;
+
+ if (params.count)
+ return -EINVAL;
+
+ ret = sgx_encl_remove_pages(encl, &params);
+
+ if (copy_to_user(arg, &params, sizeof(params)))
+ return -EFAULT;
+
+ return ret;
+}
+
+long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
+{
+ struct sgx_encl *encl = filep->private_data;
+ int ret;
+
+ if (test_and_set_bit(SGX_ENCL_IOCTL, &encl->flags))
+ return -EBUSY;
+
+ switch (cmd) {
+ case SGX_IOC_ENCLAVE_CREATE:
+ ret = sgx_ioc_enclave_create(encl, (void __user *)arg);
+ break;
+ case SGX_IOC_ENCLAVE_ADD_PAGES:
+ ret = sgx_ioc_enclave_add_pages(encl, (void __user *)arg);
+ break;
+ case SGX_IOC_ENCLAVE_INIT:
+ ret = sgx_ioc_enclave_init(encl, (void __user *)arg);
+ break;
+ case SGX_IOC_ENCLAVE_PROVISION:
+ ret = sgx_ioc_enclave_provision(encl, (void __user *)arg);
+ break;
+ case SGX_IOC_ENCLAVE_RESTRICT_PERMISSIONS:
+ ret = sgx_ioc_enclave_restrict_permissions(encl,
+ (void __user *)arg);
+ break;
+ case SGX_IOC_ENCLAVE_MODIFY_TYPES:
+ ret = sgx_ioc_enclave_modify_types(encl, (void __user *)arg);
+ break;
+ case SGX_IOC_ENCLAVE_REMOVE_PAGES:
+ ret = sgx_ioc_enclave_remove_pages(encl, (void __user *)arg);
+ break;
+ default:
+ ret = -ENOIOCTLCMD;
+ break;
+ }
+
+ clear_bit(SGX_ENCL_IOCTL, &encl->flags);
+ return ret;
+}
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
new file mode 100644
index 000000000000..dc73194416ac
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -0,0 +1,1072 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2016-20 Intel Corporation. */
+
+#include <linux/file.h>
+#include <linux/freezer.h>
+#include <linux/highmem.h>
+#include <linux/kthread.h>
+#include <linux/kvm_types.h>
+#include <linux/miscdevice.h>
+#include <linux/node.h>
+#include <linux/pagemap.h>
+#include <linux/ratelimit.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/slab.h>
+#include <linux/sysfs.h>
+#include <linux/vmalloc.h>
+#include <asm/msr.h>
+#include <asm/sgx.h>
+#include <asm/archrandom.h>
+#include "driver.h"
+#include "encl.h"
+#include "encls.h"
+
+struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
+static int sgx_nr_epc_sections;
+static struct task_struct *ksgxd_tsk;
+static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq);
+static DEFINE_XARRAY(sgx_epc_address_space);
+
+/*
+ * These variables are part of the state of the reclaimer, and must be accessed
+ * with sgx_reclaimer_lock acquired.
+ */
+static LIST_HEAD(sgx_active_page_list);
+static DEFINE_SPINLOCK(sgx_reclaimer_lock);
+
+static atomic_long_t sgx_nr_free_pages = ATOMIC_LONG_INIT(0);
+
+/* Nodes with one or more EPC sections. */
+static nodemask_t sgx_numa_mask;
+
+/*
+ * Array with one list_head for each possible NUMA node. Each
+ * list contains all the sgx_epc_section's which are on that
+ * node.
+ */
+static struct sgx_numa_node *sgx_numa_nodes;
+
+static LIST_HEAD(sgx_dirty_page_list);
+
+/*
+ * Reset post-kexec EPC pages to the uninitialized state. The pages are removed
+ * from the input list, and made available for the page allocator. SECS pages
+ * prepending their children in the input list are left intact.
+ *
+ * Return 0 when sanitization was successful or kthread was stopped, and the
+ * number of unsanitized pages otherwise.
+ */
+static unsigned long __sgx_sanitize_pages(struct list_head *dirty_page_list)
+{
+ unsigned long left_dirty = 0;
+ struct sgx_epc_page *page;
+ LIST_HEAD(dirty);
+ int ret;
+
+ /* dirty_page_list is thread-local, no need for a lock: */
+ while (!list_empty(dirty_page_list)) {
+ if (kthread_should_stop())
+ return 0;
+
+ page = list_first_entry(dirty_page_list, struct sgx_epc_page, list);
+
+ /*
+ * Checking page->poison without holding the node->lock
+ * is racy, but losing the race (i.e. poison is set just
+ * after the check) just means __eremove() will be uselessly
+ * called for a page that sgx_free_epc_page() will put onto
+ * the node->sgx_poison_page_list later.
+ */
+ if (page->poison) {
+ struct sgx_epc_section *section = &sgx_epc_sections[page->section];
+ struct sgx_numa_node *node = section->node;
+
+ spin_lock(&node->lock);
+ list_move(&page->list, &node->sgx_poison_page_list);
+ spin_unlock(&node->lock);
+
+ continue;
+ }
+
+ ret = __eremove(sgx_get_epc_virt_addr(page));
+ if (!ret) {
+ /*
+ * page is now sanitized. Make it available via the SGX
+ * page allocator:
+ */
+ list_del(&page->list);
+ sgx_free_epc_page(page);
+ } else {
+ /* The page is not yet clean - move to the dirty list. */
+ list_move_tail(&page->list, &dirty);
+ left_dirty++;
+ }
+
+ cond_resched();
+ }
+
+ list_splice(&dirty, dirty_page_list);
+ return left_dirty;
+}
+
+static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page)
+{
+ struct sgx_encl_page *page = epc_page->owner;
+ struct sgx_encl *encl = page->encl;
+ struct sgx_encl_mm *encl_mm;
+ bool ret = true;
+ int idx;
+
+ idx = srcu_read_lock(&encl->srcu);
+
+ list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
+ if (!mmget_not_zero(encl_mm->mm))
+ continue;
+
+ mmap_read_lock(encl_mm->mm);
+ ret = !sgx_encl_test_and_clear_young(encl_mm->mm, page);
+ mmap_read_unlock(encl_mm->mm);
+
+ mmput_async(encl_mm->mm);
+
+ if (!ret)
+ break;
+ }
+
+ srcu_read_unlock(&encl->srcu, idx);
+
+ if (!ret)
+ return false;
+
+ return true;
+}
+
+static void sgx_reclaimer_block(struct sgx_epc_page *epc_page)
+{
+ struct sgx_encl_page *page = epc_page->owner;
+ unsigned long addr = page->desc & PAGE_MASK;
+ struct sgx_encl *encl = page->encl;
+ int ret;
+
+ sgx_zap_enclave_ptes(encl, addr);
+
+ mutex_lock(&encl->lock);
+
+ ret = __eblock(sgx_get_epc_virt_addr(epc_page));
+ if (encls_failed(ret))
+ ENCLS_WARN(ret, "EBLOCK");
+
+ mutex_unlock(&encl->lock);
+}
+
+static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot,
+ struct sgx_backing *backing)
+{
+ struct sgx_pageinfo pginfo;
+ int ret;
+
+ pginfo.addr = 0;
+ pginfo.secs = 0;
+
+ pginfo.contents = (unsigned long)kmap_local_page(backing->contents);
+ pginfo.metadata = (unsigned long)kmap_local_page(backing->pcmd) +
+ backing->pcmd_offset;
+
+ ret = __ewb(&pginfo, sgx_get_epc_virt_addr(epc_page), va_slot);
+ set_page_dirty(backing->pcmd);
+ set_page_dirty(backing->contents);
+
+ kunmap_local((void *)(unsigned long)(pginfo.metadata -
+ backing->pcmd_offset));
+ kunmap_local((void *)(unsigned long)pginfo.contents);
+
+ return ret;
+}
+
+void sgx_ipi_cb(void *info)
+{
+}
+
+/*
+ * Swap page to the regular memory transformed to the blocked state by using
+ * EBLOCK, which means that it can no longer be referenced (no new TLB entries).
+ *
+ * The first trial just tries to write the page assuming that some other thread
+ * has reset the count for threads inside the enclave by using ETRACK, and
+ * previous thread count has been zeroed out. The second trial calls ETRACK
+ * before EWB. If that fails we kick all the HW threads out, and then do EWB,
+ * which should be guaranteed the succeed.
+ */
+static void sgx_encl_ewb(struct sgx_epc_page *epc_page,
+ struct sgx_backing *backing)
+{
+ struct sgx_encl_page *encl_page = epc_page->owner;
+ struct sgx_encl *encl = encl_page->encl;
+ struct sgx_va_page *va_page;
+ unsigned int va_offset;
+ void *va_slot;
+ int ret;
+
+ encl_page->desc &= ~SGX_ENCL_PAGE_BEING_RECLAIMED;
+
+ va_page = list_first_entry(&encl->va_pages, struct sgx_va_page,
+ list);
+ va_offset = sgx_alloc_va_slot(va_page);
+ va_slot = sgx_get_epc_virt_addr(va_page->epc_page) + va_offset;
+ if (sgx_va_page_full(va_page))
+ list_move_tail(&va_page->list, &encl->va_pages);
+
+ ret = __sgx_encl_ewb(epc_page, va_slot, backing);
+ if (ret == SGX_NOT_TRACKED) {
+ ret = __etrack(sgx_get_epc_virt_addr(encl->secs.epc_page));
+ if (ret) {
+ if (encls_failed(ret))
+ ENCLS_WARN(ret, "ETRACK");
+ }
+
+ ret = __sgx_encl_ewb(epc_page, va_slot, backing);
+ if (ret == SGX_NOT_TRACKED) {
+ /*
+ * Slow path, send IPIs to kick cpus out of the
+ * enclave. Note, it's imperative that the cpu
+ * mask is generated *after* ETRACK, else we'll
+ * miss cpus that entered the enclave between
+ * generating the mask and incrementing epoch.
+ */
+ on_each_cpu_mask(sgx_encl_cpumask(encl),
+ sgx_ipi_cb, NULL, 1);
+ ret = __sgx_encl_ewb(epc_page, va_slot, backing);
+ }
+ }
+
+ if (ret) {
+ if (encls_failed(ret))
+ ENCLS_WARN(ret, "EWB");
+
+ sgx_free_va_slot(va_page, va_offset);
+ } else {
+ encl_page->desc |= va_offset;
+ encl_page->va_page = va_page;
+ }
+}
+
+static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
+ struct sgx_backing *backing)
+{
+ struct sgx_encl_page *encl_page = epc_page->owner;
+ struct sgx_encl *encl = encl_page->encl;
+ struct sgx_backing secs_backing;
+ int ret;
+
+ mutex_lock(&encl->lock);
+
+ sgx_encl_ewb(epc_page, backing);
+ encl_page->epc_page = NULL;
+ encl->secs_child_cnt--;
+ sgx_encl_put_backing(backing);
+
+ if (!encl->secs_child_cnt && test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) {
+ ret = sgx_encl_alloc_backing(encl, PFN_DOWN(encl->size),
+ &secs_backing);
+ if (ret)
+ goto out;
+
+ sgx_encl_ewb(encl->secs.epc_page, &secs_backing);
+
+ sgx_encl_free_epc_page(encl->secs.epc_page);
+ encl->secs.epc_page = NULL;
+
+ sgx_encl_put_backing(&secs_backing);
+ }
+
+out:
+ mutex_unlock(&encl->lock);
+}
+
+/*
+ * Take a fixed number of pages from the head of the active page pool and
+ * reclaim them to the enclave's private shmem files. Skip the pages, which have
+ * been accessed since the last scan. Move those pages to the tail of active
+ * page pool so that the pages get scanned in LRU like fashion.
+ *
+ * Batch process a chunk of pages (at the moment 16) in order to degrade amount
+ * of IPI's and ETRACK's potentially required. sgx_encl_ewb() does degrade a bit
+ * among the HW threads with three stage EWB pipeline (EWB, ETRACK + EWB and IPI
+ * + EWB) but not sufficiently. Reclaiming one page at a time would also be
+ * problematic as it would increase the lock contention too much, which would
+ * halt forward progress.
+ */
+static void sgx_reclaim_pages(void)
+{
+ struct sgx_epc_page *chunk[SGX_NR_TO_SCAN];
+ struct sgx_backing backing[SGX_NR_TO_SCAN];
+ struct sgx_encl_page *encl_page;
+ struct sgx_epc_page *epc_page;
+ pgoff_t page_index;
+ int cnt = 0;
+ int ret;
+ int i;
+
+ spin_lock(&sgx_reclaimer_lock);
+ for (i = 0; i < SGX_NR_TO_SCAN; i++) {
+ if (list_empty(&sgx_active_page_list))
+ break;
+
+ epc_page = list_first_entry(&sgx_active_page_list,
+ struct sgx_epc_page, list);
+ list_del_init(&epc_page->list);
+ encl_page = epc_page->owner;
+
+ if (kref_get_unless_zero(&encl_page->encl->refcount) != 0)
+ chunk[cnt++] = epc_page;
+ else
+ /* The owner is freeing the page. No need to add the
+ * page back to the list of reclaimable pages.
+ */
+ epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
+ }
+ spin_unlock(&sgx_reclaimer_lock);
+
+ for (i = 0; i < cnt; i++) {
+ epc_page = chunk[i];
+ encl_page = epc_page->owner;
+
+ if (!sgx_reclaimer_age(epc_page))
+ goto skip;
+
+ page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base);
+
+ mutex_lock(&encl_page->encl->lock);
+ ret = sgx_encl_alloc_backing(encl_page->encl, page_index, &backing[i]);
+ if (ret) {
+ mutex_unlock(&encl_page->encl->lock);
+ goto skip;
+ }
+
+ encl_page->desc |= SGX_ENCL_PAGE_BEING_RECLAIMED;
+ mutex_unlock(&encl_page->encl->lock);
+ continue;
+
+skip:
+ spin_lock(&sgx_reclaimer_lock);
+ list_add_tail(&epc_page->list, &sgx_active_page_list);
+ spin_unlock(&sgx_reclaimer_lock);
+
+ kref_put(&encl_page->encl->refcount, sgx_encl_release);
+
+ chunk[i] = NULL;
+ }
+
+ for (i = 0; i < cnt; i++) {
+ epc_page = chunk[i];
+ if (epc_page)
+ sgx_reclaimer_block(epc_page);
+ }
+
+ for (i = 0; i < cnt; i++) {
+ epc_page = chunk[i];
+ if (!epc_page)
+ continue;
+
+ encl_page = epc_page->owner;
+ sgx_reclaimer_write(epc_page, &backing[i]);
+
+ kref_put(&encl_page->encl->refcount, sgx_encl_release);
+ epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
+
+ sgx_free_epc_page(epc_page);
+ }
+}
+
+static bool sgx_should_reclaim(unsigned long watermark)
+{
+ return atomic_long_read(&sgx_nr_free_pages) < watermark &&
+ !list_empty(&sgx_active_page_list);
+}
+
+/*
+ * sgx_reclaim_direct() should be called (without enclave's mutex held)
+ * in locations where SGX memory resources might be low and might be
+ * needed in order to make forward progress.
+ */
+void sgx_reclaim_direct(void)
+{
+ if (sgx_should_reclaim(SGX_NR_LOW_PAGES))
+ sgx_reclaim_pages();
+}
+
+static int ksgxd(void *p)
+{
+ set_freezable();
+
+ /*
+ * Sanitize pages in order to recover from kexec(). The 2nd pass is
+ * required for SECS pages, whose child pages blocked EREMOVE.
+ */
+ __sgx_sanitize_pages(&sgx_dirty_page_list);
+ WARN_ON(__sgx_sanitize_pages(&sgx_dirty_page_list));
+
+ while (!kthread_should_stop()) {
+ if (try_to_freeze())
+ continue;
+
+ wait_event_freezable(ksgxd_waitq,
+ kthread_should_stop() ||
+ sgx_should_reclaim(SGX_NR_HIGH_PAGES));
+
+ if (sgx_should_reclaim(SGX_NR_HIGH_PAGES))
+ sgx_reclaim_pages();
+
+ cond_resched();
+ }
+
+ return 0;
+}
+
+static bool __init sgx_page_reclaimer_init(void)
+{
+ struct task_struct *tsk;
+
+ tsk = kthread_run(ksgxd, NULL, "ksgxd");
+ if (IS_ERR(tsk))
+ return false;
+
+ ksgxd_tsk = tsk;
+
+ return true;
+}
+
+bool current_is_ksgxd(void)
+{
+ return current == ksgxd_tsk;
+}
+
+static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid)
+{
+ struct sgx_numa_node *node = &sgx_numa_nodes[nid];
+ struct sgx_epc_page *page = NULL;
+
+ spin_lock(&node->lock);
+
+ if (list_empty(&node->free_page_list)) {
+ spin_unlock(&node->lock);
+ return NULL;
+ }
+
+ page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list);
+ list_del_init(&page->list);
+ page->flags = 0;
+
+ spin_unlock(&node->lock);
+ atomic_long_dec(&sgx_nr_free_pages);
+
+ return page;
+}
+
+/**
+ * __sgx_alloc_epc_page() - Allocate an EPC page
+ *
+ * Iterate through NUMA nodes and reserve ia free EPC page to the caller. Start
+ * from the NUMA node, where the caller is executing.
+ *
+ * Return:
+ * - an EPC page: A borrowed EPC pages were available.
+ * - NULL: Out of EPC pages.
+ */
+struct sgx_epc_page *__sgx_alloc_epc_page(void)
+{
+ struct sgx_epc_page *page;
+ int nid_of_current = numa_node_id();
+ int nid_start, nid;
+
+ /*
+ * Try local node first. If it doesn't have an EPC section,
+ * fall back to the non-local NUMA nodes.
+ */
+ if (node_isset(nid_of_current, sgx_numa_mask))
+ nid_start = nid_of_current;
+ else
+ nid_start = next_node_in(nid_of_current, sgx_numa_mask);
+
+ nid = nid_start;
+ do {
+ page = __sgx_alloc_epc_page_from_node(nid);
+ if (page)
+ return page;
+
+ nid = next_node_in(nid, sgx_numa_mask);
+ } while (nid != nid_start);
+
+ return ERR_PTR(-ENOMEM);
+}
+
+/**
+ * sgx_mark_page_reclaimable() - Mark a page as reclaimable
+ * @page: EPC page
+ *
+ * Mark a page as reclaimable and add it to the active page list. Pages
+ * are automatically removed from the active list when freed.
+ */
+void sgx_mark_page_reclaimable(struct sgx_epc_page *page)
+{
+ spin_lock(&sgx_reclaimer_lock);
+ page->flags |= SGX_EPC_PAGE_RECLAIMER_TRACKED;
+ list_add_tail(&page->list, &sgx_active_page_list);
+ spin_unlock(&sgx_reclaimer_lock);
+}
+
+/**
+ * sgx_unmark_page_reclaimable() - Remove a page from the reclaim list
+ * @page: EPC page
+ *
+ * Clear the reclaimable flag and remove the page from the active page list.
+ *
+ * Return:
+ * 0 on success,
+ * -EBUSY if the page is in the process of being reclaimed
+ */
+int sgx_unmark_page_reclaimable(struct sgx_epc_page *page)
+{
+ spin_lock(&sgx_reclaimer_lock);
+ if (page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED) {
+ /* The page is being reclaimed. */
+ if (list_empty(&page->list)) {
+ spin_unlock(&sgx_reclaimer_lock);
+ return -EBUSY;
+ }
+
+ list_del(&page->list);
+ page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
+ }
+ spin_unlock(&sgx_reclaimer_lock);
+
+ return 0;
+}
+
+/**
+ * sgx_alloc_epc_page() - Allocate an EPC page
+ * @owner: the owner of the EPC page
+ * @reclaim: reclaim pages if necessary
+ *
+ * Iterate through EPC sections and borrow a free EPC page to the caller. When a
+ * page is no longer needed it must be released with sgx_free_epc_page(). If
+ * @reclaim is set to true, directly reclaim pages when we are out of pages. No
+ * mm's can be locked when @reclaim is set to true.
+ *
+ * Finally, wake up ksgxd when the number of pages goes below the watermark
+ * before returning back to the caller.
+ *
+ * Return:
+ * an EPC page,
+ * -errno on error
+ */
+struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim)
+{
+ struct sgx_epc_page *page;
+
+ for ( ; ; ) {
+ page = __sgx_alloc_epc_page();
+ if (!IS_ERR(page)) {
+ page->owner = owner;
+ break;
+ }
+
+ if (list_empty(&sgx_active_page_list))
+ return ERR_PTR(-ENOMEM);
+
+ if (!reclaim) {
+ page = ERR_PTR(-EBUSY);
+ break;
+ }
+
+ if (signal_pending(current)) {
+ page = ERR_PTR(-ERESTARTSYS);
+ break;
+ }
+
+ sgx_reclaim_pages();
+ cond_resched();
+ }
+
+ if (sgx_should_reclaim(SGX_NR_LOW_PAGES))
+ wake_up(&ksgxd_waitq);
+
+ return page;
+}
+
+/**
+ * sgx_free_epc_page() - Free an EPC page
+ * @page: an EPC page
+ *
+ * Put the EPC page back to the list of free pages. It's the caller's
+ * responsibility to make sure that the page is in uninitialized state. In other
+ * words, do EREMOVE, EWB or whatever operation is necessary before calling
+ * this function.
+ */
+void sgx_free_epc_page(struct sgx_epc_page *page)
+{
+ struct sgx_epc_section *section = &sgx_epc_sections[page->section];
+ struct sgx_numa_node *node = section->node;
+
+ spin_lock(&node->lock);
+
+ page->owner = NULL;
+ if (page->poison)
+ list_add(&page->list, &node->sgx_poison_page_list);
+ else
+ list_add_tail(&page->list, &node->free_page_list);
+ page->flags = SGX_EPC_PAGE_IS_FREE;
+
+ spin_unlock(&node->lock);
+ atomic_long_inc(&sgx_nr_free_pages);
+}
+
+static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
+ unsigned long index,
+ struct sgx_epc_section *section)
+{
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+ unsigned long i;
+
+ section->virt_addr = memremap(phys_addr, size, MEMREMAP_WB);
+ if (!section->virt_addr)
+ return false;
+
+ section->pages = vmalloc_array(nr_pages, sizeof(struct sgx_epc_page));
+ if (!section->pages) {
+ memunmap(section->virt_addr);
+ return false;
+ }
+
+ section->phys_addr = phys_addr;
+ xa_store_range(&sgx_epc_address_space, section->phys_addr,
+ phys_addr + size - 1, section, GFP_KERNEL);
+
+ for (i = 0; i < nr_pages; i++) {
+ section->pages[i].section = index;
+ section->pages[i].flags = 0;
+ section->pages[i].owner = NULL;
+ section->pages[i].poison = 0;
+ list_add_tail(&section->pages[i].list, &sgx_dirty_page_list);
+ }
+
+ return true;
+}
+
+bool arch_is_platform_page(u64 paddr)
+{
+ return !!xa_load(&sgx_epc_address_space, paddr);
+}
+EXPORT_SYMBOL_GPL(arch_is_platform_page);
+
+static struct sgx_epc_page *sgx_paddr_to_page(u64 paddr)
+{
+ struct sgx_epc_section *section;
+
+ section = xa_load(&sgx_epc_address_space, paddr);
+ if (!section)
+ return NULL;
+
+ return &section->pages[PFN_DOWN(paddr - section->phys_addr)];
+}
+
+/*
+ * Called in process context to handle a hardware reported
+ * error in an SGX EPC page.
+ * If the MF_ACTION_REQUIRED bit is set in flags, then the
+ * context is the task that consumed the poison data. Otherwise
+ * this is called from a kernel thread unrelated to the page.
+ */
+int arch_memory_failure(unsigned long pfn, int flags)
+{
+ struct sgx_epc_page *page = sgx_paddr_to_page(pfn << PAGE_SHIFT);
+ struct sgx_epc_section *section;
+ struct sgx_numa_node *node;
+
+ /*
+ * mm/memory-failure.c calls this routine for all errors
+ * where there isn't a "struct page" for the address. But that
+ * includes other address ranges besides SGX.
+ */
+ if (!page)
+ return -ENXIO;
+
+ /*
+ * If poison was consumed synchronously. Send a SIGBUS to
+ * the task. Hardware has already exited the SGX enclave and
+ * will not allow re-entry to an enclave that has a memory
+ * error. The signal may help the task understand why the
+ * enclave is broken.
+ */
+ if (flags & MF_ACTION_REQUIRED)
+ force_sig(SIGBUS);
+
+ section = &sgx_epc_sections[page->section];
+ node = section->node;
+
+ spin_lock(&node->lock);
+
+ /* Already poisoned? Nothing more to do */
+ if (page->poison)
+ goto out;
+
+ page->poison = 1;
+
+ /*
+ * If the page is on a free list, move it to the per-node
+ * poison page list.
+ */
+ if (page->flags & SGX_EPC_PAGE_IS_FREE) {
+ list_move(&page->list, &node->sgx_poison_page_list);
+ goto out;
+ }
+
+ sgx_unmark_page_reclaimable(page);
+
+ /*
+ * TBD: Add additional plumbing to enable pre-emptive
+ * action for asynchronous poison notification. Until
+ * then just hope that the poison:
+ * a) is not accessed - sgx_free_epc_page() will deal with it
+ * when the user gives it back
+ * b) results in a recoverable machine check rather than
+ * a fatal one
+ */
+out:
+ spin_unlock(&node->lock);
+ return 0;
+}
+
+/*
+ * A section metric is concatenated in a way that @low bits 12-31 define the
+ * bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the
+ * metric.
+ */
+static inline u64 __init sgx_calc_section_metric(u64 low, u64 high)
+{
+ return (low & GENMASK_ULL(31, 12)) +
+ ((high & GENMASK_ULL(19, 0)) << 32);
+}
+
+#ifdef CONFIG_NUMA
+static ssize_t sgx_total_bytes_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%lu\n", sgx_numa_nodes[dev->id].size);
+}
+static DEVICE_ATTR_RO(sgx_total_bytes);
+
+static umode_t arch_node_attr_is_visible(struct kobject *kobj,
+ struct attribute *attr, int idx)
+{
+ /* Make all x86/ attributes invisible when SGX is not initialized: */
+ if (nodes_empty(sgx_numa_mask))
+ return 0;
+
+ return attr->mode;
+}
+
+static struct attribute *arch_node_dev_attrs[] = {
+ &dev_attr_sgx_total_bytes.attr,
+ NULL,
+};
+
+const struct attribute_group arch_node_dev_group = {
+ .name = "x86",
+ .attrs = arch_node_dev_attrs,
+ .is_visible = arch_node_attr_is_visible,
+};
+
+static void __init arch_update_sysfs_visibility(int nid)
+{
+ struct node *node = node_devices[nid];
+ int ret;
+
+ ret = sysfs_update_group(&node->dev.kobj, &arch_node_dev_group);
+
+ if (ret)
+ pr_err("sysfs update failed (%d), files may be invisible", ret);
+}
+#else /* !CONFIG_NUMA */
+static void __init arch_update_sysfs_visibility(int nid) {}
+#endif
+
+static bool __init sgx_page_cache_init(void)
+{
+ u32 eax, ebx, ecx, edx, type;
+ u64 pa, size;
+ int nid;
+ int i;
+
+ sgx_numa_nodes = kmalloc_array(num_possible_nodes(), sizeof(*sgx_numa_nodes), GFP_KERNEL);
+ if (!sgx_numa_nodes)
+ return false;
+
+ for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) {
+ cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC, &eax, &ebx, &ecx, &edx);
+
+ type = eax & SGX_CPUID_EPC_MASK;
+ if (type == SGX_CPUID_EPC_INVALID)
+ break;
+
+ if (type != SGX_CPUID_EPC_SECTION) {
+ pr_err_once("Unknown EPC section type: %u\n", type);
+ break;
+ }
+
+ pa = sgx_calc_section_metric(eax, ebx);
+ size = sgx_calc_section_metric(ecx, edx);
+
+ pr_info("EPC section 0x%llx-0x%llx\n", pa, pa + size - 1);
+
+ if (!sgx_setup_epc_section(pa, size, i, &sgx_epc_sections[i])) {
+ pr_err("No free memory for an EPC section\n");
+ break;
+ }
+
+ nid = numa_map_to_online_node(phys_to_target_node(pa));
+ if (nid == NUMA_NO_NODE) {
+ /* The physical address is already printed above. */
+ pr_warn(FW_BUG "Unable to map EPC section to online node. Fallback to the NUMA node 0.\n");
+ nid = 0;
+ }
+
+ if (!node_isset(nid, sgx_numa_mask)) {
+ spin_lock_init(&sgx_numa_nodes[nid].lock);
+ INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list);
+ INIT_LIST_HEAD(&sgx_numa_nodes[nid].sgx_poison_page_list);
+ node_set(nid, sgx_numa_mask);
+ sgx_numa_nodes[nid].size = 0;
+
+ /* Make SGX-specific node sysfs files visible: */
+ arch_update_sysfs_visibility(nid);
+ }
+
+ sgx_epc_sections[i].node = &sgx_numa_nodes[nid];
+ sgx_numa_nodes[nid].size += size;
+
+ sgx_nr_epc_sections++;
+ }
+
+ if (!sgx_nr_epc_sections) {
+ pr_err("There are zero EPC sections.\n");
+ return false;
+ }
+
+ for_each_online_node(nid) {
+ if (!node_isset(nid, sgx_numa_mask) &&
+ node_state(nid, N_MEMORY) && node_state(nid, N_CPU))
+ pr_info("node%d has both CPUs and memory but doesn't have an EPC section\n",
+ nid);
+ }
+
+ return true;
+}
+
+/*
+ * Update the SGX_LEPUBKEYHASH MSRs to the values specified by caller.
+ * Bare-metal driver requires to update them to hash of enclave's signer
+ * before EINIT. KVM needs to update them to guest's virtual MSR values
+ * before doing EINIT from guest.
+ */
+void sgx_update_lepubkeyhash(u64 *lepubkeyhash)
+{
+ int i;
+
+ WARN_ON_ONCE(preemptible());
+
+ for (i = 0; i < 4; i++)
+ wrmsrq(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]);
+}
+
+const struct file_operations sgx_provision_fops = {
+ .owner = THIS_MODULE,
+};
+
+static struct miscdevice sgx_dev_provision = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "sgx_provision",
+ .nodename = "sgx_provision",
+ .fops = &sgx_provision_fops,
+};
+
+/**
+ * sgx_set_attribute() - Update allowed attributes given file descriptor
+ * @allowed_attributes: Pointer to allowed enclave attributes
+ * @attribute_fd: File descriptor for specific attribute
+ *
+ * Append enclave attribute indicated by file descriptor to allowed
+ * attributes. Currently only SGX_ATTR_PROVISIONKEY indicated by
+ * /dev/sgx_provision is supported.
+ *
+ * Return:
+ * -0: SGX_ATTR_PROVISIONKEY is appended to allowed_attributes
+ * -EINVAL: Invalid, or not supported file descriptor
+ */
+int sgx_set_attribute(unsigned long *allowed_attributes,
+ unsigned int attribute_fd)
+{
+ CLASS(fd, f)(attribute_fd);
+
+ if (fd_empty(f))
+ return -EINVAL;
+
+ if (fd_file(f)->f_op != &sgx_provision_fops)
+ return -EINVAL;
+
+ *allowed_attributes |= SGX_ATTR_PROVISIONKEY;
+ return 0;
+}
+EXPORT_SYMBOL_FOR_KVM(sgx_set_attribute);
+
+/* Counter to count the active SGX users */
+static int sgx_usage_count;
+
+/**
+ * sgx_update_svn() - Attempt to call ENCLS[EUPDATESVN].
+ *
+ * This instruction attempts to update CPUSVN to the
+ * currently loaded microcode update SVN and generate new
+ * cryptographic assets.
+ *
+ * Return:
+ * * %0: - Success or not supported
+ * * %-EAGAIN: - Can be safely retried, failure is due to lack of
+ * * entropy in RNG
+ * * %-EIO: - Unexpected error, retries are not advisable
+ */
+static int sgx_update_svn(void)
+{
+ int ret;
+
+ /*
+ * If EUPDATESVN is not available, it is ok to
+ * silently skip it to comply with legacy behavior.
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_SGX_EUPDATESVN))
+ return 0;
+
+ /*
+ * EPC is guaranteed to be empty when there are no users.
+ * Ensure we are on our first user before proceeding further.
+ */
+ WARN(sgx_usage_count, "Elevated usage count when calling EUPDATESVN\n");
+
+ for (int i = 0; i < RDRAND_RETRY_LOOPS; i++) {
+ ret = __eupdatesvn();
+
+ /* Stop on success or unexpected errors: */
+ if (ret != SGX_INSUFFICIENT_ENTROPY)
+ break;
+ }
+
+ switch (ret) {
+ case 0:
+ /*
+ * SVN successfully updated.
+ * Let users know when the update was successful.
+ */
+ pr_info("SVN updated successfully\n");
+ return 0;
+ case SGX_NO_UPDATE:
+ /*
+ * SVN update failed since the current SVN is
+ * not newer than CPUSVN. This is the most
+ * common case and indicates no harm.
+ */
+ return 0;
+ case SGX_INSUFFICIENT_ENTROPY:
+ /*
+ * SVN update failed due to lack of entropy in DRNG.
+ * Indicate to userspace that it should retry.
+ */
+ return -EAGAIN;
+ default:
+ break;
+ }
+
+ /*
+ * EUPDATESVN was called when EPC is empty, all other error
+ * codes are unexpected.
+ */
+ ENCLS_WARN(ret, "EUPDATESVN");
+ return -EIO;
+}
+
+/* Mutex to ensure no concurrent EPC accesses during EUPDATESVN */
+static DEFINE_MUTEX(sgx_svn_lock);
+
+int sgx_inc_usage_count(void)
+{
+ int ret;
+
+ guard(mutex)(&sgx_svn_lock);
+
+ if (!sgx_usage_count) {
+ ret = sgx_update_svn();
+ if (ret)
+ return ret;
+ }
+
+ sgx_usage_count++;
+
+ return 0;
+}
+
+void sgx_dec_usage_count(void)
+{
+ guard(mutex)(&sgx_svn_lock);
+ sgx_usage_count--;
+}
+
+static int __init sgx_init(void)
+{
+ int ret;
+ int i;
+
+ if (!cpu_feature_enabled(X86_FEATURE_SGX))
+ return -ENODEV;
+
+ if (!sgx_page_cache_init())
+ return -ENOMEM;
+
+ if (!sgx_page_reclaimer_init()) {
+ ret = -ENOMEM;
+ goto err_page_cache;
+ }
+
+ ret = misc_register(&sgx_dev_provision);
+ if (ret)
+ goto err_kthread;
+
+ /*
+ * Always try to initialize the native *and* KVM drivers.
+ * The KVM driver is less picky than the native one and
+ * can function if the native one is not supported on the
+ * current system or fails to initialize.
+ *
+ * Error out only if both fail to initialize.
+ */
+ ret = sgx_drv_init();
+
+ if (sgx_vepc_init() && ret)
+ goto err_provision;
+
+ return 0;
+
+err_provision:
+ misc_deregister(&sgx_dev_provision);
+
+err_kthread:
+ kthread_stop(ksgxd_tsk);
+
+err_page_cache:
+ for (i = 0; i < sgx_nr_epc_sections; i++) {
+ vfree(sgx_epc_sections[i].pages);
+ memunmap(sgx_epc_sections[i].virt_addr);
+ }
+
+ return ret;
+}
+
+device_initcall(sgx_init);
diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
new file mode 100644
index 000000000000..f5940393d9bd
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/sgx.h
@@ -0,0 +1,110 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _X86_SGX_H
+#define _X86_SGX_H
+
+#include <linux/bitops.h>
+#include <linux/err.h>
+#include <linux/io.h>
+#include <linux/rwsem.h>
+#include <linux/types.h>
+#include <asm/asm.h>
+#include <asm/sgx.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt) "sgx: " fmt
+
+#define EREMOVE_ERROR_MESSAGE \
+ "EREMOVE returned %d (0x%x) and an EPC page was leaked. SGX may become unusable. " \
+ "Refer to Documentation/arch/x86/sgx.rst for more information."
+
+#define SGX_MAX_EPC_SECTIONS 8
+#define SGX_EEXTEND_BLOCK_SIZE 256
+#define SGX_NR_TO_SCAN 16
+#define SGX_NR_LOW_PAGES 32
+#define SGX_NR_HIGH_PAGES 64
+
+/* Pages, which are being tracked by the page reclaimer. */
+#define SGX_EPC_PAGE_RECLAIMER_TRACKED BIT(0)
+
+/* Pages on free list */
+#define SGX_EPC_PAGE_IS_FREE BIT(1)
+
+struct sgx_epc_page {
+ unsigned int section;
+ u16 flags;
+ u16 poison;
+ struct sgx_encl_page *owner;
+ struct list_head list;
+};
+
+/*
+ * Contains the tracking data for NUMA nodes having EPC pages. Most importantly,
+ * the free page list local to the node is stored here.
+ */
+struct sgx_numa_node {
+ struct list_head free_page_list;
+ struct list_head sgx_poison_page_list;
+ unsigned long size;
+ spinlock_t lock;
+};
+
+/*
+ * The firmware can define multiple chunks of EPC to the different areas of the
+ * physical memory e.g. for memory areas of the each node. This structure is
+ * used to store EPC pages for one EPC section and virtual memory area where
+ * the pages have been mapped.
+ */
+struct sgx_epc_section {
+ unsigned long phys_addr;
+ void *virt_addr;
+ struct sgx_epc_page *pages;
+ struct sgx_numa_node *node;
+};
+
+extern struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
+
+static inline unsigned long sgx_get_epc_phys_addr(struct sgx_epc_page *page)
+{
+ struct sgx_epc_section *section = &sgx_epc_sections[page->section];
+ unsigned long index;
+
+ index = ((unsigned long)page - (unsigned long)section->pages) / sizeof(*page);
+
+ return section->phys_addr + index * PAGE_SIZE;
+}
+
+static inline void *sgx_get_epc_virt_addr(struct sgx_epc_page *page)
+{
+ struct sgx_epc_section *section = &sgx_epc_sections[page->section];
+ unsigned long index;
+
+ index = ((unsigned long)page - (unsigned long)section->pages) / sizeof(*page);
+
+ return section->virt_addr + index * PAGE_SIZE;
+}
+
+struct sgx_epc_page *__sgx_alloc_epc_page(void);
+void sgx_free_epc_page(struct sgx_epc_page *page);
+
+void sgx_reclaim_direct(void);
+void sgx_mark_page_reclaimable(struct sgx_epc_page *page);
+int sgx_unmark_page_reclaimable(struct sgx_epc_page *page);
+struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim);
+
+void sgx_ipi_cb(void *info);
+
+#ifdef CONFIG_X86_SGX_KVM
+int __init sgx_vepc_init(void);
+#else
+static inline int __init sgx_vepc_init(void)
+{
+ return -ENODEV;
+}
+#endif
+
+int sgx_inc_usage_count(void);
+void sgx_dec_usage_count(void);
+
+void sgx_update_lepubkeyhash(u64 *lepubkeyhash);
+
+#endif /* _X86_SGX_H */
diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
new file mode 100644
index 000000000000..8de1f1a755f2
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/virt.c
@@ -0,0 +1,454 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Device driver to expose SGX enclave memory to KVM guests.
+ *
+ * Copyright(c) 2021 Intel Corporation.
+ */
+
+#include <linux/kvm_types.h>
+#include <linux/miscdevice.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/slab.h>
+#include <linux/xarray.h>
+#include <asm/sgx.h>
+#include <uapi/asm/sgx.h>
+
+#include "encls.h"
+#include "sgx.h"
+
+struct sgx_vepc {
+ struct xarray page_array;
+ struct mutex lock;
+};
+
+/*
+ * Temporary SECS pages that cannot be EREMOVE'd due to having child in other
+ * virtual EPC instances, and the lock to protect it.
+ */
+static struct mutex zombie_secs_pages_lock;
+static struct list_head zombie_secs_pages;
+
+static int __sgx_vepc_fault(struct sgx_vepc *vepc,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+ struct sgx_epc_page *epc_page;
+ unsigned long index, pfn;
+ int ret;
+
+ WARN_ON(!mutex_is_locked(&vepc->lock));
+
+ /* Calculate index of EPC page in virtual EPC's page_array */
+ index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start);
+
+ epc_page = xa_load(&vepc->page_array, index);
+ if (epc_page)
+ return 0;
+
+ epc_page = sgx_alloc_epc_page(vepc, false);
+ if (IS_ERR(epc_page))
+ return PTR_ERR(epc_page);
+
+ ret = xa_err(xa_store(&vepc->page_array, index, epc_page, GFP_KERNEL));
+ if (ret)
+ goto err_free;
+
+ pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page));
+
+ ret = vmf_insert_pfn(vma, addr, pfn);
+ if (ret != VM_FAULT_NOPAGE) {
+ ret = -EFAULT;
+ goto err_delete;
+ }
+
+ return 0;
+
+err_delete:
+ xa_erase(&vepc->page_array, index);
+err_free:
+ sgx_free_epc_page(epc_page);
+ return ret;
+}
+
+static vm_fault_t sgx_vepc_fault(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct sgx_vepc *vepc = vma->vm_private_data;
+ int ret;
+
+ mutex_lock(&vepc->lock);
+ ret = __sgx_vepc_fault(vepc, vma, vmf->address);
+ mutex_unlock(&vepc->lock);
+
+ if (!ret)
+ return VM_FAULT_NOPAGE;
+
+ if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) {
+ mmap_read_unlock(vma->vm_mm);
+ return VM_FAULT_RETRY;
+ }
+
+ return VM_FAULT_SIGBUS;
+}
+
+static const struct vm_operations_struct sgx_vepc_vm_ops = {
+ .fault = sgx_vepc_fault,
+};
+
+static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct sgx_vepc *vepc = file->private_data;
+
+ if (!(vma->vm_flags & VM_SHARED))
+ return -EINVAL;
+
+ vma->vm_ops = &sgx_vepc_vm_ops;
+ /* Don't copy VMA in fork() */
+ vm_flags_set(vma, VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY);
+ vma->vm_private_data = vepc;
+
+ return 0;
+}
+
+static int sgx_vepc_remove_page(struct sgx_epc_page *epc_page)
+{
+ /*
+ * Take a previously guest-owned EPC page and return it to the
+ * general EPC page pool.
+ *
+ * Guests can not be trusted to have left this page in a good
+ * state, so run EREMOVE on the page unconditionally. In the
+ * case that a guest properly EREMOVE'd this page, a superfluous
+ * EREMOVE is harmless.
+ */
+ return __eremove(sgx_get_epc_virt_addr(epc_page));
+}
+
+static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)
+{
+ int ret = sgx_vepc_remove_page(epc_page);
+ if (ret) {
+ /*
+ * Only SGX_CHILD_PRESENT is expected, which is because of
+ * EREMOVE'ing an SECS still with child, in which case it can
+ * be handled by EREMOVE'ing the SECS again after all pages in
+ * virtual EPC have been EREMOVE'd. See comments in below in
+ * sgx_vepc_release().
+ *
+ * The user of virtual EPC (KVM) needs to guarantee there's no
+ * logical processor is still running in the enclave in guest,
+ * otherwise EREMOVE will get SGX_ENCLAVE_ACT which cannot be
+ * handled here.
+ */
+ WARN_ONCE(ret != SGX_CHILD_PRESENT, EREMOVE_ERROR_MESSAGE,
+ ret, ret);
+ return ret;
+ }
+
+ sgx_free_epc_page(epc_page);
+ return 0;
+}
+
+static long sgx_vepc_remove_all(struct sgx_vepc *vepc)
+{
+ struct sgx_epc_page *entry;
+ unsigned long index;
+ long failures = 0;
+
+ xa_for_each(&vepc->page_array, index, entry) {
+ int ret = sgx_vepc_remove_page(entry);
+ if (ret) {
+ if (ret == SGX_CHILD_PRESENT) {
+ /* The page is a SECS, userspace will retry. */
+ failures++;
+ } else {
+ /*
+ * Report errors due to #GP or SGX_ENCLAVE_ACT; do not
+ * WARN, as userspace can induce said failures by
+ * calling the ioctl concurrently on multiple vEPCs or
+ * while one or more CPUs is running the enclave. Only
+ * a #PF on EREMOVE indicates a kernel/hardware issue.
+ */
+ WARN_ON_ONCE(encls_faulted(ret) &&
+ ENCLS_TRAPNR(ret) != X86_TRAP_GP);
+ return -EBUSY;
+ }
+ }
+ cond_resched();
+ }
+
+ /*
+ * Return the number of SECS pages that failed to be removed, so
+ * userspace knows that it has to retry.
+ */
+ return failures;
+}
+
+static int sgx_vepc_release(struct inode *inode, struct file *file)
+{
+ struct sgx_vepc *vepc = file->private_data;
+ struct sgx_epc_page *epc_page, *tmp, *entry;
+ unsigned long index;
+
+ LIST_HEAD(secs_pages);
+
+ xa_for_each(&vepc->page_array, index, entry) {
+ /*
+ * Remove all normal, child pages. sgx_vepc_free_page()
+ * will fail if EREMOVE fails, but this is OK and expected on
+ * SECS pages. Those can only be EREMOVE'd *after* all their
+ * child pages. Retries below will clean them up.
+ */
+ if (sgx_vepc_free_page(entry))
+ continue;
+
+ xa_erase(&vepc->page_array, index);
+ cond_resched();
+ }
+
+ /*
+ * Retry EREMOVE'ing pages. This will clean up any SECS pages that
+ * only had children in this 'epc' area.
+ */
+ xa_for_each(&vepc->page_array, index, entry) {
+ epc_page = entry;
+ /*
+ * An EREMOVE failure here means that the SECS page still
+ * has children. But, since all children in this 'sgx_vepc'
+ * have been removed, the SECS page must have a child on
+ * another instance.
+ */
+ if (sgx_vepc_free_page(epc_page))
+ list_add_tail(&epc_page->list, &secs_pages);
+
+ xa_erase(&vepc->page_array, index);
+ cond_resched();
+ }
+
+ /*
+ * SECS pages are "pinned" by child pages, and "unpinned" once all
+ * children have been EREMOVE'd. A child page in this instance
+ * may have pinned an SECS page encountered in an earlier release(),
+ * creating a zombie. Since some children were EREMOVE'd above,
+ * try to EREMOVE all zombies in the hopes that one was unpinned.
+ */
+ mutex_lock(&zombie_secs_pages_lock);
+ list_for_each_entry_safe(epc_page, tmp, &zombie_secs_pages, list) {
+ /*
+ * Speculatively remove the page from the list of zombies,
+ * if the page is successfully EREMOVE'd it will be added to
+ * the list of free pages. If EREMOVE fails, throw the page
+ * on the local list, which will be spliced on at the end.
+ */
+ list_del(&epc_page->list);
+
+ if (sgx_vepc_free_page(epc_page))
+ list_add_tail(&epc_page->list, &secs_pages);
+ cond_resched();
+ }
+
+ if (!list_empty(&secs_pages))
+ list_splice_tail(&secs_pages, &zombie_secs_pages);
+ mutex_unlock(&zombie_secs_pages_lock);
+
+ xa_destroy(&vepc->page_array);
+ kfree(vepc);
+
+ sgx_dec_usage_count();
+ return 0;
+}
+
+static int __sgx_vepc_open(struct inode *inode, struct file *file)
+{
+ struct sgx_vepc *vepc;
+
+ vepc = kzalloc(sizeof(struct sgx_vepc), GFP_KERNEL);
+ if (!vepc)
+ return -ENOMEM;
+ mutex_init(&vepc->lock);
+ xa_init(&vepc->page_array);
+
+ file->private_data = vepc;
+
+ return 0;
+}
+
+static int sgx_vepc_open(struct inode *inode, struct file *file)
+{
+ int ret;
+
+ ret = sgx_inc_usage_count();
+ if (ret)
+ return ret;
+
+ ret = __sgx_vepc_open(inode, file);
+ if (ret) {
+ sgx_dec_usage_count();
+ return ret;
+ }
+
+ return 0;
+}
+
+static long sgx_vepc_ioctl(struct file *file,
+ unsigned int cmd, unsigned long arg)
+{
+ struct sgx_vepc *vepc = file->private_data;
+
+ switch (cmd) {
+ case SGX_IOC_VEPC_REMOVE_ALL:
+ if (arg)
+ return -EINVAL;
+ return sgx_vepc_remove_all(vepc);
+
+ default:
+ return -ENOTTY;
+ }
+}
+
+static const struct file_operations sgx_vepc_fops = {
+ .owner = THIS_MODULE,
+ .open = sgx_vepc_open,
+ .unlocked_ioctl = sgx_vepc_ioctl,
+ .compat_ioctl = sgx_vepc_ioctl,
+ .release = sgx_vepc_release,
+ .mmap = sgx_vepc_mmap,
+};
+
+static struct miscdevice sgx_vepc_dev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "sgx_vepc",
+ .nodename = "sgx_vepc",
+ .fops = &sgx_vepc_fops,
+};
+
+int __init sgx_vepc_init(void)
+{
+ /* SGX virtualization requires KVM to work */
+ if (!cpu_feature_enabled(X86_FEATURE_VMX))
+ return -ENODEV;
+
+ INIT_LIST_HEAD(&zombie_secs_pages);
+ mutex_init(&zombie_secs_pages_lock);
+
+ return misc_register(&sgx_vepc_dev);
+}
+
+/**
+ * sgx_virt_ecreate() - Run ECREATE on behalf of guest
+ * @pageinfo: Pointer to PAGEINFO structure
+ * @secs: Userspace pointer to SECS page
+ * @trapnr: trap number injected to guest in case of ECREATE error
+ *
+ * Run ECREATE on behalf of guest after KVM traps ECREATE for the purpose
+ * of enforcing policies of guest's enclaves, and return the trap number
+ * which should be injected to guest in case of any ECREATE error.
+ *
+ * Return:
+ * - 0: ECREATE was successful.
+ * - <0: on error.
+ */
+int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs,
+ int *trapnr)
+{
+ int ret;
+
+ /*
+ * @secs is an untrusted, userspace-provided address. It comes from
+ * KVM and is assumed to be a valid pointer which points somewhere in
+ * userspace. This can fault and call SGX or other fault handlers when
+ * userspace mapping @secs doesn't exist.
+ *
+ * Add a WARN() to make sure @secs is already valid userspace pointer
+ * from caller (KVM), who should already have handled invalid pointer
+ * case (for instance, made by malicious guest). All other checks,
+ * such as alignment of @secs, are deferred to ENCLS itself.
+ */
+ if (WARN_ON_ONCE(!access_ok(secs, PAGE_SIZE)))
+ return -EINVAL;
+
+ __uaccess_begin();
+ ret = __ecreate(pageinfo, (void *)secs);
+ __uaccess_end();
+
+ if (encls_faulted(ret)) {
+ *trapnr = ENCLS_TRAPNR(ret);
+ return -EFAULT;
+ }
+
+ /* ECREATE doesn't return an error code, it faults or succeeds. */
+ WARN_ON_ONCE(ret);
+ return 0;
+}
+EXPORT_SYMBOL_FOR_KVM(sgx_virt_ecreate);
+
+static int __sgx_virt_einit(void __user *sigstruct, void __user *token,
+ void __user *secs)
+{
+ int ret;
+
+ /*
+ * Make sure all userspace pointers from caller (KVM) are valid.
+ * All other checks deferred to ENCLS itself. Also see comment
+ * for @secs in sgx_virt_ecreate().
+ */
+#define SGX_EINITTOKEN_SIZE 304
+ if (WARN_ON_ONCE(!access_ok(sigstruct, sizeof(struct sgx_sigstruct)) ||
+ !access_ok(token, SGX_EINITTOKEN_SIZE) ||
+ !access_ok(secs, PAGE_SIZE)))
+ return -EINVAL;
+
+ __uaccess_begin();
+ ret = __einit((void *)sigstruct, (void *)token, (void *)secs);
+ __uaccess_end();
+
+ return ret;
+}
+
+/**
+ * sgx_virt_einit() - Run EINIT on behalf of guest
+ * @sigstruct: Userspace pointer to SIGSTRUCT structure
+ * @token: Userspace pointer to EINITTOKEN structure
+ * @secs: Userspace pointer to SECS page
+ * @lepubkeyhash: Pointer to guest's *virtual* SGX_LEPUBKEYHASH MSR values
+ * @trapnr: trap number injected to guest in case of EINIT error
+ *
+ * Run EINIT on behalf of guest after KVM traps EINIT. If SGX_LC is available
+ * in host, SGX driver may rewrite the hardware values at wish, therefore KVM
+ * needs to update hardware values to guest's virtual MSR values in order to
+ * ensure EINIT is executed with expected hardware values.
+ *
+ * Return:
+ * - 0: EINIT was successful.
+ * - <0: on error.
+ */
+int sgx_virt_einit(void __user *sigstruct, void __user *token,
+ void __user *secs, u64 *lepubkeyhash, int *trapnr)
+{
+ int ret;
+
+ if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) {
+ ret = __sgx_virt_einit(sigstruct, token, secs);
+ } else {
+ preempt_disable();
+
+ sgx_update_lepubkeyhash(lepubkeyhash);
+
+ ret = __sgx_virt_einit(sigstruct, token, secs);
+ preempt_enable();
+ }
+
+ /* Propagate up the error from the WARN_ON_ONCE in __sgx_virt_einit() */
+ if (ret == -EINVAL)
+ return ret;
+
+ if (encls_faulted(ret)) {
+ *trapnr = ENCLS_TRAPNR(ret);
+ return -EFAULT;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_FOR_KVM(sgx_virt_einit);
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index 4397e987a1cf..f55ea3cdbf88 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -1,99 +1,581 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
- * Check for extended topology enumeration cpuid leaf 0xb and if it
- * exists, use it for populating initial_apicid and cpu topology
- * detection.
+ * CPU/APIC topology
+ *
+ * The APIC IDs describe the system topology in multiple domain levels.
+ * The CPUID topology parser provides the information which part of the
+ * APIC ID is associated to the individual levels:
+ *
+ * [PACKAGE][DIEGRP][DIE][TILE][MODULE][CORE][THREAD]
+ *
+ * The root space contains the package (socket) IDs.
+ *
+ * Not enumerated levels consume 0 bits space, but conceptually they are
+ * always represented. If e.g. only CORE and THREAD levels are enumerated
+ * then the DIE, MODULE and TILE have the same physical ID as the PACKAGE.
+ *
+ * If SMT is not supported, then the THREAD domain is still used. It then
+ * has the same physical ID as the CORE domain and is the only child of
+ * the core domain.
+ *
+ * This allows a unified view on the system independent of the enumerated
+ * domain levels without requiring any conditionals in the code.
*/
-
+#define pr_fmt(fmt) "CPU topo: " fmt
#include <linux/cpu.h>
+
+#include <xen/xen.h>
+
#include <asm/apic.h>
-#include <asm/pat.h>
-#include <asm/processor.h>
+#include <asm/hypervisor.h>
+#include <asm/io_apic.h>
+#include <asm/mpspec.h>
+#include <asm/msr.h>
+#include <asm/smp.h>
-/* leaf 0xb SMT level */
-#define SMT_LEVEL 0
+#include "cpu.h"
+
+/*
+ * Map cpu index to physical APIC ID
+ */
+DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_apicid, BAD_APICID);
+DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid, CPU_ACPIID_INVALID);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_acpiid);
-/* leaf 0xb sub-leaf types */
-#define INVALID_TYPE 0
-#define SMT_TYPE 1
-#define CORE_TYPE 2
+/* Bitmap of physically present CPUs. */
+DECLARE_BITMAP(phys_cpu_present_map, MAX_LOCAL_APIC) __read_mostly;
-#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff)
-#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f)
-#define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff)
+/* Used for CPU number allocation and parallel CPU bringup */
+u32 cpuid_to_apicid[] __ro_after_init = { [0 ... NR_CPUS - 1] = BAD_APICID, };
+
+/* Bitmaps to mark registered APICs at each topology domain */
+static struct { DECLARE_BITMAP(map, MAX_LOCAL_APIC); } apic_maps[TOPO_MAX_DOMAIN] __ro_after_init;
/*
- * Check for extended topology enumeration cpuid leaf 0xb and if it
- * exists, use it for populating initial_apicid and cpu topology
- * detection.
+ * Keep track of assigned, disabled and rejected CPUs. Present assigned
+ * with 1 as CPU #0 is reserved for the boot CPU.
*/
-void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
+static struct {
+ unsigned int nr_assigned_cpus;
+ unsigned int nr_disabled_cpus;
+ unsigned int nr_rejected_cpus;
+ u32 boot_cpu_apic_id;
+ u32 real_bsp_apic_id;
+} topo_info __ro_after_init = {
+ .nr_assigned_cpus = 1,
+ .boot_cpu_apic_id = BAD_APICID,
+ .real_bsp_apic_id = BAD_APICID,
+};
+
+#define domain_weight(_dom) bitmap_weight(apic_maps[_dom].map, MAX_LOCAL_APIC)
+
+bool arch_match_cpu_phys_id(int cpu, u64 phys_id)
{
-#ifdef CONFIG_SMP
- unsigned int eax, ebx, ecx, edx, sub_index;
- unsigned int ht_mask_width, core_plus_mask_width;
- unsigned int core_select_mask, core_level_siblings;
- static bool printed;
+ return phys_id == (u64)cpuid_to_apicid[cpu];
+}
- if (c->cpuid_level < 0xb)
- return;
+static void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid)
+{
+ if (!(apicid & (__max_threads_per_core - 1)))
+ cpumask_set_cpu(cpu, &__cpu_primary_thread_mask);
+}
+
+/*
+ * Convert the APIC ID to a domain level ID by masking out the low bits
+ * below the domain level @dom.
+ */
+static inline u32 topo_apicid(u32 apicid, enum x86_topology_domains dom)
+{
+ if (dom == TOPO_SMT_DOMAIN)
+ return apicid;
+ return apicid & (UINT_MAX << x86_topo_system.dom_shifts[dom - 1]);
+}
+
+static int topo_lookup_cpuid(u32 apic_id)
+{
+ int i;
- cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
+ /* CPU# to APICID mapping is persistent once it is established */
+ for (i = 0; i < topo_info.nr_assigned_cpus; i++) {
+ if (cpuid_to_apicid[i] == apic_id)
+ return i;
+ }
+ return -ENODEV;
+}
+
+static __init int topo_get_cpunr(u32 apic_id)
+{
+ int cpu = topo_lookup_cpuid(apic_id);
+
+ if (cpu >= 0)
+ return cpu;
+
+ return topo_info.nr_assigned_cpus++;
+}
+
+static void topo_set_cpuids(unsigned int cpu, u32 apic_id, u32 acpi_id)
+{
+#if defined(CONFIG_SMP) || defined(CONFIG_X86_64)
+ early_per_cpu(x86_cpu_to_apicid, cpu) = apic_id;
+ early_per_cpu(x86_cpu_to_acpiid, cpu) = acpi_id;
+#endif
+ set_cpu_present(cpu, true);
+}
+
+static __init bool check_for_real_bsp(u32 apic_id)
+{
+ bool is_bsp = false, has_apic_base = boot_cpu_data.x86 >= 6;
+ u64 msr;
+
+ /*
+ * There is no real good way to detect whether this a kdump()
+ * kernel, but except on the Voyager SMP monstrosity which is not
+ * longer supported, the real BSP APIC ID is the first one which is
+ * enumerated by firmware. That allows to detect whether the boot
+ * CPU is the real BSP. If it is not, then do not register the APIC
+ * because sending INIT to the real BSP would reset the whole
+ * system.
+ *
+ * The first APIC ID which is enumerated by firmware is detectable
+ * because the boot CPU APIC ID is registered before that without
+ * invoking this code.
+ */
+ if (topo_info.real_bsp_apic_id != BAD_APICID)
+ return false;
/*
- * check if the cpuid leaf 0xb is actually implemented.
+ * Check whether the enumeration order is broken by evaluating the
+ * BSP bit in the APICBASE MSR. If the CPU does not have the
+ * APICBASE MSR then the BSP detection is not possible and the
+ * kernel must rely on the firmware enumeration order.
*/
- if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE))
+ if (has_apic_base) {
+ rdmsrq(MSR_IA32_APICBASE, msr);
+ is_bsp = !!(msr & MSR_IA32_APICBASE_BSP);
+ }
+
+ if (apic_id == topo_info.boot_cpu_apic_id) {
+ /*
+ * If the boot CPU has the APIC BSP bit set then the
+ * firmware enumeration is agreeing. If the CPU does not
+ * have the APICBASE MSR then the only choice is to trust
+ * the enumeration order.
+ */
+ if (is_bsp || !has_apic_base) {
+ topo_info.real_bsp_apic_id = apic_id;
+ return false;
+ }
+ /*
+ * If the boot APIC is enumerated first, but the APICBASE
+ * MSR does not have the BSP bit set, then there is no way
+ * to discover the real BSP here. Assume a crash kernel and
+ * limit the number of CPUs to 1 as an INIT to the real BSP
+ * would reset the machine.
+ */
+ pr_warn("Enumerated BSP APIC %x is not marked in APICBASE MSR\n", apic_id);
+ pr_warn("Assuming crash kernel. Limiting to one CPU to prevent machine INIT\n");
+ set_nr_cpu_ids(1);
+ goto fwbug;
+ }
+
+ pr_warn("Boot CPU APIC ID not the first enumerated APIC ID: %x != %x\n",
+ topo_info.boot_cpu_apic_id, apic_id);
+
+ if (is_bsp) {
+ /*
+ * The boot CPU has the APIC BSP bit set. Use it and complain
+ * about the broken firmware enumeration.
+ */
+ topo_info.real_bsp_apic_id = topo_info.boot_cpu_apic_id;
+ goto fwbug;
+ }
+
+ pr_warn("Crash kernel detected. Disabling real BSP to prevent machine INIT\n");
+
+ topo_info.real_bsp_apic_id = apic_id;
+ return true;
+
+fwbug:
+ pr_warn(FW_BUG "APIC enumeration order not specification compliant\n");
+ return false;
+}
+
+static unsigned int topo_unit_count(u32 lvlid, enum x86_topology_domains at_level,
+ unsigned long *map)
+{
+ unsigned int id, end, cnt = 0;
+
+ /* Calculate the exclusive end */
+ end = lvlid + (1U << x86_topo_system.dom_shifts[at_level]);
+
+ /* Unfortunately there is no bitmap_weight_range() */
+ for (id = find_next_bit(map, end, lvlid); id < end; id = find_next_bit(map, end, ++id))
+ cnt++;
+ return cnt;
+}
+
+static __init void topo_register_apic(u32 apic_id, u32 acpi_id, bool present)
+{
+ int cpu, dom;
+
+ if (present) {
+ set_bit(apic_id, phys_cpu_present_map);
+
+ /*
+ * Double registration is valid in case of the boot CPU
+ * APIC because that is registered before the enumeration
+ * of the APICs via firmware parsers or VM guest
+ * mechanisms.
+ */
+ if (apic_id == topo_info.boot_cpu_apic_id)
+ cpu = 0;
+ else
+ cpu = topo_get_cpunr(apic_id);
+
+ cpuid_to_apicid[cpu] = apic_id;
+ topo_set_cpuids(cpu, apic_id, acpi_id);
+ } else {
+ u32 pkgid = topo_apicid(apic_id, TOPO_PKG_DOMAIN);
+
+ /*
+ * Check for present APICs in the same package when running
+ * on bare metal. Allow the bogosity in a guest.
+ */
+ if (hypervisor_is_type(X86_HYPER_NATIVE) &&
+ topo_unit_count(pkgid, TOPO_PKG_DOMAIN, phys_cpu_present_map)) {
+ pr_info_once("Ignoring hot-pluggable APIC ID %x in present package.\n",
+ apic_id);
+ topo_info.nr_rejected_cpus++;
+ return;
+ }
+
+ topo_info.nr_disabled_cpus++;
+ }
+
+ /*
+ * Register present and possible CPUs in the domain
+ * maps. cpu_possible_map will be updated in
+ * topology_init_possible_cpus() after enumeration is done.
+ */
+ for (dom = TOPO_SMT_DOMAIN; dom < TOPO_MAX_DOMAIN; dom++)
+ set_bit(topo_apicid(apic_id, dom), apic_maps[dom].map);
+}
+
+/**
+ * topology_register_apic - Register an APIC in early topology maps
+ * @apic_id: The APIC ID to set up
+ * @acpi_id: The ACPI ID associated to the APIC
+ * @present: True if the corresponding CPU is present
+ */
+void __init topology_register_apic(u32 apic_id, u32 acpi_id, bool present)
+{
+ if (apic_id >= MAX_LOCAL_APIC) {
+ pr_err_once("APIC ID %x exceeds kernel limit of: %x\n", apic_id, MAX_LOCAL_APIC - 1);
+ topo_info.nr_rejected_cpus++;
+ return;
+ }
+
+ if (check_for_real_bsp(apic_id)) {
+ topo_info.nr_rejected_cpus++;
+ return;
+ }
+
+ /* CPU numbers exhausted? */
+ if (apic_id != topo_info.boot_cpu_apic_id && topo_info.nr_assigned_cpus >= nr_cpu_ids) {
+ pr_warn_once("CPU limit of %d reached. Ignoring further CPUs\n", nr_cpu_ids);
+ topo_info.nr_rejected_cpus++;
return;
+ }
- set_cpu_cap(c, X86_FEATURE_XTOPOLOGY);
+ topo_register_apic(apic_id, acpi_id, present);
+}
+
+/**
+ * topology_register_boot_apic - Register the boot CPU APIC
+ * @apic_id: The APIC ID to set up
+ *
+ * Separate so CPU #0 can be assigned
+ */
+void __init topology_register_boot_apic(u32 apic_id)
+{
+ WARN_ON_ONCE(topo_info.boot_cpu_apic_id != BAD_APICID);
+
+ topo_info.boot_cpu_apic_id = apic_id;
+ topo_register_apic(apic_id, CPU_ACPIID_INVALID, true);
+}
+
+/**
+ * topology_get_logical_id - Retrieve the logical ID at a given topology domain level
+ * @apicid: The APIC ID for which to lookup the logical ID
+ * @at_level: The topology domain level to use
+ *
+ * @apicid must be a full APIC ID, not the normalized variant. It's valid to have
+ * all bits below the domain level specified by @at_level to be clear. So both
+ * real APIC IDs and backshifted normalized APIC IDs work correctly.
+ *
+ * Returns:
+ * - >= 0: The requested logical ID
+ * - -ERANGE: @apicid is out of range
+ * - -ENODEV: @apicid is not registered
+ */
+int topology_get_logical_id(u32 apicid, enum x86_topology_domains at_level)
+{
+ /* Remove the bits below @at_level to get the proper level ID of @apicid */
+ unsigned int lvlid = topo_apicid(apicid, at_level);
+
+ if (lvlid >= MAX_LOCAL_APIC)
+ return -ERANGE;
+ if (!test_bit(lvlid, apic_maps[at_level].map))
+ return -ENODEV;
+ /* Get the number of set bits before @lvlid. */
+ return bitmap_weight(apic_maps[at_level].map, lvlid);
+}
+EXPORT_SYMBOL_GPL(topology_get_logical_id);
+
+/**
+ * topology_unit_count - Retrieve the count of specified units at a given topology domain level
+ * @apicid: The APIC ID which specifies the search range
+ * @which_units: The domain level specifying the units to count
+ * @at_level: The domain level at which @which_units have to be counted
+ *
+ * This returns the number of possible units according to the enumerated
+ * information.
+ *
+ * E.g. topology_count_units(apicid, TOPO_CORE_DOMAIN, TOPO_PKG_DOMAIN)
+ * counts the number of possible cores in the package to which @apicid
+ * belongs.
+ *
+ * @at_level must obviously be greater than @which_level to produce useful
+ * results. If @at_level is equal to @which_units the result is
+ * unsurprisingly 1. If @at_level is less than @which_units the results
+ * is by definition undefined and the function returns 0.
+ */
+unsigned int topology_unit_count(u32 apicid, enum x86_topology_domains which_units,
+ enum x86_topology_domains at_level)
+{
+ /* Remove the bits below @at_level to get the proper level ID of @apicid */
+ unsigned int lvlid = topo_apicid(apicid, at_level);
+
+ if (lvlid >= MAX_LOCAL_APIC)
+ return 0;
+ if (!test_bit(lvlid, apic_maps[at_level].map))
+ return 0;
+ if (which_units > at_level)
+ return 0;
+ if (which_units == at_level)
+ return 1;
+ return topo_unit_count(lvlid, at_level, apic_maps[which_units].map);
+}
+
+#ifdef CONFIG_SMP
+int topology_get_primary_thread(unsigned int cpu)
+{
+ u32 apic_id = cpuid_to_apicid[cpu];
/*
- * initial apic id, which also represents 32-bit extended x2apic id.
+ * Get the core domain level APIC id, which is the primary thread
+ * and return the CPU number assigned to it.
*/
- c->initial_apicid = edx;
+ return topo_lookup_cpuid(topo_apicid(apic_id, TOPO_CORE_DOMAIN));
+}
+#endif
+
+#ifdef CONFIG_ACPI_HOTPLUG_CPU
+/**
+ * topology_hotplug_apic - Handle a physical hotplugged APIC after boot
+ * @apic_id: The APIC ID to set up
+ * @acpi_id: The ACPI ID associated to the APIC
+ */
+int topology_hotplug_apic(u32 apic_id, u32 acpi_id)
+{
+ int cpu;
+
+ if (apic_id >= MAX_LOCAL_APIC)
+ return -EINVAL;
+
+ /* Reject if the APIC ID was not registered during enumeration. */
+ if (!test_bit(apic_id, apic_maps[TOPO_SMT_DOMAIN].map))
+ return -ENODEV;
+
+ cpu = topo_lookup_cpuid(apic_id);
+ if (cpu < 0)
+ return -ENOSPC;
+
+ set_bit(apic_id, phys_cpu_present_map);
+ topo_set_cpuids(cpu, apic_id, acpi_id);
+ cpu_mark_primary_thread(cpu, apic_id);
+ return cpu;
+}
+/**
+ * topology_hotunplug_apic - Remove a physical hotplugged APIC after boot
+ * @cpu: The CPU number for which the APIC ID is removed
+ */
+void topology_hotunplug_apic(unsigned int cpu)
+{
+ u32 apic_id = cpuid_to_apicid[cpu];
+
+ if (apic_id == BAD_APICID)
+ return;
+
+ per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID;
+ clear_bit(apic_id, phys_cpu_present_map);
+ set_cpu_present(cpu, false);
+}
+#endif
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static unsigned int max_possible_cpus __initdata = NR_CPUS;
+
+/**
+ * topology_apply_cmdline_limits_early - Apply topology command line limits early
+ *
+ * Ensure that command line limits are in effect before firmware parsing
+ * takes place.
+ */
+void __init topology_apply_cmdline_limits_early(void)
+{
+ unsigned int possible = nr_cpu_ids;
+
+ /* 'maxcpus=0' 'nosmp' 'nolapic' */
+ if (!setup_max_cpus || apic_is_disabled)
+ possible = 1;
+
+ /* 'possible_cpus=N' */
+ possible = min_t(unsigned int, max_possible_cpus, possible);
+
+ if (possible < nr_cpu_ids) {
+ pr_info("Limiting to %u possible CPUs\n", possible);
+ set_nr_cpu_ids(possible);
+ }
+}
+
+static __init bool restrict_to_up(void)
+{
+ if (!smp_found_config)
+ return true;
/*
- * Populate HT related information from sub-leaf level 0.
+ * XEN PV is special as it does not advertise the local APIC
+ * properly, but provides a fake topology for it so that the
+ * infrastructure works. So don't apply the restrictions vs. APIC
+ * here.
*/
- core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx);
- core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
+ if (xen_pv_domain())
+ return false;
- sub_index = 1;
- do {
- cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx);
+ return apic_is_disabled;
+}
- /*
- * Check for the Core type in the implemented sub leaves.
- */
- if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) {
- core_level_siblings = LEVEL_MAX_SIBLINGS(ebx);
- core_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
- break;
+void __init topology_init_possible_cpus(void)
+{
+ unsigned int assigned = topo_info.nr_assigned_cpus;
+ unsigned int disabled = topo_info.nr_disabled_cpus;
+ unsigned int cnta, cntb, cpu, allowed = 1;
+ unsigned int total = assigned + disabled;
+ u32 apicid, firstid;
+
+ /*
+ * If there was no APIC registered, then fake one so that the
+ * topology bitmap is populated. That ensures that the code below
+ * is valid and the various query interfaces can be used
+ * unconditionally. This does not affect the actual APIC code in
+ * any way because either the local APIC address has not been
+ * registered or the local APIC was disabled on the command line.
+ */
+ if (topo_info.boot_cpu_apic_id == BAD_APICID)
+ topology_register_boot_apic(0);
+
+ if (!restrict_to_up()) {
+ if (WARN_ON_ONCE(assigned > nr_cpu_ids)) {
+ disabled += assigned - nr_cpu_ids;
+ assigned = nr_cpu_ids;
}
+ allowed = min_t(unsigned int, total, nr_cpu_ids);
+ }
+
+ if (total > allowed)
+ pr_warn("%u possible CPUs exceed the limit of %u\n", total, allowed);
+
+ assigned = min_t(unsigned int, allowed, assigned);
+ disabled = allowed - assigned;
+
+ topo_info.nr_assigned_cpus = assigned;
+ topo_info.nr_disabled_cpus = disabled;
- sub_index++;
- } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE);
+ total_cpus = allowed;
+ set_nr_cpu_ids(allowed);
- core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width;
+ cnta = domain_weight(TOPO_PKG_DOMAIN);
+ cntb = domain_weight(TOPO_DIE_DOMAIN);
+ __max_logical_packages = cnta;
+ __max_dies_per_package = 1U << (get_count_order(cntb) - get_count_order(cnta));
- c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, ht_mask_width)
- & core_select_mask;
- c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, core_plus_mask_width);
+ pr_info("Max. logical packages: %3u\n", cnta);
+ pr_info("Max. logical dies: %3u\n", cntb);
+ pr_info("Max. dies per package: %3u\n", __max_dies_per_package);
+
+ cnta = domain_weight(TOPO_CORE_DOMAIN);
+ cntb = domain_weight(TOPO_SMT_DOMAIN);
/*
- * Reinit the apicid, now that we have extended initial_apicid.
+ * Can't use order delta here as order(cnta) can be equal
+ * order(cntb) even if cnta != cntb.
*/
- c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
+ __max_threads_per_core = DIV_ROUND_UP(cntb, cnta);
+ pr_info("Max. threads per core: %3u\n", __max_threads_per_core);
+
+ firstid = find_first_bit(apic_maps[TOPO_SMT_DOMAIN].map, MAX_LOCAL_APIC);
+ __num_cores_per_package = topology_unit_count(firstid, TOPO_CORE_DOMAIN, TOPO_PKG_DOMAIN);
+ pr_info("Num. cores per package: %3u\n", __num_cores_per_package);
+ __num_threads_per_package = topology_unit_count(firstid, TOPO_SMT_DOMAIN, TOPO_PKG_DOMAIN);
+ pr_info("Num. threads per package: %3u\n", __num_threads_per_package);
+
+ pr_info("Allowing %u present CPUs plus %u hotplug CPUs\n", assigned, disabled);
+ if (topo_info.nr_rejected_cpus)
+ pr_info("Rejected CPUs %u\n", topo_info.nr_rejected_cpus);
- c->x86_max_cores = (core_level_siblings / smp_num_siblings);
+ init_cpu_present(cpumask_of(0));
+ init_cpu_possible(cpumask_of(0));
- if (!printed) {
- printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
- c->phys_proc_id);
- if (c->x86_max_cores > 1)
- printk(KERN_INFO "CPU: Processor Core ID: %d\n",
- c->cpu_core_id);
- printed = 1;
+ /* Assign CPU numbers to non-present CPUs */
+ for (apicid = 0; disabled; disabled--, apicid++) {
+ apicid = find_next_andnot_bit(apic_maps[TOPO_SMT_DOMAIN].map, phys_cpu_present_map,
+ MAX_LOCAL_APIC, apicid);
+ if (apicid >= MAX_LOCAL_APIC)
+ break;
+ cpuid_to_apicid[topo_info.nr_assigned_cpus++] = apicid;
+ }
+
+ for (cpu = 0; cpu < allowed; cpu++) {
+ apicid = cpuid_to_apicid[cpu];
+
+ set_cpu_possible(cpu, true);
+
+ if (apicid == BAD_APICID)
+ continue;
+
+ cpu_mark_primary_thread(cpu, apicid);
+ set_cpu_present(cpu, test_bit(apicid, phys_cpu_present_map));
}
- return;
-#endif
}
+
+/*
+ * Late SMP disable after sizing CPU masks when APIC/IOAPIC setup failed.
+ */
+void __init topology_reset_possible_cpus_up(void)
+{
+ init_cpu_present(cpumask_of(0));
+ init_cpu_possible(cpumask_of(0));
+
+ bitmap_zero(phys_cpu_present_map, MAX_LOCAL_APIC);
+ if (topo_info.boot_cpu_apic_id != BAD_APICID)
+ set_bit(topo_info.boot_cpu_apic_id, phys_cpu_present_map);
+}
+
+static int __init setup_possible_cpus(char *str)
+{
+ get_option(&str, &max_possible_cpus);
+ return 0;
+}
+early_param("possible_cpus", setup_possible_cpus);
+#endif
diff --git a/arch/x86/kernel/cpu/topology.h b/arch/x86/kernel/cpu/topology.h
new file mode 100644
index 000000000000..37326297f80c
--- /dev/null
+++ b/arch/x86/kernel/cpu/topology.h
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ARCH_X86_TOPOLOGY_H
+#define ARCH_X86_TOPOLOGY_H
+
+struct topo_scan {
+ struct cpuinfo_x86 *c;
+ unsigned int dom_shifts[TOPO_MAX_DOMAIN];
+ unsigned int dom_ncpus[TOPO_MAX_DOMAIN];
+
+ /* Legacy CPUID[1]:EBX[23:16] number of logical processors */
+ unsigned int ebx1_nproc_shift;
+
+ /* AMD specific node ID which cannot be mapped into APIC space. */
+ u16 amd_nodes_per_pkg;
+ u16 amd_node_id;
+};
+
+void cpu_init_topology(struct cpuinfo_x86 *c);
+void cpu_parse_topology(struct cpuinfo_x86 *c);
+void topology_set_dom(struct topo_scan *tscan, enum x86_topology_domains dom,
+ unsigned int shift, unsigned int ncpus);
+bool cpu_parse_topology_ext(struct topo_scan *tscan);
+void cpu_parse_topology_amd(struct topo_scan *tscan);
+void cpu_topology_fixup_amd(struct topo_scan *tscan);
+
+static inline u32 topo_shift_apicid(u32 apicid, enum x86_topology_domains dom)
+{
+ if (dom == TOPO_SMT_DOMAIN)
+ return apicid;
+ return apicid >> x86_topo_system.dom_shifts[dom - 1];
+}
+
+static inline u32 topo_relative_domain_id(u32 apicid, enum x86_topology_domains dom)
+{
+ if (dom != TOPO_SMT_DOMAIN)
+ apicid >>= x86_topo_system.dom_shifts[dom - 1];
+ return apicid & (x86_topo_system.dom_size[dom] - 1);
+}
+
+static inline u32 topo_domain_mask(enum x86_topology_domains dom)
+{
+ return (1U << x86_topo_system.dom_shifts[dom]) - 1;
+}
+
+/*
+ * Update a domain level after the fact without propagating. Used to fixup
+ * broken CPUID enumerations.
+ */
+static inline void topology_update_dom(struct topo_scan *tscan, enum x86_topology_domains dom,
+ unsigned int shift, unsigned int ncpus)
+{
+ tscan->dom_shifts[dom] = shift;
+ tscan->dom_ncpus[dom] = ncpus;
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+unsigned int topology_unit_count(u32 apicid, enum x86_topology_domains which_units,
+ enum x86_topology_domains at_level);
+#else
+static inline unsigned int topology_unit_count(u32 apicid, enum x86_topology_domains which_units,
+ enum x86_topology_domains at_level)
+{
+ return 1;
+}
+#endif
+
+#endif /* ARCH_X86_TOPOLOGY_H */
diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c
new file mode 100644
index 000000000000..6ac097e13106
--- /dev/null
+++ b/arch/x86/kernel/cpu/topology_amd.c
@@ -0,0 +1,227 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/cpu.h>
+
+#include <asm/apic.h>
+#include <asm/memtype.h>
+#include <asm/msr.h>
+#include <asm/processor.h>
+
+#include "cpu.h"
+
+static bool parse_8000_0008(struct topo_scan *tscan)
+{
+ struct {
+ // ecx
+ u32 cpu_nthreads : 8, // Number of physical threads - 1
+ : 4, // Reserved
+ apicid_coreid_len : 4, // Number of thread core ID bits (shift) in APIC ID
+ perf_tsc_len : 2, // Performance time-stamp counter size
+ : 14; // Reserved
+ } ecx;
+ unsigned int sft;
+
+ if (tscan->c->extended_cpuid_level < 0x80000008)
+ return false;
+
+ cpuid_leaf_reg(0x80000008, CPUID_ECX, &ecx);
+
+ /* If the thread bits are 0, then get the shift value from ecx.cpu_nthreads */
+ sft = ecx.apicid_coreid_len;
+ if (!sft)
+ sft = get_count_order(ecx.cpu_nthreads + 1);
+
+ /*
+ * cpu_nthreads describes the number of threads in the package
+ * sft is the number of APIC ID bits per package
+ *
+ * As the number of actual threads per core is not described in
+ * this leaf, just set the CORE domain shift and let the later
+ * parsers set SMT shift. Assume one thread per core by default
+ * which is correct if there are no other CPUID leafs to parse.
+ */
+ topology_update_dom(tscan, TOPO_SMT_DOMAIN, 0, 1);
+ topology_set_dom(tscan, TOPO_CORE_DOMAIN, sft, ecx.cpu_nthreads + 1);
+ return true;
+}
+
+static void store_node(struct topo_scan *tscan, u16 nr_nodes, u16 node_id)
+{
+ /*
+ * Starting with Fam 17h the DIE domain could probably be used to
+ * retrieve the node info on AMD/HYGON. Analysis of CPUID dumps
+ * suggests it's the topmost bit(s) of the CPU cores area, but
+ * that's guess work and neither enumerated nor documented.
+ *
+ * Up to Fam 16h this does not work at all and the legacy node ID
+ * has to be used.
+ */
+ tscan->amd_nodes_per_pkg = nr_nodes;
+ tscan->amd_node_id = node_id;
+}
+
+static bool parse_8000_001e(struct topo_scan *tscan)
+{
+ struct {
+ // eax
+ u32 ext_apic_id : 32; // Extended APIC ID
+ // ebx
+ u32 core_id : 8, // Unique per-socket logical core unit ID
+ core_nthreads : 8, // #Threads per core (zero-based)
+ : 16; // Reserved
+ // ecx
+ u32 node_id : 8, // Node (die) ID of invoking logical CPU
+ nnodes_per_socket : 3, // #nodes in invoking logical CPU's package/socket
+ : 21; // Reserved
+ // edx
+ u32 : 32; // Reserved
+ } leaf;
+
+ if (!boot_cpu_has(X86_FEATURE_TOPOEXT))
+ return false;
+
+ cpuid_leaf(0x8000001e, &leaf);
+
+ /*
+ * If leaf 0xb/0x26 is available, then the APIC ID and the domain
+ * shifts are set already.
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_XTOPOLOGY)) {
+ tscan->c->topo.initial_apicid = leaf.ext_apic_id;
+
+ /*
+ * Leaf 0x8000008 sets the CORE domain shift but not the
+ * SMT domain shift. On CPUs with family >= 0x17, there
+ * might be hyperthreads.
+ */
+ if (tscan->c->x86 >= 0x17) {
+ /* Update the SMT domain, but do not propagate it. */
+ unsigned int nthreads = leaf.core_nthreads + 1;
+
+ topology_update_dom(tscan, TOPO_SMT_DOMAIN,
+ get_count_order(nthreads), nthreads);
+ }
+ }
+
+ store_node(tscan, leaf.nnodes_per_socket + 1, leaf.node_id);
+
+ if (tscan->c->x86_vendor == X86_VENDOR_AMD) {
+ if (tscan->c->x86 == 0x15)
+ tscan->c->topo.cu_id = leaf.core_id;
+
+ cacheinfo_amd_init_llc_id(tscan->c, leaf.node_id);
+ } else {
+ /*
+ * Package ID is ApicId[6..] on certain Hygon CPUs. See
+ * commit e0ceeae708ce for explanation. The topology info
+ * is screwed up: The package shift is always 6 and the
+ * node ID is bit [4:5].
+ */
+ if (!boot_cpu_has(X86_FEATURE_HYPERVISOR) && tscan->c->x86_model <= 0x3) {
+ topology_set_dom(tscan, TOPO_CORE_DOMAIN, 6,
+ tscan->dom_ncpus[TOPO_CORE_DOMAIN]);
+ }
+ cacheinfo_hygon_init_llc_id(tscan->c);
+ }
+ return true;
+}
+
+static void parse_fam10h_node_id(struct topo_scan *tscan)
+{
+ union {
+ struct {
+ u64 node_id : 3,
+ nodes_per_pkg : 3,
+ unused : 58;
+ };
+ u64 msr;
+ } nid;
+
+ if (!boot_cpu_has(X86_FEATURE_NODEID_MSR))
+ return;
+
+ rdmsrq(MSR_FAM10H_NODE_ID, nid.msr);
+ store_node(tscan, nid.nodes_per_pkg + 1, nid.node_id);
+ tscan->c->topo.llc_id = nid.node_id;
+}
+
+static void legacy_set_llc(struct topo_scan *tscan)
+{
+ unsigned int apicid = tscan->c->topo.initial_apicid;
+
+ /* If none of the parsers set LLC ID then use the die ID for it. */
+ if (tscan->c->topo.llc_id == BAD_APICID)
+ tscan->c->topo.llc_id = apicid >> tscan->dom_shifts[TOPO_CORE_DOMAIN];
+}
+
+static void topoext_fixup(struct topo_scan *tscan)
+{
+ struct cpuinfo_x86 *c = tscan->c;
+ u64 msrval;
+
+ /* Try to re-enable TopologyExtensions if switched off by BIOS */
+ if (cpu_has(c, X86_FEATURE_TOPOEXT) || c->x86_vendor != X86_VENDOR_AMD ||
+ c->x86 != 0x15 || c->x86_model < 0x10 || c->x86_model > 0x6f)
+ return;
+
+ if (msr_set_bit(MSR_AMD64_CPUID_EXT_FEAT,
+ MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT_BIT) <= 0)
+ return;
+
+ rdmsrq(MSR_AMD64_CPUID_EXT_FEAT, msrval);
+ if (msrval & MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT) {
+ set_cpu_cap(c, X86_FEATURE_TOPOEXT);
+ pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n");
+ }
+}
+
+static void parse_topology_amd(struct topo_scan *tscan)
+{
+ if (cpu_feature_enabled(X86_FEATURE_AMD_HTR_CORES))
+ tscan->c->topo.cpu_type = cpuid_ebx(0x80000026);
+
+ /*
+ * Try to get SMT, CORE, TILE, and DIE shifts from extended
+ * CPUID leaf 0x8000_0026 on supported processors first. If
+ * extended CPUID leaf 0x8000_0026 is not supported, try to
+ * get SMT and CORE shift from leaf 0xb. If either leaf is
+ * available, cpu_parse_topology_ext() will return true.
+ *
+ * If XTOPOLOGY leaves (0x26/0xb) are not available, try to
+ * get the CORE shift from leaf 0x8000_0008 first.
+ */
+ if (!cpu_parse_topology_ext(tscan) && !parse_8000_0008(tscan))
+ return;
+
+ /*
+ * Prefer leaf 0x8000001e if available to get the SMT shift and
+ * the initial APIC ID if XTOPOLOGY leaves are not available.
+ */
+ if (parse_8000_001e(tscan))
+ return;
+
+ /* Try the NODEID MSR */
+ parse_fam10h_node_id(tscan);
+}
+
+void cpu_parse_topology_amd(struct topo_scan *tscan)
+{
+ tscan->amd_nodes_per_pkg = 1;
+ topoext_fixup(tscan);
+ parse_topology_amd(tscan);
+ legacy_set_llc(tscan);
+
+ if (tscan->amd_nodes_per_pkg > 1)
+ set_cpu_cap(tscan->c, X86_FEATURE_AMD_DCM);
+}
+
+void cpu_topology_fixup_amd(struct topo_scan *tscan)
+{
+ struct cpuinfo_x86 *c = tscan->c;
+
+ /*
+ * Adjust the core_id relative to the node when there is more than
+ * one node.
+ */
+ if (tscan->c->x86 < 0x17 && tscan->amd_nodes_per_pkg > 1)
+ c->topo.core_id %= tscan->dom_ncpus[TOPO_CORE_DOMAIN] / tscan->amd_nodes_per_pkg;
+}
diff --git a/arch/x86/kernel/cpu/topology_common.c b/arch/x86/kernel/cpu/topology_common.c
new file mode 100644
index 000000000000..71625795d711
--- /dev/null
+++ b/arch/x86/kernel/cpu/topology_common.c
@@ -0,0 +1,258 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/cpu.h>
+
+#include <xen/xen.h>
+
+#include <asm/intel-family.h>
+#include <asm/apic.h>
+#include <asm/processor.h>
+#include <asm/smp.h>
+
+#include "cpu.h"
+
+struct x86_topology_system x86_topo_system __ro_after_init;
+EXPORT_SYMBOL_GPL(x86_topo_system);
+
+unsigned int __amd_nodes_per_pkg __ro_after_init;
+EXPORT_SYMBOL_GPL(__amd_nodes_per_pkg);
+
+/* CPUs which are the primary SMT threads */
+struct cpumask __cpu_primary_thread_mask __read_mostly;
+
+void topology_set_dom(struct topo_scan *tscan, enum x86_topology_domains dom,
+ unsigned int shift, unsigned int ncpus)
+{
+ topology_update_dom(tscan, dom, shift, ncpus);
+
+ /* Propagate to the upper levels */
+ for (dom++; dom < TOPO_MAX_DOMAIN; dom++) {
+ tscan->dom_shifts[dom] = tscan->dom_shifts[dom - 1];
+ tscan->dom_ncpus[dom] = tscan->dom_ncpus[dom - 1];
+ }
+}
+
+enum x86_topology_cpu_type get_topology_cpu_type(struct cpuinfo_x86 *c)
+{
+ if (c->x86_vendor == X86_VENDOR_INTEL) {
+ switch (c->topo.intel_type) {
+ case INTEL_CPU_TYPE_ATOM: return TOPO_CPU_TYPE_EFFICIENCY;
+ case INTEL_CPU_TYPE_CORE: return TOPO_CPU_TYPE_PERFORMANCE;
+ }
+ }
+ if (c->x86_vendor == X86_VENDOR_AMD) {
+ switch (c->topo.amd_type) {
+ case 0: return TOPO_CPU_TYPE_PERFORMANCE;
+ case 1: return TOPO_CPU_TYPE_EFFICIENCY;
+ }
+ }
+
+ return TOPO_CPU_TYPE_UNKNOWN;
+}
+
+const char *get_topology_cpu_type_name(struct cpuinfo_x86 *c)
+{
+ switch (get_topology_cpu_type(c)) {
+ case TOPO_CPU_TYPE_PERFORMANCE:
+ return "performance";
+ case TOPO_CPU_TYPE_EFFICIENCY:
+ return "efficiency";
+ default:
+ return "unknown";
+ }
+}
+
+static unsigned int __maybe_unused parse_num_cores_legacy(struct cpuinfo_x86 *c)
+{
+ struct {
+ u32 cache_type : 5,
+ unused : 21,
+ ncores : 6;
+ } eax;
+
+ if (c->cpuid_level < 4)
+ return 1;
+
+ cpuid_subleaf_reg(4, 0, CPUID_EAX, &eax);
+ if (!eax.cache_type)
+ return 1;
+
+ return eax.ncores + 1;
+}
+
+static void parse_legacy(struct topo_scan *tscan)
+{
+ unsigned int cores, core_shift, smt_shift = 0;
+ struct cpuinfo_x86 *c = tscan->c;
+
+ cores = parse_num_cores_legacy(c);
+ core_shift = get_count_order(cores);
+
+ if (cpu_has(c, X86_FEATURE_HT)) {
+ if (!WARN_ON_ONCE(tscan->ebx1_nproc_shift < core_shift))
+ smt_shift = tscan->ebx1_nproc_shift - core_shift;
+ /*
+ * The parser expects leaf 0xb/0x1f format, which means
+ * the number of logical processors at core level is
+ * counting threads.
+ */
+ core_shift += smt_shift;
+ cores <<= smt_shift;
+ }
+
+ topology_set_dom(tscan, TOPO_SMT_DOMAIN, smt_shift, 1U << smt_shift);
+ topology_set_dom(tscan, TOPO_CORE_DOMAIN, core_shift, cores);
+}
+
+static bool fake_topology(struct topo_scan *tscan)
+{
+ /*
+ * Preset the CORE level shift for CPUID less systems and XEN_PV,
+ * which has useless CPUID information.
+ */
+ topology_set_dom(tscan, TOPO_SMT_DOMAIN, 0, 1);
+ topology_set_dom(tscan, TOPO_CORE_DOMAIN, 0, 1);
+
+ return tscan->c->cpuid_level < 1;
+}
+
+static void parse_topology(struct topo_scan *tscan, bool early)
+{
+ const struct cpuinfo_topology topo_defaults = {
+ .cu_id = 0xff,
+ .llc_id = BAD_APICID,
+ .l2c_id = BAD_APICID,
+ .cpu_type = TOPO_CPU_TYPE_UNKNOWN,
+ };
+ struct cpuinfo_x86 *c = tscan->c;
+ struct {
+ u32 unused0 : 16,
+ nproc : 8,
+ apicid : 8;
+ } ebx;
+
+ c->topo = topo_defaults;
+
+ if (fake_topology(tscan))
+ return;
+
+ /* Preset Initial APIC ID from CPUID leaf 1 */
+ cpuid_leaf_reg(1, CPUID_EBX, &ebx);
+ c->topo.initial_apicid = ebx.apicid;
+
+ /*
+ * The initial invocation from early_identify_cpu() happens before
+ * the APIC is mapped or X2APIC enabled. For establishing the
+ * topology, that's not required. Use the initial APIC ID.
+ */
+ if (early)
+ c->topo.apicid = c->topo.initial_apicid;
+ else
+ c->topo.apicid = read_apic_id();
+
+ /* The above is sufficient for UP */
+ if (!IS_ENABLED(CONFIG_SMP))
+ return;
+
+ tscan->ebx1_nproc_shift = get_count_order(ebx.nproc);
+
+ switch (c->x86_vendor) {
+ case X86_VENDOR_AMD:
+ if (IS_ENABLED(CONFIG_CPU_SUP_AMD))
+ cpu_parse_topology_amd(tscan);
+ break;
+ case X86_VENDOR_CENTAUR:
+ case X86_VENDOR_ZHAOXIN:
+ parse_legacy(tscan);
+ break;
+ case X86_VENDOR_INTEL:
+ if (!IS_ENABLED(CONFIG_CPU_SUP_INTEL) || !cpu_parse_topology_ext(tscan))
+ parse_legacy(tscan);
+ if (c->cpuid_level >= 0x1a)
+ c->topo.cpu_type = cpuid_eax(0x1a);
+ break;
+ case X86_VENDOR_HYGON:
+ if (IS_ENABLED(CONFIG_CPU_SUP_HYGON))
+ cpu_parse_topology_amd(tscan);
+ break;
+ }
+}
+
+static void topo_set_ids(struct topo_scan *tscan, bool early)
+{
+ struct cpuinfo_x86 *c = tscan->c;
+ u32 apicid = c->topo.apicid;
+
+ c->topo.pkg_id = topo_shift_apicid(apicid, TOPO_PKG_DOMAIN);
+ c->topo.die_id = topo_shift_apicid(apicid, TOPO_DIE_DOMAIN);
+
+ if (!early) {
+ c->topo.logical_pkg_id = topology_get_logical_id(apicid, TOPO_PKG_DOMAIN);
+ c->topo.logical_die_id = topology_get_logical_id(apicid, TOPO_DIE_DOMAIN);
+ c->topo.logical_core_id = topology_get_logical_id(apicid, TOPO_CORE_DOMAIN);
+ }
+
+ /* Package relative core ID */
+ c->topo.core_id = (apicid & topo_domain_mask(TOPO_PKG_DOMAIN)) >>
+ x86_topo_system.dom_shifts[TOPO_SMT_DOMAIN];
+
+ c->topo.amd_node_id = tscan->amd_node_id;
+
+ if (c->x86_vendor == X86_VENDOR_AMD)
+ cpu_topology_fixup_amd(tscan);
+}
+
+void cpu_parse_topology(struct cpuinfo_x86 *c)
+{
+ unsigned int dom, cpu = smp_processor_id();
+ struct topo_scan tscan = { .c = c, };
+
+ parse_topology(&tscan, false);
+
+ if (IS_ENABLED(CONFIG_X86_LOCAL_APIC)) {
+ if (c->topo.initial_apicid != c->topo.apicid) {
+ pr_err(FW_BUG "CPU%4u: APIC ID mismatch. CPUID: 0x%04x APIC: 0x%04x\n",
+ cpu, c->topo.initial_apicid, c->topo.apicid);
+ }
+
+ if (c->topo.apicid != cpuid_to_apicid[cpu]) {
+ pr_err(FW_BUG "CPU%4u: APIC ID mismatch. Firmware: 0x%04x APIC: 0x%04x\n",
+ cpu, cpuid_to_apicid[cpu], c->topo.apicid);
+ }
+ }
+
+ for (dom = TOPO_SMT_DOMAIN; dom < TOPO_MAX_DOMAIN; dom++) {
+ if (tscan.dom_shifts[dom] == x86_topo_system.dom_shifts[dom])
+ continue;
+ pr_err(FW_BUG "CPU%d: Topology domain %u shift %u != %u\n", cpu, dom,
+ tscan.dom_shifts[dom], x86_topo_system.dom_shifts[dom]);
+ }
+
+ topo_set_ids(&tscan, false);
+}
+
+void __init cpu_init_topology(struct cpuinfo_x86 *c)
+{
+ struct topo_scan tscan = { .c = c, };
+ unsigned int dom, sft;
+
+ parse_topology(&tscan, true);
+
+ /* Copy the shift values and calculate the unit sizes. */
+ memcpy(x86_topo_system.dom_shifts, tscan.dom_shifts, sizeof(x86_topo_system.dom_shifts));
+
+ dom = TOPO_SMT_DOMAIN;
+ x86_topo_system.dom_size[dom] = 1U << x86_topo_system.dom_shifts[dom];
+
+ for (dom++; dom < TOPO_MAX_DOMAIN; dom++) {
+ sft = x86_topo_system.dom_shifts[dom] - x86_topo_system.dom_shifts[dom - 1];
+ x86_topo_system.dom_size[dom] = 1U << sft;
+ }
+
+ topo_set_ids(&tscan, true);
+
+ /*
+ * AMD systems have Nodes per package which cannot be mapped to
+ * APIC ID.
+ */
+ __amd_nodes_per_pkg = tscan.amd_nodes_per_pkg;
+}
diff --git a/arch/x86/kernel/cpu/topology_ext.c b/arch/x86/kernel/cpu/topology_ext.c
new file mode 100644
index 000000000000..467b0326bf1a
--- /dev/null
+++ b/arch/x86/kernel/cpu/topology_ext.c
@@ -0,0 +1,145 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/cpu.h>
+
+#include <asm/apic.h>
+#include <asm/memtype.h>
+#include <asm/processor.h>
+
+#include "cpu.h"
+
+enum topo_types {
+ INVALID_TYPE = 0,
+ SMT_TYPE = 1,
+ CORE_TYPE = 2,
+ MAX_TYPE_0B = 3,
+ MODULE_TYPE = 3,
+ AMD_CCD_TYPE = 3,
+ TILE_TYPE = 4,
+ AMD_SOCKET_TYPE = 4,
+ MAX_TYPE_80000026 = 5,
+ DIE_TYPE = 5,
+ DIEGRP_TYPE = 6,
+ MAX_TYPE_1F = 7,
+};
+
+/*
+ * Use a lookup table for the case that there are future types > 6 which
+ * describe an intermediate domain level which does not exist today.
+ */
+static const unsigned int topo_domain_map_0b_1f[MAX_TYPE_1F] = {
+ [SMT_TYPE] = TOPO_SMT_DOMAIN,
+ [CORE_TYPE] = TOPO_CORE_DOMAIN,
+ [MODULE_TYPE] = TOPO_MODULE_DOMAIN,
+ [TILE_TYPE] = TOPO_TILE_DOMAIN,
+ [DIE_TYPE] = TOPO_DIE_DOMAIN,
+ [DIEGRP_TYPE] = TOPO_DIEGRP_DOMAIN,
+};
+
+static const unsigned int topo_domain_map_80000026[MAX_TYPE_80000026] = {
+ [SMT_TYPE] = TOPO_SMT_DOMAIN,
+ [CORE_TYPE] = TOPO_CORE_DOMAIN,
+ [AMD_CCD_TYPE] = TOPO_TILE_DOMAIN,
+ [AMD_SOCKET_TYPE] = TOPO_DIE_DOMAIN,
+};
+
+static inline bool topo_subleaf(struct topo_scan *tscan, u32 leaf, u32 subleaf,
+ unsigned int *last_dom)
+{
+ unsigned int dom, maxtype;
+ const unsigned int *map;
+ struct {
+ // eax
+ u32 x2apic_shift : 5, // Number of bits to shift APIC ID right
+ // for the topology ID at the next level
+ : 27; // Reserved
+ // ebx
+ u32 num_processors : 16, // Number of processors at current level
+ : 16; // Reserved
+ // ecx
+ u32 level : 8, // Current topology level. Same as sub leaf number
+ type : 8, // Level type. If 0, invalid
+ : 16; // Reserved
+ // edx
+ u32 x2apic_id : 32; // X2APIC ID of the current logical processor
+ } sl;
+
+ switch (leaf) {
+ case 0x0b: maxtype = MAX_TYPE_0B; map = topo_domain_map_0b_1f; break;
+ case 0x1f: maxtype = MAX_TYPE_1F; map = topo_domain_map_0b_1f; break;
+ case 0x80000026: maxtype = MAX_TYPE_80000026; map = topo_domain_map_80000026; break;
+ default: return false;
+ }
+
+ cpuid_subleaf(leaf, subleaf, &sl);
+
+ if (!sl.num_processors || sl.type == INVALID_TYPE)
+ return false;
+
+ if (sl.type >= maxtype) {
+ pr_err_once("Topology: leaf 0x%x:%d Unknown domain type %u\n",
+ leaf, subleaf, sl.type);
+ /*
+ * It really would have been too obvious to make the domain
+ * type space sparse and leave a few reserved types between
+ * the points which might change instead of following the
+ * usual "this can be fixed in software" principle.
+ */
+ dom = *last_dom + 1;
+ } else {
+ dom = map[sl.type];
+ *last_dom = dom;
+ }
+
+ if (!dom) {
+ tscan->c->topo.initial_apicid = sl.x2apic_id;
+ } else if (tscan->c->topo.initial_apicid != sl.x2apic_id) {
+ pr_warn_once(FW_BUG "CPUID leaf 0x%x subleaf %d APIC ID mismatch %x != %x\n",
+ leaf, subleaf, tscan->c->topo.initial_apicid, sl.x2apic_id);
+ }
+
+ topology_set_dom(tscan, dom, sl.x2apic_shift, sl.num_processors);
+ return true;
+}
+
+static bool parse_topology_leaf(struct topo_scan *tscan, u32 leaf)
+{
+ unsigned int last_dom;
+ u32 subleaf;
+
+ /* Read all available subleafs and populate the levels */
+ for (subleaf = 0, last_dom = 0; topo_subleaf(tscan, leaf, subleaf, &last_dom); subleaf++);
+
+ /* If subleaf 0 failed to parse, give up */
+ if (!subleaf)
+ return false;
+
+ /*
+ * There are machines in the wild which have shift 0 in the subleaf
+ * 0, but advertise 2 logical processors at that level. They are
+ * truly SMT.
+ */
+ if (!tscan->dom_shifts[TOPO_SMT_DOMAIN] && tscan->dom_ncpus[TOPO_SMT_DOMAIN] > 1) {
+ unsigned int sft = get_count_order(tscan->dom_ncpus[TOPO_SMT_DOMAIN]);
+
+ pr_warn_once(FW_BUG "CPUID leaf 0x%x subleaf 0 has shift level 0 but %u CPUs. Fixing it up.\n",
+ leaf, tscan->dom_ncpus[TOPO_SMT_DOMAIN]);
+ topology_update_dom(tscan, TOPO_SMT_DOMAIN, sft, tscan->dom_ncpus[TOPO_SMT_DOMAIN]);
+ }
+
+ set_cpu_cap(tscan->c, X86_FEATURE_XTOPOLOGY);
+ return true;
+}
+
+bool cpu_parse_topology_ext(struct topo_scan *tscan)
+{
+ /* Intel: Try leaf 0x1F first. */
+ if (tscan->c->cpuid_level >= 0x1f && parse_topology_leaf(tscan, 0x1f))
+ return true;
+
+ /* AMD: Try leaf 0x80000026 first. */
+ if (tscan->c->extended_cpuid_level >= 0x80000026 && parse_topology_leaf(tscan, 0x80000026))
+ return true;
+
+ /* Intel/AMD: Fall back to leaf 0xB if available */
+ return tscan->c->cpuid_level >= 0x0b && parse_topology_leaf(tscan, 0x0b);
+}
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index 28000743bbb0..42c939827621 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -1,11 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
#include <linux/mm.h>
-#include <linux/init.h>
-#include <asm/processor.h>
+#include <asm/cpufeature.h>
#include <asm/msr.h>
#include "cpu.h"
-static void __cpuinit early_init_transmeta(struct cpuinfo_x86 *c)
+static void early_init_transmeta(struct cpuinfo_x86 *c)
{
u32 xlvl;
@@ -13,11 +15,11 @@ static void __cpuinit early_init_transmeta(struct cpuinfo_x86 *c)
xlvl = cpuid_eax(0x80860000);
if ((xlvl & 0xffff0000) == 0x80860000) {
if (xlvl >= 0x80860001)
- c->x86_capability[2] = cpuid_edx(0x80860001);
+ c->x86_capability[CPUID_8086_0001_EDX] = cpuid_edx(0x80860001);
}
}
-static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
+static void init_transmeta(struct cpuinfo_x86 *c)
{
unsigned int cap_mask, uk, max, dummy;
unsigned int cms_rev1, cms_rev2;
@@ -34,7 +36,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
if (max >= 0x80860001) {
cpuid(0x80860001, &dummy, &cpu_rev, &cpu_freq, &cpu_flags);
if (cpu_rev != 0x02000000) {
- printk(KERN_INFO "CPU: Processor revision %u.%u.%u.%u, %u MHz\n",
+ pr_info("CPU: Processor revision %u.%u.%u.%u, %u MHz\n",
(cpu_rev >> 24) & 0xff,
(cpu_rev >> 16) & 0xff,
(cpu_rev >> 8) & 0xff,
@@ -45,10 +47,10 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
if (max >= 0x80860002) {
cpuid(0x80860002, &new_cpu_rev, &cms_rev1, &cms_rev2, &dummy);
if (cpu_rev == 0x02000000) {
- printk(KERN_INFO "CPU: Processor revision %08X, %u MHz\n",
+ pr_info("CPU: Processor revision %08X, %u MHz\n",
new_cpu_rev, cpu_freq);
}
- printk(KERN_INFO "CPU: Code Morphing Software revision %u.%u.%u-%u-%u\n",
+ pr_info("CPU: Code Morphing Software revision %u.%u.%u-%u-%u\n",
(cms_rev1 >> 24) & 0xff,
(cms_rev1 >> 16) & 0xff,
(cms_rev1 >> 8) & 0xff,
@@ -77,13 +79,13 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
(void *)&cpu_info[56],
(void *)&cpu_info[60]);
cpu_info[64] = '\0';
- printk(KERN_INFO "CPU: %s\n", cpu_info);
+ pr_info("CPU: %s\n", cpu_info);
}
/* Unhide possibly hidden capability flags */
rdmsr(0x80860004, cap_mask, uk);
wrmsr(0x80860004, ~0, uk);
- c->x86_capability[0] = cpuid_edx(0x00000001);
+ c->x86_capability[CPUID_1_EDX] = cpuid_edx(0x00000001);
wrmsr(0x80860004, cap_mask, uk);
/* All Transmeta CPUs have a constant TSC */
@@ -98,7 +100,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
#endif
}
-static const struct cpu_dev __cpuinitconst transmeta_cpu_dev = {
+static const struct cpu_dev transmeta_cpu_dev = {
.c_vendor = "Transmeta",
.c_ident = { "GenuineTMx86", "TransmetaCPU" },
.c_early_init = early_init_transmeta,
diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c
new file mode 100644
index 000000000000..209b5a22d880
--- /dev/null
+++ b/arch/x86/kernel/cpu/tsx.c
@@ -0,0 +1,267 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel Transactional Synchronization Extensions (TSX) control.
+ *
+ * Copyright (C) 2019-2021 Intel Corporation
+ *
+ * Author:
+ * Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
+ */
+
+#include <linux/cpufeature.h>
+
+#include <asm/cmdline.h>
+#include <asm/cpu.h>
+#include <asm/msr.h>
+
+#include "cpu.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) "tsx: " fmt
+
+enum tsx_ctrl_states {
+ TSX_CTRL_AUTO,
+ TSX_CTRL_ENABLE,
+ TSX_CTRL_DISABLE,
+ TSX_CTRL_RTM_ALWAYS_ABORT,
+ TSX_CTRL_NOT_SUPPORTED,
+};
+
+static enum tsx_ctrl_states tsx_ctrl_state __ro_after_init =
+ IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_AUTO) ? TSX_CTRL_AUTO :
+ IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_OFF) ? TSX_CTRL_DISABLE : TSX_CTRL_ENABLE;
+
+static void tsx_disable(void)
+{
+ u64 tsx;
+
+ rdmsrq(MSR_IA32_TSX_CTRL, tsx);
+
+ /* Force all transactions to immediately abort */
+ tsx |= TSX_CTRL_RTM_DISABLE;
+
+ /*
+ * Ensure TSX support is not enumerated in CPUID.
+ * This is visible to userspace and will ensure they
+ * do not waste resources trying TSX transactions that
+ * will always abort.
+ */
+ tsx |= TSX_CTRL_CPUID_CLEAR;
+
+ wrmsrq(MSR_IA32_TSX_CTRL, tsx);
+}
+
+static void tsx_enable(void)
+{
+ u64 tsx;
+
+ rdmsrq(MSR_IA32_TSX_CTRL, tsx);
+
+ /* Enable the RTM feature in the cpu */
+ tsx &= ~TSX_CTRL_RTM_DISABLE;
+
+ /*
+ * Ensure TSX support is enumerated in CPUID.
+ * This is visible to userspace and will ensure they
+ * can enumerate and use the TSX feature.
+ */
+ tsx &= ~TSX_CTRL_CPUID_CLEAR;
+
+ wrmsrq(MSR_IA32_TSX_CTRL, tsx);
+}
+
+static enum tsx_ctrl_states x86_get_tsx_auto_mode(void)
+{
+ if (boot_cpu_has_bug(X86_BUG_TAA))
+ return TSX_CTRL_DISABLE;
+
+ return TSX_CTRL_ENABLE;
+}
+
+/*
+ * Disabling TSX is not a trivial business.
+ *
+ * First of all, there's a CPUID bit: X86_FEATURE_RTM_ALWAYS_ABORT
+ * which says that TSX is practically disabled (all transactions are
+ * aborted by default). When that bit is set, the kernel unconditionally
+ * disables TSX.
+ *
+ * In order to do that, however, it needs to dance a bit:
+ *
+ * 1. The first method to disable it is through MSR_TSX_FORCE_ABORT and
+ * the MSR is present only when *two* CPUID bits are set:
+ *
+ * - X86_FEATURE_RTM_ALWAYS_ABORT
+ * - X86_FEATURE_TSX_FORCE_ABORT
+ *
+ * 2. The second method is for CPUs which do not have the above-mentioned
+ * MSR: those use a different MSR - MSR_IA32_TSX_CTRL and disable TSX
+ * through that one. Those CPUs can also have the initially mentioned
+ * CPUID bit X86_FEATURE_RTM_ALWAYS_ABORT set and for those the same strategy
+ * applies: TSX gets disabled unconditionally.
+ *
+ * When either of the two methods are present, the kernel disables TSX and
+ * clears the respective RTM and HLE feature flags.
+ *
+ * An additional twist in the whole thing presents late microcode loading
+ * which, when done, may cause for the X86_FEATURE_RTM_ALWAYS_ABORT CPUID
+ * bit to be set after the update.
+ *
+ * A subsequent hotplug operation on any logical CPU except the BSP will
+ * cause for the supported CPUID feature bits to get re-detected and, if
+ * RTM and HLE get cleared all of a sudden, but, userspace did consult
+ * them before the update, then funny explosions will happen. Long story
+ * short: the kernel doesn't modify CPUID feature bits after booting.
+ *
+ * That's why, this function's call in init_intel() doesn't clear the
+ * feature flags.
+ */
+static void tsx_clear_cpuid(void)
+{
+ u64 msr;
+
+ /*
+ * MSR_TFA_TSX_CPUID_CLEAR bit is only present when both CPUID
+ * bits RTM_ALWAYS_ABORT and TSX_FORCE_ABORT are present.
+ */
+ if (boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT) &&
+ boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)) {
+ rdmsrq(MSR_TSX_FORCE_ABORT, msr);
+ msr |= MSR_TFA_TSX_CPUID_CLEAR;
+ wrmsrq(MSR_TSX_FORCE_ABORT, msr);
+ } else if (cpu_feature_enabled(X86_FEATURE_MSR_TSX_CTRL)) {
+ rdmsrq(MSR_IA32_TSX_CTRL, msr);
+ msr |= TSX_CTRL_CPUID_CLEAR;
+ wrmsrq(MSR_IA32_TSX_CTRL, msr);
+ }
+}
+
+/*
+ * Disable TSX development mode
+ *
+ * When the microcode released in Feb 2022 is applied, TSX will be disabled by
+ * default on some processors. MSR 0x122 (TSX_CTRL) and MSR 0x123
+ * (IA32_MCU_OPT_CTRL) can be used to re-enable TSX for development, doing so is
+ * not recommended for production deployments. In particular, applying MD_CLEAR
+ * flows for mitigation of the Intel TSX Asynchronous Abort (TAA) transient
+ * execution attack may not be effective on these processors when Intel TSX is
+ * enabled with updated microcode.
+ */
+static void tsx_dev_mode_disable(void)
+{
+ u64 mcu_opt_ctrl;
+
+ /* Check if RTM_ALLOW exists */
+ if (!boot_cpu_has_bug(X86_BUG_TAA) ||
+ !cpu_feature_enabled(X86_FEATURE_MSR_TSX_CTRL) ||
+ !cpu_feature_enabled(X86_FEATURE_SRBDS_CTRL))
+ return;
+
+ rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl);
+
+ if (mcu_opt_ctrl & RTM_ALLOW) {
+ mcu_opt_ctrl &= ~RTM_ALLOW;
+ wrmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl);
+ setup_force_cpu_cap(X86_FEATURE_RTM_ALWAYS_ABORT);
+ }
+}
+
+static int __init tsx_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "on")) {
+ tsx_ctrl_state = TSX_CTRL_ENABLE;
+ } else if (!strcmp(str, "off")) {
+ tsx_ctrl_state = TSX_CTRL_DISABLE;
+ } else if (!strcmp(str, "auto")) {
+ tsx_ctrl_state = TSX_CTRL_AUTO;
+ } else {
+ tsx_ctrl_state = TSX_CTRL_DISABLE;
+ pr_err("invalid option, defaulting to off\n");
+ }
+
+ return 0;
+}
+early_param("tsx", tsx_parse_cmdline);
+
+void __init tsx_init(void)
+{
+ tsx_dev_mode_disable();
+
+ /*
+ * Hardware will always abort a TSX transaction when the CPUID bit
+ * RTM_ALWAYS_ABORT is set. In this case, it is better not to enumerate
+ * CPUID.RTM and CPUID.HLE bits. Clear them here.
+ */
+ if (boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT)) {
+ tsx_ctrl_state = TSX_CTRL_RTM_ALWAYS_ABORT;
+ tsx_clear_cpuid();
+ setup_clear_cpu_cap(X86_FEATURE_RTM);
+ setup_clear_cpu_cap(X86_FEATURE_HLE);
+ return;
+ }
+
+ /*
+ * TSX is controlled via MSR_IA32_TSX_CTRL. However, support for this
+ * MSR is enumerated by ARCH_CAP_TSX_MSR bit in MSR_IA32_ARCH_CAPABILITIES.
+ *
+ * TSX control (aka MSR_IA32_TSX_CTRL) is only available after a
+ * microcode update on CPUs that have their MSR_IA32_ARCH_CAPABILITIES
+ * bit MDS_NO=1. CPUs with MDS_NO=0 are not planned to get
+ * MSR_IA32_TSX_CTRL support even after a microcode update. Thus,
+ * tsx= cmdline requests will do nothing on CPUs without
+ * MSR_IA32_TSX_CTRL support.
+ */
+ if (x86_read_arch_cap_msr() & ARCH_CAP_TSX_CTRL_MSR) {
+ setup_force_cpu_cap(X86_FEATURE_MSR_TSX_CTRL);
+ } else {
+ tsx_ctrl_state = TSX_CTRL_NOT_SUPPORTED;
+ return;
+ }
+
+ if (tsx_ctrl_state == TSX_CTRL_AUTO)
+ tsx_ctrl_state = x86_get_tsx_auto_mode();
+
+ if (tsx_ctrl_state == TSX_CTRL_DISABLE) {
+ tsx_disable();
+
+ /*
+ * tsx_disable() will change the state of the RTM and HLE CPUID
+ * bits. Clear them here since they are now expected to be not
+ * set.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_RTM);
+ setup_clear_cpu_cap(X86_FEATURE_HLE);
+ } else if (tsx_ctrl_state == TSX_CTRL_ENABLE) {
+
+ /*
+ * HW defaults TSX to be enabled at bootup.
+ * We may still need the TSX enable support
+ * during init for special cases like
+ * kexec after TSX is disabled.
+ */
+ tsx_enable();
+
+ /*
+ * tsx_enable() will change the state of the RTM and HLE CPUID
+ * bits. Force them here since they are now expected to be set.
+ */
+ setup_force_cpu_cap(X86_FEATURE_RTM);
+ setup_force_cpu_cap(X86_FEATURE_HLE);
+ }
+}
+
+void tsx_ap_init(void)
+{
+ tsx_dev_mode_disable();
+
+ if (tsx_ctrl_state == TSX_CTRL_ENABLE)
+ tsx_enable();
+ else if (tsx_ctrl_state == TSX_CTRL_DISABLE)
+ tsx_disable();
+ else if (tsx_ctrl_state == TSX_CTRL_RTM_ALWAYS_ABORT)
+ /* See comment over that function for more details. */
+ tsx_clear_cpuid();
+}
diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c
index fd2c37bf7acb..65a58a390fc3 100644
--- a/arch/x86/kernel/cpu/umc.c
+++ b/arch/x86/kernel/cpu/umc.c
@@ -1,5 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
-#include <linux/init.h>
#include <asm/processor.h>
#include "cpu.h"
@@ -8,11 +8,11 @@
* so no special init takes place.
*/
-static const struct cpu_dev __cpuinitconst umc_cpu_dev = {
+static const struct cpu_dev umc_cpu_dev = {
.c_vendor = "UMC",
.c_ident = { "UMC UMC UMC" },
- .c_models = {
- { .vendor = X86_VENDOR_UMC, .family = 4, .model_names =
+ .legacy_models = {
+ { .family = 4, .model_names =
{
[1] = "U5D",
[2] = "U5S",
diff --git a/arch/x86/kernel/cpu/umwait.c b/arch/x86/kernel/cpu/umwait.c
new file mode 100644
index 000000000000..e4a31c536642
--- /dev/null
+++ b/arch/x86/kernel/cpu/umwait.c
@@ -0,0 +1,246 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/syscore_ops.h>
+#include <linux/suspend.h>
+#include <linux/cpu.h>
+
+#include <asm/msr.h>
+#include <asm/mwait.h>
+
+#define UMWAIT_C02_ENABLE 0
+
+#define UMWAIT_CTRL_VAL(max_time, c02_disable) \
+ (((max_time) & MSR_IA32_UMWAIT_CONTROL_TIME_MASK) | \
+ ((c02_disable) & MSR_IA32_UMWAIT_CONTROL_C02_DISABLE))
+
+/*
+ * Cache IA32_UMWAIT_CONTROL MSR. This is a systemwide control. By default,
+ * umwait max time is 100000 in TSC-quanta and C0.2 is enabled
+ */
+static u32 umwait_control_cached = UMWAIT_CTRL_VAL(100000, UMWAIT_C02_ENABLE);
+
+/*
+ * Cache the original IA32_UMWAIT_CONTROL MSR value which is configured by
+ * hardware or BIOS before kernel boot.
+ */
+static u32 orig_umwait_control_cached __ro_after_init;
+
+/*
+ * Serialize access to umwait_control_cached and IA32_UMWAIT_CONTROL MSR in
+ * the sysfs write functions.
+ */
+static DEFINE_MUTEX(umwait_lock);
+
+static void umwait_update_control_msr(void * unused)
+{
+ lockdep_assert_irqs_disabled();
+ wrmsrq(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached));
+}
+
+/*
+ * The CPU hotplug callback sets the control MSR to the global control
+ * value.
+ *
+ * Disable interrupts so the read of umwait_control_cached and the WRMSR
+ * are protected against a concurrent sysfs write. Otherwise the sysfs
+ * write could update the cached value after it had been read on this CPU
+ * and issue the IPI before the old value had been written. The IPI would
+ * interrupt, write the new value and after return from IPI the previous
+ * value would be written by this CPU.
+ *
+ * With interrupts disabled the upcoming CPU either sees the new control
+ * value or the IPI is updating this CPU to the new control value after
+ * interrupts have been reenabled.
+ */
+static int umwait_cpu_online(unsigned int cpu)
+{
+ local_irq_disable();
+ umwait_update_control_msr(NULL);
+ local_irq_enable();
+ return 0;
+}
+
+/*
+ * The CPU hotplug callback sets the control MSR to the original control
+ * value.
+ */
+static int umwait_cpu_offline(unsigned int cpu)
+{
+ /*
+ * This code is protected by the CPU hotplug already and
+ * orig_umwait_control_cached is never changed after it caches
+ * the original control MSR value in umwait_init(). So there
+ * is no race condition here.
+ */
+ wrmsrq(MSR_IA32_UMWAIT_CONTROL, orig_umwait_control_cached);
+
+ return 0;
+}
+
+/*
+ * On resume, restore IA32_UMWAIT_CONTROL MSR on the boot processor which
+ * is the only active CPU at this time. The MSR is set up on the APs via the
+ * CPU hotplug callback.
+ *
+ * This function is invoked on resume from suspend and hibernation. On
+ * resume from suspend the restore should be not required, but we neither
+ * trust the firmware nor does it matter if the same value is written
+ * again.
+ */
+static void umwait_syscore_resume(void *data)
+{
+ umwait_update_control_msr(NULL);
+}
+
+static const struct syscore_ops umwait_syscore_ops = {
+ .resume = umwait_syscore_resume,
+};
+
+static struct syscore umwait_syscore = {
+ .ops = &umwait_syscore_ops,
+};
+
+/* sysfs interface */
+
+/*
+ * When bit 0 in IA32_UMWAIT_CONTROL MSR is 1, C0.2 is disabled.
+ * Otherwise, C0.2 is enabled.
+ */
+static inline bool umwait_ctrl_c02_enabled(u32 ctrl)
+{
+ return !(ctrl & MSR_IA32_UMWAIT_CONTROL_C02_DISABLE);
+}
+
+static inline u32 umwait_ctrl_max_time(u32 ctrl)
+{
+ return ctrl & MSR_IA32_UMWAIT_CONTROL_TIME_MASK;
+}
+
+static inline void umwait_update_control(u32 maxtime, bool c02_enable)
+{
+ u32 ctrl = maxtime & MSR_IA32_UMWAIT_CONTROL_TIME_MASK;
+
+ if (!c02_enable)
+ ctrl |= MSR_IA32_UMWAIT_CONTROL_C02_DISABLE;
+
+ WRITE_ONCE(umwait_control_cached, ctrl);
+ /* Propagate to all CPUs */
+ on_each_cpu(umwait_update_control_msr, NULL, 1);
+}
+
+static ssize_t
+enable_c02_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ u32 ctrl = READ_ONCE(umwait_control_cached);
+
+ return sprintf(buf, "%d\n", umwait_ctrl_c02_enabled(ctrl));
+}
+
+static ssize_t enable_c02_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ bool c02_enable;
+ u32 ctrl;
+ int ret;
+
+ ret = kstrtobool(buf, &c02_enable);
+ if (ret)
+ return ret;
+
+ mutex_lock(&umwait_lock);
+
+ ctrl = READ_ONCE(umwait_control_cached);
+ if (c02_enable != umwait_ctrl_c02_enabled(ctrl))
+ umwait_update_control(ctrl, c02_enable);
+
+ mutex_unlock(&umwait_lock);
+
+ return count;
+}
+static DEVICE_ATTR_RW(enable_c02);
+
+static ssize_t
+max_time_show(struct device *kobj, struct device_attribute *attr, char *buf)
+{
+ u32 ctrl = READ_ONCE(umwait_control_cached);
+
+ return sprintf(buf, "%u\n", umwait_ctrl_max_time(ctrl));
+}
+
+static ssize_t max_time_store(struct device *kobj,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ u32 max_time, ctrl;
+ int ret;
+
+ ret = kstrtou32(buf, 0, &max_time);
+ if (ret)
+ return ret;
+
+ /* bits[1:0] must be zero */
+ if (max_time & ~MSR_IA32_UMWAIT_CONTROL_TIME_MASK)
+ return -EINVAL;
+
+ mutex_lock(&umwait_lock);
+
+ ctrl = READ_ONCE(umwait_control_cached);
+ if (max_time != umwait_ctrl_max_time(ctrl))
+ umwait_update_control(max_time, umwait_ctrl_c02_enabled(ctrl));
+
+ mutex_unlock(&umwait_lock);
+
+ return count;
+}
+static DEVICE_ATTR_RW(max_time);
+
+static struct attribute *umwait_attrs[] = {
+ &dev_attr_enable_c02.attr,
+ &dev_attr_max_time.attr,
+ NULL
+};
+
+static struct attribute_group umwait_attr_group = {
+ .attrs = umwait_attrs,
+ .name = "umwait_control",
+};
+
+static int __init umwait_init(void)
+{
+ struct device *dev;
+ int ret;
+
+ if (!boot_cpu_has(X86_FEATURE_WAITPKG))
+ return -ENODEV;
+
+ /*
+ * Cache the original control MSR value before the control MSR is
+ * changed. This is the only place where orig_umwait_control_cached
+ * is modified.
+ */
+ rdmsrq(MSR_IA32_UMWAIT_CONTROL, orig_umwait_control_cached);
+
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "umwait:online",
+ umwait_cpu_online, umwait_cpu_offline);
+ if (ret < 0) {
+ /*
+ * On failure, the control MSR on all CPUs has the
+ * original control value.
+ */
+ return ret;
+ }
+
+ register_syscore(&umwait_syscore);
+
+ /*
+ * Add umwait control interface. Ignore failure, so at least the
+ * default values are set up in case the machine manages to boot.
+ */
+ dev = bus_get_dev_root(&cpu_subsys);
+ if (dev) {
+ ret = sysfs_create_group(&dev->kobj, &umwait_attr_group);
+ put_device(dev);
+ }
+ return ret;
+}
+device_initcall(umwait_init);
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 03a36321ec54..cb3f900c46fc 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -22,126 +22,572 @@
*/
#include <linux/dmi.h>
-#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/clocksource.h>
+#include <linux/cpu.h>
+#include <linux/efi.h>
+#include <linux/reboot.h>
+#include <linux/static_call.h>
#include <asm/div64.h>
#include <asm/x86_init.h>
#include <asm/hypervisor.h>
+#include <asm/timer.h>
+#include <asm/apic.h>
+#include <asm/vmware.h>
+#include <asm/svm.h>
-#define CPUID_VMWARE_INFO_LEAF 0x40000000
-#define VMWARE_HYPERVISOR_MAGIC 0x564D5868
-#define VMWARE_HYPERVISOR_PORT 0x5658
+#undef pr_fmt
+#define pr_fmt(fmt) "vmware: " fmt
-#define VMWARE_PORT_CMD_GETVERSION 10
-#define VMWARE_PORT_CMD_GETHZ 45
-#define VMWARE_PORT_CMD_GETVCPU_INFO 68
-#define VMWARE_PORT_CMD_LEGACY_X2APIC 3
-#define VMWARE_PORT_CMD_VCPU_RESERVED 31
+#define CPUID_VMWARE_INFO_LEAF 0x40000000
+#define CPUID_VMWARE_FEATURES_LEAF 0x40000010
-#define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \
- __asm__("inl (%%dx)" : \
- "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \
- "0"(VMWARE_HYPERVISOR_MAGIC), \
- "1"(VMWARE_PORT_CMD_##cmd), \
- "2"(VMWARE_HYPERVISOR_PORT), "3"(UINT_MAX) : \
- "memory");
+#define GETVCPU_INFO_LEGACY_X2APIC BIT(3)
+#define GETVCPU_INFO_VCPU_RESERVED BIT(31)
+
+#define STEALCLOCK_NOT_AVAILABLE (-1)
+#define STEALCLOCK_DISABLED 0
+#define STEALCLOCK_ENABLED 1
+
+struct vmware_steal_time {
+ union {
+ u64 clock; /* stolen time counter in units of vtsc */
+ struct {
+ /* only for little-endian */
+ u32 clock_low;
+ u32 clock_high;
+ };
+ };
+ u64 reserved[7];
+};
+
+static unsigned long vmware_tsc_khz __ro_after_init;
+static u8 vmware_hypercall_mode __ro_after_init;
+
+unsigned long vmware_hypercall_slow(unsigned long cmd,
+ unsigned long in1, unsigned long in3,
+ unsigned long in4, unsigned long in5,
+ u32 *out1, u32 *out2, u32 *out3,
+ u32 *out4, u32 *out5)
+{
+ unsigned long out0, rbx, rcx, rdx, rsi, rdi;
+
+ switch (vmware_hypercall_mode) {
+ case CPUID_VMWARE_FEATURES_ECX_VMCALL:
+ asm_inline volatile ("vmcall"
+ : "=a" (out0), "=b" (rbx), "=c" (rcx),
+ "=d" (rdx), "=S" (rsi), "=D" (rdi)
+ : "a" (VMWARE_HYPERVISOR_MAGIC),
+ "b" (in1),
+ "c" (cmd),
+ "d" (in3),
+ "S" (in4),
+ "D" (in5)
+ : "cc", "memory");
+ break;
+ case CPUID_VMWARE_FEATURES_ECX_VMMCALL:
+ asm_inline volatile ("vmmcall"
+ : "=a" (out0), "=b" (rbx), "=c" (rcx),
+ "=d" (rdx), "=S" (rsi), "=D" (rdi)
+ : "a" (VMWARE_HYPERVISOR_MAGIC),
+ "b" (in1),
+ "c" (cmd),
+ "d" (in3),
+ "S" (in4),
+ "D" (in5)
+ : "cc", "memory");
+ break;
+ default:
+ asm_inline volatile ("movw %[port], %%dx; inl (%%dx), %%eax"
+ : "=a" (out0), "=b" (rbx), "=c" (rcx),
+ "=d" (rdx), "=S" (rsi), "=D" (rdi)
+ : [port] "i" (VMWARE_HYPERVISOR_PORT),
+ "a" (VMWARE_HYPERVISOR_MAGIC),
+ "b" (in1),
+ "c" (cmd),
+ "d" (in3),
+ "S" (in4),
+ "D" (in5)
+ : "cc", "memory");
+ break;
+ }
+
+ if (out1)
+ *out1 = rbx;
+ if (out2)
+ *out2 = rcx;
+ if (out3)
+ *out3 = rdx;
+ if (out4)
+ *out4 = rsi;
+ if (out5)
+ *out5 = rdi;
+
+ return out0;
+}
static inline int __vmware_platform(void)
{
- uint32_t eax, ebx, ecx, edx;
- VMWARE_PORT(GETVERSION, eax, ebx, ecx, edx);
- return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC;
+ u32 eax, ebx, ecx;
+
+ eax = vmware_hypercall3(VMWARE_CMD_GETVERSION, 0, &ebx, &ecx);
+ return eax != UINT_MAX && ebx == VMWARE_HYPERVISOR_MAGIC;
}
static unsigned long vmware_get_tsc_khz(void)
{
- uint64_t tsc_hz, lpj;
- uint32_t eax, ebx, ecx, edx;
+ return vmware_tsc_khz;
+}
+
+#ifdef CONFIG_PARAVIRT
+static struct cyc2ns_data vmware_cyc2ns __ro_after_init;
+static bool vmw_sched_clock __initdata = true;
+static DEFINE_PER_CPU_DECRYPTED(struct vmware_steal_time, vmw_steal_time) __aligned(64);
+static bool has_steal_clock;
+static bool steal_acc __initdata = true; /* steal time accounting */
+
+static __init int setup_vmw_sched_clock(char *s)
+{
+ vmw_sched_clock = false;
+ return 0;
+}
+early_param("no-vmw-sched-clock", setup_vmw_sched_clock);
+
+static __init int parse_no_stealacc(char *arg)
+{
+ steal_acc = false;
+ return 0;
+}
+early_param("no-steal-acc", parse_no_stealacc);
+
+static noinstr u64 vmware_sched_clock(void)
+{
+ unsigned long long ns;
+
+ ns = mul_u64_u32_shr(rdtsc(), vmware_cyc2ns.cyc2ns_mul,
+ vmware_cyc2ns.cyc2ns_shift);
+ ns -= vmware_cyc2ns.cyc2ns_offset;
+ return ns;
+}
+
+static void __init vmware_cyc2ns_setup(void)
+{
+ struct cyc2ns_data *d = &vmware_cyc2ns;
+ unsigned long long tsc_now = rdtsc();
+
+ clocks_calc_mult_shift(&d->cyc2ns_mul, &d->cyc2ns_shift,
+ vmware_tsc_khz, NSEC_PER_MSEC, 0);
+ d->cyc2ns_offset = mul_u64_u32_shr(tsc_now, d->cyc2ns_mul,
+ d->cyc2ns_shift);
+
+ pr_info("using clock offset of %llu ns\n", d->cyc2ns_offset);
+}
+
+static int vmware_cmd_stealclock(u32 addr_hi, u32 addr_lo)
+{
+ u32 info;
+
+ return vmware_hypercall5(VMWARE_CMD_STEALCLOCK, 0, 0, addr_hi, addr_lo,
+ &info);
+}
+
+static bool stealclock_enable(phys_addr_t pa)
+{
+ return vmware_cmd_stealclock(upper_32_bits(pa),
+ lower_32_bits(pa)) == STEALCLOCK_ENABLED;
+}
+
+static int __stealclock_disable(void)
+{
+ return vmware_cmd_stealclock(0, 1);
+}
+
+static void stealclock_disable(void)
+{
+ __stealclock_disable();
+}
+
+static bool vmware_is_stealclock_available(void)
+{
+ return __stealclock_disable() != STEALCLOCK_NOT_AVAILABLE;
+}
+
+/**
+ * vmware_steal_clock() - read the per-cpu steal clock
+ * @cpu: the cpu number whose steal clock we want to read
+ *
+ * The function reads the steal clock if we are on a 64-bit system, otherwise
+ * reads it in parts, checking that the high part didn't change in the
+ * meantime.
+ *
+ * Return:
+ * The steal clock reading in ns.
+ */
+static u64 vmware_steal_clock(int cpu)
+{
+ struct vmware_steal_time *steal = &per_cpu(vmw_steal_time, cpu);
+ u64 clock;
+
+ if (IS_ENABLED(CONFIG_64BIT))
+ clock = READ_ONCE(steal->clock);
+ else {
+ u32 initial_high, low, high;
+
+ do {
+ initial_high = READ_ONCE(steal->clock_high);
+ /* Do not reorder initial_high and high readings */
+ virt_rmb();
+ low = READ_ONCE(steal->clock_low);
+ /* Keep low reading in between */
+ virt_rmb();
+ high = READ_ONCE(steal->clock_high);
+ } while (initial_high != high);
+
+ clock = ((u64)high << 32) | low;
+ }
+
+ return mul_u64_u32_shr(clock, vmware_cyc2ns.cyc2ns_mul,
+ vmware_cyc2ns.cyc2ns_shift);
+}
+
+static void vmware_register_steal_time(void)
+{
+ int cpu = smp_processor_id();
+ struct vmware_steal_time *st = &per_cpu(vmw_steal_time, cpu);
+
+ if (!has_steal_clock)
+ return;
+
+ if (!stealclock_enable(slow_virt_to_phys(st))) {
+ has_steal_clock = false;
+ return;
+ }
+
+ pr_info("vmware-stealtime: cpu %d, pa %llx\n",
+ cpu, (unsigned long long) slow_virt_to_phys(st));
+}
+
+static void vmware_disable_steal_time(void)
+{
+ if (!has_steal_clock)
+ return;
+
+ stealclock_disable();
+}
+
+static void vmware_guest_cpu_init(void)
+{
+ if (has_steal_clock)
+ vmware_register_steal_time();
+}
+
+static void vmware_pv_guest_cpu_reboot(void *unused)
+{
+ vmware_disable_steal_time();
+}
+
+static int vmware_pv_reboot_notify(struct notifier_block *nb,
+ unsigned long code, void *unused)
+{
+ if (code == SYS_RESTART)
+ on_each_cpu(vmware_pv_guest_cpu_reboot, NULL, 1);
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block vmware_pv_reboot_nb = {
+ .notifier_call = vmware_pv_reboot_notify,
+};
+
+#ifdef CONFIG_SMP
+static void __init vmware_smp_prepare_boot_cpu(void)
+{
+ vmware_guest_cpu_init();
+ native_smp_prepare_boot_cpu();
+}
- VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
+static int vmware_cpu_online(unsigned int cpu)
+{
+ local_irq_disable();
+ vmware_guest_cpu_init();
+ local_irq_enable();
+ return 0;
+}
- tsc_hz = eax | (((uint64_t)ebx) << 32);
- do_div(tsc_hz, 1000);
- BUG_ON(tsc_hz >> 32);
- printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n",
- (unsigned long) tsc_hz / 1000,
- (unsigned long) tsc_hz % 1000);
+static int vmware_cpu_down_prepare(unsigned int cpu)
+{
+ local_irq_disable();
+ vmware_disable_steal_time();
+ local_irq_enable();
+ return 0;
+}
+#endif
- if (!preset_lpj) {
- lpj = ((u64)tsc_hz * 1000);
- do_div(lpj, HZ);
- preset_lpj = lpj;
+static __init int activate_jump_labels(void)
+{
+ if (has_steal_clock) {
+ static_key_slow_inc(&paravirt_steal_enabled);
+ if (steal_acc)
+ static_key_slow_inc(&paravirt_steal_rq_enabled);
}
- return tsc_hz;
+ return 0;
+}
+arch_initcall(activate_jump_labels);
+
+static void __init vmware_paravirt_ops_setup(void)
+{
+ pv_info.name = "VMware hypervisor";
+ pv_ops.cpu.io_delay = paravirt_nop;
+
+ if (vmware_tsc_khz == 0)
+ return;
+
+ vmware_cyc2ns_setup();
+
+ if (vmw_sched_clock)
+ paravirt_set_sched_clock(vmware_sched_clock);
+
+ if (vmware_is_stealclock_available()) {
+ has_steal_clock = true;
+ static_call_update(pv_steal_clock, vmware_steal_clock);
+
+ /* We use reboot notifier only to disable steal clock */
+ register_reboot_notifier(&vmware_pv_reboot_nb);
+
+#ifdef CONFIG_SMP
+ smp_ops.smp_prepare_boot_cpu =
+ vmware_smp_prepare_boot_cpu;
+ if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+ "x86/vmware:online",
+ vmware_cpu_online,
+ vmware_cpu_down_prepare) < 0)
+ pr_err("vmware_guest: Failed to install cpu hotplug callbacks\n");
+#else
+ vmware_guest_cpu_init();
+#endif
+ }
+}
+#else
+#define vmware_paravirt_ops_setup() do {} while (0)
+#endif
+
+/*
+ * VMware hypervisor takes care of exporting a reliable TSC to the guest.
+ * Still, due to timing difference when running on virtual cpus, the TSC can
+ * be marked as unstable in some cases. For example, the TSC sync check at
+ * bootup can fail due to a marginal offset between vcpus' TSCs (though the
+ * TSCs do not drift from each other). Also, the ACPI PM timer clocksource
+ * is not suitable as a watchdog when running on a hypervisor because the
+ * kernel may miss a wrap of the counter if the vcpu is descheduled for a
+ * long time. To skip these checks at runtime we set these capability bits,
+ * so that the kernel could just trust the hypervisor with providing a
+ * reliable virtual TSC that is suitable for timekeeping.
+ */
+static void __init vmware_set_capabilities(void)
+{
+ setup_force_cpu_cap(X86_FEATURE_CONSTANT_TSC);
+ setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+ if (vmware_tsc_khz)
+ setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
+ if (vmware_hypercall_mode == CPUID_VMWARE_FEATURES_ECX_VMCALL)
+ setup_force_cpu_cap(X86_FEATURE_VMCALL);
+ else if (vmware_hypercall_mode == CPUID_VMWARE_FEATURES_ECX_VMMCALL)
+ setup_force_cpu_cap(X86_FEATURE_VMW_VMMCALL);
}
static void __init vmware_platform_setup(void)
{
- uint32_t eax, ebx, ecx, edx;
+ u32 eax, ebx, ecx;
+ u64 lpj, tsc_khz;
+
+ eax = vmware_hypercall3(VMWARE_CMD_GETHZ, UINT_MAX, &ebx, &ecx);
+
+ if (ebx != UINT_MAX) {
+ lpj = tsc_khz = eax | (((u64)ebx) << 32);
+ do_div(tsc_khz, 1000);
+ WARN_ON(tsc_khz >> 32);
+ pr_info("TSC freq read from hypervisor : %lu.%03lu MHz\n",
+ (unsigned long) tsc_khz / 1000,
+ (unsigned long) tsc_khz % 1000);
- VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
+ if (!preset_lpj) {
+ do_div(lpj, HZ);
+ preset_lpj = lpj;
+ }
- if (ebx != UINT_MAX)
+ vmware_tsc_khz = tsc_khz;
x86_platform.calibrate_tsc = vmware_get_tsc_khz;
- else
- printk(KERN_WARNING
- "Failed to get TSC freq from the hypervisor\n");
+ x86_platform.calibrate_cpu = vmware_get_tsc_khz;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ /* Skip lapic calibration since we know the bus frequency. */
+ lapic_timer_period = ecx / HZ;
+ pr_info("Host bus clock speed read from hypervisor : %u Hz\n",
+ ecx);
+#endif
+ } else {
+ pr_warn("Failed to get TSC freq from the hypervisor\n");
+ }
+
+ if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP) && !efi_enabled(EFI_BOOT))
+ x86_init.mpparse.find_mptable = mpparse_find_mptable;
+
+ vmware_paravirt_ops_setup();
+
+#ifdef CONFIG_X86_IO_APIC
+ no_timer_check = 1;
+#endif
+
+ vmware_set_capabilities();
+}
+
+static u8 __init vmware_select_hypercall(void)
+{
+ int eax, ebx, ecx, edx;
+
+ cpuid(CPUID_VMWARE_FEATURES_LEAF, &eax, &ebx, &ecx, &edx);
+ return (ecx & (CPUID_VMWARE_FEATURES_ECX_VMMCALL |
+ CPUID_VMWARE_FEATURES_ECX_VMCALL));
}
/*
* While checking the dmi string information, just checking the product
* serial key should be enough, as this will always have a VMware
* specific string when running under VMware hypervisor.
+ * If !boot_cpu_has(X86_FEATURE_HYPERVISOR), vmware_hypercall_mode
+ * intentionally defaults to 0.
*/
-static bool __init vmware_platform(void)
+static u32 __init vmware_platform(void)
{
- if (cpu_has_hypervisor) {
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
unsigned int eax;
unsigned int hyper_vendor_id[3];
cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &hyper_vendor_id[0],
&hyper_vendor_id[1], &hyper_vendor_id[2]);
- if (!memcmp(hyper_vendor_id, "VMwareVMware", 12))
- return true;
+ if (!memcmp(hyper_vendor_id, "VMwareVMware", 12)) {
+ if (eax >= CPUID_VMWARE_FEATURES_LEAF)
+ vmware_hypercall_mode =
+ vmware_select_hypercall();
+
+ pr_info("hypercall mode: 0x%02x\n",
+ (unsigned int) vmware_hypercall_mode);
+
+ return CPUID_VMWARE_INFO_LEAF;
+ }
} else if (dmi_available && dmi_name_in_serial("VMware") &&
__vmware_platform())
- return true;
+ return 1;
+
+ return 0;
+}
- return false;
+/* Checks if hypervisor supports x2apic without VT-D interrupt remapping. */
+static bool __init vmware_legacy_x2apic_available(void)
+{
+ u32 eax;
+
+ eax = vmware_hypercall1(VMWARE_CMD_GETVCPU_INFO, 0);
+ return !(eax & GETVCPU_INFO_VCPU_RESERVED) &&
+ (eax & GETVCPU_INFO_LEGACY_X2APIC);
}
+#ifdef CONFIG_INTEL_TDX_GUEST
/*
- * VMware hypervisor takes care of exporting a reliable TSC to the guest.
- * Still, due to timing difference when running on virtual cpus, the TSC can
- * be marked as unstable in some cases. For example, the TSC sync check at
- * bootup can fail due to a marginal offset between vcpus' TSCs (though the
- * TSCs do not drift from each other). Also, the ACPI PM timer clocksource
- * is not suitable as a watchdog when running on a hypervisor because the
- * kernel may miss a wrap of the counter if the vcpu is descheduled for a
- * long time. To skip these checks at runtime we set these capability bits,
- * so that the kernel could just trust the hypervisor with providing a
- * reliable virtual TSC that is suitable for timekeeping.
+ * TDCALL[TDG.VP.VMCALL] uses %rax (arg0) and %rcx (arg2). Therefore,
+ * we remap those registers to %r12 and %r13, respectively.
*/
-static void __cpuinit vmware_set_cpu_features(struct cpuinfo_x86 *c)
+unsigned long vmware_tdx_hypercall(unsigned long cmd,
+ unsigned long in1, unsigned long in3,
+ unsigned long in4, unsigned long in5,
+ u32 *out1, u32 *out2, u32 *out3,
+ u32 *out4, u32 *out5)
{
- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
- set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE);
+ struct tdx_module_args args = {};
+
+ if (!hypervisor_is_type(X86_HYPER_VMWARE)) {
+ pr_warn_once("Incorrect usage\n");
+ return ULONG_MAX;
+ }
+
+ if (cmd & ~VMWARE_CMD_MASK) {
+ pr_warn_once("Out of range command %lx\n", cmd);
+ return ULONG_MAX;
+ }
+
+ args.rbx = in1;
+ args.rdx = in3;
+ args.rsi = in4;
+ args.rdi = in5;
+ args.r10 = VMWARE_TDX_VENDOR_LEAF;
+ args.r11 = VMWARE_TDX_HCALL_FUNC;
+ args.r12 = VMWARE_HYPERVISOR_MAGIC;
+ args.r13 = cmd;
+ /* CPL */
+ args.r15 = 0;
+
+ __tdx_hypercall(&args);
+
+ if (out1)
+ *out1 = args.rbx;
+ if (out2)
+ *out2 = args.r13;
+ if (out3)
+ *out3 = args.rdx;
+ if (out4)
+ *out4 = args.rsi;
+ if (out5)
+ *out5 = args.rdi;
+
+ return args.r12;
}
+EXPORT_SYMBOL_GPL(vmware_tdx_hypercall);
+#endif
-/* Checks if hypervisor supports x2apic without VT-D interrupt remapping. */
-static bool __init vmware_legacy_x2apic_available(void)
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+static void vmware_sev_es_hcall_prepare(struct ghcb *ghcb,
+ struct pt_regs *regs)
{
- uint32_t eax, ebx, ecx, edx;
- VMWARE_PORT(GETVCPU_INFO, eax, ebx, ecx, edx);
- return (eax & (1 << VMWARE_PORT_CMD_VCPU_RESERVED)) == 0 &&
- (eax & (1 << VMWARE_PORT_CMD_LEGACY_X2APIC)) != 0;
+ /* Copy VMWARE specific Hypercall parameters to the GHCB */
+ ghcb_set_rip(ghcb, regs->ip);
+ ghcb_set_rbx(ghcb, regs->bx);
+ ghcb_set_rcx(ghcb, regs->cx);
+ ghcb_set_rdx(ghcb, regs->dx);
+ ghcb_set_rsi(ghcb, regs->si);
+ ghcb_set_rdi(ghcb, regs->di);
+ ghcb_set_rbp(ghcb, regs->bp);
+}
+
+static bool vmware_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs)
+{
+ if (!(ghcb_rbx_is_valid(ghcb) &&
+ ghcb_rcx_is_valid(ghcb) &&
+ ghcb_rdx_is_valid(ghcb) &&
+ ghcb_rsi_is_valid(ghcb) &&
+ ghcb_rdi_is_valid(ghcb) &&
+ ghcb_rbp_is_valid(ghcb)))
+ return false;
+
+ regs->bx = ghcb_get_rbx(ghcb);
+ regs->cx = ghcb_get_rcx(ghcb);
+ regs->dx = ghcb_get_rdx(ghcb);
+ regs->si = ghcb_get_rsi(ghcb);
+ regs->di = ghcb_get_rdi(ghcb);
+ regs->bp = ghcb_get_rbp(ghcb);
+
+ return true;
}
+#endif
-const __refconst struct hypervisor_x86 x86_hyper_vmware = {
- .name = "VMware",
- .detect = vmware_platform,
- .set_cpu_features = vmware_set_cpu_features,
- .init_platform = vmware_platform_setup,
- .x2apic_available = vmware_legacy_x2apic_available,
+const __initconst struct hypervisor_x86 x86_hyper_vmware = {
+ .name = "VMware",
+ .detect = vmware_platform,
+ .type = X86_HYPER_VMWARE,
+ .init.init_platform = vmware_platform_setup,
+ .init.x2apic_available = vmware_legacy_x2apic_available,
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ .runtime.sev_es_hcall_prepare = vmware_sev_es_hcall_prepare,
+ .runtime.sev_es_hcall_finish = vmware_sev_es_hcall_finish,
+#endif
};
-EXPORT_SYMBOL(x86_hyper_vmware);
diff --git a/arch/x86/kernel/cpu/vortex.c b/arch/x86/kernel/cpu/vortex.c
new file mode 100644
index 000000000000..e2685470ba94
--- /dev/null
+++ b/arch/x86/kernel/cpu/vortex.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <asm/processor.h>
+#include "cpu.h"
+
+/*
+ * No special init required for Vortex processors.
+ */
+
+static const struct cpu_dev vortex_cpu_dev = {
+ .c_vendor = "Vortex",
+ .c_ident = { "Vortex86 SoC" },
+ .legacy_models = {
+ {
+ .family = 5,
+ .model_names = {
+ [2] = "Vortex86DX",
+ [8] = "Vortex86MX",
+ },
+ },
+ {
+ .family = 6,
+ .model_names = {
+ /*
+ * Both the Vortex86EX and the Vortex86EX2
+ * have the same family and model id.
+ *
+ * However, the -EX2 supports the product name
+ * CPUID call, so this name will only be used
+ * for the -EX, which does not.
+ */
+ [0] = "Vortex86EX",
+ },
+ },
+ },
+ .c_x86_vendor = X86_VENDOR_VORTEX,
+};
+
+cpu_dev_register(vortex_cpu_dev);
diff --git a/arch/x86/kernel/cpu/zhaoxin.c b/arch/x86/kernel/cpu/zhaoxin.c
new file mode 100644
index 000000000000..89b1c8a70fe8
--- /dev/null
+++ b/arch/x86/kernel/cpu/zhaoxin.c
@@ -0,0 +1,116 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
+
+#include <asm/cpu.h>
+#include <asm/cpufeature.h>
+#include <asm/msr.h>
+
+#include "cpu.h"
+
+#define MSR_ZHAOXIN_FCR57 0x00001257
+
+#define ACE_PRESENT (1 << 6)
+#define ACE_ENABLED (1 << 7)
+#define ACE_FCR (1 << 7) /* MSR_ZHAOXIN_FCR */
+
+#define RNG_PRESENT (1 << 2)
+#define RNG_ENABLED (1 << 3)
+#define RNG_ENABLE (1 << 8) /* MSR_ZHAOXIN_RNG */
+
+static void init_zhaoxin_cap(struct cpuinfo_x86 *c)
+{
+ u32 lo, hi;
+
+ /* Test for Extended Feature Flags presence */
+ if (cpuid_eax(0xC0000000) >= 0xC0000001) {
+ u32 tmp = cpuid_edx(0xC0000001);
+
+ /* Enable ACE unit, if present and disabled */
+ if ((tmp & (ACE_PRESENT | ACE_ENABLED)) == ACE_PRESENT) {
+ rdmsr(MSR_ZHAOXIN_FCR57, lo, hi);
+ /* Enable ACE unit */
+ lo |= ACE_FCR;
+ wrmsr(MSR_ZHAOXIN_FCR57, lo, hi);
+ pr_info("CPU: Enabled ACE h/w crypto\n");
+ }
+
+ /* Enable RNG unit, if present and disabled */
+ if ((tmp & (RNG_PRESENT | RNG_ENABLED)) == RNG_PRESENT) {
+ rdmsr(MSR_ZHAOXIN_FCR57, lo, hi);
+ /* Enable RNG unit */
+ lo |= RNG_ENABLE;
+ wrmsr(MSR_ZHAOXIN_FCR57, lo, hi);
+ pr_info("CPU: Enabled h/w RNG\n");
+ }
+
+ /*
+ * Store Extended Feature Flags as word 5 of the CPU
+ * capability bit array
+ */
+ c->x86_capability[CPUID_C000_0001_EDX] = cpuid_edx(0xC0000001);
+ }
+
+ if (c->x86 >= 0x6)
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+}
+
+static void early_init_zhaoxin(struct cpuinfo_x86 *c)
+{
+ if (c->x86 >= 0x6)
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+#ifdef CONFIG_X86_64
+ set_cpu_cap(c, X86_FEATURE_SYSENTER32);
+#endif
+ if (c->x86_power & (1 << 8)) {
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+ set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
+ }
+}
+
+static void init_zhaoxin(struct cpuinfo_x86 *c)
+{
+ early_init_zhaoxin(c);
+ init_intel_cacheinfo(c);
+
+ if (c->cpuid_level > 9) {
+ unsigned int eax = cpuid_eax(10);
+
+ /*
+ * Check for version and the number of counters
+ * Version(eax[7:0]) can't be 0;
+ * Counters(eax[15:8]) should be greater than 1;
+ */
+ if ((eax & 0xff) && (((eax >> 8) & 0xff) > 1))
+ set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
+ }
+
+ if (c->x86 >= 0x6)
+ init_zhaoxin_cap(c);
+#ifdef CONFIG_X86_64
+ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
+#endif
+
+ init_ia32_feat_ctl(c);
+}
+
+#ifdef CONFIG_X86_32
+static unsigned int
+zhaoxin_size_cache(struct cpuinfo_x86 *c, unsigned int size)
+{
+ return size;
+}
+#endif
+
+static const struct cpu_dev zhaoxin_cpu_dev = {
+ .c_vendor = "zhaoxin",
+ .c_ident = { " Shanghai " },
+ .c_early_init = early_init_zhaoxin,
+ .c_init = init_zhaoxin,
+#ifdef CONFIG_X86_32
+ .legacy_cache_size = zhaoxin_size_cache,
+#endif
+ .c_x86_vendor = X86_VENDOR_ZHAOXIN,
+};
+
+cpu_dev_register(zhaoxin_cpu_dev);