Diffstat (limited to 'arch/x86/kernel/cpu')
23 files changed, 733 insertions, 218 deletions
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index f769d6d08b43..380753b14cab 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -956,7 +956,7 @@ static void init_amd(struct cpuinfo_x86 *c) init_amd_cacheinfo(c); - if (cpu_has(c, X86_FEATURE_XMM2)) { + if (!cpu_has(c, X86_FEATURE_LFENCE_RDTSC) && cpu_has(c, X86_FEATURE_XMM2)) { /* * Use LFENCE for execution serialization. On families which * don't have that MSR, LFENCE is already serializing. @@ -1158,24 +1158,43 @@ static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum) return false; } -void set_dr_addr_mask(unsigned long mask, int dr) +static DEFINE_PER_CPU_READ_MOSTLY(unsigned long[4], amd_dr_addr_mask); + +static unsigned int amd_msr_dr_addr_masks[] = { + MSR_F16H_DR0_ADDR_MASK, + MSR_F16H_DR1_ADDR_MASK, + MSR_F16H_DR1_ADDR_MASK + 1, + MSR_F16H_DR1_ADDR_MASK + 2 +}; + +void amd_set_dr_addr_mask(unsigned long mask, unsigned int dr) { - if (!boot_cpu_has(X86_FEATURE_BPEXT)) + int cpu = smp_processor_id(); + + if (!cpu_feature_enabled(X86_FEATURE_BPEXT)) return; - switch (dr) { - case 0: - wrmsr(MSR_F16H_DR0_ADDR_MASK, mask, 0); - break; - case 1: - case 2: - case 3: - wrmsr(MSR_F16H_DR1_ADDR_MASK - 1 + dr, mask, 0); - break; - default: - break; - } + if (WARN_ON_ONCE(dr >= ARRAY_SIZE(amd_msr_dr_addr_masks))) + return; + + if (per_cpu(amd_dr_addr_mask, cpu)[dr] == mask) + return; + + wrmsr(amd_msr_dr_addr_masks[dr], mask, 0); + per_cpu(amd_dr_addr_mask, cpu)[dr] = mask; +} + +unsigned long amd_get_dr_addr_mask(unsigned int dr) +{ + if (!cpu_feature_enabled(X86_FEATURE_BPEXT)) + return 0; + + if (WARN_ON_ONCE(dr >= ARRAY_SIZE(amd_msr_dr_addr_masks))) + return 0; + + return per_cpu(amd_dr_addr_mask[dr], smp_processor_id()); } +EXPORT_SYMBOL_GPL(amd_get_dr_addr_mask); u32 amd_get_highest_perf(void) { diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index 1f60a2b27936..fdbb5f07448f 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -330,7 +330,16 @@ static void __init bp_init_freq_invariance(void) static void disable_freq_invariance_workfn(struct work_struct *work) { + int cpu; + static_branch_disable(&arch_scale_freq_key); + + /* + * Set arch_freq_scale to a default value on all cpus + * This negates the effect of scaling + */ + for_each_possible_cpu(cpu) + per_cpu(arch_freq_scale, cpu) = SCHED_CAPACITY_SCALE; } static DECLARE_WORK(disable_freq_invariance_work, diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index bca0bd8f4846..cf81848b72f4 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -33,6 +33,7 @@ #include <asm/e820/api.h> #include <asm/hypervisor.h> #include <asm/tlbflush.h> +#include <asm/cpu.h> #include "cpu.h" @@ -86,7 +87,7 @@ void update_spec_ctrl_cond(u64 val) wrmsrl(MSR_IA32_SPEC_CTRL, val); } -u64 spec_ctrl_current(void) +noinstr u64 spec_ctrl_current(void) { return this_cpu_read(x86_spec_ctrl_current); } @@ -144,9 +145,17 @@ void __init check_bugs(void) * have unknown values. AMD64_LS_CFG MSR is cached in the early AMD * init code as it is not enumerated and depends on the family. */ - if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) + if (cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) { rdmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + /* + * Previously running kernel (kexec), may have some controls + * turned ON. Clear them and let the mitigations setup below + * rediscover them based on configuration. 
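The amd.c hunk above replaces the open-coded wrmsr() switch with a small per-CPU cache (amd_dr_addr_mask) so that rewriting an unchanged DR address mask skips the MSR write. A minimal user-space sketch of that caching pattern — all names here (fake_wrmsr, NR_FAKE_CPUS, set_dr_addr_mask) are illustrative stand-ins, not kernel interfaces:

#include <stdio.h>

/* Stand-ins for the kernel's per-CPU mask cache and the expensive WRMSR. */
#define NR_FAKE_CPUS 4
#define NR_DR_MASKS  4

static unsigned long cached_mask[NR_FAKE_CPUS][NR_DR_MASKS];
static unsigned int wrmsr_count;

static void fake_wrmsr(unsigned int slot, unsigned long val)
{
        /* In the kernel this would be a wrmsr(); here just count the writes. */
        wrmsr_count++;
        printf("write slot %u <- %#lx\n", slot, val);
}

/* Mirror of the caching logic: skip the write when the value is unchanged. */
static void set_dr_addr_mask(unsigned int cpu, unsigned int dr, unsigned long mask)
{
        if (dr >= NR_DR_MASKS)
                return;
        if (cached_mask[cpu][dr] == mask)
                return;         /* redundant write avoided */
        fake_wrmsr(dr, mask);
        cached_mask[cpu][dr] = mask;
}

int main(void)
{
        set_dr_addr_mask(0, 1, 0xfff);
        set_dr_addr_mask(0, 1, 0xfff);  /* cached, no write issued */
        set_dr_addr_mask(0, 1, 0);
        printf("writes issued: %u\n", wrmsr_count);     /* 2 */
        return 0;
}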
+ */ + x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK; + } + /* Select the proper CPU mitigations before patching alternatives: */ spectre_v1_select_mitigation(); spectre_v2_select_mitigation(); @@ -1229,9 +1238,9 @@ static const char * const spectre_v2_strings[] = { [SPECTRE_V2_NONE] = "Vulnerable", [SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines", [SPECTRE_V2_LFENCE] = "Mitigation: LFENCE", - [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced IBRS", - [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced IBRS + LFENCE", - [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced IBRS + Retpolines", + [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced / Automatic IBRS", + [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced / Automatic IBRS + LFENCE", + [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced / Automatic IBRS + Retpolines", [SPECTRE_V2_IBRS] = "Mitigation: IBRS", }; @@ -1300,7 +1309,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && !boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { - pr_err("%s selected but CPU doesn't have eIBRS. Switching to AUTO select\n", + pr_err("%s selected but CPU doesn't have Enhanced or Automatic IBRS. Switching to AUTO select\n", mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; } @@ -1486,8 +1495,12 @@ static void __init spectre_v2_select_mitigation(void) pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); if (spectre_v2_in_ibrs_mode(mode)) { - x86_spec_ctrl_base |= SPEC_CTRL_IBRS; - update_spec_ctrl(x86_spec_ctrl_base); + if (boot_cpu_has(X86_FEATURE_AUTOIBRS)) { + msr_set_bit(MSR_EFER, _EFER_AUTOIBRS); + } else { + x86_spec_ctrl_base |= SPEC_CTRL_IBRS; + update_spec_ctrl(x86_spec_ctrl_base); + } } switch (mode) { @@ -1571,8 +1584,8 @@ static void __init spectre_v2_select_mitigation(void) /* * Retpoline protects the kernel, but doesn't protect firmware. IBRS * and Enhanced IBRS protect firmware too, so enable IBRS around - * firmware calls only when IBRS / Enhanced IBRS aren't otherwise - * enabled. + * firmware calls only when IBRS / Enhanced / Automatic IBRS aren't + * otherwise enabled. 
* * Use "mode" to check Enhanced IBRS instead of boot_cpu_has(), because * the user might select retpoline on the kernel command line and if diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index f4e5aa27eec6..4063e8991211 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -734,7 +734,7 @@ void init_hygon_cacheinfo(struct cpuinfo_x86 *c) void init_intel_cacheinfo(struct cpuinfo_x86 *c) { /* Cache sizes */ - unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; + unsigned int l1i = 0, l1d = 0, l2 = 0, l3 = 0; unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */ unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */ unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb; @@ -835,9 +835,6 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c) case LVL_3: l3 += cache_table[k].size; break; - case LVL_TRACE: - trace += cache_table[k].size; - break; } break; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 9cfca3d7d0e2..a394bbba7a4b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -567,7 +567,7 @@ static __init int setup_disable_pku(char *arg) return 1; } __setup("nopku", setup_disable_pku); -#endif /* CONFIG_X86_64 */ +#endif #ifdef CONFIG_X86_KERNEL_IBT @@ -1093,6 +1093,9 @@ void get_cpu_cap(struct cpuinfo_x86 *c) if (c->extended_cpuid_level >= 0x8000001f) c->x86_capability[CPUID_8000_001F_EAX] = cpuid_eax(0x8000001f); + if (c->extended_cpuid_level >= 0x80000021) + c->x86_capability[CPUID_8000_0021_EAX] = cpuid_eax(0x80000021); + init_scattered_cpuid_features(c); init_speculation_control(c); @@ -1226,8 +1229,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */ - VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), - VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB), + VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB), /* Zhaoxin Family 7 */ VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO), @@ -1256,6 +1259,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { #define MMIO_SBDS BIT(2) /* CPU is affected by RETbleed, speculating where you would not expect it */ #define RETBLEED BIT(3) +/* CPU is affected by SMT (cross-thread) return predictions */ +#define SMT_RSB BIT(4) static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS), @@ -1287,8 +1292,8 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_AMD(0x15, RETBLEED), VULNBL_AMD(0x16, RETBLEED), - VULNBL_AMD(0x17, RETBLEED), - VULNBL_HYGON(0x18, RETBLEED), + VULNBL_AMD(0x17, RETBLEED | SMT_RSB), + VULNBL_HYGON(0x18, RETBLEED | SMT_RSB), {} }; @@ -1338,8 +1343,16 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); - if (ia32_cap & ARCH_CAP_IBRS_ALL) + /* + * AMD's AutoIBRS is equivalent to Intel's eIBRS - use the Intel feature + * flag and protect from vendor-specific bugs via 
the whitelist. + */ + if ((ia32_cap & ARCH_CAP_IBRS_ALL) || cpu_has(c, X86_FEATURE_AUTOIBRS)) { setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED); + if (!cpu_matches(cpu_vuln_whitelist, NO_EIBRS_PBRSB) && + !(ia32_cap & ARCH_CAP_PBRSB_NO)) + setup_force_cpu_bug(X86_BUG_EIBRS_PBRSB); + } if (!cpu_matches(cpu_vuln_whitelist, NO_MDS) && !(ia32_cap & ARCH_CAP_MDS_NO)) { @@ -1401,10 +1414,8 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) setup_force_cpu_bug(X86_BUG_RETBLEED); } - if (cpu_has(c, X86_FEATURE_IBRS_ENHANCED) && - !cpu_matches(cpu_vuln_whitelist, NO_EIBRS_PBRSB) && - !(ia32_cap & ARCH_CAP_PBRSB_NO)) - setup_force_cpu_bug(X86_BUG_EIBRS_PBRSB); + if (cpu_matches(cpu_vuln_blacklist, SMT_RSB)) + setup_force_cpu_bug(X86_BUG_SMT_RSB); if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; @@ -1682,9 +1693,7 @@ void check_null_seg_clears_base(struct cpuinfo_x86 *c) if (!IS_ENABLED(CONFIG_X86_64)) return; - /* Zen3 CPUs advertise Null Selector Clears Base in CPUID. */ - if (c->extended_cpuid_level >= 0x80000021 && - cpuid_eax(0x80000021) & BIT(6)) + if (cpu_has(c, X86_FEATURE_NULL_SEL_CLR_BASE)) return; /* @@ -1953,13 +1962,13 @@ void __init identify_boot_cpu(void) if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT)) pr_info("CET detected: Indirect Branch Tracking enabled\n"); #ifdef CONFIG_X86_32 - sysenter_setup(); enable_sep_cpu(); #endif cpu_detect_tlb(&boot_cpu_data); setup_cr_pinning(); tsx_init(); + lkgs_init(); } void identify_secondary_cpu(struct cpuinfo_x86 *c) @@ -2125,7 +2134,6 @@ static void wait_for_master_cpu(int cpu) #endif } -#ifdef CONFIG_X86_64 static inline void setup_getcpu(int cpu) { unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu)); @@ -2147,6 +2155,7 @@ static inline void setup_getcpu(int cpu) write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_CPUNODE, &d, DESCTYPE_S); } +#ifdef CONFIG_X86_64 static inline void ucode_cpu_init(int cpu) { if (cpu) @@ -2166,8 +2175,6 @@ static inline void tss_setup_ist(struct tss_struct *tss) #else /* CONFIG_X86_64 */ -static inline void setup_getcpu(int cpu) { } - static inline void ucode_cpu_init(int cpu) { show_ucode_info_early(); @@ -2297,30 +2304,45 @@ void cpu_init_secondary(void) #endif #ifdef CONFIG_MICROCODE_LATE_LOADING -/* +/** + * store_cpu_caps() - Store a snapshot of CPU capabilities + * @curr_info: Pointer where to store it + * + * Returns: None + */ +void store_cpu_caps(struct cpuinfo_x86 *curr_info) +{ + /* Reload CPUID max function as it might've changed. */ + curr_info->cpuid_level = cpuid_eax(0); + + /* Copy all capability leafs and pick up the synthetic ones. */ + memcpy(&curr_info->x86_capability, &boot_cpu_data.x86_capability, + sizeof(curr_info->x86_capability)); + + /* Get the hardware CPUID leafs */ + get_cpu_cap(curr_info); +} + +/** + * microcode_check() - Check if any CPU capabilities changed after an update. + * @prev_info: CPU capabilities stored before an update. + * * The microcode loader calls this upon late microcode load to recheck features, * only when microcode has been updated. Caller holds microcode_mutex and CPU * hotplug lock. + * + * Return: None */ -void microcode_check(void) +void microcode_check(struct cpuinfo_x86 *prev_info) { - struct cpuinfo_x86 info; + struct cpuinfo_x86 curr_info; perf_check_microcode(); - /* Reload CPUID max function as it might've changed. */ - info.cpuid_level = cpuid_eax(0); - - /* - * Copy all capability leafs to pick up the synthetic ones so that - * memcmp() below doesn't fail on that. 
The ones coming from CPUID will - * get overwritten in get_cpu_cap(). - */ - memcpy(&info.x86_capability, &boot_cpu_data.x86_capability, sizeof(info.x86_capability)); - - get_cpu_cap(&info); + store_cpu_caps(&curr_info); - if (!memcmp(&info.x86_capability, &boot_cpu_data.x86_capability, sizeof(info.x86_capability))) + if (!memcmp(&prev_info->x86_capability, &curr_info.x86_capability, + sizeof(prev_info->x86_capability))) return; pr_warn("x86/CPU: CPU features have changed after loading microcode, but might not take effect.\n"); diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 7c9b5893c30a..57a5349e6954 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -83,6 +83,4 @@ unsigned int aperfmperf_get_khz(int cpu); extern void x86_spec_ctrl_setup_ap(void); extern void update_srbds_msr(void); -extern u64 x86_read_arch_cap_msr(void); - #endif /* ARCH_X86_CPU_H */ diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index d95221117129..f6748c8bd647 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -68,6 +68,8 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_CQM_OCCUP_LLC, X86_FEATURE_CQM_LLC }, { X86_FEATURE_CQM_MBM_TOTAL, X86_FEATURE_CQM_LLC }, { X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC }, + { X86_FEATURE_BMEC, X86_FEATURE_CQM_MBM_TOTAL }, + { X86_FEATURE_BMEC, X86_FEATURE_CQM_MBM_LOCAL }, { X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL }, { X86_FEATURE_AVX512_FP16, X86_FEATURE_AVX512BW }, { X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES }, diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 10fb5b5c9efa..23c5072fbbb7 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -306,6 +306,8 @@ static void smca_configure(unsigned int bank, unsigned int cpu) if ((low & BIT(5)) && !((high >> 5) & 0x3)) high |= BIT(5); + this_cpu_ptr(mce_banks_array)[bank].lsb_in_status = !!(low & BIT(8)); + wrmsr(smca_config, low, high); } @@ -736,15 +738,7 @@ static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) if (m.status & MCI_STATUS_ADDRV) { m.addr = addr; - /* - * Extract [55:<lsb>] where lsb is the least significant - * *valid* bit of the address bits. - */ - if (mce_flags.smca) { - u8 lsb = (m.addr >> 56) & 0x3f; - - m.addr &= GENMASK_ULL(55, lsb); - } + smca_extract_err_addr(&m); } if (mce_flags.smca) { diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 2c8ec5c71712..7832a69d170e 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -67,13 +67,7 @@ DEFINE_PER_CPU(unsigned, mce_exception_count); DEFINE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks); -struct mce_bank { - u64 ctl; /* subevents to enable */ - - __u64 init : 1, /* initialise bank? 
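The new store_cpu_caps()/microcode_check() pair above snapshots the capability words before a late microcode load and memcmp()s a fresh read against that snapshot afterwards. A small self-contained sketch of that compare-a-snapshot idea, with made-up structures (fake_cpuinfo, read_caps) standing in for cpuinfo_x86 and get_cpu_cap():

#include <stdio.h>
#include <string.h>

#define NCAPINTS 4      /* much smaller than the kernel's capability array */

struct fake_cpuinfo {
        unsigned int x86_capability[NCAPINTS];
};

/* Stand-in for get_cpu_cap(): in the kernel this re-reads the CPUID leaves. */
static void read_caps(struct fake_cpuinfo *c, unsigned int newly_set_bit)
{
        memset(c->x86_capability, 0, sizeof(c->x86_capability));
        c->x86_capability[0] = 0x1234;
        c->x86_capability[1] |= newly_set_bit;
}

int main(void)
{
        struct fake_cpuinfo prev, curr;

        read_caps(&prev, 0);            /* snapshot before the "update" */
        read_caps(&curr, 1u << 5);      /* the update flipped a feature bit */

        if (memcmp(prev.x86_capability, curr.x86_capability,
                   sizeof(prev.x86_capability)))
                printf("CPU features changed after loading microcode\n");
        else
                printf("no capability change\n");
        return 0;
}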
*/ - __reserved_1 : 63; -}; -static DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array); +DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array); #define ATTR_LEN 16 /* One object for each MCE bank, shared by all CPUs */ @@ -579,7 +573,7 @@ static int uc_decode_notifier(struct notifier_block *nb, unsigned long val, mce->severity != MCE_DEFERRED_SEVERITY) return NOTIFY_DONE; - pfn = mce->addr >> PAGE_SHIFT; + pfn = (mce->addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT; if (!memory_failure(pfn, 0)) { set_mce_nospec(pfn); mce->kflags |= MCE_HANDLED_UC; @@ -633,15 +627,7 @@ static noinstr void mce_read_aux(struct mce *m, int i) m->addr <<= shift; } - /* - * Extract [55:<lsb>] where lsb is the least significant - * *valid* bit of the address bits. - */ - if (mce_flags.smca) { - u8 lsb = (m->addr >> 56) & 0x3f; - - m->addr &= GENMASK_ULL(55, lsb); - } + smca_extract_err_addr(m); } if (mce_flags.smca) { @@ -1308,6 +1294,7 @@ static void kill_me_maybe(struct callback_head *cb) { struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me); int flags = MF_ACTION_REQUIRED; + unsigned long pfn; int ret; p->mce_count = 0; @@ -1316,9 +1303,10 @@ static void kill_me_maybe(struct callback_head *cb) if (!p->mce_ripv) flags |= MF_MUST_KILL; - ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags); + pfn = (p->mce_addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT; + ret = memory_failure(pfn, flags); if (!ret) { - set_mce_nospec(p->mce_addr >> PAGE_SHIFT); + set_mce_nospec(pfn); sync_core(); return; } @@ -1340,11 +1328,13 @@ static void kill_me_maybe(struct callback_head *cb) static void kill_me_never(struct callback_head *cb) { struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me); + unsigned long pfn; p->mce_count = 0; pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr); - if (!memory_failure(p->mce_addr >> PAGE_SHIFT, 0)) - set_mce_nospec(p->mce_addr >> PAGE_SHIFT); + pfn = (p->mce_addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT; + if (!memory_failure(pfn, 0)) + set_mce_nospec(pfn); } static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *)) diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c b/arch/x86/kernel/cpu/mce/dev-mcelog.c index 100fbeebdc72..a05ac0716ecf 100644 --- a/arch/x86/kernel/cpu/mce/dev-mcelog.c +++ b/arch/x86/kernel/cpu/mce/dev-mcelog.c @@ -105,8 +105,7 @@ static ssize_t set_trigger(struct device *s, struct device_attribute *attr, { char *p; - strncpy(mce_helper, buf, sizeof(mce_helper)); - mce_helper[sizeof(mce_helper)-1] = 0; + strscpy(mce_helper, buf, sizeof(mce_helper)); p = strchr(mce_helper, '\n'); if (p) diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 7e03f5b7f6bd..91a415553c27 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -177,6 +177,24 @@ struct mce_vendor_flags { extern struct mce_vendor_flags mce_flags; +struct mce_bank { + /* subevents to enable */ + u64 ctl; + + /* initialise bank? */ + __u64 init : 1, + + /* + * (AMD) MCA_CONFIG[McaLsbInStatusSupported]: When set, this bit indicates + * the LSB field is found in MCA_STATUS and not in MCA_ADDR. 
+ */ + lsb_in_status : 1, + + __reserved_1 : 62; +}; + +DECLARE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array); + enum mca_msr { MCA_CTL, MCA_STATUS, @@ -189,8 +207,34 @@ extern bool filter_mce(struct mce *m); #ifdef CONFIG_X86_MCE_AMD extern bool amd_filter_mce(struct mce *m); + +/* + * If MCA_CONFIG[McaLsbInStatusSupported] is set, extract ErrAddr in bits + * [56:0] of MCA_STATUS, else in bits [55:0] of MCA_ADDR. + */ +static __always_inline void smca_extract_err_addr(struct mce *m) +{ + u8 lsb; + + if (!mce_flags.smca) + return; + + if (this_cpu_ptr(mce_banks_array)[m->bank].lsb_in_status) { + lsb = (m->status >> 24) & 0x3f; + + m->addr &= GENMASK_ULL(56, lsb); + + return; + } + + lsb = (m->addr >> 56) & 0x3f; + + m->addr &= GENMASK_ULL(55, lsb); +} + #else static inline bool amd_filter_mce(struct mce *m) { return false; } +static inline void smca_extract_err_addr(struct mce *m) { } #endif #ifdef CONFIG_X86_ANCIENT_MCE diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 56471f750762..9eb457b10341 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -55,7 +55,9 @@ struct cont_desc { }; static u32 ucode_new_rev; -static u8 amd_ucode_patch[PATCH_MAX_SIZE]; + +/* One blob per node. */ +static u8 amd_ucode_patch[MAX_NUMNODES][PATCH_MAX_SIZE]; /* * Microcode patch container file is prepended to the initrd in cpio @@ -330,8 +332,9 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc) ret = verify_patch(x86_family(desc->cpuid_1_eax), buf, size, &patch_size, true); if (ret < 0) { /* - * Patch verification failed, skip to the next - * container, if there's one: + * Patch verification failed, skip to the next container, if + * there is one. Before exit, check whether that container has + * found a patch already. If so, use it. */ goto out; } else if (ret > 0) { @@ -350,6 +353,7 @@ skip: size -= patch_size + SECTION_HDR_SIZE; } +out: /* * If we have found a patch (desc->mc), it means we're looking at the * container which has a patch for this CPU so return 0 to mean, @ucode @@ -364,7 +368,6 @@ skip: return 0; } -out: return orig_size - size; } @@ -414,8 +417,7 @@ static int __apply_microcode_amd(struct microcode_amd *mc) * * Returns true if container found (sets @desc), false otherwise. 
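smca_extract_err_addr() in the internal.h hunk above keeps only the valid error-address bits: [56:0] of MCA_STATUS when McaLsbInStatusSupported is set, otherwise [55:lsb] with lsb taken from MCA_ADDR[61:56]. A stand-alone sketch of that masking, with a local GENMASK_ULL() macro substituting for the kernel's:

#include <stdio.h>
#include <stdint.h>

/* Local stand-in for the kernel's GENMASK_ULL(h, l): bits h..l set. */
#define GENMASK_ULL(h, l) \
        (((~0ULL) >> (63 - (h))) & ((~0ULL) << (l)))

/* Same masking as smca_extract_err_addr() in the hunk above. */
static uint64_t extract_err_addr(uint64_t addr, uint64_t status, int lsb_in_status)
{
        uint8_t lsb;

        if (lsb_in_status) {
                lsb = (status >> 24) & 0x3f;
                return addr & GENMASK_ULL(56, lsb);
        }

        lsb = (addr >> 56) & 0x3f;
        return addr & GENMASK_ULL(55, lsb);
}

int main(void)
{
        /* lsb = 6 encoded in ADDR[61:56]: bits below 6 are not valid. */
        uint64_t addr = (6ULL << 56) | 0x0123456789abcdefULL;

        /* prints 0x23456789abcdc0 */
        printf("masked addr: %#llx\n",
               (unsigned long long)extract_err_addr(addr, 0, 0));
        return 0;
}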
*/ -static bool -apply_microcode_early_amd(u32 cpuid_1_eax, void *ucode, size_t size, bool save_patch) +static bool early_apply_microcode(u32 cpuid_1_eax, void *ucode, size_t size, bool save_patch) { struct cont_desc desc = { 0 }; u8 (*patch)[PATCH_MAX_SIZE]; @@ -428,7 +430,7 @@ apply_microcode_early_amd(u32 cpuid_1_eax, void *ucode, size_t size, bool save_p patch = (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch); #else new_rev = &ucode_new_rev; - patch = &amd_ucode_patch; + patch = &amd_ucode_patch[0]; #endif desc.cpuid_1_eax = cpuid_1_eax; @@ -481,7 +483,7 @@ static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family) return false; } -static void __load_ucode_amd(unsigned int cpuid_1_eax, struct cpio_data *ret) +static void find_blobs_in_containers(unsigned int cpuid_1_eax, struct cpio_data *ret) { struct ucode_cpu_info *uci; struct cpio_data cp; @@ -511,11 +513,11 @@ void __init load_ucode_amd_bsp(unsigned int cpuid_1_eax) { struct cpio_data cp = { }; - __load_ucode_amd(cpuid_1_eax, &cp); + find_blobs_in_containers(cpuid_1_eax, &cp); if (!(cp.data && cp.size)) return; - apply_microcode_early_amd(cpuid_1_eax, cp.data, cp.size, true); + early_apply_microcode(cpuid_1_eax, cp.data, cp.size, true); } void load_ucode_amd_ap(unsigned int cpuid_1_eax) @@ -546,15 +548,14 @@ void load_ucode_amd_ap(unsigned int cpuid_1_eax) } } - __load_ucode_amd(cpuid_1_eax, &cp); + find_blobs_in_containers(cpuid_1_eax, &cp); if (!(cp.data && cp.size)) return; - apply_microcode_early_amd(cpuid_1_eax, cp.data, cp.size, false); + early_apply_microcode(cpuid_1_eax, cp.data, cp.size, false); } -static enum ucode_state -load_microcode_amd(bool save, u8 family, const u8 *data, size_t size); +static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size); int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax) { @@ -572,19 +573,19 @@ int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax) if (!desc.mc) return -EINVAL; - ret = load_microcode_amd(true, x86_family(cpuid_1_eax), desc.data, desc.size); + ret = load_microcode_amd(x86_family(cpuid_1_eax), desc.data, desc.size); if (ret > UCODE_UPDATED) return -EINVAL; return 0; } -void reload_ucode_amd(void) +void reload_ucode_amd(unsigned int cpu) { - struct microcode_amd *mc; u32 rev, dummy __always_unused; + struct microcode_amd *mc; - mc = (struct microcode_amd *)amd_ucode_patch; + mc = (struct microcode_amd *)amd_ucode_patch[cpu_to_node(cpu)]; rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); @@ -816,6 +817,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover, return 0; } +/* Scan the blob in @data and add microcode patches to the cache. 
*/ static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, size_t size) { @@ -850,9 +852,10 @@ static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, return UCODE_OK; } -static enum ucode_state -load_microcode_amd(bool save, u8 family, const u8 *data, size_t size) +static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size) { + struct cpuinfo_x86 *c; + unsigned int nid, cpu; struct ucode_patch *p; enum ucode_state ret; @@ -865,22 +868,22 @@ load_microcode_amd(bool save, u8 family, const u8 *data, size_t size) return ret; } - p = find_patch(0); - if (!p) { - return ret; - } else { - if (boot_cpu_data.microcode >= p->patch_id) - return ret; + for_each_node(nid) { + cpu = cpumask_first(cpumask_of_node(nid)); + c = &cpu_data(cpu); - ret = UCODE_NEW; - } + p = find_patch(cpu); + if (!p) + continue; - /* save BSP's matching patch for early load */ - if (!save) - return ret; + if (c->microcode >= p->patch_id) + continue; - memset(amd_ucode_patch, 0, PATCH_MAX_SIZE); - memcpy(amd_ucode_patch, p->data, min_t(u32, p->size, PATCH_MAX_SIZE)); + ret = UCODE_NEW; + + memset(&amd_ucode_patch[nid], 0, PATCH_MAX_SIZE); + memcpy(&amd_ucode_patch[nid], p->data, min_t(u32, p->size, PATCH_MAX_SIZE)); + } return ret; } @@ -905,14 +908,9 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device) { char fw_name[36] = "amd-ucode/microcode_amd.bin"; struct cpuinfo_x86 *c = &cpu_data(cpu); - bool bsp = c->cpu_index == boot_cpu_data.cpu_index; enum ucode_state ret = UCODE_NFOUND; const struct firmware *fw; - /* reload ucode container only on the boot cpu */ - if (!bsp) - return UCODE_OK; - if (c->x86 >= 0x15) snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86); @@ -925,7 +923,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device) if (!verify_container(fw->data, fw->size, false)) goto fw_release; - ret = load_microcode_amd(bsp, c->x86, fw->data, fw->size); + ret = load_microcode_amd(c->x86, fw->data, fw->size); fw_release: release_firmware(fw); diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 712aafff96e0..7a329e561354 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -298,7 +298,7 @@ struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa) #endif } -void reload_early_microcode(void) +void reload_early_microcode(unsigned int cpu) { int vendor, family; @@ -312,7 +312,7 @@ void reload_early_microcode(void) break; case X86_VENDOR_AMD: if (family >= 0x10) - reload_ucode_amd(); + reload_ucode_amd(cpu); break; default: break; @@ -409,10 +409,10 @@ static int __reload_late(void *info) goto wait_for_siblings; if (err >= UCODE_NFOUND) { - if (err == UCODE_ERROR) + if (err == UCODE_ERROR) { pr_warn("Error reloading microcode on CPU %d\n", cpu); - - ret = -1; + ret = -1; + } } wait_for_siblings: @@ -438,6 +438,7 @@ wait_for_siblings: static int microcode_reload_late(void) { int old = boot_cpu_data.microcode, ret; + struct cpuinfo_x86 prev_info; pr_err("Attempting late microcode loading - it is dangerous and taints the kernel.\n"); pr_err("You should switch to early loading, if possible.\n"); @@ -445,12 +446,21 @@ static int microcode_reload_late(void) atomic_set(&late_cpus_in, 0); atomic_set(&late_cpus_out, 0); - ret = stop_machine_cpuslocked(__reload_late, NULL, cpu_online_mask); - if (ret == 0) - microcode_check(); + /* + * Take a snapshot before the microcode update in order to compare 
and + * check whether any bits changed after an update. + */ + store_cpu_caps(&prev_info); - pr_info("Reload completed, microcode revision: 0x%x -> 0x%x\n", - old, boot_cpu_data.microcode); + ret = stop_machine_cpuslocked(__reload_late, NULL, cpu_online_mask); + if (!ret) { + pr_info("Reload succeeded, microcode revision: 0x%x -> 0x%x\n", + old, boot_cpu_data.microcode); + microcode_check(&prev_info); + } else { + pr_info("Reload failed, current microcode revision: 0x%x\n", + boot_cpu_data.microcode); + } return ret; } @@ -465,11 +475,8 @@ static ssize_t reload_store(struct device *dev, ssize_t ret = 0; ret = kstrtoul(buf, 0, &val); - if (ret) - return ret; - - if (val != 1) - return size; + if (ret || val != 1) + return -EINVAL; cpus_read_lock(); @@ -507,7 +514,7 @@ static ssize_t version_show(struct device *dev, return sprintf(buf, "0x%x\n", uci->cpu_sig.rev); } -static ssize_t pf_show(struct device *dev, +static ssize_t processor_flags_show(struct device *dev, struct device_attribute *attr, char *buf) { struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; @@ -515,8 +522,8 @@ static ssize_t pf_show(struct device *dev, return sprintf(buf, "0x%x\n", uci->cpu_sig.pf); } -static DEVICE_ATTR(version, 0444, version_show, NULL); -static DEVICE_ATTR(processor_flags, 0444, pf_show, NULL); +static DEVICE_ATTR_RO(version); +static DEVICE_ATTR_RO(processor_flags); static struct attribute *mc_default_attrs[] = { &dev_attr_version.attr, @@ -557,7 +564,7 @@ void microcode_bsp_resume(void) if (uci->mc) microcode_ops->apply_microcode(cpu); else - reload_early_microcode(); + reload_early_microcode(cpu); } static struct syscore_ops mc_syscore_ops = { diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index cac2bdb57f0b..467cf37ea90a 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -305,14 +305,11 @@ static bool load_builtin_intel_microcode(struct cpio_data *cp) return false; } -/* - * Print ucode update info. - */ -static void -print_ucode_info(struct ucode_cpu_info *uci, unsigned int date) +static void print_ucode_info(int old_rev, int new_rev, unsigned int date) { - pr_info_once("microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n", - uci->cpu_sig.rev, + pr_info_once("updated early: 0x%x -> 0x%x, date = %04x-%02x-%02x\n", + old_rev, + new_rev, date & 0xffff, date >> 24, (date >> 16) & 0xff); @@ -322,6 +319,7 @@ print_ucode_info(struct ucode_cpu_info *uci, unsigned int date) static int delay_ucode_info; static int current_mc_date; +static int early_old_rev; /* * Print early updated ucode info after printk works. This is delayed info dump. @@ -332,7 +330,7 @@ void show_ucode_info_early(void) if (delay_ucode_info) { intel_cpu_collect_info(&uci); - print_ucode_info(&uci, current_mc_date); + print_ucode_info(early_old_rev, uci.cpu_sig.rev, current_mc_date); delay_ucode_info = 0; } } @@ -341,40 +339,32 @@ void show_ucode_info_early(void) * At this point, we can not call printk() yet. Delay printing microcode info in * show_ucode_info_early() until printk() works. 
*/ -static void print_ucode(struct ucode_cpu_info *uci) +static void print_ucode(int old_rev, int new_rev, int date) { - struct microcode_intel *mc; int *delay_ucode_info_p; int *current_mc_date_p; - - mc = uci->mc; - if (!mc) - return; + int *early_old_rev_p; delay_ucode_info_p = (int *)__pa_nodebug(&delay_ucode_info); current_mc_date_p = (int *)__pa_nodebug(¤t_mc_date); + early_old_rev_p = (int *)__pa_nodebug(&early_old_rev); *delay_ucode_info_p = 1; - *current_mc_date_p = mc->hdr.date; + *current_mc_date_p = date; + *early_old_rev_p = old_rev; } #else -static inline void print_ucode(struct ucode_cpu_info *uci) +static inline void print_ucode(int old_rev, int new_rev, int date) { - struct microcode_intel *mc; - - mc = uci->mc; - if (!mc) - return; - - print_ucode_info(uci, mc->hdr.date); + print_ucode_info(old_rev, new_rev, date); } #endif static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) { struct microcode_intel *mc; - u32 rev; + u32 rev, old_rev; mc = uci->mc; if (!mc) @@ -391,6 +381,8 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) return UCODE_OK; } + old_rev = rev; + /* * Writeback and invalidate caches before updating microcode to avoid * internal issues depending on what the microcode is updating. @@ -407,9 +399,9 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) uci->cpu_sig.rev = rev; if (early) - print_ucode(uci); + print_ucode(old_rev, uci->cpu_sig.rev, mc->hdr.date); else - print_ucode_info(uci, mc->hdr.date); + print_ucode_info(old_rev, uci->cpu_sig.rev, mc->hdr.date); return 0; } diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 46668e255421..f924a76c6923 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -37,9 +37,76 @@ /* Is Linux running as the root partition? 
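print_ucode_info() in the intel.c hunk above now prints both the old and new revision and decodes the packed date field as year = date & 0xffff, month = date >> 24, day = (date >> 16) & 0xff. A tiny sketch of just the date decoding; the sample value is invented:

#include <stdio.h>

int main(void)
{
        /* Hypothetical BCD-packed date as found in an Intel microcode header. */
        unsigned int date = 0x01182023; /* month 0x01, day 0x18, year 0x2023 */

        /* Same decoding as print_ucode_info() in the hunk above. */
        printf("date = %04x-%02x-%02x\n",
               date & 0xffff,           /* year  */
               date >> 24,              /* month */
               (date >> 16) & 0xff);    /* day   */
        return 0;
}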
*/ bool hv_root_partition; +/* Is Linux running on nested Microsoft Hypervisor */ +bool hv_nested; struct ms_hyperv_info ms_hyperv; #if IS_ENABLED(CONFIG_HYPERV) +static inline unsigned int hv_get_nested_reg(unsigned int reg) +{ + if (hv_is_sint_reg(reg)) + return reg - HV_REGISTER_SINT0 + HV_REGISTER_NESTED_SINT0; + + switch (reg) { + case HV_REGISTER_SIMP: + return HV_REGISTER_NESTED_SIMP; + case HV_REGISTER_SIEFP: + return HV_REGISTER_NESTED_SIEFP; + case HV_REGISTER_SVERSION: + return HV_REGISTER_NESTED_SVERSION; + case HV_REGISTER_SCONTROL: + return HV_REGISTER_NESTED_SCONTROL; + case HV_REGISTER_EOM: + return HV_REGISTER_NESTED_EOM; + default: + return reg; + } +} + +u64 hv_get_non_nested_register(unsigned int reg) +{ + u64 value; + + if (hv_is_synic_reg(reg) && hv_isolation_type_snp()) + hv_ghcb_msr_read(reg, &value); + else + rdmsrl(reg, value); + return value; +} +EXPORT_SYMBOL_GPL(hv_get_non_nested_register); + +void hv_set_non_nested_register(unsigned int reg, u64 value) +{ + if (hv_is_synic_reg(reg) && hv_isolation_type_snp()) { + hv_ghcb_msr_write(reg, value); + + /* Write proxy bit via wrmsl instruction */ + if (hv_is_sint_reg(reg)) + wrmsrl(reg, value | 1 << 20); + } else { + wrmsrl(reg, value); + } +} +EXPORT_SYMBOL_GPL(hv_set_non_nested_register); + +u64 hv_get_register(unsigned int reg) +{ + if (hv_nested) + reg = hv_get_nested_reg(reg); + + return hv_get_non_nested_register(reg); +} +EXPORT_SYMBOL_GPL(hv_get_register); + +void hv_set_register(unsigned int reg, u64 value) +{ + if (hv_nested) + reg = hv_get_nested_reg(reg); + + hv_set_non_nested_register(reg, value); +} +EXPORT_SYMBOL_GPL(hv_set_register); + static void (*vmbus_handler)(void); static void (*hv_stimer0_handler)(void); static void (*hv_kexec_handler)(void); @@ -301,6 +368,11 @@ static void __init ms_hyperv_init_platform(void) pr_info("Hyper-V: running as root partition\n"); } + if (ms_hyperv.hints & HV_X64_HYPERV_NESTED) { + hv_nested = true; + pr_info("Hyper-V: running on a nested hypervisor\n"); + } + /* * Extract host information. */ diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index c98e52ff5f20..030d3b409768 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -100,6 +100,18 @@ struct rdt_hw_resource rdt_resources_all[] = { .fflags = RFTYPE_RES_MB, }, }, + [RDT_RESOURCE_SMBA] = + { + .r_resctrl = { + .rid = RDT_RESOURCE_SMBA, + .name = "SMBA", + .cache_level = 3, + .domains = domain_init(RDT_RESOURCE_SMBA), + .parse_ctrlval = parse_bw, + .format_str = "%d=%*u", + .fflags = RFTYPE_RES_MB, + }, + }, }; /* @@ -150,6 +162,13 @@ bool is_mba_sc(struct rdt_resource *r) if (!r) return rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl.membw.mba_sc; + /* + * The software controller support is only applicable to MBA resource. + * Make sure to check for resource type. + */ + if (r->rid != RDT_RESOURCE_MBA) + return false; + return r->membw.mba_sc; } @@ -213,9 +232,15 @@ static bool __rdt_get_mem_config_amd(struct rdt_resource *r) struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); union cpuid_0x10_3_eax eax; union cpuid_0x10_x_edx edx; - u32 ebx, ecx; + u32 ebx, ecx, subleaf; + + /* + * Query CPUID_Fn80000020_EDX_x01 for MBA and + * CPUID_Fn80000020_EDX_x02 for SMBA + */ + subleaf = (r->rid == RDT_RESOURCE_SMBA) ? 
2 : 1; - cpuid_count(0x80000020, 1, &eax.full, &ebx, &ecx, &edx.full); + cpuid_count(0x80000020, subleaf, &eax.full, &ebx, &ecx, &edx.full); hw_res->num_closid = edx.split.cos_max + 1; r->default_ctrl = MAX_MBA_BW_AMD; @@ -647,6 +672,8 @@ enum { RDT_FLAG_L2_CAT, RDT_FLAG_L2_CDP, RDT_FLAG_MBA, + RDT_FLAG_SMBA, + RDT_FLAG_BMEC, }; #define RDT_OPT(idx, n, f) \ @@ -670,6 +697,8 @@ static struct rdt_options rdt_options[] __initdata = { RDT_OPT(RDT_FLAG_L2_CAT, "l2cat", X86_FEATURE_CAT_L2), RDT_OPT(RDT_FLAG_L2_CDP, "l2cdp", X86_FEATURE_CDP_L2), RDT_OPT(RDT_FLAG_MBA, "mba", X86_FEATURE_MBA), + RDT_OPT(RDT_FLAG_SMBA, "smba", X86_FEATURE_SMBA), + RDT_OPT(RDT_FLAG_BMEC, "bmec", X86_FEATURE_BMEC), }; #define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options) @@ -699,7 +728,7 @@ static int __init set_rdt_options(char *str) } __setup("rdt", set_rdt_options); -static bool __init rdt_cpu_has(int flag) +bool __init rdt_cpu_has(int flag) { bool ret = boot_cpu_has(flag); struct rdt_options *o; @@ -734,6 +763,19 @@ static __init bool get_mem_config(void) return false; } +static __init bool get_slow_mem_config(void) +{ + struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_SMBA]; + + if (!rdt_cpu_has(X86_FEATURE_SMBA)) + return false; + + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + return __rdt_get_mem_config_amd(&hw_res->r_resctrl); + + return false; +} + static __init bool get_rdt_alloc_resources(void) { struct rdt_resource *r; @@ -764,6 +806,9 @@ static __init bool get_rdt_alloc_resources(void) if (get_mem_config()) ret = true; + if (get_slow_mem_config()) + ret = true; + return ret; } @@ -853,6 +898,9 @@ static __init void rdt_init_res_defs_amd(void) } else if (r->rid == RDT_RESOURCE_MBA) { hw_res->msr_base = MSR_IA32_MBA_BW_BASE; hw_res->msr_update = mba_wrmsr_amd; + } else if (r->rid == RDT_RESOURCE_SMBA) { + hw_res->msr_base = MSR_IA32_SMBA_BW_BASE; + hw_res->msr_update = mba_wrmsr_amd; } } } diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 1df0e3262bca..eb07d4435391 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -209,7 +209,7 @@ static int parse_line(char *line, struct resctrl_schema *s, unsigned long dom_id; if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && - r->rid == RDT_RESOURCE_MBA) { + (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)) { rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n"); return -EINVAL; } @@ -310,7 +310,6 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) enum resctrl_conf_type t; cpumask_var_t cpu_mask; struct rdt_domain *d; - int cpu; u32 idx; if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) @@ -341,13 +340,9 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) if (cpumask_empty(cpu_mask)) goto done; - cpu = get_cpu(); - /* Update resource control msr on this CPU if it's in cpu_mask. */ - if (cpumask_test_cpu(cpu, cpu_mask)) - rdt_ctrl_update(&msr_param); - /* Update resource control msr on other CPUs. */ - smp_call_function_many(cpu_mask, rdt_ctrl_update, &msr_param, 1); - put_cpu(); + + /* Update resource control msr on all the CPUs. 
*/ + on_each_cpu_mask(cpu_mask, rdt_ctrl_update, &msr_param, 1); done: free_cpumask_var(cpu_mask); diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 5ebd28e6aa0c..8edecc5763d8 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -30,6 +30,29 @@ */ #define MBM_CNTR_WIDTH_OFFSET_MAX (62 - MBM_CNTR_WIDTH_BASE) +/* Reads to Local DRAM Memory */ +#define READS_TO_LOCAL_MEM BIT(0) + +/* Reads to Remote DRAM Memory */ +#define READS_TO_REMOTE_MEM BIT(1) + +/* Non-Temporal Writes to Local Memory */ +#define NON_TEMP_WRITE_TO_LOCAL_MEM BIT(2) + +/* Non-Temporal Writes to Remote Memory */ +#define NON_TEMP_WRITE_TO_REMOTE_MEM BIT(3) + +/* Reads to Local Memory the system identifies as "Slow Memory" */ +#define READS_TO_LOCAL_S_MEM BIT(4) + +/* Reads to Remote Memory the system identifies as "Slow Memory" */ +#define READS_TO_REMOTE_S_MEM BIT(5) + +/* Dirty Victims to All Types of Memory */ +#define DIRTY_VICTIMS_TO_ALL_MEM BIT(6) + +/* Max event bits supported */ +#define MAX_EVT_CONFIG_BITS GENMASK(6, 0) struct rdt_fs_context { struct kernfs_fs_context kfc; @@ -52,11 +75,13 @@ DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key); * struct mon_evt - Entry in the event list of a resource * @evtid: event id * @name: name of the event + * @configurable: true if the event is configurable * @list: entry in &rdt_resource->evt_list */ struct mon_evt { enum resctrl_event_id evtid; char *name; + bool configurable; struct list_head list; }; @@ -409,6 +434,7 @@ enum resctrl_res_level { RDT_RESOURCE_L3, RDT_RESOURCE_L2, RDT_RESOURCE_MBA, + RDT_RESOURCE_SMBA, /* Must be the last */ RDT_NUM_RESOURCES, @@ -511,6 +537,7 @@ void closid_free(int closid); int alloc_rmid(void); void free_rmid(u32 rmid); int rdt_get_mon_l3_config(struct rdt_resource *r); +bool __init rdt_cpu_has(int flag); void mon_event_count(void *info); int rdtgroup_mondata_show(struct seq_file *m, void *arg); void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, @@ -527,5 +554,6 @@ bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d); void __check_limbo(struct rdt_domain *d, bool force_free); void rdt_domain_reconfigure_cdp(struct rdt_resource *r); void __init thread_throttle_mode_init(void); +void __init mbm_config_rftype_init(const char *config); #endif /* _ASM_X86_RESCTRL_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 77538abeb72a..7fe51488e136 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -204,6 +204,23 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d, } } +/* + * Assumes that hardware counters are also reset and thus that there is + * no need to record initial non-zero counts. 
+ */ +void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_domain *d) +{ + struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + + if (is_mbm_total_enabled()) + memset(hw_dom->arch_mbm_total, 0, + sizeof(*hw_dom->arch_mbm_total) * r->num_rmid); + + if (is_mbm_local_enabled()) + memset(hw_dom->arch_mbm_local, 0, + sizeof(*hw_dom->arch_mbm_local) * r->num_rmid); +} + static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) { u64 shift = 64 - width, chunks; @@ -763,7 +780,7 @@ static void l3_mon_evt_init(struct rdt_resource *r) list_add_tail(&mbm_local_event.list, &r->evt_list); } -int rdt_get_mon_l3_config(struct rdt_resource *r) +int __init rdt_get_mon_l3_config(struct rdt_resource *r) { unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset; struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); @@ -800,6 +817,17 @@ int rdt_get_mon_l3_config(struct rdt_resource *r) if (ret) return ret; + if (rdt_cpu_has(X86_FEATURE_BMEC)) { + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) { + mbm_total_event.configurable = true; + mbm_config_rftype_init("mbm_total_bytes_config"); + } + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) { + mbm_local_event.configurable = true; + mbm_config_rftype_init("mbm_local_bytes_config"); + } + } + l3_mon_evt_init(r); r->mon_capable = true; diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 5993da21d822..e2c1599d1b37 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -325,12 +325,7 @@ static void update_cpu_closid_rmid(void *info) static void update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r) { - int cpu = get_cpu(); - - if (cpumask_test_cpu(cpu, cpu_mask)) - update_cpu_closid_rmid(r); - smp_call_function_many(cpu_mask, update_cpu_closid_rmid, r, 1); - put_cpu(); + on_each_cpu_mask(cpu_mask, update_cpu_closid_rmid, r, 1); } static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, @@ -1003,8 +998,11 @@ static int rdt_mon_features_show(struct kernfs_open_file *of, struct rdt_resource *r = of->kn->parent->priv; struct mon_evt *mevt; - list_for_each_entry(mevt, &r->evt_list, list) + list_for_each_entry(mevt, &r->evt_list, list) { seq_printf(seq, "%s\n", mevt->name); + if (mevt->configurable) + seq_printf(seq, "%s_config\n", mevt->name); + } return 0; } @@ -1215,7 +1213,7 @@ static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) list_for_each_entry(s, &resctrl_schema_all, list) { r = s->res; - if (r->rid == RDT_RESOURCE_MBA) + if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA) continue; has_cache = true; list_for_each_entry(d, &r->domains, list) { @@ -1404,7 +1402,8 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, ctrl = resctrl_arch_get_config(r, d, closid, type); - if (r->rid == RDT_RESOURCE_MBA) + if (r->rid == RDT_RESOURCE_MBA || + r->rid == RDT_RESOURCE_SMBA) size = ctrl; else size = rdtgroup_cbm_to_size(r, d, ctrl); @@ -1421,6 +1420,248 @@ out: return ret; } +struct mon_config_info { + u32 evtid; + u32 mon_config; +}; + +#define INVALID_CONFIG_INDEX UINT_MAX + +/** + * mon_event_config_index_get - get the hardware index for the + * configurable event + * @evtid: event id. 
+ * + * Return: 0 for evtid == QOS_L3_MBM_TOTAL_EVENT_ID + * 1 for evtid == QOS_L3_MBM_LOCAL_EVENT_ID + * INVALID_CONFIG_INDEX for invalid evtid + */ +static inline unsigned int mon_event_config_index_get(u32 evtid) +{ + switch (evtid) { + case QOS_L3_MBM_TOTAL_EVENT_ID: + return 0; + case QOS_L3_MBM_LOCAL_EVENT_ID: + return 1; + default: + /* Should never reach here */ + return INVALID_CONFIG_INDEX; + } +} + +static void mon_event_config_read(void *info) +{ + struct mon_config_info *mon_info = info; + unsigned int index; + u64 msrval; + + index = mon_event_config_index_get(mon_info->evtid); + if (index == INVALID_CONFIG_INDEX) { + pr_warn_once("Invalid event id %d\n", mon_info->evtid); + return; + } + rdmsrl(MSR_IA32_EVT_CFG_BASE + index, msrval); + + /* Report only the valid event configuration bits */ + mon_info->mon_config = msrval & MAX_EVT_CONFIG_BITS; +} + +static void mondata_config_read(struct rdt_domain *d, struct mon_config_info *mon_info) +{ + smp_call_function_any(&d->cpu_mask, mon_event_config_read, mon_info, 1); +} + +static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid) +{ + struct mon_config_info mon_info = {0}; + struct rdt_domain *dom; + bool sep = false; + + mutex_lock(&rdtgroup_mutex); + + list_for_each_entry(dom, &r->domains, list) { + if (sep) + seq_puts(s, ";"); + + memset(&mon_info, 0, sizeof(struct mon_config_info)); + mon_info.evtid = evtid; + mondata_config_read(dom, &mon_info); + + seq_printf(s, "%d=0x%02x", dom->id, mon_info.mon_config); + sep = true; + } + seq_puts(s, "\n"); + + mutex_unlock(&rdtgroup_mutex); + + return 0; +} + +static int mbm_total_bytes_config_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + mbm_config_show(seq, r, QOS_L3_MBM_TOTAL_EVENT_ID); + + return 0; +} + +static int mbm_local_bytes_config_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + mbm_config_show(seq, r, QOS_L3_MBM_LOCAL_EVENT_ID); + + return 0; +} + +static void mon_event_config_write(void *info) +{ + struct mon_config_info *mon_info = info; + unsigned int index; + + index = mon_event_config_index_get(mon_info->evtid); + if (index == INVALID_CONFIG_INDEX) { + pr_warn_once("Invalid event id %d\n", mon_info->evtid); + return; + } + wrmsr(MSR_IA32_EVT_CFG_BASE + index, mon_info->mon_config, 0); +} + +static int mbm_config_write_domain(struct rdt_resource *r, + struct rdt_domain *d, u32 evtid, u32 val) +{ + struct mon_config_info mon_info = {0}; + int ret = 0; + + /* mon_config cannot be more than the supported set of events */ + if (val > MAX_EVT_CONFIG_BITS) { + rdt_last_cmd_puts("Invalid event configuration\n"); + return -EINVAL; + } + + /* + * Read the current config value first. If both are the same then + * no need to write it again. + */ + mon_info.evtid = evtid; + mondata_config_read(d, &mon_info); + if (mon_info.mon_config == val) + goto out; + + mon_info.mon_config = val; + + /* + * Update MSR_IA32_EVT_CFG_BASE MSR on one of the CPUs in the + * domain. The MSRs offset from MSR MSR_IA32_EVT_CFG_BASE + * are scoped at the domain level. Writing any of these MSRs + * on one CPU is observed by all the CPUs in the domain. + */ + smp_call_function_any(&d->cpu_mask, mon_event_config_write, + &mon_info, 1); + + /* + * When an Event Configuration is changed, the bandwidth counters + * for all RMIDs and Events will be cleared by the hardware. 
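The event-configuration bits added to internal.h earlier on this page (READS_TO_LOCAL_MEM through DIRTY_VICTIMS_TO_ALL_MEM) are what mbm_config_write_domain() above range-checks against MAX_EVT_CONFIG_BITS. A small sketch composing and validating such a value; BIT()/GENMASK() are local stand-ins for the kernel macros:

#include <stdio.h>

/* Local stand-ins for the kernel's BIT()/GENMASK() helpers. */
#define BIT(n)          (1U << (n))
#define GENMASK(h, l)   (((~0U) >> (31 - (h))) & ((~0U) << (l)))

/* Event configuration bits as defined in the internal.h hunk above. */
#define READS_TO_LOCAL_MEM              BIT(0)
#define READS_TO_REMOTE_MEM             BIT(1)
#define NON_TEMP_WRITE_TO_LOCAL_MEM     BIT(2)
#define NON_TEMP_WRITE_TO_REMOTE_MEM    BIT(3)
#define READS_TO_LOCAL_S_MEM            BIT(4)
#define READS_TO_REMOTE_S_MEM           BIT(5)
#define DIRTY_VICTIMS_TO_ALL_MEM        BIT(6)
#define MAX_EVT_CONFIG_BITS             GENMASK(6, 0)

int main(void)
{
        /* Count only reads to local and remote "slow" memory. */
        unsigned int config = READS_TO_LOCAL_S_MEM | READS_TO_REMOTE_S_MEM;

        if (config > MAX_EVT_CONFIG_BITS) {
                fprintf(stderr, "invalid event configuration\n");
                return 1;
        }

        /* Value that could be written for one domain, e.g. "0=0x30" */
        printf("0=0x%02x\n", config);
        return 0;
}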
The + * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for + * every RMID on the next read to any event for every RMID. + * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62) + * cleared while it is tracked by the hardware. Clear the + * mbm_local and mbm_total counts for all the RMIDs. + */ + resctrl_arch_reset_rmid_all(r, d); + +out: + return ret; +} + +static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid) +{ + char *dom_str = NULL, *id_str; + unsigned long dom_id, val; + struct rdt_domain *d; + int ret = 0; + +next: + if (!tok || tok[0] == '\0') + return 0; + + /* Start processing the strings for each domain */ + dom_str = strim(strsep(&tok, ";")); + id_str = strsep(&dom_str, "="); + + if (!id_str || kstrtoul(id_str, 10, &dom_id)) { + rdt_last_cmd_puts("Missing '=' or non-numeric domain id\n"); + return -EINVAL; + } + + if (!dom_str || kstrtoul(dom_str, 16, &val)) { + rdt_last_cmd_puts("Non-numeric event configuration value\n"); + return -EINVAL; + } + + list_for_each_entry(d, &r->domains, list) { + if (d->id == dom_id) { + ret = mbm_config_write_domain(r, d, evtid, val); + if (ret) + return -EINVAL; + goto next; + } + } + + return -EINVAL; +} + +static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + struct rdt_resource *r = of->kn->parent->priv; + int ret; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + + mutex_lock(&rdtgroup_mutex); + + rdt_last_cmd_clear(); + + buf[nbytes - 1] = '\0'; + + ret = mon_config_write(r, buf, QOS_L3_MBM_TOTAL_EVENT_ID); + + mutex_unlock(&rdtgroup_mutex); + + return ret ?: nbytes; +} + +static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + struct rdt_resource *r = of->kn->parent->priv; + int ret; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + + mutex_lock(&rdtgroup_mutex); + + rdt_last_cmd_clear(); + + buf[nbytes - 1] = '\0'; + + ret = mon_config_write(r, buf, QOS_L3_MBM_LOCAL_EVENT_ID); + + mutex_unlock(&rdtgroup_mutex); + + return ret ?: nbytes; +} + /* rdtgroup information files for one cache resource. */ static struct rftype res_common_files[] = { { @@ -1520,6 +1761,20 @@ static struct rftype res_common_files[] = { .fflags = RF_MON_INFO | RFTYPE_RES_CACHE, }, { + .name = "mbm_total_bytes_config", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = mbm_total_bytes_config_show, + .write = mbm_total_bytes_config_write, + }, + { + .name = "mbm_local_bytes_config", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = mbm_local_bytes_config_show, + .write = mbm_local_bytes_config_write, + }, + { .name = "cpus", .mode = 0644, .kf_ops = &rdtgroup_kf_single_ops, @@ -1625,6 +1880,15 @@ void __init thread_throttle_mode_init(void) rft->fflags = RF_CTRL_INFO | RFTYPE_RES_MB; } +void __init mbm_config_rftype_init(const char *config) +{ + struct rftype *rft; + + rft = rdtgroup_get_rftype_by_name(config); + if (rft) + rft->fflags = RF_MON_INFO | RFTYPE_RES_CACHE; +} + /** * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file * @r: The resource group with which the file is associated. 
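mon_config_write() above walks a "<domain id>=<hex value>" list separated by ';', splitting with strsep() and converting the two halves with base 10 and base 16. A user-space approximation of that parsing loop (error handling simplified relative to the kernel's kstrtoul() checks):

#define _GNU_SOURCE             /* for strsep() */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Parse a "<domain>=<hex value>" list shaped like mon_config_write()'s input. */
static int parse_config(char *tok)
{
        char *dom_str, *id_str;
        unsigned long dom_id, val;

        while (tok && tok[0] != '\0') {
                dom_str = strsep(&tok, ";");
                id_str = strsep(&dom_str, "=");

                if (!id_str || !dom_str)
                        return -1;      /* missing '=' */

                dom_id = strtoul(id_str, NULL, 10);
                val = strtoul(dom_str, NULL, 16);

                printf("domain %lu -> config 0x%02lx\n", dom_id, val);
        }
        return 0;
}

int main(void)
{
        char buf[] = "0=0x30;1=0x0f";

        return parse_config(buf) ? 1 : 0;
}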
@@ -1866,13 +2130,9 @@ static int set_cache_qos_cfg(int level, bool enable) /* Pick one CPU from each domain instance to update MSR */ cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); } - cpu = get_cpu(); - /* Update QOS_CFG MSR on this cpu if it's in cpu_mask. */ - if (cpumask_test_cpu(cpu, cpu_mask)) - update(&enable); - /* Update QOS_CFG MSR on all other cpus in cpu_mask. */ - smp_call_function_many(cpu_mask, update, &enable, 1); - put_cpu(); + + /* Update QOS_CFG MSR on all the CPUs in cpu_mask */ + on_each_cpu_mask(cpu_mask, update, &enable, 1); free_cpumask_var(cpu_mask); @@ -2349,7 +2609,7 @@ static int reset_all_ctrls(struct rdt_resource *r) struct msr_param msr_param; cpumask_var_t cpu_mask; struct rdt_domain *d; - int i, cpu; + int i; if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) return -ENOMEM; @@ -2370,13 +2630,9 @@ static int reset_all_ctrls(struct rdt_resource *r) for (i = 0; i < hw_res->num_closid; i++) hw_dom->ctrl_val[i] = r->default_ctrl; } - cpu = get_cpu(); - /* Update CBM on this cpu if it's in cpu_mask. */ - if (cpumask_test_cpu(cpu, cpu_mask)) - rdt_ctrl_update(&msr_param); - /* Update CBM on all other cpus in cpu_mask. */ - smp_call_function_many(cpu_mask, rdt_ctrl_update, &msr_param, 1); - put_cpu(); + + /* Update CBM on all the CPUs in cpu_mask */ + on_each_cpu_mask(cpu_mask, rdt_ctrl_update, &msr_param, 1); free_cpumask_var(cpu_mask); @@ -2855,7 +3111,8 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) list_for_each_entry(s, &resctrl_schema_all, list) { r = s->res; - if (r->rid == RDT_RESOURCE_MBA) { + if (r->rid == RDT_RESOURCE_MBA || + r->rid == RDT_RESOURCE_SMBA) { rdtgroup_init_mba(r, rdtgrp->closid); if (is_mba_sc(r)) continue; diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index f53944fb8f7f..0dad49a09b7a 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -45,6 +45,8 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, + { X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 }, + { X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 }, { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 }, { X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 }, { 0, 0, 0, 0, 0 } diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c index 8009c8346d8f..b31ee4f1657a 100644 --- a/arch/x86/kernel/cpu/tsx.c +++ b/arch/x86/kernel/cpu/tsx.c @@ -11,6 +11,7 @@ #include <linux/cpufeature.h> #include <asm/cmdline.h> +#include <asm/cpu.h> #include "cpu.h" diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 02039ec3597d..11f83d07925e 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -143,7 +143,7 @@ static __init int parse_no_stealacc(char *arg) } early_param("no-steal-acc", parse_no_stealacc); -static unsigned long long notrace vmware_sched_clock(void) +static noinstr u64 vmware_sched_clock(void) { unsigned long long ns; |
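The scattered.c entries above place SMBA and BMEC in EBX bits 2 and 3 of CPUID leaf 0x80000020. A quick user-space check of those same bits using the compiler-provided cpuid.h helper (it only reports on the machine it runs on):

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx;

        /* Leaf 0x80000020, subleaf 0, as used by the scattered.c entries. */
        if (!__get_cpuid_count(0x80000020, 0, &eax, &ebx, &ecx, &edx)) {
                printf("CPUID leaf 0x80000020 not supported\n");
                return 0;
        }

        printf("SMBA: %s\n", (ebx & (1u << 2)) ? "yes" : "no");
        printf("BMEC: %s\n", (ebx & (1u << 3)) ? "yes" : "no");
        return 0;
}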