diff options
Diffstat (limited to 'arch/x86/kernel/cpu')
67 files changed, 6710 insertions, 10744 deletions
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 4efdf5c2efc8..2f8a58ef690e 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -24,7 +24,7 @@ obj-y += rdrand.o obj-y += match.o obj-y += bugs.o obj-y += aperfmperf.o -obj-y += cpuid-deps.o +obj-y += cpuid-deps.o cpuid_0x2_table.o obj-y += umwait.o obj-y += capflags.o powerflags.o @@ -38,6 +38,9 @@ obj-y += intel.o tsx.o obj-$(CONFIG_PM) += intel_epb.o endif obj-$(CONFIG_CPU_SUP_AMD) += amd.o +ifeq ($(CONFIG_AMD_NB)$(CONFIG_SYSFS),yy) +obj-y += amd_cache_disable.o +endif obj-$(CONFIG_CPU_SUP_HYGON) += hygon.o obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o @@ -55,6 +58,7 @@ obj-$(CONFIG_X86_SGX) += sgx/ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o +obj-$(CONFIG_BHYVE_GUEST) += bhyve.o obj-$(CONFIG_ACRN_GUEST) += acrn.o obj-$(CONFIG_DEBUG_FS) += debugfs.o diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 823f44f7bc94..bc94ff1e250a 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -3,12 +3,13 @@ #include <linux/bitops.h> #include <linux/elf.h> #include <linux/mm.h> - +#include <linux/kvm_types.h> #include <linux/io.h> #include <linux/sched.h> #include <linux/sched/clock.h> #include <linux/random.h> #include <linux/topology.h> +#include <linux/platform_data/x86/amd-fch.h> #include <asm/processor.h> #include <asm/apic.h> #include <asm/cacheinfo.h> @@ -21,6 +22,7 @@ #include <asm/delay.h> #include <asm/debugreg.h> #include <asm/resctrl.h> +#include <asm/msr.h> #include <asm/sev.h> #ifdef CONFIG_X86_64 @@ -29,7 +31,9 @@ #include "cpu.h" -static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) +u16 invlpgb_count_max __ro_after_init = 1; + +static inline int rdmsrq_amd_safe(unsigned msr, u64 *p) { u32 gprs[8] = { 0 }; int err; @@ -47,7 +51,7 @@ static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) return err; } -static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) +static inline int wrmsrq_amd_safe(unsigned msr, u64 val) { u32 gprs[8] = { 0 }; @@ -355,10 +359,15 @@ static void bsp_determine_snp(struct cpuinfo_x86 *c) /* * RMP table entry format is not architectural and is defined by the * per-processor PPR. Restrict SNP support on the known CPU models - * for which the RMP table entry format is currently defined for. + * for which the RMP table entry format is currently defined or for + * processors which support the architecturally defined RMPREAD + * instruction. */ if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && - c->x86 >= 0x19 && snp_probe_rmptable_info()) { + (cpu_feature_enabled(X86_FEATURE_ZEN3) || + cpu_feature_enabled(X86_FEATURE_ZEN4) || + cpu_feature_enabled(X86_FEATURE_RMPREAD)) && + snp_probe_rmptable_info()) { cc_platform_set(CC_ATTR_HOST_SEV_SNP); } else { setup_clear_cpu_cap(X86_FEATURE_SEV_SNP); @@ -368,6 +377,47 @@ static void bsp_determine_snp(struct cpuinfo_x86 *c) #endif } +#define ZEN_MODEL_STEP_UCODE(fam, model, step, ucode) \ + X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, fam, model), \ + step, step, ucode) + +static const struct x86_cpu_id amd_tsa_microcode[] = { + ZEN_MODEL_STEP_UCODE(0x19, 0x01, 0x1, 0x0a0011d7), + ZEN_MODEL_STEP_UCODE(0x19, 0x01, 0x2, 0x0a00123b), + ZEN_MODEL_STEP_UCODE(0x19, 0x08, 0x2, 0x0a00820d), + ZEN_MODEL_STEP_UCODE(0x19, 0x11, 0x1, 0x0a10114c), + ZEN_MODEL_STEP_UCODE(0x19, 0x11, 0x2, 0x0a10124c), + ZEN_MODEL_STEP_UCODE(0x19, 0x18, 0x1, 0x0a108109), + ZEN_MODEL_STEP_UCODE(0x19, 0x21, 0x0, 0x0a20102e), + ZEN_MODEL_STEP_UCODE(0x19, 0x21, 0x2, 0x0a201211), + ZEN_MODEL_STEP_UCODE(0x19, 0x44, 0x1, 0x0a404108), + ZEN_MODEL_STEP_UCODE(0x19, 0x50, 0x0, 0x0a500012), + ZEN_MODEL_STEP_UCODE(0x19, 0x61, 0x2, 0x0a60120a), + ZEN_MODEL_STEP_UCODE(0x19, 0x74, 0x1, 0x0a704108), + ZEN_MODEL_STEP_UCODE(0x19, 0x75, 0x2, 0x0a705208), + ZEN_MODEL_STEP_UCODE(0x19, 0x78, 0x0, 0x0a708008), + ZEN_MODEL_STEP_UCODE(0x19, 0x7c, 0x0, 0x0a70c008), + ZEN_MODEL_STEP_UCODE(0x19, 0xa0, 0x2, 0x0aa00216), + {}, +}; + +static void tsa_init(struct cpuinfo_x86 *c) +{ + if (cpu_has(c, X86_FEATURE_HYPERVISOR)) + return; + + if (cpu_has(c, X86_FEATURE_ZEN3) || + cpu_has(c, X86_FEATURE_ZEN4)) { + if (x86_match_min_microcode_rev(amd_tsa_microcode)) + setup_force_cpu_cap(X86_FEATURE_VERW_CLEAR); + else + pr_debug("%s: current revision: 0x%x\n", __func__, c->microcode); + } else { + setup_force_cpu_cap(X86_FEATURE_TSA_SQ_NO); + setup_force_cpu_cap(X86_FEATURE_TSA_L1_NO); + } +} + static void bsp_init_amd(struct cpuinfo_x86 *c) { if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { @@ -376,7 +426,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) (c->x86 == 0x10 && c->x86_model >= 0x2)) { u64 val; - rdmsrl(MSR_K7_HWCR, val); + rdmsrq(MSR_K7_HWCR, val); if (!(val & BIT(24))) pr_warn(FW_BUG "TSC doesn't count with P0 frequency!\n"); } @@ -415,7 +465,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) * Try to cache the base value so further operations can * avoid RMW. If that faults, do not enable SSBD. */ - if (!rdmsrl_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) { + if (!rdmsrq_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) { setup_force_cpu_cap(X86_FEATURE_LS_CFG_SSBD); setup_force_cpu_cap(X86_FEATURE_SSBD); x86_amd_ls_cfg_ssbd_mask = 1ULL << bit; @@ -465,6 +515,11 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) case 0x60 ... 0x7f: setup_force_cpu_cap(X86_FEATURE_ZEN5); break; + case 0x50 ... 0x5f: + case 0x80 ... 0xaf: + case 0xc0 ... 0xcf: + setup_force_cpu_cap(X86_FEATURE_ZEN6); + break; default: goto warn; } @@ -475,6 +530,11 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) } bsp_determine_snp(c); + tsa_init(c); + + if (cpu_has(c, X86_FEATURE_GP_ON_USER_CPUID)) + setup_force_cpu_cap(X86_FEATURE_CPUID_FAULT); + return; warn: @@ -486,6 +546,23 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) u64 msr; /* + * Mark using WBINVD is needed during kexec on processors that + * support SME. This provides support for performing a successful + * kexec when going from SME inactive to SME active (or vice-versa). + * + * The cache must be cleared so that if there are entries with the + * same physical address, both with and without the encryption bit, + * they don't race each other when flushed and potentially end up + * with the wrong entry being committed to memory. + * + * Test the CPUID bit directly because with mem_encrypt=off the + * BSP will clear the X86_FEATURE_SME bit and the APs will not + * see it set after that. + */ + if (c->extended_cpuid_level >= 0x8000001f && (cpuid_eax(0x8000001f) & BIT(0))) + __this_cpu_write(cache_state_incoherent, true); + + /* * BIOS support is required for SME and SEV. * For SME: If BIOS has enabled SME then adjust x86_phys_bits by * the SME physical address space reduction value. @@ -501,7 +578,7 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) */ if (cpu_has(c, X86_FEATURE_SME) || cpu_has(c, X86_FEATURE_SEV)) { /* Check if memory encryption is enabled */ - rdmsrl(MSR_AMD64_SYSCFG, msr); + rdmsrq(MSR_AMD64_SYSCFG, msr); if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) goto clear_all; @@ -518,7 +595,7 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) if (!sme_me_mask) setup_clear_cpu_cap(X86_FEATURE_SME); - rdmsrl(MSR_K7_HWCR, msr); + rdmsrq(MSR_K7_HWCR, msr); if (!(msr & MSR_K7_HWCR_SMMLOCK)) goto clear_sev; @@ -605,7 +682,7 @@ static void early_init_amd(struct cpuinfo_x86 *c) if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && !cpu_has(c, X86_FEATURE_IBPB_BRTYPE)) { if (c->x86 == 0x17 && boot_cpu_has(X86_FEATURE_AMD_IBPB)) setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE); - else if (c->x86 >= 0x19 && !wrmsrl_safe(MSR_IA32_PRED_CMD, PRED_CMD_SBPB)) { + else if (c->x86 >= 0x19 && !wrmsrq_safe(MSR_IA32_PRED_CMD, PRED_CMD_SBPB)) { setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE); setup_force_cpu_cap(X86_FEATURE_SBPB); } @@ -627,16 +704,16 @@ static void init_amd_k8(struct cpuinfo_x86 *c) * (model = 0x14) and later actually support it. * (AMD Erratum #110, docId: 25759). */ - if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) { + if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM) && !cpu_has(c, X86_FEATURE_HYPERVISOR)) { clear_cpu_cap(c, X86_FEATURE_LAHF_LM); - if (!rdmsrl_amd_safe(0xc001100d, &value)) { + if (!rdmsrq_amd_safe(0xc001100d, &value)) { value &= ~BIT_64(32); - wrmsrl_amd_safe(0xc001100d, value); + wrmsrq_amd_safe(0xc001100d, value); } } if (!c->x86_model_id[0]) - strcpy(c->x86_model_id, "Hammer"); + strscpy(c->x86_model_id, "Hammer"); #ifdef CONFIG_SMP /* @@ -781,9 +858,9 @@ static void init_amd_bd(struct cpuinfo_x86 *c) * Disable it on the affected CPUs. */ if ((c->x86_model >= 0x02) && (c->x86_model < 0x20)) { - if (!rdmsrl_safe(MSR_F15H_IC_CFG, &value) && !(value & 0x1E)) { + if (!rdmsrq_safe(MSR_F15H_IC_CFG, &value) && !(value & 0x1E)) { value |= 0x1E; - wrmsrl_safe(MSR_F15H_IC_CFG, value); + wrmsrq_safe(MSR_F15H_IC_CFG, value); } } @@ -795,9 +872,10 @@ static void init_amd_bd(struct cpuinfo_x86 *c) clear_rdrand_cpuid_bit(c); } -static const struct x86_cpu_desc erratum_1386_microcode[] = { - AMD_CPU_DESC(0x17, 0x1, 0x2, 0x0800126e), - AMD_CPU_DESC(0x17, 0x31, 0x0, 0x08301052), +static const struct x86_cpu_id erratum_1386_microcode[] = { + X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, 0x17, 0x01), 0x2, 0x2, 0x0800126e), + X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, 0x17, 0x31), 0x0, 0x0, 0x08301052), + {} }; static void fix_erratum_1386(struct cpuinfo_x86 *c) @@ -813,7 +891,7 @@ static void fix_erratum_1386(struct cpuinfo_x86 *c) * Clear the feature flag only on microcode revisions which * don't have the fix. */ - if (x86_cpu_has_min_microcode_rev(erratum_1386_microcode)) + if (x86_match_min_microcode_rev(erratum_1386_microcode)) return; clear_cpu_cap(c, X86_FEATURE_XSAVES); @@ -831,9 +909,9 @@ void init_spectral_chicken(struct cpuinfo_x86 *c) * suppresses non-branch predictions. */ if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) { - if (!rdmsrl_safe(MSR_ZEN2_SPECTRAL_CHICKEN, &value)) { + if (!rdmsrq_safe(MSR_ZEN2_SPECTRAL_CHICKEN, &value)) { value |= MSR_ZEN2_SPECTRAL_CHICKEN_BIT; - wrmsrl_safe(MSR_ZEN2_SPECTRAL_CHICKEN, value); + wrmsrq_safe(MSR_ZEN2_SPECTRAL_CHICKEN, value); } } #endif @@ -861,6 +939,16 @@ static void init_amd_zen1(struct cpuinfo_x86 *c) pr_notice_once("AMD Zen1 DIV0 bug detected. Disable SMT for full protection.\n"); setup_force_cpu_bug(X86_BUG_DIV0); + + /* + * Turn off the Instructions Retired free counter on machines that are + * susceptible to erratum #1054 "Instructions Retired Performance + * Counter May Be Inaccurate". + */ + if (c->x86_model < 0x30) { + msr_clear_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT); + clear_cpu_cap(c, X86_FEATURE_IRPERF); + } } static bool cpu_has_zenbleed_microcode(void) @@ -905,6 +993,16 @@ static void init_amd_zen2(struct cpuinfo_x86 *c) init_spectral_chicken(c); fix_erratum_1386(c); zen2_zenbleed_check(c); + + /* Disable RDSEED on AMD Cyan Skillfish because of an error. */ + if (c->x86_model == 0x47 && c->x86_stepping == 0x0) { + clear_cpu_cap(c, X86_FEATURE_RDSEED); + msr_clear_bit(MSR_AMD64_CPUID_FN_7, 18); + pr_emerg("RDSEED is not reliable on this platform; disabling.\n"); + } + + /* Correct misconfigured CPUID on some clients. */ + clear_cpu_cap(c, X86_FEATURE_INVLPGB); } static void init_amd_zen3(struct cpuinfo_x86 *c) @@ -937,8 +1035,26 @@ static void init_amd_zen4(struct cpuinfo_x86 *c) } } +static const struct x86_cpu_id zen5_rdseed_microcode[] = { + ZEN_MODEL_STEP_UCODE(0x1a, 0x02, 0x1, 0x0b00215a), + ZEN_MODEL_STEP_UCODE(0x1a, 0x08, 0x1, 0x0b008121), + ZEN_MODEL_STEP_UCODE(0x1a, 0x11, 0x0, 0x0b101054), + ZEN_MODEL_STEP_UCODE(0x1a, 0x24, 0x0, 0x0b204037), + ZEN_MODEL_STEP_UCODE(0x1a, 0x44, 0x0, 0x0b404035), + ZEN_MODEL_STEP_UCODE(0x1a, 0x44, 0x1, 0x0b404108), + ZEN_MODEL_STEP_UCODE(0x1a, 0x60, 0x0, 0x0b600037), + ZEN_MODEL_STEP_UCODE(0x1a, 0x68, 0x0, 0x0b608038), + ZEN_MODEL_STEP_UCODE(0x1a, 0x70, 0x0, 0x0b700037), + {}, +}; + static void init_amd_zen5(struct cpuinfo_x86 *c) { + if (!x86_match_min_microcode_rev(zen5_rdseed_microcode)) { + clear_cpu_cap(c, X86_FEATURE_RDSEED); + msr_clear_bit(MSR_AMD64_CPUID_FN_7, 18); + pr_emerg_once("RDSEED32 is broken. Disabling the corresponding CPUID bit.\n"); + } } static void init_amd(struct cpuinfo_x86 *c) @@ -1007,7 +1123,7 @@ static void init_amd(struct cpuinfo_x86 *c) init_amd_cacheinfo(c); if (cpu_has(c, X86_FEATURE_SVM)) { - rdmsrl(MSR_VM_CR, vm_cr); + rdmsrq(MSR_VM_CR, vm_cr); if (vm_cr & SVM_VM_CR_SVM_DIS_MASK) { pr_notice_once("SVM disabled (by BIOS) in MSR_VM_CR\n"); clear_cpu_cap(c, X86_FEATURE_SVM); @@ -1044,13 +1160,8 @@ static void init_amd(struct cpuinfo_x86 *c) if (!cpu_feature_enabled(X86_FEATURE_XENPV)) set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); - /* - * Turn on the Instructions Retired free counter on machines not - * susceptible to erratum #1054 "Instructions Retired Performance - * Counter May Be Inaccurate". - */ - if (cpu_has(c, X86_FEATURE_IRPERF) && - (boot_cpu_has(X86_FEATURE_ZEN1) && c->x86_model > 0x2f)) + /* Enable the Instructions Retired free counter */ + if (cpu_has(c, X86_FEATURE_IRPERF)) msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT); check_null_seg_clears_base(c); @@ -1064,10 +1175,14 @@ static void init_amd(struct cpuinfo_x86 *c) */ if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && cpu_has(c, X86_FEATURE_AUTOIBRS)) - WARN_ON_ONCE(msr_set_bit(MSR_EFER, _EFER_AUTOIBRS)); + WARN_ON_ONCE(msr_set_bit(MSR_EFER, _EFER_AUTOIBRS) < 0); /* AMD CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */ clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE); + + /* Enable Translation Cache Extension */ + if (cpu_has(c, X86_FEATURE_TCE)) + msr_set_bit(MSR_EFER, _EFER_TCE); } #ifdef CONFIG_X86_32 @@ -1100,8 +1215,8 @@ static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c) cpuid(0x80000006, &eax, &ebx, &ecx, &edx); - tlb_lld_4k[ENTRIES] = (ebx >> 16) & mask; - tlb_lli_4k[ENTRIES] = ebx & mask; + tlb_lld_4k = (ebx >> 16) & mask; + tlb_lli_4k = ebx & mask; /* * K8 doesn't have 2M/4M entries in the L2 TLB so read out the L1 TLB @@ -1114,26 +1229,30 @@ static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c) /* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ if (!((eax >> 16) & mask)) - tlb_lld_2m[ENTRIES] = (cpuid_eax(0x80000005) >> 16) & 0xff; + tlb_lld_2m = (cpuid_eax(0x80000005) >> 16) & 0xff; else - tlb_lld_2m[ENTRIES] = (eax >> 16) & mask; + tlb_lld_2m = (eax >> 16) & mask; /* a 4M entry uses two 2M entries */ - tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1; + tlb_lld_4m = tlb_lld_2m >> 1; /* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ if (!(eax & mask)) { /* Erratum 658 */ if (c->x86 == 0x15 && c->x86_model <= 0x1f) { - tlb_lli_2m[ENTRIES] = 1024; + tlb_lli_2m = 1024; } else { cpuid(0x80000005, &eax, &ebx, &ecx, &edx); - tlb_lli_2m[ENTRIES] = eax & 0xff; + tlb_lli_2m = eax & 0xff; } } else - tlb_lli_2m[ENTRIES] = eax & mask; + tlb_lli_2m = eax & mask; + + tlb_lli_4m = tlb_lli_2m >> 1; - tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1; + /* Max number of pages INVLPGB can invalidate in one shot */ + if (cpu_has(c, X86_FEATURE_INVLPGB)) + invlpgb_count_max = (cpuid_edx(0x80000008) & 0xffff) + 1; } static const struct cpu_dev amd_cpu_dev = { @@ -1185,7 +1304,7 @@ void amd_set_dr_addr_mask(unsigned long mask, unsigned int dr) if (per_cpu(amd_dr_addr_mask, cpu)[dr] == mask) return; - wrmsr(amd_msr_dr_addr_masks[dr], mask, 0); + wrmsrq(amd_msr_dr_addr_masks[dr], mask); per_cpu(amd_dr_addr_mask, cpu)[dr] = mask; } @@ -1199,7 +1318,7 @@ unsigned long amd_get_dr_addr_mask(unsigned int dr) return per_cpu(amd_dr_addr_mask[dr], smp_processor_id()); } -EXPORT_SYMBOL_GPL(amd_get_dr_addr_mask); +EXPORT_SYMBOL_FOR_KVM(amd_get_dr_addr_mask); static void zenbleed_check_cpu(void *unused) { @@ -1216,3 +1335,72 @@ void amd_check_microcode(void) if (cpu_feature_enabled(X86_FEATURE_ZEN2)) on_each_cpu(zenbleed_check_cpu, NULL, 1); } + +static const char * const s5_reset_reason_txt[] = { + [0] = "thermal pin BP_THERMTRIP_L was tripped", + [1] = "power button was pressed for 4 seconds", + [2] = "shutdown pin was tripped", + [4] = "remote ASF power off command was received", + [9] = "internal CPU thermal limit was tripped", + [16] = "system reset pin BP_SYS_RST_L was tripped", + [17] = "software issued PCI reset", + [18] = "software wrote 0x4 to reset control register 0xCF9", + [19] = "software wrote 0x6 to reset control register 0xCF9", + [20] = "software wrote 0xE to reset control register 0xCF9", + [21] = "ACPI power state transition occurred", + [22] = "keyboard reset pin KB_RST_L was tripped", + [23] = "internal CPU shutdown event occurred", + [24] = "system failed to boot before failed boot timer expired", + [25] = "hardware watchdog timer expired", + [26] = "remote ASF reset command was received", + [27] = "an uncorrected error caused a data fabric sync flood event", + [29] = "FCH and MP1 failed warm reset handshake", + [30] = "a parity error occurred", + [31] = "a software sync flood event occurred", +}; + +static __init int print_s5_reset_status_mmio(void) +{ + void __iomem *addr; + u32 value; + int i; + + if (!cpu_feature_enabled(X86_FEATURE_ZEN)) + return 0; + + addr = ioremap(FCH_PM_BASE + FCH_PM_S5_RESET_STATUS, sizeof(value)); + if (!addr) + return 0; + + value = ioread32(addr); + + /* Value with "all bits set" is an error response and should be ignored. */ + if (value == U32_MAX) { + iounmap(addr); + return 0; + } + + /* + * Clear all reason bits so they won't be retained if the next reset + * does not update the register. Besides, some bits are never cleared by + * hardware so it's software's responsibility to clear them. + * + * Writing the value back effectively clears all reason bits as they are + * write-1-to-clear. + */ + iowrite32(value, addr); + iounmap(addr); + + for (i = 0; i < ARRAY_SIZE(s5_reset_reason_txt); i++) { + if (!(value & BIT(i))) + continue; + + if (s5_reset_reason_txt[i]) { + pr_info("x86/amd: Previous system reset reason [0x%08x]: %s\n", + value, s5_reset_reason_txt[i]); + } + } + + return 0; +} +late_initcall(print_s5_reset_status_mmio); diff --git a/arch/x86/kernel/cpu/amd_cache_disable.c b/arch/x86/kernel/cpu/amd_cache_disable.c new file mode 100644 index 000000000000..8843b9557aea --- /dev/null +++ b/arch/x86/kernel/cpu/amd_cache_disable.c @@ -0,0 +1,301 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * AMD L3 cache_disable_{0,1} sysfs handling + * Documentation/ABI/testing/sysfs-devices-system-cpu + */ + +#include <linux/cacheinfo.h> +#include <linux/capability.h> +#include <linux/pci.h> +#include <linux/sysfs.h> + +#include <asm/amd/nb.h> + +#include "cpu.h" + +/* + * L3 cache descriptors + */ +static void amd_calc_l3_indices(struct amd_northbridge *nb) +{ + struct amd_l3_cache *l3 = &nb->l3_cache; + unsigned int sc0, sc1, sc2, sc3; + u32 val = 0; + + pci_read_config_dword(nb->misc, 0x1C4, &val); + + /* calculate subcache sizes */ + l3->subcaches[0] = sc0 = !(val & BIT(0)); + l3->subcaches[1] = sc1 = !(val & BIT(4)); + + if (boot_cpu_data.x86 == 0x15) { + l3->subcaches[0] = sc0 += !(val & BIT(1)); + l3->subcaches[1] = sc1 += !(val & BIT(5)); + } + + l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9)); + l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13)); + + l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; +} + +/* + * check whether a slot used for disabling an L3 index is occupied. + * @l3: L3 cache descriptor + * @slot: slot number (0..1) + * + * @returns: the disabled index if used or negative value if slot free. + */ +static int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned int slot) +{ + unsigned int reg = 0; + + pci_read_config_dword(nb->misc, 0x1BC + slot * 4, ®); + + /* check whether this slot is activated already */ + if (reg & (3UL << 30)) + return reg & 0xfff; + + return -1; +} + +static ssize_t show_cache_disable(struct cacheinfo *ci, char *buf, unsigned int slot) +{ + int index; + struct amd_northbridge *nb = ci->priv; + + index = amd_get_l3_disable_slot(nb, slot); + if (index >= 0) + return sysfs_emit(buf, "%d\n", index); + + return sysfs_emit(buf, "FREE\n"); +} + +#define SHOW_CACHE_DISABLE(slot) \ +static ssize_t \ +cache_disable_##slot##_show(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct cacheinfo *ci = dev_get_drvdata(dev); \ + return show_cache_disable(ci, buf, slot); \ +} + +SHOW_CACHE_DISABLE(0) +SHOW_CACHE_DISABLE(1) + +static void amd_l3_disable_index(struct amd_northbridge *nb, int cpu, + unsigned int slot, unsigned long idx) +{ + int i; + + idx |= BIT(30); + + /* + * disable index in all 4 subcaches + */ + for (i = 0; i < 4; i++) { + u32 reg = idx | (i << 20); + + if (!nb->l3_cache.subcaches[i]) + continue; + + pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg); + + /* + * We need to WBINVD on a core on the node containing the L3 + * cache which indices we disable therefore a simple wbinvd() + * is not sufficient. + */ + wbinvd_on_cpu(cpu); + + reg |= BIT(31); + pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg); + } +} + +/* + * disable a L3 cache index by using a disable-slot + * + * @l3: L3 cache descriptor + * @cpu: A CPU on the node containing the L3 cache + * @slot: slot number (0..1) + * @index: index to disable + * + * @return: 0 on success, error status on failure + */ +static int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, + unsigned int slot, unsigned long index) +{ + int ret = 0; + + /* check if @slot is already used or the index is already disabled */ + ret = amd_get_l3_disable_slot(nb, slot); + if (ret >= 0) + return -EEXIST; + + if (index > nb->l3_cache.indices) + return -EINVAL; + + /* check whether the other slot has disabled the same index already */ + if (index == amd_get_l3_disable_slot(nb, !slot)) + return -EEXIST; + + amd_l3_disable_index(nb, cpu, slot, index); + + return 0; +} + +static ssize_t store_cache_disable(struct cacheinfo *ci, const char *buf, + size_t count, unsigned int slot) +{ + struct amd_northbridge *nb = ci->priv; + unsigned long val = 0; + int cpu, err = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + cpu = cpumask_first(&ci->shared_cpu_map); + + if (kstrtoul(buf, 10, &val) < 0) + return -EINVAL; + + err = amd_set_l3_disable_slot(nb, cpu, slot, val); + if (err) { + if (err == -EEXIST) + pr_warn("L3 slot %d in use/index already disabled!\n", + slot); + return err; + } + return count; +} + +#define STORE_CACHE_DISABLE(slot) \ +static ssize_t \ +cache_disable_##slot##_store(struct device *dev, \ + struct device_attribute *attr, \ + const char *buf, size_t count) \ +{ \ + struct cacheinfo *ci = dev_get_drvdata(dev); \ + return store_cache_disable(ci, buf, count, slot); \ +} + +STORE_CACHE_DISABLE(0) +STORE_CACHE_DISABLE(1) + +static ssize_t subcaches_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cacheinfo *ci = dev_get_drvdata(dev); + int cpu = cpumask_first(&ci->shared_cpu_map); + + return sysfs_emit(buf, "%x\n", amd_get_subcaches(cpu)); +} + +static ssize_t subcaches_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct cacheinfo *ci = dev_get_drvdata(dev); + int cpu = cpumask_first(&ci->shared_cpu_map); + unsigned long val; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (kstrtoul(buf, 16, &val) < 0) + return -EINVAL; + + if (amd_set_subcaches(cpu, val)) + return -EINVAL; + + return count; +} + +static DEVICE_ATTR_RW(cache_disable_0); +static DEVICE_ATTR_RW(cache_disable_1); +static DEVICE_ATTR_RW(subcaches); + +static umode_t cache_private_attrs_is_visible(struct kobject *kobj, + struct attribute *attr, int unused) +{ + struct device *dev = kobj_to_dev(kobj); + struct cacheinfo *ci = dev_get_drvdata(dev); + umode_t mode = attr->mode; + + if (!ci->priv) + return 0; + + if ((attr == &dev_attr_subcaches.attr) && + amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + return mode; + + if ((attr == &dev_attr_cache_disable_0.attr || + attr == &dev_attr_cache_disable_1.attr) && + amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) + return mode; + + return 0; +} + +static struct attribute_group cache_private_group = { + .is_visible = cache_private_attrs_is_visible, +}; + +static void init_amd_l3_attrs(void) +{ + static struct attribute **amd_l3_attrs; + int n = 1; + + if (amd_l3_attrs) /* already initialized */ + return; + + if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) + n += 2; + if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + n += 1; + + amd_l3_attrs = kcalloc(n, sizeof(*amd_l3_attrs), GFP_KERNEL); + if (!amd_l3_attrs) + return; + + n = 0; + if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) { + amd_l3_attrs[n++] = &dev_attr_cache_disable_0.attr; + amd_l3_attrs[n++] = &dev_attr_cache_disable_1.attr; + } + if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + amd_l3_attrs[n++] = &dev_attr_subcaches.attr; + + cache_private_group.attrs = amd_l3_attrs; +} + +const struct attribute_group *cache_get_priv_group(struct cacheinfo *ci) +{ + struct amd_northbridge *nb = ci->priv; + + if (ci->level < 3 || !nb) + return NULL; + + if (nb && nb->l3_cache.indices) + init_amd_l3_attrs(); + + return &cache_private_group; +} + +struct amd_northbridge *amd_init_l3_cache(int index) +{ + struct amd_northbridge *nb; + int node; + + /* only for L3, and not in virtualized environments */ + if (index < 3) + return NULL; + + node = topology_amd_node_id(smp_processor_id()); + nb = node_to_amd_nb(node); + if (nb && !nb->l3_cache.indices) + amd_calc_l3_indices(nb); + + return nb; +} diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index f642de2ebdac..7ffc78d5ebf2 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -20,6 +20,7 @@ #include <asm/cpu.h> #include <asm/cpu_device_id.h> #include <asm/intel-family.h> +#include <asm/msr.h> #include "cpu.h" @@ -36,12 +37,12 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct aperfmperf, cpu_samples) = { .seq = SEQCNT_ZERO(cpu_samples.seq) }; -static void init_counter_refs(void) +static void init_counter_refs(void *data) { u64 aperf, mperf; - rdmsrl(MSR_IA32_APERF, aperf); - rdmsrl(MSR_IA32_MPERF, mperf); + rdmsrq(MSR_IA32_APERF, aperf); + rdmsrq(MSR_IA32_MPERF, mperf); this_cpu_write(cpu_samples.aperf, aperf); this_cpu_write(cpu_samples.mperf, mperf); @@ -99,7 +100,7 @@ static bool __init turbo_disabled(void) u64 misc_en; int err; - err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en); + err = rdmsrq_safe(MSR_IA32_MISC_ENABLE, &misc_en); if (err) return false; @@ -110,11 +111,11 @@ static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) { int err; - err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq); + err = rdmsrq_safe(MSR_ATOM_CORE_RATIOS, base_freq); if (err) return false; - err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq); + err = rdmsrq_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq); if (err) return false; @@ -152,13 +153,13 @@ static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int err, i; u64 msr; - err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq); if (err) return false; *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ - err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); + err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &msr); if (err) return false; @@ -190,17 +191,17 @@ static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int s u32 group_size; int err, i; - err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq); if (err) return false; *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ - err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios); + err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &ratios); if (err) return false; - err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts); + err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT1, &counts); if (err) return false; @@ -220,11 +221,11 @@ static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) u64 msr; int err; - err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq); if (err) return false; - err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); + err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &msr); if (err) return false; @@ -288,16 +289,20 @@ out: } #ifdef CONFIG_PM_SLEEP -static struct syscore_ops freq_invariance_syscore_ops = { +static const struct syscore_ops freq_invariance_syscore_ops = { .resume = init_counter_refs, }; -static void register_freq_invariance_syscore_ops(void) +static struct syscore freq_invariance_syscore = { + .ops = &freq_invariance_syscore_ops, +}; + +static void register_freq_invariance_syscore(void) { - register_syscore_ops(&freq_invariance_syscore_ops); + register_syscore(&freq_invariance_syscore); } #else -static inline void register_freq_invariance_syscore_ops(void) {} +static inline void register_freq_invariance_syscore(void) {} #endif static void freq_invariance_enable(void) @@ -307,7 +312,7 @@ static void freq_invariance_enable(void) return; } static_branch_enable_cpuslocked(&arch_scale_freq_key); - register_freq_invariance_syscore_ops(); + register_freq_invariance_syscore(); pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio); } @@ -474,8 +479,8 @@ void arch_scale_freq_tick(void) if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) return; - rdmsrl(MSR_IA32_APERF, aperf); - rdmsrl(MSR_IA32_MPERF, mperf); + rdmsrq(MSR_IA32_APERF, aperf); + rdmsrq(MSR_IA32_MPERF, mperf); acnt = aperf - s->aperf; mcnt = mperf - s->mperf; @@ -498,7 +503,7 @@ void arch_scale_freq_tick(void) */ #define MAX_SAMPLE_AGE ((unsigned long)HZ / 50) -unsigned int arch_freq_get_on_cpu(int cpu) +int arch_freq_get_on_cpu(int cpu) { struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu); unsigned int seq, freq; @@ -534,7 +539,7 @@ static int __init bp_init_aperfmperf(void) if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) return 0; - init_counter_refs(); + init_counter_refs(NULL); bp_init_freq_invariance(); return 0; } @@ -543,5 +548,5 @@ early_initcall(bp_init_aperfmperf); void ap_init_aperfmperf(void) { if (cpu_feature_enabled(X86_FEATURE_APERFMPERF)) - init_counter_refs(); + init_counter_refs(NULL); } diff --git a/arch/x86/kernel/cpu/bhyve.c b/arch/x86/kernel/cpu/bhyve.c new file mode 100644 index 000000000000..f1a8ca3dd1ed --- /dev/null +++ b/arch/x86/kernel/cpu/bhyve.c @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * FreeBSD Bhyve guest enlightenments + * + * Copyright © 2025 Amazon.com, Inc. or its affiliates. + * + * Author: David Woodhouse <dwmw2@infradead.org> + */ + +#include <linux/init.h> +#include <linux/export.h> +#include <asm/processor.h> +#include <asm/hypervisor.h> + +static uint32_t bhyve_cpuid_base; +static uint32_t bhyve_cpuid_max; + +#define BHYVE_SIGNATURE "bhyve bhyve " + +#define CPUID_BHYVE_FEATURES 0x40000001 + +/* Features advertised in CPUID_BHYVE_FEATURES %eax */ + +/* MSI Extended Dest ID */ +#define CPUID_BHYVE_FEAT_EXT_DEST_ID (1UL << 0) + +static uint32_t __init bhyve_detect(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) + return 0; + + bhyve_cpuid_base = cpuid_base_hypervisor(BHYVE_SIGNATURE, 0); + if (!bhyve_cpuid_base) + return 0; + + bhyve_cpuid_max = cpuid_eax(bhyve_cpuid_base); + return bhyve_cpuid_max; +} + +static uint32_t bhyve_features(void) +{ + unsigned int cpuid_leaf = bhyve_cpuid_base | CPUID_BHYVE_FEATURES; + + if (bhyve_cpuid_max < cpuid_leaf) + return 0; + + return cpuid_eax(cpuid_leaf); +} + +static bool __init bhyve_ext_dest_id(void) +{ + return !!(bhyve_features() & CPUID_BHYVE_FEAT_EXT_DEST_ID); +} + +static bool __init bhyve_x2apic_available(void) +{ + return true; +} + +const struct hypervisor_x86 x86_hyper_bhyve __refconst = { + .name = "Bhyve", + .detect = bhyve_detect, + .init.init_platform = x86_init_noop, + .init.x2apic_available = bhyve_x2apic_available, + .init.msi_ext_dest_id = bhyve_ext_dest_id, +}; diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 47a01d4028f6..d0a2847a4bb0 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -16,6 +16,7 @@ #include <linux/sched/smt.h> #include <linux/pgtable.h> #include <linux/bpf.h> +#include <linux/kvm_types.h> #include <asm/spec-ctrl.h> #include <asm/cmdline.h> @@ -34,32 +35,41 @@ #include "cpu.h" -static void __init spectre_v1_select_mitigation(void); -static void __init spectre_v2_select_mitigation(void); -static void __init retbleed_select_mitigation(void); -static void __init spectre_v2_user_select_mitigation(void); -static void __init ssb_select_mitigation(void); -static void __init l1tf_select_mitigation(void); -static void __init mds_select_mitigation(void); -static void __init md_clear_update_mitigation(void); -static void __init md_clear_select_mitigation(void); -static void __init taa_select_mitigation(void); -static void __init mmio_select_mitigation(void); -static void __init srbds_select_mitigation(void); -static void __init l1d_flush_select_mitigation(void); -static void __init srso_select_mitigation(void); -static void __init gds_select_mitigation(void); +/* + * Speculation Vulnerability Handling + * + * Each vulnerability is handled with the following functions: + * <vuln>_select_mitigation() -- Selects a mitigation to use. This should + * take into account all relevant command line + * options. + * <vuln>_update_mitigation() -- This is called after all vulnerabilities have + * selected a mitigation, in case the selection + * may want to change based on other choices + * made. This function is optional. + * <vuln>_apply_mitigation() -- Enable the selected mitigation. + * + * The compile-time mitigation in all cases should be AUTO. An explicit + * command-line option can override AUTO. If no such option is + * provided, <vuln>_select_mitigation() will override AUTO to the best + * mitigation option. + */ /* The base value of the SPEC_CTRL MSR without task-specific bits set */ u64 x86_spec_ctrl_base; -EXPORT_SYMBOL_GPL(x86_spec_ctrl_base); /* The current value of the SPEC_CTRL MSR with task-specific bits set */ DEFINE_PER_CPU(u64, x86_spec_ctrl_current); EXPORT_PER_CPU_SYMBOL_GPL(x86_spec_ctrl_current); +/* + * Set when the CPU has run a potentially malicious guest. An IBPB will + * be needed to before running userspace. That IBPB will flush the branch + * predictor content. + */ +DEFINE_PER_CPU(bool, x86_ibpb_exit_to_user); +EXPORT_PER_CPU_SYMBOL_GPL(x86_ibpb_exit_to_user); + u64 x86_pred_cmd __ro_after_init = PRED_CMD_IBPB; -EXPORT_SYMBOL_GPL(x86_pred_cmd); static u64 __ro_after_init x86_arch_cap_msr; @@ -67,11 +77,18 @@ static DEFINE_MUTEX(spec_ctrl_mutex); void (*x86_return_thunk)(void) __ro_after_init = __x86_return_thunk; +static void __init set_return_thunk(void *thunk) +{ + x86_return_thunk = thunk; + + pr_info("active return thunk: %ps\n", thunk); +} + /* Update SPEC_CTRL MSR and its cached copy unconditionally */ static void update_spec_ctrl(u64 val) { this_cpu_write(x86_spec_ctrl_current, val); - wrmsrl(MSR_IA32_SPEC_CTRL, val); + wrmsrq(MSR_IA32_SPEC_CTRL, val); } /* @@ -90,7 +107,7 @@ void update_spec_ctrl_cond(u64 val) * forced the update can be delayed until that time. */ if (!cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS)) - wrmsrl(MSR_IA32_SPEC_CTRL, val); + wrmsrq(MSR_IA32_SPEC_CTRL, val); } noinstr u64 spec_ctrl_current(void) @@ -113,9 +130,13 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); /* Control unconditional IBPB in switch_mm() */ DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb); -/* Control MDS CPU buffer clear before idling (halt, mwait) */ -DEFINE_STATIC_KEY_FALSE(mds_idle_clear); -EXPORT_SYMBOL_GPL(mds_idle_clear); +/* Control IBPB on vCPU load */ +DEFINE_STATIC_KEY_FALSE(switch_vcpu_ibpb); +EXPORT_SYMBOL_FOR_KVM(switch_vcpu_ibpb); + +/* Control CPU buffer clear before idling (halt, mwait) */ +DEFINE_STATIC_KEY_FALSE(cpu_buf_idle_clear); +EXPORT_SYMBOL_GPL(cpu_buf_idle_clear); /* * Controls whether l1d flush based mitigations are enabled, @@ -124,57 +145,37 @@ EXPORT_SYMBOL_GPL(mds_idle_clear); */ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); -/* Controls CPU Fill buffer clear before KVM guest MMIO accesses */ -DEFINE_STATIC_KEY_FALSE(mmio_stale_data_clear); -EXPORT_SYMBOL_GPL(mmio_stale_data_clear); +#undef pr_fmt +#define pr_fmt(fmt) "mitigations: " fmt -void __init cpu_select_mitigations(void) +static void __init cpu_print_attack_vectors(void) { - /* - * Read the SPEC_CTRL MSR to account for reserved bits which may - * have unknown values. AMD64_LS_CFG MSR is cached in the early AMD - * init code as it is not enumerated and depends on the family. - */ - if (cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) { - rdmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + pr_info("Enabled attack vectors: "); - /* - * Previously running kernel (kexec), may have some controls - * turned ON. Clear them and let the mitigations setup below - * rediscover them based on configuration. - */ - x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK; - } + if (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL)) + pr_cont("user_kernel, "); - x86_arch_cap_msr = x86_read_arch_cap_msr(); + if (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER)) + pr_cont("user_user, "); - /* Select the proper CPU mitigations before patching alternatives: */ - spectre_v1_select_mitigation(); - spectre_v2_select_mitigation(); - /* - * retbleed_select_mitigation() relies on the state set by - * spectre_v2_select_mitigation(); specifically it wants to know about - * spectre_v2=ibrs. - */ - retbleed_select_mitigation(); - /* - * spectre_v2_user_select_mitigation() relies on the state set by - * retbleed_select_mitigation(); specifically the STIBP selection is - * forced for UNRET or IBPB. - */ - spectre_v2_user_select_mitigation(); - ssb_select_mitigation(); - l1tf_select_mitigation(); - md_clear_select_mitigation(); - srbds_select_mitigation(); - l1d_flush_select_mitigation(); + if (cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST)) + pr_cont("guest_host, "); - /* - * srso_select_mitigation() depends and must run after - * retbleed_select_mitigation(). - */ - srso_select_mitigation(); - gds_select_mitigation(); + if (cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST)) + pr_cont("guest_guest, "); + + pr_cont("SMT mitigations: "); + + switch (smt_mitigations) { + case SMT_MITIGATIONS_OFF: + pr_cont("off\n"); + break; + case SMT_MITIGATIONS_AUTO: + pr_cont("auto\n"); + break; + case SMT_MITIGATIONS_ON: + pr_cont("on\n"); + } } /* @@ -217,24 +218,86 @@ x86_virt_spec_ctrl(u64 guest_virt_spec_ctrl, bool setguest) speculation_ctrl_update(tif); } } -EXPORT_SYMBOL_GPL(x86_virt_spec_ctrl); +EXPORT_SYMBOL_FOR_KVM(x86_virt_spec_ctrl); static void x86_amd_ssb_disable(void) { u64 msrval = x86_amd_ls_cfg_base | x86_amd_ls_cfg_ssbd_mask; if (boot_cpu_has(X86_FEATURE_VIRT_SSBD)) - wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, SPEC_CTRL_SSBD); + wrmsrq(MSR_AMD64_VIRT_SPEC_CTRL, SPEC_CTRL_SSBD); else if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD)) - wrmsrl(MSR_AMD64_LS_CFG, msrval); + wrmsrq(MSR_AMD64_LS_CFG, msrval); } #undef pr_fmt #define pr_fmt(fmt) "MDS: " fmt +/* + * Returns true if vulnerability should be mitigated based on the + * selected attack vector controls. + * + * See Documentation/admin-guide/hw-vuln/attack_vector_controls.rst + */ +static bool __init should_mitigate_vuln(unsigned int bug) +{ + switch (bug) { + /* + * The only runtime-selected spectre_v1 mitigations in the kernel are + * related to SWAPGS protection on kernel entry. Therefore, protection + * is only required for the user->kernel attack vector. + */ + case X86_BUG_SPECTRE_V1: + return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL); + + case X86_BUG_SPECTRE_V2: + case X86_BUG_RETBLEED: + case X86_BUG_L1TF: + case X86_BUG_ITS: + return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL) || + cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST); + + case X86_BUG_SPECTRE_V2_USER: + return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER) || + cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST); + + /* + * All the vulnerabilities below allow potentially leaking data + * across address spaces. Therefore, mitigation is required for + * any of these 4 attack vectors. + */ + case X86_BUG_MDS: + case X86_BUG_TAA: + case X86_BUG_MMIO_STALE_DATA: + case X86_BUG_RFDS: + case X86_BUG_SRBDS: + return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL) || + cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST) || + cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER) || + cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST); + + case X86_BUG_GDS: + return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL) || + cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST) || + cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER) || + cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST) || + (smt_mitigations != SMT_MITIGATIONS_OFF); + + case X86_BUG_SPEC_STORE_BYPASS: + return cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER); + + case X86_BUG_VMSCAPE: + return cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST); + + default: + WARN(1, "Unknown bug %x\n", bug); + return false; + } +} + /* Default mitigation for MDS-affected CPUs */ static enum mds_mitigations mds_mitigation __ro_after_init = - IS_ENABLED(CONFIG_MITIGATION_MDS) ? MDS_MITIGATION_FULL : MDS_MITIGATION_OFF; + IS_ENABLED(CONFIG_MITIGATION_MDS) ? MDS_MITIGATION_AUTO : MDS_MITIGATION_OFF; static bool mds_nosmt __ro_after_init = false; static const char * const mds_strings[] = { @@ -243,21 +306,91 @@ static const char * const mds_strings[] = { [MDS_MITIGATION_VMWERV] = "Vulnerable: Clear CPU buffers attempted, no microcode", }; +enum taa_mitigations { + TAA_MITIGATION_OFF, + TAA_MITIGATION_AUTO, + TAA_MITIGATION_UCODE_NEEDED, + TAA_MITIGATION_VERW, + TAA_MITIGATION_TSX_DISABLED, +}; + +/* Default mitigation for TAA-affected CPUs */ +static enum taa_mitigations taa_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_TAA) ? TAA_MITIGATION_AUTO : TAA_MITIGATION_OFF; + +enum mmio_mitigations { + MMIO_MITIGATION_OFF, + MMIO_MITIGATION_AUTO, + MMIO_MITIGATION_UCODE_NEEDED, + MMIO_MITIGATION_VERW, +}; + +/* Default mitigation for Processor MMIO Stale Data vulnerabilities */ +static enum mmio_mitigations mmio_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_MMIO_STALE_DATA) ? MMIO_MITIGATION_AUTO : MMIO_MITIGATION_OFF; + +enum rfds_mitigations { + RFDS_MITIGATION_OFF, + RFDS_MITIGATION_AUTO, + RFDS_MITIGATION_VERW, + RFDS_MITIGATION_UCODE_NEEDED, +}; + +/* Default mitigation for Register File Data Sampling */ +static enum rfds_mitigations rfds_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_RFDS) ? RFDS_MITIGATION_AUTO : RFDS_MITIGATION_OFF; + +/* + * Set if any of MDS/TAA/MMIO/RFDS are going to enable VERW clearing on exit to + * userspace *and* on entry to KVM guests. + */ +static bool verw_clear_cpu_buf_mitigation_selected __ro_after_init; + static void __init mds_select_mitigation(void) { - if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) { + if (!boot_cpu_has_bug(X86_BUG_MDS)) { mds_mitigation = MDS_MITIGATION_OFF; return; } + if (mds_mitigation == MDS_MITIGATION_AUTO) { + if (should_mitigate_vuln(X86_BUG_MDS)) + mds_mitigation = MDS_MITIGATION_FULL; + else + mds_mitigation = MDS_MITIGATION_OFF; + } + + if (mds_mitigation == MDS_MITIGATION_OFF) + return; + + verw_clear_cpu_buf_mitigation_selected = true; +} + +static void __init mds_update_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_MDS)) + return; + + /* If TAA, MMIO, or RFDS are being mitigated, MDS gets mitigated too. */ + if (verw_clear_cpu_buf_mitigation_selected) + mds_mitigation = MDS_MITIGATION_FULL; + if (mds_mitigation == MDS_MITIGATION_FULL) { if (!boot_cpu_has(X86_FEATURE_MD_CLEAR)) mds_mitigation = MDS_MITIGATION_VMWERV; + } - setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + pr_info("%s\n", mds_strings[mds_mitigation]); +} +static void __init mds_apply_mitigation(void) +{ + if (mds_mitigation == MDS_MITIGATION_FULL || + mds_mitigation == MDS_MITIGATION_VMWERV) { + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM); if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) && - (mds_nosmt || cpu_mitigations_auto_nosmt())) + (mds_nosmt || smt_mitigations == SMT_MITIGATIONS_ON)) cpu_smt_disable(false); } } @@ -286,16 +419,6 @@ early_param("mds", mds_cmdline); #undef pr_fmt #define pr_fmt(fmt) "TAA: " fmt -enum taa_mitigations { - TAA_MITIGATION_OFF, - TAA_MITIGATION_UCODE_NEEDED, - TAA_MITIGATION_VERW, - TAA_MITIGATION_TSX_DISABLED, -}; - -/* Default mitigation for TAA-affected CPUs */ -static enum taa_mitigations taa_mitigation __ro_after_init = - IS_ENABLED(CONFIG_MITIGATION_TAA) ? TAA_MITIGATION_VERW : TAA_MITIGATION_OFF; static bool taa_nosmt __ro_after_init; static const char * const taa_strings[] = { @@ -305,6 +428,11 @@ static const char * const taa_strings[] = { [TAA_MITIGATION_TSX_DISABLED] = "Mitigation: TSX disabled", }; +static bool __init taa_vulnerable(void) +{ + return boot_cpu_has_bug(X86_BUG_TAA) && boot_cpu_has(X86_FEATURE_RTM); +} + static void __init taa_select_mitigation(void) { if (!boot_cpu_has_bug(X86_BUG_TAA)) { @@ -318,48 +446,65 @@ static void __init taa_select_mitigation(void) return; } - if (cpu_mitigations_off()) { - taa_mitigation = TAA_MITIGATION_OFF; - return; + /* Microcode will be checked in taa_update_mitigation(). */ + if (taa_mitigation == TAA_MITIGATION_AUTO) { + if (should_mitigate_vuln(X86_BUG_TAA)) + taa_mitigation = TAA_MITIGATION_VERW; + else + taa_mitigation = TAA_MITIGATION_OFF; } - /* - * TAA mitigation via VERW is turned off if both - * tsx_async_abort=off and mds=off are specified. - */ - if (taa_mitigation == TAA_MITIGATION_OFF && - mds_mitigation == MDS_MITIGATION_OFF) + if (taa_mitigation != TAA_MITIGATION_OFF) + verw_clear_cpu_buf_mitigation_selected = true; +} + +static void __init taa_update_mitigation(void) +{ + if (!taa_vulnerable()) return; - if (boot_cpu_has(X86_FEATURE_MD_CLEAR)) + if (verw_clear_cpu_buf_mitigation_selected) taa_mitigation = TAA_MITIGATION_VERW; - else - taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; - /* - * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1. - * A microcode update fixes this behavior to clear CPU buffers. It also - * adds support for MSR_IA32_TSX_CTRL which is enumerated by the - * ARCH_CAP_TSX_CTRL_MSR bit. - * - * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode - * update is required. - */ - if ( (x86_arch_cap_msr & ARCH_CAP_MDS_NO) && - !(x86_arch_cap_msr & ARCH_CAP_TSX_CTRL_MSR)) - taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; + if (taa_mitigation == TAA_MITIGATION_VERW) { + /* Check if the requisite ucode is available. */ + if (!boot_cpu_has(X86_FEATURE_MD_CLEAR)) + taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; - /* - * TSX is enabled, select alternate mitigation for TAA which is - * the same as MDS. Enable MDS static branch to clear CPU buffers. - * - * For guests that can't determine whether the correct microcode is - * present on host, enable the mitigation for UCODE_NEEDED as well. - */ - setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + /* + * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1. + * A microcode update fixes this behavior to clear CPU buffers. It also + * adds support for MSR_IA32_TSX_CTRL which is enumerated by the + * ARCH_CAP_TSX_CTRL_MSR bit. + * + * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode + * update is required. + */ + if ((x86_arch_cap_msr & ARCH_CAP_MDS_NO) && + !(x86_arch_cap_msr & ARCH_CAP_TSX_CTRL_MSR)) + taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; + } - if (taa_nosmt || cpu_mitigations_auto_nosmt()) - cpu_smt_disable(false); + pr_info("%s\n", taa_strings[taa_mitigation]); +} + +static void __init taa_apply_mitigation(void) +{ + if (taa_mitigation == TAA_MITIGATION_VERW || + taa_mitigation == TAA_MITIGATION_UCODE_NEEDED) { + /* + * TSX is enabled, select alternate mitigation for TAA which is + * the same as MDS. Enable MDS static branch to clear CPU buffers. + * + * For guests that can't determine whether the correct microcode is + * present on host, enable the mitigation for UCODE_NEEDED as well. + */ + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM); + + if (taa_nosmt || smt_mitigations == SMT_MITIGATIONS_ON) + cpu_smt_disable(false); + } } static int __init tsx_async_abort_parse_cmdline(char *str) @@ -386,15 +531,6 @@ early_param("tsx_async_abort", tsx_async_abort_parse_cmdline); #undef pr_fmt #define pr_fmt(fmt) "MMIO Stale Data: " fmt -enum mmio_mitigations { - MMIO_MITIGATION_OFF, - MMIO_MITIGATION_UCODE_NEEDED, - MMIO_MITIGATION_VERW, -}; - -/* Default mitigation for Processor MMIO Stale Data vulnerabilities */ -static enum mmio_mitigations mmio_mitigation __ro_after_init = - IS_ENABLED(CONFIG_MITIGATION_MMIO_STALE_DATA) ? MMIO_MITIGATION_VERW : MMIO_MITIGATION_OFF; static bool mmio_nosmt __ro_after_init = false; static const char * const mmio_strings[] = { @@ -405,32 +541,71 @@ static const char * const mmio_strings[] = { static void __init mmio_select_mitigation(void) { - if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) || - boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN) || - cpu_mitigations_off()) { + if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) { mmio_mitigation = MMIO_MITIGATION_OFF; return; } + /* Microcode will be checked in mmio_update_mitigation(). */ + if (mmio_mitigation == MMIO_MITIGATION_AUTO) { + if (should_mitigate_vuln(X86_BUG_MMIO_STALE_DATA)) + mmio_mitigation = MMIO_MITIGATION_VERW; + else + mmio_mitigation = MMIO_MITIGATION_OFF; + } + if (mmio_mitigation == MMIO_MITIGATION_OFF) return; /* * Enable CPU buffer clear mitigation for host and VMM, if also affected - * by MDS or TAA. Otherwise, enable mitigation for VMM only. + * by MDS or TAA. */ - if (boot_cpu_has_bug(X86_BUG_MDS) || (boot_cpu_has_bug(X86_BUG_TAA) && - boot_cpu_has(X86_FEATURE_RTM))) - setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + if (boot_cpu_has_bug(X86_BUG_MDS) || taa_vulnerable()) + verw_clear_cpu_buf_mitigation_selected = true; +} + +static void __init mmio_update_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) + return; + + if (verw_clear_cpu_buf_mitigation_selected) + mmio_mitigation = MMIO_MITIGATION_VERW; + + if (mmio_mitigation == MMIO_MITIGATION_VERW) { + /* + * Check if the system has the right microcode. + * + * CPU Fill buffer clear mitigation is enumerated by either an explicit + * FB_CLEAR or by the presence of both MD_CLEAR and L1D_FLUSH on MDS + * affected systems. + */ + if (!((x86_arch_cap_msr & ARCH_CAP_FB_CLEAR) || + (boot_cpu_has(X86_FEATURE_MD_CLEAR) && + boot_cpu_has(X86_FEATURE_FLUSH_L1D) && + !(x86_arch_cap_msr & ARCH_CAP_MDS_NO)))) + mmio_mitigation = MMIO_MITIGATION_UCODE_NEEDED; + } + + pr_info("%s\n", mmio_strings[mmio_mitigation]); +} + +static void __init mmio_apply_mitigation(void) +{ + if (mmio_mitigation == MMIO_MITIGATION_OFF) + return; /* - * X86_FEATURE_CLEAR_CPU_BUF could be enabled by other VERW based - * mitigations, disable KVM-only mitigation in that case. + * Only enable the VMM mitigation if the CPU buffer clear mitigation is + * not being used. */ - if (boot_cpu_has(X86_FEATURE_CLEAR_CPU_BUF)) - static_branch_disable(&mmio_stale_data_clear); - else - static_branch_enable(&mmio_stale_data_clear); + if (verw_clear_cpu_buf_mitigation_selected) { + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM); + } else { + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM_MMIO); + } /* * If Processor-MMIO-Stale-Data bug is present and Fill Buffer data can @@ -438,24 +613,9 @@ static void __init mmio_select_mitigation(void) * is required irrespective of SMT state. */ if (!(x86_arch_cap_msr & ARCH_CAP_FBSDP_NO)) - static_branch_enable(&mds_idle_clear); + static_branch_enable(&cpu_buf_idle_clear); - /* - * Check if the system has the right microcode. - * - * CPU Fill buffer clear mitigation is enumerated by either an explicit - * FB_CLEAR or by the presence of both MD_CLEAR and L1D_FLUSH on MDS - * affected systems. - */ - if ((x86_arch_cap_msr & ARCH_CAP_FB_CLEAR) || - (boot_cpu_has(X86_FEATURE_MD_CLEAR) && - boot_cpu_has(X86_FEATURE_FLUSH_L1D) && - !(x86_arch_cap_msr & ARCH_CAP_MDS_NO))) - mmio_mitigation = MMIO_MITIGATION_VERW; - else - mmio_mitigation = MMIO_MITIGATION_UCODE_NEEDED; - - if (mmio_nosmt || cpu_mitigations_auto_nosmt()) + if (mmio_nosmt || smt_mitigations == SMT_MITIGATIONS_ON) cpu_smt_disable(false); } @@ -483,35 +643,60 @@ early_param("mmio_stale_data", mmio_stale_data_parse_cmdline); #undef pr_fmt #define pr_fmt(fmt) "Register File Data Sampling: " fmt -enum rfds_mitigations { - RFDS_MITIGATION_OFF, - RFDS_MITIGATION_VERW, - RFDS_MITIGATION_UCODE_NEEDED, -}; - -/* Default mitigation for Register File Data Sampling */ -static enum rfds_mitigations rfds_mitigation __ro_after_init = - IS_ENABLED(CONFIG_MITIGATION_RFDS) ? RFDS_MITIGATION_VERW : RFDS_MITIGATION_OFF; - static const char * const rfds_strings[] = { [RFDS_MITIGATION_OFF] = "Vulnerable", [RFDS_MITIGATION_VERW] = "Mitigation: Clear Register File", [RFDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode", }; +static inline bool __init verw_clears_cpu_reg_file(void) +{ + return (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR); +} + static void __init rfds_select_mitigation(void) { - if (!boot_cpu_has_bug(X86_BUG_RFDS) || cpu_mitigations_off()) { + if (!boot_cpu_has_bug(X86_BUG_RFDS)) { rfds_mitigation = RFDS_MITIGATION_OFF; return; } + + if (rfds_mitigation == RFDS_MITIGATION_AUTO) { + if (should_mitigate_vuln(X86_BUG_RFDS)) + rfds_mitigation = RFDS_MITIGATION_VERW; + else + rfds_mitigation = RFDS_MITIGATION_OFF; + } + if (rfds_mitigation == RFDS_MITIGATION_OFF) return; - if (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR) + if (verw_clears_cpu_reg_file()) + verw_clear_cpu_buf_mitigation_selected = true; +} + +static void __init rfds_update_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_RFDS)) + return; + + if (verw_clear_cpu_buf_mitigation_selected) + rfds_mitigation = RFDS_MITIGATION_VERW; + + if (rfds_mitigation == RFDS_MITIGATION_VERW) { + if (!verw_clears_cpu_reg_file()) + rfds_mitigation = RFDS_MITIGATION_UCODE_NEEDED; + } + + pr_info("%s\n", rfds_strings[rfds_mitigation]); +} + +static void __init rfds_apply_mitigation(void) +{ + if (rfds_mitigation == RFDS_MITIGATION_VERW) { setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); - else - rfds_mitigation = RFDS_MITIGATION_UCODE_NEEDED; + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM); + } } static __init int rfds_parse_cmdline(char *str) @@ -532,76 +717,11 @@ static __init int rfds_parse_cmdline(char *str) early_param("reg_file_data_sampling", rfds_parse_cmdline); #undef pr_fmt -#define pr_fmt(fmt) "" fmt - -static void __init md_clear_update_mitigation(void) -{ - if (cpu_mitigations_off()) - return; - - if (!boot_cpu_has(X86_FEATURE_CLEAR_CPU_BUF)) - goto out; - - /* - * X86_FEATURE_CLEAR_CPU_BUF is now enabled. Update MDS, TAA and MMIO - * Stale Data mitigation, if necessary. - */ - if (mds_mitigation == MDS_MITIGATION_OFF && - boot_cpu_has_bug(X86_BUG_MDS)) { - mds_mitigation = MDS_MITIGATION_FULL; - mds_select_mitigation(); - } - if (taa_mitigation == TAA_MITIGATION_OFF && - boot_cpu_has_bug(X86_BUG_TAA)) { - taa_mitigation = TAA_MITIGATION_VERW; - taa_select_mitigation(); - } - /* - * MMIO_MITIGATION_OFF is not checked here so that mmio_stale_data_clear - * gets updated correctly as per X86_FEATURE_CLEAR_CPU_BUF state. - */ - if (boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) { - mmio_mitigation = MMIO_MITIGATION_VERW; - mmio_select_mitigation(); - } - if (rfds_mitigation == RFDS_MITIGATION_OFF && - boot_cpu_has_bug(X86_BUG_RFDS)) { - rfds_mitigation = RFDS_MITIGATION_VERW; - rfds_select_mitigation(); - } -out: - if (boot_cpu_has_bug(X86_BUG_MDS)) - pr_info("MDS: %s\n", mds_strings[mds_mitigation]); - if (boot_cpu_has_bug(X86_BUG_TAA)) - pr_info("TAA: %s\n", taa_strings[taa_mitigation]); - if (boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) - pr_info("MMIO Stale Data: %s\n", mmio_strings[mmio_mitigation]); - else if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) - pr_info("MMIO Stale Data: Unknown: No mitigations\n"); - if (boot_cpu_has_bug(X86_BUG_RFDS)) - pr_info("Register File Data Sampling: %s\n", rfds_strings[rfds_mitigation]); -} - -static void __init md_clear_select_mitigation(void) -{ - mds_select_mitigation(); - taa_select_mitigation(); - mmio_select_mitigation(); - rfds_select_mitigation(); - - /* - * As these mitigations are inter-related and rely on VERW instruction - * to clear the microarchitural buffers, update and print their status - * after mitigation selection is done for each of these vulnerabilities. - */ - md_clear_update_mitigation(); -} - -#undef pr_fmt #define pr_fmt(fmt) "SRBDS: " fmt enum srbds_mitigations { SRBDS_MITIGATION_OFF, + SRBDS_MITIGATION_AUTO, SRBDS_MITIGATION_UCODE_NEEDED, SRBDS_MITIGATION_FULL, SRBDS_MITIGATION_TSX_OFF, @@ -609,7 +729,7 @@ enum srbds_mitigations { }; static enum srbds_mitigations srbds_mitigation __ro_after_init = - IS_ENABLED(CONFIG_MITIGATION_SRBDS) ? SRBDS_MITIGATION_FULL : SRBDS_MITIGATION_OFF; + IS_ENABLED(CONFIG_MITIGATION_SRBDS) ? SRBDS_MITIGATION_AUTO : SRBDS_MITIGATION_OFF; static const char * const srbds_strings[] = { [SRBDS_MITIGATION_OFF] = "Vulnerable", @@ -641,7 +761,7 @@ void update_srbds_msr(void) if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL)) return; - rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); switch (srbds_mitigation) { case SRBDS_MITIGATION_OFF: @@ -655,13 +775,24 @@ void update_srbds_msr(void) break; } - wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + wrmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); } static void __init srbds_select_mitigation(void) { - if (!boot_cpu_has_bug(X86_BUG_SRBDS)) + if (!boot_cpu_has_bug(X86_BUG_SRBDS)) { + srbds_mitigation = SRBDS_MITIGATION_OFF; return; + } + + if (srbds_mitigation == SRBDS_MITIGATION_AUTO) { + if (should_mitigate_vuln(X86_BUG_SRBDS)) + srbds_mitigation = SRBDS_MITIGATION_FULL; + else { + srbds_mitigation = SRBDS_MITIGATION_OFF; + return; + } + } /* * Check to see if this is one of the MDS_NO systems supporting TSX that @@ -675,13 +806,17 @@ static void __init srbds_select_mitigation(void) srbds_mitigation = SRBDS_MITIGATION_HYPERVISOR; else if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL)) srbds_mitigation = SRBDS_MITIGATION_UCODE_NEEDED; - else if (cpu_mitigations_off() || srbds_off) + else if (srbds_off) srbds_mitigation = SRBDS_MITIGATION_OFF; - update_srbds_msr(); pr_info("%s\n", srbds_strings[srbds_mitigation]); } +static void __init srbds_apply_mitigation(void) +{ + update_srbds_msr(); +} + static int __init srbds_parse_cmdline(char *str) { if (!str) @@ -728,6 +863,7 @@ early_param("l1d_flush", l1d_flush_parse_cmdline); enum gds_mitigations { GDS_MITIGATION_OFF, + GDS_MITIGATION_AUTO, GDS_MITIGATION_UCODE_NEEDED, GDS_MITIGATION_FORCE, GDS_MITIGATION_FULL, @@ -736,7 +872,7 @@ enum gds_mitigations { }; static enum gds_mitigations gds_mitigation __ro_after_init = - IS_ENABLED(CONFIG_MITIGATION_GDS) ? GDS_MITIGATION_FULL : GDS_MITIGATION_OFF; + IS_ENABLED(CONFIG_MITIGATION_GDS) ? GDS_MITIGATION_AUTO : GDS_MITIGATION_OFF; static const char * const gds_strings[] = { [GDS_MITIGATION_OFF] = "Vulnerable", @@ -752,7 +888,7 @@ bool gds_ucode_mitigated(void) return (gds_mitigation == GDS_MITIGATION_FULL || gds_mitigation == GDS_MITIGATION_FULL_LOCKED); } -EXPORT_SYMBOL_GPL(gds_ucode_mitigated); +EXPORT_SYMBOL_FOR_KVM(gds_ucode_mitigated); void update_gds_msr(void) { @@ -761,7 +897,7 @@ void update_gds_msr(void) switch (gds_mitigation) { case GDS_MITIGATION_OFF: - rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); mcu_ctrl |= GDS_MITG_DIS; break; case GDS_MITIGATION_FULL_LOCKED: @@ -771,23 +907,24 @@ void update_gds_msr(void) * CPUs. */ case GDS_MITIGATION_FULL: - rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); mcu_ctrl &= ~GDS_MITG_DIS; break; case GDS_MITIGATION_FORCE: case GDS_MITIGATION_UCODE_NEEDED: case GDS_MITIGATION_HYPERVISOR: + case GDS_MITIGATION_AUTO: return; } - wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + wrmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); /* * Check to make sure that the WRMSR value was not ignored. Writes to * GDS_MITG_DIS will be ignored if this processor is locked but the boot * processor was not. */ - rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl_after); + rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl_after); WARN_ON_ONCE(mcu_ctrl != mcu_ctrl_after); } @@ -800,33 +937,29 @@ static void __init gds_select_mitigation(void) if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { gds_mitigation = GDS_MITIGATION_HYPERVISOR; - goto out; + return; } - if (cpu_mitigations_off()) - gds_mitigation = GDS_MITIGATION_OFF; /* Will verify below that mitigation _can_ be disabled */ + if (gds_mitigation == GDS_MITIGATION_AUTO) { + if (should_mitigate_vuln(X86_BUG_GDS)) + gds_mitigation = GDS_MITIGATION_FULL; + else + gds_mitigation = GDS_MITIGATION_OFF; + } /* No microcode */ if (!(x86_arch_cap_msr & ARCH_CAP_GDS_CTRL)) { - if (gds_mitigation == GDS_MITIGATION_FORCE) { - /* - * This only needs to be done on the boot CPU so do it - * here rather than in update_gds_msr() - */ - setup_clear_cpu_cap(X86_FEATURE_AVX); - pr_warn("Microcode update needed! Disabling AVX as mitigation.\n"); - } else { + if (gds_mitigation != GDS_MITIGATION_FORCE) gds_mitigation = GDS_MITIGATION_UCODE_NEEDED; - } - goto out; + return; } /* Microcode has mitigation, use it */ if (gds_mitigation == GDS_MITIGATION_FORCE) gds_mitigation = GDS_MITIGATION_FULL; - rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); if (mcu_ctrl & GDS_MITG_LOCKED) { if (gds_mitigation == GDS_MITIGATION_OFF) pr_warn("Mitigation locked. Disable failed.\n"); @@ -840,9 +973,25 @@ static void __init gds_select_mitigation(void) */ gds_mitigation = GDS_MITIGATION_FULL_LOCKED; } +} + +static void __init gds_apply_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_GDS)) + return; + + /* Microcode is present */ + if (x86_arch_cap_msr & ARCH_CAP_GDS_CTRL) + update_gds_msr(); + else if (gds_mitigation == GDS_MITIGATION_FORCE) { + /* + * This only needs to be done on the boot CPU so do it + * here rather than in update_gds_msr() + */ + setup_clear_cpu_cap(X86_FEATURE_AVX); + pr_warn("Microcode update needed! Disabling AVX as mitigation.\n"); + } - update_gds_msr(); -out: pr_info("%s\n", gds_strings[gds_mitigation]); } @@ -903,10 +1052,17 @@ static bool smap_works_speculatively(void) static void __init spectre_v1_select_mitigation(void) { - if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1) || cpu_mitigations_off()) { + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1)) + spectre_v1_mitigation = SPECTRE_V1_MITIGATION_NONE; + + if (!should_mitigate_vuln(X86_BUG_SPECTRE_V1)) spectre_v1_mitigation = SPECTRE_V1_MITIGATION_NONE; +} + +static void __init spectre_v1_apply_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1)) return; - } if (spectre_v1_mitigation == SPECTRE_V1_MITIGATION_AUTO) { /* @@ -956,11 +1112,37 @@ early_param("nospectre_v1", nospectre_v1_cmdline); enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = SPECTRE_V2_NONE; +/* Depends on spectre_v2 mitigation selected already */ +static inline bool cdt_possible(enum spectre_v2_mitigation mode) +{ + if (!IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING) || + !IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)) + return false; + + if (mode == SPECTRE_V2_RETPOLINE || + mode == SPECTRE_V2_EIBRS_RETPOLINE) + return true; + + return false; +} + #undef pr_fmt #define pr_fmt(fmt) "RETBleed: " fmt +enum its_mitigation { + ITS_MITIGATION_OFF, + ITS_MITIGATION_AUTO, + ITS_MITIGATION_VMEXIT_ONLY, + ITS_MITIGATION_ALIGNED_THUNKS, + ITS_MITIGATION_RETPOLINE_STUFF, +}; + +static enum its_mitigation its_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_ITS) ? ITS_MITIGATION_AUTO : ITS_MITIGATION_OFF; + enum retbleed_mitigation { RETBLEED_MITIGATION_NONE, + RETBLEED_MITIGATION_AUTO, RETBLEED_MITIGATION_UNRET, RETBLEED_MITIGATION_IBPB, RETBLEED_MITIGATION_IBRS, @@ -968,14 +1150,6 @@ enum retbleed_mitigation { RETBLEED_MITIGATION_STUFF, }; -enum retbleed_mitigation_cmd { - RETBLEED_CMD_OFF, - RETBLEED_CMD_AUTO, - RETBLEED_CMD_UNRET, - RETBLEED_CMD_IBPB, - RETBLEED_CMD_STUFF, -}; - static const char * const retbleed_strings[] = { [RETBLEED_MITIGATION_NONE] = "Vulnerable", [RETBLEED_MITIGATION_UNRET] = "Mitigation: untrained return thunk", @@ -986,12 +1160,25 @@ static const char * const retbleed_strings[] = { }; static enum retbleed_mitigation retbleed_mitigation __ro_after_init = - RETBLEED_MITIGATION_NONE; -static enum retbleed_mitigation_cmd retbleed_cmd __ro_after_init = - IS_ENABLED(CONFIG_MITIGATION_RETBLEED) ? RETBLEED_CMD_AUTO : RETBLEED_CMD_OFF; + IS_ENABLED(CONFIG_MITIGATION_RETBLEED) ? RETBLEED_MITIGATION_AUTO : RETBLEED_MITIGATION_NONE; static int __ro_after_init retbleed_nosmt = false; +enum srso_mitigation { + SRSO_MITIGATION_NONE, + SRSO_MITIGATION_AUTO, + SRSO_MITIGATION_UCODE_NEEDED, + SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED, + SRSO_MITIGATION_MICROCODE, + SRSO_MITIGATION_NOSMT, + SRSO_MITIGATION_SAFE_RET, + SRSO_MITIGATION_IBPB, + SRSO_MITIGATION_IBPB_ON_VMEXIT, + SRSO_MITIGATION_BP_SPEC_REDUCE, +}; + +static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_AUTO; + static int __init retbleed_parse_cmdline(char *str) { if (!str) @@ -1005,15 +1192,15 @@ static int __init retbleed_parse_cmdline(char *str) } if (!strcmp(str, "off")) { - retbleed_cmd = RETBLEED_CMD_OFF; + retbleed_mitigation = RETBLEED_MITIGATION_NONE; } else if (!strcmp(str, "auto")) { - retbleed_cmd = RETBLEED_CMD_AUTO; + retbleed_mitigation = RETBLEED_MITIGATION_AUTO; } else if (!strcmp(str, "unret")) { - retbleed_cmd = RETBLEED_CMD_UNRET; + retbleed_mitigation = RETBLEED_MITIGATION_UNRET; } else if (!strcmp(str, "ibpb")) { - retbleed_cmd = RETBLEED_CMD_IBPB; + retbleed_mitigation = RETBLEED_MITIGATION_IBPB; } else if (!strcmp(str, "stuff")) { - retbleed_cmd = RETBLEED_CMD_STUFF; + retbleed_mitigation = RETBLEED_MITIGATION_STUFF; } else if (!strcmp(str, "nosmt")) { retbleed_nosmt = true; } else if (!strcmp(str, "force")) { @@ -1034,77 +1221,128 @@ early_param("retbleed", retbleed_parse_cmdline); static void __init retbleed_select_mitigation(void) { - bool mitigate_smt = false; - - if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off()) - return; - - switch (retbleed_cmd) { - case RETBLEED_CMD_OFF: + if (!boot_cpu_has_bug(X86_BUG_RETBLEED)) { + retbleed_mitigation = RETBLEED_MITIGATION_NONE; return; + } - case RETBLEED_CMD_UNRET: - if (IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY)) { - retbleed_mitigation = RETBLEED_MITIGATION_UNRET; - } else { + switch (retbleed_mitigation) { + case RETBLEED_MITIGATION_UNRET: + if (!IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY)) { + retbleed_mitigation = RETBLEED_MITIGATION_AUTO; pr_err("WARNING: kernel not compiled with MITIGATION_UNRET_ENTRY.\n"); - goto do_cmd_auto; } break; - - case RETBLEED_CMD_IBPB: + case RETBLEED_MITIGATION_IBPB: if (!boot_cpu_has(X86_FEATURE_IBPB)) { pr_err("WARNING: CPU does not support IBPB.\n"); - goto do_cmd_auto; - } else if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { - retbleed_mitigation = RETBLEED_MITIGATION_IBPB; - } else { + retbleed_mitigation = RETBLEED_MITIGATION_AUTO; + } else if (!IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n"); - goto do_cmd_auto; + retbleed_mitigation = RETBLEED_MITIGATION_AUTO; + } + break; + case RETBLEED_MITIGATION_STUFF: + if (!IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING)) { + pr_err("WARNING: kernel not compiled with MITIGATION_CALL_DEPTH_TRACKING.\n"); + retbleed_mitigation = RETBLEED_MITIGATION_AUTO; + } else if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { + pr_err("WARNING: retbleed=stuff only supported for Intel CPUs.\n"); + retbleed_mitigation = RETBLEED_MITIGATION_AUTO; } break; + default: + break; + } - case RETBLEED_CMD_STUFF: - if (IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING) && - spectre_v2_enabled == SPECTRE_V2_RETPOLINE) { - retbleed_mitigation = RETBLEED_MITIGATION_STUFF; + if (retbleed_mitigation != RETBLEED_MITIGATION_AUTO) + return; - } else { - if (IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING)) - pr_err("WARNING: retbleed=stuff depends on spectre_v2=retpoline\n"); - else - pr_err("WARNING: kernel not compiled with MITIGATION_CALL_DEPTH_TRACKING.\n"); + if (!should_mitigate_vuln(X86_BUG_RETBLEED)) { + retbleed_mitigation = RETBLEED_MITIGATION_NONE; + return; + } - goto do_cmd_auto; - } - break; + /* Intel mitigation selected in retbleed_update_mitigation() */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { + if (IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY)) + retbleed_mitigation = RETBLEED_MITIGATION_UNRET; + else if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY) && + boot_cpu_has(X86_FEATURE_IBPB)) + retbleed_mitigation = RETBLEED_MITIGATION_IBPB; + else + retbleed_mitigation = RETBLEED_MITIGATION_NONE; + } else if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { + /* Final mitigation depends on spectre-v2 selection */ + if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) + retbleed_mitigation = RETBLEED_MITIGATION_EIBRS; + else if (boot_cpu_has(X86_FEATURE_IBRS)) + retbleed_mitigation = RETBLEED_MITIGATION_IBRS; + else + retbleed_mitigation = RETBLEED_MITIGATION_NONE; + } +} -do_cmd_auto: - case RETBLEED_CMD_AUTO: - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || - boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { - if (IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY)) - retbleed_mitigation = RETBLEED_MITIGATION_UNRET; - else if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY) && - boot_cpu_has(X86_FEATURE_IBPB)) - retbleed_mitigation = RETBLEED_MITIGATION_IBPB; - } +static void __init retbleed_update_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_RETBLEED)) + return; - /* - * The Intel mitigation (IBRS or eIBRS) was already selected in - * spectre_v2_select_mitigation(). 'retbleed_mitigation' will - * be set accordingly below. - */ + /* ITS can also enable stuffing */ + if (its_mitigation == ITS_MITIGATION_RETPOLINE_STUFF) + retbleed_mitigation = RETBLEED_MITIGATION_STUFF; - break; + /* If SRSO is using IBPB, that works for retbleed too */ + if (srso_mitigation == SRSO_MITIGATION_IBPB) + retbleed_mitigation = RETBLEED_MITIGATION_IBPB; + + if (retbleed_mitigation == RETBLEED_MITIGATION_STUFF && + !cdt_possible(spectre_v2_enabled)) { + pr_err("WARNING: retbleed=stuff depends on retpoline\n"); + retbleed_mitigation = RETBLEED_MITIGATION_NONE; + } + + /* + * Let IBRS trump all on Intel without affecting the effects of the + * retbleed= cmdline option except for call depth based stuffing + */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { + switch (spectre_v2_enabled) { + case SPECTRE_V2_IBRS: + retbleed_mitigation = RETBLEED_MITIGATION_IBRS; + break; + case SPECTRE_V2_EIBRS: + case SPECTRE_V2_EIBRS_RETPOLINE: + case SPECTRE_V2_EIBRS_LFENCE: + retbleed_mitigation = RETBLEED_MITIGATION_EIBRS; + break; + default: + if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) { + if (retbleed_mitigation != RETBLEED_MITIGATION_NONE) + pr_err(RETBLEED_INTEL_MSG); + + retbleed_mitigation = RETBLEED_MITIGATION_NONE; + } + } } + pr_info("%s\n", retbleed_strings[retbleed_mitigation]); +} + +static void __init retbleed_apply_mitigation(void) +{ + bool mitigate_smt = false; + switch (retbleed_mitigation) { + case RETBLEED_MITIGATION_NONE: + return; + case RETBLEED_MITIGATION_UNRET: setup_force_cpu_cap(X86_FEATURE_RETHUNK); setup_force_cpu_cap(X86_FEATURE_UNRET); - x86_return_thunk = retbleed_return_thunk; + set_return_thunk(retbleed_return_thunk); if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) @@ -1115,6 +1353,8 @@ do_cmd_auto: case RETBLEED_MITIGATION_IBPB: setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); + setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); + mitigate_smt = true; /* * IBPB on entry already obviates the need for @@ -1124,11 +1364,8 @@ do_cmd_auto: setup_clear_cpu_cap(X86_FEATURE_UNRET); setup_clear_cpu_cap(X86_FEATURE_RETHUNK); - setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); - mitigate_smt = true; - /* - * There is no need for RSB filling: entry_ibpb() ensures + * There is no need for RSB filling: write_ibpb() ensures * all predictions, including the RSB, are invalidated, * regardless of IBPB implementation. */ @@ -1140,7 +1377,7 @@ do_cmd_auto: setup_force_cpu_cap(X86_FEATURE_RETHUNK); setup_force_cpu_cap(X86_FEATURE_CALL_DEPTH); - x86_return_thunk = call_depth_return_thunk; + set_return_thunk(call_depth_return_thunk); break; default: @@ -1148,30 +1385,246 @@ do_cmd_auto: } if (mitigate_smt && !boot_cpu_has(X86_FEATURE_STIBP) && - (retbleed_nosmt || cpu_mitigations_auto_nosmt())) + (retbleed_nosmt || smt_mitigations == SMT_MITIGATIONS_ON)) cpu_smt_disable(false); +} - /* - * Let IBRS trump all on Intel without affecting the effects of the - * retbleed= cmdline option except for call depth based stuffing - */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { - switch (spectre_v2_enabled) { - case SPECTRE_V2_IBRS: - retbleed_mitigation = RETBLEED_MITIGATION_IBRS; - break; - case SPECTRE_V2_EIBRS: - case SPECTRE_V2_EIBRS_RETPOLINE: - case SPECTRE_V2_EIBRS_LFENCE: - retbleed_mitigation = RETBLEED_MITIGATION_EIBRS; - break; - default: - if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) - pr_err(RETBLEED_INTEL_MSG); +#undef pr_fmt +#define pr_fmt(fmt) "ITS: " fmt + +static const char * const its_strings[] = { + [ITS_MITIGATION_OFF] = "Vulnerable", + [ITS_MITIGATION_VMEXIT_ONLY] = "Mitigation: Vulnerable, KVM: Not affected", + [ITS_MITIGATION_ALIGNED_THUNKS] = "Mitigation: Aligned branch/return thunks", + [ITS_MITIGATION_RETPOLINE_STUFF] = "Mitigation: Retpolines, Stuffing RSB", +}; + +static int __init its_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!IS_ENABLED(CONFIG_MITIGATION_ITS)) { + pr_err("Mitigation disabled at compile time, ignoring option (%s)", str); + return 0; + } + + if (!strcmp(str, "off")) { + its_mitigation = ITS_MITIGATION_OFF; + } else if (!strcmp(str, "on")) { + its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS; + } else if (!strcmp(str, "force")) { + its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS; + setup_force_cpu_bug(X86_BUG_ITS); + } else if (!strcmp(str, "vmexit")) { + its_mitigation = ITS_MITIGATION_VMEXIT_ONLY; + } else if (!strcmp(str, "stuff")) { + its_mitigation = ITS_MITIGATION_RETPOLINE_STUFF; + } else { + pr_err("Ignoring unknown indirect_target_selection option (%s).", str); + } + + return 0; +} +early_param("indirect_target_selection", its_parse_cmdline); + +static void __init its_select_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_ITS)) { + its_mitigation = ITS_MITIGATION_OFF; + return; + } + + if (its_mitigation == ITS_MITIGATION_AUTO) { + if (should_mitigate_vuln(X86_BUG_ITS)) + its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS; + else + its_mitigation = ITS_MITIGATION_OFF; + } + + if (its_mitigation == ITS_MITIGATION_OFF) + return; + + if (!IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) || + !IS_ENABLED(CONFIG_MITIGATION_RETHUNK)) { + pr_err("WARNING: ITS mitigation depends on retpoline and rethunk support\n"); + its_mitigation = ITS_MITIGATION_OFF; + return; + } + + if (IS_ENABLED(CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B)) { + pr_err("WARNING: ITS mitigation is not compatible with CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B\n"); + its_mitigation = ITS_MITIGATION_OFF; + return; + } + + if (its_mitigation == ITS_MITIGATION_RETPOLINE_STUFF && + !IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING)) { + pr_err("RSB stuff mitigation not supported, using default\n"); + its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS; + } + + if (its_mitigation == ITS_MITIGATION_VMEXIT_ONLY && + !boot_cpu_has_bug(X86_BUG_ITS_NATIVE_ONLY)) + its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS; +} + +static void __init its_update_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_ITS)) + return; + + switch (spectre_v2_enabled) { + case SPECTRE_V2_NONE: + if (its_mitigation != ITS_MITIGATION_OFF) + pr_err("WARNING: Spectre-v2 mitigation is off, disabling ITS\n"); + its_mitigation = ITS_MITIGATION_OFF; + break; + case SPECTRE_V2_RETPOLINE: + case SPECTRE_V2_EIBRS_RETPOLINE: + /* Retpoline+CDT mitigates ITS */ + if (retbleed_mitigation == RETBLEED_MITIGATION_STUFF) + its_mitigation = ITS_MITIGATION_RETPOLINE_STUFF; + break; + case SPECTRE_V2_LFENCE: + case SPECTRE_V2_EIBRS_LFENCE: + pr_err("WARNING: ITS mitigation is not compatible with lfence mitigation\n"); + its_mitigation = ITS_MITIGATION_OFF; + break; + default: + break; + } + + if (its_mitigation == ITS_MITIGATION_RETPOLINE_STUFF && + !cdt_possible(spectre_v2_enabled)) + its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS; + + pr_info("%s\n", its_strings[its_mitigation]); +} + +static void __init its_apply_mitigation(void) +{ + switch (its_mitigation) { + case ITS_MITIGATION_OFF: + case ITS_MITIGATION_AUTO: + case ITS_MITIGATION_VMEXIT_ONLY: + break; + case ITS_MITIGATION_ALIGNED_THUNKS: + if (!boot_cpu_has(X86_FEATURE_RETPOLINE)) + setup_force_cpu_cap(X86_FEATURE_INDIRECT_THUNK_ITS); + + setup_force_cpu_cap(X86_FEATURE_RETHUNK); + set_return_thunk(its_return_thunk); + break; + case ITS_MITIGATION_RETPOLINE_STUFF: + setup_force_cpu_cap(X86_FEATURE_RETHUNK); + setup_force_cpu_cap(X86_FEATURE_CALL_DEPTH); + set_return_thunk(call_depth_return_thunk); + break; + } +} + +#undef pr_fmt +#define pr_fmt(fmt) "Transient Scheduler Attacks: " fmt + +enum tsa_mitigations { + TSA_MITIGATION_NONE, + TSA_MITIGATION_AUTO, + TSA_MITIGATION_UCODE_NEEDED, + TSA_MITIGATION_USER_KERNEL, + TSA_MITIGATION_VM, + TSA_MITIGATION_FULL, +}; + +static const char * const tsa_strings[] = { + [TSA_MITIGATION_NONE] = "Vulnerable", + [TSA_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode", + [TSA_MITIGATION_USER_KERNEL] = "Mitigation: Clear CPU buffers: user/kernel boundary", + [TSA_MITIGATION_VM] = "Mitigation: Clear CPU buffers: VM", + [TSA_MITIGATION_FULL] = "Mitigation: Clear CPU buffers", +}; + +static enum tsa_mitigations tsa_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_TSA) ? TSA_MITIGATION_AUTO : TSA_MITIGATION_NONE; + +static int __init tsa_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) + tsa_mitigation = TSA_MITIGATION_NONE; + else if (!strcmp(str, "on")) + tsa_mitigation = TSA_MITIGATION_FULL; + else if (!strcmp(str, "user")) + tsa_mitigation = TSA_MITIGATION_USER_KERNEL; + else if (!strcmp(str, "vm")) + tsa_mitigation = TSA_MITIGATION_VM; + else + pr_err("Ignoring unknown tsa=%s option.\n", str); + + return 0; +} +early_param("tsa", tsa_parse_cmdline); + +static void __init tsa_select_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_TSA)) { + tsa_mitigation = TSA_MITIGATION_NONE; + return; + } + + if (tsa_mitigation == TSA_MITIGATION_AUTO) { + bool vm = false, uk = false; + + tsa_mitigation = TSA_MITIGATION_NONE; + + if (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL) || + cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER)) { + tsa_mitigation = TSA_MITIGATION_USER_KERNEL; + uk = true; + } + + if (cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST) || + cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST)) { + tsa_mitigation = TSA_MITIGATION_VM; + vm = true; } + + if (uk && vm) + tsa_mitigation = TSA_MITIGATION_FULL; } - pr_info("%s\n", retbleed_strings[retbleed_mitigation]); + if (tsa_mitigation == TSA_MITIGATION_NONE) + return; + + if (!boot_cpu_has(X86_FEATURE_VERW_CLEAR)) + tsa_mitigation = TSA_MITIGATION_UCODE_NEEDED; + + /* + * No need to set verw_clear_cpu_buf_mitigation_selected - it + * doesn't fit all cases here and it is not needed because this + * is the only VERW-based mitigation on AMD. + */ + pr_info("%s\n", tsa_strings[tsa_mitigation]); +} + +static void __init tsa_apply_mitigation(void) +{ + switch (tsa_mitigation) { + case TSA_MITIGATION_USER_KERNEL: + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + break; + case TSA_MITIGATION_VM: + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM); + break; + case TSA_MITIGATION_FULL: + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF_VM); + break; + default: + break; + } } #undef pr_fmt @@ -1230,13 +1683,6 @@ void unpriv_ebpf_notify(int new_state) } #endif -static inline bool match_option(const char *arg, int arglen, const char *opt) -{ - int len = strlen(opt); - - return len == arglen && !strncmp(arg, opt, len); -} - /* The kernel command line selection for spectre v2 */ enum spectre_v2_mitigation_cmd { SPECTRE_V2_CMD_NONE, @@ -1251,7 +1697,10 @@ enum spectre_v2_mitigation_cmd { SPECTRE_V2_CMD_IBRS, }; -enum spectre_v2_user_cmd { +static enum spectre_v2_mitigation_cmd spectre_v2_cmd __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? SPECTRE_V2_CMD_AUTO : SPECTRE_V2_CMD_NONE; + +enum spectre_v2_user_mitigation_cmd { SPECTRE_V2_USER_CMD_NONE, SPECTRE_V2_USER_CMD_AUTO, SPECTRE_V2_USER_CMD_FORCE, @@ -1261,6 +1710,9 @@ enum spectre_v2_user_cmd { SPECTRE_V2_USER_CMD_SECCOMP_IBPB, }; +static enum spectre_v2_user_mitigation_cmd spectre_v2_user_cmd __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? SPECTRE_V2_USER_CMD_AUTO : SPECTRE_V2_USER_CMD_NONE; + static const char * const spectre_v2_user_strings[] = { [SPECTRE_V2_USER_NONE] = "User space: Vulnerable", [SPECTRE_V2_USER_STRICT] = "User space: Mitigation: STIBP protection", @@ -1269,124 +1721,110 @@ static const char * const spectre_v2_user_strings[] = { [SPECTRE_V2_USER_SECCOMP] = "User space: Mitigation: STIBP via seccomp and prctl", }; -static const struct { - const char *option; - enum spectre_v2_user_cmd cmd; - bool secure; -} v2_user_options[] __initconst = { - { "auto", SPECTRE_V2_USER_CMD_AUTO, false }, - { "off", SPECTRE_V2_USER_CMD_NONE, false }, - { "on", SPECTRE_V2_USER_CMD_FORCE, true }, - { "prctl", SPECTRE_V2_USER_CMD_PRCTL, false }, - { "prctl,ibpb", SPECTRE_V2_USER_CMD_PRCTL_IBPB, false }, - { "seccomp", SPECTRE_V2_USER_CMD_SECCOMP, false }, - { "seccomp,ibpb", SPECTRE_V2_USER_CMD_SECCOMP_IBPB, false }, -}; - -static void __init spec_v2_user_print_cond(const char *reason, bool secure) -{ - if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) != secure) - pr_info("spectre_v2_user=%s forced on command line.\n", reason); -} - -static __ro_after_init enum spectre_v2_mitigation_cmd spectre_v2_cmd; - -static enum spectre_v2_user_cmd __init -spectre_v2_parse_user_cmdline(void) +static int __init spectre_v2_user_parse_cmdline(char *str) { - char arg[20]; - int ret, i; - - switch (spectre_v2_cmd) { - case SPECTRE_V2_CMD_NONE: - return SPECTRE_V2_USER_CMD_NONE; - case SPECTRE_V2_CMD_FORCE: - return SPECTRE_V2_USER_CMD_FORCE; - default: - break; - } - - ret = cmdline_find_option(boot_command_line, "spectre_v2_user", - arg, sizeof(arg)); - if (ret < 0) - return SPECTRE_V2_USER_CMD_AUTO; + if (!str) + return -EINVAL; - for (i = 0; i < ARRAY_SIZE(v2_user_options); i++) { - if (match_option(arg, ret, v2_user_options[i].option)) { - spec_v2_user_print_cond(v2_user_options[i].option, - v2_user_options[i].secure); - return v2_user_options[i].cmd; - } - } + if (!strcmp(str, "auto")) + spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_AUTO; + else if (!strcmp(str, "off")) + spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_NONE; + else if (!strcmp(str, "on")) + spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_FORCE; + else if (!strcmp(str, "prctl")) + spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_PRCTL; + else if (!strcmp(str, "prctl,ibpb")) + spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_PRCTL_IBPB; + else if (!strcmp(str, "seccomp")) + spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_SECCOMP; + else if (!strcmp(str, "seccomp,ibpb")) + spectre_v2_user_cmd = SPECTRE_V2_USER_CMD_SECCOMP_IBPB; + else + pr_err("Ignoring unknown spectre_v2_user option (%s).", str); - pr_err("Unknown user space protection option (%s). Switching to AUTO select\n", arg); - return SPECTRE_V2_USER_CMD_AUTO; + return 0; } +early_param("spectre_v2_user", spectre_v2_user_parse_cmdline); static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode) { return spectre_v2_in_eibrs_mode(mode) || mode == SPECTRE_V2_IBRS; } -static void __init -spectre_v2_user_select_mitigation(void) +static void __init spectre_v2_user_select_mitigation(void) { - enum spectre_v2_user_mitigation mode = SPECTRE_V2_USER_NONE; - bool smt_possible = IS_ENABLED(CONFIG_SMP); - enum spectre_v2_user_cmd cmd; - if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP)) return; - if (cpu_smt_control == CPU_SMT_FORCE_DISABLED || - cpu_smt_control == CPU_SMT_NOT_SUPPORTED) - smt_possible = false; - - cmd = spectre_v2_parse_user_cmdline(); - switch (cmd) { + switch (spectre_v2_user_cmd) { case SPECTRE_V2_USER_CMD_NONE: - goto set_mode; + return; case SPECTRE_V2_USER_CMD_FORCE: - mode = SPECTRE_V2_USER_STRICT; + spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT; + spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT; break; case SPECTRE_V2_USER_CMD_AUTO: + if (!should_mitigate_vuln(X86_BUG_SPECTRE_V2_USER)) + break; + spectre_v2_user_ibpb = SPECTRE_V2_USER_PRCTL; + if (smt_mitigations == SMT_MITIGATIONS_OFF) + break; + spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL; + break; case SPECTRE_V2_USER_CMD_PRCTL: + spectre_v2_user_ibpb = SPECTRE_V2_USER_PRCTL; + spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL; + break; case SPECTRE_V2_USER_CMD_PRCTL_IBPB: - mode = SPECTRE_V2_USER_PRCTL; + spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT; + spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL; break; case SPECTRE_V2_USER_CMD_SECCOMP: + if (IS_ENABLED(CONFIG_SECCOMP)) + spectre_v2_user_ibpb = SPECTRE_V2_USER_SECCOMP; + else + spectre_v2_user_ibpb = SPECTRE_V2_USER_PRCTL; + spectre_v2_user_stibp = spectre_v2_user_ibpb; + break; case SPECTRE_V2_USER_CMD_SECCOMP_IBPB: + spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT; if (IS_ENABLED(CONFIG_SECCOMP)) - mode = SPECTRE_V2_USER_SECCOMP; + spectre_v2_user_stibp = SPECTRE_V2_USER_SECCOMP; else - mode = SPECTRE_V2_USER_PRCTL; + spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL; break; } - /* Initialize Indirect Branch Prediction Barrier */ - if (boot_cpu_has(X86_FEATURE_IBPB)) { - setup_force_cpu_cap(X86_FEATURE_USE_IBPB); + /* + * At this point, an STIBP mode other than "off" has been set. + * If STIBP support is not being forced, check if STIBP always-on + * is preferred. + */ + if ((spectre_v2_user_stibp == SPECTRE_V2_USER_PRCTL || + spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP) && + boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON)) + spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT_PREFERRED; - spectre_v2_user_ibpb = mode; - switch (cmd) { - case SPECTRE_V2_USER_CMD_NONE: - break; - case SPECTRE_V2_USER_CMD_FORCE: - case SPECTRE_V2_USER_CMD_PRCTL_IBPB: - case SPECTRE_V2_USER_CMD_SECCOMP_IBPB: - static_branch_enable(&switch_mm_always_ibpb); - spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT; - break; - case SPECTRE_V2_USER_CMD_PRCTL: - case SPECTRE_V2_USER_CMD_AUTO: - case SPECTRE_V2_USER_CMD_SECCOMP: - static_branch_enable(&switch_mm_cond_ibpb); - break; - } + if (!boot_cpu_has(X86_FEATURE_IBPB)) + spectre_v2_user_ibpb = SPECTRE_V2_USER_NONE; - pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n", - static_key_enabled(&switch_mm_always_ibpb) ? - "always-on" : "conditional"); + if (!boot_cpu_has(X86_FEATURE_STIBP)) + spectre_v2_user_stibp = SPECTRE_V2_USER_NONE; +} + +static void __init spectre_v2_user_update_mitigation(void) +{ + if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP)) + return; + + /* The spectre_v2 cmd line can override spectre_v2_user options */ + if (spectre_v2_cmd == SPECTRE_V2_CMD_NONE) { + spectre_v2_user_ibpb = SPECTRE_V2_USER_NONE; + spectre_v2_user_stibp = SPECTRE_V2_USER_NONE; + } else if (spectre_v2_cmd == SPECTRE_V2_CMD_FORCE) { + spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT; + spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT; } /* @@ -1402,151 +1840,106 @@ spectre_v2_user_select_mitigation(void) * so allow for STIBP to be selected in those cases. */ if (!boot_cpu_has(X86_FEATURE_STIBP) || - !smt_possible || + !cpu_smt_possible() || (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && - !boot_cpu_has(X86_FEATURE_AUTOIBRS))) + !boot_cpu_has(X86_FEATURE_AUTOIBRS))) { + spectre_v2_user_stibp = SPECTRE_V2_USER_NONE; return; + } - /* - * At this point, an STIBP mode other than "off" has been set. - * If STIBP support is not being forced, check if STIBP always-on - * is preferred. - */ - if (mode != SPECTRE_V2_USER_STRICT && - boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON)) - mode = SPECTRE_V2_USER_STRICT_PREFERRED; - - if (retbleed_mitigation == RETBLEED_MITIGATION_UNRET || - retbleed_mitigation == RETBLEED_MITIGATION_IBPB) { - if (mode != SPECTRE_V2_USER_STRICT && - mode != SPECTRE_V2_USER_STRICT_PREFERRED) + if (spectre_v2_user_stibp != SPECTRE_V2_USER_NONE && + (retbleed_mitigation == RETBLEED_MITIGATION_UNRET || + retbleed_mitigation == RETBLEED_MITIGATION_IBPB)) { + if (spectre_v2_user_stibp != SPECTRE_V2_USER_STRICT && + spectre_v2_user_stibp != SPECTRE_V2_USER_STRICT_PREFERRED) pr_info("Selecting STIBP always-on mode to complement retbleed mitigation\n"); - mode = SPECTRE_V2_USER_STRICT_PREFERRED; + spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT_PREFERRED; } + pr_info("%s\n", spectre_v2_user_strings[spectre_v2_user_stibp]); +} - spectre_v2_user_stibp = mode; +static void __init spectre_v2_user_apply_mitigation(void) +{ + /* Initialize Indirect Branch Prediction Barrier */ + if (spectre_v2_user_ibpb != SPECTRE_V2_USER_NONE) { + static_branch_enable(&switch_vcpu_ibpb); -set_mode: - pr_info("%s\n", spectre_v2_user_strings[mode]); + switch (spectre_v2_user_ibpb) { + case SPECTRE_V2_USER_STRICT: + static_branch_enable(&switch_mm_always_ibpb); + break; + case SPECTRE_V2_USER_PRCTL: + case SPECTRE_V2_USER_SECCOMP: + static_branch_enable(&switch_mm_cond_ibpb); + break; + default: + break; + } + + pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n", + static_key_enabled(&switch_mm_always_ibpb) ? + "always-on" : "conditional"); + } } static const char * const spectre_v2_strings[] = { [SPECTRE_V2_NONE] = "Vulnerable", [SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines", - [SPECTRE_V2_LFENCE] = "Mitigation: LFENCE", + [SPECTRE_V2_LFENCE] = "Vulnerable: LFENCE", [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced / Automatic IBRS", [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced / Automatic IBRS + LFENCE", [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced / Automatic IBRS + Retpolines", [SPECTRE_V2_IBRS] = "Mitigation: IBRS", }; -static const struct { - const char *option; - enum spectre_v2_mitigation_cmd cmd; - bool secure; -} mitigation_options[] __initconst = { - { "off", SPECTRE_V2_CMD_NONE, false }, - { "on", SPECTRE_V2_CMD_FORCE, true }, - { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false }, - { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false }, - { "retpoline,lfence", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false }, - { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false }, - { "eibrs", SPECTRE_V2_CMD_EIBRS, false }, - { "eibrs,lfence", SPECTRE_V2_CMD_EIBRS_LFENCE, false }, - { "eibrs,retpoline", SPECTRE_V2_CMD_EIBRS_RETPOLINE, false }, - { "auto", SPECTRE_V2_CMD_AUTO, false }, - { "ibrs", SPECTRE_V2_CMD_IBRS, false }, -}; +static bool nospectre_v2 __ro_after_init; -static void __init spec_v2_print_cond(const char *reason, bool secure) +static int __init nospectre_v2_parse_cmdline(char *str) { - if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) != secure) - pr_info("%s selected on command line.\n", reason); + nospectre_v2 = true; + spectre_v2_cmd = SPECTRE_V2_CMD_NONE; + return 0; } +early_param("nospectre_v2", nospectre_v2_parse_cmdline); -static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) +static int __init spectre_v2_parse_cmdline(char *str) { - enum spectre_v2_mitigation_cmd cmd; - char arg[20]; - int ret, i; - - cmd = IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? SPECTRE_V2_CMD_AUTO : SPECTRE_V2_CMD_NONE; - if (cmdline_find_option_bool(boot_command_line, "nospectre_v2") || - cpu_mitigations_off()) - return SPECTRE_V2_CMD_NONE; - - ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg)); - if (ret < 0) - return cmd; - - for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) { - if (!match_option(arg, ret, mitigation_options[i].option)) - continue; - cmd = mitigation_options[i].cmd; - break; - } - - if (i >= ARRAY_SIZE(mitigation_options)) { - pr_err("unknown option (%s). Switching to default mode\n", arg); - return cmd; - } - - if ((cmd == SPECTRE_V2_CMD_RETPOLINE || - cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE || - cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC || - cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || - cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && - !IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)) { - pr_err("%s selected but not compiled in. Switching to AUTO select\n", - mitigation_options[i].option); - return SPECTRE_V2_CMD_AUTO; - } - - if ((cmd == SPECTRE_V2_CMD_EIBRS || - cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || - cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && - !boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { - pr_err("%s selected but CPU doesn't have Enhanced or Automatic IBRS. Switching to AUTO select\n", - mitigation_options[i].option); - return SPECTRE_V2_CMD_AUTO; - } - - if ((cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE || - cmd == SPECTRE_V2_CMD_EIBRS_LFENCE) && - !boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { - pr_err("%s selected, but CPU doesn't have a serializing LFENCE. Switching to AUTO select\n", - mitigation_options[i].option); - return SPECTRE_V2_CMD_AUTO; - } - - if (cmd == SPECTRE_V2_CMD_IBRS && !IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY)) { - pr_err("%s selected but not compiled in. Switching to AUTO select\n", - mitigation_options[i].option); - return SPECTRE_V2_CMD_AUTO; - } - - if (cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { - pr_err("%s selected but not Intel CPU. Switching to AUTO select\n", - mitigation_options[i].option); - return SPECTRE_V2_CMD_AUTO; - } + if (!str) + return -EINVAL; - if (cmd == SPECTRE_V2_CMD_IBRS && !boot_cpu_has(X86_FEATURE_IBRS)) { - pr_err("%s selected but CPU doesn't have IBRS. Switching to AUTO select\n", - mitigation_options[i].option); - return SPECTRE_V2_CMD_AUTO; - } + if (nospectre_v2) + return 0; - if (cmd == SPECTRE_V2_CMD_IBRS && cpu_feature_enabled(X86_FEATURE_XENPV)) { - pr_err("%s selected but running as XenPV guest. Switching to AUTO select\n", - mitigation_options[i].option); - return SPECTRE_V2_CMD_AUTO; + if (!strcmp(str, "off")) { + spectre_v2_cmd = SPECTRE_V2_CMD_NONE; + } else if (!strcmp(str, "on")) { + spectre_v2_cmd = SPECTRE_V2_CMD_FORCE; + setup_force_cpu_bug(X86_BUG_SPECTRE_V2); + setup_force_cpu_bug(X86_BUG_SPECTRE_V2_USER); + } else if (!strcmp(str, "retpoline")) { + spectre_v2_cmd = SPECTRE_V2_CMD_RETPOLINE; + } else if (!strcmp(str, "retpoline,amd") || + !strcmp(str, "retpoline,lfence")) { + spectre_v2_cmd = SPECTRE_V2_CMD_RETPOLINE_LFENCE; + } else if (!strcmp(str, "retpoline,generic")) { + spectre_v2_cmd = SPECTRE_V2_CMD_RETPOLINE_GENERIC; + } else if (!strcmp(str, "eibrs")) { + spectre_v2_cmd = SPECTRE_V2_CMD_EIBRS; + } else if (!strcmp(str, "eibrs,lfence")) { + spectre_v2_cmd = SPECTRE_V2_CMD_EIBRS_LFENCE; + } else if (!strcmp(str, "eibrs,retpoline")) { + spectre_v2_cmd = SPECTRE_V2_CMD_EIBRS_RETPOLINE; + } else if (!strcmp(str, "auto")) { + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } else if (!strcmp(str, "ibrs")) { + spectre_v2_cmd = SPECTRE_V2_CMD_IBRS; + } else { + pr_err("Ignoring unknown spectre_v2 option (%s).", str); } - spec_v2_print_cond(mitigation_options[i].option, - mitigation_options[i].secure); - return cmd; + return 0; } +early_param("spectre_v2", spectre_v2_parse_cmdline); static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void) { @@ -1579,51 +1972,54 @@ static void __init spec_ctrl_disable_kernel_rrsba(void) rrsba_disabled = true; } -static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_mitigation mode) +static void __init spectre_v2_select_rsb_mitigation(enum spectre_v2_mitigation mode) { /* - * Similar to context switches, there are two types of RSB attacks - * after VM exit: + * WARNING! There are many subtleties to consider when changing *any* + * code related to RSB-related mitigations. Before doing so, carefully + * read the following document, and update if necessary: * - * 1) RSB underflow + * Documentation/admin-guide/hw-vuln/rsb.rst * - * 2) Poisoned RSB entry + * In an overly simplified nutshell: * - * When retpoline is enabled, both are mitigated by filling/clearing - * the RSB. + * - User->user RSB attacks are conditionally mitigated during + * context switches by cond_mitigation -> write_ibpb(). * - * When IBRS is enabled, while #1 would be mitigated by the IBRS branch - * prediction isolation protections, RSB still needs to be cleared - * because of #2. Note that SMEP provides no protection here, unlike - * user-space-poisoned RSB entries. + * - User->kernel and guest->host attacks are mitigated by eIBRS or + * RSB filling. * - * eIBRS should protect against RSB poisoning, but if the EIBRS_PBRSB - * bug is present then a LITE version of RSB protection is required, - * just a single call needs to retire before a RET is executed. + * Though, depending on config, note that other alternative + * mitigations may end up getting used instead, e.g., IBPB on + * entry/vmexit, call depth tracking, or return thunks. */ + switch (mode) { case SPECTRE_V2_NONE: - return; + break; - case SPECTRE_V2_EIBRS_LFENCE: case SPECTRE_V2_EIBRS: + case SPECTRE_V2_EIBRS_LFENCE: + case SPECTRE_V2_EIBRS_RETPOLINE: if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) { - setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT_LITE); pr_info("Spectre v2 / PBRSB-eIBRS: Retire a single CALL on VMEXIT\n"); + setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT_LITE); } - return; + break; - case SPECTRE_V2_EIBRS_RETPOLINE: case SPECTRE_V2_RETPOLINE: case SPECTRE_V2_LFENCE: case SPECTRE_V2_IBRS: + pr_info("Spectre v2 / SpectreRSB: Filling RSB on context switch and VMEXIT\n"); + setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT); - pr_info("Spectre v2 / SpectreRSB : Filling RSB on VMEXIT\n"); - return; - } + break; - pr_warn_once("Unknown Spectre v2 mode, disabling RSB mitigation at VM exit"); - dump_stack(); + default: + pr_warn_once("Unknown Spectre v2 mode, disabling RSB mitigation\n"); + dump_stack(); + break; + } } /* @@ -1644,12 +2040,13 @@ static bool __init spec_ctrl_bhi_dis(void) enum bhi_mitigations { BHI_MITIGATION_OFF, + BHI_MITIGATION_AUTO, BHI_MITIGATION_ON, BHI_MITIGATION_VMEXIT_ONLY, }; static enum bhi_mitigations bhi_mitigation __ro_after_init = - IS_ENABLED(CONFIG_MITIGATION_SPECTRE_BHI) ? BHI_MITIGATION_ON : BHI_MITIGATION_OFF; + IS_ENABLED(CONFIG_MITIGATION_SPECTRE_BHI) ? BHI_MITIGATION_AUTO : BHI_MITIGATION_OFF; static int __init spectre_bhi_parse_cmdline(char *str) { @@ -1671,6 +2068,30 @@ early_param("spectre_bhi", spectre_bhi_parse_cmdline); static void __init bhi_select_mitigation(void) { + if (!boot_cpu_has(X86_BUG_BHI)) + bhi_mitigation = BHI_MITIGATION_OFF; + + if (bhi_mitigation != BHI_MITIGATION_AUTO) + return; + + if (cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST)) { + if (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL)) + bhi_mitigation = BHI_MITIGATION_ON; + else + bhi_mitigation = BHI_MITIGATION_VMEXIT_ONLY; + } else { + bhi_mitigation = BHI_MITIGATION_OFF; + } +} + +static void __init bhi_update_mitigation(void) +{ + if (spectre_v2_cmd == SPECTRE_V2_CMD_NONE) + bhi_mitigation = BHI_MITIGATION_OFF; +} + +static void __init bhi_apply_mitigation(void) +{ if (bhi_mitigation == BHI_MITIGATION_OFF) return; @@ -1682,95 +2103,148 @@ static void __init bhi_select_mitigation(void) return; } - /* Mitigate in hardware if supported */ - if (spec_ctrl_bhi_dis()) + if (!IS_ENABLED(CONFIG_X86_64)) return; - if (!IS_ENABLED(CONFIG_X86_64)) + /* Mitigate in hardware if supported */ + if (spec_ctrl_bhi_dis()) return; if (bhi_mitigation == BHI_MITIGATION_VMEXIT_ONLY) { pr_info("Spectre BHI mitigation: SW BHB clearing on VM exit only\n"); - setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT); + setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_VMEXIT); return; } pr_info("Spectre BHI mitigation: SW BHB clearing on syscall and VM exit\n"); setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP); - setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT); + setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_VMEXIT); } static void __init spectre_v2_select_mitigation(void) { - enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); - enum spectre_v2_mitigation mode = SPECTRE_V2_NONE; + if ((spectre_v2_cmd == SPECTRE_V2_CMD_RETPOLINE || + spectre_v2_cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE || + spectre_v2_cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC || + spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || + spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && + !IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)) { + pr_err("RETPOLINE selected but not compiled in. Switching to AUTO select\n"); + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } - /* - * If the CPU is not affected and the command line mode is NONE or AUTO - * then nothing to do. - */ - if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && - (cmd == SPECTRE_V2_CMD_NONE || cmd == SPECTRE_V2_CMD_AUTO)) + if ((spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS || + spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || + spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && + !boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { + pr_err("EIBRS selected but CPU doesn't have Enhanced or Automatic IBRS. Switching to AUTO select\n"); + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } + + if ((spectre_v2_cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE || + spectre_v2_cmd == SPECTRE_V2_CMD_EIBRS_LFENCE) && + !boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { + pr_err("LFENCE selected, but CPU doesn't have a serializing LFENCE. Switching to AUTO select\n"); + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } + + if (spectre_v2_cmd == SPECTRE_V2_CMD_IBRS && !IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY)) { + pr_err("IBRS selected but not compiled in. Switching to AUTO select\n"); + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } + + if (spectre_v2_cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { + pr_err("IBRS selected but not Intel CPU. Switching to AUTO select\n"); + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } + + if (spectre_v2_cmd == SPECTRE_V2_CMD_IBRS && !boot_cpu_has(X86_FEATURE_IBRS)) { + pr_err("IBRS selected but CPU doesn't have IBRS. Switching to AUTO select\n"); + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } + + if (spectre_v2_cmd == SPECTRE_V2_CMD_IBRS && cpu_feature_enabled(X86_FEATURE_XENPV)) { + pr_err("IBRS selected but running as XenPV guest. Switching to AUTO select\n"); + spectre_v2_cmd = SPECTRE_V2_CMD_AUTO; + } + + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) { + spectre_v2_cmd = SPECTRE_V2_CMD_NONE; return; + } - switch (cmd) { + switch (spectre_v2_cmd) { case SPECTRE_V2_CMD_NONE: return; - case SPECTRE_V2_CMD_FORCE: case SPECTRE_V2_CMD_AUTO: - if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { - mode = SPECTRE_V2_EIBRS; + if (!should_mitigate_vuln(X86_BUG_SPECTRE_V2)) break; - } - - if (IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY) && - boot_cpu_has_bug(X86_BUG_RETBLEED) && - retbleed_cmd != RETBLEED_CMD_OFF && - retbleed_cmd != RETBLEED_CMD_STUFF && - boot_cpu_has(X86_FEATURE_IBRS) && - boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { - mode = SPECTRE_V2_IBRS; + fallthrough; + case SPECTRE_V2_CMD_FORCE: + if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { + spectre_v2_enabled = SPECTRE_V2_EIBRS; break; } - mode = spectre_v2_select_retpoline(); + spectre_v2_enabled = spectre_v2_select_retpoline(); break; case SPECTRE_V2_CMD_RETPOLINE_LFENCE: pr_err(SPECTRE_V2_LFENCE_MSG); - mode = SPECTRE_V2_LFENCE; + spectre_v2_enabled = SPECTRE_V2_LFENCE; break; case SPECTRE_V2_CMD_RETPOLINE_GENERIC: - mode = SPECTRE_V2_RETPOLINE; + spectre_v2_enabled = SPECTRE_V2_RETPOLINE; break; case SPECTRE_V2_CMD_RETPOLINE: - mode = spectre_v2_select_retpoline(); + spectre_v2_enabled = spectre_v2_select_retpoline(); break; case SPECTRE_V2_CMD_IBRS: - mode = SPECTRE_V2_IBRS; + spectre_v2_enabled = SPECTRE_V2_IBRS; break; case SPECTRE_V2_CMD_EIBRS: - mode = SPECTRE_V2_EIBRS; + spectre_v2_enabled = SPECTRE_V2_EIBRS; break; case SPECTRE_V2_CMD_EIBRS_LFENCE: - mode = SPECTRE_V2_EIBRS_LFENCE; + spectre_v2_enabled = SPECTRE_V2_EIBRS_LFENCE; break; case SPECTRE_V2_CMD_EIBRS_RETPOLINE: - mode = SPECTRE_V2_EIBRS_RETPOLINE; + spectre_v2_enabled = SPECTRE_V2_EIBRS_RETPOLINE; break; } +} + +static void __init spectre_v2_update_mitigation(void) +{ + if (spectre_v2_cmd == SPECTRE_V2_CMD_AUTO && + !spectre_v2_in_eibrs_mode(spectre_v2_enabled)) { + if (IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY) && + boot_cpu_has_bug(X86_BUG_RETBLEED) && + retbleed_mitigation != RETBLEED_MITIGATION_NONE && + retbleed_mitigation != RETBLEED_MITIGATION_STUFF && + boot_cpu_has(X86_FEATURE_IBRS) && + boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { + spectre_v2_enabled = SPECTRE_V2_IBRS; + } + } + + if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) + pr_info("%s\n", spectre_v2_strings[spectre_v2_enabled]); +} - if (mode == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) +static void __init spectre_v2_apply_mitigation(void) +{ + if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); - if (spectre_v2_in_ibrs_mode(mode)) { + if (spectre_v2_in_ibrs_mode(spectre_v2_enabled)) { if (boot_cpu_has(X86_FEATURE_AUTOIBRS)) { msr_set_bit(MSR_EFER, _EFER_AUTOIBRS); } else { @@ -1779,8 +2253,10 @@ static void __init spectre_v2_select_mitigation(void) } } - switch (mode) { + switch (spectre_v2_enabled) { case SPECTRE_V2_NONE: + return; + case SPECTRE_V2_EIBRS: break; @@ -1806,59 +2282,12 @@ static void __init spectre_v2_select_mitigation(void) * JMPs gets protection against BHI and Intramode-BTI, but RET * prediction from a non-RSB predictor is still a risk. */ - if (mode == SPECTRE_V2_EIBRS_LFENCE || - mode == SPECTRE_V2_EIBRS_RETPOLINE || - mode == SPECTRE_V2_RETPOLINE) + if (spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE || + spectre_v2_enabled == SPECTRE_V2_EIBRS_RETPOLINE || + spectre_v2_enabled == SPECTRE_V2_RETPOLINE) spec_ctrl_disable_kernel_rrsba(); - if (boot_cpu_has(X86_BUG_BHI)) - bhi_select_mitigation(); - - spectre_v2_enabled = mode; - pr_info("%s\n", spectre_v2_strings[mode]); - - /* - * If Spectre v2 protection has been enabled, fill the RSB during a - * context switch. In general there are two types of RSB attacks - * across context switches, for which the CALLs/RETs may be unbalanced. - * - * 1) RSB underflow - * - * Some Intel parts have "bottomless RSB". When the RSB is empty, - * speculated return targets may come from the branch predictor, - * which could have a user-poisoned BTB or BHB entry. - * - * AMD has it even worse: *all* returns are speculated from the BTB, - * regardless of the state of the RSB. - * - * When IBRS or eIBRS is enabled, the "user -> kernel" attack - * scenario is mitigated by the IBRS branch prediction isolation - * properties, so the RSB buffer filling wouldn't be necessary to - * protect against this type of attack. - * - * The "user -> user" attack scenario is mitigated by RSB filling. - * - * 2) Poisoned RSB entry - * - * If the 'next' in-kernel return stack is shorter than 'prev', - * 'next' could be tricked into speculating with a user-poisoned RSB - * entry. - * - * The "user -> kernel" attack scenario is mitigated by SMEP and - * eIBRS. - * - * The "user -> user" scenario, also known as SpectreBHB, requires - * RSB clearing. - * - * So to mitigate all cases, unconditionally fill RSB on context - * switches. - * - * FIXME: Is this pointless for retbleed-affected AMD? - */ - setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); - pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n"); - - spectre_v2_determine_rsb_fill_type_at_vmexit(mode); + spectre_v2_select_rsb_mitigation(spectre_v2_enabled); /* * Retpoline protects the kernel, but doesn't protect firmware. IBRS @@ -1866,28 +2295,26 @@ static void __init spectre_v2_select_mitigation(void) * firmware calls only when IBRS / Enhanced / Automatic IBRS aren't * otherwise enabled. * - * Use "mode" to check Enhanced IBRS instead of boot_cpu_has(), because - * the user might select retpoline on the kernel command line and if - * the CPU supports Enhanced IBRS, kernel might un-intentionally not - * enable IBRS around firmware calls. + * Use "spectre_v2_enabled" to check Enhanced IBRS instead of + * boot_cpu_has(), because the user might select retpoline on the kernel + * command line and if the CPU supports Enhanced IBRS, kernel might + * un-intentionally not enable IBRS around firmware calls. */ if (boot_cpu_has_bug(X86_BUG_RETBLEED) && boot_cpu_has(X86_FEATURE_IBPB) && (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)) { - if (retbleed_cmd != RETBLEED_CMD_IBPB) { + if (retbleed_mitigation != RETBLEED_MITIGATION_IBPB) { setup_force_cpu_cap(X86_FEATURE_USE_IBPB_FW); pr_info("Enabling Speculation Barrier for firmware calls\n"); } - } else if (boot_cpu_has(X86_FEATURE_IBRS) && !spectre_v2_in_ibrs_mode(mode)) { + } else if (boot_cpu_has(X86_FEATURE_IBRS) && + !spectre_v2_in_ibrs_mode(spectre_v2_enabled)) { setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW); pr_info("Enabling Restricted Speculation for firmware calls\n"); } - - /* Set up IBPB and STIBP depending on the general spectre V2 command */ - spectre_v2_cmd = cmd; } static void update_stibp_msr(void * __unused) @@ -1940,86 +2367,18 @@ static void update_mds_branch_idle(void) return; if (sched_smt_active()) { - static_branch_enable(&mds_idle_clear); + static_branch_enable(&cpu_buf_idle_clear); } else if (mmio_mitigation == MMIO_MITIGATION_OFF || (x86_arch_cap_msr & ARCH_CAP_FBSDP_NO)) { - static_branch_disable(&mds_idle_clear); - } -} - -#define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n" -#define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n" -#define MMIO_MSG_SMT "MMIO Stale Data CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/processor_mmio_stale_data.html for more details.\n" - -void cpu_bugs_smt_update(void) -{ - mutex_lock(&spec_ctrl_mutex); - - if (sched_smt_active() && unprivileged_ebpf_enabled() && - spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE) - pr_warn_once(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG); - - switch (spectre_v2_user_stibp) { - case SPECTRE_V2_USER_NONE: - break; - case SPECTRE_V2_USER_STRICT: - case SPECTRE_V2_USER_STRICT_PREFERRED: - update_stibp_strict(); - break; - case SPECTRE_V2_USER_PRCTL: - case SPECTRE_V2_USER_SECCOMP: - update_indir_branch_cond(); - break; - } - - switch (mds_mitigation) { - case MDS_MITIGATION_FULL: - case MDS_MITIGATION_VMWERV: - if (sched_smt_active() && !boot_cpu_has(X86_BUG_MSBDS_ONLY)) - pr_warn_once(MDS_MSG_SMT); - update_mds_branch_idle(); - break; - case MDS_MITIGATION_OFF: - break; + static_branch_disable(&cpu_buf_idle_clear); } - - switch (taa_mitigation) { - case TAA_MITIGATION_VERW: - case TAA_MITIGATION_UCODE_NEEDED: - if (sched_smt_active()) - pr_warn_once(TAA_MSG_SMT); - break; - case TAA_MITIGATION_TSX_DISABLED: - case TAA_MITIGATION_OFF: - break; - } - - switch (mmio_mitigation) { - case MMIO_MITIGATION_VERW: - case MMIO_MITIGATION_UCODE_NEEDED: - if (sched_smt_active()) - pr_warn_once(MMIO_MSG_SMT); - break; - case MMIO_MITIGATION_OFF: - break; - } - - mutex_unlock(&spec_ctrl_mutex); } #undef pr_fmt #define pr_fmt(fmt) "Speculative Store Bypass: " fmt -static enum ssb_mitigation ssb_mode __ro_after_init = SPEC_STORE_BYPASS_NONE; - -/* The kernel command line selection */ -enum ssb_mitigation_cmd { - SPEC_STORE_BYPASS_CMD_NONE, - SPEC_STORE_BYPASS_CMD_AUTO, - SPEC_STORE_BYPASS_CMD_ON, - SPEC_STORE_BYPASS_CMD_PRCTL, - SPEC_STORE_BYPASS_CMD_SECCOMP, -}; +static enum ssb_mitigation ssb_mode __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_SSB) ? SPEC_STORE_BYPASS_AUTO : SPEC_STORE_BYPASS_NONE; static const char * const ssb_strings[] = { [SPEC_STORE_BYPASS_NONE] = "Vulnerable", @@ -2028,94 +2387,72 @@ static const char * const ssb_strings[] = { [SPEC_STORE_BYPASS_SECCOMP] = "Mitigation: Speculative Store Bypass disabled via prctl and seccomp", }; -static const struct { - const char *option; - enum ssb_mitigation_cmd cmd; -} ssb_mitigation_options[] __initconst = { - { "auto", SPEC_STORE_BYPASS_CMD_AUTO }, /* Platform decides */ - { "on", SPEC_STORE_BYPASS_CMD_ON }, /* Disable Speculative Store Bypass */ - { "off", SPEC_STORE_BYPASS_CMD_NONE }, /* Don't touch Speculative Store Bypass */ - { "prctl", SPEC_STORE_BYPASS_CMD_PRCTL }, /* Disable Speculative Store Bypass via prctl */ - { "seccomp", SPEC_STORE_BYPASS_CMD_SECCOMP }, /* Disable Speculative Store Bypass via prctl and seccomp */ -}; +static bool nossb __ro_after_init; -static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void) +static int __init nossb_parse_cmdline(char *str) { - enum ssb_mitigation_cmd cmd; - char arg[20]; - int ret, i; - - cmd = IS_ENABLED(CONFIG_MITIGATION_SSB) ? - SPEC_STORE_BYPASS_CMD_AUTO : SPEC_STORE_BYPASS_CMD_NONE; - if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable") || - cpu_mitigations_off()) { - return SPEC_STORE_BYPASS_CMD_NONE; - } else { - ret = cmdline_find_option(boot_command_line, "spec_store_bypass_disable", - arg, sizeof(arg)); - if (ret < 0) - return cmd; + nossb = true; + ssb_mode = SPEC_STORE_BYPASS_NONE; + return 0; +} +early_param("nospec_store_bypass_disable", nossb_parse_cmdline); - for (i = 0; i < ARRAY_SIZE(ssb_mitigation_options); i++) { - if (!match_option(arg, ret, ssb_mitigation_options[i].option)) - continue; +static int __init ssb_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; - cmd = ssb_mitigation_options[i].cmd; - break; - } + if (nossb) + return 0; - if (i >= ARRAY_SIZE(ssb_mitigation_options)) { - pr_err("unknown option (%s). Switching to default mode\n", arg); - return cmd; - } - } + if (!strcmp(str, "auto")) + ssb_mode = SPEC_STORE_BYPASS_AUTO; + else if (!strcmp(str, "on")) + ssb_mode = SPEC_STORE_BYPASS_DISABLE; + else if (!strcmp(str, "off")) + ssb_mode = SPEC_STORE_BYPASS_NONE; + else if (!strcmp(str, "prctl")) + ssb_mode = SPEC_STORE_BYPASS_PRCTL; + else if (!strcmp(str, "seccomp")) + ssb_mode = IS_ENABLED(CONFIG_SECCOMP) ? + SPEC_STORE_BYPASS_SECCOMP : SPEC_STORE_BYPASS_PRCTL; + else + pr_err("Ignoring unknown spec_store_bypass_disable option (%s).\n", + str); - return cmd; + return 0; } +early_param("spec_store_bypass_disable", ssb_parse_cmdline); -static enum ssb_mitigation __init __ssb_select_mitigation(void) +static void __init ssb_select_mitigation(void) { - enum ssb_mitigation mode = SPEC_STORE_BYPASS_NONE; - enum ssb_mitigation_cmd cmd; - - if (!boot_cpu_has(X86_FEATURE_SSBD)) - return mode; - - cmd = ssb_parse_cmdline(); - if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS) && - (cmd == SPEC_STORE_BYPASS_CMD_NONE || - cmd == SPEC_STORE_BYPASS_CMD_AUTO)) - return mode; + if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) { + ssb_mode = SPEC_STORE_BYPASS_NONE; + return; + } - switch (cmd) { - case SPEC_STORE_BYPASS_CMD_SECCOMP: - /* - * Choose prctl+seccomp as the default mode if seccomp is - * enabled. - */ - if (IS_ENABLED(CONFIG_SECCOMP)) - mode = SPEC_STORE_BYPASS_SECCOMP; + if (ssb_mode == SPEC_STORE_BYPASS_AUTO) { + if (should_mitigate_vuln(X86_BUG_SPEC_STORE_BYPASS)) + ssb_mode = SPEC_STORE_BYPASS_PRCTL; else - mode = SPEC_STORE_BYPASS_PRCTL; - break; - case SPEC_STORE_BYPASS_CMD_ON: - mode = SPEC_STORE_BYPASS_DISABLE; - break; - case SPEC_STORE_BYPASS_CMD_AUTO: - case SPEC_STORE_BYPASS_CMD_PRCTL: - mode = SPEC_STORE_BYPASS_PRCTL; - break; - case SPEC_STORE_BYPASS_CMD_NONE: - break; + ssb_mode = SPEC_STORE_BYPASS_NONE; } + if (!boot_cpu_has(X86_FEATURE_SSBD)) + ssb_mode = SPEC_STORE_BYPASS_NONE; + + pr_info("%s\n", ssb_strings[ssb_mode]); +} + +static void __init ssb_apply_mitigation(void) +{ /* * We have three CPU feature flags that are in play here: * - X86_BUG_SPEC_STORE_BYPASS - CPU is susceptible. * - X86_FEATURE_SSBD - CPU is able to turn off speculative store bypass * - X86_FEATURE_SPEC_STORE_BYPASS_DISABLE - engage the mitigation */ - if (mode == SPEC_STORE_BYPASS_DISABLE) { + if (ssb_mode == SPEC_STORE_BYPASS_DISABLE) { setup_force_cpu_cap(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE); /* * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD may @@ -2129,16 +2466,6 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) update_spec_ctrl(x86_spec_ctrl_base); } } - - return mode; -} - -static void ssb_select_mitigation(void) -{ - ssb_mode = __ssb_select_mitigation(); - - if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) - pr_info("%s\n", ssb_strings[ssb_mode]); } #undef pr_fmt @@ -2330,6 +2657,7 @@ static int ssb_prctl_get(struct task_struct *task) return PR_SPEC_DISABLE; case SPEC_STORE_BYPASS_SECCOMP: case SPEC_STORE_BYPASS_PRCTL: + case SPEC_STORE_BYPASS_AUTO: if (task_spec_ssb_force_disable(task)) return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE; if (task_spec_ssb_noexec(task)) @@ -2387,19 +2715,17 @@ void x86_spec_ctrl_setup_ap(void) } bool itlb_multihit_kvm_mitigation; -EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation); +EXPORT_SYMBOL_FOR_KVM(itlb_multihit_kvm_mitigation); #undef pr_fmt #define pr_fmt(fmt) "L1TF: " fmt /* Default mitigation for L1TF-affected CPUs */ enum l1tf_mitigations l1tf_mitigation __ro_after_init = - IS_ENABLED(CONFIG_MITIGATION_L1TF) ? L1TF_MITIGATION_FLUSH : L1TF_MITIGATION_OFF; -#if IS_ENABLED(CONFIG_KVM_INTEL) -EXPORT_SYMBOL_GPL(l1tf_mitigation); -#endif + IS_ENABLED(CONFIG_MITIGATION_L1TF) ? L1TF_MITIGATION_AUTO : L1TF_MITIGATION_OFF; +EXPORT_SYMBOL_FOR_KVM(l1tf_mitigation); enum vmx_l1d_flush_state l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; -EXPORT_SYMBOL_GPL(l1tf_vmx_mitigation); +EXPORT_SYMBOL_FOR_KVM(l1tf_vmx_mitigation); /* * These CPUs all support 44bits physical address space internally in the @@ -2442,15 +2768,31 @@ static void override_cache_bits(struct cpuinfo_x86 *c) static void __init l1tf_select_mitigation(void) { - u64 half_pa; + if (!boot_cpu_has_bug(X86_BUG_L1TF)) { + l1tf_mitigation = L1TF_MITIGATION_OFF; + return; + } - if (!boot_cpu_has_bug(X86_BUG_L1TF)) + if (l1tf_mitigation != L1TF_MITIGATION_AUTO) return; - if (cpu_mitigations_off()) + if (!should_mitigate_vuln(X86_BUG_L1TF)) { l1tf_mitigation = L1TF_MITIGATION_OFF; - else if (cpu_mitigations_auto_nosmt()) + return; + } + + if (smt_mitigations == SMT_MITIGATIONS_ON) l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT; + else + l1tf_mitigation = L1TF_MITIGATION_FLUSH; +} + +static void __init l1tf_apply_mitigation(void) +{ + u64 half_pa; + + if (!boot_cpu_has_bug(X86_BUG_L1TF)) + return; override_cache_bits(&boot_cpu_data); @@ -2458,6 +2800,7 @@ static void __init l1tf_select_mitigation(void) case L1TF_MITIGATION_OFF: case L1TF_MITIGATION_FLUSH_NOWARN: case L1TF_MITIGATION_FLUSH: + case L1TF_MITIGATION_AUTO: break; case L1TF_MITIGATION_FLUSH_NOSMT: case L1TF_MITIGATION_FULL: @@ -2515,52 +2858,33 @@ early_param("l1tf", l1tf_cmdline); #undef pr_fmt #define pr_fmt(fmt) "Speculative Return Stack Overflow: " fmt -enum srso_mitigation { - SRSO_MITIGATION_NONE, - SRSO_MITIGATION_UCODE_NEEDED, - SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED, - SRSO_MITIGATION_MICROCODE, - SRSO_MITIGATION_SAFE_RET, - SRSO_MITIGATION_IBPB, - SRSO_MITIGATION_IBPB_ON_VMEXIT, -}; - -enum srso_mitigation_cmd { - SRSO_CMD_OFF, - SRSO_CMD_MICROCODE, - SRSO_CMD_SAFE_RET, - SRSO_CMD_IBPB, - SRSO_CMD_IBPB_ON_VMEXIT, -}; - static const char * const srso_strings[] = { [SRSO_MITIGATION_NONE] = "Vulnerable", [SRSO_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode", [SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED] = "Vulnerable: Safe RET, no microcode", [SRSO_MITIGATION_MICROCODE] = "Vulnerable: Microcode, no safe RET", + [SRSO_MITIGATION_NOSMT] = "Mitigation: SMT disabled", [SRSO_MITIGATION_SAFE_RET] = "Mitigation: Safe RET", [SRSO_MITIGATION_IBPB] = "Mitigation: IBPB", - [SRSO_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT only" + [SRSO_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT only", + [SRSO_MITIGATION_BP_SPEC_REDUCE] = "Mitigation: Reduced Speculation" }; -static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_NONE; -static enum srso_mitigation_cmd srso_cmd __ro_after_init = SRSO_CMD_SAFE_RET; - static int __init srso_parse_cmdline(char *str) { if (!str) return -EINVAL; if (!strcmp(str, "off")) - srso_cmd = SRSO_CMD_OFF; + srso_mitigation = SRSO_MITIGATION_NONE; else if (!strcmp(str, "microcode")) - srso_cmd = SRSO_CMD_MICROCODE; + srso_mitigation = SRSO_MITIGATION_MICROCODE; else if (!strcmp(str, "safe-ret")) - srso_cmd = SRSO_CMD_SAFE_RET; + srso_mitigation = SRSO_MITIGATION_SAFE_RET; else if (!strcmp(str, "ibpb")) - srso_cmd = SRSO_CMD_IBPB; + srso_mitigation = SRSO_MITIGATION_IBPB; else if (!strcmp(str, "ibpb-vmexit")) - srso_cmd = SRSO_CMD_IBPB_ON_VMEXIT; + srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT; else pr_err("Ignoring unknown SRSO option (%s).", str); @@ -2572,120 +2896,428 @@ early_param("spec_rstack_overflow", srso_parse_cmdline); static void __init srso_select_mitigation(void) { - bool has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE); - - if (!boot_cpu_has_bug(X86_BUG_SRSO) || - cpu_mitigations_off() || - srso_cmd == SRSO_CMD_OFF) { - if (boot_cpu_has(X86_FEATURE_SBPB)) - x86_pred_cmd = PRED_CMD_SBPB; + if (!boot_cpu_has_bug(X86_BUG_SRSO)) { + srso_mitigation = SRSO_MITIGATION_NONE; return; } - if (has_microcode) { + if (srso_mitigation == SRSO_MITIGATION_AUTO) { /* - * Zen1/2 with SMT off aren't vulnerable after the right - * IBPB microcode has been applied. - * - * Zen1/2 don't have SBPB, no need to try to enable it here. + * Use safe-RET if user->kernel or guest->host protection is + * required. Otherwise the 'microcode' mitigation is sufficient + * to protect the user->user and guest->guest vectors. */ - if (boot_cpu_data.x86 < 0x19 && !cpu_smt_possible()) { - setup_force_cpu_cap(X86_FEATURE_SRSO_NO); + if (cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_HOST) || + (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_KERNEL) && + !boot_cpu_has(X86_FEATURE_SRSO_USER_KERNEL_NO))) { + srso_mitigation = SRSO_MITIGATION_SAFE_RET; + } else if (cpu_attack_vector_mitigated(CPU_MITIGATE_USER_USER) || + cpu_attack_vector_mitigated(CPU_MITIGATE_GUEST_GUEST)) { + srso_mitigation = SRSO_MITIGATION_MICROCODE; + } else { + srso_mitigation = SRSO_MITIGATION_NONE; return; } + } - if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB) { - srso_mitigation = SRSO_MITIGATION_IBPB; - goto out; - } - } else { + /* Zen1/2 with SMT off aren't vulnerable to SRSO. */ + if (boot_cpu_data.x86 < 0x19 && !cpu_smt_possible()) { + srso_mitigation = SRSO_MITIGATION_NOSMT; + return; + } + + if (!boot_cpu_has(X86_FEATURE_IBPB_BRTYPE)) { pr_warn("IBPB-extending microcode not applied!\n"); pr_warn(SRSO_NOTICE); - /* may be overwritten by SRSO_CMD_SAFE_RET below */ - srso_mitigation = SRSO_MITIGATION_UCODE_NEEDED; + /* + * Safe-RET provides partial mitigation without microcode, but + * other mitigations require microcode to provide any + * mitigations. + */ + if (srso_mitigation == SRSO_MITIGATION_SAFE_RET) + srso_mitigation = SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED; + else + srso_mitigation = SRSO_MITIGATION_UCODE_NEEDED; } - switch (srso_cmd) { - case SRSO_CMD_MICROCODE: - if (has_microcode) { - srso_mitigation = SRSO_MITIGATION_MICROCODE; - pr_warn(SRSO_NOTICE); + switch (srso_mitigation) { + case SRSO_MITIGATION_SAFE_RET: + case SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED: + if (boot_cpu_has(X86_FEATURE_SRSO_USER_KERNEL_NO)) { + srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT; + goto ibpb_on_vmexit; } - break; - case SRSO_CMD_SAFE_RET: - if (IS_ENABLED(CONFIG_MITIGATION_SRSO)) { - /* - * Enable the return thunk for generated code - * like ftrace, static_call, etc. - */ - setup_force_cpu_cap(X86_FEATURE_RETHUNK); - setup_force_cpu_cap(X86_FEATURE_UNRET); - - if (boot_cpu_data.x86 == 0x19) { - setup_force_cpu_cap(X86_FEATURE_SRSO_ALIAS); - x86_return_thunk = srso_alias_return_thunk; - } else { - setup_force_cpu_cap(X86_FEATURE_SRSO); - x86_return_thunk = srso_return_thunk; - } - if (has_microcode) - srso_mitigation = SRSO_MITIGATION_SAFE_RET; - else - srso_mitigation = SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED; - } else { + if (!IS_ENABLED(CONFIG_MITIGATION_SRSO)) { pr_err("WARNING: kernel not compiled with MITIGATION_SRSO.\n"); + srso_mitigation = SRSO_MITIGATION_NONE; } break; - - case SRSO_CMD_IBPB: - if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { - if (has_microcode) { - setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); - srso_mitigation = SRSO_MITIGATION_IBPB; - - /* - * IBPB on entry already obviates the need for - * software-based untraining so clear those in case some - * other mitigation like Retbleed has selected them. - */ - setup_clear_cpu_cap(X86_FEATURE_UNRET); - setup_clear_cpu_cap(X86_FEATURE_RETHUNK); - } - } else { +ibpb_on_vmexit: + case SRSO_MITIGATION_IBPB_ON_VMEXIT: + if (boot_cpu_has(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) { + pr_notice("Reducing speculation to address VM/HV SRSO attack vector.\n"); + srso_mitigation = SRSO_MITIGATION_BP_SPEC_REDUCE; + break; + } + fallthrough; + case SRSO_MITIGATION_IBPB: + if (!IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n"); + srso_mitigation = SRSO_MITIGATION_NONE; } break; + default: + break; + } +} - case SRSO_CMD_IBPB_ON_VMEXIT: - if (IS_ENABLED(CONFIG_MITIGATION_SRSO)) { - if (!boot_cpu_has(X86_FEATURE_ENTRY_IBPB) && has_microcode) { - setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); - srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT; +static void __init srso_update_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_SRSO)) + return; - /* - * There is no need for RSB filling: entry_ibpb() ensures - * all predictions, including the RSB, are invalidated, - * regardless of IBPB implementation. - */ - setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT); - } + /* If retbleed is using IBPB, that works for SRSO as well */ + if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB && + boot_cpu_has(X86_FEATURE_IBPB_BRTYPE)) + srso_mitigation = SRSO_MITIGATION_IBPB; + + pr_info("%s\n", srso_strings[srso_mitigation]); +} + +static void __init srso_apply_mitigation(void) +{ + /* + * Clear the feature flag if this mitigation is not selected as that + * feature flag controls the BpSpecReduce MSR bit toggling in KVM. + */ + if (srso_mitigation != SRSO_MITIGATION_BP_SPEC_REDUCE) + setup_clear_cpu_cap(X86_FEATURE_SRSO_BP_SPEC_REDUCE); + + if (srso_mitigation == SRSO_MITIGATION_NONE) { + if (boot_cpu_has(X86_FEATURE_SBPB)) + x86_pred_cmd = PRED_CMD_SBPB; + return; + } + + switch (srso_mitigation) { + case SRSO_MITIGATION_SAFE_RET: + case SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED: + /* + * Enable the return thunk for generated code + * like ftrace, static_call, etc. + */ + setup_force_cpu_cap(X86_FEATURE_RETHUNK); + setup_force_cpu_cap(X86_FEATURE_UNRET); + + if (boot_cpu_data.x86 == 0x19) { + setup_force_cpu_cap(X86_FEATURE_SRSO_ALIAS); + set_return_thunk(srso_alias_return_thunk); } else { - pr_err("WARNING: kernel not compiled with MITIGATION_SRSO.\n"); - } + setup_force_cpu_cap(X86_FEATURE_SRSO); + set_return_thunk(srso_return_thunk); + } + break; + case SRSO_MITIGATION_IBPB: + setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); + /* + * IBPB on entry already obviates the need for + * software-based untraining so clear those in case some + * other mitigation like Retbleed has selected them. + */ + setup_clear_cpu_cap(X86_FEATURE_UNRET); + setup_clear_cpu_cap(X86_FEATURE_RETHUNK); + fallthrough; + case SRSO_MITIGATION_IBPB_ON_VMEXIT: + setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); + /* + * There is no need for RSB filling: entry_ibpb() ensures + * all predictions, including the RSB, are invalidated, + * regardless of IBPB implementation. + */ + setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT); break; default: break; } +} -out: - pr_info("%s\n", srso_strings[srso_mitigation]); +#undef pr_fmt +#define pr_fmt(fmt) "VMSCAPE: " fmt + +enum vmscape_mitigations { + VMSCAPE_MITIGATION_NONE, + VMSCAPE_MITIGATION_AUTO, + VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER, + VMSCAPE_MITIGATION_IBPB_ON_VMEXIT, +}; + +static const char * const vmscape_strings[] = { + [VMSCAPE_MITIGATION_NONE] = "Vulnerable", + /* [VMSCAPE_MITIGATION_AUTO] */ + [VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER] = "Mitigation: IBPB before exit to userspace", + [VMSCAPE_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT", +}; + +static enum vmscape_mitigations vmscape_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_VMSCAPE) ? VMSCAPE_MITIGATION_AUTO : VMSCAPE_MITIGATION_NONE; + +static int __init vmscape_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) { + vmscape_mitigation = VMSCAPE_MITIGATION_NONE; + } else if (!strcmp(str, "ibpb")) { + vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER; + } else if (!strcmp(str, "force")) { + setup_force_cpu_bug(X86_BUG_VMSCAPE); + vmscape_mitigation = VMSCAPE_MITIGATION_AUTO; + } else { + pr_err("Ignoring unknown vmscape=%s option.\n", str); + } + + return 0; +} +early_param("vmscape", vmscape_parse_cmdline); + +static void __init vmscape_select_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_VMSCAPE) || + !boot_cpu_has(X86_FEATURE_IBPB)) { + vmscape_mitigation = VMSCAPE_MITIGATION_NONE; + return; + } + + if (vmscape_mitigation == VMSCAPE_MITIGATION_AUTO) { + if (should_mitigate_vuln(X86_BUG_VMSCAPE)) + vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER; + else + vmscape_mitigation = VMSCAPE_MITIGATION_NONE; + } +} + +static void __init vmscape_update_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_VMSCAPE)) + return; + + if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB || + srso_mitigation == SRSO_MITIGATION_IBPB_ON_VMEXIT) + vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_ON_VMEXIT; + + pr_info("%s\n", vmscape_strings[vmscape_mitigation]); +} + +static void __init vmscape_apply_mitigation(void) +{ + if (vmscape_mitigation == VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER) + setup_force_cpu_cap(X86_FEATURE_IBPB_EXIT_TO_USER); } #undef pr_fmt #define pr_fmt(fmt) fmt +#define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n" +#define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n" +#define MMIO_MSG_SMT "MMIO Stale Data CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/processor_mmio_stale_data.html for more details.\n" +#define VMSCAPE_MSG_SMT "VMSCAPE: SMT on, STIBP is required for full protection. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/vmscape.html for more details.\n" + +void cpu_bugs_smt_update(void) +{ + mutex_lock(&spec_ctrl_mutex); + + if (sched_smt_active() && unprivileged_ebpf_enabled() && + spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE) + pr_warn_once(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG); + + switch (spectre_v2_user_stibp) { + case SPECTRE_V2_USER_NONE: + break; + case SPECTRE_V2_USER_STRICT: + case SPECTRE_V2_USER_STRICT_PREFERRED: + update_stibp_strict(); + break; + case SPECTRE_V2_USER_PRCTL: + case SPECTRE_V2_USER_SECCOMP: + update_indir_branch_cond(); + break; + } + + switch (mds_mitigation) { + case MDS_MITIGATION_FULL: + case MDS_MITIGATION_AUTO: + case MDS_MITIGATION_VMWERV: + if (sched_smt_active() && !boot_cpu_has(X86_BUG_MSBDS_ONLY)) + pr_warn_once(MDS_MSG_SMT); + update_mds_branch_idle(); + break; + case MDS_MITIGATION_OFF: + break; + } + + switch (taa_mitigation) { + case TAA_MITIGATION_VERW: + case TAA_MITIGATION_AUTO: + case TAA_MITIGATION_UCODE_NEEDED: + if (sched_smt_active()) + pr_warn_once(TAA_MSG_SMT); + break; + case TAA_MITIGATION_TSX_DISABLED: + case TAA_MITIGATION_OFF: + break; + } + + switch (mmio_mitigation) { + case MMIO_MITIGATION_VERW: + case MMIO_MITIGATION_AUTO: + case MMIO_MITIGATION_UCODE_NEEDED: + if (sched_smt_active()) + pr_warn_once(MMIO_MSG_SMT); + break; + case MMIO_MITIGATION_OFF: + break; + } + + switch (tsa_mitigation) { + case TSA_MITIGATION_USER_KERNEL: + case TSA_MITIGATION_VM: + case TSA_MITIGATION_AUTO: + case TSA_MITIGATION_FULL: + /* + * TSA-SQ can potentially lead to info leakage between + * SMT threads. + */ + if (sched_smt_active()) + static_branch_enable(&cpu_buf_idle_clear); + else + static_branch_disable(&cpu_buf_idle_clear); + break; + case TSA_MITIGATION_NONE: + case TSA_MITIGATION_UCODE_NEEDED: + break; + } + + switch (vmscape_mitigation) { + case VMSCAPE_MITIGATION_NONE: + case VMSCAPE_MITIGATION_AUTO: + break; + case VMSCAPE_MITIGATION_IBPB_ON_VMEXIT: + case VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER: + /* + * Hypervisors can be attacked across-threads, warn for SMT when + * STIBP is not already enabled system-wide. + * + * Intel eIBRS (!AUTOIBRS) implies STIBP on. + */ + if (!sched_smt_active() || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED || + (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && + !boot_cpu_has(X86_FEATURE_AUTOIBRS))) + break; + pr_warn_once(VMSCAPE_MSG_SMT); + break; + } + + mutex_unlock(&spec_ctrl_mutex); +} + +void __init cpu_select_mitigations(void) +{ + /* + * Read the SPEC_CTRL MSR to account for reserved bits which may + * have unknown values. AMD64_LS_CFG MSR is cached in the early AMD + * init code as it is not enumerated and depends on the family. + */ + if (cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) { + rdmsrq(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + + /* + * Previously running kernel (kexec), may have some controls + * turned ON. Clear them and let the mitigations setup below + * rediscover them based on configuration. + */ + x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK; + } + + x86_arch_cap_msr = x86_read_arch_cap_msr(); + + cpu_print_attack_vectors(); + + /* Select the proper CPU mitigations before patching alternatives: */ + spectre_v1_select_mitigation(); + spectre_v2_select_mitigation(); + retbleed_select_mitigation(); + spectre_v2_user_select_mitigation(); + ssb_select_mitigation(); + l1tf_select_mitigation(); + mds_select_mitigation(); + taa_select_mitigation(); + mmio_select_mitigation(); + rfds_select_mitigation(); + srbds_select_mitigation(); + l1d_flush_select_mitigation(); + srso_select_mitigation(); + gds_select_mitigation(); + its_select_mitigation(); + bhi_select_mitigation(); + tsa_select_mitigation(); + vmscape_select_mitigation(); + + /* + * After mitigations are selected, some may need to update their + * choices. + */ + spectre_v2_update_mitigation(); + /* + * retbleed_update_mitigation() relies on the state set by + * spectre_v2_update_mitigation(); specifically it wants to know about + * spectre_v2=ibrs. + */ + retbleed_update_mitigation(); + /* + * its_update_mitigation() depends on spectre_v2_update_mitigation() + * and retbleed_update_mitigation(). + */ + its_update_mitigation(); + + /* + * spectre_v2_user_update_mitigation() depends on + * retbleed_update_mitigation(), specifically the STIBP + * selection is forced for UNRET or IBPB. + */ + spectre_v2_user_update_mitigation(); + mds_update_mitigation(); + taa_update_mitigation(); + mmio_update_mitigation(); + rfds_update_mitigation(); + bhi_update_mitigation(); + /* srso_update_mitigation() depends on retbleed_update_mitigation(). */ + srso_update_mitigation(); + vmscape_update_mitigation(); + + spectre_v1_apply_mitigation(); + spectre_v2_apply_mitigation(); + retbleed_apply_mitigation(); + spectre_v2_user_apply_mitigation(); + ssb_apply_mitigation(); + l1tf_apply_mitigation(); + mds_apply_mitigation(); + taa_apply_mitigation(); + mmio_apply_mitigation(); + rfds_apply_mitigation(); + srbds_apply_mitigation(); + srso_apply_mitigation(); + gds_apply_mitigation(); + its_apply_mitigation(); + bhi_apply_mitigation(); + tsa_apply_mitigation(); + vmscape_apply_mitigation(); +} + #ifdef CONFIG_SYSFS #define L1TF_DEFAULT_MSG "Mitigation: PTE Inversion" @@ -2775,9 +3407,6 @@ static ssize_t tsx_async_abort_show_state(char *buf) static ssize_t mmio_stale_data_show_state(char *buf) { - if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) - return sysfs_emit(buf, "Unknown: No mitigations\n"); - if (mmio_mitigation == MMIO_MITIGATION_OFF) return sysfs_emit(buf, "%s\n", mmio_strings[mmio_mitigation]); @@ -2795,6 +3424,19 @@ static ssize_t rfds_show_state(char *buf) return sysfs_emit(buf, "%s\n", rfds_strings[rfds_mitigation]); } +static ssize_t old_microcode_show_state(char *buf) +{ + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return sysfs_emit(buf, "Unknown: running under hypervisor"); + + return sysfs_emit(buf, "Vulnerable\n"); +} + +static ssize_t its_show_state(char *buf) +{ + return sysfs_emit(buf, "%s\n", its_strings[its_mitigation]); +} + static char *stibp_state(void) { if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && @@ -2853,7 +3495,7 @@ static const char *spectre_bhi_state(void) !boot_cpu_has(X86_FEATURE_RETPOLINE_LFENCE) && rrsba_disabled) return "; BHI: Retpoline"; - else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT)) + else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_VMEXIT)) return "; BHI: Vulnerable, KVM: SW loop"; return "; BHI: Vulnerable"; @@ -2861,9 +3503,6 @@ static const char *spectre_bhi_state(void) static ssize_t spectre_v2_show_state(char *buf) { - if (spectre_v2_enabled == SPECTRE_V2_LFENCE) - return sysfs_emit(buf, "Vulnerable: LFENCE\n"); - if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) return sysfs_emit(buf, "Vulnerable: eIBRS with unprivileged eBPF\n"); @@ -2908,9 +3547,6 @@ static ssize_t retbleed_show_state(char *buf) static ssize_t srso_show_state(char *buf) { - if (boot_cpu_has(X86_FEATURE_SRSO_NO)) - return sysfs_emit(buf, "Mitigation: SMT disabled\n"); - return sysfs_emit(buf, "%s\n", srso_strings[srso_mitigation]); } @@ -2919,6 +3555,16 @@ static ssize_t gds_show_state(char *buf) return sysfs_emit(buf, "%s\n", gds_strings[gds_mitigation]); } +static ssize_t tsa_show_state(char *buf) +{ + return sysfs_emit(buf, "%s\n", tsa_strings[tsa_mitigation]); +} + +static ssize_t vmscape_show_state(char *buf) +{ + return sysfs_emit(buf, "%s\n", vmscape_strings[vmscape_mitigation]); +} + static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, char *buf, unsigned int bug) { @@ -2962,7 +3608,6 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr return srbds_show_state(buf); case X86_BUG_MMIO_STALE_DATA: - case X86_BUG_MMIO_UNKNOWN: return mmio_stale_data_show_state(buf); case X86_BUG_RETBLEED: @@ -2977,6 +3622,18 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_RFDS: return rfds_show_state(buf); + case X86_BUG_OLD_MICROCODE: + return old_microcode_show_state(buf); + + case X86_BUG_ITS: + return its_show_state(buf); + + case X86_BUG_TSA: + return tsa_show_state(buf); + + case X86_BUG_VMSCAPE: + return vmscape_show_state(buf); + default: break; } @@ -3031,10 +3688,7 @@ ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, char * ssize_t cpu_show_mmio_stale_data(struct device *dev, struct device_attribute *attr, char *buf) { - if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) - return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_UNKNOWN); - else - return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_STALE_DATA); + return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_STALE_DATA); } ssize_t cpu_show_retbleed(struct device *dev, struct device_attribute *attr, char *buf) @@ -3056,6 +3710,26 @@ ssize_t cpu_show_reg_file_data_sampling(struct device *dev, struct device_attrib { return cpu_show_common(dev, attr, buf, X86_BUG_RFDS); } + +ssize_t cpu_show_old_microcode(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_OLD_MICROCODE); +} + +ssize_t cpu_show_indirect_target_selection(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_ITS); +} + +ssize_t cpu_show_tsa(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_TSA); +} + +ssize_t cpu_show_vmscape(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_VMSCAPE); +} #endif void __warn_thunk(void) diff --git a/arch/x86/kernel/cpu/bus_lock.c b/arch/x86/kernel/cpu/bus_lock.c index 704e9241b964..dbc99a47be45 100644 --- a/arch/x86/kernel/cpu/bus_lock.c +++ b/arch/x86/kernel/cpu/bus_lock.c @@ -6,10 +6,12 @@ #include <linux/workqueue.h> #include <linux/delay.h> #include <linux/cpuhotplug.h> +#include <linux/kvm_types.h> #include <asm/cpu_device_id.h> #include <asm/cmdline.h> #include <asm/traps.h> #include <asm/cpu.h> +#include <asm/msr.h> enum split_lock_detect_state { sld_off = 0, @@ -49,7 +51,7 @@ static unsigned int sysctl_sld_mitigate = 1; static DEFINE_SEMAPHORE(buslock_sem, 1); #ifdef CONFIG_PROC_SYSCTL -static struct ctl_table sld_sysctls[] = { +static const struct ctl_table sld_sysctls[] = { { .procname = "split_lock_mitigate", .data = &sysctl_sld_mitigate, @@ -95,15 +97,15 @@ static bool split_lock_verify_msr(bool on) { u64 ctrl, tmp; - if (rdmsrl_safe(MSR_TEST_CTRL, &ctrl)) + if (rdmsrq_safe(MSR_TEST_CTRL, &ctrl)) return false; if (on) ctrl |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT; else ctrl &= ~MSR_TEST_CTRL_SPLIT_LOCK_DETECT; - if (wrmsrl_safe(MSR_TEST_CTRL, ctrl)) + if (wrmsrq_safe(MSR_TEST_CTRL, ctrl)) return false; - rdmsrl(MSR_TEST_CTRL, tmp); + rdmsrq(MSR_TEST_CTRL, tmp); return ctrl == tmp; } @@ -137,7 +139,7 @@ static void __init __split_lock_setup(void) return; } - rdmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache); + rdmsrq(MSR_TEST_CTRL, msr_test_ctrl_cache); if (!split_lock_verify_msr(true)) { pr_info("MSR access failed: Disabled\n"); @@ -145,7 +147,7 @@ static void __init __split_lock_setup(void) } /* Restore the MSR to its cached value. */ - wrmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache); + wrmsrq(MSR_TEST_CTRL, msr_test_ctrl_cache); setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT); } @@ -162,7 +164,7 @@ static void sld_update_msr(bool on) if (on) test_ctrl_val |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT; - wrmsrl(MSR_TEST_CTRL, test_ctrl_val); + wrmsrq(MSR_TEST_CTRL, test_ctrl_val); } void split_lock_init(void) @@ -192,7 +194,33 @@ static void __split_lock_reenable(struct work_struct *work) { sld_update_msr(true); } -static DECLARE_DELAYED_WORK(sl_reenable, __split_lock_reenable); +/* + * In order for each CPU to schedule its delayed work independently of the + * others, delayed work struct must be per-CPU. This is not required when + * sysctl_sld_mitigate is enabled because of the semaphore that limits + * the number of simultaneously scheduled delayed works to 1. + */ +static DEFINE_PER_CPU(struct delayed_work, sl_reenable); + +/* + * Per-CPU delayed_work can't be statically initialized properly because + * the struct address is unknown. Thus per-CPU delayed_work structures + * have to be initialized during kernel initialization and after calling + * setup_per_cpu_areas(). + */ +static int __init setup_split_lock_delayed_work(void) +{ + unsigned int cpu; + + for_each_possible_cpu(cpu) { + struct delayed_work *work = per_cpu_ptr(&sl_reenable, cpu); + + INIT_DELAYED_WORK(work, __split_lock_reenable); + } + + return 0; +} +pure_initcall(setup_split_lock_delayed_work); /* * If a CPU goes offline with pending delayed work to re-enable split lock @@ -215,13 +243,14 @@ static void split_lock_warn(unsigned long ip) { struct delayed_work *work; int cpu; + unsigned int saved_sld_mitigate = READ_ONCE(sysctl_sld_mitigate); if (!current->reported_split_lock) pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n", current->comm, current->pid, ip); current->reported_split_lock = 1; - if (sysctl_sld_mitigate) { + if (saved_sld_mitigate) { /* * misery factor #1: * sleep 10ms before trying to execute split lock. @@ -234,12 +263,10 @@ static void split_lock_warn(unsigned long ip) */ if (down_interruptible(&buslock_sem) == -EINTR) return; - work = &sl_reenable_unlock; - } else { - work = &sl_reenable; } cpu = get_cpu(); + work = saved_sld_mitigate ? &sl_reenable_unlock : per_cpu_ptr(&sl_reenable, cpu); schedule_delayed_work_on(cpu, work, 2); /* Disable split lock detection on this CPU to make progress */ @@ -263,7 +290,7 @@ bool handle_guest_split_lock(unsigned long ip) force_sig_fault(SIGBUS, BUS_ADRALN, NULL); return false; } -EXPORT_SYMBOL_GPL(handle_guest_split_lock); +EXPORT_SYMBOL_FOR_KVM(handle_guest_split_lock); void bus_lock_init(void) { @@ -272,7 +299,7 @@ void bus_lock_init(void) if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) return; - rdmsrl(MSR_IA32_DEBUGCTLMSR, val); + rdmsrq(MSR_IA32_DEBUGCTLMSR, val); if ((boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && (sld_state == sld_warn || sld_state == sld_fatal)) || @@ -286,7 +313,7 @@ void bus_lock_init(void) val |= DEBUGCTLMSR_BUS_LOCK_DETECT; } - wrmsrl(MSR_IA32_DEBUGCTLMSR, val); + wrmsrq(MSR_IA32_DEBUGCTLMSR, val); } bool handle_user_split_lock(struct pt_regs *regs, long error_code) @@ -350,7 +377,7 @@ static void __init split_lock_setup(struct cpuinfo_x86 *c) * MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT is. All CPUs that set * it have split lock detection. */ - rdmsrl(MSR_IA32_CORE_CAPS, ia32_core_caps); + rdmsrq(MSR_IA32_CORE_CAPS, ia32_core_caps); if (ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT) goto supported; diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index 392d09c936d6..51a95b07831f 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -1,38 +1,28 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Routines to identify caches on Intel CPU. + * x86 CPU caches detection and configuration * - * Changes: - * Venkatesh Pallipadi : Adding cache identification through cpuid(4) - * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure. - * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD. + * Previous changes + * - Venkatesh Pallipadi: Cache identification through CPUID(0x4) + * - Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure + * - Andi Kleen / Andreas Herrmann: CPUID(0x4) emulation on AMD */ -#include <linux/slab.h> #include <linux/cacheinfo.h> #include <linux/cpu.h> #include <linux/cpuhotplug.h> -#include <linux/sched.h> -#include <linux/capability.h> -#include <linux/sysfs.h> -#include <linux/pci.h> #include <linux/stop_machine.h> -#include <asm/cpufeature.h> +#include <asm/amd/nb.h> #include <asm/cacheinfo.h> -#include <asm/amd_nb.h> -#include <asm/smp.h> +#include <asm/cpufeature.h> +#include <asm/cpuid/api.h> #include <asm/mtrr.h> +#include <asm/smp.h> #include <asm/tlbflush.h> #include "cpu.h" -#define LVL_1_INST 1 -#define LVL_1_DATA 2 -#define LVL_2 3 -#define LVL_3 4 -#define LVL_TRACE 5 - /* Shared last level cache maps */ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); @@ -44,214 +34,127 @@ static cpumask_var_t cpu_cacheinfo_mask; /* Kernel controls MTRR and/or PAT MSRs. */ unsigned int memory_caching_control __ro_after_init; -struct _cache_table { - unsigned char descriptor; - char cache_type; - short size; -}; - -#define MB(x) ((x) * 1024) - -/* All the cache descriptor types we care about (no TLB or - trace cache entries) */ - -static const struct _cache_table cache_table[] = -{ - { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */ - { 0x08, LVL_1_INST, 16 }, /* 4-way set assoc, 32 byte line size */ - { 0x09, LVL_1_INST, 32 }, /* 4-way set assoc, 64 byte line size */ - { 0x0a, LVL_1_DATA, 8 }, /* 2 way set assoc, 32 byte line size */ - { 0x0c, LVL_1_DATA, 16 }, /* 4-way set assoc, 32 byte line size */ - { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */ - { 0x0e, LVL_1_DATA, 24 }, /* 6-way set assoc, 64 byte line size */ - { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */ - { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x23, LVL_3, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x25, LVL_3, MB(2) }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x29, LVL_3, MB(4) }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x2c, LVL_1_DATA, 32 }, /* 8-way set assoc, 64 byte line size */ - { 0x30, LVL_1_INST, 32 }, /* 8-way set assoc, 64 byte line size */ - { 0x39, LVL_2, 128 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x3a, LVL_2, 192 }, /* 6-way set assoc, sectored cache, 64 byte line size */ - { 0x3b, LVL_2, 128 }, /* 2-way set assoc, sectored cache, 64 byte line size */ - { 0x3c, LVL_2, 256 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x3d, LVL_2, 384 }, /* 6-way set assoc, sectored cache, 64 byte line size */ - { 0x3e, LVL_2, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x3f, LVL_2, 256 }, /* 2-way set assoc, 64 byte line size */ - { 0x41, LVL_2, 128 }, /* 4-way set assoc, 32 byte line size */ - { 0x42, LVL_2, 256 }, /* 4-way set assoc, 32 byte line size */ - { 0x43, LVL_2, 512 }, /* 4-way set assoc, 32 byte line size */ - { 0x44, LVL_2, MB(1) }, /* 4-way set assoc, 32 byte line size */ - { 0x45, LVL_2, MB(2) }, /* 4-way set assoc, 32 byte line size */ - { 0x46, LVL_3, MB(4) }, /* 4-way set assoc, 64 byte line size */ - { 0x47, LVL_3, MB(8) }, /* 8-way set assoc, 64 byte line size */ - { 0x48, LVL_2, MB(3) }, /* 12-way set assoc, 64 byte line size */ - { 0x49, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */ - { 0x4a, LVL_3, MB(6) }, /* 12-way set assoc, 64 byte line size */ - { 0x4b, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */ - { 0x4c, LVL_3, MB(12) }, /* 12-way set assoc, 64 byte line size */ - { 0x4d, LVL_3, MB(16) }, /* 16-way set assoc, 64 byte line size */ - { 0x4e, LVL_2, MB(6) }, /* 24-way set assoc, 64 byte line size */ - { 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x68, LVL_1_DATA, 32 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x70, LVL_TRACE, 12 }, /* 8-way set assoc */ - { 0x71, LVL_TRACE, 16 }, /* 8-way set assoc */ - { 0x72, LVL_TRACE, 32 }, /* 8-way set assoc */ - { 0x73, LVL_TRACE, 64 }, /* 8-way set assoc */ - { 0x78, LVL_2, MB(1) }, /* 4-way set assoc, 64 byte line size */ - { 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x7b, LVL_2, 512 }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x7c, LVL_2, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x7d, LVL_2, MB(2) }, /* 8-way set assoc, 64 byte line size */ - { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */ - { 0x80, LVL_2, 512 }, /* 8-way set assoc, 64 byte line size */ - { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */ - { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */ - { 0x84, LVL_2, MB(1) }, /* 8-way set assoc, 32 byte line size */ - { 0x85, LVL_2, MB(2) }, /* 8-way set assoc, 32 byte line size */ - { 0x86, LVL_2, 512 }, /* 4-way set assoc, 64 byte line size */ - { 0x87, LVL_2, MB(1) }, /* 8-way set assoc, 64 byte line size */ - { 0xd0, LVL_3, 512 }, /* 4-way set assoc, 64 byte line size */ - { 0xd1, LVL_3, MB(1) }, /* 4-way set assoc, 64 byte line size */ - { 0xd2, LVL_3, MB(2) }, /* 4-way set assoc, 64 byte line size */ - { 0xd6, LVL_3, MB(1) }, /* 8-way set assoc, 64 byte line size */ - { 0xd7, LVL_3, MB(2) }, /* 8-way set assoc, 64 byte line size */ - { 0xd8, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */ - { 0xdc, LVL_3, MB(2) }, /* 12-way set assoc, 64 byte line size */ - { 0xdd, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */ - { 0xde, LVL_3, MB(8) }, /* 12-way set assoc, 64 byte line size */ - { 0xe2, LVL_3, MB(2) }, /* 16-way set assoc, 64 byte line size */ - { 0xe3, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */ - { 0xe4, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */ - { 0xea, LVL_3, MB(12) }, /* 24-way set assoc, 64 byte line size */ - { 0xeb, LVL_3, MB(18) }, /* 24-way set assoc, 64 byte line size */ - { 0xec, LVL_3, MB(24) }, /* 24-way set assoc, 64 byte line size */ - { 0x00, 0, 0} -}; - - enum _cache_type { - CTYPE_NULL = 0, - CTYPE_DATA = 1, - CTYPE_INST = 2, - CTYPE_UNIFIED = 3 + CTYPE_NULL = 0, + CTYPE_DATA = 1, + CTYPE_INST = 2, + CTYPE_UNIFIED = 3 }; union _cpuid4_leaf_eax { struct { - enum _cache_type type:5; - unsigned int level:3; - unsigned int is_self_initializing:1; - unsigned int is_fully_associative:1; - unsigned int reserved:4; - unsigned int num_threads_sharing:12; - unsigned int num_cores_on_die:6; + enum _cache_type type :5; + unsigned int level :3; + unsigned int is_self_initializing :1; + unsigned int is_fully_associative :1; + unsigned int reserved :4; + unsigned int num_threads_sharing :12; + unsigned int num_cores_on_die :6; } split; u32 full; }; union _cpuid4_leaf_ebx { struct { - unsigned int coherency_line_size:12; - unsigned int physical_line_partition:10; - unsigned int ways_of_associativity:10; + unsigned int coherency_line_size :12; + unsigned int physical_line_partition :10; + unsigned int ways_of_associativity :10; } split; u32 full; }; union _cpuid4_leaf_ecx { struct { - unsigned int number_of_sets:32; + unsigned int number_of_sets :32; } split; u32 full; }; -struct _cpuid4_info_regs { +struct _cpuid4_info { union _cpuid4_leaf_eax eax; union _cpuid4_leaf_ebx ebx; union _cpuid4_leaf_ecx ecx; unsigned int id; unsigned long size; - struct amd_northbridge *nb; }; -static unsigned short num_cache_leaves; +/* Map CPUID(0x4) EAX.cache_type to <linux/cacheinfo.h> types */ +static const enum cache_type cache_type_map[] = { + [CTYPE_NULL] = CACHE_TYPE_NOCACHE, + [CTYPE_DATA] = CACHE_TYPE_DATA, + [CTYPE_INST] = CACHE_TYPE_INST, + [CTYPE_UNIFIED] = CACHE_TYPE_UNIFIED, +}; + +/* + * Fallback AMD CPUID(0x4) emulation + * AMD CPUs with TOPOEXT can just use CPUID(0x8000001d) + * + * @AMD_L2_L3_INVALID_ASSOC: cache info for the respective L2/L3 cache should + * be determined from CPUID(0x8000001d) instead of CPUID(0x80000006). + */ -/* AMD doesn't have CPUID4. Emulate it here to report the same - information to the user. This makes some assumptions about the machine: - L2 not shared, no SMT etc. that is currently true on AMD CPUs. +#define AMD_CPUID4_FULLY_ASSOCIATIVE 0xffff +#define AMD_L2_L3_INVALID_ASSOC 0x9 - In theory the TLBs could be reported as fake type (they are in "dummy"). - Maybe later */ union l1_cache { struct { - unsigned line_size:8; - unsigned lines_per_tag:8; - unsigned assoc:8; - unsigned size_in_kb:8; + unsigned line_size :8; + unsigned lines_per_tag :8; + unsigned assoc :8; + unsigned size_in_kb :8; }; - unsigned val; + unsigned int val; }; union l2_cache { struct { - unsigned line_size:8; - unsigned lines_per_tag:4; - unsigned assoc:4; - unsigned size_in_kb:16; + unsigned line_size :8; + unsigned lines_per_tag :4; + unsigned assoc :4; + unsigned size_in_kb :16; }; - unsigned val; + unsigned int val; }; union l3_cache { struct { - unsigned line_size:8; - unsigned lines_per_tag:4; - unsigned assoc:4; - unsigned res:2; - unsigned size_encoded:14; + unsigned line_size :8; + unsigned lines_per_tag :4; + unsigned assoc :4; + unsigned res :2; + unsigned size_encoded :14; }; - unsigned val; + unsigned int val; }; +/* L2/L3 associativity mapping */ static const unsigned short assocs[] = { - [1] = 1, - [2] = 2, - [4] = 4, - [6] = 8, - [8] = 16, - [0xa] = 32, - [0xb] = 48, - [0xc] = 64, - [0xd] = 96, - [0xe] = 128, - [0xf] = 0xffff /* fully associative - no way to show this currently */ + [1] = 1, + [2] = 2, + [3] = 3, + [4] = 4, + [5] = 6, + [6] = 8, + [8] = 16, + [0xa] = 32, + [0xb] = 48, + [0xc] = 64, + [0xd] = 96, + [0xe] = 128, + [0xf] = AMD_CPUID4_FULLY_ASSOCIATIVE }; static const unsigned char levels[] = { 1, 1, 2, 3 }; -static const unsigned char types[] = { 1, 2, 3, 3 }; - -static const enum cache_type cache_type_map[] = { - [CTYPE_NULL] = CACHE_TYPE_NOCACHE, - [CTYPE_DATA] = CACHE_TYPE_DATA, - [CTYPE_INST] = CACHE_TYPE_INST, - [CTYPE_UNIFIED] = CACHE_TYPE_UNIFIED, -}; +static const unsigned char types[] = { 1, 2, 3, 3 }; -static void -amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, - union _cpuid4_leaf_ebx *ebx, - union _cpuid4_leaf_ecx *ecx) +static void legacy_amd_cpuid4(int index, union _cpuid4_leaf_eax *eax, + union _cpuid4_leaf_ebx *ebx, union _cpuid4_leaf_ecx *ecx) { - unsigned dummy; - unsigned line_size, lines_per_tag, assoc, size_in_kb; - union l1_cache l1i, l1d; + unsigned int dummy, line_size, lines_per_tag, assoc, size_in_kb; + union l1_cache l1i, l1d, *l1; union l2_cache l2; union l3_cache l3; - union l1_cache *l1 = &l1d; eax->full = 0; ebx->full = 0; @@ -260,629 +163,333 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, cpuid(0x80000005, &dummy, &dummy, &l1d.val, &l1i.val); cpuid(0x80000006, &dummy, &dummy, &l2.val, &l3.val); - switch (leaf) { + l1 = &l1d; + switch (index) { case 1: l1 = &l1i; fallthrough; case 0: if (!l1->val) return; - assoc = assocs[l1->assoc]; - line_size = l1->line_size; - lines_per_tag = l1->lines_per_tag; - size_in_kb = l1->size_in_kb; + + assoc = (l1->assoc == 0xff) ? AMD_CPUID4_FULLY_ASSOCIATIVE : l1->assoc; + line_size = l1->line_size; + lines_per_tag = l1->lines_per_tag; + size_in_kb = l1->size_in_kb; break; case 2: - if (!l2.val) + if (!l2.assoc || l2.assoc == AMD_L2_L3_INVALID_ASSOC) return; - assoc = assocs[l2.assoc]; - line_size = l2.line_size; - lines_per_tag = l2.lines_per_tag; - /* cpu_data has errata corrections for K7 applied */ - size_in_kb = __this_cpu_read(cpu_info.x86_cache_size); + + /* Use x86_cache_size as it might have K7 errata fixes */ + assoc = assocs[l2.assoc]; + line_size = l2.line_size; + lines_per_tag = l2.lines_per_tag; + size_in_kb = __this_cpu_read(cpu_info.x86_cache_size); break; case 3: - if (!l3.val) + if (!l3.assoc || l3.assoc == AMD_L2_L3_INVALID_ASSOC) return; - assoc = assocs[l3.assoc]; - line_size = l3.line_size; - lines_per_tag = l3.lines_per_tag; - size_in_kb = l3.size_encoded * 512; + + assoc = assocs[l3.assoc]; + line_size = l3.line_size; + lines_per_tag = l3.lines_per_tag; + size_in_kb = l3.size_encoded * 512; if (boot_cpu_has(X86_FEATURE_AMD_DCM)) { - size_in_kb = size_in_kb >> 1; - assoc = assoc >> 1; + size_in_kb = size_in_kb >> 1; + assoc = assoc >> 1; } break; default: return; } - eax->split.is_self_initializing = 1; - eax->split.type = types[leaf]; - eax->split.level = levels[leaf]; - eax->split.num_threads_sharing = 0; - eax->split.num_cores_on_die = topology_num_cores_per_package(); - + eax->split.is_self_initializing = 1; + eax->split.type = types[index]; + eax->split.level = levels[index]; + eax->split.num_threads_sharing = 0; + eax->split.num_cores_on_die = topology_num_cores_per_package(); - if (assoc == 0xffff) + if (assoc == AMD_CPUID4_FULLY_ASSOCIATIVE) eax->split.is_fully_associative = 1; - ebx->split.coherency_line_size = line_size - 1; - ebx->split.ways_of_associativity = assoc - 1; - ebx->split.physical_line_partition = lines_per_tag - 1; - ecx->split.number_of_sets = (size_in_kb * 1024) / line_size / - (ebx->split.ways_of_associativity + 1) - 1; -} - -#if defined(CONFIG_AMD_NB) && defined(CONFIG_SYSFS) - -/* - * L3 cache descriptors - */ -static void amd_calc_l3_indices(struct amd_northbridge *nb) -{ - struct amd_l3_cache *l3 = &nb->l3_cache; - unsigned int sc0, sc1, sc2, sc3; - u32 val = 0; - - pci_read_config_dword(nb->misc, 0x1C4, &val); - - /* calculate subcache sizes */ - l3->subcaches[0] = sc0 = !(val & BIT(0)); - l3->subcaches[1] = sc1 = !(val & BIT(4)); - - if (boot_cpu_data.x86 == 0x15) { - l3->subcaches[0] = sc0 += !(val & BIT(1)); - l3->subcaches[1] = sc1 += !(val & BIT(5)); - } - - l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9)); - l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13)); - - l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; -} - -/* - * check whether a slot used for disabling an L3 index is occupied. - * @l3: L3 cache descriptor - * @slot: slot number (0..1) - * - * @returns: the disabled index if used or negative value if slot free. - */ -static int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned slot) -{ - unsigned int reg = 0; - - pci_read_config_dword(nb->misc, 0x1BC + slot * 4, ®); - - /* check whether this slot is activated already */ - if (reg & (3UL << 30)) - return reg & 0xfff; - - return -1; -} - -static ssize_t show_cache_disable(struct cacheinfo *this_leaf, char *buf, - unsigned int slot) -{ - int index; - struct amd_northbridge *nb = this_leaf->priv; - - index = amd_get_l3_disable_slot(nb, slot); - if (index >= 0) - return sprintf(buf, "%d\n", index); - - return sprintf(buf, "FREE\n"); -} - -#define SHOW_CACHE_DISABLE(slot) \ -static ssize_t \ -cache_disable_##slot##_show(struct device *dev, \ - struct device_attribute *attr, char *buf) \ -{ \ - struct cacheinfo *this_leaf = dev_get_drvdata(dev); \ - return show_cache_disable(this_leaf, buf, slot); \ -} -SHOW_CACHE_DISABLE(0) -SHOW_CACHE_DISABLE(1) - -static void amd_l3_disable_index(struct amd_northbridge *nb, int cpu, - unsigned slot, unsigned long idx) -{ - int i; - - idx |= BIT(30); - - /* - * disable index in all 4 subcaches - */ - for (i = 0; i < 4; i++) { - u32 reg = idx | (i << 20); - if (!nb->l3_cache.subcaches[i]) - continue; - - pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg); - - /* - * We need to WBINVD on a core on the node containing the L3 - * cache which indices we disable therefore a simple wbinvd() - * is not sufficient. - */ - wbinvd_on_cpu(cpu); - - reg |= BIT(31); - pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg); - } -} - -/* - * disable a L3 cache index by using a disable-slot - * - * @l3: L3 cache descriptor - * @cpu: A CPU on the node containing the L3 cache - * @slot: slot number (0..1) - * @index: index to disable - * - * @return: 0 on success, error status on failure - */ -static int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, - unsigned slot, unsigned long index) -{ - int ret = 0; - - /* check if @slot is already used or the index is already disabled */ - ret = amd_get_l3_disable_slot(nb, slot); - if (ret >= 0) - return -EEXIST; - - if (index > nb->l3_cache.indices) - return -EINVAL; - - /* check whether the other slot has disabled the same index already */ - if (index == amd_get_l3_disable_slot(nb, !slot)) - return -EEXIST; - - amd_l3_disable_index(nb, cpu, slot, index); - - return 0; -} - -static ssize_t store_cache_disable(struct cacheinfo *this_leaf, - const char *buf, size_t count, - unsigned int slot) -{ - unsigned long val = 0; - int cpu, err = 0; - struct amd_northbridge *nb = this_leaf->priv; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - cpu = cpumask_first(&this_leaf->shared_cpu_map); - - if (kstrtoul(buf, 10, &val) < 0) - return -EINVAL; - - err = amd_set_l3_disable_slot(nb, cpu, slot, val); - if (err) { - if (err == -EEXIST) - pr_warn("L3 slot %d in use/index already disabled!\n", - slot); - return err; - } - return count; -} - -#define STORE_CACHE_DISABLE(slot) \ -static ssize_t \ -cache_disable_##slot##_store(struct device *dev, \ - struct device_attribute *attr, \ - const char *buf, size_t count) \ -{ \ - struct cacheinfo *this_leaf = dev_get_drvdata(dev); \ - return store_cache_disable(this_leaf, buf, count, slot); \ -} -STORE_CACHE_DISABLE(0) -STORE_CACHE_DISABLE(1) - -static ssize_t subcaches_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct cacheinfo *this_leaf = dev_get_drvdata(dev); - int cpu = cpumask_first(&this_leaf->shared_cpu_map); - - return sprintf(buf, "%x\n", amd_get_subcaches(cpu)); -} - -static ssize_t subcaches_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct cacheinfo *this_leaf = dev_get_drvdata(dev); - int cpu = cpumask_first(&this_leaf->shared_cpu_map); - unsigned long val; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (kstrtoul(buf, 16, &val) < 0) - return -EINVAL; - - if (amd_set_subcaches(cpu, val)) - return -EINVAL; - - return count; + ebx->split.coherency_line_size = line_size - 1; + ebx->split.ways_of_associativity = assoc - 1; + ebx->split.physical_line_partition = lines_per_tag - 1; + ecx->split.number_of_sets = (size_in_kb * 1024) / line_size / + (ebx->split.ways_of_associativity + 1) - 1; } -static DEVICE_ATTR_RW(cache_disable_0); -static DEVICE_ATTR_RW(cache_disable_1); -static DEVICE_ATTR_RW(subcaches); - -static umode_t -cache_private_attrs_is_visible(struct kobject *kobj, - struct attribute *attr, int unused) +static int cpuid4_info_fill_done(struct _cpuid4_info *id4, union _cpuid4_leaf_eax eax, + union _cpuid4_leaf_ebx ebx, union _cpuid4_leaf_ecx ecx) { - struct device *dev = kobj_to_dev(kobj); - struct cacheinfo *this_leaf = dev_get_drvdata(dev); - umode_t mode = attr->mode; - - if (!this_leaf->priv) - return 0; - - if ((attr == &dev_attr_subcaches.attr) && - amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) - return mode; + if (eax.split.type == CTYPE_NULL) + return -EIO; - if ((attr == &dev_attr_cache_disable_0.attr || - attr == &dev_attr_cache_disable_1.attr) && - amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) - return mode; + id4->eax = eax; + id4->ebx = ebx; + id4->ecx = ecx; + id4->size = (ecx.split.number_of_sets + 1) * + (ebx.split.coherency_line_size + 1) * + (ebx.split.physical_line_partition + 1) * + (ebx.split.ways_of_associativity + 1); return 0; } -static struct attribute_group cache_private_group = { - .is_visible = cache_private_attrs_is_visible, -}; - -static void init_amd_l3_attrs(void) +static int amd_fill_cpuid4_info(int index, struct _cpuid4_info *id4) { - int n = 1; - static struct attribute **amd_l3_attrs; - - if (amd_l3_attrs) /* already initialized */ - return; - - if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) - n += 2; - if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) - n += 1; - - amd_l3_attrs = kcalloc(n, sizeof(*amd_l3_attrs), GFP_KERNEL); - if (!amd_l3_attrs) - return; - - n = 0; - if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) { - amd_l3_attrs[n++] = &dev_attr_cache_disable_0.attr; - amd_l3_attrs[n++] = &dev_attr_cache_disable_1.attr; - } - if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) - amd_l3_attrs[n++] = &dev_attr_subcaches.attr; - - cache_private_group.attrs = amd_l3_attrs; -} - -const struct attribute_group * -cache_get_priv_group(struct cacheinfo *this_leaf) -{ - struct amd_northbridge *nb = this_leaf->priv; - - if (this_leaf->level < 3 || !nb) - return NULL; + union _cpuid4_leaf_eax eax; + union _cpuid4_leaf_ebx ebx; + union _cpuid4_leaf_ecx ecx; + u32 ignored; - if (nb && nb->l3_cache.indices) - init_amd_l3_attrs(); + if (boot_cpu_has(X86_FEATURE_TOPOEXT) || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) + cpuid_count(0x8000001d, index, &eax.full, &ebx.full, &ecx.full, &ignored); + else + legacy_amd_cpuid4(index, &eax, &ebx, &ecx); - return &cache_private_group; + return cpuid4_info_fill_done(id4, eax, ebx, ecx); } -static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index) +static int intel_fill_cpuid4_info(int index, struct _cpuid4_info *id4) { - int node; + union _cpuid4_leaf_eax eax; + union _cpuid4_leaf_ebx ebx; + union _cpuid4_leaf_ecx ecx; + u32 ignored; - /* only for L3, and not in virtualized environments */ - if (index < 3) - return; + cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &ignored); - node = topology_amd_node_id(smp_processor_id()); - this_leaf->nb = node_to_amd_nb(node); - if (this_leaf->nb && !this_leaf->nb->l3_cache.indices) - amd_calc_l3_indices(this_leaf->nb); + return cpuid4_info_fill_done(id4, eax, ebx, ecx); } -#else -#define amd_init_l3_cache(x, y) -#endif /* CONFIG_AMD_NB && CONFIG_SYSFS */ -static int -cpuid4_cache_lookup_regs(int index, struct _cpuid4_info_regs *this_leaf) +static int fill_cpuid4_info(int index, struct _cpuid4_info *id4) { - union _cpuid4_leaf_eax eax; - union _cpuid4_leaf_ebx ebx; - union _cpuid4_leaf_ecx ecx; - unsigned edx; - - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { - if (boot_cpu_has(X86_FEATURE_TOPOEXT)) - cpuid_count(0x8000001d, index, &eax.full, - &ebx.full, &ecx.full, &edx); - else - amd_cpuid4(index, &eax, &ebx, &ecx); - amd_init_l3_cache(this_leaf, index); - } else if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { - cpuid_count(0x8000001d, index, &eax.full, - &ebx.full, &ecx.full, &edx); - amd_init_l3_cache(this_leaf, index); - } else { - cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); - } + u8 cpu_vendor = boot_cpu_data.x86_vendor; - if (eax.split.type == CTYPE_NULL) - return -EIO; /* better error ? */ - - this_leaf->eax = eax; - this_leaf->ebx = ebx; - this_leaf->ecx = ecx; - this_leaf->size = (ecx.split.number_of_sets + 1) * - (ebx.split.coherency_line_size + 1) * - (ebx.split.physical_line_partition + 1) * - (ebx.split.ways_of_associativity + 1); - return 0; + return (cpu_vendor == X86_VENDOR_AMD || cpu_vendor == X86_VENDOR_HYGON) ? + amd_fill_cpuid4_info(index, id4) : + intel_fill_cpuid4_info(index, id4); } static int find_num_cache_leaves(struct cpuinfo_x86 *c) { - unsigned int eax, ebx, ecx, edx, op; - union _cpuid4_leaf_eax cache_eax; - int i = -1; - - if (c->x86_vendor == X86_VENDOR_AMD || - c->x86_vendor == X86_VENDOR_HYGON) - op = 0x8000001d; - else - op = 4; + unsigned int eax, ebx, ecx, edx, op; + union _cpuid4_leaf_eax cache_eax; + int i = -1; + /* Do a CPUID(op) loop to calculate num_cache_leaves */ + op = (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) ? 0x8000001d : 4; do { ++i; - /* Do cpuid(op) loop to find out num_cache_leaves */ cpuid_count(op, i, &eax, &ebx, &ecx, &edx); cache_eax.full = eax; } while (cache_eax.split.type != CTYPE_NULL); return i; } +/* + * The max shared threads number comes from CPUID(0x4) EAX[25-14] with input + * ECX as cache index. Then right shift apicid by the number's order to get + * cache id for this cache node. + */ +static unsigned int get_cache_id(u32 apicid, const struct _cpuid4_info *id4) +{ + unsigned long num_threads_sharing; + int index_msb; + + num_threads_sharing = 1 + id4->eax.split.num_threads_sharing; + index_msb = get_count_order(num_threads_sharing); + + return apicid >> index_msb; +} + +/* + * AMD/Hygon CPUs may have multiple LLCs if L3 caches exist. + */ + void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, u16 die_id) { - /* - * We may have multiple LLCs if L3 caches exist, so check if we - * have an L3 cache by looking at the L3 cache CPUID leaf. - */ - if (!cpuid_edx(0x80000006)) + if (!cpuid_amd_hygon_has_l3_cache()) return; if (c->x86 < 0x17) { - /* LLC is at the node level. */ + /* Pre-Zen: LLC is at the node level */ c->topo.llc_id = die_id; } else if (c->x86 == 0x17 && c->x86_model <= 0x1F) { /* - * LLC is at the core complex level. - * Core complex ID is ApicId[3] for these processors. + * Family 17h up to 1F models: LLC is at the core + * complex level. Core complex ID is ApicId[3]. */ c->topo.llc_id = c->topo.apicid >> 3; } else { /* - * LLC ID is calculated from the number of threads sharing the - * cache. - * */ - u32 eax, ebx, ecx, edx, num_sharing_cache = 0; + * Newer families: LLC ID is calculated from the number + * of threads sharing the L3 cache. + */ u32 llc_index = find_num_cache_leaves(c) - 1; + struct _cpuid4_info id4 = {}; - cpuid_count(0x8000001d, llc_index, &eax, &ebx, &ecx, &edx); - if (eax) - num_sharing_cache = ((eax >> 14) & 0xfff) + 1; - - if (num_sharing_cache) { - int bits = get_count_order(num_sharing_cache); - - c->topo.llc_id = c->topo.apicid >> bits; - } + if (!amd_fill_cpuid4_info(llc_index, &id4)) + c->topo.llc_id = get_cache_id(c->topo.apicid, &id4); } } void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c) { - /* - * We may have multiple LLCs if L3 caches exist, so check if we - * have an L3 cache by looking at the L3 cache CPUID leaf. - */ - if (!cpuid_edx(0x80000006)) + if (!cpuid_amd_hygon_has_l3_cache()) return; /* - * LLC is at the core complex level. - * Core complex ID is ApicId[3] for these processors. + * Hygons are similar to AMD Family 17h up to 1F models: LLC is + * at the core complex level. Core complex ID is ApicId[3]. */ c->topo.llc_id = c->topo.apicid >> 3; } void init_amd_cacheinfo(struct cpuinfo_x86 *c) { + struct cpu_cacheinfo *ci = get_cpu_cacheinfo(c->cpu_index); - if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { - num_cache_leaves = find_num_cache_leaves(c); - } else if (c->extended_cpuid_level >= 0x80000006) { - if (cpuid_edx(0x80000006) & 0xf000) - num_cache_leaves = 4; - else - num_cache_leaves = 3; - } + if (boot_cpu_has(X86_FEATURE_TOPOEXT)) + ci->num_leaves = find_num_cache_leaves(c); + else if (c->extended_cpuid_level >= 0x80000006) + ci->num_leaves = (cpuid_edx(0x80000006) & 0xf000) ? 4 : 3; } void init_hygon_cacheinfo(struct cpuinfo_x86 *c) { - num_cache_leaves = find_num_cache_leaves(c); + struct cpu_cacheinfo *ci = get_cpu_cacheinfo(c->cpu_index); + + ci->num_leaves = find_num_cache_leaves(c); } -void init_intel_cacheinfo(struct cpuinfo_x86 *c) +static void intel_cacheinfo_done(struct cpuinfo_x86 *c, unsigned int l3, + unsigned int l2, unsigned int l1i, unsigned int l1d) { - /* Cache sizes */ - unsigned int l1i = 0, l1d = 0, l2 = 0, l3 = 0; - unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */ - unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */ - unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb; + /* + * If llc_id is still unset, then cpuid_level < 4, which implies + * that the only possibility left is SMT. Since CPUID(0x2) doesn't + * specify any shared caches and SMT shares all caches, we can + * unconditionally set LLC ID to the package ID so that all + * threads share it. + */ + if (c->topo.llc_id == BAD_APICID) + c->topo.llc_id = c->topo.pkg_id; - if (c->cpuid_level > 3) { - static int is_initialized; + c->x86_cache_size = l3 ? l3 : (l2 ? l2 : l1i + l1d); - if (is_initialized == 0) { - /* Init num_cache_leaves from boot CPU */ - num_cache_leaves = find_num_cache_leaves(c); - is_initialized++; - } + if (!l2) + cpu_detect_cache_sizes(c); +} - /* - * Whenever possible use cpuid(4), deterministic cache - * parameters cpuid leaf to find the cache details - */ - for (i = 0; i < num_cache_leaves; i++) { - struct _cpuid4_info_regs this_leaf = {}; - int retval; +/* + * Legacy Intel CPUID(0x2) path if CPUID(0x4) is not available. + */ +static void intel_cacheinfo_0x2(struct cpuinfo_x86 *c) +{ + unsigned int l1i = 0, l1d = 0, l2 = 0, l3 = 0; + const struct leaf_0x2_table *desc; + union leaf_0x2_regs regs; + u8 *ptr; - retval = cpuid4_cache_lookup_regs(i, &this_leaf); - if (retval < 0) - continue; + if (c->cpuid_level < 2) + return; - switch (this_leaf.eax.split.level) { - case 1: - if (this_leaf.eax.split.type == CTYPE_DATA) - new_l1d = this_leaf.size/1024; - else if (this_leaf.eax.split.type == CTYPE_INST) - new_l1i = this_leaf.size/1024; - break; - case 2: - new_l2 = this_leaf.size/1024; - num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; - index_msb = get_count_order(num_threads_sharing); - l2_id = c->topo.apicid & ~((1 << index_msb) - 1); - break; - case 3: - new_l3 = this_leaf.size/1024; - num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; - index_msb = get_count_order(num_threads_sharing); - l3_id = c->topo.apicid & ~((1 << index_msb) - 1); - break; - default: - break; - } + cpuid_leaf_0x2(®s); + for_each_cpuid_0x2_desc(regs, ptr, desc) { + switch (desc->c_type) { + case CACHE_L1_INST: l1i += desc->c_size; break; + case CACHE_L1_DATA: l1d += desc->c_size; break; + case CACHE_L2: l2 += desc->c_size; break; + case CACHE_L3: l3 += desc->c_size; break; } } + + intel_cacheinfo_done(c, l3, l2, l1i, l1d); +} + +static unsigned int calc_cache_topo_id(struct cpuinfo_x86 *c, const struct _cpuid4_info *id4) +{ + unsigned int num_threads_sharing; + int index_msb; + + num_threads_sharing = 1 + id4->eax.split.num_threads_sharing; + index_msb = get_count_order(num_threads_sharing); + return c->topo.apicid & ~((1 << index_msb) - 1); +} + +static bool intel_cacheinfo_0x4(struct cpuinfo_x86 *c) +{ + struct cpu_cacheinfo *ci = get_cpu_cacheinfo(c->cpu_index); + unsigned int l2_id = BAD_APICID, l3_id = BAD_APICID; + unsigned int l1d = 0, l1i = 0, l2 = 0, l3 = 0; + + if (c->cpuid_level < 4) + return false; + /* - * Don't use cpuid2 if cpuid4 is supported. For P4, we use cpuid2 for - * trace cache + * There should be at least one leaf. A non-zero value means + * that the number of leaves has been previously initialized. */ - if ((num_cache_leaves == 0 || c->x86 == 15) && c->cpuid_level > 1) { - /* supports eax=2 call */ - int j, n; - unsigned int regs[4]; - unsigned char *dp = (unsigned char *)regs; - int only_trace = 0; - - if (num_cache_leaves != 0 && c->x86 == 15) - only_trace = 1; - - /* Number of times to iterate */ - n = cpuid_eax(2) & 0xFF; - - for (i = 0 ; i < n ; i++) { - cpuid(2, ®s[0], ®s[1], ®s[2], ®s[3]); - - /* If bit 31 is set, this is an unknown format */ - for (j = 0 ; j < 3 ; j++) - if (regs[j] & (1 << 31)) - regs[j] = 0; - - /* Byte 0 is level count, not a descriptor */ - for (j = 1 ; j < 16 ; j++) { - unsigned char des = dp[j]; - unsigned char k = 0; - - /* look up this descriptor in the table */ - while (cache_table[k].descriptor != 0) { - if (cache_table[k].descriptor == des) { - if (only_trace && cache_table[k].cache_type != LVL_TRACE) - break; - switch (cache_table[k].cache_type) { - case LVL_1_INST: - l1i += cache_table[k].size; - break; - case LVL_1_DATA: - l1d += cache_table[k].size; - break; - case LVL_2: - l2 += cache_table[k].size; - break; - case LVL_3: - l3 += cache_table[k].size; - break; - } - - break; - } - - k++; - } - } - } - } + if (!ci->num_leaves) + ci->num_leaves = find_num_cache_leaves(c); - if (new_l1d) - l1d = new_l1d; + if (!ci->num_leaves) + return false; - if (new_l1i) - l1i = new_l1i; + for (int i = 0; i < ci->num_leaves; i++) { + struct _cpuid4_info id4 = {}; + int ret; - if (new_l2) { - l2 = new_l2; - c->topo.llc_id = l2_id; - c->topo.l2c_id = l2_id; - } + ret = intel_fill_cpuid4_info(i, &id4); + if (ret < 0) + continue; - if (new_l3) { - l3 = new_l3; - c->topo.llc_id = l3_id; + switch (id4.eax.split.level) { + case 1: + if (id4.eax.split.type == CTYPE_DATA) + l1d = id4.size / 1024; + else if (id4.eax.split.type == CTYPE_INST) + l1i = id4.size / 1024; + break; + case 2: + l2 = id4.size / 1024; + l2_id = calc_cache_topo_id(c, &id4); + break; + case 3: + l3 = id4.size / 1024; + l3_id = calc_cache_topo_id(c, &id4); + break; + default: + break; + } } - /* - * If llc_id is not yet set, this means cpuid_level < 4 which in - * turns means that the only possibility is SMT (as indicated in - * cpuid1). Since cpuid2 doesn't specify shared caches, and we know - * that SMT shares all caches, we can unconditionally set cpu_llc_id to - * c->topo.pkg_id. - */ - if (c->topo.llc_id == BAD_APICID) - c->topo.llc_id = c->topo.pkg_id; + c->topo.l2c_id = l2_id; + c->topo.llc_id = (l3_id == BAD_APICID) ? l2_id : l3_id; + intel_cacheinfo_done(c, l3, l2, l1i, l1d); + return true; +} - c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d)); +void init_intel_cacheinfo(struct cpuinfo_x86 *c) +{ + /* Don't use CPUID(0x2) if CPUID(0x4) is supported. */ + if (intel_cacheinfo_0x4(c)) + return; - if (!l2) - cpu_detect_cache_sizes(c); + intel_cacheinfo_0x2(c); } +/* + * <linux/cacheinfo.h> shared_cpu_map setup, AMD/Hygon + */ static int __cache_amd_cpumap_setup(unsigned int cpu, int index, - struct _cpuid4_info_regs *base) + const struct _cpuid4_info *id4) { struct cpu_cacheinfo *this_cpu_ci; - struct cacheinfo *this_leaf; + struct cacheinfo *ci; int i, sibling; /* @@ -894,18 +501,18 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index, this_cpu_ci = get_cpu_cacheinfo(i); if (!this_cpu_ci->info_list) continue; - this_leaf = this_cpu_ci->info_list + index; + + ci = this_cpu_ci->info_list + index; for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) { if (!cpu_online(sibling)) continue; - cpumask_set_cpu(sibling, - &this_leaf->shared_cpu_map); + cpumask_set_cpu(sibling, &ci->shared_cpu_map); } } } else if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { unsigned int apicid, nshared, first, last; - nshared = base->eax.split.num_threads_sharing + 1; + nshared = id4->eax.split.num_threads_sharing + 1; apicid = cpu_data(cpu).topo.apicid; first = apicid - (apicid % nshared); last = first + nshared - 1; @@ -919,14 +526,13 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index, if ((apicid < first) || (apicid > last)) continue; - this_leaf = this_cpu_ci->info_list + index; + ci = this_cpu_ci->info_list + index; for_each_online_cpu(sibling) { apicid = cpu_data(sibling).topo.apicid; if ((apicid < first) || (apicid > last)) continue; - cpumask_set_cpu(sibling, - &this_leaf->shared_cpu_map); + cpumask_set_cpu(sibling, &ci->shared_cpu_map); } } } else @@ -935,25 +541,27 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index, return 1; } +/* + * <linux/cacheinfo.h> shared_cpu_map setup, Intel + fallback AMD/Hygon + */ static void __cache_cpumap_setup(unsigned int cpu, int index, - struct _cpuid4_info_regs *base) + const struct _cpuid4_info *id4) { struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); - struct cacheinfo *this_leaf, *sibling_leaf; + struct cpuinfo_x86 *c = &cpu_data(cpu); + struct cacheinfo *ci, *sibling_ci; unsigned long num_threads_sharing; int index_msb, i; - struct cpuinfo_x86 *c = &cpu_data(cpu); - if (c->x86_vendor == X86_VENDOR_AMD || - c->x86_vendor == X86_VENDOR_HYGON) { - if (__cache_amd_cpumap_setup(cpu, index, base)) + if (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) { + if (__cache_amd_cpumap_setup(cpu, index, id4)) return; } - this_leaf = this_cpu_ci->info_list + index; - num_threads_sharing = 1 + base->eax.split.num_threads_sharing; + ci = this_cpu_ci->info_list + index; + num_threads_sharing = 1 + id4->eax.split.num_threads_sharing; - cpumask_set_cpu(cpu, &this_leaf->shared_cpu_map); + cpumask_set_cpu(cpu, &ci->shared_cpu_map); if (num_threads_sharing == 1) return; @@ -963,78 +571,67 @@ static void __cache_cpumap_setup(unsigned int cpu, int index, if (cpu_data(i).topo.apicid >> index_msb == c->topo.apicid >> index_msb) { struct cpu_cacheinfo *sib_cpu_ci = get_cpu_cacheinfo(i); + /* Skip if itself or no cacheinfo */ if (i == cpu || !sib_cpu_ci->info_list) - continue;/* skip if itself or no cacheinfo */ - sibling_leaf = sib_cpu_ci->info_list + index; - cpumask_set_cpu(i, &this_leaf->shared_cpu_map); - cpumask_set_cpu(cpu, &sibling_leaf->shared_cpu_map); + continue; + + sibling_ci = sib_cpu_ci->info_list + index; + cpumask_set_cpu(i, &ci->shared_cpu_map); + cpumask_set_cpu(cpu, &sibling_ci->shared_cpu_map); } } -static void ci_leaf_init(struct cacheinfo *this_leaf, - struct _cpuid4_info_regs *base) +static void ci_info_init(struct cacheinfo *ci, const struct _cpuid4_info *id4, + struct amd_northbridge *nb) { - this_leaf->id = base->id; - this_leaf->attributes = CACHE_ID; - this_leaf->level = base->eax.split.level; - this_leaf->type = cache_type_map[base->eax.split.type]; - this_leaf->coherency_line_size = - base->ebx.split.coherency_line_size + 1; - this_leaf->ways_of_associativity = - base->ebx.split.ways_of_associativity + 1; - this_leaf->size = base->size; - this_leaf->number_of_sets = base->ecx.split.number_of_sets + 1; - this_leaf->physical_line_partition = - base->ebx.split.physical_line_partition + 1; - this_leaf->priv = base->nb; + ci->id = id4->id; + ci->attributes = CACHE_ID; + ci->level = id4->eax.split.level; + ci->type = cache_type_map[id4->eax.split.type]; + ci->coherency_line_size = id4->ebx.split.coherency_line_size + 1; + ci->ways_of_associativity = id4->ebx.split.ways_of_associativity + 1; + ci->size = id4->size; + ci->number_of_sets = id4->ecx.split.number_of_sets + 1; + ci->physical_line_partition = id4->ebx.split.physical_line_partition + 1; + ci->priv = nb; } int init_cache_level(unsigned int cpu) { - struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); + struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu); - if (!num_cache_leaves) + /* There should be at least one leaf. */ + if (!ci->num_leaves) return -ENOENT; - if (!this_cpu_ci) - return -EINVAL; - this_cpu_ci->num_levels = 3; - this_cpu_ci->num_leaves = num_cache_leaves; - return 0; -} - -/* - * The max shared threads number comes from CPUID.4:EAX[25-14] with input - * ECX as cache index. Then right shift apicid by the number's order to get - * cache id for this cache node. - */ -static void get_cache_id(int cpu, struct _cpuid4_info_regs *id4_regs) -{ - struct cpuinfo_x86 *c = &cpu_data(cpu); - unsigned long num_threads_sharing; - int index_msb; - num_threads_sharing = 1 + id4_regs->eax.split.num_threads_sharing; - index_msb = get_count_order(num_threads_sharing); - id4_regs->id = c->topo.apicid >> index_msb; + return 0; } int populate_cache_leaves(unsigned int cpu) { - unsigned int idx, ret; struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); - struct cacheinfo *this_leaf = this_cpu_ci->info_list; - struct _cpuid4_info_regs id4_regs = {}; + struct cacheinfo *ci = this_cpu_ci->info_list; + u8 cpu_vendor = boot_cpu_data.x86_vendor; + u32 apicid = cpu_data(cpu).topo.apicid; + struct amd_northbridge *nb = NULL; + struct _cpuid4_info id4 = {}; + int idx, ret; for (idx = 0; idx < this_cpu_ci->num_leaves; idx++) { - ret = cpuid4_cache_lookup_regs(idx, &id4_regs); + ret = fill_cpuid4_info(idx, &id4); if (ret) return ret; - get_cache_id(cpu, &id4_regs); - ci_leaf_init(this_leaf++, &id4_regs); - __cache_cpumap_setup(cpu, idx, &id4_regs); + + id4.id = get_cache_id(apicid, &id4); + + if (cpu_vendor == X86_VENDOR_AMD || cpu_vendor == X86_VENDOR_HYGON) + nb = amd_init_l3_cache(idx); + + ci_info_init(ci++, &id4, nb); + __cache_cpumap_setup(cpu, idx, &id4); } - this_cpu_ci->cpu_map_populated = true; + this_cpu_ci->cpu_map_populated = true; return 0; } @@ -1050,31 +647,33 @@ int populate_cache_leaves(unsigned int cpu) static unsigned long saved_cr4; static DEFINE_RAW_SPINLOCK(cache_disable_lock); +/* + * Cache flushing is the most time-consuming step when programming the + * MTRRs. On many Intel CPUs without known erratas, it can be skipped + * if the CPU declares cache self-snooping support. + */ +static void maybe_flush_caches(void) +{ + if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) + wbinvd(); +} + void cache_disable(void) __acquires(cache_disable_lock) { unsigned long cr0; /* - * Note that this is not ideal - * since the cache is only flushed/disabled for this CPU while the - * MTRRs are changed, but changing this requires more invasive - * changes to the way the kernel boots + * This is not ideal since the cache is only flushed/disabled + * for this CPU while the MTRRs are changed, but changing this + * requires more invasive changes to the way the kernel boots. */ - raw_spin_lock(&cache_disable_lock); /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ cr0 = read_cr0() | X86_CR0_CD; write_cr0(cr0); - /* - * Cache flushing is the most time-consuming step when programming - * the MTRRs. Fortunately, as per the Intel Software Development - * Manual, we can skip it if the processor supports cache self- - * snooping. - */ - if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) - wbinvd(); + maybe_flush_caches(); /* Save value of CR4 and clear Page Global Enable (bit 7) */ if (cpu_feature_enabled(X86_FEATURE_PGE)) { @@ -1089,9 +688,7 @@ void cache_disable(void) __acquires(cache_disable_lock) if (cpu_feature_enabled(X86_FEATURE_MTRR)) mtrr_disable(); - /* Again, only flush caches if we have to. */ - if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) - wbinvd(); + maybe_flush_caches(); } void cache_enable(void) __releases(cache_disable_lock) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index ca327cfa42ae..e7ab22fce3b5 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -7,6 +7,7 @@ #include <linux/bitops.h> #include <linux/kernel.h> #include <linux/export.h> +#include <linux/kvm_types.h> #include <linux/percpu.h> #include <linux/string.h> #include <linux/ctype.h> @@ -26,9 +27,11 @@ #include <linux/pgtable.h> #include <linux/stackprotector.h> #include <linux/utsname.h> +#include <linux/efi.h> #include <asm/alternative.h> #include <asm/cmdline.h> +#include <asm/cpuid/api.h> #include <asm/perf_event.h> #include <asm/mmu_context.h> #include <asm/doublefault.h> @@ -76,6 +79,10 @@ DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); EXPORT_PER_CPU_SYMBOL(cpu_info); +/* Used for modules: built-in code uses runtime constants */ +unsigned long USER_PTR_MAX; +EXPORT_SYMBOL(USER_PTR_MAX); + u32 elf_hwcap2 __read_mostly; /* Number of siblings per CPU package */ @@ -147,7 +154,7 @@ static void ppin_init(struct cpuinfo_x86 *c) */ info = (struct ppin_info *)id->driver_data; - if (rdmsrl_safe(info->msr_ppin_ctl, &val)) + if (rdmsrq_safe(info->msr_ppin_ctl, &val)) goto clear_ppin; if ((val & 3UL) == 1UL) { @@ -157,19 +164,19 @@ static void ppin_init(struct cpuinfo_x86 *c) /* If PPIN is disabled, try to enable */ if (!(val & 2UL)) { - wrmsrl_safe(info->msr_ppin_ctl, val | 2UL); - rdmsrl_safe(info->msr_ppin_ctl, &val); + wrmsrq_safe(info->msr_ppin_ctl, val | 2UL); + rdmsrq_safe(info->msr_ppin_ctl, &val); } /* Is the enable bit set? */ if (val & 2UL) { - c->ppin = __rdmsr(info->msr_ppin); + c->ppin = native_rdmsrq(info->msr_ppin); set_cpu_cap(c, info->feature); return; } clear_ppin: - clear_cpu_cap(c, info->feature); + setup_clear_cpu_cap(info->feature); } static void default_init(struct cpuinfo_x86 *c) @@ -241,6 +248,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { #endif } }; EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); +SYM_PIC_ALIAS(gdt_page); #ifdef CONFIG_X86_64 static int __init x86_nopcid_setup(char *s) @@ -320,7 +328,7 @@ static int __init cachesize_setup(char *str) __setup("cachesize=", cachesize_setup); /* Probe for the CPUID instruction */ -bool have_cpuid_p(void) +bool cpuid_feature(void) { return flag_is_changeable_p(X86_EFLAGS_ID); } @@ -398,6 +406,28 @@ out: cr4_clear_bits(X86_CR4_UMIP); } +static __always_inline void setup_lass(struct cpuinfo_x86 *c) +{ + if (!cpu_feature_enabled(X86_FEATURE_LASS)) + return; + + /* + * Legacy vsyscall page access causes a #GP when LASS is active. + * Disable LASS because the #GP handler doesn't support vsyscall + * emulation. + * + * Also disable LASS when running under EFI, as some runtime and + * boot services rely on 1:1 mappings in the lower half. + */ + if (IS_ENABLED(CONFIG_X86_VSYSCALL_EMULATION) || + IS_ENABLED(CONFIG_EFI)) { + setup_clear_cpu_cap(X86_FEATURE_LASS); + return; + } + + cr4_set_bits(X86_CR4_LASS); +} + /* These bits should not change their value after CPU init is finished. */ static const unsigned long cr4_pinned_mask = X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP | X86_CR4_FSGSBASE | X86_CR4_CET | X86_CR4_FRED; @@ -457,14 +487,14 @@ void cr4_update_irqsoff(unsigned long set, unsigned long clear) __write_cr4(newval); } } -EXPORT_SYMBOL(cr4_update_irqsoff); +EXPORT_SYMBOL_FOR_KVM(cr4_update_irqsoff); /* Read the CR4 shadow. */ unsigned long cr4_read_shadow(void) { return this_cpu_read(cpu_tlbstate.cr4); } -EXPORT_SYMBOL_GPL(cr4_read_shadow); +EXPORT_SYMBOL_FOR_KVM(cr4_read_shadow); void cr4_init(void) { @@ -561,9 +591,9 @@ __noendbr u64 ibt_save(bool disable) u64 msr = 0; if (cpu_feature_enabled(X86_FEATURE_IBT)) { - rdmsrl(MSR_IA32_S_CET, msr); + rdmsrq(MSR_IA32_S_CET, msr); if (disable) - wrmsrl(MSR_IA32_S_CET, msr & ~CET_ENDBR_EN); + wrmsrq(MSR_IA32_S_CET, msr & ~CET_ENDBR_EN); } return msr; @@ -574,10 +604,10 @@ __noendbr void ibt_restore(u64 save) u64 msr; if (cpu_feature_enabled(X86_FEATURE_IBT)) { - rdmsrl(MSR_IA32_S_CET, msr); + rdmsrq(MSR_IA32_S_CET, msr); msr &= ~CET_ENDBR_EN; msr |= (save & CET_ENDBR_EN); - wrmsrl(MSR_IA32_S_CET, msr); + wrmsrq(MSR_IA32_S_CET, msr); } } @@ -601,15 +631,15 @@ static __always_inline void setup_cet(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_USER_SHSTK); if (kernel_ibt) - wrmsrl(MSR_IA32_S_CET, CET_ENDBR_EN); + wrmsrq(MSR_IA32_S_CET, CET_ENDBR_EN); else - wrmsrl(MSR_IA32_S_CET, 0); + wrmsrq(MSR_IA32_S_CET, 0); cr4_set_bits(X86_CR4_CET); if (kernel_ibt && ibt_selftest()) { pr_err("IBT selftest: Failed!\n"); - wrmsrl(MSR_IA32_S_CET, 0); + wrmsrq(MSR_IA32_S_CET, 0); setup_clear_cpu_cap(X86_FEATURE_IBT); } } @@ -620,8 +650,8 @@ __noendbr void cet_disable(void) cpu_feature_enabled(X86_FEATURE_SHSTK))) return; - wrmsrl(MSR_IA32_S_CET, 0); - wrmsrl(MSR_IA32_U_CET, 0); + wrmsrq(MSR_IA32_S_CET, 0); + wrmsrq(MSR_IA32_U_CET, 0); } /* @@ -636,9 +666,9 @@ struct cpuid_dependent_feature { static const struct cpuid_dependent_feature cpuid_dependent_features[] = { - { X86_FEATURE_MWAIT, 0x00000005 }, - { X86_FEATURE_DCA, 0x00000009 }, - { X86_FEATURE_XSAVE, 0x0000000d }, + { X86_FEATURE_MWAIT, CPUID_LEAF_MWAIT }, + { X86_FEATURE_DCA, CPUID_LEAF_DCA }, + { X86_FEATURE_XSAVE, CPUID_LEAF_XSTATE }, { 0, 0 } }; @@ -666,8 +696,8 @@ static void filter_cpuid_features(struct cpuinfo_x86 *c, bool warn) if (!warn) continue; - pr_warn("CPU: CPU feature " X86_CAP_FMT " disabled, no CPUID level 0x%x\n", - x86_cap_flag(df->feature), df->level); + pr_warn("CPU: CPU feature %s disabled, no CPUID level 0x%x\n", + x86_cap_flags[df->feature], df->level); } } @@ -719,7 +749,7 @@ void load_direct_gdt(int cpu) gdt_descr.size = GDT_SIZE - 1; load_gdt(&gdt_descr); } -EXPORT_SYMBOL_GPL(load_direct_gdt); +EXPORT_SYMBOL_FOR_KVM(load_direct_gdt); /* Load a fixmap remapping of the per-cpu GDT */ void load_fixmap_gdt(int cpu) @@ -750,9 +780,9 @@ void __init switch_gdt_and_percpu_base(int cpu) * No need to load %gs. It is already correct. * * Writing %gs on 64bit would zero GSBASE which would make any per - * CPU operation up to the point of the wrmsrl() fault. + * CPU operation up to the point of the wrmsrq() fault. * - * Set GSBASE to the new offset. Until the wrmsrl() happens the + * Set GSBASE to the new offset. Until the wrmsrq() happens the * early mapping is still valid. That means the GSBASE update will * lose any prior per CPU data which was not copied over in * setup_per_cpu_areas(). @@ -760,7 +790,7 @@ void __init switch_gdt_and_percpu_base(int cpu) * This works even with stackprotector enabled because the * per CPU stack canary is 0 in both per CPU areas. */ - wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu)); + wrmsrq(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu)); #else /* * %fs is already set to __KERNEL_PERCPU, but after switching GDT @@ -845,13 +875,13 @@ void cpu_detect_cache_sizes(struct cpuinfo_x86 *c) c->x86_cache_size = l2size; } -u16 __read_mostly tlb_lli_4k[NR_INFO]; -u16 __read_mostly tlb_lli_2m[NR_INFO]; -u16 __read_mostly tlb_lli_4m[NR_INFO]; -u16 __read_mostly tlb_lld_4k[NR_INFO]; -u16 __read_mostly tlb_lld_2m[NR_INFO]; -u16 __read_mostly tlb_lld_4m[NR_INFO]; -u16 __read_mostly tlb_lld_1g[NR_INFO]; +u16 __read_mostly tlb_lli_4k; +u16 __read_mostly tlb_lli_2m; +u16 __read_mostly tlb_lli_4m; +u16 __read_mostly tlb_lld_4k; +u16 __read_mostly tlb_lld_2m; +u16 __read_mostly tlb_lld_4m; +u16 __read_mostly tlb_lld_1g; static void cpu_detect_tlb(struct cpuinfo_x86 *c) { @@ -859,15 +889,13 @@ static void cpu_detect_tlb(struct cpuinfo_x86 *c) this_cpu->c_detect_tlb(c); pr_info("Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n", - tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES], - tlb_lli_4m[ENTRIES]); + tlb_lli_4k, tlb_lli_2m, tlb_lli_4m); pr_info("Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n", - tlb_lld_4k[ENTRIES], tlb_lld_2m[ENTRIES], - tlb_lld_4m[ENTRIES], tlb_lld_1g[ENTRIES]); + tlb_lld_4k, tlb_lld_2m, tlb_lld_4m, tlb_lld_1g); } -static void get_cpu_vendor(struct cpuinfo_x86 *c) +void get_cpu_vendor(struct cpuinfo_x86 *c) { char *v = c->x86_vendor_id; int i; @@ -1006,25 +1034,22 @@ void get_cpu_cap(struct cpuinfo_x86 *c) c->x86_capability[CPUID_D_1_EAX] = eax; } - /* AMD-defined flags: level 0x80000001 */ + /* + * Check if extended CPUID leaves are implemented: Max extended + * CPUID leaf must be in the 0x80000001-0x8000ffff range. + */ eax = cpuid_eax(0x80000000); - c->extended_cpuid_level = eax; + c->extended_cpuid_level = ((eax & 0xffff0000) == 0x80000000) ? eax : 0; - if ((eax & 0xffff0000) == 0x80000000) { - if (eax >= 0x80000001) { - cpuid(0x80000001, &eax, &ebx, &ecx, &edx); + if (c->extended_cpuid_level >= 0x80000001) { + cpuid(0x80000001, &eax, &ebx, &ecx, &edx); - c->x86_capability[CPUID_8000_0001_ECX] = ecx; - c->x86_capability[CPUID_8000_0001_EDX] = edx; - } + c->x86_capability[CPUID_8000_0001_ECX] = ecx; + c->x86_capability[CPUID_8000_0001_EDX] = edx; } - if (c->extended_cpuid_level >= 0x80000007) { - cpuid(0x80000007, &eax, &ebx, &ecx, &edx); - - c->x86_capability[CPUID_8000_0007_EBX] = ebx; - c->x86_power = edx; - } + if (c->extended_cpuid_level >= 0x80000007) + c->x86_power = cpuid_edx(0x80000007); if (c->extended_cpuid_level >= 0x80000008) { cpuid(0x80000008, &eax, &ebx, &ecx, &edx); @@ -1163,7 +1188,7 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL_INTEL(INTEL_CORE_YONAH, NO_SSB), - VULNWL_INTEL(INTEL_ATOM_AIRMONT_MID, NO_SSB | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | MSBDS_ONLY), + VULNWL_INTEL(INTEL_ATOM_SILVERMONT_MID2,NO_SSB | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | MSBDS_ONLY), VULNWL_INTEL(INTEL_ATOM_AIRMONT_NP, NO_SSB | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), VULNWL_INTEL(INTEL_ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), @@ -1201,8 +1226,11 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { #define VULNBL(vendor, family, model, blacklist) \ X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, blacklist) -#define VULNBL_INTEL_STEPPINGS(vfm, steppings, issues) \ - X86_MATCH_VFM_STEPPINGS(vfm, steppings, issues) +#define VULNBL_INTEL_STEPS(vfm, max_stepping, issues) \ + X86_MATCH_VFM_STEPS(vfm, X86_STEP_MIN, max_stepping, issues) + +#define VULNBL_INTEL_TYPE(vfm, cpu_type, issues) \ + X86_MATCH_VFM_CPU_TYPE(vfm, INTEL_CPU_TYPE_##cpu_type, issues) #define VULNBL_AMD(family, blacklist) \ VULNBL(AMD, family, X86_MODEL_ANY, blacklist) @@ -1225,51 +1253,77 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { #define GDS BIT(6) /* CPU is affected by Register File Data Sampling */ #define RFDS BIT(7) +/* CPU is affected by Indirect Target Selection */ +#define ITS BIT(8) +/* CPU is affected by Indirect Target Selection, but guest-host isolation is not affected */ +#define ITS_NATIVE_ONLY BIT(9) +/* CPU is affected by Transient Scheduler Attacks */ +#define TSA BIT(10) +/* CPU is affected by VMSCAPE */ +#define VMSCAPE BIT(11) static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { - VULNBL_INTEL_STEPPINGS(INTEL_IVYBRIDGE, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(INTEL_HASWELL, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(INTEL_HASWELL_L, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(INTEL_HASWELL_G, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(INTEL_HASWELL_X, X86_STEPPING_ANY, MMIO), - VULNBL_INTEL_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPING_ANY, MMIO), - VULNBL_INTEL_STEPPINGS(INTEL_BROADWELL_G, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(INTEL_BROADWELL_X, X86_STEPPING_ANY, MMIO), - VULNBL_INTEL_STEPPINGS(INTEL_BROADWELL, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(INTEL_SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED | GDS), - VULNBL_INTEL_STEPPINGS(INTEL_SKYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPPINGS(INTEL_SKYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPPINGS(INTEL_KABYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPPINGS(INTEL_KABYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPPINGS(INTEL_CANNONLAKE_L, X86_STEPPING_ANY, RETBLEED), - VULNBL_INTEL_STEPPINGS(INTEL_ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS), - VULNBL_INTEL_STEPPINGS(INTEL_ICELAKE_D, X86_STEPPING_ANY, MMIO | GDS), - VULNBL_INTEL_STEPPINGS(INTEL_ICELAKE_X, X86_STEPPING_ANY, MMIO | GDS), - VULNBL_INTEL_STEPPINGS(INTEL_COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS), - VULNBL_INTEL_STEPPINGS(INTEL_COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO | RETBLEED), - VULNBL_INTEL_STEPPINGS(INTEL_COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS), - VULNBL_INTEL_STEPPINGS(INTEL_TIGERLAKE_L, X86_STEPPING_ANY, GDS), - VULNBL_INTEL_STEPPINGS(INTEL_TIGERLAKE, X86_STEPPING_ANY, GDS), - VULNBL_INTEL_STEPPINGS(INTEL_LAKEFIELD, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), - VULNBL_INTEL_STEPPINGS(INTEL_ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS), - VULNBL_INTEL_STEPPINGS(INTEL_ALDERLAKE, X86_STEPPING_ANY, RFDS), - VULNBL_INTEL_STEPPINGS(INTEL_ALDERLAKE_L, X86_STEPPING_ANY, RFDS), - VULNBL_INTEL_STEPPINGS(INTEL_RAPTORLAKE, X86_STEPPING_ANY, RFDS), - VULNBL_INTEL_STEPPINGS(INTEL_RAPTORLAKE_P, X86_STEPPING_ANY, RFDS), - VULNBL_INTEL_STEPPINGS(INTEL_RAPTORLAKE_S, X86_STEPPING_ANY, RFDS), - VULNBL_INTEL_STEPPINGS(INTEL_ATOM_GRACEMONT, X86_STEPPING_ANY, RFDS), - VULNBL_INTEL_STEPPINGS(INTEL_ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RFDS), - VULNBL_INTEL_STEPPINGS(INTEL_ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO | RFDS), - VULNBL_INTEL_STEPPINGS(INTEL_ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RFDS), - VULNBL_INTEL_STEPPINGS(INTEL_ATOM_GOLDMONT, X86_STEPPING_ANY, RFDS), - VULNBL_INTEL_STEPPINGS(INTEL_ATOM_GOLDMONT_D, X86_STEPPING_ANY, RFDS), - VULNBL_INTEL_STEPPINGS(INTEL_ATOM_GOLDMONT_PLUS, X86_STEPPING_ANY, RFDS), + VULNBL_INTEL_STEPS(INTEL_SANDYBRIDGE_X, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_SANDYBRIDGE, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_IVYBRIDGE_X, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_IVYBRIDGE, X86_STEP_MAX, SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_HASWELL, X86_STEP_MAX, SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_HASWELL_L, X86_STEP_MAX, SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_HASWELL_G, X86_STEP_MAX, SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_HASWELL_X, X86_STEP_MAX, MMIO | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_BROADWELL_D, X86_STEP_MAX, MMIO | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_BROADWELL_X, X86_STEP_MAX, MMIO | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_BROADWELL_G, X86_STEP_MAX, SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_BROADWELL, X86_STEP_MAX, SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, 0x5, MMIO | RETBLEED | GDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, X86_STEP_MAX, MMIO | RETBLEED | GDS | ITS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_SKYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_SKYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, 0xb, MMIO | RETBLEED | GDS | SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | ITS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_KABYLAKE, 0xc, MMIO | RETBLEED | GDS | SRBDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_KABYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | ITS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_CANNONLAKE_L, X86_STEP_MAX, RETBLEED | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_ICELAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS | ITS_NATIVE_ONLY), + VULNBL_INTEL_STEPS(INTEL_ICELAKE_D, X86_STEP_MAX, MMIO | GDS | ITS | ITS_NATIVE_ONLY), + VULNBL_INTEL_STEPS(INTEL_ICELAKE_X, X86_STEP_MAX, MMIO | GDS | ITS | ITS_NATIVE_ONLY), + VULNBL_INTEL_STEPS(INTEL_COMETLAKE, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, 0x0, MMIO | RETBLEED | ITS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_TIGERLAKE_L, X86_STEP_MAX, GDS | ITS | ITS_NATIVE_ONLY), + VULNBL_INTEL_STEPS(INTEL_TIGERLAKE, X86_STEP_MAX, GDS | ITS | ITS_NATIVE_ONLY), + VULNBL_INTEL_STEPS(INTEL_LAKEFIELD, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED), + VULNBL_INTEL_STEPS(INTEL_ROCKETLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | ITS | ITS_NATIVE_ONLY), + VULNBL_INTEL_TYPE(INTEL_ALDERLAKE, ATOM, RFDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_ALDERLAKE, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_ALDERLAKE_L, X86_STEP_MAX, RFDS | VMSCAPE), + VULNBL_INTEL_TYPE(INTEL_RAPTORLAKE, ATOM, RFDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_P, X86_STEP_MAX, RFDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_S, X86_STEP_MAX, RFDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_METEORLAKE_L, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_ARROWLAKE_H, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_ARROWLAKE, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_ARROWLAKE_U, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_LUNARLAKE_M, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_SAPPHIRERAPIDS_X, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_GRANITERAPIDS_X, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_EMERALDRAPIDS_X, X86_STEP_MAX, VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_ATOM_GRACEMONT, X86_STEP_MAX, RFDS | VMSCAPE), + VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT, X86_STEP_MAX, MMIO | MMIO_SBDS | RFDS), + VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT_D, X86_STEP_MAX, MMIO | RFDS), + VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RFDS), + VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT, X86_STEP_MAX, RFDS), + VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT_D, X86_STEP_MAX, RFDS), + VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT_PLUS, X86_STEP_MAX, RFDS), + VULNBL_INTEL_STEPS(INTEL_ATOM_CRESTMONT_X, X86_STEP_MAX, VMSCAPE), VULNBL_AMD(0x15, RETBLEED), VULNBL_AMD(0x16, RETBLEED), - VULNBL_AMD(0x17, RETBLEED | SMT_RSB | SRSO), - VULNBL_HYGON(0x18, RETBLEED | SMT_RSB | SRSO), - VULNBL_AMD(0x19, SRSO), + VULNBL_AMD(0x17, RETBLEED | SMT_RSB | SRSO | VMSCAPE), + VULNBL_HYGON(0x18, RETBLEED | SMT_RSB | SRSO | VMSCAPE), + VULNBL_AMD(0x19, SRSO | TSA | VMSCAPE), + VULNBL_AMD(0x1a, SRSO | VMSCAPE), {} }; @@ -1285,7 +1339,7 @@ u64 x86_read_arch_cap_msr(void) u64 x86_arch_cap_msr = 0; if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, x86_arch_cap_msr); + rdmsrq(MSR_IA32_ARCH_CAPABILITIES, x86_arch_cap_msr); return x86_arch_cap_msr; } @@ -1315,10 +1369,78 @@ static bool __init vulnerable_to_rfds(u64 x86_arch_cap_msr) return cpu_matches(cpu_vuln_blacklist, RFDS); } +static bool __init vulnerable_to_its(u64 x86_arch_cap_msr) +{ + /* The "immunity" bit trumps everything else: */ + if (x86_arch_cap_msr & ARCH_CAP_ITS_NO) + return false; + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return false; + + /* None of the affected CPUs have BHI_CTRL */ + if (boot_cpu_has(X86_FEATURE_BHI_CTRL)) + return false; + + /* + * If a VMM did not expose ITS_NO, assume that a guest could + * be running on a vulnerable hardware or may migrate to such + * hardware. + */ + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return true; + + if (cpu_matches(cpu_vuln_blacklist, ITS)) + return true; + + return false; +} + +static struct x86_cpu_id cpu_latest_microcode[] = { +#include "microcode/intel-ucode-defs.h" + {} +}; + +static bool __init cpu_has_old_microcode(void) +{ + const struct x86_cpu_id *m = x86_match_cpu(cpu_latest_microcode); + + /* Give unknown CPUs a pass: */ + if (!m) { + /* Intel CPUs should be in the list. Warn if not: */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + pr_info("x86/CPU: Model not found in latest microcode list\n"); + return false; + } + + /* + * Hosts usually lie to guests with a super high microcode + * version. Just ignore what hosts tell guests: + */ + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return false; + + /* Consider all debug microcode to be old: */ + if (boot_cpu_data.microcode & BIT(31)) + return true; + + /* Give new microcode a pass: */ + if (boot_cpu_data.microcode >= m->driver_data) + return false; + + /* Uh oh, too old: */ + return true; +} + static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) { u64 x86_arch_cap_msr = x86_read_arch_cap_msr(); + if (cpu_has_old_microcode()) { + pr_warn("x86/CPU: Running old microcode\n"); + setup_force_cpu_bug(X86_BUG_OLD_MICROCODE); + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); + } + /* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */ if (!cpu_matches(cpu_vuln_whitelist, NO_ITLB_MULTIHIT) && !(x86_arch_cap_msr & ARCH_CAP_PSCHANGE_MC_NO)) @@ -1329,8 +1451,10 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) setup_force_cpu_bug(X86_BUG_SPECTRE_V1); - if (!cpu_matches(cpu_vuln_whitelist, NO_SPECTRE_V2)) + if (!cpu_matches(cpu_vuln_whitelist, NO_SPECTRE_V2)) { setup_force_cpu_bug(X86_BUG_SPECTRE_V2); + setup_force_cpu_bug(X86_BUG_SPECTRE_V2_USER); + } if (!cpu_matches(cpu_vuln_whitelist, NO_SSB) && !(x86_arch_cap_msr & ARCH_CAP_SSB_NO) && @@ -1397,15 +1521,10 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) * Affected CPU list is generally enough to enumerate the vulnerability, * but for virtualization case check for ARCH_CAP MSR bits also, VMM may * not want the guest to enumerate the bug. - * - * Set X86_BUG_MMIO_UNKNOWN for CPUs that are neither in the blacklist, - * nor in the whitelist and also don't enumerate MSR ARCH_CAP MMIO bits. */ if (!arch_cap_mmio_immune(x86_arch_cap_msr)) { if (cpu_matches(cpu_vuln_blacklist, MMIO)) setup_force_cpu_bug(X86_BUG_MMIO_STALE_DATA); - else if (!cpu_matches(cpu_vuln_whitelist, NO_MMIO)) - setup_force_cpu_bug(X86_BUG_MMIO_UNKNOWN); } if (!cpu_has(c, X86_FEATURE_BTC_NO)) { @@ -1434,9 +1553,12 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) if (vulnerable_to_rfds(x86_arch_cap_msr)) setup_force_cpu_bug(X86_BUG_RFDS); - /* When virtualized, eIBRS could be hidden, assume vulnerable */ - if (!(x86_arch_cap_msr & ARCH_CAP_BHI_NO) && - !cpu_matches(cpu_vuln_whitelist, NO_BHI) && + /* + * Intel parts with eIBRS are vulnerable to BHI attacks. Parts with + * BHI_NO still need to use the BHI mitigation to prevent Intra-mode + * attacks. When virtualized, eIBRS could be hidden, assume vulnerable. + */ + if (!cpu_matches(cpu_vuln_whitelist, NO_BHI) && (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED) || boot_cpu_has(X86_FEATURE_HYPERVISOR))) setup_force_cpu_bug(X86_BUG_BHI); @@ -1444,6 +1566,30 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_AMD_IBPB) && !cpu_has(c, X86_FEATURE_AMD_IBPB_RET)) setup_force_cpu_bug(X86_BUG_IBPB_NO_RET); + if (vulnerable_to_its(x86_arch_cap_msr)) { + setup_force_cpu_bug(X86_BUG_ITS); + if (cpu_matches(cpu_vuln_blacklist, ITS_NATIVE_ONLY)) + setup_force_cpu_bug(X86_BUG_ITS_NATIVE_ONLY); + } + + if (c->x86_vendor == X86_VENDOR_AMD) { + if (!cpu_has(c, X86_FEATURE_TSA_SQ_NO) || + !cpu_has(c, X86_FEATURE_TSA_L1_NO)) { + if (cpu_matches(cpu_vuln_blacklist, TSA) || + /* Enable bug on Zen guests to allow for live migration. */ + (cpu_has(c, X86_FEATURE_HYPERVISOR) && cpu_has(c, X86_FEATURE_ZEN))) + setup_force_cpu_bug(X86_BUG_TSA); + } + } + + /* + * Set the bug only on bare-metal. A nested hypervisor should already be + * deploying IBPB to isolate itself from nested guests. + */ + if (cpu_matches(cpu_vuln_blacklist, VMSCAPE) && + !boot_cpu_has(X86_FEATURE_HYPERVISOR)) + setup_force_cpu_bug(X86_BUG_VMSCAPE); + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; @@ -1477,15 +1623,96 @@ static void detect_nopl(void) #endif } +static inline bool parse_set_clear_cpuid(char *arg, bool set) +{ + char *opt; + int taint = 0; + + while (arg) { + bool found __maybe_unused = false; + unsigned int bit; + + opt = strsep(&arg, ","); + + /* + * Handle naked numbers first for feature flags which don't + * have names. It doesn't make sense for a bug not to have a + * name so don't handle bug flags here. + */ + if (!kstrtouint(opt, 10, &bit)) { + if (bit < NCAPINTS * 32) { + + if (set) { + pr_warn("setcpuid: force-enabling CPU feature flag:"); + setup_force_cpu_cap(bit); + } else { + pr_warn("clearcpuid: force-disabling CPU feature flag:"); + setup_clear_cpu_cap(bit); + } + /* empty-string, i.e., ""-defined feature flags */ + if (!x86_cap_flags[bit]) + pr_cont(" %d:%d\n", bit >> 5, bit & 31); + else + pr_cont(" %s\n", x86_cap_flags[bit]); + + taint++; + } + /* + * The assumption is that there are no feature names with only + * numbers in the name thus go to the next argument. + */ + continue; + } + + for (bit = 0; bit < 32 * (NCAPINTS + NBUGINTS); bit++) { + const char *flag; + const char *kind; + + if (bit < 32 * NCAPINTS) { + flag = x86_cap_flags[bit]; + kind = "feature"; + } else { + kind = "bug"; + flag = x86_bug_flags[bit - (32 * NCAPINTS)]; + } + + if (!flag) + continue; + + if (strcmp(flag, opt)) + continue; + + if (set) { + pr_warn("setcpuid: force-enabling CPU %s flag: %s\n", + kind, flag); + setup_force_cpu_cap(bit); + } else { + pr_warn("clearcpuid: force-disabling CPU %s flag: %s\n", + kind, flag); + setup_clear_cpu_cap(bit); + } + taint++; + found = true; + break; + } + + if (!found) + pr_warn("%s: unknown CPU flag: %s", set ? "setcpuid" : "clearcpuid", opt); + } + + return taint; +} + + /* * We parse cpu parameters early because fpu__init_system() is executed * before parse_early_param(). */ static void __init cpu_parse_early_param(void) { + bool cpuid_taint = false; char arg[128]; - char *argptr = arg, *opt; - int arglen, taint = 0; + int arglen; #ifdef CONFIG_X86_32 if (cmdline_find_option_bool(boot_command_line, "no387")) @@ -1517,61 +1744,17 @@ static void __init cpu_parse_early_param(void) setup_clear_cpu_cap(X86_FEATURE_FRED); arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg)); - if (arglen <= 0) - return; - - pr_info("Clearing CPUID bits:"); - - while (argptr) { - bool found __maybe_unused = false; - unsigned int bit; - - opt = strsep(&argptr, ","); - - /* - * Handle naked numbers first for feature flags which don't - * have names. - */ - if (!kstrtouint(opt, 10, &bit)) { - if (bit < NCAPINTS * 32) { - - /* empty-string, i.e., ""-defined feature flags */ - if (!x86_cap_flags[bit]) - pr_cont(" " X86_CAP_FMT_NUM, x86_cap_flag_num(bit)); - else - pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit)); - - setup_clear_cpu_cap(bit); - taint++; - } - /* - * The assumption is that there are no feature names with only - * numbers in the name thus go to the next argument. - */ - continue; - } - - for (bit = 0; bit < 32 * NCAPINTS; bit++) { - if (!x86_cap_flag(bit)) - continue; - - if (strcmp(x86_cap_flag(bit), opt)) - continue; - - pr_cont(" %s", opt); - setup_clear_cpu_cap(bit); - taint++; - found = true; - break; - } + if (arglen > 0) + cpuid_taint |= parse_set_clear_cpuid(arg, false); - if (!found) - pr_cont(" (unknown: %s)", opt); - } - pr_cont("\n"); + arglen = cmdline_find_option(boot_command_line, "setcpuid", arg, sizeof(arg)); + if (arglen > 0) + cpuid_taint |= parse_set_clear_cpuid(arg, true); - if (taint) + if (cpuid_taint) { + pr_warn("!!! setcpuid=/clearcpuid= in use, this is for TESTING ONLY, may break things horribly. Tainting kernel.\n"); add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); + } } /* @@ -1588,11 +1771,11 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) memset(&c->x86_capability, 0, sizeof(c->x86_capability)); c->extended_cpuid_level = 0; - if (!have_cpuid_p()) + if (!cpuid_feature()) identify_cpu_without_cpuid(c); /* cyrix could have cpuid enabled via c_identify()*/ - if (have_cpuid_p()) { + if (cpuid_feature()) { cpu_detect(c); get_cpu_vendor(c); intel_unlock_cpuid_leafs(c); @@ -1608,6 +1791,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) c->cpu_index = 0; filter_cpuid_features(c, false); + check_cpufeature_deps(c); if (this_cpu->c_bsp_init) this_cpu->c_bsp_init(c); @@ -1647,17 +1831,14 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) setup_clear_cpu_cap(X86_FEATURE_LA57); detect_nopl(); + mca_bsp_init(c); } -void __init early_cpu_init(void) +void __init init_cpu_devs(void) { const struct cpu_dev *const *cdev; int count = 0; -#ifdef CONFIG_PROCESSOR_SELECT - pr_info("KERNEL supported cpus:\n"); -#endif - for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) { const struct cpu_dev *cpudev = *cdev; @@ -1665,20 +1846,30 @@ void __init early_cpu_init(void) break; cpu_devs[count] = cpudev; count++; + } +} +void __init early_cpu_init(void) +{ #ifdef CONFIG_PROCESSOR_SELECT - { - unsigned int j; - - for (j = 0; j < 2; j++) { - if (!cpudev->c_ident[j]) - continue; - pr_info(" %s %s\n", cpudev->c_vendor, - cpudev->c_ident[j]); - } - } + unsigned int i, j; + + pr_info("KERNEL supported cpus:\n"); #endif + + init_cpu_devs(); + +#ifdef CONFIG_PROCESSOR_SELECT + for (i = 0; i < X86_VENDOR_NUM && cpu_devs[i]; i++) { + for (j = 0; j < 2; j++) { + if (!cpu_devs[i]->c_ident[j]) + continue; + pr_info(" %s %s\n", cpu_devs[i]->c_vendor, + cpu_devs[i]->c_ident[j]); + } } +#endif + early_identify_cpu(&boot_cpu_data); } @@ -1700,11 +1891,11 @@ static bool detect_null_seg_behavior(void) */ unsigned long old_base, tmp; - rdmsrl(MSR_FS_BASE, old_base); - wrmsrl(MSR_FS_BASE, 1); + rdmsrq(MSR_FS_BASE, old_base); + wrmsrq(MSR_FS_BASE, 1); loadsegment(fs, 0); - rdmsrl(MSR_FS_BASE, tmp); - wrmsrl(MSR_FS_BASE, old_base); + rdmsrq(MSR_FS_BASE, tmp); + wrmsrq(MSR_FS_BASE, old_base); return tmp == 0; } @@ -1745,11 +1936,11 @@ static void generic_identify(struct cpuinfo_x86 *c) { c->extended_cpuid_level = 0; - if (!have_cpuid_p()) + if (!cpuid_feature()) identify_cpu_without_cpuid(c); /* cyrix could have cpuid enabled via c_identify()*/ - if (!have_cpuid_p()) + if (!cpuid_feature()) return; cpu_detect(c); @@ -1843,10 +2034,10 @@ static void identify_cpu(struct cpuinfo_x86 *c) /* Disable the PN if appropriate */ squash_the_stupid_serial_number(c); - /* Set up SMEP/SMAP/UMIP */ setup_smep(c); setup_smap(c); setup_umip(c); + setup_lass(c); /* Enable FSGSBASE instructions if available. */ if (cpu_has(c, X86_FEATURE_FSGSBASE)) { @@ -1862,6 +2053,9 @@ static void identify_cpu(struct cpuinfo_x86 *c) /* Filter out anything that depends on CPUID levels we don't have */ filter_cpuid_features(c, true); + /* Check for unmet dependencies based on the CPUID dependency table */ + check_cpufeature_deps(c); + /* If the model name is still unset, do table lookup. */ if (!c->x86_model_id[0]) { const char *p; @@ -1930,9 +2124,9 @@ void enable_sep_cpu(void) */ tss->x86_tss.ss1 = __KERNEL_CS; - wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0); - wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1), 0); - wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0); + wrmsrq(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1); + wrmsrq(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1)); + wrmsrq(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32); put_cpu(); } @@ -1954,9 +2148,15 @@ static __init void identify_boot_cpu(void) lkgs_init(); } -void identify_secondary_cpu(struct cpuinfo_x86 *c) +void identify_secondary_cpu(unsigned int cpu) { - BUG_ON(c == &boot_cpu_data); + struct cpuinfo_x86 *c = &cpu_data(cpu); + + /* Copy boot_cpu_data only on the first bringup */ + if (!c->initialized) + *c = boot_cpu_data; + c->cpu_index = cpu; + identify_cpu(c); #ifdef CONFIG_X86_32 enable_sep_cpu(); @@ -1967,6 +2167,7 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c) update_gds_msr(); tsx_ap_init(); + c->initialized = true; } void print_cpu_info(struct cpuinfo_x86 *c) @@ -1997,29 +2198,42 @@ void print_cpu_info(struct cpuinfo_x86 *c) } /* - * clearcpuid= was already parsed in cpu_parse_early_param(). This dummy - * function prevents it from becoming an environment variable for init. + * clearcpuid= and setcpuid= were already parsed in cpu_parse_early_param(). + * These dummy functions prevent them from becoming an environment variable for + * init. */ + static __init int setup_clearcpuid(char *arg) { return 1; } __setup("clearcpuid=", setup_clearcpuid); -DEFINE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot) = { - .current_task = &init_task, - .preempt_count = INIT_PREEMPT_COUNT, - .top_of_stack = TOP_OF_INIT_STACK, -}; -EXPORT_PER_CPU_SYMBOL(pcpu_hot); -EXPORT_PER_CPU_SYMBOL(const_pcpu_hot); +static __init int setup_setcpuid(char *arg) +{ + return 1; +} +__setup("setcpuid=", setup_setcpuid); + +DEFINE_PER_CPU_CACHE_HOT(struct task_struct *, current_task) = &init_task; +EXPORT_PER_CPU_SYMBOL(current_task); +EXPORT_PER_CPU_SYMBOL(const_current_task); + +DEFINE_PER_CPU_CACHE_HOT(int, __preempt_count) = INIT_PREEMPT_COUNT; +EXPORT_PER_CPU_SYMBOL(__preempt_count); + +DEFINE_PER_CPU_CACHE_HOT(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK; #ifdef CONFIG_X86_64 -DEFINE_PER_CPU_FIRST(struct fixed_percpu_data, - fixed_percpu_data) __aligned(PAGE_SIZE) __visible; -EXPORT_PER_CPU_SYMBOL_GPL(fixed_percpu_data); +/* + * Note: Do not make this dependant on CONFIG_MITIGATION_CALL_DEPTH_TRACKING + * so that this space is reserved in the hot cache section even when the + * mitigation is disabled. + */ +DEFINE_PER_CPU_CACHE_HOT(u64, __x86_call_depth); +EXPORT_PER_CPU_SYMBOL(__x86_call_depth); -static void wrmsrl_cstar(unsigned long val) +static void wrmsrq_cstar(unsigned long val) { /* * Intel CPUs do not support 32-bit SYSCALL. Writing to MSR_CSTAR @@ -2027,37 +2241,37 @@ static void wrmsrl_cstar(unsigned long val) * guest. Avoid the pointless write on all Intel CPUs. */ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) - wrmsrl(MSR_CSTAR, val); + wrmsrq(MSR_CSTAR, val); } static inline void idt_syscall_init(void) { - wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); + wrmsrq(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); if (ia32_enabled()) { - wrmsrl_cstar((unsigned long)entry_SYSCALL_compat); + wrmsrq_cstar((unsigned long)entry_SYSCALL_compat); /* * This only works on Intel CPUs. * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP. * This does not cause SYSENTER to jump to the wrong location, because * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). */ - wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); - wrmsrl_safe(MSR_IA32_SYSENTER_ESP, + wrmsrq_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); + wrmsrq_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1)); - wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); + wrmsrq_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); } else { - wrmsrl_cstar((unsigned long)entry_SYSCALL32_ignore); - wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG); - wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); - wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL); + wrmsrq_cstar((unsigned long)entry_SYSCALL32_ignore); + wrmsrq_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG); + wrmsrq_safe(MSR_IA32_SYSENTER_ESP, 0ULL); + wrmsrq_safe(MSR_IA32_SYSENTER_EIP, 0ULL); } /* * Flags to clear on syscall; clear as much as possible * to minimize user space-kernel interference. */ - wrmsrl(MSR_SYSCALL_MASK, + wrmsrq(MSR_SYSCALL_MASK, X86_EFLAGS_CF|X86_EFLAGS_PF|X86_EFLAGS_AF| X86_EFLAGS_ZF|X86_EFLAGS_SF|X86_EFLAGS_TF| X86_EFLAGS_IF|X86_EFLAGS_DF|X86_EFLAGS_OF| @@ -2081,32 +2295,25 @@ void syscall_init(void) if (!cpu_feature_enabled(X86_FEATURE_FRED)) idt_syscall_init(); } - -#else /* CONFIG_X86_64 */ +#endif /* CONFIG_X86_64 */ #ifdef CONFIG_STACKPROTECTOR -DEFINE_PER_CPU(unsigned long, __stack_chk_guard); +DEFINE_PER_CPU_CACHE_HOT(unsigned long, __stack_chk_guard); #ifndef CONFIG_SMP EXPORT_PER_CPU_SYMBOL(__stack_chk_guard); #endif #endif -#endif /* CONFIG_X86_64 */ - -/* - * Clear all 6 debug registers: - */ -static void clear_all_debug_regs(void) +static void initialize_debug_regs(void) { - int i; - - for (i = 0; i < 8; i++) { - /* Ignore db4, db5 */ - if ((i == 4) || (i == 5)) - continue; - - set_debugreg(0, i); - } + /* Control register first -- to make sure everything is disabled. */ + set_debugreg(DR7_FIXED_1, 7); + set_debugreg(DR6_RESERVED, 6); + /* dr5 and dr4 don't exist */ + set_debugreg(0, 3); + set_debugreg(0, 2); + set_debugreg(0, 1); + set_debugreg(0, 0); } #ifdef CONFIG_KGDB @@ -2129,7 +2336,7 @@ static inline void setup_getcpu(int cpu) struct desc_struct d = { }; if (boot_cpu_has(X86_FEATURE_RDTSCP) || boot_cpu_has(X86_FEATURE_RDPID)) - wrmsr(MSR_TSC_AUX, cpudata, 0); + wrmsrq(MSR_TSC_AUX, cpudata); /* Store CPU and node number in limit. */ d.limit0 = cpudata; @@ -2244,8 +2451,8 @@ void cpu_init(void) memset(cur->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); syscall_init(); - wrmsrl(MSR_FS_BASE, 0); - wrmsrl(MSR_KERNEL_GS_BASE, 0); + wrmsrq(MSR_FS_BASE, 0); + wrmsrq(MSR_KERNEL_GS_BASE, 0); barrier(); x2apic_setup(); @@ -2267,7 +2474,7 @@ void cpu_init(void) load_mm_ldt(&init_mm); - clear_all_debug_regs(); + initialize_debug_regs(); dbg_restore_debug_regs(); doublefault_init_cpu_tss(); @@ -2380,6 +2587,12 @@ void __init arch_cpu_finalize_init(void) fpu__init_cpu(); /* + * This needs to follow the FPU initializtion, since EFI depends on it. + */ + if (efi_enabled(EFI_RUNTIME_SERVICES)) + efi_enter_virtual_mode(); + + /* * Ensure that access to the per CPU representation has the initial * boot CPU configuration. */ @@ -2389,7 +2602,7 @@ void __init arch_cpu_finalize_init(void) alternative_instructions(); if (IS_ENABLED(CONFIG_X86_64)) { - unsigned long USER_PTR_MAX = TASK_SIZE_MAX; + USER_PTR_MAX = TASK_SIZE_MAX; /* * Enable this when LAM is gated on LASS support diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 1beccefbaff9..5c7a3a71191a 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -33,14 +33,6 @@ struct cpu_dev { #endif }; -struct _tlb_table { - unsigned char descriptor; - char tlb_type; - unsigned int entries; - /* unsigned int ways; */ - char info[128]; -}; - #define cpu_dev_register(cpu_devX) \ static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \ __section(".x86_cpu_dev.init") = \ @@ -50,15 +42,6 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[], *const __x86_cpu_dev_end[]; #ifdef CONFIG_CPU_SUP_INTEL -enum tsx_ctrl_states { - TSX_CTRL_ENABLE, - TSX_CTRL_DISABLE, - TSX_CTRL_RTM_ALWAYS_ABORT, - TSX_CTRL_NOT_SUPPORTED, -}; - -extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state; - extern void __init tsx_init(void); void tsx_ap_init(void); void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c); @@ -83,6 +66,15 @@ extern void check_null_seg_clears_base(struct cpuinfo_x86 *c); void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, u16 die_id); void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c); +#if defined(CONFIG_AMD_NB) && defined(CONFIG_SYSFS) +struct amd_northbridge *amd_init_l3_cache(int index); +#else +static inline struct amd_northbridge *amd_init_l3_cache(int index) +{ + return NULL; +} +#endif + unsigned int aperfmperf_get_khz(int cpu); void cpu_select_mitigations(void); diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index 8bd84114c2d9..146f6f8b0650 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -28,6 +28,7 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_PKU, X86_FEATURE_XSAVE }, { X86_FEATURE_MPX, X86_FEATURE_XSAVE }, { X86_FEATURE_XGETBV1, X86_FEATURE_XSAVE }, + { X86_FEATURE_APX, X86_FEATURE_XSAVE }, { X86_FEATURE_CMOV, X86_FEATURE_FXSR }, { X86_FEATURE_MMX, X86_FEATURE_FXSR }, { X86_FEATURE_MMXEXT, X86_FEATURE_MMX }, @@ -45,6 +46,7 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_AES, X86_FEATURE_XMM2 }, { X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 }, { X86_FEATURE_GFNI, X86_FEATURE_XMM2 }, + { X86_FEATURE_AVX_VNNI, X86_FEATURE_AVX }, { X86_FEATURE_FMA, X86_FEATURE_AVX }, { X86_FEATURE_VAES, X86_FEATURE_AVX }, { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX }, @@ -70,6 +72,7 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC }, { X86_FEATURE_BMEC, X86_FEATURE_CQM_MBM_TOTAL }, { X86_FEATURE_BMEC, X86_FEATURE_CQM_MBM_LOCAL }, + { X86_FEATURE_SDCIAE, X86_FEATURE_CAT_L3 }, { X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL }, { X86_FEATURE_AVX512_FP16, X86_FEATURE_AVX512BW }, { X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES }, @@ -77,12 +80,18 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_SGX_LC, X86_FEATURE_SGX }, { X86_FEATURE_SGX1, X86_FEATURE_SGX }, { X86_FEATURE_SGX2, X86_FEATURE_SGX1 }, + { X86_FEATURE_SGX_EUPDATESVN, X86_FEATURE_SGX1 }, { X86_FEATURE_SGX_EDECCSSA, X86_FEATURE_SGX1 }, { X86_FEATURE_XFD, X86_FEATURE_XSAVES }, { X86_FEATURE_XFD, X86_FEATURE_XGETBV1 }, { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD }, + { X86_FEATURE_AMX_FP16, X86_FEATURE_AMX_TILE }, + { X86_FEATURE_AMX_BF16, X86_FEATURE_AMX_TILE }, + { X86_FEATURE_AMX_INT8, X86_FEATURE_AMX_TILE }, { X86_FEATURE_SHSTK, X86_FEATURE_XSAVES }, { X86_FEATURE_FRED, X86_FEATURE_LKGS }, + { X86_FEATURE_SPEC_CTRL_SSBD, X86_FEATURE_SPEC_CTRL }, + { X86_FEATURE_LASS, X86_FEATURE_SMAP }, {} }; @@ -146,3 +155,38 @@ void setup_clear_cpu_cap(unsigned int feature) { do_clear_cpu_cap(NULL, feature); } + +/* + * Return the feature "name" if available, otherwise return + * the X86_FEATURE_* numerals to make it easier to identify + * the feature. + */ +static const char *x86_feature_name(unsigned int feature, char *buf) +{ + if (x86_cap_flags[feature]) + return x86_cap_flags[feature]; + + snprintf(buf, 16, "%d*32+%2d", feature / 32, feature % 32); + + return buf; +} + +void check_cpufeature_deps(struct cpuinfo_x86 *c) +{ + char feature_buf[16], depends_buf[16]; + const struct cpuid_dep *d; + + for (d = cpuid_deps; d->feature; d++) { + if (cpu_has(c, d->feature) && !cpu_has(c, d->depends)) { + /* + * Only warn about the first unmet dependency on the + * first CPU where it is encountered to avoid spamming + * the kernel log. + */ + pr_warn_once("x86 CPU feature dependency check failure: CPU%d has '%s' enabled but '%s' disabled. Kernel might be fine, but no guarantees.\n", + smp_processor_id(), + x86_feature_name(d->feature, feature_buf), + x86_feature_name(d->depends, depends_buf)); + } + } +} diff --git a/arch/x86/kernel/cpu/cpuid_0x2_table.c b/arch/x86/kernel/cpu/cpuid_0x2_table.c new file mode 100644 index 000000000000..89bc8db5e9c6 --- /dev/null +++ b/arch/x86/kernel/cpu/cpuid_0x2_table.c @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/sizes.h> + +#include <asm/cpuid/types.h> + +#include "cpu.h" + +#define CACHE_ENTRY(_desc, _type, _size) \ + [_desc] = { \ + .c_type = (_type), \ + .c_size = (_size) / SZ_1K, \ + } + +#define TLB_ENTRY(_desc, _type, _entries) \ + [_desc] = { \ + .t_type = (_type), \ + .entries = (_entries), \ + } + +const struct leaf_0x2_table cpuid_0x2_table[256] = { + CACHE_ENTRY(0x06, CACHE_L1_INST, SZ_8K ), /* 4-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x08, CACHE_L1_INST, SZ_16K ), /* 4-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x09, CACHE_L1_INST, SZ_32K ), /* 4-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x0a, CACHE_L1_DATA, SZ_8K ), /* 2 way set assoc, 32 byte line size */ + CACHE_ENTRY(0x0c, CACHE_L1_DATA, SZ_16K ), /* 4-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x0d, CACHE_L1_DATA, SZ_16K ), /* 4-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x0e, CACHE_L1_DATA, SZ_24K ), /* 6-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x21, CACHE_L2, SZ_256K ), /* 8-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x22, CACHE_L3, SZ_512K ), /* 4-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x23, CACHE_L3, SZ_1M ), /* 8-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x25, CACHE_L3, SZ_2M ), /* 8-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x29, CACHE_L3, SZ_4M ), /* 8-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x2c, CACHE_L1_DATA, SZ_32K ), /* 8-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x30, CACHE_L1_INST, SZ_32K ), /* 8-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x39, CACHE_L2, SZ_128K ), /* 4-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x3a, CACHE_L2, SZ_192K ), /* 6-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x3b, CACHE_L2, SZ_128K ), /* 2-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x3c, CACHE_L2, SZ_256K ), /* 4-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x3d, CACHE_L2, SZ_384K ), /* 6-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x3e, CACHE_L2, SZ_512K ), /* 4-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x3f, CACHE_L2, SZ_256K ), /* 2-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x41, CACHE_L2, SZ_128K ), /* 4-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x42, CACHE_L2, SZ_256K ), /* 4-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x43, CACHE_L2, SZ_512K ), /* 4-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x44, CACHE_L2, SZ_1M ), /* 4-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x45, CACHE_L2, SZ_2M ), /* 4-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x46, CACHE_L3, SZ_4M ), /* 4-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x47, CACHE_L3, SZ_8M ), /* 8-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x48, CACHE_L2, SZ_3M ), /* 12-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x49, CACHE_L3, SZ_4M ), /* 16-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x4a, CACHE_L3, SZ_6M ), /* 12-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x4b, CACHE_L3, SZ_8M ), /* 16-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x4c, CACHE_L3, SZ_12M ), /* 12-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x4d, CACHE_L3, SZ_16M ), /* 16-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x4e, CACHE_L2, SZ_6M ), /* 24-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x60, CACHE_L1_DATA, SZ_16K ), /* 8-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x66, CACHE_L1_DATA, SZ_8K ), /* 4-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x67, CACHE_L1_DATA, SZ_16K ), /* 4-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x68, CACHE_L1_DATA, SZ_32K ), /* 4-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x78, CACHE_L2, SZ_1M ), /* 4-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x79, CACHE_L2, SZ_128K ), /* 8-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x7a, CACHE_L2, SZ_256K ), /* 8-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x7b, CACHE_L2, SZ_512K ), /* 8-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x7c, CACHE_L2, SZ_1M ), /* 8-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x7d, CACHE_L2, SZ_2M ), /* 8-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x7f, CACHE_L2, SZ_512K ), /* 2-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x80, CACHE_L2, SZ_512K ), /* 8-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x82, CACHE_L2, SZ_256K ), /* 8-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x83, CACHE_L2, SZ_512K ), /* 8-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x84, CACHE_L2, SZ_1M ), /* 8-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x85, CACHE_L2, SZ_2M ), /* 8-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x86, CACHE_L2, SZ_512K ), /* 4-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x87, CACHE_L2, SZ_1M ), /* 8-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xd0, CACHE_L3, SZ_512K ), /* 4-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xd1, CACHE_L3, SZ_1M ), /* 4-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xd2, CACHE_L3, SZ_2M ), /* 4-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xd6, CACHE_L3, SZ_1M ), /* 8-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xd7, CACHE_L3, SZ_2M ), /* 8-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xd8, CACHE_L3, SZ_4M ), /* 12-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xdc, CACHE_L3, SZ_2M ), /* 12-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xdd, CACHE_L3, SZ_4M ), /* 12-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xde, CACHE_L3, SZ_8M ), /* 12-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xe2, CACHE_L3, SZ_2M ), /* 16-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xe3, CACHE_L3, SZ_4M ), /* 16-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xe4, CACHE_L3, SZ_8M ), /* 16-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xea, CACHE_L3, SZ_12M ), /* 24-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xeb, CACHE_L3, SZ_18M ), /* 24-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xec, CACHE_L3, SZ_24M ), /* 24-way set assoc, 64 byte line size */ + + TLB_ENTRY( 0x01, TLB_INST_4K, 32 ), /* TLB_INST 4 KByte pages, 4-way set associative */ + TLB_ENTRY( 0x02, TLB_INST_4M, 2 ), /* TLB_INST 4 MByte pages, full associative */ + TLB_ENTRY( 0x03, TLB_DATA_4K, 64 ), /* TLB_DATA 4 KByte pages, 4-way set associative */ + TLB_ENTRY( 0x04, TLB_DATA_4M, 8 ), /* TLB_DATA 4 MByte pages, 4-way set associative */ + TLB_ENTRY( 0x05, TLB_DATA_4M, 32 ), /* TLB_DATA 4 MByte pages, 4-way set associative */ + TLB_ENTRY( 0x0b, TLB_INST_4M, 4 ), /* TLB_INST 4 MByte pages, 4-way set associative */ + TLB_ENTRY( 0x4f, TLB_INST_4K, 32 ), /* TLB_INST 4 KByte pages */ + TLB_ENTRY( 0x50, TLB_INST_ALL, 64 ), /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */ + TLB_ENTRY( 0x51, TLB_INST_ALL, 128 ), /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */ + TLB_ENTRY( 0x52, TLB_INST_ALL, 256 ), /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */ + TLB_ENTRY( 0x55, TLB_INST_2M_4M, 7 ), /* TLB_INST 2-MByte or 4-MByte pages, fully associative */ + TLB_ENTRY( 0x56, TLB_DATA0_4M, 16 ), /* TLB_DATA0 4 MByte pages, 4-way set associative */ + TLB_ENTRY( 0x57, TLB_DATA0_4K, 16 ), /* TLB_DATA0 4 KByte pages, 4-way associative */ + TLB_ENTRY( 0x59, TLB_DATA0_4K, 16 ), /* TLB_DATA0 4 KByte pages, fully associative */ + TLB_ENTRY( 0x5a, TLB_DATA0_2M_4M, 32 ), /* TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative */ + TLB_ENTRY( 0x5b, TLB_DATA_4K_4M, 64 ), /* TLB_DATA 4 KByte and 4 MByte pages */ + TLB_ENTRY( 0x5c, TLB_DATA_4K_4M, 128 ), /* TLB_DATA 4 KByte and 4 MByte pages */ + TLB_ENTRY( 0x5d, TLB_DATA_4K_4M, 256 ), /* TLB_DATA 4 KByte and 4 MByte pages */ + TLB_ENTRY( 0x61, TLB_INST_4K, 48 ), /* TLB_INST 4 KByte pages, full associative */ + TLB_ENTRY( 0x63, TLB_DATA_1G_2M_4M, 4 ), /* TLB_DATA 1 GByte pages, 4-way set associative + * (plus 32 entries TLB_DATA 2 MByte or 4 MByte pages, not encoded here) */ + TLB_ENTRY( 0x6b, TLB_DATA_4K, 256 ), /* TLB_DATA 4 KByte pages, 8-way associative */ + TLB_ENTRY( 0x6c, TLB_DATA_2M_4M, 128 ), /* TLB_DATA 2 MByte or 4 MByte pages, 8-way associative */ + TLB_ENTRY( 0x6d, TLB_DATA_1G, 16 ), /* TLB_DATA 1 GByte pages, fully associative */ + TLB_ENTRY( 0x76, TLB_INST_2M_4M, 8 ), /* TLB_INST 2-MByte or 4-MByte pages, fully associative */ + TLB_ENTRY( 0xb0, TLB_INST_4K, 128 ), /* TLB_INST 4 KByte pages, 4-way set associative */ + TLB_ENTRY( 0xb1, TLB_INST_2M_4M, 4 ), /* TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries */ + TLB_ENTRY( 0xb2, TLB_INST_4K, 64 ), /* TLB_INST 4KByte pages, 4-way set associative */ + TLB_ENTRY( 0xb3, TLB_DATA_4K, 128 ), /* TLB_DATA 4 KByte pages, 4-way set associative */ + TLB_ENTRY( 0xb4, TLB_DATA_4K, 256 ), /* TLB_DATA 4 KByte pages, 4-way associative */ + TLB_ENTRY( 0xb5, TLB_INST_4K, 64 ), /* TLB_INST 4 KByte pages, 8-way set associative */ + TLB_ENTRY( 0xb6, TLB_INST_4K, 128 ), /* TLB_INST 4 KByte pages, 8-way set associative */ + TLB_ENTRY( 0xba, TLB_DATA_4K, 64 ), /* TLB_DATA 4 KByte pages, 4-way associative */ + TLB_ENTRY( 0xc0, TLB_DATA_4K_4M, 8 ), /* TLB_DATA 4 KByte and 4 MByte pages, 4-way associative */ + TLB_ENTRY( 0xc1, STLB_4K_2M, 1024 ), /* STLB 4 KByte and 2 MByte pages, 8-way associative */ + TLB_ENTRY( 0xc2, TLB_DATA_2M_4M, 16 ), /* TLB_DATA 2 MByte/4MByte pages, 4-way associative */ + TLB_ENTRY( 0xca, STLB_4K, 512 ), /* STLB 4 KByte pages, 4-way associative */ +}; diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 9651275aecd1..dfec2c61e354 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -153,8 +153,8 @@ static void geode_configure(void) u8 ccr3; local_irq_save(flags); - /* Suspend on halt power saving and enable #SUSP pin */ - setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88); + /* Suspend on halt power saving */ + setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x08); ccr3 = getCx86(CX86_CCR3); setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ diff --git a/arch/x86/kernel/cpu/debugfs.c b/arch/x86/kernel/cpu/debugfs.c index 10719aba6276..1976fef2dfe5 100644 --- a/arch/x86/kernel/cpu/debugfs.c +++ b/arch/x86/kernel/cpu/debugfs.c @@ -16,8 +16,8 @@ static int cpu_debug_show(struct seq_file *m, void *p) if (!c->initialized) return 0; - seq_printf(m, "initial_apicid: %x\n", c->topo.initial_apicid); - seq_printf(m, "apicid: %x\n", c->topo.apicid); + seq_printf(m, "initial_apicid: 0x%x\n", c->topo.initial_apicid); + seq_printf(m, "apicid: 0x%x\n", c->topo.apicid); seq_printf(m, "pkg_id: %u\n", c->topo.pkg_id); seq_printf(m, "die_id: %u\n", c->topo.die_id); seq_printf(m, "cu_id: %u\n", c->topo.cu_id); @@ -25,6 +25,7 @@ static int cpu_debug_show(struct seq_file *m, void *p) seq_printf(m, "cpu_type: %s\n", get_topology_cpu_type_name(c)); seq_printf(m, "logical_pkg_id: %u\n", c->topo.logical_pkg_id); seq_printf(m, "logical_die_id: %u\n", c->topo.logical_die_id); + seq_printf(m, "logical_core_id: %u\n", c->topo.logical_core_id); seq_printf(m, "llc_id: %u\n", c->topo.llc_id); seq_printf(m, "l2c_id: %u\n", c->topo.l2c_id); seq_printf(m, "amd_node_id: %u\n", c->topo.amd_node_id); diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c index 4a4118784c13..d69757246bde 100644 --- a/arch/x86/kernel/cpu/feat_ctl.c +++ b/arch/x86/kernel/cpu/feat_ctl.c @@ -4,6 +4,7 @@ #include <asm/cpu.h> #include <asm/cpufeature.h> #include <asm/msr-index.h> +#include <asm/msr.h> #include <asm/processor.h> #include <asm/vmx.h> @@ -118,7 +119,7 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c) bool enable_vmx; u64 msr; - if (rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr)) { + if (rdmsrq_safe(MSR_IA32_FEAT_CTL, &msr)) { clear_cpu_cap(c, X86_FEATURE_VMX); clear_cpu_cap(c, X86_FEATURE_SGX); return; @@ -165,7 +166,7 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c) msr |= FEAT_CTL_SGX_LC_ENABLED; } - wrmsrl(MSR_IA32_FEAT_CTL, msr); + wrmsrq(MSR_IA32_FEAT_CTL, msr); update_caps: set_cpu_cap(c, X86_FEATURE_MSR_IA32_FEAT_CTL); diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index c5191b06f9f2..1fda6c3a2b65 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -15,6 +15,8 @@ #include <asm/cacheinfo.h> #include <asm/spec-ctrl.h> #include <asm/delay.h> +#include <asm/msr.h> +#include <asm/resctrl.h> #include "cpu.h" @@ -96,7 +98,7 @@ static void bsp_init_hygon(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { u64 val; - rdmsrl(MSR_K7_HWCR, val); + rdmsrq(MSR_K7_HWCR, val); if (!(val & BIT(24))) pr_warn(FW_BUG "TSC doesn't count with P0 frequency!\n"); } @@ -110,12 +112,14 @@ static void bsp_init_hygon(struct cpuinfo_x86 *c) * Try to cache the base value so further operations can * avoid RMW. If that faults, do not enable SSBD. */ - if (!rdmsrl_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) { + if (!rdmsrq_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) { setup_force_cpu_cap(X86_FEATURE_LS_CFG_SSBD); setup_force_cpu_cap(X86_FEATURE_SSBD); x86_amd_ls_cfg_ssbd_mask = 1ULL << 10; } } + + resctrl_cpu_detect(c); } static void early_init_hygon(struct cpuinfo_x86 *c) @@ -194,7 +198,7 @@ static void init_hygon(struct cpuinfo_x86 *c) init_hygon_cacheinfo(c); if (cpu_has(c, X86_FEATURE_SVM)) { - rdmsrl(MSR_VM_CR, vm_cr); + rdmsrq(MSR_VM_CR, vm_cr); if (vm_cr & SVM_VM_CR_SVM_DIS_MASK) { pr_notice_once("SVM disabled (by BIOS) in MSR_VM_CR\n"); clear_cpu_cap(c, X86_FEATURE_SVM); @@ -240,26 +244,26 @@ static void cpu_detect_tlb_hygon(struct cpuinfo_x86 *c) cpuid(0x80000006, &eax, &ebx, &ecx, &edx); - tlb_lld_4k[ENTRIES] = (ebx >> 16) & mask; - tlb_lli_4k[ENTRIES] = ebx & mask; + tlb_lld_4k = (ebx >> 16) & mask; + tlb_lli_4k = ebx & mask; /* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ if (!((eax >> 16) & mask)) - tlb_lld_2m[ENTRIES] = (cpuid_eax(0x80000005) >> 16) & 0xff; + tlb_lld_2m = (cpuid_eax(0x80000005) >> 16) & 0xff; else - tlb_lld_2m[ENTRIES] = (eax >> 16) & mask; + tlb_lld_2m = (eax >> 16) & mask; /* a 4M entry uses two 2M entries */ - tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1; + tlb_lld_4m = tlb_lld_2m >> 1; /* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ if (!(eax & mask)) { cpuid(0x80000005, &eax, &ebx, &ecx, &edx); - tlb_lli_2m[ENTRIES] = eax & 0xff; + tlb_lli_2m = eax & 0xff; } else - tlb_lli_2m[ENTRIES] = eax & mask; + tlb_lli_2m = eax & mask; - tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1; + tlb_lli_4m = tlb_lli_2m >> 1; } static const struct cpu_dev hygon_cpu_dev = { diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 553bfbfc3a1b..f3e9219845e8 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -45,6 +45,9 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = #ifdef CONFIG_ACRN_GUEST &x86_hyper_acrn, #endif +#ifdef CONFIG_BHYVE_GUEST + &x86_hyper_bhyve, +#endif }; enum x86_hypervisor_type x86_hyper_type; diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index d1de300af173..98ae4c37c93e 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -1,40 +1,33 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/kernel.h> -#include <linux/pgtable.h> -#include <linux/string.h> #include <linux/bitops.h> -#include <linux/smp.h> -#include <linux/sched.h> -#include <linux/sched/clock.h> -#include <linux/thread_info.h> #include <linux/init.h> -#include <linux/uaccess.h> +#include <linux/kernel.h> +#include <linux/minmax.h> +#include <linux/smp.h> +#include <linux/string.h> +#include <linux/types.h> + +#ifdef CONFIG_X86_64 +#include <linux/topology.h> +#endif -#include <asm/cpufeature.h> -#include <asm/msr.h> #include <asm/bugs.h> +#include <asm/cpu_device_id.h> +#include <asm/cpufeature.h> #include <asm/cpu.h> +#include <asm/cpuid/api.h> +#include <asm/hwcap2.h> #include <asm/intel-family.h> #include <asm/microcode.h> -#include <asm/hwcap2.h> -#include <asm/elf.h> -#include <asm/cpu_device_id.h> -#include <asm/resctrl.h> +#include <asm/msr.h> #include <asm/numa.h> +#include <asm/resctrl.h> #include <asm/thermal.h> - -#ifdef CONFIG_X86_64 -#include <linux/topology.h> -#endif +#include <asm/uaccess.h> #include "cpu.h" -#ifdef CONFIG_X86_LOCAL_APIC -#include <asm/mpspec.h> -#include <asm/apic.h> -#endif - /* * Processors which have self-snooping capability can handle conflicting * memory type across CPUs by snooping its own cache. However, there exists @@ -166,7 +159,7 @@ static void detect_tme_early(struct cpuinfo_x86 *c) u64 tme_activate; int keyid_bits; - rdmsrl(MSR_IA32_TME_ACTIVATE, tme_activate); + rdmsrq(MSR_IA32_TME_ACTIVATE, tme_activate); if (!TME_ACTIVATE_LOCKED(tme_activate) || !TME_ACTIVATE_ENABLED(tme_activate)) { pr_info_once("x86/tme: not enabled by BIOS\n"); @@ -195,7 +188,7 @@ void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c) if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return; - if (c->x86 < 6 || (c->x86 == 6 && c->x86_model < 0xd)) + if (c->x86_vfm < INTEL_PENTIUM_M_DOTHAN) return; /* @@ -210,10 +203,6 @@ static void early_init_intel(struct cpuinfo_x86 *c) { u64 misc_enable; - if ((c->x86 == 0xf && c->x86_model >= 0x03) || - (c->x86 == 0x6 && c->x86_model >= 0x0e)) - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); - if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64)) c->microcode = intel_get_microcode_revision(); @@ -256,8 +245,8 @@ static void early_init_intel(struct cpuinfo_x86 *c) #endif /* CPUID workaround for 0F33/0F34 CPU */ - if (c->x86 == 0xF && c->x86_model == 0x3 - && (c->x86_stepping == 0x3 || c->x86_stepping == 0x4)) + if (c->x86_vfm == INTEL_P4_PRESCOTT && + (c->x86_stepping == 0x3 || c->x86_stepping == 0x4)) c->x86_phys_bits = 36; /* @@ -266,10 +255,16 @@ static void early_init_intel(struct cpuinfo_x86 *c) * * It is also reliable across cores and sockets. (but not across * cabinets - we turn it off in that case explicitly.) + * + * Use a model-specific check for some older CPUs that have invariant + * TSC but may not report it architecturally via 8000_0007. */ if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); + } else if ((c->x86_vfm >= INTEL_P4_PRESCOTT && c->x86_vfm <= INTEL_P4_CEDARMILL) || + (c->x86_vfm >= INTEL_CORE_YONAH && c->x86_vfm <= INTEL_IVYBRIDGE)) { + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); } /* Penwell and Cloverview have the TSC which doesn't sleep on S3 */ @@ -298,12 +293,19 @@ static void early_init_intel(struct cpuinfo_x86 *c) clear_cpu_cap(c, X86_FEATURE_PAT); /* - * If fast string is not enabled in IA32_MISC_ENABLE for any reason, - * clear the fast string and enhanced fast string CPU capabilities. + * Modern CPUs are generally expected to have a sane fast string + * implementation. However, BIOSes typically have a knob to tweak + * the architectural MISC_ENABLE.FAST_STRING enable bit. + * + * Adhere to the preference and program the Linux-defined fast + * string flag and enhanced fast string capabilities accordingly. */ - if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) { - rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); - if (!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) { + if (c->x86_vfm >= INTEL_PENTIUM_M_DOTHAN) { + rdmsrq(MSR_IA32_MISC_ENABLE, misc_enable); + if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) { + /* X86_FEATURE_ERMS is set based on CPUID */ + set_cpu_cap(c, X86_FEATURE_REP_GOOD); + } else { pr_info("Disabled fast string operations\n"); setup_clear_cpu_cap(X86_FEATURE_REP_GOOD); setup_clear_cpu_cap(X86_FEATURE_ERMS); @@ -350,9 +352,7 @@ static void bsp_init_intel(struct cpuinfo_x86 *c) int ppro_with_ram_bug(void) { /* Uses data from early_cpu_detect now */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && - boot_cpu_data.x86 == 6 && - boot_cpu_data.x86_model == 1 && + if (boot_cpu_data.x86_vfm == INTEL_PENTIUM_PRO && boot_cpu_data.x86_stepping < 8) { pr_info("Pentium Pro with Errata#50 detected. Taking evasive action.\n"); return 1; @@ -369,9 +369,8 @@ static void intel_smp_check(struct cpuinfo_x86 *c) /* * Mask B, Pentium, but not Pentium MMX */ - if (c->x86 == 5 && - c->x86_stepping >= 1 && c->x86_stepping <= 4 && - c->x86_model <= 3) { + if (c->x86_vfm >= INTEL_FAM5_START && c->x86_vfm < INTEL_PENTIUM_MMX && + c->x86_stepping >= 1 && c->x86_stepping <= 4) { /* * Remember we have B step Pentia with bugs */ @@ -398,7 +397,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c) * The Quark is also family 5, but does not have the same bug. */ clear_cpu_bug(c, X86_BUG_F00F); - if (c->x86 == 5 && c->x86_model < 9) { + if (c->x86_vfm >= INTEL_FAM5_START && c->x86_vfm < INTEL_QUARK_X1000) { static int f00f_workaround_enabled; set_cpu_bug(c, X86_BUG_F00F); @@ -413,7 +412,8 @@ static void intel_workarounds(struct cpuinfo_x86 *c) * SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until * model 3 mask 3 */ - if ((c->x86<<8 | c->x86_model<<4 | c->x86_stepping) < 0x633) + if ((c->x86_vfm == INTEL_PENTIUM_II_KLAMATH && c->x86_stepping < 3) || + c->x86_vfm < INTEL_PENTIUM_II_KLAMATH) clear_cpu_cap(c, X86_FEATURE_SEP); /* @@ -431,7 +431,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c) * P4 Xeon erratum 037 workaround. * Hardware prefetcher may cause stale data to be loaded into the cache. */ - if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_stepping == 1)) { + if (c->x86_vfm == INTEL_P4_WILLAMETTE && c->x86_stepping == 1) { if (msr_set_bit(MSR_IA32_MISC_ENABLE, MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT) > 0) { pr_info("CPU: C0 stepping P4 Xeon detected.\n"); @@ -445,27 +445,20 @@ static void intel_workarounds(struct cpuinfo_x86 *c) * integrated APIC (see 11AP erratum in "Pentium Processor * Specification Update"). */ - if (boot_cpu_has(X86_FEATURE_APIC) && (c->x86<<8 | c->x86_model<<4) == 0x520 && + if (boot_cpu_has(X86_FEATURE_APIC) && c->x86_vfm == INTEL_PENTIUM_75 && (c->x86_stepping < 0x6 || c->x86_stepping == 0xb)) set_cpu_bug(c, X86_BUG_11AP); - #ifdef CONFIG_X86_INTEL_USERCOPY /* - * Set up the preferred alignment for movsl bulk memory moves + * MOVSL bulk memory moves can be slow when source and dest are not + * both 8-byte aligned. PII/PIII only like MOVSL with 8-byte alignment. + * + * Set the preferred alignment for Pentium Pro and newer processors, as + * it has only been tested on these. */ - switch (c->x86) { - case 4: /* 486: untested */ - break; - case 5: /* Old Pentia: untested */ - break; - case 6: /* PII/PIII only like movsl with 8-byte alignment */ + if (c->x86_vfm >= INTEL_PENTIUM_PRO) movsl_mask.mask = 7; - break; - case 15: /* P4 is OK down to 8-byte alignment */ - movsl_mask.mask = 7; - break; - } #endif intel_smp_check(c); @@ -497,7 +490,7 @@ static void init_cpuid_fault(struct cpuinfo_x86 *c) { u64 msr; - if (!rdmsrl_safe(MSR_PLATFORM_INFO, &msr)) { + if (!rdmsrq_safe(MSR_PLATFORM_INFO, &msr)) { if (msr & MSR_PLATFORM_INFO_CPUID_FAULT) set_cpu_cap(c, X86_FEATURE_CPUID_FAULT); } @@ -507,7 +500,7 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c) { u64 msr; - if (rdmsrl_safe(MSR_MISC_FEATURES_ENABLES, &msr)) + if (rdmsrq_safe(MSR_MISC_FEATURES_ENABLES, &msr)) return; /* Clear all MISC features */ @@ -518,9 +511,28 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c) probe_xeon_phi_r3mwait(c); msr = this_cpu_read(msr_misc_features_shadow); - wrmsrl(MSR_MISC_FEATURES_ENABLES, msr); + wrmsrq(MSR_MISC_FEATURES_ENABLES, msr); } +/* + * This is a list of Intel CPUs that are known to suffer from downclocking when + * ZMM registers (512-bit vectors) are used. On these CPUs, when the kernel + * executes SIMD-optimized code such as cryptography functions or CRCs, it + * should prefer 256-bit (YMM) code to 512-bit (ZMM) code. + */ +static const struct x86_cpu_id zmm_exclusion_list[] = { + X86_MATCH_VFM(INTEL_SKYLAKE_X, 0), + X86_MATCH_VFM(INTEL_ICELAKE_X, 0), + X86_MATCH_VFM(INTEL_ICELAKE_D, 0), + X86_MATCH_VFM(INTEL_ICELAKE, 0), + X86_MATCH_VFM(INTEL_ICELAKE_L, 0), + X86_MATCH_VFM(INTEL_ICELAKE_NNPI, 0), + X86_MATCH_VFM(INTEL_TIGERLAKE_L, 0), + X86_MATCH_VFM(INTEL_TIGERLAKE, 0), + /* Allow Rocket Lake and later, and Sapphire Rapids and later. */ + {}, +}; + static void init_intel(struct cpuinfo_x86 *c) { early_init_intel(c); @@ -555,14 +567,14 @@ static void init_intel(struct cpuinfo_x86 *c) c->x86_vfm == INTEL_WESTMERE_EX)) set_cpu_bug(c, X86_BUG_CLFLUSH_MONITOR); - if (boot_cpu_has(X86_FEATURE_MWAIT) && c->x86_vfm == INTEL_ATOM_GOLDMONT) + if (boot_cpu_has(X86_FEATURE_MWAIT) && + (c->x86_vfm == INTEL_ATOM_GOLDMONT || + c->x86_vfm == INTEL_LUNARLAKE_M)) set_cpu_bug(c, X86_BUG_MONITOR); #ifdef CONFIG_X86_64 if (c->x86 == 15) c->x86_cache_alignment = c->x86_clflush_size * 2; - if (c->x86 == 6) - set_cpu_cap(c, X86_FEATURE_REP_GOOD); #else /* * Names for the Pentium II/Celeron processors @@ -597,13 +609,11 @@ static void init_intel(struct cpuinfo_x86 *c) if (p) strcpy(c->x86_model_id, p); } - - if (c->x86 == 15) - set_cpu_cap(c, X86_FEATURE_P4); - if (c->x86 == 6) - set_cpu_cap(c, X86_FEATURE_P3); #endif + if (x86_match_cpu(zmm_exclusion_list)) + set_cpu_cap(c, X86_FEATURE_PREFER_YMM); + /* Work around errata */ srat_detect_node(c); @@ -625,191 +635,90 @@ static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) * to determine which, so we use a boottime override * for the 512kb model, and assume 256 otherwise. */ - if ((c->x86 == 6) && (c->x86_model == 11) && (size == 0)) + if (c->x86_vfm == INTEL_PENTIUM_III_TUALATIN && size == 0) size = 256; /* * Intel Quark SoC X1000 contains a 4-way set associative * 16K cache with a 16 byte cache line and 256 lines per tag */ - if ((c->x86 == 5) && (c->x86_model == 9)) + if (c->x86_vfm == INTEL_QUARK_X1000) size = 16; return size; } #endif -#define TLB_INST_4K 0x01 -#define TLB_INST_4M 0x02 -#define TLB_INST_2M_4M 0x03 - -#define TLB_INST_ALL 0x05 -#define TLB_INST_1G 0x06 - -#define TLB_DATA_4K 0x11 -#define TLB_DATA_4M 0x12 -#define TLB_DATA_2M_4M 0x13 -#define TLB_DATA_4K_4M 0x14 - -#define TLB_DATA_1G 0x16 - -#define TLB_DATA0_4K 0x21 -#define TLB_DATA0_4M 0x22 -#define TLB_DATA0_2M_4M 0x23 - -#define STLB_4K 0x41 -#define STLB_4K_2M 0x42 - -static const struct _tlb_table intel_tlb_table[] = { - { 0x01, TLB_INST_4K, 32, " TLB_INST 4 KByte pages, 4-way set associative" }, - { 0x02, TLB_INST_4M, 2, " TLB_INST 4 MByte pages, full associative" }, - { 0x03, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way set associative" }, - { 0x04, TLB_DATA_4M, 8, " TLB_DATA 4 MByte pages, 4-way set associative" }, - { 0x05, TLB_DATA_4M, 32, " TLB_DATA 4 MByte pages, 4-way set associative" }, - { 0x0b, TLB_INST_4M, 4, " TLB_INST 4 MByte pages, 4-way set associative" }, - { 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages" }, - { 0x50, TLB_INST_ALL, 64, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, - { 0x51, TLB_INST_ALL, 128, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, - { 0x52, TLB_INST_ALL, 256, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, - { 0x55, TLB_INST_2M_4M, 7, " TLB_INST 2-MByte or 4-MByte pages, fully associative" }, - { 0x56, TLB_DATA0_4M, 16, " TLB_DATA0 4 MByte pages, 4-way set associative" }, - { 0x57, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, 4-way associative" }, - { 0x59, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, fully associative" }, - { 0x5a, TLB_DATA0_2M_4M, 32, " TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative" }, - { 0x5b, TLB_DATA_4K_4M, 64, " TLB_DATA 4 KByte and 4 MByte pages" }, - { 0x5c, TLB_DATA_4K_4M, 128, " TLB_DATA 4 KByte and 4 MByte pages" }, - { 0x5d, TLB_DATA_4K_4M, 256, " TLB_DATA 4 KByte and 4 MByte pages" }, - { 0x61, TLB_INST_4K, 48, " TLB_INST 4 KByte pages, full associative" }, - { 0x63, TLB_DATA_1G, 4, " TLB_DATA 1 GByte pages, 4-way set associative" }, - { 0x6b, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 8-way associative" }, - { 0x6c, TLB_DATA_2M_4M, 128, " TLB_DATA 2 MByte or 4 MByte pages, 8-way associative" }, - { 0x6d, TLB_DATA_1G, 16, " TLB_DATA 1 GByte pages, fully associative" }, - { 0x76, TLB_INST_2M_4M, 8, " TLB_INST 2-MByte or 4-MByte pages, fully associative" }, - { 0xb0, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 4-way set associative" }, - { 0xb1, TLB_INST_2M_4M, 4, " TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries" }, - { 0xb2, TLB_INST_4K, 64, " TLB_INST 4KByte pages, 4-way set associative" }, - { 0xb3, TLB_DATA_4K, 128, " TLB_DATA 4 KByte pages, 4-way set associative" }, - { 0xb4, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 4-way associative" }, - { 0xb5, TLB_INST_4K, 64, " TLB_INST 4 KByte pages, 8-way set associative" }, - { 0xb6, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 8-way set associative" }, - { 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" }, - { 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" }, - { 0xc1, STLB_4K_2M, 1024, " STLB 4 KByte and 2 MByte pages, 8-way associative" }, - { 0xc2, TLB_DATA_2M_4M, 16, " TLB_DATA 2 MByte/4MByte pages, 4-way associative" }, - { 0xca, STLB_4K, 512, " STLB 4 KByte pages, 4-way associative" }, - { 0x00, 0, 0 } -}; - -static void intel_tlb_lookup(const unsigned char desc) +static void intel_tlb_lookup(const struct leaf_0x2_table *desc) { - unsigned char k; - if (desc == 0) - return; - - /* look up this descriptor in the table */ - for (k = 0; intel_tlb_table[k].descriptor != desc && - intel_tlb_table[k].descriptor != 0; k++) - ; - - if (intel_tlb_table[k].tlb_type == 0) - return; + short entries = desc->entries; - switch (intel_tlb_table[k].tlb_type) { + switch (desc->t_type) { case STLB_4K: - if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; + tlb_lli_4k = max(tlb_lli_4k, entries); + tlb_lld_4k = max(tlb_lld_4k, entries); break; case STLB_4K_2M: - if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + tlb_lli_4k = max(tlb_lli_4k, entries); + tlb_lld_4k = max(tlb_lld_4k, entries); + tlb_lli_2m = max(tlb_lli_2m, entries); + tlb_lld_2m = max(tlb_lld_2m, entries); + tlb_lli_4m = max(tlb_lli_4m, entries); + tlb_lld_4m = max(tlb_lld_4m, entries); break; case TLB_INST_ALL: - if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; + tlb_lli_4k = max(tlb_lli_4k, entries); + tlb_lli_2m = max(tlb_lli_2m, entries); + tlb_lli_4m = max(tlb_lli_4m, entries); break; case TLB_INST_4K: - if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; + tlb_lli_4k = max(tlb_lli_4k, entries); break; case TLB_INST_4M: - if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; + tlb_lli_4m = max(tlb_lli_4m, entries); break; case TLB_INST_2M_4M: - if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; + tlb_lli_2m = max(tlb_lli_2m, entries); + tlb_lli_4m = max(tlb_lli_4m, entries); break; case TLB_DATA_4K: case TLB_DATA0_4K: - if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; + tlb_lld_4k = max(tlb_lld_4k, entries); break; case TLB_DATA_4M: case TLB_DATA0_4M: - if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + tlb_lld_4m = max(tlb_lld_4m, entries); break; case TLB_DATA_2M_4M: case TLB_DATA0_2M_4M: - if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + tlb_lld_2m = max(tlb_lld_2m, entries); + tlb_lld_4m = max(tlb_lld_4m, entries); break; case TLB_DATA_4K_4M: - if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + tlb_lld_4k = max(tlb_lld_4k, entries); + tlb_lld_4m = max(tlb_lld_4m, entries); break; + case TLB_DATA_1G_2M_4M: + tlb_lld_2m = max(tlb_lld_2m, TLB_0x63_2M_4M_ENTRIES); + tlb_lld_4m = max(tlb_lld_4m, TLB_0x63_2M_4M_ENTRIES); + fallthrough; case TLB_DATA_1G: - if (tlb_lld_1g[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_1g[ENTRIES] = intel_tlb_table[k].entries; + tlb_lld_1g = max(tlb_lld_1g, entries); break; } } static void intel_detect_tlb(struct cpuinfo_x86 *c) { - int i, j, n; - unsigned int regs[4]; - unsigned char *desc = (unsigned char *)regs; + const struct leaf_0x2_table *desc; + union leaf_0x2_regs regs; + u8 *ptr; if (c->cpuid_level < 2) return; - /* Number of times to iterate */ - n = cpuid_eax(2) & 0xFF; - - for (i = 0 ; i < n ; i++) { - cpuid(2, ®s[0], ®s[1], ®s[2], ®s[3]); - - /* If bit 31 is set, this is an unknown format */ - for (j = 0 ; j < 3 ; j++) - if (regs[j] & (1 << 31)) - regs[j] = 0; - - /* Byte 0 is level count, not a descriptor */ - for (j = 1 ; j < 16 ; j++) - intel_tlb_lookup(desc[j]); - } + cpuid_leaf_0x2(®s); + for_each_cpuid_0x2_desc(regs, ptr, desc) + intel_tlb_lookup(desc); } static const struct cpu_dev intel_cpu_dev = { @@ -876,34 +785,3 @@ static const struct cpu_dev intel_cpu_dev = { }; cpu_dev_register(intel_cpu_dev); - -#define X86_HYBRID_CPU_TYPE_ID_SHIFT 24 - -/** - * get_this_hybrid_cpu_type() - Get the type of this hybrid CPU - * - * Returns the CPU type [31:24] (i.e., Atom or Core) of a CPU in - * a hybrid processor. If the processor is not hybrid, returns 0. - */ -u8 get_this_hybrid_cpu_type(void) -{ - if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) - return 0; - - return cpuid_eax(0x0000001a) >> X86_HYBRID_CPU_TYPE_ID_SHIFT; -} - -/** - * get_this_hybrid_cpu_native_id() - Get the native id of this hybrid CPU - * - * Returns the uarch native ID [23:0] of a CPU in a hybrid processor. - * If the processor is not hybrid, returns 0. - */ -u32 get_this_hybrid_cpu_native_id(void) -{ - if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) - return 0; - - return cpuid_eax(0x0000001a) & - (BIT_ULL(X86_HYBRID_CPU_TYPE_ID_SHIFT) - 1); -} diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c index 30b1d63b97f3..2c56f8730f59 100644 --- a/arch/x86/kernel/cpu/intel_epb.c +++ b/arch/x86/kernel/cpu/intel_epb.c @@ -75,11 +75,11 @@ static u8 energ_perf_values[] = { [EPB_INDEX_POWERSAVE] = ENERGY_PERF_BIAS_POWERSAVE, }; -static int intel_epb_save(void) +static int intel_epb_save(void *data) { u64 epb; - rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); + rdmsrq(MSR_IA32_ENERGY_PERF_BIAS, epb); /* * Ensure that saved_epb will always be nonzero after this write even if * the EPB value read from the MSR is 0. @@ -89,12 +89,12 @@ static int intel_epb_save(void) return 0; } -static void intel_epb_restore(void) +static void intel_epb_restore(void *data) { u64 val = this_cpu_read(saved_epb); u64 epb; - rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); + rdmsrq(MSR_IA32_ENERGY_PERF_BIAS, epb); if (val) { val &= EPB_MASK; } else { @@ -111,14 +111,18 @@ static void intel_epb_restore(void) pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n"); } } - wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, (epb & ~EPB_MASK) | val); + wrmsrq(MSR_IA32_ENERGY_PERF_BIAS, (epb & ~EPB_MASK) | val); } -static struct syscore_ops intel_epb_syscore_ops = { +static const struct syscore_ops intel_epb_syscore_ops = { .suspend = intel_epb_save, .resume = intel_epb_restore, }; +static struct syscore intel_epb_syscore = { + .ops = &intel_epb_syscore_ops, +}; + static const char * const energy_perf_strings[] = { [EPB_INDEX_PERFORMANCE] = "performance", [EPB_INDEX_BALANCE_PERFORMANCE] = "balance-performance", @@ -135,7 +139,7 @@ static ssize_t energy_perf_bias_show(struct device *dev, u64 epb; int ret; - ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); + ret = rdmsrq_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); if (ret < 0) return ret; @@ -157,11 +161,11 @@ static ssize_t energy_perf_bias_store(struct device *dev, else if (kstrtou64(buf, 0, &val) || val > MAX_EPB) return -EINVAL; - ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); + ret = rdmsrq_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); if (ret < 0) return ret; - ret = wrmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, + ret = wrmsrq_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, (epb & ~EPB_MASK) | val); if (ret < 0) return ret; @@ -185,7 +189,7 @@ static int intel_epb_online(unsigned int cpu) { struct device *cpu_dev = get_cpu_device(cpu); - intel_epb_restore(); + intel_epb_restore(NULL); if (!cpuhp_tasks_frozen) sysfs_merge_group(&cpu_dev->kobj, &intel_epb_attr_group); @@ -199,7 +203,7 @@ static int intel_epb_offline(unsigned int cpu) if (!cpuhp_tasks_frozen) sysfs_unmerge_group(&cpu_dev->kobj, &intel_epb_attr_group); - intel_epb_save(); + intel_epb_save(NULL); return 0; } @@ -230,7 +234,7 @@ static __init int intel_epb_init(void) if (ret < 0) goto err_out_online; - register_syscore_ops(&intel_epb_syscore_ops); + register_syscore(&intel_epb_syscore); return 0; err_out_online: diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index 8e7de733320a..6af1e8baeb0f 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -6,7 +6,35 @@ #include <linux/slab.h> /** - * x86_match_cpu - match current CPU again an array of x86_cpu_ids + * x86_match_vendor_cpu_type - helper function to match the hardware defined + * cpu-type for a single entry in the x86_cpu_id + * table. Note, this function does not match the + * generic cpu-types TOPO_CPU_TYPE_EFFICIENCY and + * TOPO_CPU_TYPE_PERFORMANCE. + * @c: Pointer to the cpuinfo_x86 structure of the CPU to match. + * @m: Pointer to the x86_cpu_id entry to match against. + * + * Return: true if the cpu-type matches, false otherwise. + */ +static bool x86_match_vendor_cpu_type(struct cpuinfo_x86 *c, const struct x86_cpu_id *m) +{ + if (m->type == X86_CPU_TYPE_ANY) + return true; + + /* Hybrid CPUs are special, they are assumed to match all cpu-types */ + if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) + return true; + + if (c->x86_vendor == X86_VENDOR_INTEL) + return m->type == c->topo.intel_type; + if (c->x86_vendor == X86_VENDOR_AMD) + return m->type == c->topo.amd_type; + + return false; +} + +/** + * x86_match_cpu - match current CPU against an array of x86_cpu_ids * @match: Pointer to array of x86_cpu_ids. Last entry terminated with * {}. * @@ -50,39 +78,21 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match) continue; if (m->feature != X86_FEATURE_ANY && !cpu_has(c, m->feature)) continue; - return m; - } - return NULL; -} -EXPORT_SYMBOL(x86_match_cpu); - -static const struct x86_cpu_desc * -x86_match_cpu_with_stepping(const struct x86_cpu_desc *match) -{ - struct cpuinfo_x86 *c = &boot_cpu_data; - const struct x86_cpu_desc *m; - - for (m = match; m->x86_family | m->x86_model; m++) { - if (c->x86_vendor != m->x86_vendor) - continue; - if (c->x86 != m->x86_family) - continue; - if (c->x86_model != m->x86_model) - continue; - if (c->x86_stepping != m->x86_stepping) + if (!x86_match_vendor_cpu_type(c, m)) continue; return m; } return NULL; } +EXPORT_SYMBOL(x86_match_cpu); -bool x86_cpu_has_min_microcode_rev(const struct x86_cpu_desc *table) +bool x86_match_min_microcode_rev(const struct x86_cpu_id *table) { - const struct x86_cpu_desc *res = x86_match_cpu_with_stepping(table); + const struct x86_cpu_id *res = x86_match_cpu(table); - if (!res || res->x86_microcode_rev > boot_cpu_data.microcode) + if (!res || res->driver_data > boot_cpu_data.microcode) return false; return true; } -EXPORT_SYMBOL_GPL(x86_cpu_has_min_microcode_rev); +EXPORT_SYMBOL_GPL(x86_match_min_microcode_rev); diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 6ca80fff1fea..3f1dda355307 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -4,8 +4,6 @@ * * Written by Jacob Shin - AMD, Inc. * Maintained by: Borislav Petkov <bp@alien8.de> - * - * All MC4_MISCi registers are shared between cores on a node. */ #include <linux/interrupt.h> #include <linux/notifier.h> @@ -20,7 +18,6 @@ #include <linux/smp.h> #include <linux/string.h> -#include <asm/amd_nb.h> #include <asm/traps.h> #include <asm/apic.h> #include <asm/mce.h> @@ -46,9 +43,6 @@ /* Deferred error settings */ #define MSR_CU_DEF_ERR 0xC0000410 #define MASK_DEF_LVTOFF 0x000000F0 -#define MASK_DEF_INT_TYPE 0x00000006 -#define DEF_LVT_OFF 0x2 -#define DEF_INT_TYPE_APIC 0x2 /* Scalable MCA: */ @@ -57,6 +51,17 @@ static bool thresholding_irq_en; +struct mce_amd_cpu_data { + mce_banks_t thr_intr_banks; + mce_banks_t dfr_intr_banks; + + u32 thr_intr_en: 1, + dfr_intr_en: 1, + __resv: 30; +}; + +static DEFINE_PER_CPU_READ_MOSTLY(struct mce_amd_cpu_data, mce_amd_data); + static const char * const th_names[] = { "load_store", "insn_fetch", @@ -82,6 +87,8 @@ struct smca_bank { const struct smca_hwid *hwid; u32 id; /* Value of MCA_IPID[InstanceId]. */ u8 sysfs_id; /* Value used for sysfs name. */ + u64 paddrv :1, /* Physical Address Valid bit in MCA_CONFIG */ + __reserved :63; }; static DEFINE_PER_CPU_READ_MOSTLY(struct smca_bank[MAX_NR_BANKS], smca_banks); @@ -221,6 +228,33 @@ static const struct smca_hwid smca_hwid_mcatypes[] = { #define MAX_MCATYPE_NAME_LEN 30 static char buf_mcatype[MAX_MCATYPE_NAME_LEN]; +struct threshold_block { + /* This block's number within its bank. */ + unsigned int block; + /* MCA bank number that contains this block. */ + unsigned int bank; + /* CPU which controls this block's MCA bank. */ + unsigned int cpu; + /* MCA_MISC MSR address for this block. */ + u32 address; + /* Enable/Disable APIC interrupt. */ + bool interrupt_enable; + /* Bank can generate an interrupt. */ + bool interrupt_capable; + /* Value upon which threshold interrupt is generated. */ + u16 threshold_limit; + /* sysfs object */ + struct kobject kobj; + /* List of threshold blocks within this block's MCA bank. */ + struct list_head miscj; +}; + +struct threshold_bank { + struct kobject *kobj; + /* List of threshold blocks within this MCA bank. */ + struct list_head miscj; +}; + static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); /* @@ -229,9 +263,6 @@ static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); */ static DEFINE_PER_CPU(u64, bank_map); -/* Map of banks that have more than MCA_MISC0 available. */ -static DEFINE_PER_CPU(u64, smca_misc_banks_map); - static void amd_threshold_interrupt(void); static void amd_deferred_error_interrupt(void); @@ -241,30 +272,9 @@ static void default_deferred_error_interrupt(void) } void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt; -static void smca_set_misc_banks_map(unsigned int bank, unsigned int cpu) -{ - u32 low, high; - - /* - * For SMCA enabled processors, BLKPTR field of the first MISC register - * (MCx_MISC0) indicates presence of additional MISC regs set (MISC1-4). - */ - if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) - return; - - if (!(low & MCI_CONFIG_MCAX)) - return; - - if (rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high)) - return; - - if (low & MASK_BLKPTR_LO) - per_cpu(smca_misc_banks_map, cpu) |= BIT_ULL(bank); - -} - static void smca_configure(unsigned int bank, unsigned int cpu) { + struct mce_amd_cpu_data *data = this_cpu_ptr(&mce_amd_data); u8 *bank_counts = this_cpu_ptr(smca_bank_counts); const struct smca_hwid *s_hwid; unsigned int i, hwid_mcatype; @@ -295,16 +305,36 @@ static void smca_configure(unsigned int bank, unsigned int cpu) * APIC based interrupt. First, check that no interrupt has been * set. */ - if ((low & BIT(5)) && !((high >> 5) & 0x3)) + if ((low & BIT(5)) && !((high >> 5) & 0x3) && data->dfr_intr_en) { + __set_bit(bank, data->dfr_intr_banks); high |= BIT(5); + } + + /* + * SMCA Corrected Error Interrupt + * + * MCA_CONFIG[IntPresent] is bit 10, and tells us if the bank can + * send an MCA Thresholding interrupt without the OS initializing + * this feature. This can be used if the threshold limit is managed + * by the platform. + * + * MCA_CONFIG[IntEn] is bit 40 (8 in the high portion of the MSR). + * The OS should set this to inform the platform that the OS is ready + * to handle the MCA Thresholding interrupt. + */ + if ((low & BIT(10)) && data->thr_intr_en) { + __set_bit(bank, data->thr_intr_banks); + high |= BIT(8); + } this_cpu_ptr(mce_banks_array)[bank].lsb_in_status = !!(low & BIT(8)); + if (low & MCI_CONFIG_PADDRV) + this_cpu_ptr(smca_banks)[bank].paddrv = 1; + wrmsr(smca_config, low, high); } - smca_set_misc_banks_map(bank, cpu); - if (rdmsr_safe(MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) { pr_warn("Failed to read MCA_IPID for bank %d\n", bank); return; @@ -327,25 +357,11 @@ static void smca_configure(unsigned int bank, unsigned int cpu) struct thresh_restart { struct threshold_block *b; - int reset; int set_lvt_off; int lvt_off; u16 old_limit; }; -static inline bool is_shared_bank(int bank) -{ - /* - * Scalable MCA provides for only one core to have access to the MSRs of - * a shared bank. - */ - if (mce_flags.smca) - return false; - - /* Bank 4 is for northbridge reporting and is thus shared */ - return (bank == 4); -} - static const char *bank4_names(const struct threshold_block *b) { switch (b->address) { @@ -381,37 +397,37 @@ static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits) return msr_high_bits & BIT(28); } -static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) +static bool lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) { int msr = (hi & MASK_LVTOFF_HI) >> 20; + /* + * On SMCA CPUs, LVT offset is programmed at a different MSR, and + * the BIOS provides the value. The original field where LVT offset + * was set is reserved. Return early here: + */ + if (mce_flags.smca) + return false; + if (apic < 0) { pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt " "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu, b->bank, b->block, b->address, hi, lo); - return 0; + return false; } if (apic != msr) { - /* - * On SMCA CPUs, LVT offset is programmed at a different MSR, and - * the BIOS provides the value. The original field where LVT offset - * was set is reserved. Return early here: - */ - if (mce_flags.smca) - return 0; - pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d " "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu, apic, b->bank, b->block, b->address, hi, lo); - return 0; + return false; } - return 1; + return true; }; -/* Reprogram MCx_MISC MSR behind this threshold bank. */ -static void threshold_restart_bank(void *_tr) +/* Reprogram MCx_MISC MSR behind this threshold block. */ +static void threshold_restart_block(void *_tr) { struct thresh_restart *tr = _tr; u32 hi, lo; @@ -422,13 +438,13 @@ static void threshold_restart_bank(void *_tr) rdmsr(tr->b->address, lo, hi); - if (tr->b->threshold_limit < (hi & THRESHOLD_MAX)) - tr->reset = 1; /* limit cannot be lower than err count */ - - if (tr->reset) { /* reset err count and overflow bit */ - hi = - (hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | - (THRESHOLD_MAX - tr->b->threshold_limit); + /* + * Reset error count and overflow bit. + * This is done during init or after handling an interrupt. + */ + if (hi & MASK_OVERFLOW_HI || tr->set_lvt_off) { + hi &= ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI); + hi |= THRESHOLD_MAX - tr->b->threshold_limit; } else if (tr->old_limit) { /* change limit w/o reset */ int new_count = (hi & THRESHOLD_MAX) + (tr->old_limit - tr->b->threshold_limit); @@ -460,6 +476,36 @@ static void threshold_restart_bank(void *_tr) wrmsr(tr->b->address, lo, hi); } +static void threshold_restart_bank(unsigned int bank, bool intr_en) +{ + struct threshold_bank **thr_banks = this_cpu_read(threshold_banks); + struct threshold_block *block, *tmp; + struct thresh_restart tr; + + if (!thr_banks || !thr_banks[bank]) + return; + + memset(&tr, 0, sizeof(tr)); + + list_for_each_entry_safe(block, tmp, &thr_banks[bank]->miscj, miscj) { + tr.b = block; + tr.b->interrupt_enable = intr_en; + threshold_restart_block(&tr); + } +} + +/* Try to use the threshold limit reported through APEI. */ +static u16 get_thr_limit(void) +{ + u32 thr_limit = mce_get_apei_thr_limit(); + + /* Fallback to old default if APEI limit is not available. */ + if (!thr_limit) + return THRESHOLD_MAX; + + return min(thr_limit, THRESHOLD_MAX); +} + static void mce_threshold_block_init(struct threshold_block *b, int offset) { struct thresh_restart tr = { @@ -468,8 +514,8 @@ static void mce_threshold_block_init(struct threshold_block *b, int offset) .lvt_off = offset, }; - b->threshold_limit = THRESHOLD_MAX; - threshold_restart_bank(&tr); + b->threshold_limit = get_thr_limit(); + threshold_restart_block(&tr); }; static int setup_APIC_mce_threshold(int reserved, int new) @@ -481,53 +527,6 @@ static int setup_APIC_mce_threshold(int reserved, int new) return reserved; } -static int setup_APIC_deferred_error(int reserved, int new) -{ - if (reserved < 0 && !setup_APIC_eilvt(new, DEFERRED_ERROR_VECTOR, - APIC_EILVT_MSG_FIX, 0)) - return new; - - return reserved; -} - -static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c) -{ - u32 low = 0, high = 0; - int def_offset = -1, def_new; - - if (rdmsr_safe(MSR_CU_DEF_ERR, &low, &high)) - return; - - def_new = (low & MASK_DEF_LVTOFF) >> 4; - if (!(low & MASK_DEF_LVTOFF)) { - pr_err(FW_BUG "Your BIOS is not setting up LVT offset 0x2 for deferred error IRQs correctly.\n"); - def_new = DEF_LVT_OFF; - low = (low & ~MASK_DEF_LVTOFF) | (DEF_LVT_OFF << 4); - } - - def_offset = setup_APIC_deferred_error(def_offset, def_new); - if ((def_offset == def_new) && - (deferred_error_int_vector != amd_deferred_error_interrupt)) - deferred_error_int_vector = amd_deferred_error_interrupt; - - if (!mce_flags.smca) - low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC; - - wrmsr(MSR_CU_DEF_ERR, low, high); -} - -static u32 smca_get_block_address(unsigned int bank, unsigned int block, - unsigned int cpu) -{ - if (!block) - return MSR_AMD64_SMCA_MCx_MISC(bank); - - if (!(per_cpu(smca_misc_banks_map, cpu) & BIT_ULL(bank))) - return 0; - - return MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); -} - static u32 get_block_address(u32 current_addr, u32 low, u32 high, unsigned int bank, unsigned int block, unsigned int cpu) @@ -537,8 +536,15 @@ static u32 get_block_address(u32 current_addr, u32 low, u32 high, if ((bank >= per_cpu(mce_num_banks, cpu)) || (block >= NR_BLOCKS)) return addr; - if (mce_flags.smca) - return smca_get_block_address(bank, block, cpu); + if (mce_flags.smca) { + if (!block) + return MSR_AMD64_SMCA_MCx_MISC(bank); + + if (!(low & MASK_BLKPTR_LO)) + return 0; + + return MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); + } /* Fall back to method we used for older processors: */ switch (block) { @@ -556,12 +562,10 @@ static u32 get_block_address(u32 current_addr, u32 low, u32 high, return addr; } -static int -prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, - int offset, u32 misc_high) +static int prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, + int offset, u32 misc_high) { unsigned int cpu = smp_processor_id(); - u32 smca_low, smca_high; struct threshold_block b; int new; @@ -578,20 +582,13 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, if (!b.interrupt_capable) goto done; + __set_bit(bank, this_cpu_ptr(&mce_amd_data)->thr_intr_banks); b.interrupt_enable = 1; - if (!mce_flags.smca) { - new = (misc_high & MASK_LVTOFF_HI) >> 20; - goto set_offset; - } - - /* Gather LVT offset for thresholding: */ - if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high)) - goto out; - - new = (smca_low & SMCA_THR_LVT_OFF) >> 12; + if (mce_flags.smca) + goto done; -set_offset: + new = (misc_high & MASK_LVTOFF_HI) >> 20; offset = setup_APIC_mce_threshold(offset, new); if (offset == new) thresholding_irq_en = true; @@ -599,7 +596,6 @@ set_offset: done: mce_threshold_block_init(&b, offset); -out: return offset; } @@ -652,12 +648,12 @@ static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank) return; } - rdmsrl(MSR_K7_HWCR, hwcr); + rdmsrq(MSR_K7_HWCR, hwcr); /* McStatusWrEn has to be set */ need_toggle = !(hwcr & BIT(18)); if (need_toggle) - wrmsrl(MSR_K7_HWCR, hwcr | BIT(18)); + wrmsrq(MSR_K7_HWCR, hwcr | BIT(18)); /* Clear CntP bit safely */ for (i = 0; i < num_msrs; i++) @@ -665,7 +661,55 @@ static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank) /* restore old settings */ if (need_toggle) - wrmsrl(MSR_K7_HWCR, hwcr); + wrmsrq(MSR_K7_HWCR, hwcr); +} + +static void amd_apply_cpu_quirks(struct cpuinfo_x86 *c) +{ + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + + /* This should be disabled by the BIOS, but isn't always */ + if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) { + /* + * disable GART TBL walk error reporting, which + * trips off incorrectly with the IOMMU & 3ware + * & Cerberus: + */ + clear_bit(10, (unsigned long *)&mce_banks[4].ctl); + } + + /* + * Various K7s with broken bank 0 around. Always disable + * by default. + */ + if (c->x86 == 6 && this_cpu_read(mce_num_banks)) + mce_banks[0].ctl = 0; +} + +/* + * Enable the APIC LVT interrupt vectors once per-CPU. This should be done before hardware is + * ready to send interrupts. + * + * Individual error sources are enabled later during per-bank init. + */ +static void smca_enable_interrupt_vectors(void) +{ + struct mce_amd_cpu_data *data = this_cpu_ptr(&mce_amd_data); + u64 mca_intr_cfg, offset; + + if (!mce_flags.smca || !mce_flags.succor) + return; + + if (rdmsrq_safe(MSR_CU_DEF_ERR, &mca_intr_cfg)) + return; + + offset = (mca_intr_cfg & SMCA_THR_LVT_OFF) >> 12; + if (!setup_APIC_eilvt(offset, THRESHOLD_APIC_VECTOR, APIC_EILVT_MSG_FIX, 0)) + data->thr_intr_en = 1; + + offset = (mca_intr_cfg & MASK_DEF_LVTOFF) >> 4; + if (!setup_APIC_eilvt(offset, DEFERRED_ERROR_VECTOR, APIC_EILVT_MSG_FIX, 0)) + data->dfr_intr_en = 1; } /* cpu init entry point, called from mce.c with preempt off */ @@ -675,11 +719,20 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) u32 low = 0, high = 0, address = 0; int offset = -1; + amd_apply_cpu_quirks(c); + + mce_flags.amd_threshold = 1; + + smca_enable_interrupt_vectors(); for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) { - if (mce_flags.smca) + if (mce_flags.smca) { smca_configure(bank, cpu); + if (!this_cpu_ptr(&mce_amd_data)->thr_intr_en) + continue; + } + disable_err_thresholding(c, bank); for (block = 0; block < NR_BLOCKS; ++block) { @@ -700,9 +753,12 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) offset = prepare_threshold_block(bank, block, address, offset, high); } } +} - if (mce_flags.succor) - deferred_error_interrupt_enable(c); +void smca_bsp_init(void) +{ + mce_threshold_vector = amd_threshold_interrupt; + deferred_error_int_vector = amd_deferred_error_interrupt; } /* @@ -739,9 +795,9 @@ bool amd_mce_is_memory_error(struct mce *m) } /* - * AMD systems do not have an explicit indicator that the value in MCA_ADDR is - * a system physical address. Therefore, individual cases need to be detected. - * Future cases and checks will be added as needed. + * Some AMD systems have an explicit indicator that the value in MCA_ADDR is a + * system physical address. Individual cases though, need to be detected for + * other systems. Future cases will be added as needed. * * 1) General case * a) Assume address is not usable. @@ -755,6 +811,8 @@ bool amd_mce_is_memory_error(struct mce *m) * a) Reported in legacy bank 4 with extended error code (XEC) 8. * b) MCA_STATUS[43] is *not* defined as poison in legacy bank 4. Therefore, * this bit should not be checked. + * 4) MCI_STATUS_PADDRVAL is set + * a) Will provide a valid system physical address. * * NOTE: SMCA UMC memory errors fall into case #1. */ @@ -768,6 +826,9 @@ bool amd_mce_usable_address(struct mce *m) return false; } + if (this_cpu_ptr(smca_banks)[m->bank].paddrv) + return m->status & MCI_STATUS_PADDRV; + /* Check poison bit for all other bank types. */ if (m->status & MCI_STATUS_POISON) return true; @@ -776,37 +837,6 @@ bool amd_mce_usable_address(struct mce *m) return false; } -static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) -{ - struct mce_hw_err err; - struct mce *m = &err.m; - - mce_prep_record(&err); - - m->status = status; - m->misc = misc; - m->bank = bank; - m->tsc = rdtsc(); - - if (m->status & MCI_STATUS_ADDRV) { - m->addr = addr; - - smca_extract_err_addr(m); - } - - if (mce_flags.smca) { - rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m->ipid); - - if (m->status & MCI_STATUS_SYNDV) { - rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m->synd); - rdmsrl(MSR_AMD64_SMCA_MCx_SYND1(bank), err.vendor.amd.synd1); - rdmsrl(MSR_AMD64_SMCA_MCx_SYND2(bank), err.vendor.amd.synd2); - } - } - - mce_log(&err); -} - DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error) { trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR); @@ -816,103 +846,20 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error) apic_eoi(); } -/* - * Returns true if the logged error is deferred. False, otherwise. - */ -static inline bool -_log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc) -{ - u64 status, addr = 0; - - rdmsrl(msr_stat, status); - if (!(status & MCI_STATUS_VAL)) - return false; - - if (status & MCI_STATUS_ADDRV) - rdmsrl(msr_addr, addr); - - __log_error(bank, status, addr, misc); - - wrmsrl(msr_stat, 0); - - return status & MCI_STATUS_DEFERRED; -} - -static bool _log_error_deferred(unsigned int bank, u32 misc) -{ - if (!_log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS), - mca_msr_reg(bank, MCA_ADDR), misc)) - return false; - - /* - * Non-SMCA systems don't have MCA_DESTAT/MCA_DEADDR registers. - * Return true here to avoid accessing these registers. - */ - if (!mce_flags.smca) - return true; - - /* Clear MCA_DESTAT if the deferred error was logged from MCA_STATUS. */ - wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0); - return true; -} - -/* - * We have three scenarios for checking for Deferred errors: - * - * 1) Non-SMCA systems check MCA_STATUS and log error if found. - * 2) SMCA systems check MCA_STATUS. If error is found then log it and also - * clear MCA_DESTAT. - * 3) SMCA systems check MCA_DESTAT, if error was not found in MCA_STATUS, and - * log it. - */ -static void log_error_deferred(unsigned int bank) -{ - if (_log_error_deferred(bank, 0)) - return; - - /* - * Only deferred errors are logged in MCA_DE{STAT,ADDR} so just check - * for a valid error. - */ - _log_error_bank(bank, MSR_AMD64_SMCA_MCx_DESTAT(bank), - MSR_AMD64_SMCA_MCx_DEADDR(bank), 0); -} - /* APIC interrupt handler for deferred errors */ static void amd_deferred_error_interrupt(void) { - unsigned int bank; - - for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) - log_error_deferred(bank); + machine_check_poll(MCP_TIMESTAMP, &this_cpu_ptr(&mce_amd_data)->dfr_intr_banks); } -static void log_error_thresholding(unsigned int bank, u64 misc) +void mce_amd_handle_storm(unsigned int bank, bool on) { - _log_error_deferred(bank, misc); + threshold_restart_bank(bank, on); } -static void log_and_reset_block(struct threshold_block *block) +static void amd_reset_thr_limit(unsigned int bank) { - struct thresh_restart tr; - u32 low = 0, high = 0; - - if (!block) - return; - - if (rdmsr_safe(block->address, &low, &high)) - return; - - if (!(high & MASK_OVERFLOW_HI)) - return; - - /* Log the MCE which caused the threshold event. */ - log_error_thresholding(block->bank, ((u64)high << 32) | low); - - /* Reset threshold block after logging error. */ - memset(&tr, 0, sizeof(tr)); - tr.b = block; - threshold_restart_bank(&tr); + threshold_restart_bank(bank, true); } /* @@ -921,34 +868,22 @@ static void log_and_reset_block(struct threshold_block *block) */ static void amd_threshold_interrupt(void) { - struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL; - struct threshold_bank **bp = this_cpu_read(threshold_banks); - unsigned int bank, cpu = smp_processor_id(); + machine_check_poll(MCP_TIMESTAMP, &this_cpu_ptr(&mce_amd_data)->thr_intr_banks); +} - /* - * Validate that the threshold bank has been initialized already. The - * handler is installed at boot time, but on a hotplug event the - * interrupt might fire before the data has been initialized. - */ - if (!bp) - return; +void amd_clear_bank(struct mce *m) +{ + amd_reset_thr_limit(m->bank); - for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) { - if (!(per_cpu(bank_map, cpu) & BIT_ULL(bank))) - continue; + /* Clear MCA_DESTAT for all deferred errors even those logged in MCA_STATUS. */ + if (m->status & MCI_STATUS_DEFERRED) + mce_wrmsrq(MSR_AMD64_SMCA_MCx_DESTAT(m->bank), 0); - first_block = bp[bank]->blocks; - if (!first_block) - continue; + /* Don't clear MCA_STATUS if MCA_DESTAT was used exclusively. */ + if (m->kflags & MCE_CHECK_DFR_REGS) + return; - /* - * The first block is also the head of the list. Check it first - * before iterating over the rest. - */ - log_and_reset_block(first_block); - list_for_each_entry_safe(block, tmp, &first_block->miscj, miscj) - log_and_reset_block(block); - } + mce_wrmsrq(mca_msr_reg(m->bank, MCA_STATUS), 0); } /* @@ -986,7 +921,7 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size) memset(&tr, 0, sizeof(tr)); tr.b = b; - if (smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1)) + if (smp_call_function_single(b->cpu, threshold_restart_block, &tr, 1)) return -ENODEV; return size; @@ -1011,7 +946,7 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size) b->threshold_limit = new; tr.b = b; - if (smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1)) + if (smp_call_function_single(b->cpu, threshold_restart_block, &tr, 1)) return -ENODEV; return size; @@ -1103,13 +1038,20 @@ static const char *get_name(unsigned int cpu, unsigned int bank, struct threshol } bank_type = smca_get_bank_type(cpu, bank); - if (bank_type >= N_SMCA_BANK_TYPES) - return NULL; if (b && (bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2)) { if (b->block < ARRAY_SIZE(smca_umc_block_names)) return smca_umc_block_names[b->block]; - return NULL; + } + + if (b && b->block) { + snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN, "th_block_%u", b->block); + return buf_mcatype; + } + + if (bank_type >= N_SMCA_BANK_TYPES) { + snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN, "th_bank_%u", bank); + return buf_mcatype; } if (per_cpu(smca_bank_counts, cpu)[bank_type] == 1) @@ -1156,7 +1098,7 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb b->address = address; b->interrupt_enable = 0; b->interrupt_capable = lvt_interrupt_supported(bank, high); - b->threshold_limit = THRESHOLD_MAX; + b->threshold_limit = get_thr_limit(); if (b->interrupt_capable) { default_attrs[2] = &interrupt_enable.attr; @@ -1165,13 +1107,9 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb default_attrs[2] = NULL; } - INIT_LIST_HEAD(&b->miscj); + list_add(&b->miscj, &tb->miscj); - /* This is safe as @tb is not visible yet */ - if (tb->blocks) - list_add(&b->miscj, &tb->blocks->miscj); - else - tb->blocks = b; + mce_threshold_block_init(b, (high & MASK_LVTOFF_HI) >> 20); err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(cpu, bank, b)); if (err) @@ -1198,35 +1136,10 @@ out_free: return err; } -static int __threshold_add_blocks(struct threshold_bank *b) -{ - struct list_head *head = &b->blocks->miscj; - struct threshold_block *pos = NULL; - struct threshold_block *tmp = NULL; - int err = 0; - - err = kobject_add(&b->blocks->kobj, b->kobj, b->blocks->kobj.name); - if (err) - return err; - - list_for_each_entry_safe(pos, tmp, head, miscj) { - - err = kobject_add(&pos->kobj, b->kobj, pos->kobj.name); - if (err) { - list_for_each_entry_safe_reverse(pos, tmp, head, miscj) - kobject_del(&pos->kobj); - - return err; - } - } - return err; -} - static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu, unsigned int bank) { struct device *dev = this_cpu_read(mce_device); - struct amd_northbridge *nb = NULL; struct threshold_bank *b = NULL; const char *name = get_name(cpu, bank, NULL); int err = 0; @@ -1234,26 +1147,6 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu, if (!dev) return -ENODEV; - if (is_shared_bank(bank)) { - nb = node_to_amd_nb(topology_amd_node_id(cpu)); - - /* threshold descriptor already initialized on this node? */ - if (nb && nb->bank4) { - /* yes, use it */ - b = nb->bank4; - err = kobject_add(b->kobj, &dev->kobj, name); - if (err) - goto out; - - bp[bank] = b; - refcount_inc(&b->cpus); - - err = __threshold_add_blocks(b); - - goto out; - } - } - b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL); if (!b) { err = -ENOMEM; @@ -1267,16 +1160,7 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu, goto out_free; } - if (is_shared_bank(bank)) { - b->shared = 1; - refcount_set(&b->cpus, 1); - - /* nb is already initialized, see above */ - if (nb) { - WARN_ON(nb->bank4); - nb->bank4 = b; - } - } + INIT_LIST_HEAD(&b->miscj); err = allocate_threshold_blocks(cpu, b, bank, 0, mca_msr_reg(bank, MCA_MISC)); if (err) @@ -1298,55 +1182,15 @@ static void threshold_block_release(struct kobject *kobj) kfree(to_block(kobj)); } -static void deallocate_threshold_blocks(struct threshold_bank *bank) +static void threshold_remove_bank(struct threshold_bank *bank) { struct threshold_block *pos, *tmp; - list_for_each_entry_safe(pos, tmp, &bank->blocks->miscj, miscj) { + list_for_each_entry_safe(pos, tmp, &bank->miscj, miscj) { list_del(&pos->miscj); kobject_put(&pos->kobj); } - kobject_put(&bank->blocks->kobj); -} - -static void __threshold_remove_blocks(struct threshold_bank *b) -{ - struct threshold_block *pos = NULL; - struct threshold_block *tmp = NULL; - - kobject_put(b->kobj); - - list_for_each_entry_safe(pos, tmp, &b->blocks->miscj, miscj) - kobject_put(b->kobj); -} - -static void threshold_remove_bank(struct threshold_bank *bank) -{ - struct amd_northbridge *nb; - - if (!bank->blocks) - goto out_free; - - if (!bank->shared) - goto out_dealloc; - - if (!refcount_dec_and_test(&bank->cpus)) { - __threshold_remove_blocks(bank); - return; - } else { - /* - * The last CPU on this node using the shared bank is going - * away, remove that bank now. - */ - nb = node_to_amd_nb(topology_amd_node_id(smp_processor_id())); - nb->bank4 = NULL; - } - -out_dealloc: - deallocate_threshold_blocks(bank); - -out_free: kobject_put(bank->kobj); kfree(bank); } @@ -1365,12 +1209,12 @@ static void __threshold_remove_device(struct threshold_bank **bp) kfree(bp); } -int mce_threshold_remove_device(unsigned int cpu) +void mce_threshold_remove_device(unsigned int cpu) { struct threshold_bank **bp = this_cpu_read(threshold_banks); if (!bp) - return 0; + return; /* * Clear the pointer before cleaning up, so that the interrupt won't @@ -1379,7 +1223,7 @@ int mce_threshold_remove_device(unsigned int cpu) this_cpu_write(threshold_banks, NULL); __threshold_remove_device(bp); - return 0; + return; } /** @@ -1393,36 +1237,34 @@ int mce_threshold_remove_device(unsigned int cpu) * thread running on @cpu. The callback is invoked on all CPUs which are * online when the callback is installed or during a real hotplug event. */ -int mce_threshold_create_device(unsigned int cpu) +void mce_threshold_create_device(unsigned int cpu) { unsigned int numbanks, bank; struct threshold_bank **bp; - int err; if (!mce_flags.amd_threshold) - return 0; + return; bp = this_cpu_read(threshold_banks); if (bp) - return 0; + return; numbanks = this_cpu_read(mce_num_banks); bp = kcalloc(numbanks, sizeof(*bp), GFP_KERNEL); if (!bp) - return -ENOMEM; + return; for (bank = 0; bank < numbanks; ++bank) { if (!(this_cpu_read(bank_map) & BIT_ULL(bank))) continue; - err = threshold_create_bank(bp, cpu, bank); - if (err) { + if (threshold_create_bank(bp, cpu, bank)) { __threshold_remove_device(bp); - return err; + return; } } this_cpu_write(threshold_banks, bp); if (thresholding_irq_en) mce_threshold_vector = amd_threshold_interrupt; - return 0; + return; } diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 7fb5556a0b53..34440021e8cf 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -45,6 +45,7 @@ #include <linux/task_work.h> #include <linux/hardirq.h> #include <linux/kexec.h> +#include <linux/vmcore_info.h> #include <asm/fred.h> #include <asm/cpu_device_id.h> @@ -121,7 +122,7 @@ void mce_prep_record_common(struct mce *m) { m->cpuid = cpuid_eax(1); m->cpuvendor = boot_cpu_data.x86_vendor; - m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP); + m->mcgcap = native_rdmsrq(MSR_IA32_MCG_CAP); /* need the internal __ version to avoid deadlocks */ m->time = __ktime_get_real_seconds(); } @@ -151,7 +152,7 @@ EXPORT_PER_CPU_SYMBOL_GPL(injectm); void mce_log(struct mce_hw_err *err) { - if (!mce_gen_pool_add(err)) + if (mce_gen_pool_add(err)) irq_work_queue(&mce_irq_work); } EXPORT_SYMBOL_GPL(mce_log); @@ -388,9 +389,9 @@ void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr) } /* MSR access wrappers used for error injection */ -noinstr u64 mce_rdmsrl(u32 msr) +noinstr u64 mce_rdmsrq(u32 msr) { - DECLARE_ARGS(val, low, high); + EAX_EDX_DECLARE_ARGS(val, low, high); if (__this_cpu_read(injectm.finished)) { int offset; @@ -423,7 +424,7 @@ noinstr u64 mce_rdmsrl(u32 msr) return EAX_EDX_VAL(val, low, high); } -static noinstr void mce_wrmsrl(u32 msr, u64 v) +noinstr void mce_wrmsrq(u32 msr, u64 v) { u32 low, high; @@ -444,7 +445,7 @@ static noinstr void mce_wrmsrl(u32 msr, u64 v) low = (u32)v; high = (u32)(v >> 32); - /* See comment in mce_rdmsrl() */ + /* See comment in mce_rdmsrq() */ asm volatile("1: wrmsr\n" "2:\n" _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR_IN_MCE) @@ -468,7 +469,7 @@ static noinstr void mce_gather_info(struct mce_hw_err *err, struct pt_regs *regs instrumentation_end(); m = &err->m; - m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); + m->mcgstatus = mce_rdmsrq(MSR_IA32_MCG_STATUS); if (regs) { /* * Get the address of the instruction at the time of @@ -488,14 +489,14 @@ static noinstr void mce_gather_info(struct mce_hw_err *err, struct pt_regs *regs } /* Use accurate RIP reporting if available. */ if (mca_cfg.rip_msr) - m->ip = mce_rdmsrl(mca_cfg.rip_msr); + m->ip = mce_rdmsrq(mca_cfg.rip_msr); } } -int mce_available(struct cpuinfo_x86 *c) +bool mce_available(struct cpuinfo_x86 *c) { if (mca_cfg.disabled) - return 0; + return false; return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); } @@ -584,6 +585,28 @@ bool mce_is_correctable(struct mce *m) } EXPORT_SYMBOL_GPL(mce_is_correctable); +/* + * Notify the user(s) about new machine check events. + * Can be called from interrupt context, but not from machine check/NMI + * context. + */ +static bool mce_notify_irq(void) +{ + /* Not more than two messages every minute */ + static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); + + if (test_and_clear_bit(0, &mce_need_notify)) { + mce_work_trigger(); + + if (__ratelimit(&ratelimit)) + pr_info(HW_ERR "Machine check events logged\n"); + + return true; + } + + return false; +} + static int mce_early_notifier(struct notifier_block *nb, unsigned long val, void *data) { @@ -662,10 +685,13 @@ static noinstr void mce_read_aux(struct mce_hw_err *err, int i) struct mce *m = &err->m; if (m->status & MCI_STATUS_MISCV) - m->misc = mce_rdmsrl(mca_msr_reg(i, MCA_MISC)); + m->misc = mce_rdmsrq(mca_msr_reg(i, MCA_MISC)); if (m->status & MCI_STATUS_ADDRV) { - m->addr = mce_rdmsrl(mca_msr_reg(i, MCA_ADDR)); + if (m->kflags & MCE_CHECK_DFR_REGS) + m->addr = mce_rdmsrq(MSR_AMD64_SMCA_MCx_DEADDR(i)); + else + m->addr = mce_rdmsrq(mca_msr_reg(i, MCA_ADDR)); /* * Mask the reported address by the reported granularity. @@ -680,12 +706,12 @@ static noinstr void mce_read_aux(struct mce_hw_err *err, int i) } if (mce_flags.smca) { - m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i)); + m->ipid = mce_rdmsrq(MSR_AMD64_SMCA_MCx_IPID(i)); if (m->status & MCI_STATUS_SYNDV) { - m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i)); - err->vendor.amd.synd1 = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND1(i)); - err->vendor.amd.synd2 = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND2(i)); + m->synd = mce_rdmsrq(MSR_AMD64_SMCA_MCx_SYND(i)); + err->vendor.amd.synd1 = mce_rdmsrq(MSR_AMD64_SMCA_MCx_SYND1(i)); + err->vendor.amd.synd2 = mce_rdmsrq(MSR_AMD64_SMCA_MCx_SYND2(i)); } } } @@ -693,6 +719,86 @@ static noinstr void mce_read_aux(struct mce_hw_err *err, int i) DEFINE_PER_CPU(unsigned, mce_poll_count); /* + * We have three scenarios for checking for Deferred errors: + * + * 1) Non-SMCA systems check MCA_STATUS and log error if found. + * 2) SMCA systems check MCA_STATUS. If error is found then log it and also + * clear MCA_DESTAT. + * 3) SMCA systems check MCA_DESTAT, if error was not found in MCA_STATUS, and + * log it. + */ +static bool smca_should_log_poll_error(struct mce *m) +{ + if (m->status & MCI_STATUS_VAL) + return true; + + m->status = mce_rdmsrq(MSR_AMD64_SMCA_MCx_DESTAT(m->bank)); + if ((m->status & MCI_STATUS_VAL) && (m->status & MCI_STATUS_DEFERRED)) { + m->kflags |= MCE_CHECK_DFR_REGS; + return true; + } + + return false; +} + +/* + * Newer Intel systems that support software error + * recovery need to make additional checks. Other + * CPUs should skip over uncorrected errors, but log + * everything else. + */ +static bool ser_should_log_poll_error(struct mce *m) +{ + /* Log "not enabled" (speculative) errors */ + if (!(m->status & MCI_STATUS_EN)) + return true; + + /* + * Log UCNA (SDM: 15.6.3 "UCR Error Classification") + * UC == 1 && PCC == 0 && S == 0 + */ + if (!(m->status & MCI_STATUS_PCC) && !(m->status & MCI_STATUS_S)) + return true; + + return false; +} + +static bool should_log_poll_error(enum mcp_flags flags, struct mce_hw_err *err) +{ + struct mce *m = &err->m; + + if (mce_flags.smca) + return smca_should_log_poll_error(m); + + /* If this entry is not valid, ignore it. */ + if (!(m->status & MCI_STATUS_VAL)) + return false; + + /* + * If we are logging everything (at CPU online) or this + * is a corrected error, then we must log it. + */ + if ((flags & MCP_UC) || !(m->status & MCI_STATUS_UC)) + return true; + + if (mca_cfg.ser) + return ser_should_log_poll_error(m); + + if (m->status & MCI_STATUS_UC) + return false; + + return true; +} + +static void clear_bank(struct mce *m) +{ + if (m->cpuvendor == X86_VENDOR_AMD) + return amd_clear_bank(m); + + mce_wrmsrq(mca_msr_reg(m->bank, MCA_STATUS), 0); +} + +/* * Poll for corrected events or events that happened before reset. * Those are just logged through /dev/mcelog. * @@ -731,7 +837,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) m->bank = i; barrier(); - m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); + m->status = mce_rdmsrq(mca_msr_reg(i, MCA_STATUS)); /* * Update storm tracking here, before checking for the @@ -743,51 +849,10 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) if (!mca_cfg.cmci_disabled) mce_track_storm(m); - /* If this entry is not valid, ignore it */ - if (!(m->status & MCI_STATUS_VAL)) + /* Verify that the error should be logged based on hardware conditions. */ + if (!should_log_poll_error(flags, &err)) continue; - /* - * If we are logging everything (at CPU online) or this - * is a corrected error, then we must log it. - */ - if ((flags & MCP_UC) || !(m->status & MCI_STATUS_UC)) - goto log_it; - - /* - * Newer Intel systems that support software error - * recovery need to make additional checks. Other - * CPUs should skip over uncorrected errors, but log - * everything else. - */ - if (!mca_cfg.ser) { - if (m->status & MCI_STATUS_UC) - continue; - goto log_it; - } - - /* Log "not enabled" (speculative) errors */ - if (!(m->status & MCI_STATUS_EN)) - goto log_it; - - /* - * Log UCNA (SDM: 15.6.3 "UCR Error Classification") - * UC == 1 && PCC == 0 && S == 0 - */ - if (!(m->status & MCI_STATUS_PCC) && !(m->status & MCI_STATUS_S)) - goto log_it; - - /* - * Skip anything else. Presumption is that our read of this - * bank is racing with a machine check. Leave the log alone - * for do_machine_check() to deal with it. - */ - continue; - -log_it: - if (flags & MCP_DONTLOG) - goto clear_it; - mce_read_aux(&err, i); m->severity = mce_severity(m, NULL, NULL, false); /* @@ -804,10 +869,7 @@ log_it: mce_log(&err); clear_it: - /* - * Clear state for this bank. - */ - mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0); + clear_bank(m); } /* @@ -865,8 +927,8 @@ quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs) */ static noinstr bool quirk_skylake_repmov(void) { - u64 mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); - u64 misc_enable = mce_rdmsrl(MSR_IA32_MISC_ENABLE); + u64 mcgstatus = mce_rdmsrq(MSR_IA32_MCG_STATUS); + u64 misc_enable = mce_rdmsrq(MSR_IA32_MISC_ENABLE); u64 mc1_status; /* @@ -877,7 +939,7 @@ static noinstr bool quirk_skylake_repmov(void) !(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) return false; - mc1_status = mce_rdmsrl(MSR_IA32_MCx_STATUS(1)); + mc1_status = mce_rdmsrq(MSR_IA32_MCx_STATUS(1)); /* Check for a software-recoverable data fetch error. */ if ((mc1_status & @@ -888,8 +950,8 @@ static noinstr bool quirk_skylake_repmov(void) MCI_STATUS_ADDRV | MCI_STATUS_MISCV | MCI_STATUS_AR | MCI_STATUS_S)) { misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING; - mce_wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable); - mce_wrmsrl(MSR_IA32_MCx_STATUS(1), 0); + mce_wrmsrq(MSR_IA32_MISC_ENABLE, misc_enable); + mce_wrmsrq(MSR_IA32_MCx_STATUS(1), 0); instrumentation_begin(); pr_err_once("Erratum detected, disable fast string copy instructions.\n"); @@ -933,7 +995,7 @@ static __always_inline int mce_no_way_out(struct mce_hw_err *err, char **msg, un int i; for (i = 0; i < this_cpu_read(mce_num_banks); i++) { - m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); + m->status = mce_rdmsrq(mca_msr_reg(i, MCA_STATUS)); if (!(m->status & MCI_STATUS_VAL)) continue; @@ -1252,7 +1314,7 @@ static __always_inline void mce_clear_state(unsigned long *toclear) for (i = 0; i < this_cpu_read(mce_num_banks); i++) { if (arch_test_bit(i, toclear)) - mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0); + mce_wrmsrq(mca_msr_reg(i, MCA_STATUS), 0); } } @@ -1276,7 +1338,7 @@ static noinstr bool mce_check_crashing_cpu(void) (crashing_cpu != -1 && crashing_cpu != cpu)) { u64 mcgstatus; - mcgstatus = __rdmsr(MSR_IA32_MCG_STATUS); + mcgstatus = native_rdmsrq(MSR_IA32_MCG_STATUS); if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) { if (mcgstatus & MCG_STATUS_LMCES) @@ -1284,7 +1346,7 @@ static noinstr bool mce_check_crashing_cpu(void) } if (mcgstatus & MCG_STATUS_RIPV) { - __wrmsr(MSR_IA32_MCG_STATUS, 0, 0); + native_wrmsrq(MSR_IA32_MCG_STATUS, 0); return true; } } @@ -1313,7 +1375,7 @@ __mc_scan_banks(struct mce_hw_err *err, struct pt_regs *regs, m->addr = 0; m->bank = i; - m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); + m->status = mce_rdmsrq(mca_msr_reg(i, MCA_STATUS)); if (!(m->status & MCI_STATUS_VAL)) continue; @@ -1668,10 +1730,13 @@ noinstr void do_machine_check(struct pt_regs *regs) } out: + /* Given it didn't panic, mark it as recoverable */ + hwerr_log_error_type(HWERR_RECOV_OTHERS); + instrumentation_end(); clear: - mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); + mce_wrmsrq(MSR_IA32_MCG_STATUS, 0); } EXPORT_SYMBOL_GPL(do_machine_check); @@ -1718,6 +1783,11 @@ static void mc_poll_banks_default(void) void (*mc_poll_banks)(void) = mc_poll_banks_default; +static bool should_enable_timer(unsigned long iv) +{ + return !mca_cfg.ignore_ce && iv; +} + static void mce_timer_fn(struct timer_list *t) { struct timer_list *cpu_t = this_cpu_ptr(&mce_timer); @@ -1741,7 +1811,7 @@ static void mce_timer_fn(struct timer_list *t) if (mce_get_storm_mode()) { __start_timer(t, HZ); - } else { + } else if (should_enable_timer(iv)) { __this_cpu_write(mce_next_interval, iv); __start_timer(t, iv); } @@ -1764,36 +1834,14 @@ void mce_timer_kick(bool storm) __this_cpu_write(mce_next_interval, check_interval * HZ); } -/* Must not be called in IRQ context where del_timer_sync() can deadlock */ +/* Must not be called in IRQ context where timer_delete_sync() can deadlock */ static void mce_timer_delete_all(void) { int cpu; for_each_online_cpu(cpu) - del_timer_sync(&per_cpu(mce_timer, cpu)); -} - -/* - * Notify the user(s) about new machine check events. - * Can be called from interrupt context, but not from machine check/NMI - * context. - */ -int mce_notify_irq(void) -{ - /* Not more than two messages every minute */ - static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); - - if (test_and_clear_bit(0, &mce_need_notify)) { - mce_work_trigger(); - - if (__ratelimit(&ratelimit)) - pr_info(HW_ERR "Machine check events logged\n"); - - return 1; - } - return 0; + timer_delete_sync(&per_cpu(mce_timer, cpu)); } -EXPORT_SYMBOL_GPL(mce_notify_irq); static void __mcheck_cpu_mce_banks_init(void) { @@ -1805,9 +1853,10 @@ static void __mcheck_cpu_mce_banks_init(void) struct mce_bank *b = &mce_banks[i]; /* - * Init them all, __mcheck_cpu_apply_quirks() is going to apply - * the required vendor quirks before - * __mcheck_cpu_init_clear_banks() does the final bank setup. + * Init them all by default. + * + * The required vendor quirks will be applied before + * __mcheck_cpu_init_prepare_banks() does the final bank setup. */ b->ctl = -1ULL; b->init = true; @@ -1822,7 +1871,7 @@ static void __mcheck_cpu_cap_init(void) u64 cap; u8 b; - rdmsrl(MSR_IA32_MCG_CAP, cap); + rdmsrq(MSR_IA32_MCG_CAP, cap); b = cap & MCG_BANKCNT_MASK; @@ -1835,69 +1884,34 @@ static void __mcheck_cpu_cap_init(void) this_cpu_write(mce_num_banks, b); __mcheck_cpu_mce_banks_init(); - - /* Use accurate RIP reporting if available. */ - if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) - mca_cfg.rip_msr = MSR_IA32_MCG_EIP; - - if (cap & MCG_SER_P) - mca_cfg.ser = 1; } static void __mcheck_cpu_init_generic(void) { - enum mcp_flags m_fl = 0; - mce_banks_t all_banks; u64 cap; - if (!mca_cfg.bootlog) - m_fl = MCP_DONTLOG; - - /* - * Log the machine checks left over from the previous reset. Log them - * only, do not start processing them. That will happen in mcheck_late_init() - * when all consumers have been registered on the notifier chain. - */ - bitmap_fill(all_banks, MAX_NR_BANKS); - machine_check_poll(MCP_UC | MCP_QUEUE_LOG | m_fl, &all_banks); - - cr4_set_bits(X86_CR4_MCE); - - rdmsrl(MSR_IA32_MCG_CAP, cap); + rdmsrq(MSR_IA32_MCG_CAP, cap); if (cap & MCG_CTL_P) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); } -static void __mcheck_cpu_init_clear_banks(void) +static void __mcheck_cpu_init_prepare_banks(void) { struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + u64 msrval; int i; - for (i = 0; i < this_cpu_read(mce_num_banks); i++) { - struct mce_bank *b = &mce_banks[i]; + /* + * Log the machine checks left over from the previous reset. Log them + * only, do not start processing them. That will happen in mcheck_late_init() + * when all consumers have been registered on the notifier chain. + */ + if (mca_cfg.bootlog) { + mce_banks_t all_banks; - if (!b->init) - continue; - wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl); - wrmsrl(mca_msr_reg(i, MCA_STATUS), 0); + bitmap_fill(all_banks, MAX_NR_BANKS); + machine_check_poll(MCP_UC | MCP_QUEUE_LOG, &all_banks); } -} - -/* - * Do a final check to see if there are any unused/RAZ banks. - * - * This must be done after the banks have been initialized and any quirks have - * been applied. - * - * Do not call this from any user-initiated flows, e.g. CPU hotplug or sysfs. - * Otherwise, a user who disables a bank will not be able to re-enable it - * without a system reboot. - */ -static void __mcheck_cpu_check_banks(void) -{ - struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); - u64 msrval; - int i; for (i = 0; i < this_cpu_read(mce_num_banks); i++) { struct mce_bank *b = &mce_banks[i]; @@ -1905,148 +1919,97 @@ static void __mcheck_cpu_check_banks(void) if (!b->init) continue; - rdmsrl(mca_msr_reg(i, MCA_CTL), msrval); + wrmsrq(mca_msr_reg(i, MCA_CTL), b->ctl); + wrmsrq(mca_msr_reg(i, MCA_STATUS), 0); + + rdmsrq(mca_msr_reg(i, MCA_CTL), msrval); b->init = !!msrval; } } -/* Add per CPU specific workarounds here */ -static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) +static void amd_apply_global_quirks(struct cpuinfo_x86 *c) { - struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); - struct mca_config *cfg = &mca_cfg; - - if (c->x86_vendor == X86_VENDOR_UNKNOWN) { - pr_info("unknown CPU type - not enabling MCE support\n"); - return -EOPNOTSUPP; - } - - /* This should be disabled by the BIOS, but isn't always */ - if (c->x86_vendor == X86_VENDOR_AMD) { - if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) { - /* - * disable GART TBL walk error reporting, which - * trips off incorrectly with the IOMMU & 3ware - * & Cerberus: - */ - clear_bit(10, (unsigned long *)&mce_banks[4].ctl); - } - if (c->x86 < 0x11 && cfg->bootlog < 0) { - /* - * Lots of broken BIOS around that don't clear them - * by default and leave crap in there. Don't log: - */ - cfg->bootlog = 0; - } - /* - * Various K7s with broken bank 0 around. Always disable - * by default. - */ - if (c->x86 == 6 && this_cpu_read(mce_num_banks) > 0) - mce_banks[0].ctl = 0; - + if (c->x86 < 0x11 && mca_cfg.bootlog < 0) { /* - * overflow_recov is supported for F15h Models 00h-0fh - * even though we don't have a CPUID bit for it. + * Lots of broken BIOS around that don't clear them + * by default and leave crap in there. Don't log: */ - if (c->x86 == 0x15 && c->x86_model <= 0xf) - mce_flags.overflow_recov = 1; - - if (c->x86 >= 0x17 && c->x86 <= 0x1A) - mce_flags.zen_ifu_quirk = 1; - + mca_cfg.bootlog = 0; } - if (c->x86_vendor == X86_VENDOR_INTEL) { - /* - * SDM documents that on family 6 bank 0 should not be written - * because it aliases to another special BIOS controlled - * register. - * But it's not aliased anymore on model 0x1a+ - * Don't ignore bank 0 completely because there could be a - * valid event later, merely don't write CTL0. - */ - - if (c->x86 == 6 && c->x86_model < 0x1A && this_cpu_read(mce_num_banks) > 0) - mce_banks[0].init = false; + /* + * overflow_recov is supported for F15h Models 00h-0fh + * even though we don't have a CPUID bit for it. + */ + if (c->x86 == 0x15 && c->x86_model <= 0xf) + mce_flags.overflow_recov = 1; - /* - * All newer Intel systems support MCE broadcasting. Enable - * synchronization with a one second timeout. - */ - if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && - cfg->monarch_timeout < 0) - cfg->monarch_timeout = USEC_PER_SEC; + if (c->x86 >= 0x17 && c->x86 <= 0x1A) + mce_flags.zen_ifu_quirk = 1; +} - /* - * There are also broken BIOSes on some Pentium M and - * earlier systems: - */ - if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0) - cfg->bootlog = 0; +static void intel_apply_global_quirks(struct cpuinfo_x86 *c) +{ + /* Older CPUs (prior to family 6) don't need quirks. */ + if (c->x86_vfm < INTEL_PENTIUM_PRO) + return; - if (c->x86_vfm == INTEL_SANDYBRIDGE_X) - mce_flags.snb_ifu_quirk = 1; + /* + * All newer Intel systems support MCE broadcasting. Enable + * synchronization with a one second timeout. + */ + if (c->x86_vfm >= INTEL_CORE_YONAH && mca_cfg.monarch_timeout < 0) + mca_cfg.monarch_timeout = USEC_PER_SEC; - /* - * Skylake, Cascacde Lake and Cooper Lake require a quirk on - * rep movs. - */ - if (c->x86_vfm == INTEL_SKYLAKE_X) - mce_flags.skx_repmov_quirk = 1; - } + /* + * There are also broken BIOSes on some Pentium M and + * earlier systems: + */ + if (c->x86_vfm < INTEL_CORE_YONAH && mca_cfg.bootlog < 0) + mca_cfg.bootlog = 0; - if (c->x86_vendor == X86_VENDOR_ZHAOXIN) { - /* - * All newer Zhaoxin CPUs support MCE broadcasting. Enable - * synchronization with a one second timeout. - */ - if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) { - if (cfg->monarch_timeout < 0) - cfg->monarch_timeout = USEC_PER_SEC; - } - } + if (c->x86_vfm == INTEL_SANDYBRIDGE_X) + mce_flags.snb_ifu_quirk = 1; - if (cfg->monarch_timeout < 0) - cfg->monarch_timeout = 0; - if (cfg->bootlog != 0) - cfg->panic_timeout = 30; + /* + * Skylake, Cascacde Lake and Cooper Lake require a quirk on + * rep movs. + */ + if (c->x86_vfm == INTEL_SKYLAKE_X) + mce_flags.skx_repmov_quirk = 1; +} - return 0; +static void zhaoxin_apply_global_quirks(struct cpuinfo_x86 *c) +{ + /* + * All newer Zhaoxin CPUs support MCE broadcasting. Enable + * synchronization with a one second timeout. + */ + if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) { + if (mca_cfg.monarch_timeout < 0) + mca_cfg.monarch_timeout = USEC_PER_SEC; + } } -static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) +static bool __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) { if (c->x86 != 5) - return 0; + return false; switch (c->x86_vendor) { case X86_VENDOR_INTEL: intel_p5_mcheck_init(c); mce_flags.p5 = 1; - return 1; + return true; case X86_VENDOR_CENTAUR: winchip_mcheck_init(c); mce_flags.winchip = 1; - return 1; + return true; default: - return 0; + return false; } - return 0; -} - -/* - * Init basic CPU features needed for early decoding of MCEs. - */ -static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c) -{ - if (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) { - mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV); - mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR); - mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA); - mce_flags.amd_threshold = 1; - } + return false; } static void mce_centaur_feature_init(struct cpuinfo_x86 *c) @@ -2099,13 +2062,9 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) mce_intel_feature_init(c); break; - case X86_VENDOR_AMD: { - mce_amd_feature_init(c); - break; - } - + case X86_VENDOR_AMD: case X86_VENDOR_HYGON: - mce_hygon_feature_init(c); + mce_amd_feature_init(c); break; case X86_VENDOR_CENTAUR: @@ -2141,11 +2100,10 @@ static void mce_start_timer(struct timer_list *t) { unsigned long iv = check_interval * HZ; - if (mca_cfg.ignore_ce || !iv) - return; - - this_cpu_write(mce_next_interval, iv); - __start_timer(t, iv); + if (should_enable_timer(iv)) { + this_cpu_write(mce_next_interval, iv); + __start_timer(t, iv); + } } static void __mcheck_cpu_setup_timer(void) @@ -2262,6 +2220,53 @@ DEFINE_IDTENTRY_RAW(exc_machine_check) } #endif +void mca_bsp_init(struct cpuinfo_x86 *c) +{ + u64 cap; + + if (!mce_available(c)) + return; + + if (c->x86_vendor == X86_VENDOR_UNKNOWN) { + mca_cfg.disabled = 1; + pr_info("unknown CPU type - not enabling MCE support\n"); + return; + } + + mce_flags.overflow_recov = cpu_feature_enabled(X86_FEATURE_OVERFLOW_RECOV); + mce_flags.succor = cpu_feature_enabled(X86_FEATURE_SUCCOR); + mce_flags.smca = cpu_feature_enabled(X86_FEATURE_SMCA); + + if (mce_flags.smca) + smca_bsp_init(); + + rdmsrq(MSR_IA32_MCG_CAP, cap); + + /* Use accurate RIP reporting if available. */ + if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) + mca_cfg.rip_msr = MSR_IA32_MCG_EIP; + + if (cap & MCG_SER_P) + mca_cfg.ser = 1; + + switch (c->x86_vendor) { + case X86_VENDOR_AMD: + amd_apply_global_quirks(c); + break; + case X86_VENDOR_INTEL: + intel_apply_global_quirks(c); + break; + case X86_VENDOR_ZHAOXIN: + zhaoxin_apply_global_quirks(c); + break; + } + + if (mca_cfg.monarch_timeout < 0) + mca_cfg.monarch_timeout = 0; + if (mca_cfg.bootlog != 0) + mca_cfg.panic_timeout = 30; +} + /* * Called for each booted CPU to set up machine checks. * Must be called with preempt off: @@ -2279,12 +2284,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) __mcheck_cpu_cap_init(); - if (__mcheck_cpu_apply_quirks(c) < 0) { - mca_cfg.disabled = 1; - return; - } - - if (mce_gen_pool_init()) { + if (!mce_gen_pool_init()) { mca_cfg.disabled = 1; pr_emerg("Couldn't allocate MCE records pool!\n"); return; @@ -2292,12 +2292,11 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) mca_cfg.initialized = 1; - __mcheck_cpu_init_early(c); __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(c); - __mcheck_cpu_init_clear_banks(); - __mcheck_cpu_check_banks(); + __mcheck_cpu_init_prepare_banks(); __mcheck_cpu_setup_timer(); + cr4_set_bits(X86_CR4_MCE); } /* @@ -2421,7 +2420,7 @@ static void mce_disable_error_reporting(void) struct mce_bank *b = &mce_banks[i]; if (b->init) - wrmsrl(mca_msr_reg(i, MCA_CTL), 0); + wrmsrq(mca_msr_reg(i, MCA_CTL), 0); } return; } @@ -2444,13 +2443,13 @@ static void vendor_disable_error_reporting(void) mce_disable_error_reporting(); } -static int mce_syscore_suspend(void) +static int mce_syscore_suspend(void *data) { vendor_disable_error_reporting(); return 0; } -static void mce_syscore_shutdown(void) +static void mce_syscore_shutdown(void *data) { vendor_disable_error_reporting(); } @@ -2460,19 +2459,24 @@ static void mce_syscore_shutdown(void) * Only one CPU is active at this time, the others get re-added later using * CPU hotplug: */ -static void mce_syscore_resume(void) +static void mce_syscore_resume(void *data) { __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info)); - __mcheck_cpu_init_clear_banks(); + __mcheck_cpu_init_prepare_banks(); + cr4_set_bits(X86_CR4_MCE); } -static struct syscore_ops mce_syscore_ops = { +static const struct syscore_ops mce_syscore_ops = { .suspend = mce_syscore_suspend, .shutdown = mce_syscore_shutdown, .resume = mce_syscore_resume, }; +static struct syscore mce_syscore = { + .ops = &mce_syscore_ops, +}; + /* * mce_device: Sysfs support */ @@ -2482,8 +2486,9 @@ static void mce_cpu_restart(void *data) if (!mce_available(raw_cpu_ptr(&cpu_info))) return; __mcheck_cpu_init_generic(); - __mcheck_cpu_init_clear_banks(); + __mcheck_cpu_init_prepare_banks(); __mcheck_cpu_init_timer(); + cr4_set_bits(X86_CR4_MCE); } /* Reinit MCEs after user configuration changes */ @@ -2771,7 +2776,7 @@ static void mce_reenable_cpu(void) struct mce_bank *b = &mce_banks[i]; if (b->init) - wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl); + wrmsrq(mca_msr_reg(i, MCA_CTL), b->ctl); } } @@ -2786,15 +2791,9 @@ static int mce_cpu_dead(unsigned int cpu) static int mce_cpu_online(unsigned int cpu) { struct timer_list *t = this_cpu_ptr(&mce_timer); - int ret; mce_device_create(cpu); - - ret = mce_threshold_create_device(cpu); - if (ret) { - mce_device_remove(cpu); - return ret; - } + mce_threshold_create_device(cpu); mce_reenable_cpu(); mce_start_timer(t); return 0; @@ -2805,7 +2804,7 @@ static int mce_cpu_pre_down(unsigned int cpu) struct timer_list *t = this_cpu_ptr(&mce_timer); mce_disable_cpu(); - del_timer_sync(t); + timer_delete_sync(t); mce_threshold_remove_device(cpu); mce_device_remove(cpu); return 0; @@ -2878,7 +2877,7 @@ static __init int mcheck_init_device(void) if (err < 0) goto err_out_online; - register_syscore_ops(&mce_syscore_ops); + register_syscore(&mce_syscore); return 0; diff --git a/arch/x86/kernel/cpu/mce/genpool.c b/arch/x86/kernel/cpu/mce/genpool.c index d0be6dda0c14..3ca9c007a666 100644 --- a/arch/x86/kernel/cpu/mce/genpool.c +++ b/arch/x86/kernel/cpu/mce/genpool.c @@ -94,64 +94,63 @@ bool mce_gen_pool_empty(void) return llist_empty(&mce_event_llist); } -int mce_gen_pool_add(struct mce_hw_err *err) +bool mce_gen_pool_add(struct mce_hw_err *err) { struct mce_evt_llist *node; if (filter_mce(&err->m)) - return -EINVAL; + return false; if (!mce_evt_pool) - return -EINVAL; + return false; node = (void *)gen_pool_alloc(mce_evt_pool, sizeof(*node)); if (!node) { pr_warn_ratelimited("MCE records pool full!\n"); - return -ENOMEM; + return false; } memcpy(&node->err, err, sizeof(*err)); llist_add(&node->llnode, &mce_event_llist); - return 0; + return true; } -static int mce_gen_pool_create(void) +static bool mce_gen_pool_create(void) { int mce_numrecords, mce_poolsz, order; struct gen_pool *gpool; - int ret = -ENOMEM; void *mce_pool; order = order_base_2(sizeof(struct mce_evt_llist)); gpool = gen_pool_create(order, -1); if (!gpool) - return ret; + return false; mce_numrecords = max(MCE_MIN_ENTRIES, num_possible_cpus() * MCE_PER_CPU); mce_poolsz = mce_numrecords * (1 << order); mce_pool = kmalloc(mce_poolsz, GFP_KERNEL); if (!mce_pool) { gen_pool_destroy(gpool); - return ret; + return false; } - ret = gen_pool_add(gpool, (unsigned long)mce_pool, mce_poolsz, -1); - if (ret) { + + if (gen_pool_add(gpool, (unsigned long)mce_pool, mce_poolsz, -1)) { gen_pool_destroy(gpool); kfree(mce_pool); - return ret; + return false; } mce_evt_pool = gpool; - return ret; + return true; } -int mce_gen_pool_init(void) +bool mce_gen_pool_init(void) { /* Just init mce_gen_pool once. */ if (mce_evt_pool) - return 0; + return true; return mce_gen_pool_create(); } diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 313fe682db33..d02c4f556cd0 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -24,10 +24,11 @@ #include <linux/pci.h> #include <linux/uaccess.h> -#include <asm/amd_nb.h> +#include <asm/amd/nb.h> #include <asm/apic.h> #include <asm/irq_vectors.h> #include <asm/mce.h> +#include <asm/msr.h> #include <asm/nmi.h> #include <asm/smp.h> @@ -229,7 +230,6 @@ static int raise_local(void) } else if (m->status) { pr_info("Starting machine check poll CPU %d\n", cpu); raise_poll(m); - mce_notify_irq(); pr_info("Machine check poll done on CPU %d\n", cpu); } else m->finished = 0; @@ -476,27 +476,27 @@ static void prepare_msrs(void *info) struct mce m = *(struct mce *)info; u8 b = m.bank; - wrmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); + wrmsrq(MSR_IA32_MCG_STATUS, m.mcgstatus); if (boot_cpu_has(X86_FEATURE_SMCA)) { if (m.inject_flags == DFR_INT_INJ) { - wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), m.status); - wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), m.addr); + wrmsrq(MSR_AMD64_SMCA_MCx_DESTAT(b), m.status); + wrmsrq(MSR_AMD64_SMCA_MCx_DEADDR(b), m.addr); } else { - wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), m.status); - wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr); + wrmsrq(MSR_AMD64_SMCA_MCx_STATUS(b), m.status); + wrmsrq(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr); } - wrmsrl(MSR_AMD64_SMCA_MCx_SYND(b), m.synd); + wrmsrq(MSR_AMD64_SMCA_MCx_SYND(b), m.synd); if (m.misc) - wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), m.misc); + wrmsrq(MSR_AMD64_SMCA_MCx_MISC(b), m.misc); } else { - wrmsrl(MSR_IA32_MCx_STATUS(b), m.status); - wrmsrl(MSR_IA32_MCx_ADDR(b), m.addr); + wrmsrq(MSR_IA32_MCx_STATUS(b), m.status); + wrmsrq(MSR_IA32_MCx_ADDR(b), m.addr); if (m.misc) - wrmsrl(MSR_IA32_MCx_MISC(b), m.misc); + wrmsrq(MSR_IA32_MCx_MISC(b), m.misc); } } @@ -590,7 +590,7 @@ static int inj_bank_set(void *data, u64 val) u64 cap; /* Get bank count on target CPU so we can handle non-uniform values. */ - rdmsrl_on_cpu(m->extcpu, MSR_IA32_MCG_CAP, &cap); + rdmsrq_on_cpu(m->extcpu, MSR_IA32_MCG_CAP, &cap); n_banks = cap & MCG_BANKCNT_MASK; if (val >= n_banks) { @@ -614,7 +614,7 @@ static int inj_bank_set(void *data, u64 val) if (cpu_feature_enabled(X86_FEATURE_SMCA)) { u64 ipid; - if (rdmsrl_on_cpu(m->extcpu, MSR_AMD64_SMCA_MCx_IPID(val), &ipid)) { + if (rdmsrq_on_cpu(m->extcpu, MSR_AMD64_SMCA_MCx_IPID(val), &ipid)) { pr_err("Error reading IPID on CPU%d\n", m->extcpu); return -EINVAL; } @@ -742,15 +742,15 @@ static void check_hw_inj_possible(void) u64 status = MCI_STATUS_VAL, ipid; /* Check whether bank is populated */ - rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), ipid); + rdmsrq(MSR_AMD64_SMCA_MCx_IPID(bank), ipid); if (!ipid) continue; toggle_hw_mce_inject(cpu, true); - wrmsrl_safe(mca_msr_reg(bank, MCA_STATUS), status); - rdmsrl_safe(mca_msr_reg(bank, MCA_STATUS), &status); - wrmsrl_safe(mca_msr_reg(bank, MCA_STATUS), 0); + wrmsrq_safe(mca_msr_reg(bank, MCA_STATUS), status); + rdmsrq_safe(mca_msr_reg(bank, MCA_STATUS), &status); + wrmsrq_safe(mca_msr_reg(bank, MCA_STATUS), 0); if (!status) { hw_injection_possible = false; diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index b3cd2c61b11d..4655223ba560 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -75,12 +75,12 @@ static u16 cmci_threshold[MAX_NR_BANKS]; */ #define CMCI_STORM_THRESHOLD 32749 -static int cmci_supported(int *banks) +static bool cmci_supported(int *banks) { u64 cap; if (mca_cfg.cmci_disabled || mca_cfg.ignore_ce) - return 0; + return false; /* * Vendor check is not strictly needed, but the initial @@ -89,11 +89,12 @@ static int cmci_supported(int *banks) */ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL && boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN) - return 0; + return false; if (!boot_cpu_has(X86_FEATURE_APIC) || lapic_get_maxlvt() < 6) - return 0; - rdmsrl(MSR_IA32_MCG_CAP, cap); + return false; + + rdmsrq(MSR_IA32_MCG_CAP, cap); *banks = min_t(unsigned, MAX_NR_BANKS, cap & MCG_BANKCNT_MASK); return !!(cap & MCG_CMCI_P); } @@ -105,7 +106,7 @@ static bool lmce_supported(void) if (mca_cfg.lmce_disabled) return false; - rdmsrl(MSR_IA32_MCG_CAP, tmp); + rdmsrq(MSR_IA32_MCG_CAP, tmp); /* * LMCE depends on recovery support in the processor. Hence both @@ -122,7 +123,7 @@ static bool lmce_supported(void) * WARN if the MSR isn't locked as init_ia32_feat_ctl() unconditionally * locks the MSR in the event that it wasn't already locked by BIOS. */ - rdmsrl(MSR_IA32_FEAT_CTL, tmp); + rdmsrq(MSR_IA32_FEAT_CTL, tmp); if (WARN_ON_ONCE(!(tmp & FEAT_CTL_LOCKED))) return false; @@ -140,9 +141,9 @@ static void cmci_set_threshold(int bank, int thresh) u64 val; raw_spin_lock_irqsave(&cmci_discover_lock, flags); - rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + rdmsrq(MSR_IA32_MCx_CTL2(bank), val); val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; - wrmsrl(MSR_IA32_MCx_CTL2(bank), val | thresh); + wrmsrq(MSR_IA32_MCx_CTL2(bank), val | thresh); raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); } @@ -183,7 +184,7 @@ static bool cmci_skip_bank(int bank, u64 *val) if (test_bit(bank, mce_banks_ce_disabled)) return true; - rdmsrl(MSR_IA32_MCx_CTL2(bank), *val); + rdmsrq(MSR_IA32_MCx_CTL2(bank), *val); /* Already owned by someone else? */ if (*val & MCI_CTL2_CMCI_EN) { @@ -231,8 +232,8 @@ static void cmci_claim_bank(int bank, u64 val, int bios_zero_thresh, int *bios_w struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc); val |= MCI_CTL2_CMCI_EN; - wrmsrl(MSR_IA32_MCx_CTL2(bank), val); - rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + wrmsrq(MSR_IA32_MCx_CTL2(bank), val); + rdmsrq(MSR_IA32_MCx_CTL2(bank), val); /* If the enable bit did not stick, this bank should be polled. */ if (!(val & MCI_CTL2_CMCI_EN)) { @@ -323,9 +324,9 @@ static void __cmci_disable_bank(int bank) if (!test_bit(bank, this_cpu_ptr(mce_banks_owned))) return; - rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + rdmsrq(MSR_IA32_MCx_CTL2(bank), val); val &= ~MCI_CTL2_CMCI_EN; - wrmsrl(MSR_IA32_MCx_CTL2(bank), val); + wrmsrq(MSR_IA32_MCx_CTL2(bank), val); __clear_bit(bank, this_cpu_ptr(mce_banks_owned)); if ((val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD) @@ -429,10 +430,10 @@ void intel_init_lmce(void) if (!lmce_supported()) return; - rdmsrl(MSR_IA32_MCG_EXT_CTL, val); + rdmsrq(MSR_IA32_MCG_EXT_CTL, val); if (!(val & MCG_EXT_CTL_LMCE_EN)) - wrmsrl(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN); + wrmsrq(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN); } void intel_clear_lmce(void) @@ -442,9 +443,9 @@ void intel_clear_lmce(void) if (!lmce_supported()) return; - rdmsrl(MSR_IA32_MCG_EXT_CTL, val); + rdmsrq(MSR_IA32_MCG_EXT_CTL, val); val &= ~MCG_EXT_CTL_LMCE_EN; - wrmsrl(MSR_IA32_MCG_EXT_CTL, val); + wrmsrq(MSR_IA32_MCG_EXT_CTL, val); } /* @@ -459,16 +460,34 @@ static void intel_imc_init(struct cpuinfo_x86 *c) case INTEL_SANDYBRIDGE_X: case INTEL_IVYBRIDGE_X: case INTEL_HASWELL_X: - if (rdmsrl_safe(MSR_ERROR_CONTROL, &error_control)) + if (rdmsrq_safe(MSR_ERROR_CONTROL, &error_control)) return; error_control |= 2; - wrmsrl_safe(MSR_ERROR_CONTROL, error_control); + wrmsrq_safe(MSR_ERROR_CONTROL, error_control); break; } } +static void intel_apply_cpu_quirks(struct cpuinfo_x86 *c) +{ + /* + * SDM documents that on family 6 bank 0 should not be written + * because it aliases to another special BIOS controlled + * register. + * But it's not aliased anymore on model 0x1a+ + * Don't ignore bank 0 completely because there could be a + * valid event later, merely don't write CTL0. + * + * Older CPUs (prior to family 6) can't reach this point and already + * return early due to the check of __mcheck_cpu_ancient_init(). + */ + if (c->x86_vfm < INTEL_NEHALEM_EP && this_cpu_read(mce_num_banks)) + this_cpu_ptr(mce_banks_array)[0].init = false; +} + void mce_intel_feature_init(struct cpuinfo_x86 *c) { + intel_apply_cpu_quirks(c); intel_init_cmci(); intel_init_lmce(); intel_imc_init(c); @@ -477,6 +496,7 @@ void mce_intel_feature_init(struct cpuinfo_x86 *c) void mce_intel_feature_clear(struct cpuinfo_x86 *c) { intel_clear_lmce(); + cmci_clear(); } bool intel_filter_mce(struct mce *m) diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 84f810598231..a31cf984619c 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -31,8 +31,8 @@ struct mce_evt_llist { void mce_gen_pool_process(struct work_struct *__unused); bool mce_gen_pool_empty(void); -int mce_gen_pool_add(struct mce_hw_err *err); -int mce_gen_pool_init(void); +bool mce_gen_pool_add(struct mce_hw_err *err); +bool mce_gen_pool_init(void); struct llist_node *mce_gen_pool_prepare_records(void); int mce_severity(struct mce *a, struct pt_regs *regs, char **msg, bool is_excp); @@ -67,6 +67,7 @@ void mce_track_storm(struct mce *mce); void mce_inherit_storm(unsigned int bank); bool mce_get_storm_mode(void); void mce_set_storm_mode(bool storm); +u32 mce_get_apei_thr_limit(void); #else static inline void cmci_storm_begin(unsigned int bank) {} static inline void cmci_storm_end(unsigned int bank) {} @@ -74,6 +75,7 @@ static inline void mce_track_storm(struct mce *mce) {} static inline void mce_inherit_storm(unsigned int bank) {} static inline bool mce_get_storm_mode(void) { return false; } static inline void mce_set_storm_mode(bool storm) {} +static inline u32 mce_get_apei_thr_limit(void) { return 0; } #endif /* @@ -265,8 +267,12 @@ void mce_prep_record_common(struct mce *m); void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m); #ifdef CONFIG_X86_MCE_AMD +void mce_threshold_create_device(unsigned int cpu); +void mce_threshold_remove_device(unsigned int cpu); +void mce_amd_handle_storm(unsigned int bank, bool on); extern bool amd_filter_mce(struct mce *m); bool amd_mce_usable_address(struct mce *m); +void amd_clear_bank(struct mce *m); /* * If MCA_CONFIG[McaLsbInStatusSupported] is set, extract ErrAddr in bits @@ -292,10 +298,16 @@ static __always_inline void smca_extract_err_addr(struct mce *m) m->addr &= GENMASK_ULL(55, lsb); } +void smca_bsp_init(void); #else +static inline void mce_threshold_create_device(unsigned int cpu) { } +static inline void mce_threshold_remove_device(unsigned int cpu) { } +static inline void mce_amd_handle_storm(unsigned int bank, bool on) { } static inline bool amd_filter_mce(struct mce *m) { return false; } static inline bool amd_mce_usable_address(struct mce *m) { return false; } +static inline void amd_clear_bank(struct mce *m) { } static inline void smca_extract_err_addr(struct mce *m) { } +static inline void smca_bsp_init(void) { } #endif #ifdef CONFIG_X86_ANCIENT_MCE @@ -312,7 +324,8 @@ static __always_inline void pentium_machine_check(struct pt_regs *regs) {} static __always_inline void winchip_machine_check(struct pt_regs *regs) {} #endif -noinstr u64 mce_rdmsrl(u32 msr); +noinstr u64 mce_rdmsrq(u32 msr); +noinstr void mce_wrmsrq(u32 msr, u64 v); static __always_inline u32 mca_msr_reg(int bank, enum mca_msr reg) { diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index dac4d64dfb2a..2235a7477436 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -300,13 +300,12 @@ static noinstr int error_context(struct mce *m, struct pt_regs *regs) copy_user = is_copy_from_user(regs); instrumentation_end(); - switch (fixup_type) { - case EX_TYPE_UACCESS: - if (!copy_user) - return IN_KERNEL; - m->kflags |= MCE_IN_KERNEL_COPYIN; - fallthrough; + if (copy_user) { + m->kflags |= MCE_IN_KERNEL_COPYIN | MCE_IN_KERNEL_RECOV; + return IN_KERNEL_RECOV; + } + switch (fixup_type) { case EX_TYPE_FAULT_MCE_SAFE: case EX_TYPE_DEFAULT_MCE_SAFE: m->kflags |= MCE_IN_KERNEL_RECOV; diff --git a/arch/x86/kernel/cpu/mce/threshold.c b/arch/x86/kernel/cpu/mce/threshold.c index 89e31e1e5c9c..0d13c9ffcba0 100644 --- a/arch/x86/kernel/cpu/mce/threshold.c +++ b/arch/x86/kernel/cpu/mce/threshold.c @@ -13,6 +13,19 @@ #include "internal.h" +static u32 mce_apei_thr_limit; + +void mce_save_apei_thr_limit(u32 thr_limit) +{ + mce_apei_thr_limit = thr_limit; + pr_info("HEST corrected error threshold limit: %u\n", thr_limit); +} + +u32 mce_get_apei_thr_limit(void) +{ + return mce_apei_thr_limit; +} + static void default_threshold_interrupt(void) { pr_err("Unexpected threshold interrupt at vector %x\n", @@ -63,6 +76,9 @@ static void mce_handle_storm(unsigned int bank, bool on) case X86_VENDOR_INTEL: mce_intel_handle_storm(bank, on); break; + case X86_VENDOR_AMD: + mce_amd_handle_storm(bank, on); + break; } } @@ -85,12 +101,13 @@ void cmci_storm_end(unsigned int bank) { struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc); - __clear_bit(bank, this_cpu_ptr(mce_poll_banks)); + if (!mce_flags.amd_threshold) + __clear_bit(bank, this_cpu_ptr(mce_poll_banks)); storm->banks[bank].history = 0; storm->banks[bank].in_storm_mode = false; /* If no banks left in storm mode, stop polling. */ - if (!this_cpu_dec_return(storm_desc.stormy_bank_count)) + if (!--storm->stormy_bank_count) mce_timer_kick(false); } diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 31a73715d755..3821a985f4ff 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -23,17 +23,22 @@ #include <linux/earlycpio.h> #include <linux/firmware.h> +#include <linux/bsearch.h> #include <linux/uaccess.h> #include <linux/vmalloc.h> #include <linux/initrd.h> #include <linux/kernel.h> #include <linux/pci.h> +#include <crypto/sha2.h> + #include <asm/microcode.h> #include <asm/processor.h> +#include <asm/cmdline.h> #include <asm/setup.h> #include <asm/cpu.h> #include <asm/msr.h> +#include <asm/tlb.h> #include "internal.h" @@ -144,6 +149,158 @@ ucode_path[] __maybe_unused = "kernel/x86/microcode/AuthenticAMD.bin"; */ static u32 bsp_cpuid_1_eax __ro_after_init; +static bool sha_check = true; + +struct patch_digest { + u32 patch_id; + u8 sha256[SHA256_DIGEST_SIZE]; +}; + +#include "amd_shas.c" + +static int cmp_id(const void *key, const void *elem) +{ + struct patch_digest *pd = (struct patch_digest *)elem; + u32 patch_id = *(u32 *)key; + + if (patch_id == pd->patch_id) + return 0; + else if (patch_id < pd->patch_id) + return -1; + else + return 1; +} + +static u32 cpuid_to_ucode_rev(unsigned int val) +{ + union zen_patch_rev p = {}; + union cpuid_1_eax c; + + c.full = val; + + p.stepping = c.stepping; + p.model = c.model; + p.ext_model = c.ext_model; + p.ext_fam = c.ext_fam; + + return p.ucode_rev; +} + +static u32 get_cutoff_revision(u32 rev) +{ + switch (rev >> 8) { + case 0x80012: return 0x8001277; break; + case 0x80082: return 0x800820f; break; + case 0x83010: return 0x830107c; break; + case 0x86001: return 0x860010e; break; + case 0x86081: return 0x8608108; break; + case 0x87010: return 0x8701034; break; + case 0x8a000: return 0x8a0000a; break; + case 0xa0010: return 0xa00107a; break; + case 0xa0011: return 0xa0011da; break; + case 0xa0012: return 0xa001243; break; + case 0xa0082: return 0xa00820e; break; + case 0xa1011: return 0xa101153; break; + case 0xa1012: return 0xa10124e; break; + case 0xa1081: return 0xa108109; break; + case 0xa2010: return 0xa20102f; break; + case 0xa2012: return 0xa201212; break; + case 0xa4041: return 0xa404109; break; + case 0xa5000: return 0xa500013; break; + case 0xa6012: return 0xa60120a; break; + case 0xa7041: return 0xa704109; break; + case 0xa7052: return 0xa705208; break; + case 0xa7080: return 0xa708009; break; + case 0xa70c0: return 0xa70C009; break; + case 0xaa001: return 0xaa00116; break; + case 0xaa002: return 0xaa00218; break; + case 0xb0021: return 0xb002146; break; + case 0xb0081: return 0xb008111; break; + case 0xb1010: return 0xb101046; break; + case 0xb2040: return 0xb204031; break; + case 0xb4040: return 0xb404031; break; + case 0xb4041: return 0xb404101; break; + case 0xb6000: return 0xb600031; break; + case 0xb6080: return 0xb608031; break; + case 0xb7000: return 0xb700031; break; + default: break; + + } + return 0; +} + +static bool need_sha_check(u32 cur_rev) +{ + u32 cutoff; + + if (!cur_rev) { + cur_rev = cpuid_to_ucode_rev(bsp_cpuid_1_eax); + pr_info_once("No current revision, generating the lowest one: 0x%x\n", cur_rev); + } + + cutoff = get_cutoff_revision(cur_rev); + if (cutoff) + return cur_rev <= cutoff; + + pr_info("You should not be seeing this. Please send the following couple of lines to x86-<at>-kernel.org\n"); + pr_info("CPUID(1).EAX: 0x%x, current revision: 0x%x\n", bsp_cpuid_1_eax, cur_rev); + return true; +} + +static bool cpu_has_entrysign(void) +{ + unsigned int fam = x86_family(bsp_cpuid_1_eax); + unsigned int model = x86_model(bsp_cpuid_1_eax); + + if (fam == 0x17 || fam == 0x19) + return true; + + if (fam == 0x1a) { + if (model <= 0x2f || + (0x40 <= model && model <= 0x4f) || + (0x60 <= model && model <= 0x6f)) + return true; + } + + return false; +} + +static bool verify_sha256_digest(u32 patch_id, u32 cur_rev, const u8 *data, unsigned int len) +{ + struct patch_digest *pd = NULL; + u8 digest[SHA256_DIGEST_SIZE]; + int i; + + if (!cpu_has_entrysign()) + return true; + + if (!need_sha_check(cur_rev)) + return true; + + if (!sha_check) + return true; + + pd = bsearch(&patch_id, phashes, ARRAY_SIZE(phashes), sizeof(struct patch_digest), cmp_id); + if (!pd) { + pr_err("No sha256 digest for patch ID: 0x%x found\n", patch_id); + return false; + } + + sha256(data, len, digest); + + if (memcmp(digest, pd->sha256, sizeof(digest))) { + pr_err("Patch 0x%x SHA256 digest mismatch!\n", patch_id); + + for (i = 0; i < SHA256_DIGEST_SIZE; i++) + pr_cont("0x%x ", digest[i]); + pr_info("\n"); + + return false; + } + + return true; +} + static union cpuid_1_eax ucode_rev_to_cpuid(unsigned int val) { union zen_patch_rev p; @@ -161,6 +318,30 @@ static union cpuid_1_eax ucode_rev_to_cpuid(unsigned int val) return c; } +static u32 get_patch_level(void) +{ + u32 rev, dummy __always_unused; + + if (IS_ENABLED(CONFIG_MICROCODE_DBG)) { + int cpu = smp_processor_id(); + + if (!microcode_rev[cpu]) { + if (!base_rev) + base_rev = cpuid_to_ucode_rev(bsp_cpuid_1_eax); + + microcode_rev[cpu] = base_rev; + + ucode_dbg("CPU%d, base_rev: 0x%x\n", cpu, base_rev); + } + + return microcode_rev[cpu]; + } + + native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + + return rev; +} + static u16 find_equiv_id(struct equiv_cpu_table *et, u32 sig) { unsigned int i; @@ -190,13 +371,13 @@ static bool verify_container(const u8 *buf, size_t buf_size) u32 cont_magic; if (buf_size <= CONTAINER_HDR_SZ) { - pr_debug("Truncated microcode container header.\n"); + ucode_dbg("Truncated microcode container header.\n"); return false; } cont_magic = *(const u32 *)buf; if (cont_magic != UCODE_MAGIC) { - pr_debug("Invalid magic value (0x%08x).\n", cont_magic); + ucode_dbg("Invalid magic value (0x%08x).\n", cont_magic); return false; } @@ -221,8 +402,8 @@ static bool verify_equivalence_table(const u8 *buf, size_t buf_size) cont_type = hdr[1]; if (cont_type != UCODE_EQUIV_CPU_TABLE_TYPE) { - pr_debug("Wrong microcode container equivalence table type: %u.\n", - cont_type); + ucode_dbg("Wrong microcode container equivalence table type: %u.\n", + cont_type); return false; } @@ -231,7 +412,7 @@ static bool verify_equivalence_table(const u8 *buf, size_t buf_size) equiv_tbl_len = hdr[2]; if (equiv_tbl_len < sizeof(struct equiv_cpu_entry) || buf_size < equiv_tbl_len) { - pr_debug("Truncated equivalence table.\n"); + ucode_dbg("Truncated equivalence table.\n"); return false; } @@ -245,14 +426,13 @@ static bool verify_equivalence_table(const u8 *buf, size_t buf_size) * On success, @sh_psize returns the patch size according to the section header, * to the caller. */ -static bool -__verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize) +static bool __verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize) { u32 p_type, p_size; const u32 *hdr; if (buf_size < SECTION_HDR_SIZE) { - pr_debug("Truncated patch section.\n"); + ucode_dbg("Truncated patch section.\n"); return false; } @@ -261,13 +441,13 @@ __verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize) p_size = hdr[1]; if (p_type != UCODE_UCODE_TYPE) { - pr_debug("Invalid type field (0x%x) in container file section header.\n", - p_type); + ucode_dbg("Invalid type field (0x%x) in container file section header.\n", + p_type); return false; } if (p_size < sizeof(struct microcode_header_amd)) { - pr_debug("Patch of size %u too short.\n", p_size); + ucode_dbg("Patch of size %u too short.\n", p_size); return false; } @@ -282,13 +462,13 @@ __verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize) * exceed the per-family maximum). @sh_psize is the size read from the section * header. */ -static unsigned int __verify_patch_size(u32 sh_psize, size_t buf_size) +static bool __verify_patch_size(u32 sh_psize, size_t buf_size) { u8 family = x86_family(bsp_cpuid_1_eax); u32 max_size; if (family >= 0x15) - return min_t(u32, sh_psize, buf_size); + goto ret; #define F1XH_MPB_MAX_SIZE 2048 #define F14H_MPB_MAX_SIZE 1824 @@ -302,13 +482,15 @@ static unsigned int __verify_patch_size(u32 sh_psize, size_t buf_size) break; default: WARN(1, "%s: WTF family: 0x%x\n", __func__, family); - return 0; + return false; } - if (sh_psize > min_t(u32, buf_size, max_size)) - return 0; + if (sh_psize > max_size) + return false; - return sh_psize; +ret: + /* Working with the whole buffer so < is ok. */ + return sh_psize <= buf_size; } /* @@ -323,7 +505,7 @@ static int verify_patch(const u8 *buf, size_t buf_size, u32 *patch_size) { u8 family = x86_family(bsp_cpuid_1_eax); struct microcode_header_amd *mc_hdr; - unsigned int ret; + u32 cur_rev, cutoff, patch_rev; u32 sh_psize; u16 proc_id; u8 patch_fam; @@ -343,13 +525,12 @@ static int verify_patch(const u8 *buf, size_t buf_size, u32 *patch_size) * size sh_psize, as the section claims. */ if (buf_size < sh_psize) { - pr_debug("Patch of size %u truncated.\n", sh_psize); + ucode_dbg("Patch of size %u truncated.\n", sh_psize); return -1; } - ret = __verify_patch_size(sh_psize, buf_size); - if (!ret) { - pr_debug("Per-family patch size mismatch.\n"); + if (!__verify_patch_size(sh_psize, buf_size)) { + ucode_dbg("Per-family patch size mismatch.\n"); return -1; } @@ -363,9 +544,33 @@ static int verify_patch(const u8 *buf, size_t buf_size, u32 *patch_size) proc_id = mc_hdr->processor_rev_id; patch_fam = 0xf + (proc_id >> 12); + if (patch_fam != family) return 1; + cur_rev = get_patch_level(); + + /* No cutoff revision means old/unaffected by signing algorithm weakness => matches */ + cutoff = get_cutoff_revision(cur_rev); + if (!cutoff) + goto ok; + + patch_rev = mc_hdr->patch_id; + + ucode_dbg("cur_rev: 0x%x, cutoff: 0x%x, patch_rev: 0x%x\n", + cur_rev, cutoff, patch_rev); + + if (cur_rev <= cutoff && patch_rev <= cutoff) + goto ok; + + if (cur_rev > cutoff && patch_rev > cutoff) + goto ok; + + return 1; + +ok: + ucode_dbg("Patch-ID 0x%08x: family: 0x%x\n", mc_hdr->patch_id, patch_fam); + return 0; } @@ -380,8 +585,8 @@ static bool mc_patch_matches(struct microcode_amd *mc, u16 eq_id) /* * This scans the ucode blob for the proper container as we can have multiple - * containers glued together. Returns the equivalence ID from the equivalence - * table or 0 if none found. + * containers glued together. + * * Returns the amount of bytes consumed while scanning. @desc contains all the * data we're going to use in later stages of the application. */ @@ -433,9 +638,12 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc) } mc = (struct microcode_amd *)(buf + SECTION_HDR_SIZE); + if (mc_patch_matches(mc, eq_id)) { desc->psize = patch_size; desc->mc = mc; + + ucode_dbg(" match: size: %d\n", patch_size); } skip: @@ -483,53 +691,41 @@ static void scan_containers(u8 *ucode, size_t size, struct cont_desc *desc) } } -static int __apply_microcode_amd(struct microcode_amd *mc) +static bool __apply_microcode_amd(struct microcode_amd *mc, u32 *cur_rev, + unsigned int psize) { - u32 rev, dummy; + unsigned long p_addr = (unsigned long)&mc->hdr.data_code; - native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc->hdr.data_code); + if (!verify_sha256_digest(mc->hdr.patch_id, *cur_rev, (const u8 *)p_addr, psize)) + return false; - /* verify patch application was successful */ - native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + native_wrmsrq(MSR_AMD64_PATCH_LOADER, p_addr); - if (rev != mc->hdr.patch_id) - return -1; + if (x86_family(bsp_cpuid_1_eax) == 0x17) { + unsigned long p_addr_end = p_addr + psize - 1; - return 0; -} + invlpg(p_addr); -/* - * Early load occurs before we can vmalloc(). So we look for the microcode - * patch container file in initrd, traverse equivalent cpu table, look for a - * matching microcode patch, and update, all in initrd memory in place. - * When vmalloc() is available for use later -- on 64-bit during first AP load, - * and on 32-bit during save_microcode_in_initrd_amd() -- we can call - * load_microcode_amd() to save equivalent cpu table and microcode patches in - * kernel heap memory. - * - * Returns true if container found (sets @desc), false otherwise. - */ -static bool early_apply_microcode(u32 old_rev, void *ucode, size_t size) -{ - struct cont_desc desc = { 0 }; - struct microcode_amd *mc; - bool ret = false; + /* + * Flush next page too if patch image is crossing a page + * boundary. + */ + if (p_addr >> PAGE_SHIFT != p_addr_end >> PAGE_SHIFT) + invlpg(p_addr_end); + } - scan_containers(ucode, size, &desc); + if (IS_ENABLED(CONFIG_MICROCODE_DBG)) + microcode_rev[smp_processor_id()] = mc->hdr.patch_id; - mc = desc.mc; - if (!mc) - return ret; + /* verify patch application was successful */ + *cur_rev = get_patch_level(); - /* - * Allow application of the same revision to pick up SMT-specific - * changes even if the revision of the other SMT thread is already - * up-to-date. - */ - if (old_rev > mc->hdr.patch_id) - return ret; + ucode_dbg("updated rev: 0x%x\n", *cur_rev); + + if (*cur_rev != mc->hdr.patch_id) + return false; - return !__apply_microcode_amd(mc); + return true; } static bool get_builtin_microcode(struct cpio_data *cp) @@ -554,64 +750,74 @@ static bool get_builtin_microcode(struct cpio_data *cp) return false; } -static void __init find_blobs_in_containers(struct cpio_data *ret) +static bool __init find_blobs_in_containers(struct cpio_data *ret) { struct cpio_data cp; + bool found; if (!get_builtin_microcode(&cp)) cp = find_microcode_in_initrd(ucode_path); - *ret = cp; + found = cp.data && cp.size; + if (found) + *ret = cp; + + return found; } +/* + * Early load occurs before we can vmalloc(). So we look for the microcode + * patch container file in initrd, traverse equivalent cpu table, look for a + * matching microcode patch, and update, all in initrd memory in place. + * When vmalloc() is available for use later -- on 64-bit during first AP load, + * and on 32-bit during save_microcode_in_initrd() -- we can call + * load_microcode_amd() to save equivalent cpu table and microcode patches in + * kernel heap memory. + */ void __init load_ucode_amd_bsp(struct early_load_data *ed, unsigned int cpuid_1_eax) { + struct cont_desc desc = { }; + struct microcode_amd *mc; struct cpio_data cp = { }; - u32 dummy; + char buf[4]; + u32 rev; + + if (cmdline_find_option(boot_command_line, "microcode.amd_sha_check", buf, 4)) { + if (!strncmp(buf, "off", 3)) { + sha_check = false; + pr_warn_once("It is a very very bad idea to disable the blobs SHA check!\n"); + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); + } + } bsp_cpuid_1_eax = cpuid_1_eax; - native_rdmsr(MSR_AMD64_PATCH_LEVEL, ed->old_rev, dummy); + rev = get_patch_level(); + ed->old_rev = rev; /* Needed in load_microcode_amd() */ ucode_cpu_info[0].cpu_sig.sig = cpuid_1_eax; - find_blobs_in_containers(&cp); - if (!(cp.data && cp.size)) + if (!find_blobs_in_containers(&cp)) return; - if (early_apply_microcode(ed->old_rev, cp.data, cp.size)) - native_rdmsr(MSR_AMD64_PATCH_LEVEL, ed->new_rev, dummy); -} - -static enum ucode_state _load_microcode_amd(u8 family, const u8 *data, size_t size); - -static int __init save_microcode_in_initrd(void) -{ - unsigned int cpuid_1_eax = native_cpuid_eax(1); - struct cpuinfo_x86 *c = &boot_cpu_data; - struct cont_desc desc = { 0 }; - enum ucode_state ret; - struct cpio_data cp; - - if (dis_ucode_ldr || c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) - return 0; - - find_blobs_in_containers(&cp); - if (!(cp.data && cp.size)) - return -EINVAL; - scan_containers(cp.data, cp.size, &desc); - if (!desc.mc) - return -EINVAL; - ret = _load_microcode_amd(x86_family(cpuid_1_eax), desc.data, desc.size); - if (ret > UCODE_UPDATED) - return -EINVAL; + mc = desc.mc; + if (!mc) + return; - return 0; + /* + * Allow application of the same revision to pick up SMT-specific + * changes even if the revision of the other SMT thread is already + * up-to-date. + */ + if (ed->old_rev > mc->hdr.patch_id) + return; + + if (__apply_microcode_amd(mc, &rev, desc.psize)) + ed->new_rev = rev; } -early_initcall(save_microcode_in_initrd); static inline bool patch_cpus_equivalent(struct ucode_patch *p, struct ucode_patch *n, @@ -644,8 +850,6 @@ static struct ucode_patch *cache_find_patch(struct ucode_cpu_info *uci, u16 equi n.equiv_cpu = equiv_cpu; n.patch_id = uci->cpu_sig.rev; - WARN_ON_ONCE(!n.patch_id); - list_for_each_entry(p, µcode_cache, plist) if (patch_cpus_equivalent(p, &n, false)) return p; @@ -712,14 +916,9 @@ static void free_cache(void) static struct ucode_patch *find_patch(unsigned int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - u32 rev, dummy __always_unused; u16 equiv_id = 0; - /* fetch rev if not populated yet: */ - if (!uci->cpu_sig.rev) { - rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - uci->cpu_sig.rev = rev; - } + uci->cpu_sig.rev = get_patch_level(); if (x86_family(bsp_cpuid_1_eax) < 0x17) { equiv_id = find_equiv_id(&equiv_table, uci->cpu_sig.sig); @@ -742,22 +941,20 @@ void reload_ucode_amd(unsigned int cpu) mc = p->data; - rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - + rev = get_patch_level(); if (rev < mc->hdr.patch_id) { - if (!__apply_microcode_amd(mc)) - pr_info_once("reload revision: 0x%08x\n", mc->hdr.patch_id); + if (__apply_microcode_amd(mc, &rev, p->size)) + pr_info_once("reload revision: 0x%08x\n", rev); } } static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) { - struct cpuinfo_x86 *c = &cpu_data(cpu); struct ucode_cpu_info *uci = ucode_cpu_info + cpu; struct ucode_patch *p; csig->sig = cpuid_eax(0x00000001); - csig->rev = c->microcode; + csig->rev = get_patch_level(); /* * a patch could have been loaded early, set uci->mc so that @@ -798,7 +995,7 @@ static enum ucode_state apply_microcode_amd(int cpu) goto out; } - if (__apply_microcode_amd(mc_amd)) { + if (!__apply_microcode_amd(mc_amd, &rev, p->size)) { pr_err("CPU%d: update failed for patch_level=0x%08x\n", cpu, mc_amd->hdr.patch_id); return UCODE_ERROR; @@ -910,7 +1107,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover, patch->patch_id = mc_hdr->patch_id; patch->equiv_cpu = proc_id; - pr_debug("%s: Adding patch_id: 0x%08x, proc_id: 0x%04x\n", + ucode_dbg("%s: Adding patch_id: 0x%08x, proc_id: 0x%04x\n", __func__, patch->patch_id, proc_id); /* ... and add to cache. */ @@ -920,8 +1117,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover, } /* Scan the blob in @data and add microcode patches to the cache. */ -static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, - size_t size) +static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, size_t size) { u8 *fw = (u8 *)data; size_t offset; @@ -979,7 +1175,7 @@ static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t siz if (ret != UCODE_OK) return ret; - for_each_node(nid) { + for_each_node_with_cpus(nid) { cpu = cpumask_first(cpumask_of_node(nid)); c = &cpu_data(cpu); @@ -996,6 +1192,34 @@ static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t siz return ret; } +static int __init save_microcode_in_initrd(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + struct cont_desc desc = { 0 }; + unsigned int cpuid_1_eax; + enum ucode_state ret; + struct cpio_data cp; + + if (microcode_loader_disabled() || c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) + return 0; + + cpuid_1_eax = native_cpuid_eax(1); + + if (!find_blobs_in_containers(&cp)) + return -EINVAL; + + scan_containers(cp.data, cp.size, &desc); + if (!desc.mc) + return -EINVAL; + + ret = _load_microcode_amd(x86_family(cpuid_1_eax), desc.data, desc.size); + if (ret > UCODE_UPDATED) + return -EINVAL; + + return 0; +} +early_initcall(save_microcode_in_initrd); + /* * AMD microcode firmware naming convention, up to family 15h they are in * the legacy file: @@ -1026,7 +1250,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device) snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86); if (request_firmware_direct(&fw, (const char *)fw_name, device)) { - pr_debug("failed to load file %s\n", fw_name); + ucode_dbg("failed to load file %s\n", fw_name); goto out; } @@ -1050,11 +1274,18 @@ static void microcode_fini_cpu_amd(int cpu) uci->mc = NULL; } +static void finalize_late_load_amd(int result) +{ + if (result) + cleanup(); +} + static struct microcode_ops microcode_amd_ops = { .request_microcode_fw = request_microcode_amd, .collect_cpu_info = collect_cpu_info_amd, .apply_microcode = apply_microcode_amd, .microcode_fini_cpu = microcode_fini_cpu_amd, + .finalize_late_load = finalize_late_load_amd, .nmi_safe = true, }; diff --git a/arch/x86/kernel/cpu/microcode/amd_shas.c b/arch/x86/kernel/cpu/microcode/amd_shas.c new file mode 100644 index 000000000000..1fd349cfc802 --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/amd_shas.c @@ -0,0 +1,556 @@ +/* Keep 'em sorted. */ +static const struct patch_digest phashes[] = { + { 0x8001227, { + 0x99,0xc0,0x9b,0x2b,0xcc,0x9f,0x52,0x1b, + 0x1a,0x5f,0x1d,0x83,0xa1,0x6c,0xc4,0x46, + 0xe2,0x6c,0xda,0x73,0xfb,0x2d,0x23,0xa8, + 0x77,0xdc,0x15,0x31,0x33,0x4a,0x46,0x18, + } + }, + { 0x8001250, { + 0xc0,0x0b,0x6b,0x19,0xfd,0x5c,0x39,0x60, + 0xd5,0xc3,0x57,0x46,0x54,0xe4,0xd1,0xaa, + 0xa8,0xf7,0x1f,0xa8,0x6a,0x60,0x3e,0xe3, + 0x27,0x39,0x8e,0x53,0x30,0xf8,0x49,0x19, + } + }, + { 0x800126e, { + 0xf3,0x8b,0x2b,0xb6,0x34,0xe3,0xc8,0x2c, + 0xef,0xec,0x63,0x6d,0xc8,0x76,0x77,0xb3, + 0x25,0x5a,0xb7,0x52,0x8c,0x83,0x26,0xe6, + 0x4c,0xbe,0xbf,0xe9,0x7d,0x22,0x6a,0x43, + } + }, + { 0x800126f, { + 0x2b,0x5a,0xf2,0x9c,0xdd,0xd2,0x7f,0xec, + 0xec,0x96,0x09,0x57,0xb0,0x96,0x29,0x8b, + 0x2e,0x26,0x91,0xf0,0x49,0x33,0x42,0x18, + 0xdd,0x4b,0x65,0x5a,0xd4,0x15,0x3d,0x33, + } + }, + { 0x800820d, { + 0x68,0x98,0x83,0xcd,0x22,0x0d,0xdd,0x59, + 0x73,0x2c,0x5b,0x37,0x1f,0x84,0x0e,0x67, + 0x96,0x43,0x83,0x0c,0x46,0x44,0xab,0x7c, + 0x7b,0x65,0x9e,0x57,0xb5,0x90,0x4b,0x0e, + } + }, + { 0x8301025, { + 0xe4,0x7d,0xdb,0x1e,0x14,0xb4,0x5e,0x36, + 0x8f,0x3e,0x48,0x88,0x3c,0x6d,0x76,0xa1, + 0x59,0xc6,0xc0,0x72,0x42,0xdf,0x6c,0x30, + 0x6f,0x0b,0x28,0x16,0x61,0xfc,0x79,0x77, + } + }, + { 0x8301055, { + 0x81,0x7b,0x99,0x1b,0xae,0x2d,0x4f,0x9a, + 0xef,0x13,0xce,0xb5,0x10,0xaf,0x6a,0xea, + 0xe5,0xb0,0x64,0x98,0x10,0x68,0x34,0x3b, + 0x9d,0x7a,0xd6,0x22,0x77,0x5f,0xb3,0x5b, + } + }, + { 0x8301072, { + 0xcf,0x76,0xa7,0x1a,0x49,0xdf,0x2a,0x5e, + 0x9e,0x40,0x70,0xe5,0xdd,0x8a,0xa8,0x28, + 0x20,0xdc,0x91,0xd8,0x2c,0xa6,0xa0,0xb1, + 0x2d,0x22,0x26,0x94,0x4b,0x40,0x85,0x30, + } + }, + { 0x830107a, { + 0x2a,0x65,0x8c,0x1a,0x5e,0x07,0x21,0x72, + 0xdf,0x90,0xa6,0x51,0x37,0xd3,0x4b,0x34, + 0xc4,0xda,0x03,0xe1,0x8a,0x6c,0xfb,0x20, + 0x04,0xb2,0x81,0x05,0xd4,0x87,0xf4,0x0a, + } + }, + { 0x830107b, { + 0xb3,0x43,0x13,0x63,0x56,0xc1,0x39,0xad, + 0x10,0xa6,0x2b,0xcc,0x02,0xe6,0x76,0x2a, + 0x1e,0x39,0x58,0x3e,0x23,0x6e,0xa4,0x04, + 0x95,0xea,0xf9,0x6d,0xc2,0x8a,0x13,0x19, + } + }, + { 0x830107c, { + 0x21,0x64,0xde,0xfb,0x9f,0x68,0x96,0x47, + 0x70,0x5c,0xe2,0x8f,0x18,0x52,0x6a,0xac, + 0xa4,0xd2,0x2e,0xe0,0xde,0x68,0x66,0xc3, + 0xeb,0x1e,0xd3,0x3f,0xbc,0x51,0x1d,0x38, + } + }, + { 0x860010d, { + 0x86,0xb6,0x15,0x83,0xbc,0x3b,0x9c,0xe0, + 0xb3,0xef,0x1d,0x99,0x84,0x35,0x15,0xf7, + 0x7c,0x2a,0xc6,0x42,0xdb,0x73,0x07,0x5c, + 0x7d,0xc3,0x02,0xb5,0x43,0x06,0x5e,0xf8, + } + }, + { 0x8608108, { + 0x14,0xfe,0x57,0x86,0x49,0xc8,0x68,0xe2, + 0x11,0xa3,0xcb,0x6e,0xff,0x6e,0xd5,0x38, + 0xfe,0x89,0x1a,0xe0,0x67,0xbf,0xc4,0xcc, + 0x1b,0x9f,0x84,0x77,0x2b,0x9f,0xaa,0xbd, + } + }, + { 0x8701034, { + 0xc3,0x14,0x09,0xa8,0x9c,0x3f,0x8d,0x83, + 0x9b,0x4c,0xa5,0xb7,0x64,0x8b,0x91,0x5d, + 0x85,0x6a,0x39,0x26,0x1e,0x14,0x41,0xa8, + 0x75,0xea,0xa6,0xf9,0xc9,0xd1,0xea,0x2b, + } + }, + { 0x8a00008, { + 0xd7,0x2a,0x93,0xdc,0x05,0x2f,0xa5,0x6e, + 0x0c,0x61,0x2c,0x07,0x9f,0x38,0xe9,0x8e, + 0xef,0x7d,0x2a,0x05,0x4d,0x56,0xaf,0x72, + 0xe7,0x56,0x47,0x6e,0x60,0x27,0xd5,0x8c, + } + }, + { 0x8a0000a, { + 0x73,0x31,0x26,0x22,0xd4,0xf9,0xee,0x3c, + 0x07,0x06,0xe7,0xb9,0xad,0xd8,0x72,0x44, + 0x33,0x31,0xaa,0x7d,0xc3,0x67,0x0e,0xdb, + 0x47,0xb5,0xaa,0xbc,0xf5,0xbb,0xd9,0x20, + } + }, + { 0xa00104c, { + 0x3c,0x8a,0xfe,0x04,0x62,0xd8,0x6d,0xbe, + 0xa7,0x14,0x28,0x64,0x75,0xc0,0xa3,0x76, + 0xb7,0x92,0x0b,0x97,0x0a,0x8e,0x9c,0x5b, + 0x1b,0xc8,0x9d,0x3a,0x1e,0x81,0x3d,0x3b, + } + }, + { 0xa00104e, { + 0xc4,0x35,0x82,0x67,0xd2,0x86,0xe5,0xb2, + 0xfd,0x69,0x12,0x38,0xc8,0x77,0xba,0xe0, + 0x70,0xf9,0x77,0x89,0x10,0xa6,0x74,0x4e, + 0x56,0x58,0x13,0xf5,0x84,0x70,0x28,0x0b, + } + }, + { 0xa001053, { + 0x92,0x0e,0xf4,0x69,0x10,0x3b,0xf9,0x9d, + 0x31,0x1b,0xa6,0x99,0x08,0x7d,0xd7,0x25, + 0x7e,0x1e,0x89,0xba,0x35,0x8d,0xac,0xcb, + 0x3a,0xb4,0xdf,0x58,0x12,0xcf,0xc0,0xc3, + } + }, + { 0xa001058, { + 0x33,0x7d,0xa9,0xb5,0x4e,0x62,0x13,0x36, + 0xef,0x66,0xc9,0xbd,0x0a,0xa6,0x3b,0x19, + 0xcb,0xf5,0xc2,0xc3,0x55,0x47,0x20,0xec, + 0x1f,0x7b,0xa1,0x44,0x0e,0x8e,0xa4,0xb2, + } + }, + { 0xa001075, { + 0x39,0x02,0x82,0xd0,0x7c,0x26,0x43,0xe9, + 0x26,0xa3,0xd9,0x96,0xf7,0x30,0x13,0x0a, + 0x8a,0x0e,0xac,0xe7,0x1d,0xdc,0xe2,0x0f, + 0xcb,0x9e,0x8d,0xbc,0xd2,0xa2,0x44,0xe0, + } + }, + { 0xa001078, { + 0x2d,0x67,0xc7,0x35,0xca,0xef,0x2f,0x25, + 0x4c,0x45,0x93,0x3f,0x36,0x01,0x8c,0xce, + 0xa8,0x5b,0x07,0xd3,0xc1,0x35,0x3c,0x04, + 0x20,0xa2,0xfc,0xdc,0xe6,0xce,0x26,0x3e, + } + }, + { 0xa001079, { + 0x43,0xe2,0x05,0x9c,0xfd,0xb7,0x5b,0xeb, + 0x5b,0xe9,0xeb,0x3b,0x96,0xf4,0xe4,0x93, + 0x73,0x45,0x3e,0xac,0x8d,0x3b,0xe4,0xdb, + 0x10,0x31,0xc1,0xe4,0xa2,0xd0,0x5a,0x8a, + } + }, + { 0xa00107a, { + 0x5f,0x92,0xca,0xff,0xc3,0x59,0x22,0x5f, + 0x02,0xa0,0x91,0x3b,0x4a,0x45,0x10,0xfd, + 0x19,0xe1,0x8a,0x6d,0x9a,0x92,0xc1,0x3f, + 0x75,0x78,0xac,0x78,0x03,0x1d,0xdb,0x18, + } + }, + { 0xa001143, { + 0x56,0xca,0xf7,0x43,0x8a,0x4c,0x46,0x80, + 0xec,0xde,0xe5,0x9c,0x50,0x84,0x9a,0x42, + 0x27,0xe5,0x51,0x84,0x8f,0x19,0xc0,0x8d, + 0x0c,0x25,0xb4,0xb0,0x8f,0x10,0xf3,0xf8, + } + }, + { 0xa001144, { + 0x42,0xd5,0x9b,0xa7,0xd6,0x15,0x29,0x41, + 0x61,0xc4,0x72,0x3f,0xf3,0x06,0x78,0x4b, + 0x65,0xf3,0x0e,0xfa,0x9c,0x87,0xde,0x25, + 0xbd,0xb3,0x9a,0xf4,0x75,0x13,0x53,0xdc, + } + }, + { 0xa00115d, { + 0xd4,0xc4,0x49,0x36,0x89,0x0b,0x47,0xdd, + 0xfb,0x2f,0x88,0x3b,0x5f,0xf2,0x8e,0x75, + 0xc6,0x6c,0x37,0x5a,0x90,0x25,0x94,0x3e, + 0x36,0x9c,0xae,0x02,0x38,0x6c,0xf5,0x05, + } + }, + { 0xa001173, { + 0x28,0xbb,0x9b,0xd1,0xa0,0xa0,0x7e,0x3a, + 0x59,0x20,0xc0,0xa9,0xb2,0x5c,0xc3,0x35, + 0x53,0x89,0xe1,0x4c,0x93,0x2f,0x1d,0xc3, + 0xe5,0xf7,0xf3,0xc8,0x9b,0x61,0xaa,0x9e, + } + }, + { 0xa0011a8, { + 0x97,0xc6,0x16,0x65,0x99,0xa4,0x85,0x3b, + 0xf6,0xce,0xaa,0x49,0x4a,0x3a,0xc5,0xb6, + 0x78,0x25,0xbc,0x53,0xaf,0x5d,0xcf,0xf4, + 0x23,0x12,0xbb,0xb1,0xbc,0x8a,0x02,0x2e, + } + }, + { 0xa0011ce, { + 0xcf,0x1c,0x90,0xa3,0x85,0x0a,0xbf,0x71, + 0x94,0x0e,0x80,0x86,0x85,0x4f,0xd7,0x86, + 0xae,0x38,0x23,0x28,0x2b,0x35,0x9b,0x4e, + 0xfe,0xb8,0xcd,0x3d,0x3d,0x39,0xc9,0x6a, + } + }, + { 0xa0011d1, { + 0xdf,0x0e,0xca,0xde,0xf6,0xce,0x5c,0x1e, + 0x4c,0xec,0xd7,0x71,0x83,0xcc,0xa8,0x09, + 0xc7,0xc5,0xfe,0xb2,0xf7,0x05,0xd2,0xc5, + 0x12,0xdd,0xe4,0xf3,0x92,0x1c,0x3d,0xb8, + } + }, + { 0xa0011d3, { + 0x91,0xe6,0x10,0xd7,0x57,0xb0,0x95,0x0b, + 0x9a,0x24,0xee,0xf7,0xcf,0x56,0xc1,0xa6, + 0x4a,0x52,0x7d,0x5f,0x9f,0xdf,0xf6,0x00, + 0x65,0xf7,0xea,0xe8,0x2a,0x88,0xe2,0x26, + } + }, + { 0xa0011d5, { + 0xed,0x69,0x89,0xf4,0xeb,0x64,0xc2,0x13, + 0xe0,0x51,0x1f,0x03,0x26,0x52,0x7d,0xb7, + 0x93,0x5d,0x65,0xca,0xb8,0x12,0x1d,0x62, + 0x0d,0x5b,0x65,0x34,0x69,0xb2,0x62,0x21, + } + }, + { 0xa0011d7, { + 0x35,0x07,0xcd,0x40,0x94,0xbc,0x81,0x6b, + 0xfc,0x61,0x56,0x1a,0xe2,0xdb,0x96,0x12, + 0x1c,0x1c,0x31,0xb1,0x02,0x6f,0xe5,0xd2, + 0xfe,0x1b,0x04,0x03,0x2c,0x8f,0x4c,0x36, + } + }, + { 0xa001223, { + 0xfb,0x32,0x5f,0xc6,0x83,0x4f,0x8c,0xb8, + 0xa4,0x05,0xf9,0x71,0x53,0x01,0x16,0xc4, + 0x83,0x75,0x94,0xdd,0xeb,0x7e,0xb7,0x15, + 0x8e,0x3b,0x50,0x29,0x8a,0x9c,0xcc,0x45, + } + }, + { 0xa001224, { + 0x0e,0x0c,0xdf,0xb4,0x89,0xee,0x35,0x25, + 0xdd,0x9e,0xdb,0xc0,0x69,0x83,0x0a,0xad, + 0x26,0xa9,0xaa,0x9d,0xfc,0x3c,0xea,0xf9, + 0x6c,0xdc,0xd5,0x6d,0x8b,0x6e,0x85,0x4a, + } + }, + { 0xa001227, { + 0xab,0xc6,0x00,0x69,0x4b,0x50,0x87,0xad, + 0x5f,0x0e,0x8b,0xea,0x57,0x38,0xce,0x1d, + 0x0f,0x75,0x26,0x02,0xf6,0xd6,0x96,0xe9, + 0x87,0xb9,0xd6,0x20,0x27,0x7c,0xd2,0xe0, + } + }, + { 0xa001229, { + 0x7f,0x49,0x49,0x48,0x46,0xa5,0x50,0xa6, + 0x28,0x89,0x98,0xe2,0x9e,0xb4,0x7f,0x75, + 0x33,0xa7,0x04,0x02,0xe4,0x82,0xbf,0xb4, + 0xa5,0x3a,0xba,0x24,0x8d,0x31,0x10,0x1d, + } + }, + { 0xa00122e, { + 0x56,0x94,0xa9,0x5d,0x06,0x68,0xfe,0xaf, + 0xdf,0x7a,0xff,0x2d,0xdf,0x74,0x0f,0x15, + 0x66,0xfb,0x00,0xb5,0x51,0x97,0x9b,0xfa, + 0xcb,0x79,0x85,0x46,0x25,0xb4,0xd2,0x10, + } + }, + { 0xa001231, { + 0x0b,0x46,0xa5,0xfc,0x18,0x15,0xa0,0x9e, + 0xa6,0xdc,0xb7,0xff,0x17,0xf7,0x30,0x64, + 0xd4,0xda,0x9e,0x1b,0xc3,0xfc,0x02,0x3b, + 0xe2,0xc6,0x0e,0x41,0x54,0xb5,0x18,0xdd, + } + }, + { 0xa001234, { + 0x88,0x8d,0xed,0xab,0xb5,0xbd,0x4e,0xf7, + 0x7f,0xd4,0x0e,0x95,0x34,0x91,0xff,0xcc, + 0xfb,0x2a,0xcd,0xf7,0xd5,0xdb,0x4c,0x9b, + 0xd6,0x2e,0x73,0x50,0x8f,0x83,0x79,0x1a, + } + }, + { 0xa001236, { + 0x3d,0x30,0x00,0xb9,0x71,0xba,0x87,0x78, + 0xa8,0x43,0x55,0xc4,0x26,0x59,0xcf,0x9d, + 0x93,0xce,0x64,0x0e,0x8b,0x72,0x11,0x8b, + 0xa3,0x8f,0x51,0xe9,0xca,0x98,0xaa,0x25, + } + }, + { 0xa001238, { + 0x72,0xf7,0x4b,0x0c,0x7d,0x58,0x65,0xcc, + 0x00,0xcc,0x57,0x16,0x68,0x16,0xf8,0x2a, + 0x1b,0xb3,0x8b,0xe1,0xb6,0x83,0x8c,0x7e, + 0xc0,0xcd,0x33,0xf2,0x8d,0xf9,0xef,0x59, + } + }, + { 0xa00123b, { + 0xef,0xa1,0x1e,0x71,0xf1,0xc3,0x2c,0xe2, + 0xc3,0xef,0x69,0x41,0x7a,0x54,0xca,0xc3, + 0x8f,0x62,0x84,0xee,0xc2,0x39,0xd9,0x28, + 0x95,0xa7,0x12,0x49,0x1e,0x30,0x71,0x72, + } + }, + { 0xa00820c, { + 0xa8,0x0c,0x81,0xc0,0xa6,0x00,0xe7,0xf3, + 0x5f,0x65,0xd3,0xb9,0x6f,0xea,0x93,0x63, + 0xf1,0x8c,0x88,0x45,0xd7,0x82,0x80,0xd1, + 0xe1,0x3b,0x8d,0xb2,0xf8,0x22,0x03,0xe2, + } + }, + { 0xa00820d, { + 0xf9,0x2a,0xc0,0xf4,0x9e,0xa4,0x87,0xa4, + 0x7d,0x87,0x00,0xfd,0xab,0xda,0x19,0xca, + 0x26,0x51,0x32,0xc1,0x57,0x91,0xdf,0xc1, + 0x05,0xeb,0x01,0x7c,0x5a,0x95,0x21,0xb7, + } + }, + { 0xa10113e, { + 0x05,0x3c,0x66,0xd7,0xa9,0x5a,0x33,0x10, + 0x1b,0xf8,0x9c,0x8f,0xed,0xfc,0xa7,0xa0, + 0x15,0xe3,0x3f,0x4b,0x1d,0x0d,0x0a,0xd5, + 0xfa,0x90,0xc4,0xed,0x9d,0x90,0xaf,0x53, + } + }, + { 0xa101144, { + 0xb3,0x0b,0x26,0x9a,0xf8,0x7c,0x02,0x26, + 0x35,0x84,0x53,0xa4,0xd3,0x2c,0x7c,0x09, + 0x68,0x7b,0x96,0xb6,0x93,0xef,0xde,0xbc, + 0xfd,0x4b,0x15,0xd2,0x81,0xd3,0x51,0x47, + } + }, + { 0xa101148, { + 0x20,0xd5,0x6f,0x40,0x4a,0xf6,0x48,0x90, + 0xc2,0x93,0x9a,0xc2,0xfd,0xac,0xef,0x4f, + 0xfa,0xc0,0x3d,0x92,0x3c,0x6d,0x01,0x08, + 0xf1,0x5e,0xb0,0xde,0xb4,0x98,0xae,0xc4, + } + }, + { 0xa10114c, { + 0x9e,0xb6,0xa2,0xd9,0x87,0x38,0xc5,0x64, + 0xd8,0x88,0xfa,0x78,0x98,0xf9,0x6f,0x74, + 0x39,0x90,0x1b,0xa5,0xcf,0x5e,0xb4,0x2a, + 0x02,0xff,0xd4,0x8c,0x71,0x8b,0xe2,0xc0, + } + }, + { 0xa10123e, { + 0x03,0xb9,0x2c,0x76,0x48,0x93,0xc9,0x18, + 0xfb,0x56,0xfd,0xf7,0xe2,0x1d,0xca,0x4d, + 0x1d,0x13,0x53,0x63,0xfe,0x42,0x6f,0xfc, + 0x19,0x0f,0xf1,0xfc,0xa7,0xdd,0x89,0x1b, + } + }, + { 0xa101244, { + 0x71,0x56,0xb5,0x9f,0x21,0xbf,0xb3,0x3c, + 0x8c,0xd7,0x36,0xd0,0x34,0x52,0x1b,0xb1, + 0x46,0x2f,0x04,0xf0,0x37,0xd8,0x1e,0x72, + 0x24,0xa2,0x80,0x84,0x83,0x65,0x84,0xc0, + } + }, + { 0xa101248, { + 0xed,0x3b,0x95,0xa6,0x68,0xa7,0x77,0x3e, + 0xfc,0x17,0x26,0xe2,0x7b,0xd5,0x56,0x22, + 0x2c,0x1d,0xef,0xeb,0x56,0xdd,0xba,0x6e, + 0x1b,0x7d,0x64,0x9d,0x4b,0x53,0x13,0x75, + } + }, + { 0xa10124c, { + 0x29,0xea,0xf1,0x2c,0xb2,0xe4,0xef,0x90, + 0xa4,0xcd,0x1d,0x86,0x97,0x17,0x61,0x46, + 0xfc,0x22,0xcb,0x57,0x75,0x19,0xc8,0xcc, + 0x0c,0xf5,0xbc,0xac,0x81,0x9d,0x9a,0xd2, + } + }, + { 0xa108108, { + 0xed,0xc2,0xec,0xa1,0x15,0xc6,0x65,0xe9, + 0xd0,0xef,0x39,0xaa,0x7f,0x55,0x06,0xc6, + 0xf5,0xd4,0x3f,0x7b,0x14,0xd5,0x60,0x2c, + 0x28,0x1e,0x9c,0x59,0x69,0x99,0x4d,0x16, + } + }, + { 0xa108109, { + 0x85,0xb4,0xbd,0x7c,0x49,0xa7,0xbd,0xfa, + 0x49,0x36,0x80,0x81,0xc5,0xb7,0x39,0x1b, + 0x9a,0xaa,0x50,0xde,0x9b,0xe9,0x32,0x35, + 0x42,0x7e,0x51,0x4f,0x52,0x2c,0x28,0x59, + } + }, + { 0xa20102d, { + 0xf9,0x6e,0xf2,0x32,0xd3,0x0f,0x5f,0x11, + 0x59,0xa1,0xfe,0xcc,0xcd,0x9b,0x42,0x89, + 0x8b,0x89,0x2f,0xb5,0xbb,0x82,0xef,0x23, + 0x8c,0xe9,0x19,0x3e,0xcc,0x3f,0x7b,0xb4, + } + }, + { 0xa20102e, { + 0xbe,0x1f,0x32,0x04,0x0d,0x3c,0x9c,0xdd, + 0xe1,0xa4,0xbf,0x76,0x3a,0xec,0xc2,0xf6, + 0x11,0x00,0xa7,0xaf,0x0f,0xe5,0x02,0xc5, + 0x54,0x3a,0x1f,0x8c,0x16,0xb5,0xff,0xbe, + } + }, + { 0xa201210, { + 0xe8,0x6d,0x51,0x6a,0x8e,0x72,0xf3,0xfe, + 0x6e,0x16,0xbc,0x62,0x59,0x40,0x17,0xe9, + 0x6d,0x3d,0x0e,0x6b,0xa7,0xac,0xe3,0x68, + 0xf7,0x55,0xf0,0x13,0xbb,0x22,0xf6,0x41, + } + }, + { 0xa201211, { + 0x69,0xa1,0x17,0xec,0xd0,0xf6,0x6c,0x95, + 0xe2,0x1e,0xc5,0x59,0x1a,0x52,0x0a,0x27, + 0xc4,0xed,0xd5,0x59,0x1f,0xbf,0x00,0xff, + 0x08,0x88,0xb5,0xe1,0x12,0xb6,0xcc,0x27, + } + }, + { 0xa404107, { + 0xbb,0x04,0x4e,0x47,0xdd,0x5e,0x26,0x45, + 0x1a,0xc9,0x56,0x24,0xa4,0x4c,0x82,0xb0, + 0x8b,0x0d,0x9f,0xf9,0x3a,0xdf,0xc6,0x81, + 0x13,0xbc,0xc5,0x25,0xe4,0xc5,0xc3,0x99, + } + }, + { 0xa404108, { + 0x69,0x67,0x43,0x06,0xf8,0x0c,0x62,0xdc, + 0xa4,0x21,0x30,0x4f,0x0f,0x21,0x2c,0xcb, + 0xcc,0x37,0xf1,0x1c,0xc3,0xf8,0x2f,0x19, + 0xdf,0x53,0x53,0x46,0xb1,0x15,0xea,0x00, + } + }, + { 0xa500011, { + 0x23,0x3d,0x70,0x7d,0x03,0xc3,0xc4,0xf4, + 0x2b,0x82,0xc6,0x05,0xda,0x80,0x0a,0xf1, + 0xd7,0x5b,0x65,0x3a,0x7d,0xab,0xdf,0xa2, + 0x11,0x5e,0x96,0x7e,0x71,0xe9,0xfc,0x74, + } + }, + { 0xa500012, { + 0xeb,0x74,0x0d,0x47,0xa1,0x8e,0x09,0xe4, + 0x93,0x4c,0xad,0x03,0x32,0x4c,0x38,0x16, + 0x10,0x39,0xdd,0x06,0xaa,0xce,0xd6,0x0f, + 0x62,0x83,0x9d,0x8e,0x64,0x55,0xbe,0x63, + } + }, + { 0xa601209, { + 0x66,0x48,0xd4,0x09,0x05,0xcb,0x29,0x32, + 0x66,0xb7,0x9a,0x76,0xcd,0x11,0xf3,0x30, + 0x15,0x86,0xcc,0x5d,0x97,0x0f,0xc0,0x46, + 0xe8,0x73,0xe2,0xd6,0xdb,0xd2,0x77,0x1d, + } + }, + { 0xa60120a, { + 0x0c,0x8b,0x3d,0xfd,0x52,0x52,0x85,0x7d, + 0x20,0x3a,0xe1,0x7e,0xa4,0x21,0x3b,0x7b, + 0x17,0x86,0xae,0xac,0x13,0xb8,0x63,0x9d, + 0x06,0x01,0xd0,0xa0,0x51,0x9a,0x91,0x2c, + } + }, + { 0xa704107, { + 0xf3,0xc6,0x58,0x26,0xee,0xac,0x3f,0xd6, + 0xce,0xa1,0x72,0x47,0x3b,0xba,0x2b,0x93, + 0x2a,0xad,0x8e,0x6b,0xea,0x9b,0xb7,0xc2, + 0x64,0x39,0x71,0x8c,0xce,0xe7,0x41,0x39, + } + }, + { 0xa704108, { + 0xd7,0x55,0x15,0x2b,0xfe,0xc4,0xbc,0x93, + 0xec,0x91,0xa0,0xae,0x45,0xb7,0xc3,0x98, + 0x4e,0xff,0x61,0x77,0x88,0xc2,0x70,0x49, + 0xe0,0x3a,0x1d,0x84,0x38,0x52,0xbf,0x5a, + } + }, + { 0xa705206, { + 0x8d,0xc0,0x76,0xbd,0x58,0x9f,0x8f,0xa4, + 0x12,0x9d,0x21,0xfb,0x48,0x21,0xbc,0xe7, + 0x67,0x6f,0x04,0x18,0xae,0x20,0x87,0x4b, + 0x03,0x35,0xe9,0xbe,0xfb,0x06,0xdf,0xfc, + } + }, + { 0xa705208, { + 0x30,0x1d,0x55,0x24,0xbc,0x6b,0x5a,0x19, + 0x0c,0x7d,0x1d,0x74,0xaa,0xd1,0xeb,0xd2, + 0x16,0x62,0xf7,0x5b,0xe1,0x1f,0x18,0x11, + 0x5c,0xf0,0x94,0x90,0x26,0xec,0x69,0xff, + } + }, + { 0xa708007, { + 0x6b,0x76,0xcc,0x78,0xc5,0x8a,0xa3,0xe3, + 0x32,0x2d,0x79,0xe4,0xc3,0x80,0xdb,0xb2, + 0x07,0xaa,0x3a,0xe0,0x57,0x13,0x72,0x80, + 0xdf,0x92,0x73,0x84,0x87,0x3c,0x73,0x93, + } + }, + { 0xa708008, { + 0x08,0x6e,0xf0,0x22,0x4b,0x8e,0xc4,0x46, + 0x58,0x34,0xe6,0x47,0xa2,0x28,0xfd,0xab, + 0x22,0x3d,0xdd,0xd8,0x52,0x9e,0x1d,0x16, + 0xfa,0x01,0x68,0x14,0x79,0x3e,0xe8,0x6b, + } + }, + { 0xa70c005, { + 0x88,0x5d,0xfb,0x79,0x64,0xd8,0x46,0x3b, + 0x4a,0x83,0x8e,0x77,0x7e,0xcf,0xb3,0x0f, + 0x1f,0x1f,0xf1,0x97,0xeb,0xfe,0x56,0x55, + 0xee,0x49,0xac,0xe1,0x8b,0x13,0xc5,0x13, + } + }, + { 0xa70c008, { + 0x0f,0xdb,0x37,0xa1,0x10,0xaf,0xd4,0x21, + 0x94,0x0d,0xa4,0xa2,0xe9,0x86,0x6c,0x0e, + 0x85,0x7c,0x36,0x30,0xa3,0x3a,0x78,0x66, + 0x18,0x10,0x60,0x0d,0x78,0x3d,0x44,0xd0, + } + }, + { 0xaa00116, { + 0xe8,0x4c,0x2c,0x88,0xa1,0xac,0x24,0x63, + 0x65,0xe5,0xaa,0x2d,0x16,0xa9,0xc3,0xf5, + 0xfe,0x1d,0x5e,0x65,0xc7,0xaa,0x92,0x4d, + 0x91,0xee,0x76,0xbb,0x4c,0x66,0x78,0xc9, + } + }, + { 0xaa00212, { + 0xbd,0x57,0x5d,0x0a,0x0a,0x30,0xc1,0x75, + 0x95,0x58,0x5e,0x93,0x02,0x28,0x43,0x71, + 0xed,0x42,0x29,0xc8,0xec,0x34,0x2b,0xb2, + 0x1a,0x65,0x4b,0xfe,0x07,0x0f,0x34,0xa1, + } + }, + { 0xaa00213, { + 0xed,0x58,0xb7,0x76,0x81,0x7f,0xd9,0x3a, + 0x1a,0xff,0x8b,0x34,0xb8,0x4a,0x99,0x0f, + 0x28,0x49,0x6c,0x56,0x2b,0xdc,0xb7,0xed, + 0x96,0xd5,0x9d,0xc1,0x7a,0xd4,0x51,0x9b, + } + }, + { 0xaa00215, { + 0x55,0xd3,0x28,0xcb,0x87,0xa9,0x32,0xe9, + 0x4e,0x85,0x4b,0x7c,0x6b,0xd5,0x7c,0xd4, + 0x1b,0x51,0x71,0x3a,0x0e,0x0b,0xdc,0x9b, + 0x68,0x2f,0x46,0xee,0xfe,0xc6,0x6d,0xef, + } + }, + { 0xaa00216, { + 0x79,0xfb,0x5b,0x9f,0xb6,0xe6,0xa8,0xf5, + 0x4e,0x7c,0x4f,0x8e,0x1d,0xad,0xd0,0x08, + 0xc2,0x43,0x7c,0x8b,0xe6,0xdb,0xd0,0xd2, + 0xe8,0x39,0x26,0xc1,0xe5,0x5a,0x48,0xf1, + } + }, +}; diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index b3658d11e7b6..68049f171860 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -17,8 +17,8 @@ #define pr_fmt(fmt) "microcode: " fmt -#include <linux/platform_device.h> #include <linux/stop_machine.h> +#include <linux/device/faux.h> #include <linux/syscore_ops.h> #include <linux/miscdevice.h> #include <linux/capability.h> @@ -37,15 +37,25 @@ #include <asm/perf_event.h> #include <asm/processor.h> #include <asm/cmdline.h> +#include <asm/msr.h> #include <asm/setup.h> #include "internal.h" -static struct microcode_ops *microcode_ops; -bool dis_ucode_ldr = true; +static struct microcode_ops *microcode_ops; +static bool dis_ucode_ldr; bool force_minrev = IS_ENABLED(CONFIG_MICROCODE_LATE_FORCE_MINREV); -module_param(force_minrev, bool, S_IRUSR | S_IWUSR); + +/* + * Those below should be behind CONFIG_MICROCODE_DBG ifdeffery but in + * order to not uglify the code with ifdeffery and use IS_ENABLED() + * instead, leave them in. When microcode debugging is not enabled, + * those are meaningless anyway. + */ +/* base microcode revision for debugging */ +u32 base_rev; +u32 microcode_rev[NR_CPUS] = {}; /* * Synchronization. @@ -84,6 +94,9 @@ static bool amd_check_current_patch_level(void) u32 lvl, dummy, i; u32 *levels; + if (x86_cpuid_vendor() != X86_VENDOR_AMD) + return false; + native_rdmsr(MSR_AMD64_PATCH_LEVEL, lvl, dummy); levels = final_levels; @@ -95,29 +108,60 @@ static bool amd_check_current_patch_level(void) return false; } -static bool __init check_loader_disabled_bsp(void) +bool __init microcode_loader_disabled(void) { - static const char *__dis_opt_str = "dis_ucode_ldr"; - const char *cmdline = boot_command_line; - const char *option = __dis_opt_str; + if (dis_ucode_ldr) + return true; /* - * CPUID(1).ECX[31]: reserved for hypervisor use. This is still not - * completely accurate as xen pv guests don't see that CPUID bit set but - * that's good enough as they don't land on the BSP path anyway. + * Disable when: + * + * 1) The CPU does not support CPUID. + * + * 2) Bit 31 in CPUID[1]:ECX is clear + * The bit is reserved for hypervisor use. This is still not + * completely accurate as XEN PV guests don't see that CPUID bit + * set, but that's good enough as they don't land on the BSP + * path anyway. + * + * 3) Certain AMD patch levels are not allowed to be + * overwritten. */ - if (native_cpuid_ecx(1) & BIT(31)) - return true; + if (!cpuid_feature() || + ((native_cpuid_ecx(1) & BIT(31)) && + !IS_ENABLED(CONFIG_MICROCODE_DBG)) || + amd_check_current_patch_level()) + dis_ucode_ldr = true; - if (x86_cpuid_vendor() == X86_VENDOR_AMD) { - if (amd_check_current_patch_level()) - return true; - } + return dis_ucode_ldr; +} + +static void __init early_parse_cmdline(void) +{ + char cmd_buf[64] = {}; + char *s, *p = cmd_buf; + + if (cmdline_find_option(boot_command_line, "microcode", cmd_buf, sizeof(cmd_buf)) > 0) { + while ((s = strsep(&p, ","))) { + if (IS_ENABLED(CONFIG_MICROCODE_DBG)) { + if (strstr(s, "base_rev=")) { + /* advance to the option arg */ + strsep(&s, "="); + if (kstrtouint(s, 16, &base_rev)) { ; } + } + } - if (cmdline_find_option_bool(cmdline, option) <= 0) - dis_ucode_ldr = false; + if (!strcmp("force_minrev", s)) + force_minrev = true; - return dis_ucode_ldr; + if (!strcmp(s, "dis_ucode_ldr")) + dis_ucode_ldr = true; + } + } + + /* old, compat option */ + if (cmdline_find_option_bool(boot_command_line, "dis_ucode_ldr") > 0) + dis_ucode_ldr = true; } void __init load_ucode_bsp(void) @@ -125,7 +169,9 @@ void __init load_ucode_bsp(void) unsigned int cpuid_1_eax; bool intel = true; - if (!have_cpuid_p()) + early_parse_cmdline(); + + if (microcode_loader_disabled()) return; cpuid_1_eax = native_cpuid_eax(1); @@ -146,9 +192,6 @@ void __init load_ucode_bsp(void) return; } - if (check_loader_disabled_bsp()) - return; - if (intel) load_ucode_intel_bsp(&early_data); else @@ -159,6 +202,11 @@ void load_ucode_ap(void) { unsigned int cpuid_1_eax; + /* + * Can't use microcode_loader_disabled() here - .init section + * hell. It doesn't have to either - the BSP variant must've + * parsed cmdline already anyway. + */ if (dis_ucode_ldr) return; @@ -238,7 +286,7 @@ static void reload_early_microcode(unsigned int cpu) } /* fake device for request_firmware */ -static struct platform_device *microcode_pdev; +static struct faux_device *microcode_fdev; #ifdef CONFIG_MICROCODE_LATE_LOADING /* @@ -541,6 +589,17 @@ static int load_late_stop_cpus(bool is_safe) pr_err("You should switch to early loading, if possible.\n"); } + /* + * Pre-load the microcode image into a staging device. This + * process is preemptible and does not require stopping CPUs. + * Successful staging simplifies the subsequent late-loading + * process, reducing rendezvous time. + * + * Even if the transfer fails, the update will proceed as usual. + */ + if (microcode_ops->use_staging) + microcode_ops->stage_microcode(); + atomic_set(&late_cpus_in, num_online_cpus()); atomic_set(&offline_in_nmi, 0); loops_per_usec = loops_per_jiffy / (TICK_NSEC / 1000); @@ -679,13 +738,15 @@ static int load_late_locked(void) if (!setup_cpus()) return -EBUSY; - switch (microcode_ops->request_microcode_fw(0, µcode_pdev->dev)) { + switch (microcode_ops->request_microcode_fw(0, µcode_fdev->dev)) { case UCODE_NEW: return load_late_stop_cpus(false); case UCODE_NEW_SAFE: return load_late_stop_cpus(true); case UCODE_NFOUND: return -ENOENT; + case UCODE_OK: + return 0; default: return -EBADFD; } @@ -762,8 +823,17 @@ void microcode_bsp_resume(void) reload_early_microcode(cpu); } -static struct syscore_ops mc_syscore_ops = { - .resume = microcode_bsp_resume, +static void microcode_bsp_syscore_resume(void *data) +{ + microcode_bsp_resume(); +} + +static const struct syscore_ops mc_syscore_ops = { + .resume = microcode_bsp_syscore_resume, +}; + +static struct syscore mc_syscore = { + .ops = &mc_syscore_ops, }; static int mc_cpu_online(unsigned int cpu) @@ -810,7 +880,7 @@ static int __init microcode_init(void) struct cpuinfo_x86 *c = &boot_cpu_data; int error; - if (dis_ucode_ldr) + if (microcode_loader_disabled()) return -EINVAL; if (c->x86_vendor == X86_VENDOR_INTEL) @@ -828,9 +898,9 @@ static int __init microcode_init(void) if (early_data.new_rev) pr_info_once("Updated early from: 0x%08x\n", early_data.old_rev); - microcode_pdev = platform_device_register_simple("microcode", -1, NULL, 0); - if (IS_ERR(microcode_pdev)) - return PTR_ERR(microcode_pdev); + microcode_fdev = faux_device_create("microcode", NULL, NULL); + if (!microcode_fdev) + return -ENODEV; dev_root = bus_get_dev_root(&cpu_subsys); if (dev_root) { @@ -842,14 +912,14 @@ static int __init microcode_init(void) } } - register_syscore_ops(&mc_syscore_ops); + register_syscore(&mc_syscore); cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/microcode:online", mc_cpu_online, mc_cpu_down_prep); return 0; out_pdev: - platform_device_unregister(microcode_pdev); + faux_device_destroy(microcode_fdev); return error; } diff --git a/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h b/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h new file mode 100644 index 000000000000..2d48e6593540 --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h @@ -0,0 +1,160 @@ +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x03, .steppings = 0x0004, .driver_data = 0x2 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x05, .steppings = 0x0001, .driver_data = 0x45 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x05, .steppings = 0x0002, .driver_data = 0x40 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x05, .steppings = 0x0004, .driver_data = 0x2c }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x05, .steppings = 0x0008, .driver_data = 0x10 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x06, .steppings = 0x0001, .driver_data = 0xa }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x06, .steppings = 0x0020, .driver_data = 0x3 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x06, .steppings = 0x0400, .driver_data = 0xd }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x06, .steppings = 0x2000, .driver_data = 0x7 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x07, .steppings = 0x0002, .driver_data = 0x14 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x07, .steppings = 0x0004, .driver_data = 0x38 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x07, .steppings = 0x0008, .driver_data = 0x2e }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x08, .steppings = 0x0002, .driver_data = 0x11 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x08, .steppings = 0x0008, .driver_data = 0x8 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x08, .steppings = 0x0040, .driver_data = 0xc }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x08, .steppings = 0x0400, .driver_data = 0x5 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x09, .steppings = 0x0020, .driver_data = 0x47 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0a, .steppings = 0x0001, .driver_data = 0x3 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0a, .steppings = 0x0002, .driver_data = 0x1 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0b, .steppings = 0x0002, .driver_data = 0x1d }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0b, .steppings = 0x0010, .driver_data = 0x2 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0d, .steppings = 0x0040, .driver_data = 0x18 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0e, .steppings = 0x0100, .driver_data = 0x39 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0e, .steppings = 0x1000, .driver_data = 0x59 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0004, .driver_data = 0x5d }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0040, .driver_data = 0xd2 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0080, .driver_data = 0x6b }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0400, .driver_data = 0x95 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0800, .driver_data = 0xbc }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x2000, .driver_data = 0xa4 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x16, .steppings = 0x0002, .driver_data = 0x44 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x17, .steppings = 0x0040, .driver_data = 0x60f }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x17, .steppings = 0x0080, .driver_data = 0x70a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x17, .steppings = 0x0400, .driver_data = 0xa0b }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1a, .steppings = 0x0010, .driver_data = 0x12 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1a, .steppings = 0x0020, .driver_data = 0x1d }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1c, .steppings = 0x0004, .driver_data = 0x219 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1c, .steppings = 0x0400, .driver_data = 0x107 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1d, .steppings = 0x0002, .driver_data = 0x29 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1e, .steppings = 0x0020, .driver_data = 0xa }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x25, .steppings = 0x0004, .driver_data = 0x11 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x25, .steppings = 0x0020, .driver_data = 0x7 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x26, .steppings = 0x0002, .driver_data = 0x105 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2a, .steppings = 0x0080, .driver_data = 0x2f }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2c, .steppings = 0x0004, .driver_data = 0x1f }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2d, .steppings = 0x0040, .driver_data = 0x621 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2d, .steppings = 0x0080, .driver_data = 0x71a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2e, .steppings = 0x0040, .driver_data = 0xd }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2f, .steppings = 0x0004, .driver_data = 0x3b }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x37, .steppings = 0x0100, .driver_data = 0x838 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x37, .steppings = 0x0200, .driver_data = 0x90d }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3a, .steppings = 0x0200, .driver_data = 0x21 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3c, .steppings = 0x0008, .driver_data = 0x28 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3d, .steppings = 0x0010, .driver_data = 0x2f }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3e, .steppings = 0x0010, .driver_data = 0x42e }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3e, .steppings = 0x0040, .driver_data = 0x600 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3e, .steppings = 0x0080, .driver_data = 0x715 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3f, .steppings = 0x0004, .driver_data = 0x49 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3f, .steppings = 0x0010, .driver_data = 0x1a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x45, .steppings = 0x0002, .driver_data = 0x26 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x46, .steppings = 0x0002, .driver_data = 0x1c }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x47, .steppings = 0x0002, .driver_data = 0x22 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x4c, .steppings = 0x0008, .driver_data = 0x368 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x4c, .steppings = 0x0010, .driver_data = 0x411 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x4d, .steppings = 0x0100, .driver_data = 0x12d }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x4e, .steppings = 0x0008, .driver_data = 0xf0 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0008, .driver_data = 0x1000191 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0010, .driver_data = 0x2007006 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0020, .driver_data = 0x3000010 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0080, .driver_data = 0x5003901 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0800, .driver_data = 0x7002b01 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0004, .driver_data = 0x1c }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0008, .driver_data = 0x700001c }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0010, .driver_data = 0xf00001a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0020, .driver_data = 0xe000015 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5c, .steppings = 0x0004, .driver_data = 0x14 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5c, .steppings = 0x0200, .driver_data = 0x48 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5c, .steppings = 0x0400, .driver_data = 0x28 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5e, .steppings = 0x0008, .driver_data = 0xf0 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5f, .steppings = 0x0002, .driver_data = 0x3e }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x66, .steppings = 0x0008, .driver_data = 0x2a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6a, .steppings = 0x0020, .driver_data = 0xc0002f0 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6a, .steppings = 0x0040, .driver_data = 0xd000404 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6c, .steppings = 0x0002, .driver_data = 0x10002d0 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7a, .steppings = 0x0002, .driver_data = 0x42 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7a, .steppings = 0x0100, .driver_data = 0x26 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7e, .steppings = 0x0020, .driver_data = 0xca }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8a, .steppings = 0x0002, .driver_data = 0x33 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8c, .steppings = 0x0002, .driver_data = 0xbc }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8c, .steppings = 0x0004, .driver_data = 0x3c }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8d, .steppings = 0x0002, .driver_data = 0x56 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0200, .driver_data = 0xf6 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0400, .driver_data = 0xf6 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0800, .driver_data = 0xf6 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x1000, .driver_data = 0x100 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0010, .driver_data = 0x2c0003f7 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0020, .driver_data = 0x2c0003f7 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0040, .driver_data = 0x2c0003f7 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0080, .driver_data = 0x2b000639 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0100, .driver_data = 0x2c0003f7 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x96, .steppings = 0x0002, .driver_data = 0x1a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x97, .steppings = 0x0004, .driver_data = 0x3a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x97, .steppings = 0x0020, .driver_data = 0x3a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9a, .steppings = 0x0008, .driver_data = 0x437 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9a, .steppings = 0x0010, .driver_data = 0x437 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9c, .steppings = 0x0001, .driver_data = 0x24000026 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0200, .driver_data = 0xf8 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0400, .driver_data = 0xfa }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0800, .driver_data = 0xf6 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x1000, .driver_data = 0xf8 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x2000, .driver_data = 0x104 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0004, .driver_data = 0x100 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0008, .driver_data = 0x100 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0020, .driver_data = 0x100 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa6, .steppings = 0x0001, .driver_data = 0x102 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa6, .steppings = 0x0002, .driver_data = 0x100 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa7, .steppings = 0x0002, .driver_data = 0x64 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xaa, .steppings = 0x0010, .driver_data = 0x24 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xad, .steppings = 0x0002, .driver_data = 0xa0000d1 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xaf, .steppings = 0x0008, .driver_data = 0x3000341 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xb5, .steppings = 0x0001, .driver_data = 0xa }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xb7, .steppings = 0x0002, .driver_data = 0x12f }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xb7, .steppings = 0x0010, .driver_data = 0x12f }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0004, .driver_data = 0x4128 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0008, .driver_data = 0x4128 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0100, .driver_data = 0x4128 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbd, .steppings = 0x0002, .driver_data = 0x11f }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbe, .steppings = 0x0001, .driver_data = 0x1d }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0004, .driver_data = 0x3a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0020, .driver_data = 0x3a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0040, .driver_data = 0x3a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0080, .driver_data = 0x3a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xc5, .steppings = 0x0004, .driver_data = 0x118 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xc6, .steppings = 0x0004, .driver_data = 0x118 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xc6, .steppings = 0x0010, .driver_data = 0x118 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xca, .steppings = 0x0004, .driver_data = 0x118 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xcf, .steppings = 0x0002, .driver_data = 0x210002a9 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xcf, .steppings = 0x0004, .driver_data = 0x210002a9 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x00, .steppings = 0x0080, .driver_data = 0x12 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x00, .steppings = 0x0400, .driver_data = 0x15 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x01, .steppings = 0x0004, .driver_data = 0x2e }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0010, .driver_data = 0x21 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0020, .driver_data = 0x2c }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0040, .driver_data = 0x10 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0080, .driver_data = 0x39 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0200, .driver_data = 0x2f }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x03, .steppings = 0x0004, .driver_data = 0xa }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x03, .steppings = 0x0008, .driver_data = 0xc }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x03, .steppings = 0x0010, .driver_data = 0x17 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0002, .driver_data = 0x17 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0008, .driver_data = 0x5 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0010, .driver_data = 0x6 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0080, .driver_data = 0x3 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0100, .driver_data = 0xe }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0200, .driver_data = 0x3 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0400, .driver_data = 0x4 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x06, .steppings = 0x0004, .driver_data = 0xf }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x06, .steppings = 0x0010, .driver_data = 0x4 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x06, .steppings = 0x0020, .driver_data = 0x8 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x06, .steppings = 0x0100, .driver_data = 0x9 }, diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index f3d534807d91..8744f3adc2a0 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -13,12 +13,15 @@ #define pr_fmt(fmt) "microcode: " fmt #include <linux/earlycpio.h> #include <linux/firmware.h> +#include <linux/pci_ids.h> #include <linux/uaccess.h> #include <linux/initrd.h> #include <linux/kernel.h> +#include <linux/delay.h> #include <linux/slab.h> #include <linux/cpu.h> #include <linux/uio.h> +#include <linux/io.h> #include <linux/mm.h> #include <asm/cpu_device_id.h> @@ -33,6 +36,38 @@ static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin"; #define UCODE_BSP_LOADED ((struct microcode_intel *)0x1UL) +/* Defines for the microcode staging mailbox interface */ +#define MBOX_REG_NUM 4 +#define MBOX_REG_SIZE sizeof(u32) + +#define MBOX_CONTROL_OFFSET 0x0 +#define MBOX_STATUS_OFFSET 0x4 +#define MBOX_WRDATA_OFFSET 0x8 +#define MBOX_RDDATA_OFFSET 0xc + +#define MASK_MBOX_CTRL_ABORT BIT(0) +#define MASK_MBOX_CTRL_GO BIT(31) + +#define MASK_MBOX_STATUS_ERROR BIT(2) +#define MASK_MBOX_STATUS_READY BIT(31) + +#define MASK_MBOX_RESP_SUCCESS BIT(0) +#define MASK_MBOX_RESP_PROGRESS BIT(1) +#define MASK_MBOX_RESP_ERROR BIT(2) + +#define MBOX_CMD_LOAD 0x3 +#define MBOX_OBJ_STAGING 0xb +#define MBOX_HEADER(size) ((PCI_VENDOR_ID_INTEL) | \ + (MBOX_OBJ_STAGING << 16) | \ + ((u64)((size) / sizeof(u32)) << 32)) + +/* The size of each mailbox header */ +#define MBOX_HEADER_SIZE sizeof(u64) +/* The size of staging hardware response */ +#define MBOX_RESPONSE_SIZE sizeof(u64) + +#define MBOX_XACTION_TIMEOUT_MS (10 * MSEC_PER_SEC) + /* Current microcode patch used in early patching on the APs. */ static struct microcode_intel *ucode_patch_va __read_mostly; static struct microcode_intel *ucode_patch_late __read_mostly; @@ -54,6 +89,23 @@ struct extended_sigtable { struct extended_signature sigs[]; }; +/** + * struct staging_state - Track the current staging process state + * + * @mmio_base: MMIO base address for staging + * @ucode_len: Total size of the microcode image + * @chunk_size: Size of each data piece + * @bytes_sent: Total bytes transmitted so far + * @offset: Current offset in the microcode image + */ +struct staging_state { + void __iomem *mmio_base; + unsigned int ucode_len; + unsigned int chunk_size; + unsigned int bytes_sent; + unsigned int offset; +}; + #define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) #define EXT_HEADER_SIZE (sizeof(struct extended_sigtable)) #define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature)) @@ -74,7 +126,7 @@ void intel_collect_cpu_info(struct cpu_signature *sig) sig->pf = 0; sig->rev = intel_get_microcode_revision(); - if (x86_model(sig->sig) >= 5 || x86_family(sig->sig) > 6) { + if (IFM(x86_family(sig->sig), x86_model(sig->sig)) >= INTEL_PENTIUM_III_DESCHUTES) { unsigned int val[2]; /* get processor flags from MSR 0x17 */ @@ -299,6 +351,298 @@ static __init struct microcode_intel *scan_microcode(void *data, size_t size, return size ? NULL : patch; } +static inline u32 read_mbox_dword(void __iomem *mmio_base) +{ + u32 dword = readl(mmio_base + MBOX_RDDATA_OFFSET); + + /* Acknowledge read completion to the staging hardware */ + writel(0, mmio_base + MBOX_RDDATA_OFFSET); + return dword; +} + +static inline void write_mbox_dword(void __iomem *mmio_base, u32 dword) +{ + writel(dword, mmio_base + MBOX_WRDATA_OFFSET); +} + +static inline u64 read_mbox_header(void __iomem *mmio_base) +{ + u32 high, low; + + low = read_mbox_dword(mmio_base); + high = read_mbox_dword(mmio_base); + + return ((u64)high << 32) | low; +} + +static inline void write_mbox_header(void __iomem *mmio_base, u64 value) +{ + write_mbox_dword(mmio_base, value); + write_mbox_dword(mmio_base, value >> 32); +} + +static void write_mbox_data(void __iomem *mmio_base, u32 *chunk, unsigned int chunk_bytes) +{ + int i; + + /* + * The MMIO space is mapped as Uncached (UC). Each write arrives + * at the device as an individual transaction in program order. + * The device can then reassemble the sequence accordingly. + */ + for (i = 0; i < chunk_bytes / sizeof(u32); i++) + write_mbox_dword(mmio_base, chunk[i]); +} + +/* + * Prepare for a new microcode transfer: reset hardware and record the + * image size. + */ +static void init_stage(struct staging_state *ss) +{ + ss->ucode_len = get_totalsize(&ucode_patch_late->hdr); + + /* + * Abort any ongoing process, effectively resetting the device. + * Unlike regular mailbox data processing requests, this + * operation does not require a status check. + */ + writel(MASK_MBOX_CTRL_ABORT, ss->mmio_base + MBOX_CONTROL_OFFSET); +} + +/* + * Update the chunk size and decide whether another chunk can be sent. + * This accounts for remaining data and retry limits. + */ +static bool can_send_next_chunk(struct staging_state *ss, int *err) +{ + /* A page size or remaining bytes if this is the final chunk */ + ss->chunk_size = min(PAGE_SIZE, ss->ucode_len - ss->offset); + + /* + * Each microcode image is divided into chunks, each at most + * one page size. A 10-chunk image would typically require 10 + * transactions. + * + * However, the hardware managing the mailbox has limited + * resources and may not cache the entire image, potentially + * requesting the same chunk multiple times. + * + * To tolerate this behavior, allow up to twice the expected + * number of transactions (i.e., a 10-chunk image can take up to + * 20 attempts). + * + * If the number of attempts exceeds this limit, treat it as + * exceeding the maximum allowed transfer size. + */ + if (ss->bytes_sent + ss->chunk_size > ss->ucode_len * 2) { + *err = -EMSGSIZE; + return false; + } + + *err = 0; + return true; +} + +/* + * The hardware indicates completion by returning a sentinel end offset. + */ +static inline bool is_end_offset(u32 offset) +{ + return offset == UINT_MAX; +} + +/* + * Determine whether staging is complete: either the hardware signaled + * the end offset, or no more transactions are permitted (retry limit + * reached). + */ +static inline bool staging_is_complete(struct staging_state *ss, int *err) +{ + return is_end_offset(ss->offset) || !can_send_next_chunk(ss, err); +} + +/* + * Wait for the hardware to complete a transaction. + * Return 0 on success, or an error code on failure. + */ +static int wait_for_transaction(struct staging_state *ss) +{ + u32 timeout, status; + + /* Allow time for hardware to complete the operation: */ + for (timeout = 0; timeout < MBOX_XACTION_TIMEOUT_MS; timeout++) { + msleep(1); + + status = readl(ss->mmio_base + MBOX_STATUS_OFFSET); + /* Break out early if the hardware is ready: */ + if (status & MASK_MBOX_STATUS_READY) + break; + } + + /* Check for explicit error response */ + if (status & MASK_MBOX_STATUS_ERROR) + return -EIO; + + /* + * Hardware has neither responded to the action nor signaled any + * error. Treat this as a timeout. + */ + if (!(status & MASK_MBOX_STATUS_READY)) + return -ETIMEDOUT; + + return 0; +} + +/* + * Transmit a chunk of the microcode image to the hardware. + * Return 0 on success, or an error code on failure. + */ +static int send_data_chunk(struct staging_state *ss, void *ucode_ptr) +{ + u32 *src_chunk = ucode_ptr + ss->offset; + u16 mbox_size; + + /* + * Write a 'request' mailbox object in this order: + * 1. Mailbox header includes total size + * 2. Command header specifies the load operation + * 3. Data section contains a microcode chunk + * + * Thus, the mailbox size is two headers plus the chunk size. + */ + mbox_size = MBOX_HEADER_SIZE * 2 + ss->chunk_size; + write_mbox_header(ss->mmio_base, MBOX_HEADER(mbox_size)); + write_mbox_header(ss->mmio_base, MBOX_CMD_LOAD); + write_mbox_data(ss->mmio_base, src_chunk, ss->chunk_size); + ss->bytes_sent += ss->chunk_size; + + /* Notify the hardware that the mailbox is ready for processing. */ + writel(MASK_MBOX_CTRL_GO, ss->mmio_base + MBOX_CONTROL_OFFSET); + + return wait_for_transaction(ss); +} + +/* + * Retrieve the next offset from the hardware response. + * Return 0 on success, or an error code on failure. + */ +static int fetch_next_offset(struct staging_state *ss) +{ + const u64 expected_header = MBOX_HEADER(MBOX_HEADER_SIZE + MBOX_RESPONSE_SIZE); + u32 offset, status; + u64 header; + + /* + * The 'response' mailbox returns three fields, in order: + * 1. Header + * 2. Next offset in the microcode image + * 3. Status flags + */ + header = read_mbox_header(ss->mmio_base); + offset = read_mbox_dword(ss->mmio_base); + status = read_mbox_dword(ss->mmio_base); + + /* All valid responses must start with the expected header. */ + if (header != expected_header) { + pr_err_once("staging: invalid response header (0x%llx)\n", header); + return -EBADR; + } + + /* + * Verify the offset: If not at the end marker, it must not + * exceed the microcode image length. + */ + if (!is_end_offset(offset) && offset > ss->ucode_len) { + pr_err_once("staging: invalid offset (%u) past the image end (%u)\n", + offset, ss->ucode_len); + return -EINVAL; + } + + /* Hardware may report errors explicitly in the status field */ + if (status & MASK_MBOX_RESP_ERROR) + return -EPROTO; + + ss->offset = offset; + return 0; +} + +/* + * Handle the staging process using the mailbox MMIO interface. The + * microcode image is transferred in chunks until completion. + * Return 0 on success or an error code on failure. + */ +static int do_stage(u64 mmio_pa) +{ + struct staging_state ss = {}; + int err; + + ss.mmio_base = ioremap(mmio_pa, MBOX_REG_NUM * MBOX_REG_SIZE); + if (WARN_ON_ONCE(!ss.mmio_base)) + return -EADDRNOTAVAIL; + + init_stage(&ss); + + /* Perform the staging process while within the retry limit */ + while (!staging_is_complete(&ss, &err)) { + /* Send a chunk of microcode each time: */ + err = send_data_chunk(&ss, ucode_patch_late); + if (err) + break; + /* + * Then, ask the hardware which piece of the image it + * needs next. The same piece may be sent more than once. + */ + err = fetch_next_offset(&ss); + if (err) + break; + } + + iounmap(ss.mmio_base); + + return err; +} + +static void stage_microcode(void) +{ + unsigned int pkg_id = UINT_MAX; + int cpu, err; + u64 mmio_pa; + + if (!IS_ALIGNED(get_totalsize(&ucode_patch_late->hdr), sizeof(u32))) { + pr_err("Microcode image 32-bit misaligned (0x%x), staging failed.\n", + get_totalsize(&ucode_patch_late->hdr)); + return; + } + + lockdep_assert_cpus_held(); + + /* + * The MMIO address is unique per package, and all the SMT + * primary threads are online here. Find each MMIO space by + * their package IDs to avoid duplicate staging. + */ + for_each_cpu(cpu, cpu_primary_thread_mask) { + if (topology_logical_package_id(cpu) == pkg_id) + continue; + + pkg_id = topology_logical_package_id(cpu); + + err = rdmsrq_on_cpu(cpu, MSR_IA32_MCU_STAGING_MBOX_ADDR, &mmio_pa); + if (WARN_ON_ONCE(err)) + return; + + err = do_stage(mmio_pa); + if (err) { + pr_err("Error: staging failed (%d) for CPU%d at package %u.\n", + err, cpu, pkg_id); + return; + } + } + + pr_info("Staging of patch revision 0x%x succeeded.\n", ucode_patch_late->hdr.rev); +} + static enum ucode_state __apply_microcode(struct ucode_cpu_info *uci, struct microcode_intel *mc, u32 *cur_rev) @@ -320,7 +664,7 @@ static enum ucode_state __apply_microcode(struct ucode_cpu_info *uci, } /* write microcode via MSR 0x79 */ - native_wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); + native_wrmsrq(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); rev = intel_get_microcode_revision(); if (rev != mc->hdr.rev) @@ -389,7 +733,7 @@ static int __init save_builtin_microcode(void) if (xchg(&ucode_patch_va, NULL) != UCODE_BSP_LOADED) return 0; - if (dis_ucode_ldr || boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + if (microcode_loader_disabled() || boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return 0; uci.mc = get_microcode_blob(&uci, true); @@ -627,6 +971,7 @@ static struct microcode_ops microcode_intel_ops = { .collect_cpu_info = collect_cpu_info, .apply_microcode = apply_microcode_late, .finalize_late_load = finalize_late_load, + .stage_microcode = stage_microcode, .use_nmi = IS_ENABLED(CONFIG_X86_64), }; @@ -638,6 +983,18 @@ static __init void calc_llc_size_per_core(struct cpuinfo_x86 *c) llc_size_per_core = (unsigned int)llc_size; } +static __init bool staging_available(void) +{ + u64 val; + + val = x86_read_arch_cap_msr(); + if (!(val & ARCH_CAP_MCU_ENUM)) + return false; + + rdmsrq(MSR_IA32_MCU_ENUMERATION, val); + return !!(val & MCU_STAGING); +} + struct microcode_ops * __init init_intel_microcode(void) { struct cpuinfo_x86 *c = &boot_cpu_data; @@ -648,6 +1005,11 @@ struct microcode_ops * __init init_intel_microcode(void) return NULL; } + if (staging_available()) { + microcode_intel_ops.use_staging = true; + pr_info("Enabled staging feature.\n"); + } + calc_llc_size_per_core(c); return µcode_intel_ops; diff --git a/arch/x86/kernel/cpu/microcode/internal.h b/arch/x86/kernel/cpu/microcode/internal.h index 21776c529fa9..a10b547eda1e 100644 --- a/arch/x86/kernel/cpu/microcode/internal.h +++ b/arch/x86/kernel/cpu/microcode/internal.h @@ -31,10 +31,12 @@ struct microcode_ops { * See also the "Synchronization" section in microcode_core.c. */ enum ucode_state (*apply_microcode)(int cpu); + void (*stage_microcode)(void); int (*collect_cpu_info)(int cpu, struct cpu_signature *csig); void (*finalize_late_load)(int result); unsigned int nmi_safe : 1, - use_nmi : 1; + use_nmi : 1, + use_staging : 1; }; struct early_load_data { @@ -44,6 +46,9 @@ struct early_load_data { extern struct early_load_data early_data; extern struct ucode_cpu_info ucode_cpu_info[]; +extern u32 microcode_rev[NR_CPUS]; +extern u32 base_rev; + struct cpio_data find_microcode_in_initrd(const char *path); #define MAX_UCODE_COUNT 128 @@ -94,20 +99,17 @@ static inline unsigned int x86_cpuid_family(void) return x86_family(eax); } -extern bool dis_ucode_ldr; extern bool force_minrev; #ifdef CONFIG_CPU_SUP_AMD void load_ucode_amd_bsp(struct early_load_data *ed, unsigned int family); void load_ucode_amd_ap(unsigned int family); -int save_microcode_in_initrd_amd(unsigned int family); void reload_ucode_amd(unsigned int cpu); struct microcode_ops *init_amd_microcode(void); void exit_amd_microcode(void); #else /* CONFIG_CPU_SUP_AMD */ static inline void load_ucode_amd_bsp(struct early_load_data *ed, unsigned int family) { } static inline void load_ucode_amd_ap(unsigned int family) { } -static inline int save_microcode_in_initrd_amd(unsigned int family) { return -EINVAL; } static inline void reload_ucode_amd(unsigned int cpu) { } static inline struct microcode_ops *init_amd_microcode(void) { return NULL; } static inline void exit_amd_microcode(void) { } @@ -125,4 +127,10 @@ static inline void reload_ucode_intel(void) { } static inline struct microcode_ops *init_intel_microcode(void) { return NULL; } #endif /* !CONFIG_CPU_SUP_INTEL */ +#define ucode_dbg(fmt, ...) \ +({ \ + if (IS_ENABLED(CONFIG_MICROCODE_DBG)) \ + pr_info(fmt, ##__VA_ARGS__); \ +}) + #endif /* _X86_MICROCODE_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index d18078834ded..579fb2c64cfd 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -19,7 +19,7 @@ #include <linux/random.h> #include <asm/processor.h> #include <asm/hypervisor.h> -#include <asm/hyperv-tlfs.h> +#include <hyperv/hvhdk.h> #include <asm/mshyperv.h> #include <asm/desc.h> #include <asm/idtentry.h> @@ -28,22 +28,23 @@ #include <asm/apic.h> #include <asm/timer.h> #include <asm/reboot.h> +#include <asm/msr.h> #include <asm/nmi.h> #include <clocksource/hyperv_timer.h> #include <asm/numa.h> #include <asm/svm.h> -/* Is Linux running as the root partition? */ -bool hv_root_partition; /* Is Linux running on nested Microsoft Hypervisor */ bool hv_nested; struct ms_hyperv_info ms_hyperv; -/* Used in modules via hv_do_hypercall(): see arch/x86/include/asm/mshyperv.h */ -bool hyperv_paravisor_present __ro_after_init; -EXPORT_SYMBOL_GPL(hyperv_paravisor_present); - #if IS_ENABLED(CONFIG_HYPERV) +/* + * When running with the paravisor, controls proxying the synthetic interrupts + * from the host + */ +static bool hv_para_sint_proxy; + static inline unsigned int hv_get_nested_msr(unsigned int reg) { if (hv_is_sint_msr(reg)) @@ -72,7 +73,7 @@ u64 hv_get_non_nested_msr(unsigned int reg) if (hv_is_synic_msr(reg) && ms_hyperv.paravisor_present) hv_ivm_msr_read(reg, &value); else - rdmsrl(reg, value); + rdmsrq(reg, value); return value; } EXPORT_SYMBOL_GPL(hv_get_non_nested_msr); @@ -80,17 +81,51 @@ EXPORT_SYMBOL_GPL(hv_get_non_nested_msr); void hv_set_non_nested_msr(unsigned int reg, u64 value) { if (hv_is_synic_msr(reg) && ms_hyperv.paravisor_present) { + /* The hypervisor will get the intercept. */ hv_ivm_msr_write(reg, value); - /* Write proxy bit via wrmsl instruction */ - if (hv_is_sint_msr(reg)) - wrmsrl(reg, value | 1 << 20); + /* Using wrmsrq so the following goes to the paravisor. */ + if (hv_is_sint_msr(reg)) { + union hv_synic_sint sint = { .as_uint64 = value }; + + sint.proxy = hv_para_sint_proxy; + native_wrmsrq(reg, sint.as_uint64); + } } else { - wrmsrl(reg, value); + native_wrmsrq(reg, value); } } EXPORT_SYMBOL_GPL(hv_set_non_nested_msr); +/* + * Enable or disable proxying synthetic interrupts + * to the paravisor. + */ +void hv_para_set_sint_proxy(bool enable) +{ + hv_para_sint_proxy = enable; +} + +/* + * Get the SynIC register value from the paravisor. + */ +u64 hv_para_get_synic_register(unsigned int reg) +{ + if (WARN_ON(!ms_hyperv.paravisor_present || !hv_is_synic_msr(reg))) + return ~0ULL; + return native_read_msr(reg); +} + +/* + * Set the SynIC register value with the paravisor. + */ +void hv_para_set_synic_register(unsigned int reg, u64 val) +{ + if (WARN_ON(!ms_hyperv.paravisor_present || !hv_is_synic_msr(reg))) + return; + native_write_msr(reg, val); +} + u64 hv_get_msr(unsigned int reg) { if (hv_nested) @@ -109,6 +144,7 @@ void hv_set_msr(unsigned int reg, u64 value) } EXPORT_SYMBOL_GPL(hv_set_msr); +static void (*mshv_handler)(void); static void (*vmbus_handler)(void); static void (*hv_stimer0_handler)(void); static void (*hv_kexec_handler)(void); @@ -119,6 +155,9 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_callback) struct pt_regs *old_regs = set_irq_regs(regs); inc_irq_stat(irq_hv_callback_count); + if (mshv_handler) + mshv_handler(); + if (vmbus_handler) vmbus_handler(); @@ -128,6 +167,11 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_callback) set_irq_regs(old_regs); } +void hv_setup_mshv_handler(void (*handler)(void)) +{ + mshv_handler = handler; +} + void hv_setup_vmbus_handler(void (*handler)(void)) { vmbus_handler = handler; @@ -211,7 +255,7 @@ static void hv_machine_shutdown(void) #endif /* CONFIG_KEXEC_CORE */ #ifdef CONFIG_CRASH_DUMP -static void hv_machine_crash_shutdown(struct pt_regs *regs) +static void hv_guest_crash_shutdown(struct pt_regs *regs) { if (hv_crash_handler) hv_crash_handler(regs); @@ -223,8 +267,75 @@ static void hv_machine_crash_shutdown(struct pt_regs *regs) hyperv_cleanup(); } #endif /* CONFIG_CRASH_DUMP */ + +static u64 hv_ref_counter_at_suspend; +static void (*old_save_sched_clock_state)(void); +static void (*old_restore_sched_clock_state)(void); + +/* + * Hyper-V clock counter resets during hibernation. Save and restore clock + * offset during suspend/resume, while also considering the time passed + * before suspend. This is to make sure that sched_clock using hv tsc page + * based clocksource, proceeds from where it left off during suspend and + * it shows correct time for the timestamps of kernel messages after resume. + */ +static void save_hv_clock_tsc_state(void) +{ + hv_ref_counter_at_suspend = hv_read_reference_counter(); +} + +static void restore_hv_clock_tsc_state(void) +{ + /* + * Adjust the offsets used by hv tsc clocksource to + * account for the time spent before hibernation. + * adjusted value = reference counter (time) at suspend + * - reference counter (time) now. + */ + hv_adj_sched_clock_offset(hv_ref_counter_at_suspend - hv_read_reference_counter()); +} + +/* + * Functions to override save_sched_clock_state and restore_sched_clock_state + * functions of x86_platform. The Hyper-V clock counter is reset during + * suspend-resume and the offset used to measure time needs to be + * corrected, post resume. + */ +static void hv_save_sched_clock_state(void) +{ + old_save_sched_clock_state(); + save_hv_clock_tsc_state(); +} + +static void hv_restore_sched_clock_state(void) +{ + restore_hv_clock_tsc_state(); + old_restore_sched_clock_state(); +} + +static void __init x86_setup_ops_for_tsc_pg_clock(void) +{ + if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE)) + return; + + old_save_sched_clock_state = x86_platform.save_sched_clock_state; + x86_platform.save_sched_clock_state = hv_save_sched_clock_state; + + old_restore_sched_clock_state = x86_platform.restore_sched_clock_state; + x86_platform.restore_sched_clock_state = hv_restore_sched_clock_state; +} + +#ifdef CONFIG_X86_64 +DEFINE_STATIC_CALL(hv_hypercall, hv_std_hypercall); +EXPORT_STATIC_CALL_TRAMP_GPL(hv_hypercall); +#define hypercall_update(hc) static_call_update(hv_hypercall, hc) +#endif #endif /* CONFIG_HYPERV */ +#ifndef hypercall_update +#define hypercall_update(hc) (void)hc +#endif + static uint32_t __init ms_hyperv_platform(void) { u32 eax; @@ -281,7 +392,7 @@ static unsigned long hv_get_tsc_khz(void) { unsigned long freq; - rdmsrl(HV_X64_MSR_TSC_FREQUENCY, freq); + rdmsrq(HV_X64_MSR_TSC_FREQUENCY, freq); return freq / 1000; } @@ -365,10 +476,11 @@ int hv_get_hypervisor_version(union hv_hypervisor_version_info *info) return 0; } +EXPORT_SYMBOL_GPL(hv_get_hypervisor_version); static void __init ms_hyperv_init_platform(void) { - int hv_max_functions_eax; + int hv_max_functions_eax, eax; #ifdef CONFIG_PARAVIRT pv_info.name = "Hyper-V"; @@ -379,13 +491,15 @@ static void __init ms_hyperv_init_platform(void) */ ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES); ms_hyperv.priv_high = cpuid_ebx(HYPERV_CPUID_FEATURES); + ms_hyperv.ext_features = cpuid_ecx(HYPERV_CPUID_FEATURES); ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES); ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO); hv_max_functions_eax = cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS); - pr_info("Hyper-V: privilege flags low 0x%x, high 0x%x, hints 0x%x, misc 0x%x\n", - ms_hyperv.features, ms_hyperv.priv_high, ms_hyperv.hints, + pr_info("Hyper-V: privilege flags low %#x, high %#x, ext %#x, hints %#x, misc %#x\n", + ms_hyperv.features, ms_hyperv.priv_high, + ms_hyperv.ext_features, ms_hyperv.hints, ms_hyperv.misc_features); ms_hyperv.max_vp_index = cpuid_eax(HYPERV_CPUID_IMPLEMENT_LIMITS); @@ -394,31 +508,29 @@ static void __init ms_hyperv_init_platform(void) pr_debug("Hyper-V: max %u virtual processors, %u logical processors\n", ms_hyperv.max_vp_index, ms_hyperv.max_lp_index); - /* - * Check CPU management privilege. - * - * To mirror what Windows does we should extract CPU management - * features and use the ReservedIdentityBit to detect if Linux is the - * root partition. But that requires negotiating CPU management - * interface (a process to be finalized). For now, use the privilege - * flag as the indicator for running as root. - * - * Hyper-V should never specify running as root and as a Confidential - * VM. But to protect against a compromised/malicious Hyper-V trying - * to exploit root behavior to expose Confidential VM memory, ignore - * the root partition setting if also a Confidential VM. - */ - if ((ms_hyperv.priv_high & HV_CPU_MANAGEMENT) && - !(ms_hyperv.priv_high & HV_ISOLATION)) { - hv_root_partition = true; - pr_info("Hyper-V: running as root partition\n"); - } + hv_identify_partition_type(); + + if (cc_platform_has(CC_ATTR_SNP_SECURE_AVIC)) + ms_hyperv.hints |= HV_DEPRECATING_AEOI_RECOMMENDED; if (ms_hyperv.hints & HV_X64_HYPERV_NESTED) { hv_nested = true; pr_info("Hyper-V: running on a nested hypervisor\n"); } + /* + * There is no check against the max function for HYPERV_CPUID_VIRT_STACK_* CPUID + * leaves as the hypervisor doesn't handle them. Even a nested root partition (L2 + * root) will not get them because the nested (L1) hypervisor filters them out. + * These are handled through intercept processing by the Windows Hyper-V stack + * or the paravisor. + */ + eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_PROPERTIES); + ms_hyperv.confidential_vmbus_available = + eax & HYPERV_VS_PROPERTIES_EAX_CONFIDENTIAL_VMBUS_AVAILABLE; + ms_hyperv.msi_ext_dest_id = + eax & HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE; + if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS && ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) { x86_platform.calibrate_tsc = hv_get_tsc_khz; @@ -434,14 +546,14 @@ static void __init ms_hyperv_init_platform(void) ms_hyperv.shared_gpa_boundary = BIT_ULL(ms_hyperv.shared_gpa_boundary_bits); - hyperv_paravisor_present = !!ms_hyperv.paravisor_present; - pr_info("Hyper-V: Isolation Config: Group A 0x%x, Group B 0x%x\n", ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b); if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP) { static_branch_enable(&isolation_type_snp); + if (!ms_hyperv.paravisor_present) + hypercall_update(hv_snp_hypercall); } else if (hv_get_isolation_type() == HV_ISOLATION_TYPE_TDX) { static_branch_enable(&isolation_type_tdx); @@ -449,6 +561,7 @@ static void __init ms_hyperv_init_platform(void) ms_hyperv.hints &= ~HV_X64_APIC_ACCESS_RECOMMENDED; if (!ms_hyperv.paravisor_present) { + hypercall_update(hv_tdx_hypercall); /* * Mark the Hyper-V TSC page feature as disabled * in a TDX VM without paravisor so that the @@ -492,7 +605,7 @@ static void __init ms_hyperv_init_platform(void) */ u64 hv_lapic_frequency; - rdmsrl(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency); + rdmsrq(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency); hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ); lapic_timer_period = hv_lapic_frequency; pr_info("Hyper-V: LAPIC Timer Frequency: %#x\n", @@ -508,13 +621,21 @@ static void __init ms_hyperv_init_platform(void) #endif #if IS_ENABLED(CONFIG_HYPERV) + if (hv_root_partition()) + machine_ops.power_off = hv_machine_power_off; #if defined(CONFIG_KEXEC_CORE) machine_ops.shutdown = hv_machine_shutdown; #endif #if defined(CONFIG_CRASH_DUMP) - machine_ops.crash_shutdown = hv_machine_crash_shutdown; + if (!hv_root_partition()) + machine_ops.crash_shutdown = hv_guest_crash_shutdown; #endif #endif + /* + * HV_ACCESS_TSC_INVARIANT is always zero for the root partition. Root + * partition doesn't need to write to synthetic MSR to enable invariant + * TSC feature. It sees what the hardware provides. + */ if (ms_hyperv.features & HV_ACCESS_TSC_INVARIANT) { /* * Writing to synthetic MSR 0x40000118 updates/changes the @@ -525,7 +646,7 @@ static void __init ms_hyperv_init_platform(void) * setting of this MSR bit should happen before init_intel() * is called. */ - wrmsrl(HV_X64_MSR_TSC_INVARIANT_CONTROL, HV_EXPOSE_INVARIANT_TSC); + wrmsrq(HV_X64_MSR_TSC_INVARIANT_CONTROL, HV_EXPOSE_INVARIANT_TSC); setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); } @@ -561,7 +682,7 @@ static void __init ms_hyperv_init_platform(void) # ifdef CONFIG_SMP smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu; - if (hv_root_partition || + if (hv_root_partition() || (!ms_hyperv.paravisor_present && hv_isolation_type_snp())) smp_ops.smp_prepare_cpus = hv_smp_prepare_cpus; # endif @@ -579,14 +700,19 @@ static void __init ms_hyperv_init_platform(void) /* Register Hyper-V specific clocksource */ hv_init_clocksource(); + x86_setup_ops_for_tsc_pg_clock(); hv_vtl_init_platform(); #endif /* * TSC should be marked as unstable only after Hyper-V * clocksource has been initialized. This ensures that the * stability of the sched_clock is not altered. + * + * HV_ACCESS_TSC_INVARIANT is always zero for the root partition. No + * need to check for it. */ - if (!(ms_hyperv.features & HV_ACCESS_TSC_INVARIANT)) + if (!hv_root_partition() && + !(ms_hyperv.features & HV_ACCESS_TSC_INVARIANT)) mark_tsc_unstable("running on Hyper-V"); hardlockup_detector_disable(); @@ -608,21 +734,10 @@ static bool __init ms_hyperv_x2apic_available(void) * pci-hyperv host bridge. * * Note: for a Hyper-V root partition, this will always return false. - * The hypervisor doesn't expose these HYPERV_CPUID_VIRT_STACK_* cpuids by - * default, they are implemented as intercepts by the Windows Hyper-V stack. - * Even a nested root partition (L2 root) will not get them because the - * nested (L1) hypervisor filters them out. */ static bool __init ms_hyperv_msi_ext_dest_id(void) { - u32 eax; - - eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_INTERFACE); - if (eax != HYPERV_VS_INTERFACE_EAX_SIGNATURE) - return false; - - eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_PROPERTIES); - return eax & HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE; + return ms_hyperv.msi_ext_dest_id; } #ifdef CONFIG_AMD_MEM_ENCRYPT diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 18cf79d6e2c5..763534d77f59 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -1,21 +1,8 @@ +// SPDX-License-Identifier: LGPL-2.0+ /* * MTRR (Memory Type Range Register) cleanup * * Copyright (C) 2009 Yinghai Lu - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Library General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Library General Public License for more details. - * - * You should have received a copy of the GNU Library General Public - * License along with this library; if not, write to the Free - * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include <linux/init.h> #include <linux/pci.h> diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 7b29ebda024f..0863733858dc 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -9,9 +9,11 @@ #include <linux/io.h> #include <linux/mm.h> #include <linux/cc_platform.h> +#include <linux/string_choices.h> #include <asm/processor-flags.h> #include <asm/cacheinfo.h> #include <asm/cpufeature.h> +#include <asm/cpu_device_id.h> #include <asm/hypervisor.h> #include <asm/mshyperv.h> #include <asm/tlbflush.h> @@ -87,7 +89,6 @@ static int mtrr_state_set; u64 mtrr_tom2; struct mtrr_state_type mtrr_state; -EXPORT_SYMBOL_GPL(mtrr_state); /* Reserved bits in the high portion of the MTRRphysBaseN MSR. */ u32 phys_hi_rsvd; @@ -423,7 +424,7 @@ void __init mtrr_copy_map(void) } /** - * mtrr_overwrite_state - set static MTRR state + * guest_force_mtrr_state - set static MTRR state for a guest * * Used to set MTRR state via different means (e.g. with data obtained from * a hypervisor). @@ -436,8 +437,8 @@ void __init mtrr_copy_map(void) * @num_var: length of the @var array * @def_type: default caching type */ -void mtrr_overwrite_state(struct mtrr_var_range *var, unsigned int num_var, - mtrr_type def_type) +void guest_force_mtrr_state(struct mtrr_var_range *var, unsigned int num_var, + mtrr_type def_type) { unsigned int i; @@ -591,7 +592,7 @@ static void get_fixed_ranges(mtrr_type *frs) void mtrr_save_fixed_ranges(void *info) { - if (boot_cpu_has(X86_FEATURE_MTRR)) + if (mtrr_state.have_fixed) get_fixed_ranges(mtrr_state.fixed_ranges); } @@ -646,10 +647,10 @@ static void __init print_mtrr_state(void) pr_info("MTRR default type: %s\n", mtrr_attrib_to_str(mtrr_state.def_type)); if (mtrr_state.have_fixed) { - pr_info("MTRR fixed ranges %sabled:\n", - ((mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) && - (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) ? - "en" : "dis"); + pr_info("MTRR fixed ranges %s:\n", + str_enabled_disabled( + (mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) && + (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED))); print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0); for (i = 0; i < 2; ++i) print_fixed(0x80000 + i * 0x20000, 0x04000, @@ -661,8 +662,8 @@ static void __init print_mtrr_state(void) /* tail */ print_fixed_last(); } - pr_info("MTRR variable ranges %sabled:\n", - mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED ? "en" : "dis"); + pr_info("MTRR variable ranges %s:\n", + str_enabled_disabled(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED)); high_width = (boot_cpu_data.x86_phys_bits - (32 - PAGE_SHIFT) + 3) / 4; for (i = 0; i < num_var_ranges; ++i) { @@ -1025,8 +1026,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, * For Intel PPro stepping <= 7 * must be 4 MiB aligned and not touch 0x70000000 -> 0x7003FFFF */ - if (mtrr_if == &generic_mtrr_ops && boot_cpu_data.x86 == 6 && - boot_cpu_data.x86_model == 1 && + if (mtrr_if == &generic_mtrr_ops && boot_cpu_data.x86_vfm == INTEL_PENTIUM_PRO && boot_cpu_data.x86_stepping <= 7) { if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) { pr_warn("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base); diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index a5c506f6da7f..4049235b1bfe 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -99,7 +99,6 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) char *ptr; char line[LINE_SIZE]; int length; - size_t linelen; memset(line, 0, LINE_SIZE); @@ -108,9 +107,8 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) if (length < 0) return length; - linelen = strlen(line); - ptr = line + linelen - 1; - if (linelen && *ptr == '\n') + ptr = line + length - 1; + if (length && *ptr == '\n') *ptr = '\0'; if (!strncmp(line, "disable=", 8)) { diff --git a/arch/x86/kernel/cpu/mtrr/legacy.c b/arch/x86/kernel/cpu/mtrr/legacy.c index d25882fcf181..2415ffaaf02c 100644 --- a/arch/x86/kernel/cpu/mtrr/legacy.c +++ b/arch/x86/kernel/cpu/mtrr/legacy.c @@ -41,7 +41,7 @@ struct mtrr_value { static struct mtrr_value *mtrr_value; -static int mtrr_save(void) +static int mtrr_save(void *data) { int i; @@ -56,7 +56,7 @@ static int mtrr_save(void) return 0; } -static void mtrr_restore(void) +static void mtrr_restore(void *data) { int i; @@ -69,11 +69,15 @@ static void mtrr_restore(void) } } -static struct syscore_ops mtrr_syscore_ops = { +static const struct syscore_ops mtrr_syscore_ops = { .suspend = mtrr_save, .resume = mtrr_restore, }; +static struct syscore mtrr_syscore = { + .ops = &mtrr_syscore_ops, +}; + void mtrr_register_syscore(void) { mtrr_value = kcalloc(num_var_ranges, sizeof(*mtrr_value), GFP_KERNEL); @@ -86,5 +90,5 @@ void mtrr_register_syscore(void) * TBD: is there any system with such CPU which supports * suspend/resume? If no, we should remove the code. */ - register_syscore_ops(&mtrr_syscore_ops); + register_syscore(&mtrr_syscore); } diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c index 989d368be04f..4b3d492afe17 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.c +++ b/arch/x86/kernel/cpu/mtrr/mtrr.c @@ -1,22 +1,9 @@ +// SPDX-License-Identifier: LGPL-2.0+ /* Generic MTRR (Memory Type Range Register) driver. Copyright (C) 1997-2000 Richard Gooch Copyright (c) 2002 Patrick Mochel - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Library General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Library General Public License for more details. - - You should have received a copy of the GNU Library General Public - License along with this library; if not, write to the Free - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - Richard Gooch may be reached by email at rgooch@atnf.csiro.au The postal address is: Richard Gooch, c/o ATNF, P. O. Box 76, Epping, N.S.W., 2121, Australia. @@ -625,7 +612,7 @@ void mtrr_save_state(void) static int __init mtrr_init_finalize(void) { /* - * Map might exist if mtrr_overwrite_state() has been called or if + * Map might exist if guest_force_mtrr_state() has been called or if * mtrr_enabled() returns true. */ mtrr_copy_map(); diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 5655f253d929..2de3bd2f95d1 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -46,10 +46,6 @@ struct set_mtrr_context { u32 ccr3; }; -void set_mtrr_done(struct set_mtrr_context *ctxt); -void set_mtrr_cache_disable(struct set_mtrr_context *ctxt); -void set_mtrr_prepare_save(struct set_mtrr_context *ctxt); - void fill_mtrr_var_range(unsigned int index, u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi); bool get_mtrr_state(void); diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 41ed01f46bd9..6571d432cbe3 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -86,9 +86,12 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "microcode\t: 0x%x\n", c->microcode); if (cpu_has(c, X86_FEATURE_TSC)) { - unsigned int freq = arch_freq_get_on_cpu(cpu); + int freq = arch_freq_get_on_cpu(cpu); - seq_printf(m, "cpu MHz\t\t: %u.%03u\n", freq / 1000, (freq % 1000)); + if (freq < 0) + seq_puts(m, "cpu MHz\t\t: Unknown\n"); + else + seq_printf(m, "cpu MHz\t\t: %u.%03u\n", freq / 1000, (freq % 1000)); } /* Cache size */ diff --git a/arch/x86/kernel/cpu/resctrl/Makefile b/arch/x86/kernel/cpu/resctrl/Makefile index 4a06c37b9cf1..d8a04b195da2 100644 --- a/arch/x86/kernel/cpu/resctrl/Makefile +++ b/arch/x86/kernel/cpu/resctrl/Makefile @@ -1,4 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_X86_CPU_RESCTRL) += core.o rdtgroup.o monitor.o -obj-$(CONFIG_X86_CPU_RESCTRL) += ctrlmondata.o pseudo_lock.o +obj-$(CONFIG_X86_CPU_RESCTRL) += core.o rdtgroup.o monitor.o +obj-$(CONFIG_X86_CPU_RESCTRL) += ctrlmondata.o +obj-$(CONFIG_RESCTRL_FS_PSEUDO_LOCK) += pseudo_lock.o + +# To allow define_trace.h's recursive include: CFLAGS_pseudo_lock.o = -I$(src) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index b681c2e07dbf..3792ab4819dc 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -22,6 +22,7 @@ #include <linux/cpuhotplug.h> #include <asm/cpu_device_id.h> +#include <asm/msr.h> #include <asm/resctrl.h> #include "internal.h" @@ -44,12 +45,6 @@ static DEFINE_MUTEX(domain_list_lock); DEFINE_PER_CPU(struct resctrl_pqr_state, pqr_state); /* - * Used to store the max resource name width and max resource data width - * to display the schemata in a tabular format - */ -int max_name_width, max_data_width; - -/* * Global boolean for rdt_alloc which is true if any * resource allocation is enabled. */ @@ -62,19 +57,16 @@ static void mba_wrmsr_amd(struct msr_param *m); #define ctrl_domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.ctrl_domains) #define mon_domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.mon_domains) -struct rdt_hw_resource rdt_resources_all[] = { +struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = { [RDT_RESOURCE_L3] = { .r_resctrl = { - .rid = RDT_RESOURCE_L3, .name = "L3", .ctrl_scope = RESCTRL_L3_CACHE, .mon_scope = RESCTRL_L3_CACHE, .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_L3), .mon_domains = mon_domain_init(RDT_RESOURCE_L3), - .parse_ctrlval = parse_cbm, - .format_str = "%d=%0*x", - .fflags = RFTYPE_RES_CACHE, + .schema_fmt = RESCTRL_SCHEMA_BITMAP, }, .msr_base = MSR_IA32_L3_CBM_BASE, .msr_update = cat_wrmsr, @@ -82,13 +74,10 @@ struct rdt_hw_resource rdt_resources_all[] = { [RDT_RESOURCE_L2] = { .r_resctrl = { - .rid = RDT_RESOURCE_L2, .name = "L2", .ctrl_scope = RESCTRL_L2_CACHE, .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_L2), - .parse_ctrlval = parse_cbm, - .format_str = "%d=%0*x", - .fflags = RFTYPE_RES_CACHE, + .schema_fmt = RESCTRL_SCHEMA_BITMAP, }, .msr_base = MSR_IA32_L2_CBM_BASE, .msr_update = cat_wrmsr, @@ -96,25 +85,19 @@ struct rdt_hw_resource rdt_resources_all[] = { [RDT_RESOURCE_MBA] = { .r_resctrl = { - .rid = RDT_RESOURCE_MBA, .name = "MB", .ctrl_scope = RESCTRL_L3_CACHE, .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_MBA), - .parse_ctrlval = parse_bw, - .format_str = "%d=%*u", - .fflags = RFTYPE_RES_MB, + .schema_fmt = RESCTRL_SCHEMA_RANGE, }, }, [RDT_RESOURCE_SMBA] = { .r_resctrl = { - .rid = RDT_RESOURCE_SMBA, .name = "SMBA", .ctrl_scope = RESCTRL_L3_CACHE, .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_SMBA), - .parse_ctrlval = parse_bw, - .format_str = "%d=%*u", - .fflags = RFTYPE_RES_MB, + .schema_fmt = RESCTRL_SCHEMA_RANGE, }, }, }; @@ -124,7 +107,15 @@ u32 resctrl_arch_system_num_rmid_idx(void) struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; /* RMID are independent numbers for x86. num_rmid_idx == num_rmid */ - return r->num_rmid; + return r->mon.num_rmid; +} + +struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l) +{ + if (l >= RDT_NUM_RESOURCES) + return NULL; + + return &rdt_resources_all[l].r_resctrl; } /* @@ -151,17 +142,16 @@ static inline void cache_alloc_hsw_probe(void) struct rdt_resource *r = &hw_res->r_resctrl; u64 max_cbm = BIT_ULL_MASK(20) - 1, l3_cbm_0; - if (wrmsrl_safe(MSR_IA32_L3_CBM_BASE, max_cbm)) + if (wrmsrq_safe(MSR_IA32_L3_CBM_BASE, max_cbm)) return; - rdmsrl(MSR_IA32_L3_CBM_BASE, l3_cbm_0); + rdmsrq(MSR_IA32_L3_CBM_BASE, l3_cbm_0); /* If all the bits were set in MSR, return success */ if (l3_cbm_0 != max_cbm) return; hw_res->num_closid = 4; - r->default_ctrl = max_cbm; r->cache.cbm_len = 20; r->cache.shareable_bits = 0xc0000; r->cache.min_cbm_bits = 2; @@ -171,21 +161,6 @@ static inline void cache_alloc_hsw_probe(void) rdt_alloc_capable = true; } -bool is_mba_sc(struct rdt_resource *r) -{ - if (!r) - return rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl.membw.mba_sc; - - /* - * The software controller support is only applicable to MBA resource. - * Make sure to check for resource type. - */ - if (r->rid != RDT_RESOURCE_MBA) - return false; - - return r->membw.mba_sc; -} - /* * rdt_get_mb_table() - get a mapping of bandwidth(b/w) percentage values * exposed to user interface and the h/w understandable delay values. @@ -217,7 +192,7 @@ static __init bool __get_mem_config_intel(struct rdt_resource *r) cpuid_count(0x00000010, 3, &eax.full, &ebx, &ecx, &edx.full); hw_res->num_closid = edx.split.cos_max + 1; max_delay = eax.split.max_delay + 1; - r->default_ctrl = MAX_MBA_BW; + r->membw.max_bw = MAX_MBA_BW; r->membw.arch_needs_linear = true; if (ecx & MBA_IS_LINEAR) { r->membw.delay_linear = true; @@ -228,13 +203,11 @@ static __init bool __get_mem_config_intel(struct rdt_resource *r) return false; r->membw.arch_needs_linear = false; } - r->data_width = 3; if (boot_cpu_has(X86_FEATURE_PER_THREAD_MBA)) r->membw.throttle_mode = THREAD_THROTTLE_PER_THREAD; else r->membw.throttle_mode = THREAD_THROTTLE_MAX; - thread_throttle_mode_init(); r->alloc_capable = true; @@ -254,7 +227,7 @@ static __init bool __rdt_get_mem_config_amd(struct rdt_resource *r) cpuid_count(0x80000020, subleaf, &eax, &ebx, &ecx, &edx); hw_res->num_closid = edx + 1; - r->default_ctrl = 1 << eax; + r->membw.max_bw = 1 << eax; /* AMD does not use delay */ r->membw.delay_linear = false; @@ -267,8 +240,6 @@ static __init bool __rdt_get_mem_config_amd(struct rdt_resource *r) r->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED; r->membw.min_bw = 0; r->membw.bw_gran = 1; - /* Max value is 2048, Data width should be 4 in decimal */ - r->data_width = 4; r->alloc_capable = true; @@ -281,14 +252,13 @@ static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r) union cpuid_0x10_1_eax eax; union cpuid_0x10_x_ecx ecx; union cpuid_0x10_x_edx edx; - u32 ebx; + u32 ebx, default_ctrl; cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx.full, &edx.full); hw_res->num_closid = edx.split.cos_max + 1; r->cache.cbm_len = eax.split.cbm_len + 1; - r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1; - r->cache.shareable_bits = ebx & r->default_ctrl; - r->data_width = (r->cache.cbm_len + 3) / 4; + default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1; + r->cache.shareable_bits = ebx & default_ctrl; if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) r->cache.arch_has_sparse_bitmasks = ecx.split.noncont; r->alloc_capable = true; @@ -304,6 +274,11 @@ static void rdt_get_cdp_config(int level) rdt_resources_all[level].r_resctrl.cdp_capable = true; } +static void rdt_set_io_alloc_capable(struct rdt_resource *r) +{ + r->cache.io_alloc_capable = true; +} + static void rdt_get_cdp_l3_config(void) { rdt_get_cdp_config(RDT_RESOURCE_L3); @@ -321,7 +296,7 @@ static void mba_wrmsr_amd(struct msr_param *m) unsigned int i; for (i = m->low; i < m->high; i++) - wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]); + wrmsrq(hw_res->msr_base + i, hw_dom->ctrl_val[i]); } /* @@ -335,7 +310,7 @@ static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r) return MAX_MBA_BW - bw; pr_warn_once("Non Linear delay-bw map not supported but queried\n"); - return r->default_ctrl; + return MAX_MBA_BW; } static void mba_wrmsr_intel(struct msr_param *m) @@ -346,7 +321,7 @@ static void mba_wrmsr_intel(struct msr_param *m) /* Write the delay values for mba. */ for (i = m->low; i < m->high; i++) - wrmsrl(hw_res->msr_base + i, delay_bw_map(hw_dom->ctrl_val[i], m->res)); + wrmsrq(hw_res->msr_base + i, delay_bw_map(hw_dom->ctrl_val[i], m->res)); } static void cat_wrmsr(struct msr_param *m) @@ -356,37 +331,7 @@ static void cat_wrmsr(struct msr_param *m) unsigned int i; for (i = m->low; i < m->high; i++) - wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]); -} - -struct rdt_ctrl_domain *get_ctrl_domain_from_cpu(int cpu, struct rdt_resource *r) -{ - struct rdt_ctrl_domain *d; - - lockdep_assert_cpus_held(); - - list_for_each_entry(d, &r->ctrl_domains, hdr.list) { - /* Find the domain that contains this CPU */ - if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask)) - return d; - } - - return NULL; -} - -struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu, struct rdt_resource *r) -{ - struct rdt_mon_domain *d; - - lockdep_assert_cpus_held(); - - list_for_each_entry(d, &r->mon_domains, hdr.list) { - /* Find the domain that contains this CPU */ - if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask)) - return d; - } - - return NULL; + wrmsrq(hw_res->msr_base + i, hw_dom->ctrl_val[i]); } u32 resctrl_arch_get_num_closid(struct rdt_resource *r) @@ -403,36 +348,6 @@ void rdt_ctrl_update(void *arg) hw_res->msr_update(m); } -/* - * rdt_find_domain - Search for a domain id in a resource domain list. - * - * Search the domain list to find the domain id. If the domain id is - * found, return the domain. NULL otherwise. If the domain id is not - * found (and NULL returned) then the first domain with id bigger than - * the input id can be returned to the caller via @pos. - */ -struct rdt_domain_hdr *rdt_find_domain(struct list_head *h, int id, - struct list_head **pos) -{ - struct rdt_domain_hdr *d; - struct list_head *l; - - list_for_each(l, h) { - d = list_entry(l, struct rdt_domain_hdr, list); - /* When id is found, return its domain. */ - if (id == d->id) - return d; - /* Stop searching when finding id's position in sorted list. */ - if (id < d->id) - break; - } - - if (pos) - *pos = l; - - return NULL; -} - static void setup_default_ctrlval(struct rdt_resource *r, u32 *dc) { struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); @@ -444,7 +359,7 @@ static void setup_default_ctrlval(struct rdt_resource *r, u32 *dc) * For Memory Allocation: Set b/w requested to 100% */ for (i = 0; i < hw_res->num_closid; i++, dc++) - *dc = r->default_ctrl; + *dc = resctrl_get_default_ctrl(r); } static void ctrl_domain_free(struct rdt_hw_ctrl_domain *hw_dom) @@ -455,8 +370,10 @@ static void ctrl_domain_free(struct rdt_hw_ctrl_domain *hw_dom) static void mon_domain_free(struct rdt_hw_mon_domain *hw_dom) { - kfree(hw_dom->arch_mbm_total); - kfree(hw_dom->arch_mbm_local); + int idx; + + for_each_mbm_idx(idx) + kfree(hw_dom->arch_mbm_states[idx]); kfree(hw_dom); } @@ -490,25 +407,27 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_ctrl_domain * */ static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_mon_domain *hw_dom) { - size_t tsize; - - if (is_mbm_total_enabled()) { - tsize = sizeof(*hw_dom->arch_mbm_total); - hw_dom->arch_mbm_total = kcalloc(num_rmid, tsize, GFP_KERNEL); - if (!hw_dom->arch_mbm_total) - return -ENOMEM; - } - if (is_mbm_local_enabled()) { - tsize = sizeof(*hw_dom->arch_mbm_local); - hw_dom->arch_mbm_local = kcalloc(num_rmid, tsize, GFP_KERNEL); - if (!hw_dom->arch_mbm_local) { - kfree(hw_dom->arch_mbm_total); - hw_dom->arch_mbm_total = NULL; - return -ENOMEM; - } + size_t tsize = sizeof(*hw_dom->arch_mbm_states[0]); + enum resctrl_event_id eventid; + int idx; + + for_each_mbm_event_id(eventid) { + if (!resctrl_is_mon_event_enabled(eventid)) + continue; + idx = MBM_STATE_IDX(eventid); + hw_dom->arch_mbm_states[idx] = kcalloc(num_rmid, tsize, GFP_KERNEL); + if (!hw_dom->arch_mbm_states[idx]) + goto cleanup; } return 0; +cleanup: + for_each_mbm_idx(idx) { + kfree(hw_dom->arch_mbm_states[idx]); + hw_dom->arch_mbm_states[idx] = NULL; + } + + return -ENOMEM; } static int get_domain_id_from_scope(int cpu, enum resctrl_scope scope) @@ -543,7 +462,7 @@ static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r) return; } - hdr = rdt_find_domain(&r->ctrl_domains, id, &add_pos); + hdr = resctrl_find_domain(&r->ctrl_domains, id, &add_pos); if (hdr) { if (WARN_ON_ONCE(hdr->type != RESCTRL_CTRL_DOMAIN)) return; @@ -588,6 +507,7 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) struct rdt_hw_mon_domain *hw_dom; struct rdt_domain_hdr *hdr; struct rdt_mon_domain *d; + struct cacheinfo *ci; int err; lockdep_assert_held(&domain_list_lock); @@ -598,13 +518,16 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) return; } - hdr = rdt_find_domain(&r->mon_domains, id, &add_pos); + hdr = resctrl_find_domain(&r->mon_domains, id, &add_pos); if (hdr) { if (WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) return; d = container_of(hdr, struct rdt_mon_domain, hdr); cpumask_set_cpu(cpu, &d->hdr.cpu_mask); + /* Update the mbm_assign_mode state for the CPU if supported */ + if (r->mon.mbm_cntr_assignable) + resctrl_arch_mbm_cntr_assign_set_one(r); return; } @@ -615,17 +538,22 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) d = &hw_dom->d_resctrl; d->hdr.id = id; d->hdr.type = RESCTRL_MON_DOMAIN; - d->ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE); - if (!d->ci) { + ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE); + if (!ci) { pr_warn_once("Can't find L3 cache for CPU:%d resource %s\n", cpu, r->name); mon_domain_free(hw_dom); return; } + d->ci_id = ci->id; cpumask_set_cpu(cpu, &d->hdr.cpu_mask); + /* Update the mbm_assign_mode state for the CPU if supported */ + if (r->mon.mbm_cntr_assignable) + resctrl_arch_mbm_cntr_assign_set_one(r); + arch_mon_domain_online(r, d); - if (arch_domain_mbm_alloc(r->num_rmid, hw_dom)) { + if (arch_domain_mbm_alloc(r->mon.num_rmid, hw_dom)) { mon_domain_free(hw_dom); return; } @@ -663,7 +591,7 @@ static void domain_remove_cpu_ctrl(int cpu, struct rdt_resource *r) return; } - hdr = rdt_find_domain(&r->ctrl_domains, id, NULL); + hdr = resctrl_find_domain(&r->ctrl_domains, id, NULL); if (!hdr) { pr_warn("Can't find control domain for id=%d for CPU %d for resource %s\n", id, cpu, r->name); @@ -709,7 +637,7 @@ static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r) return; } - hdr = rdt_find_domain(&r->mon_domains, id, NULL); + hdr = resctrl_find_domain(&r->mon_domains, id, NULL); if (!hdr) { pr_warn("Can't find monitor domain for id=%d for CPU %d for resource %s\n", id, cpu, r->name); @@ -784,20 +712,6 @@ static int resctrl_arch_offline_cpu(unsigned int cpu) return 0; } -/* - * Choose a width for the resource name and resource data based on the - * resource that has widest name and cbm. - */ -static __init void rdt_init_padding(void) -{ - struct rdt_resource *r; - - for_each_alloc_capable_rdt_resource(r) { - if (r->data_width > max_data_width) - max_data_width = r->data_width; - } -} - enum { RDT_FLAG_CMT, RDT_FLAG_MBM_TOTAL, @@ -809,6 +723,8 @@ enum { RDT_FLAG_MBA, RDT_FLAG_SMBA, RDT_FLAG_BMEC, + RDT_FLAG_ABMC, + RDT_FLAG_SDCIAE, }; #define RDT_OPT(idx, n, f) \ @@ -823,7 +739,7 @@ struct rdt_options { bool force_off, force_on; }; -static struct rdt_options rdt_options[] __initdata = { +static struct rdt_options rdt_options[] __ro_after_init = { RDT_OPT(RDT_FLAG_CMT, "cmt", X86_FEATURE_CQM_OCCUP_LLC), RDT_OPT(RDT_FLAG_MBM_TOTAL, "mbmtotal", X86_FEATURE_CQM_MBM_TOTAL), RDT_OPT(RDT_FLAG_MBM_LOCAL, "mbmlocal", X86_FEATURE_CQM_MBM_LOCAL), @@ -834,6 +750,8 @@ static struct rdt_options rdt_options[] __initdata = { RDT_OPT(RDT_FLAG_MBA, "mba", X86_FEATURE_MBA), RDT_OPT(RDT_FLAG_SMBA, "smba", X86_FEATURE_SMBA), RDT_OPT(RDT_FLAG_BMEC, "bmec", X86_FEATURE_BMEC), + RDT_OPT(RDT_FLAG_ABMC, "abmc", X86_FEATURE_ABMC), + RDT_OPT(RDT_FLAG_SDCIAE, "sdciae", X86_FEATURE_SDCIAE), }; #define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options) @@ -863,7 +781,7 @@ static int __init set_rdt_options(char *str) } __setup("rdt", set_rdt_options); -bool __init rdt_cpu_has(int flag) +bool rdt_cpu_has(int flag) { bool ret = boot_cpu_has(flag); struct rdt_options *o; @@ -883,6 +801,21 @@ bool __init rdt_cpu_has(int flag) return ret; } +bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt) +{ + if (!rdt_cpu_has(X86_FEATURE_BMEC)) + return false; + + switch (evt) { + case QOS_L3_MBM_TOTAL_EVENT_ID: + return rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL); + case QOS_L3_MBM_LOCAL_EVENT_ID: + return rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL); + default: + return false; + } +} + static __init bool get_mem_config(void) { struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_MBA]; @@ -927,6 +860,8 @@ static __init bool get_rdt_alloc_resources(void) rdt_get_cache_alloc_cfg(1, r); if (rdt_cpu_has(X86_FEATURE_CDP_L3)) rdt_get_cdp_l3_config(); + if (rdt_cpu_has(X86_FEATURE_SDCIAE)) + rdt_set_io_alloc_capable(r); ret = true; } if (rdt_cpu_has(X86_FEATURE_CAT_L2)) { @@ -950,15 +885,24 @@ static __init bool get_rdt_alloc_resources(void) static __init bool get_rdt_mon_resources(void) { struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + bool ret = false; - if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) - rdt_mon_features |= (1 << QOS_L3_OCCUP_EVENT_ID); - if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) - rdt_mon_features |= (1 << QOS_L3_MBM_TOTAL_EVENT_ID); - if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) - rdt_mon_features |= (1 << QOS_L3_MBM_LOCAL_EVENT_ID); + if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) { + resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID); + ret = true; + } + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) { + resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID); + ret = true; + } + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) { + resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID); + ret = true; + } + if (rdt_cpu_has(X86_FEATURE_ABMC)) + ret = true; - if (!rdt_mon_features) + if (!ret) return false; return !rdt_get_mon_l3_config(r); @@ -1052,7 +996,7 @@ static enum cpuhp_state rdt_online; /* Runs once on the BSP during boot. */ void resctrl_cpu_detect(struct cpuinfo_x86 *c) { - if (!cpu_has(c, X86_FEATURE_CQM_LLC)) { + if (!cpu_has(c, X86_FEATURE_CQM_LLC) && !cpu_has(c, X86_FEATURE_ABMC)) { c->x86_cache_max_rmid = -1; c->x86_cache_occ_scale = -1; c->x86_cache_mbm_width_offset = -1; @@ -1064,7 +1008,8 @@ void resctrl_cpu_detect(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) || cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) || - cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)) { + cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL) || + cpu_has(c, X86_FEATURE_ABMC)) { u32 eax, ebx, ecx, edx; /* QoS sub-leaf, EAX=0Fh, ECX=1 */ @@ -1079,10 +1024,14 @@ void resctrl_cpu_detect(struct cpuinfo_x86 *c) } } -static int __init resctrl_late_init(void) +static int __init resctrl_arch_late_init(void) { struct rdt_resource *r; - int state, ret; + int state, ret, i; + + /* for_each_rdt_resource() requires all rid to be initialised. */ + for (i = 0; i < RDT_NUM_RESOURCES; i++) + rdt_resources_all[i].r_resctrl.rid = i; /* * Initialize functions(or definitions) that are different @@ -1095,8 +1044,6 @@ static int __init resctrl_late_init(void) if (!get_rdt_resources()) return -ENODEV; - rdt_init_padding(); - state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/resctrl/cat:online:", resctrl_arch_online_cpu, @@ -1104,7 +1051,7 @@ static int __init resctrl_late_init(void) if (state < 0) return state; - ret = rdtgroup_init(); + ret = resctrl_init(); if (ret) { cpuhp_remove_state(state); return ret; @@ -1120,18 +1067,13 @@ static int __init resctrl_late_init(void) return 0; } -late_initcall(resctrl_late_init); +late_initcall(resctrl_arch_late_init); -static void __exit resctrl_exit(void) +static void __exit resctrl_arch_exit(void) { - struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; - cpuhp_remove_state(rdt_online); - rdtgroup_exit(); - - if (r->mon_capable) - rdt_put_mon_l3_config(); + resctrl_exit(); } -__exitcall(resctrl_exit); +__exitcall(resctrl_arch_exit); diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 200d89a64027..b20e705606b8 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -16,273 +16,15 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/cpu.h> -#include <linux/kernfs.h> -#include <linux/seq_file.h> -#include <linux/slab.h> -#include <linux/tick.h> #include "internal.h" -/* - * Check whether MBA bandwidth percentage value is correct. The value is - * checked against the minimum and max bandwidth values specified by the - * hardware. The allocated bandwidth percentage is rounded to the next - * control step available on the hardware. - */ -static bool bw_validate(char *buf, u32 *data, struct rdt_resource *r) -{ - int ret; - u32 bw; - - /* - * Only linear delay values is supported for current Intel SKUs. - */ - if (!r->membw.delay_linear && r->membw.arch_needs_linear) { - rdt_last_cmd_puts("No support for non-linear MB domains\n"); - return false; - } - - ret = kstrtou32(buf, 10, &bw); - if (ret) { - rdt_last_cmd_printf("Invalid MB value %s\n", buf); - return false; - } - - /* Nothing else to do if software controller is enabled. */ - if (is_mba_sc(r)) { - *data = bw; - return true; - } - - if (bw < r->membw.min_bw || bw > r->default_ctrl) { - rdt_last_cmd_printf("MB value %u out of range [%d,%d]\n", - bw, r->membw.min_bw, r->default_ctrl); - return false; - } - - *data = roundup(bw, (unsigned long)r->membw.bw_gran); - return true; -} - -int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, - struct rdt_ctrl_domain *d) -{ - struct resctrl_staged_config *cfg; - u32 closid = data->rdtgrp->closid; - struct rdt_resource *r = s->res; - u32 bw_val; - - cfg = &d->staged_config[s->conf_type]; - if (cfg->have_new_ctrl) { - rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id); - return -EINVAL; - } - - if (!bw_validate(data->buf, &bw_val, r)) - return -EINVAL; - - if (is_mba_sc(r)) { - d->mbps_val[closid] = bw_val; - return 0; - } - - cfg->new_ctrl = bw_val; - cfg->have_new_ctrl = true; - - return 0; -} - -/* - * Check whether a cache bit mask is valid. - * On Intel CPUs, non-contiguous 1s value support is indicated by CPUID: - * - CPUID.0x10.1:ECX[3]: L3 non-contiguous 1s value supported if 1 - * - CPUID.0x10.2:ECX[3]: L2 non-contiguous 1s value supported if 1 - * - * Haswell does not support a non-contiguous 1s value and additionally - * requires at least two bits set. - * AMD allows non-contiguous bitmasks. - */ -static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) -{ - unsigned long first_bit, zero_bit, val; - unsigned int cbm_len = r->cache.cbm_len; - int ret; - - ret = kstrtoul(buf, 16, &val); - if (ret) { - rdt_last_cmd_printf("Non-hex character in the mask %s\n", buf); - return false; - } - - if ((r->cache.min_cbm_bits > 0 && val == 0) || val > r->default_ctrl) { - rdt_last_cmd_puts("Mask out of range\n"); - return false; - } - - first_bit = find_first_bit(&val, cbm_len); - zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); - - /* Are non-contiguous bitmasks allowed? */ - if (!r->cache.arch_has_sparse_bitmasks && - (find_next_bit(&val, cbm_len, zero_bit) < cbm_len)) { - rdt_last_cmd_printf("The mask %lx has non-consecutive 1-bits\n", val); - return false; - } - - if ((zero_bit - first_bit) < r->cache.min_cbm_bits) { - rdt_last_cmd_printf("Need at least %d bits in the mask\n", - r->cache.min_cbm_bits); - return false; - } - - *data = val; - return true; -} - -/* - * Read one cache bit mask (hex). Check that it is valid for the current - * resource type. - */ -int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, - struct rdt_ctrl_domain *d) -{ - struct rdtgroup *rdtgrp = data->rdtgrp; - struct resctrl_staged_config *cfg; - struct rdt_resource *r = s->res; - u32 cbm_val; - - cfg = &d->staged_config[s->conf_type]; - if (cfg->have_new_ctrl) { - rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id); - return -EINVAL; - } - - /* - * Cannot set up more than one pseudo-locked region in a cache - * hierarchy. - */ - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && - rdtgroup_pseudo_locked_in_hierarchy(d)) { - rdt_last_cmd_puts("Pseudo-locked region in hierarchy\n"); - return -EINVAL; - } - - if (!cbm_validate(data->buf, &cbm_val, r)) - return -EINVAL; - - if ((rdtgrp->mode == RDT_MODE_EXCLUSIVE || - rdtgrp->mode == RDT_MODE_SHAREABLE) && - rdtgroup_cbm_overlaps_pseudo_locked(d, cbm_val)) { - rdt_last_cmd_puts("CBM overlaps with pseudo-locked region\n"); - return -EINVAL; - } - - /* - * The CBM may not overlap with the CBM of another closid if - * either is exclusive. - */ - if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, true)) { - rdt_last_cmd_puts("Overlaps with exclusive group\n"); - return -EINVAL; - } - - if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, false)) { - if (rdtgrp->mode == RDT_MODE_EXCLUSIVE || - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - rdt_last_cmd_puts("Overlaps with other group\n"); - return -EINVAL; - } - } - - cfg->new_ctrl = cbm_val; - cfg->have_new_ctrl = true; - - return 0; -} - -/* - * For each domain in this resource we expect to find a series of: - * id=mask - * separated by ";". The "id" is in decimal, and must match one of - * the "id"s for this resource. - */ -static int parse_line(char *line, struct resctrl_schema *s, - struct rdtgroup *rdtgrp) -{ - enum resctrl_conf_type t = s->conf_type; - struct resctrl_staged_config *cfg; - struct rdt_resource *r = s->res; - struct rdt_parse_data data; - struct rdt_ctrl_domain *d; - char *dom = NULL, *id; - unsigned long dom_id; - - /* Walking r->domains, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && - (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)) { - rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n"); - return -EINVAL; - } - -next: - if (!line || line[0] == '\0') - return 0; - dom = strsep(&line, ";"); - id = strsep(&dom, "="); - if (!dom || kstrtoul(id, 10, &dom_id)) { - rdt_last_cmd_puts("Missing '=' or non-numeric domain\n"); - return -EINVAL; - } - dom = strim(dom); - list_for_each_entry(d, &r->ctrl_domains, hdr.list) { - if (d->hdr.id == dom_id) { - data.buf = dom; - data.rdtgrp = rdtgrp; - if (r->parse_ctrlval(&data, s, d)) - return -EINVAL; - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - cfg = &d->staged_config[t]; - /* - * In pseudo-locking setup mode and just - * parsed a valid CBM that should be - * pseudo-locked. Only one locked region per - * resource group and domain so just do - * the required initialization for single - * region and return. - */ - rdtgrp->plr->s = s; - rdtgrp->plr->d = d; - rdtgrp->plr->cbm = cfg->new_ctrl; - d->plr = rdtgrp->plr; - return 0; - } - goto next; - } - } - return -EINVAL; -} - -static u32 get_config_index(u32 closid, enum resctrl_conf_type type) -{ - switch (type) { - default: - case CDP_NONE: - return closid; - case CDP_CODE: - return closid * 2 + 1; - case CDP_DATA: - return closid * 2; - } -} - int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, u32 closid, enum resctrl_conf_type t, u32 cfg_val) { struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d); struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); - u32 idx = get_config_index(closid, t); + u32 idx = resctrl_get_config_index(closid, t); struct msr_param msr_param; if (!cpumask_test_cpu(smp_processor_id(), &d->hdr.cpu_mask)) @@ -319,7 +61,7 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) if (!cfg->have_new_ctrl) continue; - idx = get_config_index(closid, t); + idx = resctrl_get_config_index(closid, t); if (cfg->new_ctrl == hw_dom->ctrl_val[idx]) continue; hw_dom->ctrl_val[idx] = cfg->new_ctrl; @@ -341,287 +83,51 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) return 0; } -static int rdtgroup_parse_resource(char *resname, char *tok, - struct rdtgroup *rdtgrp) -{ - struct resctrl_schema *s; - - list_for_each_entry(s, &resctrl_schema_all, list) { - if (!strcmp(resname, s->name) && rdtgrp->closid < s->num_closid) - return parse_line(tok, s, rdtgrp); - } - rdt_last_cmd_printf("Unknown or unsupported resource name '%s'\n", resname); - return -EINVAL; -} - -ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct resctrl_schema *s; - struct rdtgroup *rdtgrp; - struct rdt_resource *r; - char *tok, *resname; - int ret = 0; - - /* Valid input requires a trailing newline */ - if (nbytes == 0 || buf[nbytes - 1] != '\n') - return -EINVAL; - buf[nbytes - 1] = '\0'; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - rdtgroup_kn_unlock(of->kn); - return -ENOENT; - } - rdt_last_cmd_clear(); - - /* - * No changes to pseudo-locked region allowed. It has to be removed - * and re-created instead. - */ - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { - ret = -EINVAL; - rdt_last_cmd_puts("Resource group is pseudo-locked\n"); - goto out; - } - - rdt_staged_configs_clear(); - - while ((tok = strsep(&buf, "\n")) != NULL) { - resname = strim(strsep(&tok, ":")); - if (!tok) { - rdt_last_cmd_puts("Missing ':'\n"); - ret = -EINVAL; - goto out; - } - if (tok[0] == '\0') { - rdt_last_cmd_printf("Missing '%s' value\n", resname); - ret = -EINVAL; - goto out; - } - ret = rdtgroup_parse_resource(resname, tok, rdtgrp); - if (ret) - goto out; - } - - list_for_each_entry(s, &resctrl_schema_all, list) { - r = s->res; - - /* - * Writes to mba_sc resources update the software controller, - * not the control MSR. - */ - if (is_mba_sc(r)) - continue; - - ret = resctrl_arch_update_domains(r, rdtgrp->closid); - if (ret) - goto out; - } - - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - /* - * If pseudo-locking fails we keep the resource group in - * mode RDT_MODE_PSEUDO_LOCKSETUP with its class of service - * active and updated for just the domain the pseudo-locked - * region was requested for. - */ - ret = rdtgroup_pseudo_lock_create(rdtgrp); - } - -out: - rdt_staged_configs_clear(); - rdtgroup_kn_unlock(of->kn); - return ret ?: nbytes; -} - u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, u32 closid, enum resctrl_conf_type type) { struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d); - u32 idx = get_config_index(closid, type); + u32 idx = resctrl_get_config_index(closid, type); return hw_dom->ctrl_val[idx]; } -static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int closid) -{ - struct rdt_resource *r = schema->res; - struct rdt_ctrl_domain *dom; - bool sep = false; - u32 ctrl_val; - - /* Walking r->domains, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - - seq_printf(s, "%*s:", max_name_width, schema->name); - list_for_each_entry(dom, &r->ctrl_domains, hdr.list) { - if (sep) - seq_puts(s, ";"); - - if (is_mba_sc(r)) - ctrl_val = dom->mbps_val[closid]; - else - ctrl_val = resctrl_arch_get_config(r, dom, closid, - schema->conf_type); - - seq_printf(s, r->format_str, dom->hdr.id, max_data_width, - ctrl_val); - sep = true; - } - seq_puts(s, "\n"); -} - -int rdtgroup_schemata_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) +bool resctrl_arch_get_io_alloc_enabled(struct rdt_resource *r) { - struct resctrl_schema *schema; - struct rdtgroup *rdtgrp; - int ret = 0; - u32 closid; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (rdtgrp) { - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - list_for_each_entry(schema, &resctrl_schema_all, list) { - seq_printf(s, "%s:uninitialized\n", schema->name); - } - } else if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { - if (!rdtgrp->plr->d) { - rdt_last_cmd_clear(); - rdt_last_cmd_puts("Cache domain offline\n"); - ret = -ENODEV; - } else { - seq_printf(s, "%s:%d=%x\n", - rdtgrp->plr->s->res->name, - rdtgrp->plr->d->hdr.id, - rdtgrp->plr->cbm); - } - } else { - closid = rdtgrp->closid; - list_for_each_entry(schema, &resctrl_schema_all, list) { - if (closid < schema->num_closid) - show_doms(s, schema, closid); - } - } - } else { - ret = -ENOENT; - } - rdtgroup_kn_unlock(of->kn); - return ret; + return resctrl_to_arch_res(r)->sdciae_enabled; } -static int smp_mon_event_count(void *arg) +static void resctrl_sdciae_set_one_amd(void *arg) { - mon_event_count(arg); + bool *enable = arg; - return 0; + if (*enable) + msr_set_bit(MSR_IA32_L3_QOS_EXT_CFG, SDCIAE_ENABLE_BIT); + else + msr_clear_bit(MSR_IA32_L3_QOS_EXT_CFG, SDCIAE_ENABLE_BIT); } -void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, - struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, - cpumask_t *cpumask, int evtid, int first) +static void _resctrl_sdciae_enable(struct rdt_resource *r, bool enable) { - int cpu; + struct rdt_ctrl_domain *d; - /* When picking a CPU from cpu_mask, ensure it can't race with cpuhp */ + /* Walking r->ctrl_domains, ensure it can't race with cpuhp */ lockdep_assert_cpus_held(); - /* - * Setup the parameters to pass to mon_event_count() to read the data. - */ - rr->rgrp = rdtgrp; - rr->evtid = evtid; - rr->r = r; - rr->d = d; - rr->first = first; - rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid); - if (IS_ERR(rr->arch_mon_ctx)) { - rr->err = -EINVAL; - return; - } - - cpu = cpumask_any_housekeeping(cpumask, RESCTRL_PICK_ANY_CPU); - - /* - * cpumask_any_housekeeping() prefers housekeeping CPUs, but - * are all the CPUs nohz_full? If yes, pick a CPU to IPI. - * MPAM's resctrl_arch_rmid_read() is unable to read the - * counters on some platforms if its called in IRQ context. - */ - if (tick_nohz_full_cpu(cpu)) - smp_call_function_any(cpumask, mon_event_count, rr, 1); - else - smp_call_on_cpu(cpu, smp_mon_event_count, rr, false); - - resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx); + /* Update MSR_IA32_L3_QOS_EXT_CFG MSR on all the CPUs in all domains */ + list_for_each_entry(d, &r->ctrl_domains, hdr.list) + on_each_cpu_mask(&d->hdr.cpu_mask, resctrl_sdciae_set_one_amd, &enable, 1); } -int rdtgroup_mondata_show(struct seq_file *m, void *arg) +int resctrl_arch_io_alloc_enable(struct rdt_resource *r, bool enable) { - struct kernfs_open_file *of = m->private; - struct rdt_domain_hdr *hdr; - struct rmid_read rr = {0}; - struct rdt_mon_domain *d; - u32 resid, evtid, domid; - struct rdtgroup *rdtgrp; - struct rdt_resource *r; - union mon_data_bits md; - int ret = 0; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - ret = -ENOENT; - goto out; - } - - md.priv = of->kn->priv; - resid = md.u.rid; - domid = md.u.domid; - evtid = md.u.evtid; - r = &rdt_resources_all[resid].r_resctrl; + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); - if (md.u.sum) { - /* - * This file requires summing across all domains that share - * the L3 cache id that was provided in the "domid" field of the - * mon_data_bits union. Search all domains in the resource for - * one that matches this cache id. - */ - list_for_each_entry(d, &r->mon_domains, hdr.list) { - if (d->ci->id == domid) { - rr.ci = d->ci; - mon_event_read(&rr, r, NULL, rdtgrp, - &d->ci->shared_cpu_map, evtid, false); - goto checkresult; - } - } - ret = -ENOENT; - goto out; - } else { - /* - * This file provides data from a single domain. Search - * the resource to find the domain with "domid". - */ - hdr = rdt_find_domain(&r->mon_domains, domid, NULL); - if (!hdr || WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) { - ret = -ENOENT; - goto out; - } - d = container_of(hdr, struct rdt_mon_domain, hdr); - mon_event_read(&rr, r, d, rdtgrp, &d->hdr.cpu_mask, evtid, false); + if (hw_res->r_resctrl.cache.io_alloc_capable && + hw_res->sdciae_enabled != enable) { + _resctrl_sdciae_enable(r, enable); + hw_res->sdciae_enabled = enable; } -checkresult: - - if (rr.err == -EIO) - seq_puts(m, "Error\n"); - else if (rr.err == -EINVAL) - seq_puts(m, "Unavailable\n"); - else - seq_printf(m, "%llu\n", rr.val); - -out: - rdtgroup_kn_unlock(of->kn); - return ret; + return 0; } diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 955999aecfca..4a916c84a322 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -3,28 +3,21 @@ #define _ASM_X86_RESCTRL_INTERNAL_H #include <linux/resctrl.h> -#include <linux/sched.h> -#include <linux/kernfs.h> -#include <linux/fs_context.h> -#include <linux/jump_label.h> -#include <linux/tick.h> - -#include <asm/resctrl.h> #define L3_QOS_CDP_ENABLE 0x01ULL #define L2_QOS_CDP_ENABLE 0x01ULL -#define CQM_LIMBOCHECK_INTERVAL 1000 - #define MBM_CNTR_WIDTH_BASE 24 -#define MBM_OVERFLOW_INTERVAL 1000 -#define MAX_MBA_BW 100u + #define MBA_IS_LINEAR 0x4 + #define MBM_CNTR_WIDTH_OFFSET_AMD 20 #define RMID_VAL_ERROR BIT_ULL(63) + #define RMID_VAL_UNAVAIL BIT_ULL(62) + /* * With the above fields in use 62 bits remain in MSR_IA32_QM_CTR for * data to be returned. The counter width is discovered from the hardware @@ -32,341 +25,6 @@ */ #define MBM_CNTR_WIDTH_OFFSET_MAX (62 - MBM_CNTR_WIDTH_BASE) -/* Reads to Local DRAM Memory */ -#define READS_TO_LOCAL_MEM BIT(0) - -/* Reads to Remote DRAM Memory */ -#define READS_TO_REMOTE_MEM BIT(1) - -/* Non-Temporal Writes to Local Memory */ -#define NON_TEMP_WRITE_TO_LOCAL_MEM BIT(2) - -/* Non-Temporal Writes to Remote Memory */ -#define NON_TEMP_WRITE_TO_REMOTE_MEM BIT(3) - -/* Reads to Local Memory the system identifies as "Slow Memory" */ -#define READS_TO_LOCAL_S_MEM BIT(4) - -/* Reads to Remote Memory the system identifies as "Slow Memory" */ -#define READS_TO_REMOTE_S_MEM BIT(5) - -/* Dirty Victims to All Types of Memory */ -#define DIRTY_VICTIMS_TO_ALL_MEM BIT(6) - -/* Max event bits supported */ -#define MAX_EVT_CONFIG_BITS GENMASK(6, 0) - -/** - * cpumask_any_housekeeping() - Choose any CPU in @mask, preferring those that - * aren't marked nohz_full - * @mask: The mask to pick a CPU from. - * @exclude_cpu:The CPU to avoid picking. - * - * Returns a CPU from @mask, but not @exclude_cpu. If there are housekeeping - * CPUs that don't use nohz_full, these are preferred. Pass - * RESCTRL_PICK_ANY_CPU to avoid excluding any CPUs. - * - * When a CPU is excluded, returns >= nr_cpu_ids if no CPUs are available. - */ -static inline unsigned int -cpumask_any_housekeeping(const struct cpumask *mask, int exclude_cpu) -{ - unsigned int cpu, hk_cpu; - - if (exclude_cpu == RESCTRL_PICK_ANY_CPU) - cpu = cpumask_any(mask); - else - cpu = cpumask_any_but(mask, exclude_cpu); - - /* Only continue if tick_nohz_full_mask has been initialized. */ - if (!tick_nohz_full_enabled()) - return cpu; - - /* If the CPU picked isn't marked nohz_full nothing more needs doing. */ - if (cpu < nr_cpu_ids && !tick_nohz_full_cpu(cpu)) - return cpu; - - /* Try to find a CPU that isn't nohz_full to use in preference */ - hk_cpu = cpumask_nth_andnot(0, mask, tick_nohz_full_mask); - if (hk_cpu == exclude_cpu) - hk_cpu = cpumask_nth_andnot(1, mask, tick_nohz_full_mask); - - if (hk_cpu < nr_cpu_ids) - cpu = hk_cpu; - - return cpu; -} - -struct rdt_fs_context { - struct kernfs_fs_context kfc; - bool enable_cdpl2; - bool enable_cdpl3; - bool enable_mba_mbps; - bool enable_debug; -}; - -static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) -{ - struct kernfs_fs_context *kfc = fc->fs_private; - - return container_of(kfc, struct rdt_fs_context, kfc); -} - -/** - * struct mon_evt - Entry in the event list of a resource - * @evtid: event id - * @name: name of the event - * @configurable: true if the event is configurable - * @list: entry in &rdt_resource->evt_list - */ -struct mon_evt { - enum resctrl_event_id evtid; - char *name; - bool configurable; - struct list_head list; -}; - -/** - * union mon_data_bits - Monitoring details for each event file. - * @priv: Used to store monitoring event data in @u - * as kernfs private data. - * @u.rid: Resource id associated with the event file. - * @u.evtid: Event id associated with the event file. - * @u.sum: Set when event must be summed across multiple - * domains. - * @u.domid: When @u.sum is zero this is the domain to which - * the event file belongs. When @sum is one this - * is the id of the L3 cache that all domains to be - * summed share. - * @u: Name of the bit fields struct. - */ -union mon_data_bits { - void *priv; - struct { - unsigned int rid : 10; - enum resctrl_event_id evtid : 7; - unsigned int sum : 1; - unsigned int domid : 14; - } u; -}; - -/** - * struct rmid_read - Data passed across smp_call*() to read event count. - * @rgrp: Resource group for which the counter is being read. If it is a parent - * resource group then its event count is summed with the count from all - * its child resource groups. - * @r: Resource describing the properties of the event being read. - * @d: Domain that the counter should be read from. If NULL then sum all - * domains in @r sharing L3 @ci.id - * @evtid: Which monitor event to read. - * @first: Initialize MBM counter when true. - * @ci: Cacheinfo for L3. Only set when @d is NULL. Used when summing domains. - * @err: Error encountered when reading counter. - * @val: Returned value of event counter. If @rgrp is a parent resource group, - * @val includes the sum of event counts from its child resource groups. - * If @d is NULL, @val includes the sum of all domains in @r sharing @ci.id, - * (summed across child resource groups if @rgrp is a parent resource group). - * @arch_mon_ctx: Hardware monitor allocated for this read request (MPAM only). - */ -struct rmid_read { - struct rdtgroup *rgrp; - struct rdt_resource *r; - struct rdt_mon_domain *d; - enum resctrl_event_id evtid; - bool first; - struct cacheinfo *ci; - int err; - u64 val; - void *arch_mon_ctx; -}; - -extern unsigned int rdt_mon_features; -extern struct list_head resctrl_schema_all; -extern bool resctrl_mounted; - -enum rdt_group_type { - RDTCTRL_GROUP = 0, - RDTMON_GROUP, - RDT_NUM_GROUP, -}; - -/** - * enum rdtgrp_mode - Mode of a RDT resource group - * @RDT_MODE_SHAREABLE: This resource group allows sharing of its allocations - * @RDT_MODE_EXCLUSIVE: No sharing of this resource group's allocations allowed - * @RDT_MODE_PSEUDO_LOCKSETUP: Resource group will be used for Pseudo-Locking - * @RDT_MODE_PSEUDO_LOCKED: No sharing of this resource group's allocations - * allowed AND the allocations are Cache Pseudo-Locked - * @RDT_NUM_MODES: Total number of modes - * - * The mode of a resource group enables control over the allowed overlap - * between allocations associated with different resource groups (classes - * of service). User is able to modify the mode of a resource group by - * writing to the "mode" resctrl file associated with the resource group. - * - * The "shareable", "exclusive", and "pseudo-locksetup" modes are set by - * writing the appropriate text to the "mode" file. A resource group enters - * "pseudo-locked" mode after the schemata is written while the resource - * group is in "pseudo-locksetup" mode. - */ -enum rdtgrp_mode { - RDT_MODE_SHAREABLE = 0, - RDT_MODE_EXCLUSIVE, - RDT_MODE_PSEUDO_LOCKSETUP, - RDT_MODE_PSEUDO_LOCKED, - - /* Must be last */ - RDT_NUM_MODES, -}; - -/** - * struct mongroup - store mon group's data in resctrl fs. - * @mon_data_kn: kernfs node for the mon_data directory - * @parent: parent rdtgrp - * @crdtgrp_list: child rdtgroup node list - * @rmid: rmid for this rdtgroup - */ -struct mongroup { - struct kernfs_node *mon_data_kn; - struct rdtgroup *parent; - struct list_head crdtgrp_list; - u32 rmid; -}; - -/** - * struct pseudo_lock_region - pseudo-lock region information - * @s: Resctrl schema for the resource to which this - * pseudo-locked region belongs - * @d: RDT domain to which this pseudo-locked region - * belongs - * @cbm: bitmask of the pseudo-locked region - * @lock_thread_wq: waitqueue used to wait on the pseudo-locking thread - * completion - * @thread_done: variable used by waitqueue to test if pseudo-locking - * thread completed - * @cpu: core associated with the cache on which the setup code - * will be run - * @line_size: size of the cache lines - * @size: size of pseudo-locked region in bytes - * @kmem: the kernel memory associated with pseudo-locked region - * @minor: minor number of character device associated with this - * region - * @debugfs_dir: pointer to this region's directory in the debugfs - * filesystem - * @pm_reqs: Power management QoS requests related to this region - */ -struct pseudo_lock_region { - struct resctrl_schema *s; - struct rdt_ctrl_domain *d; - u32 cbm; - wait_queue_head_t lock_thread_wq; - int thread_done; - int cpu; - unsigned int line_size; - unsigned int size; - void *kmem; - unsigned int minor; - struct dentry *debugfs_dir; - struct list_head pm_reqs; -}; - -/** - * struct rdtgroup - store rdtgroup's data in resctrl file system. - * @kn: kernfs node - * @rdtgroup_list: linked list for all rdtgroups - * @closid: closid for this rdtgroup - * @cpu_mask: CPUs assigned to this rdtgroup - * @flags: status bits - * @waitcount: how many cpus expect to find this - * group when they acquire rdtgroup_mutex - * @type: indicates type of this rdtgroup - either - * monitor only or ctrl_mon group - * @mon: mongroup related data - * @mode: mode of resource group - * @plr: pseudo-locked region - */ -struct rdtgroup { - struct kernfs_node *kn; - struct list_head rdtgroup_list; - u32 closid; - struct cpumask cpu_mask; - int flags; - atomic_t waitcount; - enum rdt_group_type type; - struct mongroup mon; - enum rdtgrp_mode mode; - struct pseudo_lock_region *plr; -}; - -/* rdtgroup.flags */ -#define RDT_DELETED 1 - -/* rftype.flags */ -#define RFTYPE_FLAGS_CPUS_LIST 1 - -/* - * Define the file type flags for base and info directories. - */ -#define RFTYPE_INFO BIT(0) -#define RFTYPE_BASE BIT(1) -#define RFTYPE_CTRL BIT(4) -#define RFTYPE_MON BIT(5) -#define RFTYPE_TOP BIT(6) -#define RFTYPE_RES_CACHE BIT(8) -#define RFTYPE_RES_MB BIT(9) -#define RFTYPE_DEBUG BIT(10) -#define RFTYPE_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) -#define RFTYPE_MON_INFO (RFTYPE_INFO | RFTYPE_MON) -#define RFTYPE_TOP_INFO (RFTYPE_INFO | RFTYPE_TOP) -#define RFTYPE_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL) -#define RFTYPE_MON_BASE (RFTYPE_BASE | RFTYPE_MON) - -/* List of all resource groups */ -extern struct list_head rdt_all_groups; - -extern int max_name_width, max_data_width; - -int __init rdtgroup_init(void); -void __exit rdtgroup_exit(void); - -/** - * struct rftype - describe each file in the resctrl file system - * @name: File name - * @mode: Access mode - * @kf_ops: File operations - * @flags: File specific RFTYPE_FLAGS_* flags - * @fflags: File specific RFTYPE_* flags - * @seq_show: Show content of the file - * @write: Write to the file - */ -struct rftype { - char *name; - umode_t mode; - const struct kernfs_ops *kf_ops; - unsigned long flags; - unsigned long fflags; - - int (*seq_show)(struct kernfs_open_file *of, - struct seq_file *sf, void *v); - /* - * write() is the generic write callback which maps directly to - * kernfs write operation and overrides all other operations. - * Maximum write size is determined by ->max_write_len. - */ - ssize_t (*write)(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off); -}; - -/** - * struct mbm_state - status for each MBM counter in each domain - * @prev_bw_bytes: Previous bytes value read for bandwidth calculation - * @prev_bw: The most recent bandwidth in MBps - */ -struct mbm_state { - u64 prev_bw_bytes; - u32 prev_bw; -}; - /** * struct arch_mbm_state - values used to compute resctrl_arch_rmid_read()s * return value. @@ -379,6 +37,18 @@ struct arch_mbm_state { u64 prev_msr; }; +/* Setting bit 0 in L3_QOS_EXT_CFG enables the ABMC feature. */ +#define ABMC_ENABLE_BIT 0 + +/* + * Qos Event Identifiers. + */ +#define ABMC_EXTENDED_EVT_ID BIT(31) +#define ABMC_EVT_ID BIT(0) + +/* Setting bit 1 in MSR_IA32_L3_QOS_EXT_CFG enables the SDCIAE feature. */ +#define SDCIAE_ENABLE_BIT 1 + /** * struct rdt_hw_ctrl_domain - Arch private attributes of a set of CPUs that share * a resource for a control function @@ -396,15 +66,15 @@ struct rdt_hw_ctrl_domain { * struct rdt_hw_mon_domain - Arch private attributes of a set of CPUs that share * a resource for a monitor function * @d_resctrl: Properties exposed to the resctrl file system - * @arch_mbm_total: arch private state for MBM total bandwidth - * @arch_mbm_local: arch private state for MBM local bandwidth + * @arch_mbm_states: Per-event pointer to the MBM event's saved state. + * An MBM event's state is an array of struct arch_mbm_state + * indexed by RMID on x86. * * Members of this structure are accessed via helpers that provide abstraction. */ struct rdt_hw_mon_domain { struct rdt_mon_domain d_resctrl; - struct arch_mbm_state *arch_mbm_total; - struct arch_mbm_state *arch_mbm_local; + struct arch_mbm_state *arch_mbm_states[QOS_NUM_L3_MBM_EVENTS]; }; static inline struct rdt_hw_ctrl_domain *resctrl_to_arch_ctrl_dom(struct rdt_ctrl_domain *r) @@ -431,37 +101,6 @@ struct msr_param { u32 high; }; -static inline bool is_llc_occupancy_enabled(void) -{ - return (rdt_mon_features & (1 << QOS_L3_OCCUP_EVENT_ID)); -} - -static inline bool is_mbm_total_enabled(void) -{ - return (rdt_mon_features & (1 << QOS_L3_MBM_TOTAL_EVENT_ID)); -} - -static inline bool is_mbm_local_enabled(void) -{ - return (rdt_mon_features & (1 << QOS_L3_MBM_LOCAL_EVENT_ID)); -} - -static inline bool is_mbm_enabled(void) -{ - return (is_mbm_total_enabled() || is_mbm_local_enabled()); -} - -static inline bool is_mbm_event(int e) -{ - return (e >= QOS_L3_MBM_TOTAL_EVENT_ID && - e <= QOS_L3_MBM_LOCAL_EVENT_ID); -} - -struct rdt_parse_data { - struct rdtgroup *rdtgrp; - char *buf; -}; - /** * struct rdt_hw_resource - arch private attributes of a resctrl resource * @r_resctrl: Attributes of the resource used directly by resctrl. @@ -474,9 +113,9 @@ struct rdt_parse_data { * @msr_update: Function pointer to update QOS MSRs * @mon_scale: cqm counter * mon_scale = occupancy in bytes * @mbm_width: Monitor width, to detect and correct for overflow. - * @mbm_cfg_mask: Bandwidth sources that can be tracked when Bandwidth - * Monitoring Event Configuration (BMEC) is supported. * @cdp_enabled: CDP state of this resource + * @mbm_cntr_assign_enabled: ABMC feature is enabled + * @sdciae_enabled: SDCIAE feature (backing "io_alloc") is enabled. * * Members of this structure are either private to the architecture * e.g. mbm_width, or accessed via helpers that provide abstraction. e.g. @@ -489,8 +128,9 @@ struct rdt_hw_resource { void (*msr_update)(struct msr_param *m); unsigned int mon_scale; unsigned int mbm_width; - unsigned int mbm_cfg_mask; bool cdp_enabled; + bool mbm_cntr_assign_enabled; + bool sdciae_enabled; }; static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r) @@ -498,65 +138,10 @@ static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r return container_of(r, struct rdt_hw_resource, r_resctrl); } -int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, - struct rdt_ctrl_domain *d); -int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, - struct rdt_ctrl_domain *d); - -extern struct mutex rdtgroup_mutex; - extern struct rdt_hw_resource rdt_resources_all[]; -extern struct rdtgroup rdtgroup_default; -extern struct dentry *debugfs_resctrl; - -enum resctrl_res_level { - RDT_RESOURCE_L3, - RDT_RESOURCE_L2, - RDT_RESOURCE_MBA, - RDT_RESOURCE_SMBA, - - /* Must be the last */ - RDT_NUM_RESOURCES, -}; - -static inline struct rdt_resource *resctrl_inc(struct rdt_resource *res) -{ - struct rdt_hw_resource *hw_res = resctrl_to_arch_res(res); - - hw_res++; - return &hw_res->r_resctrl; -} - -static inline bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l) -{ - return rdt_resources_all[l].cdp_enabled; -} - -int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable); void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d); -/* - * To return the common struct rdt_resource, which is contained in struct - * rdt_hw_resource, walk the resctrl member of struct rdt_hw_resource. - */ -#define for_each_rdt_resource(r) \ - for (r = &rdt_resources_all[0].r_resctrl; \ - r <= &rdt_resources_all[RDT_NUM_RESOURCES - 1].r_resctrl; \ - r = resctrl_inc(r)) - -#define for_each_capable_rdt_resource(r) \ - for_each_rdt_resource(r) \ - if (r->alloc_capable || r->mon_capable) - -#define for_each_alloc_capable_rdt_resource(r) \ - for_each_rdt_resource(r) \ - if (r->alloc_capable) - -#define for_each_mon_capable_rdt_resource(r) \ - for_each_rdt_resource(r) \ - if (r->mon_capable) - /* CPUID.(EAX=10H, ECX=ResID=1).EAX */ union cpuid_0x10_1_eax { struct { @@ -590,67 +175,51 @@ union cpuid_0x10_x_edx { unsigned int full; }; -void rdt_last_cmd_clear(void); -void rdt_last_cmd_puts(const char *s); -__printf(1, 2) -void rdt_last_cmd_printf(const char *fmt, ...); +/* + * ABMC counters are configured by writing to MSR_IA32_L3_QOS_ABMC_CFG. + * + * @bw_type : Event configuration that represents the memory + * transactions being tracked by the @cntr_id. + * @bw_src : Bandwidth source (RMID or CLOSID). + * @reserved1 : Reserved. + * @is_clos : @bw_src field is a CLOSID (not an RMID). + * @cntr_id : Counter identifier. + * @reserved : Reserved. + * @cntr_en : Counting enable bit. + * @cfg_en : Configuration enable bit. + * + * Configuration and counting: + * Counter can be configured across multiple writes to MSR. Configuration + * is applied only when @cfg_en = 1. Counter @cntr_id is reset when the + * configuration is applied. + * @cfg_en = 1, @cntr_en = 0 : Apply @cntr_id configuration but do not + * count events. + * @cfg_en = 1, @cntr_en = 1 : Apply @cntr_id configuration and start + * counting events. + */ +union l3_qos_abmc_cfg { + struct { + unsigned long bw_type :32, + bw_src :12, + reserved1: 3, + is_clos : 1, + cntr_id : 5, + reserved : 9, + cntr_en : 1, + cfg_en : 1; + } split; + unsigned long full; +}; void rdt_ctrl_update(void *arg); -struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); -void rdtgroup_kn_unlock(struct kernfs_node *kn); -int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name); -int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, - umode_t mask); -struct rdt_domain_hdr *rdt_find_domain(struct list_head *h, int id, - struct list_head **pos); -ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off); -int rdtgroup_schemata_show(struct kernfs_open_file *of, - struct seq_file *s, void *v); -bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d, - unsigned long cbm, int closid, bool exclusive); -unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_ctrl_domain *d, - unsigned long cbm); -enum rdtgrp_mode rdtgroup_mode_by_closid(int closid); -int rdtgroup_tasks_assigned(struct rdtgroup *r); -int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); -int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp); -bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm); -bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d); -int rdt_pseudo_lock_init(void); -void rdt_pseudo_lock_release(void); -int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp); -void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp); -struct rdt_ctrl_domain *get_ctrl_domain_from_cpu(int cpu, struct rdt_resource *r); -struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu, struct rdt_resource *r); -int closids_supported(void); -void closid_free(int closid); -int alloc_rmid(u32 closid); -void free_rmid(u32 closid, u32 rmid); + int rdt_get_mon_l3_config(struct rdt_resource *r); -void __exit rdt_put_mon_l3_config(void); -bool __init rdt_cpu_has(int flag); -void mon_event_count(void *info); -int rdtgroup_mondata_show(struct seq_file *m, void *arg); -void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, - struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, - cpumask_t *cpumask, int evtid, int first); -void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, - unsigned long delay_ms, - int exclude_cpu); -void mbm_handle_overflow(struct work_struct *work); + +bool rdt_cpu_has(int flag); + void __init intel_rdt_mbm_apply_quirk(void); -bool is_mba_sc(struct rdt_resource *r); -void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, - int exclude_cpu); -void cqm_handle_limbo(struct work_struct *work); -bool has_busy_rmid(struct rdt_mon_domain *d); -void __check_limbo(struct rdt_mon_domain *d, bool force_free); + void rdt_domain_reconfigure_cdp(struct rdt_resource *r); -void __init thread_throttle_mode_init(void); -void __init mbm_config_rftype_init(const char *config); -void rdt_staged_configs_clear(void); -bool closid_allocated(unsigned int closid); -int resctrl_find_cleanest_closid(void); +void resctrl_arch_mbm_cntr_assign_set_one(struct rdt_resource *r); #endif /* _ASM_X86_RESCTRL_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 5fcb3d635d91..dffcc8307500 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -18,62 +18,12 @@ #define pr_fmt(fmt) "resctrl: " fmt #include <linux/cpu.h> -#include <linux/module.h> -#include <linux/sizes.h> -#include <linux/slab.h> +#include <linux/resctrl.h> #include <asm/cpu_device_id.h> -#include <asm/resctrl.h> +#include <asm/msr.h> #include "internal.h" -#include "trace.h" - -/** - * struct rmid_entry - dirty tracking for all RMID. - * @closid: The CLOSID for this entry. - * @rmid: The RMID for this entry. - * @busy: The number of domains with cached data using this RMID. - * @list: Member of the rmid_free_lru list when busy == 0. - * - * Depending on the architecture the correct monitor is accessed using - * both @closid and @rmid, or @rmid only. - * - * Take the rdtgroup_mutex when accessing. - */ -struct rmid_entry { - u32 closid; - u32 rmid; - int busy; - struct list_head list; -}; - -/* - * @rmid_free_lru - A least recently used list of free RMIDs - * These RMIDs are guaranteed to have an occupancy less than the - * threshold occupancy - */ -static LIST_HEAD(rmid_free_lru); - -/* - * @closid_num_dirty_rmid The number of dirty RMID each CLOSID has. - * Only allocated when CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID is defined. - * Indexed by CLOSID. Protected by rdtgroup_mutex. - */ -static u32 *closid_num_dirty_rmid; - -/* - * @rmid_limbo_count - count of currently unused but (potentially) - * dirty RMIDs. - * This counts RMIDs that no one is currently using but that - * may have a occupancy value > resctrl_rmid_realloc_threshold. User can - * change the threshold occupancy value. - */ -static unsigned int rmid_limbo_count; - -/* - * @rmid_entry - The entry in the limbo and free lists. - */ -static struct rmid_entry *rmid_ptrs; /* * Global boolean for rdt_monitor which is true if any @@ -81,28 +31,12 @@ static struct rmid_entry *rmid_ptrs; */ bool rdt_mon_capable; -/* - * Global to indicate which monitoring events are enabled. - */ -unsigned int rdt_mon_features; - -/* - * This is the threshold cache occupancy in bytes at which we will consider an - * RMID available for re-allocation. - */ -unsigned int resctrl_rmid_realloc_threshold; - -/* - * This is the maximum value for the reallocation threshold, in bytes. - */ -unsigned int resctrl_rmid_realloc_limit; - #define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5)) static int snc_nodes_per_l3_cache = 1; /* - * The correction factor table is documented in Documentation/arch/x86/resctrl.rst. + * The correction factor table is documented in Documentation/filesystems/resctrl.rst. * If rmid > rmid threshold, MBM total and local values should be multiplied * by the correction factor. * @@ -151,6 +85,7 @@ static const struct mbm_correction_factor_table { }; static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX; + static u64 mbm_cf __read_mostly; static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val) @@ -163,33 +98,6 @@ static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val) } /* - * x86 and arm64 differ in their handling of monitoring. - * x86's RMID are independent numbers, there is only one source of traffic - * with an RMID value of '1'. - * arm64's PMG extends the PARTID/CLOSID space, there are multiple sources of - * traffic with a PMG value of '1', one for each CLOSID, meaning the RMID - * value is no longer unique. - * To account for this, resctrl uses an index. On x86 this is just the RMID, - * on arm64 it encodes the CLOSID and RMID. This gives a unique number. - * - * The domain's rmid_busy_llc and rmid_ptrs[] are sized by index. The arch code - * must accept an attempt to read every index. - */ -static inline struct rmid_entry *__rmid_entry(u32 idx) -{ - struct rmid_entry *entry; - u32 closid, rmid; - - entry = &rmid_ptrs[idx]; - resctrl_arch_rmid_idx_decode(idx, &closid, &rmid); - - WARN_ON_ONCE(entry->closid != closid); - WARN_ON_ONCE(entry->rmid != rmid); - - return entry; -} - -/* * When Sub-NUMA Cluster (SNC) mode is not enabled (as indicated by * "snc_nodes_per_l3_cache == 1") no translation of the RMID value is * needed. The physical RMID is the same as the logical RMID. @@ -222,7 +130,7 @@ static int logical_rmid_to_physical_rmid(int cpu, int lrmid) if (snc_nodes_per_l3_cache == 1) return lrmid; - return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->num_rmid; + return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->mon.num_rmid; } static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val) @@ -238,7 +146,7 @@ static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val) * are error bits. */ wrmsr(MSR_IA32_QM_EVTSEL, eventid, prmid); - rdmsrl(MSR_IA32_QM_CTR, msr_val); + rdmsrq(MSR_IA32_QM_CTR, msr_val); if (msr_val & RMID_VAL_ERROR) return -EIO; @@ -253,19 +161,14 @@ static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_do u32 rmid, enum resctrl_event_id eventid) { - switch (eventid) { - case QOS_L3_OCCUP_EVENT_ID: + struct arch_mbm_state *state; + + if (!resctrl_is_mbm_event(eventid)) return NULL; - case QOS_L3_MBM_TOTAL_EVENT_ID: - return &hw_dom->arch_mbm_total[rmid]; - case QOS_L3_MBM_LOCAL_EVENT_ID: - return &hw_dom->arch_mbm_local[rmid]; - } - /* Never expect to get here */ - WARN_ON_ONCE(1); + state = hw_dom->arch_mbm_states[MBM_STATE_IDX(eventid)]; - return NULL; + return state ? &state[rmid] : NULL; } void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, @@ -294,14 +197,16 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d) { struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); + enum resctrl_event_id eventid; + int idx; - if (is_mbm_total_enabled()) - memset(hw_dom->arch_mbm_total, 0, - sizeof(*hw_dom->arch_mbm_total) * r->num_rmid); - - if (is_mbm_local_enabled()) - memset(hw_dom->arch_mbm_local, 0, - sizeof(*hw_dom->arch_mbm_local) * r->num_rmid); + for_each_mbm_event_id(eventid) { + if (!resctrl_is_mon_event_enabled(eventid)) + continue; + idx = MBM_STATE_IDX(eventid); + memset(hw_dom->arch_mbm_states[idx], 0, + sizeof(*hw_dom->arch_mbm_states[0]) * r->mon.num_rmid); + } } static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) @@ -312,24 +217,13 @@ static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) return chunks >> shift; } -int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, - u32 unused, u32 rmid, enum resctrl_event_id eventid, - u64 *val, void *ignored) +static u64 get_corrected_val(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 rmid, enum resctrl_event_id eventid, u64 msr_val) { struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); - int cpu = cpumask_any(&d->hdr.cpu_mask); struct arch_mbm_state *am; - u64 msr_val, chunks; - u32 prmid; - int ret; - - resctrl_arch_rmid_read_context_check(); - - prmid = logical_rmid_to_physical_rmid(cpu, rmid); - ret = __rmid_read_phys(prmid, eventid, &msr_val); - if (ret) - return ret; + u64 chunks; am = get_arch_mbm_state(hw_dom, rmid, eventid); if (am) { @@ -341,756 +235,103 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, chunks = msr_val; } - *val = chunks * hw_res->mon_scale; - - return 0; -} - -static void limbo_release_entry(struct rmid_entry *entry) -{ - lockdep_assert_held(&rdtgroup_mutex); - - rmid_limbo_count--; - list_add_tail(&entry->list, &rmid_free_lru); - - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) - closid_num_dirty_rmid[entry->closid]--; -} - -/* - * Check the RMIDs that are marked as busy for this domain. If the - * reported LLC occupancy is below the threshold clear the busy bit and - * decrement the count. If the busy count gets to zero on an RMID, we - * free the RMID - */ -void __check_limbo(struct rdt_mon_domain *d, bool force_free) -{ - struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; - u32 idx_limit = resctrl_arch_system_num_rmid_idx(); - struct rmid_entry *entry; - u32 idx, cur_idx = 1; - void *arch_mon_ctx; - bool rmid_dirty; - u64 val = 0; - - arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, QOS_L3_OCCUP_EVENT_ID); - if (IS_ERR(arch_mon_ctx)) { - pr_warn_ratelimited("Failed to allocate monitor context: %ld", - PTR_ERR(arch_mon_ctx)); - return; - } - - /* - * Skip RMID 0 and start from RMID 1 and check all the RMIDs that - * are marked as busy for occupancy < threshold. If the occupancy - * is less than the threshold decrement the busy counter of the - * RMID and move it to the free list when the counter reaches 0. - */ - for (;;) { - idx = find_next_bit(d->rmid_busy_llc, idx_limit, cur_idx); - if (idx >= idx_limit) - break; - - entry = __rmid_entry(idx); - if (resctrl_arch_rmid_read(r, d, entry->closid, entry->rmid, - QOS_L3_OCCUP_EVENT_ID, &val, - arch_mon_ctx)) { - rmid_dirty = true; - } else { - rmid_dirty = (val >= resctrl_rmid_realloc_threshold); - - /* - * x86's CLOSID and RMID are independent numbers, so the entry's - * CLOSID is an empty CLOSID (X86_RESCTRL_EMPTY_CLOSID). On Arm the - * RMID (PMG) extends the CLOSID (PARTID) space with bits that aren't - * used to select the configuration. It is thus necessary to track both - * CLOSID and RMID because there may be dependencies between them - * on some architectures. - */ - trace_mon_llc_occupancy_limbo(entry->closid, entry->rmid, d->hdr.id, val); - } - - if (force_free || !rmid_dirty) { - clear_bit(idx, d->rmid_busy_llc); - if (!--entry->busy) - limbo_release_entry(entry); - } - cur_idx = idx + 1; - } - - resctrl_arch_mon_ctx_free(r, QOS_L3_OCCUP_EVENT_ID, arch_mon_ctx); -} - -bool has_busy_rmid(struct rdt_mon_domain *d) -{ - u32 idx_limit = resctrl_arch_system_num_rmid_idx(); - - return find_first_bit(d->rmid_busy_llc, idx_limit) != idx_limit; + return chunks * hw_res->mon_scale; } -static struct rmid_entry *resctrl_find_free_rmid(u32 closid) -{ - struct rmid_entry *itr; - u32 itr_idx, cmp_idx; - - if (list_empty(&rmid_free_lru)) - return rmid_limbo_count ? ERR_PTR(-EBUSY) : ERR_PTR(-ENOSPC); - - list_for_each_entry(itr, &rmid_free_lru, list) { - /* - * Get the index of this free RMID, and the index it would need - * to be if it were used with this CLOSID. - * If the CLOSID is irrelevant on this architecture, the two - * index values are always the same on every entry and thus the - * very first entry will be returned. - */ - itr_idx = resctrl_arch_rmid_idx_encode(itr->closid, itr->rmid); - cmp_idx = resctrl_arch_rmid_idx_encode(closid, itr->rmid); - - if (itr_idx == cmp_idx) - return itr; - } - - return ERR_PTR(-ENOSPC); -} - -/** - * resctrl_find_cleanest_closid() - Find a CLOSID where all the associated - * RMID are clean, or the CLOSID that has - * the most clean RMID. - * - * MPAM's equivalent of RMID are per-CLOSID, meaning a freshly allocated CLOSID - * may not be able to allocate clean RMID. To avoid this the allocator will - * choose the CLOSID with the most clean RMID. - * - * When the CLOSID and RMID are independent numbers, the first free CLOSID will - * be returned. - */ -int resctrl_find_cleanest_closid(void) -{ - u32 cleanest_closid = ~0; - int i = 0; - - lockdep_assert_held(&rdtgroup_mutex); - - if (!IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) - return -EIO; - - for (i = 0; i < closids_supported(); i++) { - int num_dirty; - - if (closid_allocated(i)) - continue; - - num_dirty = closid_num_dirty_rmid[i]; - if (num_dirty == 0) - return i; - - if (cleanest_closid == ~0) - cleanest_closid = i; - - if (num_dirty < closid_num_dirty_rmid[cleanest_closid]) - cleanest_closid = i; - } - - if (cleanest_closid == ~0) - return -ENOSPC; - - return cleanest_closid; -} - -/* - * For MPAM the RMID value is not unique, and has to be considered with - * the CLOSID. The (CLOSID, RMID) pair is allocated on all domains, which - * allows all domains to be managed by a single free list. - * Each domain also has a rmid_busy_llc to reduce the work of the limbo handler. - */ -int alloc_rmid(u32 closid) -{ - struct rmid_entry *entry; - - lockdep_assert_held(&rdtgroup_mutex); - - entry = resctrl_find_free_rmid(closid); - if (IS_ERR(entry)) - return PTR_ERR(entry); - - list_del(&entry->list); - return entry->rmid; -} - -static void add_rmid_to_limbo(struct rmid_entry *entry) -{ - struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; - struct rdt_mon_domain *d; - u32 idx; - - lockdep_assert_held(&rdtgroup_mutex); - - /* Walking r->domains, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - - idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid); - - entry->busy = 0; - list_for_each_entry(d, &r->mon_domains, hdr.list) { - /* - * For the first limbo RMID in the domain, - * setup up the limbo worker. - */ - if (!has_busy_rmid(d)) - cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL, - RESCTRL_PICK_ANY_CPU); - set_bit(idx, d->rmid_busy_llc); - entry->busy++; - } - - rmid_limbo_count++; - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) - closid_num_dirty_rmid[entry->closid]++; -} - -void free_rmid(u32 closid, u32 rmid) -{ - u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); - struct rmid_entry *entry; - - lockdep_assert_held(&rdtgroup_mutex); - - /* - * Do not allow the default rmid to be free'd. Comparing by index - * allows architectures that ignore the closid parameter to avoid an - * unnecessary check. - */ - if (!resctrl_arch_mon_capable() || - idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, - RESCTRL_RESERVED_RMID)) - return; - - entry = __rmid_entry(idx); - - if (is_llc_occupancy_enabled()) - add_rmid_to_limbo(entry); - else - list_add_tail(&entry->list, &rmid_free_lru); -} - -static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid, - u32 rmid, enum resctrl_event_id evtid) -{ - u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); - - switch (evtid) { - case QOS_L3_MBM_TOTAL_EVENT_ID: - return &d->mbm_total[idx]; - case QOS_L3_MBM_LOCAL_EVENT_ID: - return &d->mbm_local[idx]; - default: - return NULL; - } -} - -static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) -{ - int cpu = smp_processor_id(); - struct rdt_mon_domain *d; - struct mbm_state *m; - int err, ret; - u64 tval = 0; - - if (rr->first) { - resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid); - m = get_mbm_state(rr->d, closid, rmid, rr->evtid); - if (m) - memset(m, 0, sizeof(struct mbm_state)); - return 0; - } - - if (rr->d) { - /* Reading a single domain, must be on a CPU in that domain. */ - if (!cpumask_test_cpu(cpu, &rr->d->hdr.cpu_mask)) - return -EINVAL; - rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, - rr->evtid, &tval, rr->arch_mon_ctx); - if (rr->err) - return rr->err; - - rr->val += tval; - - return 0; - } - - /* Summing domains that share a cache, must be on a CPU for that cache. */ - if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map)) - return -EINVAL; - - /* - * Legacy files must report the sum of an event across all - * domains that share the same L3 cache instance. - * Report success if a read from any domain succeeds, -EINVAL - * (translated to "Unavailable" for user space) if reading from - * all domains fail for any reason. - */ - ret = -EINVAL; - list_for_each_entry(d, &rr->r->mon_domains, hdr.list) { - if (d->ci->id != rr->ci->id) - continue; - err = resctrl_arch_rmid_read(rr->r, d, closid, rmid, - rr->evtid, &tval, rr->arch_mon_ctx); - if (!err) { - rr->val += tval; - ret = 0; - } - } - - if (ret) - rr->err = ret; - - return ret; -} - -/* - * mbm_bw_count() - Update bw count from values previously read by - * __mon_event_count(). - * @closid: The closid used to identify the cached mbm_state. - * @rmid: The rmid used to identify the cached mbm_state. - * @rr: The struct rmid_read populated by __mon_event_count(). - * - * Supporting function to calculate the memory bandwidth - * and delta bandwidth in MBps. The chunks value previously read by - * __mon_event_count() is compared with the chunks value from the previous - * invocation. This must be called once per second to maintain values in MBps. - */ -static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr) -{ - u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); - struct mbm_state *m = &rr->d->mbm_local[idx]; - u64 cur_bw, bytes, cur_bytes; - - cur_bytes = rr->val; - bytes = cur_bytes - m->prev_bw_bytes; - m->prev_bw_bytes = cur_bytes; - - cur_bw = bytes / SZ_1M; - - m->prev_bw = cur_bw; -} - -/* - * This is scheduled by mon_event_read() to read the CQM/MBM counters - * on a domain. - */ -void mon_event_count(void *info) +int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 unused, u32 rmid, enum resctrl_event_id eventid, + u64 *val, void *ignored) { - struct rdtgroup *rdtgrp, *entry; - struct rmid_read *rr = info; - struct list_head *head; + struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); + int cpu = cpumask_any(&d->hdr.cpu_mask); + struct arch_mbm_state *am; + u64 msr_val; + u32 prmid; int ret; - rdtgrp = rr->rgrp; - - ret = __mon_event_count(rdtgrp->closid, rdtgrp->mon.rmid, rr); - - /* - * For Ctrl groups read data from child monitor groups and - * add them together. Count events which are read successfully. - * Discard the rmid_read's reporting errors. - */ - head = &rdtgrp->mon.crdtgrp_list; - - if (rdtgrp->type == RDTCTRL_GROUP) { - list_for_each_entry(entry, head, mon.crdtgrp_list) { - if (__mon_event_count(entry->closid, entry->mon.rmid, - rr) == 0) - ret = 0; - } - } - - /* - * __mon_event_count() calls for newly created monitor groups may - * report -EINVAL/Unavailable if the monitor hasn't seen any traffic. - * Discard error if any of the monitor event reads succeeded. - */ - if (ret == 0) - rr->err = 0; -} - -/* - * Feedback loop for MBA software controller (mba_sc) - * - * mba_sc is a feedback loop where we periodically read MBM counters and - * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so - * that: - * - * current bandwidth(cur_bw) < user specified bandwidth(user_bw) - * - * This uses the MBM counters to measure the bandwidth and MBA throttle - * MSRs to control the bandwidth for a particular rdtgrp. It builds on the - * fact that resctrl rdtgroups have both monitoring and control. - * - * The frequency of the checks is 1s and we just tag along the MBM overflow - * timer. Having 1s interval makes the calculation of bandwidth simpler. - * - * Although MBA's goal is to restrict the bandwidth to a maximum, there may - * be a need to increase the bandwidth to avoid unnecessarily restricting - * the L2 <-> L3 traffic. - * - * Since MBA controls the L2 external bandwidth where as MBM measures the - * L3 external bandwidth the following sequence could lead to such a - * situation. - * - * Consider an rdtgroup which had high L3 <-> memory traffic in initial - * phases -> mba_sc kicks in and reduced bandwidth percentage values -> but - * after some time rdtgroup has mostly L2 <-> L3 traffic. - * - * In this case we may restrict the rdtgroup's L2 <-> L3 traffic as its - * throttle MSRs already have low percentage values. To avoid - * unnecessarily restricting such rdtgroups, we also increase the bandwidth. - */ -static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm) -{ - u32 closid, rmid, cur_msr_val, new_msr_val; - struct mbm_state *pmbm_data, *cmbm_data; - struct rdt_ctrl_domain *dom_mba; - struct rdt_resource *r_mba; - u32 cur_bw, user_bw, idx; - struct list_head *head; - struct rdtgroup *entry; - - if (!is_mbm_local_enabled()) - return; - - r_mba = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl; - - closid = rgrp->closid; - rmid = rgrp->mon.rmid; - idx = resctrl_arch_rmid_idx_encode(closid, rmid); - pmbm_data = &dom_mbm->mbm_local[idx]; - - dom_mba = get_ctrl_domain_from_cpu(smp_processor_id(), r_mba); - if (!dom_mba) { - pr_warn_once("Failure to get domain for MBA update\n"); - return; - } - - cur_bw = pmbm_data->prev_bw; - user_bw = dom_mba->mbps_val[closid]; - - /* MBA resource doesn't support CDP */ - cur_msr_val = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE); - - /* - * For Ctrl groups read data from child monitor groups. - */ - head = &rgrp->mon.crdtgrp_list; - list_for_each_entry(entry, head, mon.crdtgrp_list) { - cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid]; - cur_bw += cmbm_data->prev_bw; - } - - /* - * Scale up/down the bandwidth linearly for the ctrl group. The - * bandwidth step is the bandwidth granularity specified by the - * hardware. - * Always increase throttling if current bandwidth is above the - * target set by user. - * But avoid thrashing up and down on every poll by checking - * whether a decrease in throttling is likely to push the group - * back over target. E.g. if currently throttling to 30% of bandwidth - * on a system with 10% granularity steps, check whether moving to - * 40% would go past the limit by multiplying current bandwidth by - * "(30 + 10) / 30". - */ - if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) { - new_msr_val = cur_msr_val - r_mba->membw.bw_gran; - } else if (cur_msr_val < MAX_MBA_BW && - (user_bw > (cur_bw * (cur_msr_val + r_mba->membw.min_bw) / cur_msr_val))) { - new_msr_val = cur_msr_val + r_mba->membw.bw_gran; - } else { - return; - } - - resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val); -} - -static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d, - u32 closid, u32 rmid) -{ - struct rmid_read rr = {0}; - - rr.r = r; - rr.d = d; - - /* - * This is protected from concurrent reads from user - * as both the user and we hold the global mutex. - */ - if (is_mbm_total_enabled()) { - rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID; - rr.val = 0; - rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); - if (IS_ERR(rr.arch_mon_ctx)) { - pr_warn_ratelimited("Failed to allocate monitor context: %ld", - PTR_ERR(rr.arch_mon_ctx)); - return; - } - - __mon_event_count(closid, rmid, &rr); - - resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); - } - if (is_mbm_local_enabled()) { - rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID; - rr.val = 0; - rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); - if (IS_ERR(rr.arch_mon_ctx)) { - pr_warn_ratelimited("Failed to allocate monitor context: %ld", - PTR_ERR(rr.arch_mon_ctx)); - return; - } - - __mon_event_count(closid, rmid, &rr); - - /* - * Call the MBA software controller only for the - * control groups and when user has enabled - * the software controller explicitly. - */ - if (is_mba_sc(NULL)) - mbm_bw_count(closid, rmid, &rr); - - resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); - } -} - -/* - * Handler to scan the limbo list and move the RMIDs - * to free list whose occupancy < threshold_occupancy. - */ -void cqm_handle_limbo(struct work_struct *work) -{ - unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL); - struct rdt_mon_domain *d; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - d = container_of(work, struct rdt_mon_domain, cqm_limbo.work); + resctrl_arch_rmid_read_context_check(); - __check_limbo(d, false); + prmid = logical_rmid_to_physical_rmid(cpu, rmid); + ret = __rmid_read_phys(prmid, eventid, &msr_val); - if (has_busy_rmid(d)) { - d->cqm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask, - RESCTRL_PICK_ANY_CPU); - schedule_delayed_work_on(d->cqm_work_cpu, &d->cqm_limbo, - delay); + if (!ret) { + *val = get_corrected_val(r, d, rmid, eventid, msr_val); + } else if (ret == -EINVAL) { + am = get_arch_mbm_state(hw_dom, rmid, eventid); + if (am) + am->prev_msr = 0; } - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); -} - -/** - * cqm_setup_limbo_handler() - Schedule the limbo handler to run for this - * domain. - * @dom: The domain the limbo handler should run for. - * @delay_ms: How far in the future the handler should run. - * @exclude_cpu: Which CPU the handler should not run on, - * RESCTRL_PICK_ANY_CPU to pick any CPU. - */ -void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, - int exclude_cpu) -{ - unsigned long delay = msecs_to_jiffies(delay_ms); - int cpu; - - cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu); - dom->cqm_work_cpu = cpu; - - if (cpu < nr_cpu_ids) - schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay); + return ret; } -void mbm_handle_overflow(struct work_struct *work) +static int __cntr_id_read(u32 cntr_id, u64 *val) { - unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL); - struct rdtgroup *prgrp, *crgrp; - struct rdt_mon_domain *d; - struct list_head *head; - struct rdt_resource *r; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - /* - * If the filesystem has been unmounted this work no longer needs to - * run. - */ - if (!resctrl_mounted || !resctrl_arch_mon_capable()) - goto out_unlock; - - r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; - d = container_of(work, struct rdt_mon_domain, mbm_over.work); - - list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { - mbm_update(r, d, prgrp->closid, prgrp->mon.rmid); - - head = &prgrp->mon.crdtgrp_list; - list_for_each_entry(crgrp, head, mon.crdtgrp_list) - mbm_update(r, d, crgrp->closid, crgrp->mon.rmid); - - if (is_mba_sc(NULL)) - update_mba_bw(prgrp, d); - } + u64 msr_val; /* - * Re-check for housekeeping CPUs. This allows the overflow handler to - * move off a nohz_full CPU quickly. + * QM_EVTSEL Register definition: + * ======================================================= + * Bits Mnemonic Description + * ======================================================= + * 63:44 -- Reserved + * 43:32 RMID RMID or counter ID in ABMC mode + * when reading an MBM event + * 31 ExtendedEvtID Extended Event Identifier + * 30:8 -- Reserved + * 7:0 EvtID Event Identifier + * ======================================================= + * The contents of a specific counter can be read by setting the + * following fields in QM_EVTSEL.ExtendedEvtID(=1) and + * QM_EVTSEL.EvtID = L3CacheABMC (=1) and setting QM_EVTSEL.RMID + * to the desired counter ID. Reading the QM_CTR then returns the + * contents of the specified counter. The RMID_VAL_ERROR bit is set + * if the counter configuration is invalid, or if an invalid counter + * ID is set in the QM_EVTSEL.RMID field. The RMID_VAL_UNAVAIL bit + * is set if the counter data is unavailable. */ - d->mbm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask, - RESCTRL_PICK_ANY_CPU); - schedule_delayed_work_on(d->mbm_work_cpu, &d->mbm_over, delay); - -out_unlock: - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); -} - -/** - * mbm_setup_overflow_handler() - Schedule the overflow handler to run for this - * domain. - * @dom: The domain the overflow handler should run for. - * @delay_ms: How far in the future the handler should run. - * @exclude_cpu: Which CPU the handler should not run on, - * RESCTRL_PICK_ANY_CPU to pick any CPU. - */ -void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, - int exclude_cpu) -{ - unsigned long delay = msecs_to_jiffies(delay_ms); - int cpu; + wrmsr(MSR_IA32_QM_EVTSEL, ABMC_EXTENDED_EVT_ID | ABMC_EVT_ID, cntr_id); + rdmsrl(MSR_IA32_QM_CTR, msr_val); - /* - * When a domain comes online there is no guarantee the filesystem is - * mounted. If not, there is no need to catch counter overflow. - */ - if (!resctrl_mounted || !resctrl_arch_mon_capable()) - return; - cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu); - dom->mbm_work_cpu = cpu; + if (msr_val & RMID_VAL_ERROR) + return -EIO; + if (msr_val & RMID_VAL_UNAVAIL) + return -EINVAL; - if (cpu < nr_cpu_ids) - schedule_delayed_work_on(cpu, &dom->mbm_over, delay); + *val = msr_val; + return 0; } -static int dom_data_init(struct rdt_resource *r) +void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 unused, u32 rmid, int cntr_id, + enum resctrl_event_id eventid) { - u32 idx_limit = resctrl_arch_system_num_rmid_idx(); - u32 num_closid = resctrl_arch_get_num_closid(r); - struct rmid_entry *entry = NULL; - int err = 0, i; - u32 idx; - - mutex_lock(&rdtgroup_mutex); - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { - u32 *tmp; - - /* - * If the architecture hasn't provided a sanitised value here, - * this may result in larger arrays than necessary. Resctrl will - * use a smaller system wide value based on the resources in - * use. - */ - tmp = kcalloc(num_closid, sizeof(*tmp), GFP_KERNEL); - if (!tmp) { - err = -ENOMEM; - goto out_unlock; - } - - closid_num_dirty_rmid = tmp; - } - - rmid_ptrs = kcalloc(idx_limit, sizeof(struct rmid_entry), GFP_KERNEL); - if (!rmid_ptrs) { - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { - kfree(closid_num_dirty_rmid); - closid_num_dirty_rmid = NULL; - } - err = -ENOMEM; - goto out_unlock; - } + struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); + struct arch_mbm_state *am; - for (i = 0; i < idx_limit; i++) { - entry = &rmid_ptrs[i]; - INIT_LIST_HEAD(&entry->list); + am = get_arch_mbm_state(hw_dom, rmid, eventid); + if (am) { + memset(am, 0, sizeof(*am)); - resctrl_arch_rmid_idx_decode(i, &entry->closid, &entry->rmid); - list_add_tail(&entry->list, &rmid_free_lru); + /* Record any initial, non-zero count value. */ + __cntr_id_read(cntr_id, &am->prev_msr); } - - /* - * RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID are special and - * are always allocated. These are used for the rdtgroup_default - * control group, which will be setup later in rdtgroup_init(). - */ - idx = resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, - RESCTRL_RESERVED_RMID); - entry = __rmid_entry(idx); - list_del(&entry->list); - -out_unlock: - mutex_unlock(&rdtgroup_mutex); - - return err; } -static void __exit dom_data_exit(void) +int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 unused, u32 rmid, int cntr_id, + enum resctrl_event_id eventid, u64 *val) { - mutex_lock(&rdtgroup_mutex); - - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { - kfree(closid_num_dirty_rmid); - closid_num_dirty_rmid = NULL; - } - - kfree(rmid_ptrs); - rmid_ptrs = NULL; - - mutex_unlock(&rdtgroup_mutex); -} - -static struct mon_evt llc_occupancy_event = { - .name = "llc_occupancy", - .evtid = QOS_L3_OCCUP_EVENT_ID, -}; + u64 msr_val; + int ret; -static struct mon_evt mbm_total_event = { - .name = "mbm_total_bytes", - .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, -}; + ret = __cntr_id_read(cntr_id, &msr_val); + if (ret) + return ret; -static struct mon_evt mbm_local_event = { - .name = "mbm_local_bytes", - .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, -}; + *val = get_corrected_val(r, d, rmid, eventid, msr_val); -/* - * Initialize the event list for the resource. - * - * Note that MBM events are also part of RDT_RESOURCE_L3 resource - * because as per the SDM the total and local memory bandwidth - * are enumerated as part of L3 monitoring. - */ -static void l3_mon_evt_init(struct rdt_resource *r) -{ - INIT_LIST_HEAD(&r->evt_list); - - if (is_llc_occupancy_enabled()) - list_add_tail(&llc_occupancy_event.list, &r->evt_list); - if (is_mbm_total_enabled()) - list_add_tail(&mbm_total_event.list, &r->evt_list); - if (is_mbm_local_enabled()) - list_add_tail(&mbm_local_event.list, &r->evt_list); + return 0; } /* @@ -1120,6 +361,7 @@ static const struct x86_cpu_id snc_cpu_ids[] __initconst = { X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, 0), X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, 0), X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, 0), + X86_MATCH_VFM(INTEL_ATOM_DARKMONT_X, 0), {} }; @@ -1181,13 +423,13 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset; struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); unsigned int threshold; - int ret; + u32 eax, ebx, ecx, edx; snc_nodes_per_l3_cache = snc_get_config(); resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024; hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale / snc_nodes_per_l3_cache; - r->num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache; + r->mon.num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache; hw_res->mbm_width = MBM_CNTR_WIDTH_BASE; if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX) @@ -1202,7 +444,7 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) * * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. */ - threshold = resctrl_rmid_realloc_limit / r->num_rmid; + threshold = resctrl_rmid_realloc_limit / r->mon.num_rmid; /* * Because num_rmid may not be a power of two, round the value @@ -1211,39 +453,33 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) */ resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold); - ret = dom_data_init(r); - if (ret) - return ret; - - if (rdt_cpu_has(X86_FEATURE_BMEC)) { - u32 eax, ebx, ecx, edx; - + if (rdt_cpu_has(X86_FEATURE_BMEC) || rdt_cpu_has(X86_FEATURE_ABMC)) { /* Detect list of bandwidth sources that can be tracked */ cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx); - hw_res->mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS; - - if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) { - mbm_total_event.configurable = true; - mbm_config_rftype_init("mbm_total_bytes_config"); - } - if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) { - mbm_local_event.configurable = true; - mbm_config_rftype_init("mbm_local_bytes_config"); - } + r->mon.mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS; } - l3_mon_evt_init(r); + /* + * resctrl assumes a system that supports assignable counters can + * switch to "default" mode. Ensure that there is a "default" mode + * to switch to. This enforces a dependency between the independent + * X86_FEATURE_ABMC and X86_FEATURE_CQM_MBM_TOTAL/X86_FEATURE_CQM_MBM_LOCAL + * hardware features. + */ + if (rdt_cpu_has(X86_FEATURE_ABMC) && + (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL) || + rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL))) { + r->mon.mbm_cntr_assignable = true; + cpuid_count(0x80000020, 5, &eax, &ebx, &ecx, &edx); + r->mon.num_mbm_cntrs = (ebx & GENMASK(15, 0)) + 1; + hw_res->mbm_cntr_assign_enabled = true; + } r->mon_capable = true; return 0; } -void __exit rdt_put_mon_l3_config(void) -{ - dom_data_exit(); -} - void __init intel_rdt_mbm_apply_quirk(void) { int cf_index; @@ -1257,3 +493,91 @@ void __init intel_rdt_mbm_apply_quirk(void) mbm_cf_rmidthreshold = mbm_cf_table[cf_index].rmidthreshold; mbm_cf = mbm_cf_table[cf_index].cf; } + +static void resctrl_abmc_set_one_amd(void *arg) +{ + bool *enable = arg; + + if (*enable) + msr_set_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT); + else + msr_clear_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT); +} + +/* + * ABMC enable/disable requires update of L3_QOS_EXT_CFG MSR on all the CPUs + * associated with all monitor domains. + */ +static void _resctrl_abmc_enable(struct rdt_resource *r, bool enable) +{ + struct rdt_mon_domain *d; + + lockdep_assert_cpus_held(); + + list_for_each_entry(d, &r->mon_domains, hdr.list) { + on_each_cpu_mask(&d->hdr.cpu_mask, resctrl_abmc_set_one_amd, + &enable, 1); + resctrl_arch_reset_rmid_all(r, d); + } +} + +int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable) +{ + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + + if (r->mon.mbm_cntr_assignable && + hw_res->mbm_cntr_assign_enabled != enable) { + _resctrl_abmc_enable(r, enable); + hw_res->mbm_cntr_assign_enabled = enable; + } + + return 0; +} + +bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r) +{ + return resctrl_to_arch_res(r)->mbm_cntr_assign_enabled; +} + +static void resctrl_abmc_config_one_amd(void *info) +{ + union l3_qos_abmc_cfg *abmc_cfg = info; + + wrmsrl(MSR_IA32_L3_QOS_ABMC_CFG, abmc_cfg->full); +} + +/* + * Send an IPI to the domain to assign the counter to RMID, event pair. + */ +void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + enum resctrl_event_id evtid, u32 rmid, u32 closid, + u32 cntr_id, bool assign) +{ + struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); + union l3_qos_abmc_cfg abmc_cfg = { 0 }; + struct arch_mbm_state *am; + + abmc_cfg.split.cfg_en = 1; + abmc_cfg.split.cntr_en = assign ? 1 : 0; + abmc_cfg.split.cntr_id = cntr_id; + abmc_cfg.split.bw_src = rmid; + if (assign) + abmc_cfg.split.bw_type = resctrl_get_mon_evt_cfg(evtid); + + smp_call_function_any(&d->hdr.cpu_mask, resctrl_abmc_config_one_amd, &abmc_cfg, 1); + + /* + * The hardware counter is reset (because cfg_en == 1) so there is no + * need to record initial non-zero counts. + */ + am = get_arch_mbm_state(hw_dom, rmid, evtid); + if (am) + memset(am, 0, sizeof(*am)); +} + +void resctrl_arch_mbm_cntr_assign_set_one(struct rdt_resource *r) +{ + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + + resctrl_abmc_set_one_amd(&hw_res->mbm_cntr_assign_enabled); +} diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 972e6b6b0481..de580eca3363 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -11,26 +11,22 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/cacheflush.h> #include <linux/cpu.h> -#include <linux/cpumask.h> -#include <linux/debugfs.h> -#include <linux/kthread.h> -#include <linux/mman.h> #include <linux/perf_event.h> #include <linux/pm_qos.h> -#include <linux/slab.h> -#include <linux/uaccess.h> +#include <linux/resctrl.h> -#include <asm/cacheflush.h> #include <asm/cpu_device_id.h> -#include <asm/resctrl.h> #include <asm/perf_event.h> +#include <asm/msr.h> #include "../../events/perf_event.h" /* For X86_CONFIG() */ #include "internal.h" #define CREATE_TRACE_POINTS -#include "trace.h" + +#include "pseudo_lock_trace.h" /* * The bits needed to disable hardware prefetching varies based on the @@ -38,30 +34,9 @@ */ static u64 prefetch_disable_bits; -/* - * Major number assigned to and shared by all devices exposing - * pseudo-locked regions. - */ -static unsigned int pseudo_lock_major; -static unsigned long pseudo_lock_minor_avail = GENMASK(MINORBITS, 0); - -static char *pseudo_lock_devnode(const struct device *dev, umode_t *mode) -{ - const struct rdtgroup *rdtgrp; - - rdtgrp = dev_get_drvdata(dev); - if (mode) - *mode = 0600; - return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdtgrp->kn->name); -} - -static const struct class pseudo_lock_class = { - .name = "pseudo_lock", - .devnode = pseudo_lock_devnode, -}; - /** - * get_prefetch_disable_bits - prefetch disable bits of supported platforms + * resctrl_arch_get_prefetch_disable_bits - prefetch disable bits of supported + * platforms * @void: It takes no parameters. * * Capture the list of platforms that have been validated to support @@ -75,14 +50,16 @@ static const struct class pseudo_lock_class = { * in the SDM. * * When adding a platform here also add support for its cache events to - * measure_cycles_perf_fn() + * resctrl_arch_measure_l*_residency() * * Return: * If platform is supported, the bits to disable hardware prefetchers, 0 * if platform is not supported. */ -static u64 get_prefetch_disable_bits(void) +u64 resctrl_arch_get_prefetch_disable_bits(void) { + prefetch_disable_bits = 0; + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || boot_cpu_data.x86 != 6) return 0; @@ -98,7 +75,8 @@ static u64 get_prefetch_disable_bits(void) * 3 DCU IP Prefetcher Disable (R/W) * 63:4 Reserved */ - return 0xF; + prefetch_disable_bits = 0xF; + break; case INTEL_ATOM_GOLDMONT: case INTEL_ATOM_GOLDMONT_PLUS: /* @@ -109,307 +87,16 @@ static u64 get_prefetch_disable_bits(void) * 2 DCU Hardware Prefetcher Disable (R/W) * 63:3 Reserved */ - return 0x5; - } - - return 0; -} - -/** - * pseudo_lock_minor_get - Obtain available minor number - * @minor: Pointer to where new minor number will be stored - * - * A bitmask is used to track available minor numbers. Here the next free - * minor number is marked as unavailable and returned. - * - * Return: 0 on success, <0 on failure. - */ -static int pseudo_lock_minor_get(unsigned int *minor) -{ - unsigned long first_bit; - - first_bit = find_first_bit(&pseudo_lock_minor_avail, MINORBITS); - - if (first_bit == MINORBITS) - return -ENOSPC; - - __clear_bit(first_bit, &pseudo_lock_minor_avail); - *minor = first_bit; - - return 0; -} - -/** - * pseudo_lock_minor_release - Return minor number to available - * @minor: The minor number made available - */ -static void pseudo_lock_minor_release(unsigned int minor) -{ - __set_bit(minor, &pseudo_lock_minor_avail); -} - -/** - * region_find_by_minor - Locate a pseudo-lock region by inode minor number - * @minor: The minor number of the device representing pseudo-locked region - * - * When the character device is accessed we need to determine which - * pseudo-locked region it belongs to. This is done by matching the minor - * number of the device to the pseudo-locked region it belongs. - * - * Minor numbers are assigned at the time a pseudo-locked region is associated - * with a cache instance. - * - * Return: On success return pointer to resource group owning the pseudo-locked - * region, NULL on failure. - */ -static struct rdtgroup *region_find_by_minor(unsigned int minor) -{ - struct rdtgroup *rdtgrp, *rdtgrp_match = NULL; - - list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { - if (rdtgrp->plr && rdtgrp->plr->minor == minor) { - rdtgrp_match = rdtgrp; - break; - } - } - return rdtgrp_match; -} - -/** - * struct pseudo_lock_pm_req - A power management QoS request list entry - * @list: Entry within the @pm_reqs list for a pseudo-locked region - * @req: PM QoS request - */ -struct pseudo_lock_pm_req { - struct list_head list; - struct dev_pm_qos_request req; -}; - -static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr) -{ - struct pseudo_lock_pm_req *pm_req, *next; - - list_for_each_entry_safe(pm_req, next, &plr->pm_reqs, list) { - dev_pm_qos_remove_request(&pm_req->req); - list_del(&pm_req->list); - kfree(pm_req); - } -} - -/** - * pseudo_lock_cstates_constrain - Restrict cores from entering C6 - * @plr: Pseudo-locked region - * - * To prevent the cache from being affected by power management entering - * C6 has to be avoided. This is accomplished by requesting a latency - * requirement lower than lowest C6 exit latency of all supported - * platforms as found in the cpuidle state tables in the intel_idle driver. - * At this time it is possible to do so with a single latency requirement - * for all supported platforms. - * - * Since Goldmont is supported, which is affected by X86_BUG_MONITOR, - * the ACPI latencies need to be considered while keeping in mind that C2 - * may be set to map to deeper sleep states. In this case the latency - * requirement needs to prevent entering C2 also. - * - * Return: 0 on success, <0 on failure - */ -static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr) -{ - struct pseudo_lock_pm_req *pm_req; - int cpu; - int ret; - - for_each_cpu(cpu, &plr->d->hdr.cpu_mask) { - pm_req = kzalloc(sizeof(*pm_req), GFP_KERNEL); - if (!pm_req) { - rdt_last_cmd_puts("Failure to allocate memory for PM QoS\n"); - ret = -ENOMEM; - goto out_err; - } - ret = dev_pm_qos_add_request(get_cpu_device(cpu), - &pm_req->req, - DEV_PM_QOS_RESUME_LATENCY, - 30); - if (ret < 0) { - rdt_last_cmd_printf("Failed to add latency req CPU%d\n", - cpu); - kfree(pm_req); - ret = -1; - goto out_err; - } - list_add(&pm_req->list, &plr->pm_reqs); - } - - return 0; - -out_err: - pseudo_lock_cstates_relax(plr); - return ret; -} - -/** - * pseudo_lock_region_clear - Reset pseudo-lock region data - * @plr: pseudo-lock region - * - * All content of the pseudo-locked region is reset - any memory allocated - * freed. - * - * Return: void - */ -static void pseudo_lock_region_clear(struct pseudo_lock_region *plr) -{ - plr->size = 0; - plr->line_size = 0; - kfree(plr->kmem); - plr->kmem = NULL; - plr->s = NULL; - if (plr->d) - plr->d->plr = NULL; - plr->d = NULL; - plr->cbm = 0; - plr->debugfs_dir = NULL; -} - -/** - * pseudo_lock_region_init - Initialize pseudo-lock region information - * @plr: pseudo-lock region - * - * Called after user provided a schemata to be pseudo-locked. From the - * schemata the &struct pseudo_lock_region is on entry already initialized - * with the resource, domain, and capacity bitmask. Here the information - * required for pseudo-locking is deduced from this data and &struct - * pseudo_lock_region initialized further. This information includes: - * - size in bytes of the region to be pseudo-locked - * - cache line size to know the stride with which data needs to be accessed - * to be pseudo-locked - * - a cpu associated with the cache instance on which the pseudo-locking - * flow can be executed - * - * Return: 0 on success, <0 on failure. Descriptive error will be written - * to last_cmd_status buffer. - */ -static int pseudo_lock_region_init(struct pseudo_lock_region *plr) -{ - enum resctrl_scope scope = plr->s->res->ctrl_scope; - struct cacheinfo *ci; - int ret; - - if (WARN_ON_ONCE(scope != RESCTRL_L2_CACHE && scope != RESCTRL_L3_CACHE)) - return -ENODEV; - - /* Pick the first cpu we find that is associated with the cache. */ - plr->cpu = cpumask_first(&plr->d->hdr.cpu_mask); - - if (!cpu_online(plr->cpu)) { - rdt_last_cmd_printf("CPU %u associated with cache not online\n", - plr->cpu); - ret = -ENODEV; - goto out_region; - } - - ci = get_cpu_cacheinfo_level(plr->cpu, scope); - if (ci) { - plr->line_size = ci->coherency_line_size; - plr->size = rdtgroup_cbm_to_size(plr->s->res, plr->d, plr->cbm); - return 0; - } - - ret = -1; - rdt_last_cmd_puts("Unable to determine cache line size\n"); -out_region: - pseudo_lock_region_clear(plr); - return ret; -} - -/** - * pseudo_lock_init - Initialize a pseudo-lock region - * @rdtgrp: resource group to which new pseudo-locked region will belong - * - * A pseudo-locked region is associated with a resource group. When this - * association is created the pseudo-locked region is initialized. The - * details of the pseudo-locked region are not known at this time so only - * allocation is done and association established. - * - * Return: 0 on success, <0 on failure - */ -static int pseudo_lock_init(struct rdtgroup *rdtgrp) -{ - struct pseudo_lock_region *plr; - - plr = kzalloc(sizeof(*plr), GFP_KERNEL); - if (!plr) - return -ENOMEM; - - init_waitqueue_head(&plr->lock_thread_wq); - INIT_LIST_HEAD(&plr->pm_reqs); - rdtgrp->plr = plr; - return 0; -} - -/** - * pseudo_lock_region_alloc - Allocate kernel memory that will be pseudo-locked - * @plr: pseudo-lock region - * - * Initialize the details required to set up the pseudo-locked region and - * allocate the contiguous memory that will be pseudo-locked to the cache. - * - * Return: 0 on success, <0 on failure. Descriptive error will be written - * to last_cmd_status buffer. - */ -static int pseudo_lock_region_alloc(struct pseudo_lock_region *plr) -{ - int ret; - - ret = pseudo_lock_region_init(plr); - if (ret < 0) - return ret; - - /* - * We do not yet support contiguous regions larger than - * KMALLOC_MAX_SIZE. - */ - if (plr->size > KMALLOC_MAX_SIZE) { - rdt_last_cmd_puts("Requested region exceeds maximum size\n"); - ret = -E2BIG; - goto out_region; - } - - plr->kmem = kzalloc(plr->size, GFP_KERNEL); - if (!plr->kmem) { - rdt_last_cmd_puts("Unable to allocate memory\n"); - ret = -ENOMEM; - goto out_region; + prefetch_disable_bits = 0x5; + break; } - ret = 0; - goto out; -out_region: - pseudo_lock_region_clear(plr); -out: - return ret; -} - -/** - * pseudo_lock_free - Free a pseudo-locked region - * @rdtgrp: resource group to which pseudo-locked region belonged - * - * The pseudo-locked region's resources have already been released, or not - * yet created at this point. Now it can be freed and disassociated from the - * resource group. - * - * Return: void - */ -static void pseudo_lock_free(struct rdtgroup *rdtgrp) -{ - pseudo_lock_region_clear(rdtgrp->plr); - kfree(rdtgrp->plr); - rdtgrp->plr = NULL; + return prefetch_disable_bits; } /** - * pseudo_lock_fn - Load kernel memory into cache - * @_rdtgrp: resource group to which pseudo-lock region belongs + * resctrl_arch_pseudo_lock_fn - Load kernel memory into cache + * @_plr: the pseudo-lock region descriptor * * This is the core pseudo-locking flow. * @@ -426,10 +113,9 @@ static void pseudo_lock_free(struct rdtgroup *rdtgrp) * * Return: 0. Waiter on waitqueue will be woken on completion. */ -static int pseudo_lock_fn(void *_rdtgrp) +int resctrl_arch_pseudo_lock_fn(void *_plr) { - struct rdtgroup *rdtgrp = _rdtgrp; - struct pseudo_lock_region *plr = rdtgrp->plr; + struct pseudo_lock_region *plr = _plr; u32 rmid_p, closid_p; unsigned long i; u64 saved_msr; @@ -459,7 +145,7 @@ static int pseudo_lock_fn(void *_rdtgrp) * increase likelihood that allocated cache portion will be filled * with associated memory. */ - native_wbinvd(); + wbinvd(); /* * Always called with interrupts enabled. By disabling interrupts @@ -476,8 +162,8 @@ static int pseudo_lock_fn(void *_rdtgrp) * the buffer and evict pseudo-locked memory read earlier from the * cache. */ - saved_msr = __rdmsr(MSR_MISC_FEATURE_CONTROL); - __wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); + saved_msr = native_rdmsrq(MSR_MISC_FEATURE_CONTROL); + native_wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits); closid_p = this_cpu_read(pqr_state.cur_closid); rmid_p = this_cpu_read(pqr_state.cur_rmid); mem_r = plr->kmem; @@ -489,7 +175,8 @@ static int pseudo_lock_fn(void *_rdtgrp) * pseudo-locked followed by reading of kernel memory to load it * into the cache. */ - __wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, rdtgrp->closid); + native_wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, plr->closid); + /* * Cache was flushed earlier. Now access kernel memory to read it * into cache region associated with just activated plr->closid. @@ -525,10 +212,10 @@ static int pseudo_lock_fn(void *_rdtgrp) * Critical section end: restore closid with capacity bitmask that * does not overlap with pseudo-locked region. */ - __wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, closid_p); + native_wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, closid_p); /* Re-enable the hardware prefetcher(s) */ - wrmsrl(MSR_MISC_FEATURE_CONTROL, saved_msr); + wrmsrq(MSR_MISC_FEATURE_CONTROL, saved_msr); local_irq_enable(); plr->thread_done = 1; @@ -537,342 +224,8 @@ static int pseudo_lock_fn(void *_rdtgrp) } /** - * rdtgroup_monitor_in_progress - Test if monitoring in progress - * @rdtgrp: resource group being queried - * - * Return: 1 if monitor groups have been created for this resource - * group, 0 otherwise. - */ -static int rdtgroup_monitor_in_progress(struct rdtgroup *rdtgrp) -{ - return !list_empty(&rdtgrp->mon.crdtgrp_list); -} - -/** - * rdtgroup_locksetup_user_restrict - Restrict user access to group - * @rdtgrp: resource group needing access restricted - * - * A resource group used for cache pseudo-locking cannot have cpus or tasks - * assigned to it. This is communicated to the user by restricting access - * to all the files that can be used to make such changes. - * - * Permissions restored with rdtgroup_locksetup_user_restore() - * - * Return: 0 on success, <0 on failure. If a failure occurs during the - * restriction of access an attempt will be made to restore permissions but - * the state of the mode of these files will be uncertain when a failure - * occurs. - */ -static int rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp) -{ - int ret; - - ret = rdtgroup_kn_mode_restrict(rdtgrp, "tasks"); - if (ret) - return ret; - - ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus"); - if (ret) - goto err_tasks; - - ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list"); - if (ret) - goto err_cpus; - - if (resctrl_arch_mon_capable()) { - ret = rdtgroup_kn_mode_restrict(rdtgrp, "mon_groups"); - if (ret) - goto err_cpus_list; - } - - ret = 0; - goto out; - -err_cpus_list: - rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777); -err_cpus: - rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777); -err_tasks: - rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777); -out: - return ret; -} - -/** - * rdtgroup_locksetup_user_restore - Restore user access to group - * @rdtgrp: resource group needing access restored - * - * Restore all file access previously removed using - * rdtgroup_locksetup_user_restrict() - * - * Return: 0 on success, <0 on failure. If a failure occurs during the - * restoration of access an attempt will be made to restrict permissions - * again but the state of the mode of these files will be uncertain when - * a failure occurs. - */ -static int rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp) -{ - int ret; - - ret = rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777); - if (ret) - return ret; - - ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777); - if (ret) - goto err_tasks; - - ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777); - if (ret) - goto err_cpus; - - if (resctrl_arch_mon_capable()) { - ret = rdtgroup_kn_mode_restore(rdtgrp, "mon_groups", 0777); - if (ret) - goto err_cpus_list; - } - - ret = 0; - goto out; - -err_cpus_list: - rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list"); -err_cpus: - rdtgroup_kn_mode_restrict(rdtgrp, "cpus"); -err_tasks: - rdtgroup_kn_mode_restrict(rdtgrp, "tasks"); -out: - return ret; -} - -/** - * rdtgroup_locksetup_enter - Resource group enters locksetup mode - * @rdtgrp: resource group requested to enter locksetup mode - * - * A resource group enters locksetup mode to reflect that it would be used - * to represent a pseudo-locked region and is in the process of being set - * up to do so. A resource group used for a pseudo-locked region would - * lose the closid associated with it so we cannot allow it to have any - * tasks or cpus assigned nor permit tasks or cpus to be assigned in the - * future. Monitoring of a pseudo-locked region is not allowed either. - * - * The above and more restrictions on a pseudo-locked region are checked - * for and enforced before the resource group enters the locksetup mode. - * - * Returns: 0 if the resource group successfully entered locksetup mode, <0 - * on failure. On failure the last_cmd_status buffer is updated with text to - * communicate details of failure to the user. - */ -int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp) -{ - int ret; - - /* - * The default resource group can neither be removed nor lose the - * default closid associated with it. - */ - if (rdtgrp == &rdtgroup_default) { - rdt_last_cmd_puts("Cannot pseudo-lock default group\n"); - return -EINVAL; - } - - /* - * Cache Pseudo-locking not supported when CDP is enabled. - * - * Some things to consider if you would like to enable this - * support (using L3 CDP as example): - * - When CDP is enabled two separate resources are exposed, - * L3DATA and L3CODE, but they are actually on the same cache. - * The implication for pseudo-locking is that if a - * pseudo-locked region is created on a domain of one - * resource (eg. L3CODE), then a pseudo-locked region cannot - * be created on that same domain of the other resource - * (eg. L3DATA). This is because the creation of a - * pseudo-locked region involves a call to wbinvd that will - * affect all cache allocations on particular domain. - * - Considering the previous, it may be possible to only - * expose one of the CDP resources to pseudo-locking and - * hide the other. For example, we could consider to only - * expose L3DATA and since the L3 cache is unified it is - * still possible to place instructions there are execute it. - * - If only one region is exposed to pseudo-locking we should - * still keep in mind that availability of a portion of cache - * for pseudo-locking should take into account both resources. - * Similarly, if a pseudo-locked region is created in one - * resource, the portion of cache used by it should be made - * unavailable to all future allocations from both resources. - */ - if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3) || - resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) { - rdt_last_cmd_puts("CDP enabled\n"); - return -EINVAL; - } - - /* - * Not knowing the bits to disable prefetching implies that this - * platform does not support Cache Pseudo-Locking. - */ - prefetch_disable_bits = get_prefetch_disable_bits(); - if (prefetch_disable_bits == 0) { - rdt_last_cmd_puts("Pseudo-locking not supported\n"); - return -EINVAL; - } - - if (rdtgroup_monitor_in_progress(rdtgrp)) { - rdt_last_cmd_puts("Monitoring in progress\n"); - return -EINVAL; - } - - if (rdtgroup_tasks_assigned(rdtgrp)) { - rdt_last_cmd_puts("Tasks assigned to resource group\n"); - return -EINVAL; - } - - if (!cpumask_empty(&rdtgrp->cpu_mask)) { - rdt_last_cmd_puts("CPUs assigned to resource group\n"); - return -EINVAL; - } - - if (rdtgroup_locksetup_user_restrict(rdtgrp)) { - rdt_last_cmd_puts("Unable to modify resctrl permissions\n"); - return -EIO; - } - - ret = pseudo_lock_init(rdtgrp); - if (ret) { - rdt_last_cmd_puts("Unable to init pseudo-lock region\n"); - goto out_release; - } - - /* - * If this system is capable of monitoring a rmid would have been - * allocated when the control group was created. This is not needed - * anymore when this group would be used for pseudo-locking. This - * is safe to call on platforms not capable of monitoring. - */ - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); - - ret = 0; - goto out; - -out_release: - rdtgroup_locksetup_user_restore(rdtgrp); -out: - return ret; -} - -/** - * rdtgroup_locksetup_exit - resource group exist locksetup mode - * @rdtgrp: resource group - * - * When a resource group exits locksetup mode the earlier restrictions are - * lifted. - * - * Return: 0 on success, <0 on failure - */ -int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) -{ - int ret; - - if (resctrl_arch_mon_capable()) { - ret = alloc_rmid(rdtgrp->closid); - if (ret < 0) { - rdt_last_cmd_puts("Out of RMIDs\n"); - return ret; - } - rdtgrp->mon.rmid = ret; - } - - ret = rdtgroup_locksetup_user_restore(rdtgrp); - if (ret) { - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); - return ret; - } - - pseudo_lock_free(rdtgrp); - return 0; -} - -/** - * rdtgroup_cbm_overlaps_pseudo_locked - Test if CBM or portion is pseudo-locked - * @d: RDT domain - * @cbm: CBM to test - * - * @d represents a cache instance and @cbm a capacity bitmask that is - * considered for it. Determine if @cbm overlaps with any existing - * pseudo-locked region on @d. - * - * @cbm is unsigned long, even if only 32 bits are used, to make the - * bitmap functions work correctly. - * - * Return: true if @cbm overlaps with pseudo-locked region on @d, false - * otherwise. - */ -bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm) -{ - unsigned int cbm_len; - unsigned long cbm_b; - - if (d->plr) { - cbm_len = d->plr->s->res->cache.cbm_len; - cbm_b = d->plr->cbm; - if (bitmap_intersects(&cbm, &cbm_b, cbm_len)) - return true; - } - return false; -} - -/** - * rdtgroup_pseudo_locked_in_hierarchy - Pseudo-locked region in cache hierarchy - * @d: RDT domain under test - * - * The setup of a pseudo-locked region affects all cache instances within - * the hierarchy of the region. It is thus essential to know if any - * pseudo-locked regions exist within a cache hierarchy to prevent any - * attempts to create new pseudo-locked regions in the same hierarchy. - * - * Return: true if a pseudo-locked region exists in the hierarchy of @d or - * if it is not possible to test due to memory allocation issue, - * false otherwise. - */ -bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d) -{ - struct rdt_ctrl_domain *d_i; - cpumask_var_t cpu_with_psl; - struct rdt_resource *r; - bool ret = false; - - /* Walking r->domains, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - - if (!zalloc_cpumask_var(&cpu_with_psl, GFP_KERNEL)) - return true; - - /* - * First determine which cpus have pseudo-locked regions - * associated with them. - */ - for_each_alloc_capable_rdt_resource(r) { - list_for_each_entry(d_i, &r->ctrl_domains, hdr.list) { - if (d_i->plr) - cpumask_or(cpu_with_psl, cpu_with_psl, - &d_i->hdr.cpu_mask); - } - } - - /* - * Next test if new pseudo-locked region would intersect with - * existing region. - */ - if (cpumask_intersects(&d->hdr.cpu_mask, cpu_with_psl)) - ret = true; - - free_cpumask_var(cpu_with_psl); - return ret; -} - -/** - * measure_cycles_lat_fn - Measure cycle latency to read pseudo-locked memory + * resctrl_arch_measure_cycles_lat_fn - Measure cycle latency to read + * pseudo-locked memory * @_plr: pseudo-lock region to measure * * There is no deterministic way to test if a memory region is cached. One @@ -885,7 +238,7 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d) * * Return: 0. Waiter on waitqueue will be woken on completion. */ -static int measure_cycles_lat_fn(void *_plr) +int resctrl_arch_measure_cycles_lat_fn(void *_plr) { struct pseudo_lock_region *plr = _plr; u32 saved_low, saved_high; @@ -898,7 +251,7 @@ static int measure_cycles_lat_fn(void *_plr) * Disable hardware prefetchers. */ rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high); - wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); + wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits); mem_r = READ_ONCE(plr->kmem); /* * Dummy execute of the time measurement to load the needed @@ -994,7 +347,7 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr, * Disable hardware prefetchers. */ rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high); - wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); + wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits); /* Initialize rest of local variables */ /* @@ -1012,8 +365,8 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr, * used in L1 cache, second to capture accurate value that does not * include cache misses incurred because of instruction loads. */ - rdpmcl(hit_pmcnum, hits_before); - rdpmcl(miss_pmcnum, miss_before); + hits_before = rdpmc(hit_pmcnum); + miss_before = rdpmc(miss_pmcnum); /* * From SDM: Performing back-to-back fast reads are not guaranteed * to be monotonic. @@ -1021,8 +374,8 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr, * before proceeding. */ rmb(); - rdpmcl(hit_pmcnum, hits_before); - rdpmcl(miss_pmcnum, miss_before); + hits_before = rdpmc(hit_pmcnum); + miss_before = rdpmc(miss_pmcnum); /* * Use LFENCE to ensure all previous instructions are retired * before proceeding. @@ -1044,8 +397,8 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr, * before proceeding. */ rmb(); - rdpmcl(hit_pmcnum, hits_after); - rdpmcl(miss_pmcnum, miss_after); + hits_after = rdpmc(hit_pmcnum); + miss_after = rdpmc(miss_pmcnum); /* * Use LFENCE to ensure all previous instructions are retired * before proceeding. @@ -1069,7 +422,7 @@ out: return 0; } -static int measure_l2_residency(void *_plr) +int resctrl_arch_measure_l2_residency(void *_plr) { struct pseudo_lock_region *plr = _plr; struct residency_counts counts = {0}; @@ -1107,7 +460,7 @@ out: return 0; } -static int measure_l3_residency(void *_plr) +int resctrl_arch_measure_l3_residency(void *_plr) { struct pseudo_lock_region *plr = _plr; struct residency_counts counts = {0}; @@ -1162,440 +515,3 @@ out: wake_up_interruptible(&plr->lock_thread_wq); return 0; } - -/** - * pseudo_lock_measure_cycles - Trigger latency measure to pseudo-locked region - * @rdtgrp: Resource group to which the pseudo-locked region belongs. - * @sel: Selector of which measurement to perform on a pseudo-locked region. - * - * The measurement of latency to access a pseudo-locked region should be - * done from a cpu that is associated with that pseudo-locked region. - * Determine which cpu is associated with this region and start a thread on - * that cpu to perform the measurement, wait for that thread to complete. - * - * Return: 0 on success, <0 on failure - */ -static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel) -{ - struct pseudo_lock_region *plr = rdtgrp->plr; - struct task_struct *thread; - unsigned int cpu; - int ret = -1; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - if (rdtgrp->flags & RDT_DELETED) { - ret = -ENODEV; - goto out; - } - - if (!plr->d) { - ret = -ENODEV; - goto out; - } - - plr->thread_done = 0; - cpu = cpumask_first(&plr->d->hdr.cpu_mask); - if (!cpu_online(cpu)) { - ret = -ENODEV; - goto out; - } - - plr->cpu = cpu; - - if (sel == 1) - thread = kthread_create_on_node(measure_cycles_lat_fn, plr, - cpu_to_node(cpu), - "pseudo_lock_measure/%u", - cpu); - else if (sel == 2) - thread = kthread_create_on_node(measure_l2_residency, plr, - cpu_to_node(cpu), - "pseudo_lock_measure/%u", - cpu); - else if (sel == 3) - thread = kthread_create_on_node(measure_l3_residency, plr, - cpu_to_node(cpu), - "pseudo_lock_measure/%u", - cpu); - else - goto out; - - if (IS_ERR(thread)) { - ret = PTR_ERR(thread); - goto out; - } - kthread_bind(thread, cpu); - wake_up_process(thread); - - ret = wait_event_interruptible(plr->lock_thread_wq, - plr->thread_done == 1); - if (ret < 0) - goto out; - - ret = 0; - -out: - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); - return ret; -} - -static ssize_t pseudo_lock_measure_trigger(struct file *file, - const char __user *user_buf, - size_t count, loff_t *ppos) -{ - struct rdtgroup *rdtgrp = file->private_data; - size_t buf_size; - char buf[32]; - int ret; - int sel; - - buf_size = min(count, (sizeof(buf) - 1)); - if (copy_from_user(buf, user_buf, buf_size)) - return -EFAULT; - - buf[buf_size] = '\0'; - ret = kstrtoint(buf, 10, &sel); - if (ret == 0) { - if (sel != 1 && sel != 2 && sel != 3) - return -EINVAL; - ret = debugfs_file_get(file->f_path.dentry); - if (ret) - return ret; - ret = pseudo_lock_measure_cycles(rdtgrp, sel); - if (ret == 0) - ret = count; - debugfs_file_put(file->f_path.dentry); - } - - return ret; -} - -static const struct file_operations pseudo_measure_fops = { - .write = pseudo_lock_measure_trigger, - .open = simple_open, - .llseek = default_llseek, -}; - -/** - * rdtgroup_pseudo_lock_create - Create a pseudo-locked region - * @rdtgrp: resource group to which pseudo-lock region belongs - * - * Called when a resource group in the pseudo-locksetup mode receives a - * valid schemata that should be pseudo-locked. Since the resource group is - * in pseudo-locksetup mode the &struct pseudo_lock_region has already been - * allocated and initialized with the essential information. If a failure - * occurs the resource group remains in the pseudo-locksetup mode with the - * &struct pseudo_lock_region associated with it, but cleared from all - * information and ready for the user to re-attempt pseudo-locking by - * writing the schemata again. - * - * Return: 0 if the pseudo-locked region was successfully pseudo-locked, <0 - * on failure. Descriptive error will be written to last_cmd_status buffer. - */ -int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) -{ - struct pseudo_lock_region *plr = rdtgrp->plr; - struct task_struct *thread; - unsigned int new_minor; - struct device *dev; - int ret; - - ret = pseudo_lock_region_alloc(plr); - if (ret < 0) - return ret; - - ret = pseudo_lock_cstates_constrain(plr); - if (ret < 0) { - ret = -EINVAL; - goto out_region; - } - - plr->thread_done = 0; - - thread = kthread_create_on_node(pseudo_lock_fn, rdtgrp, - cpu_to_node(plr->cpu), - "pseudo_lock/%u", plr->cpu); - if (IS_ERR(thread)) { - ret = PTR_ERR(thread); - rdt_last_cmd_printf("Locking thread returned error %d\n", ret); - goto out_cstates; - } - - kthread_bind(thread, plr->cpu); - wake_up_process(thread); - - ret = wait_event_interruptible(plr->lock_thread_wq, - plr->thread_done == 1); - if (ret < 0) { - /* - * If the thread does not get on the CPU for whatever - * reason and the process which sets up the region is - * interrupted then this will leave the thread in runnable - * state and once it gets on the CPU it will dereference - * the cleared, but not freed, plr struct resulting in an - * empty pseudo-locking loop. - */ - rdt_last_cmd_puts("Locking thread interrupted\n"); - goto out_cstates; - } - - ret = pseudo_lock_minor_get(&new_minor); - if (ret < 0) { - rdt_last_cmd_puts("Unable to obtain a new minor number\n"); - goto out_cstates; - } - - /* - * Unlock access but do not release the reference. The - * pseudo-locked region will still be here on return. - * - * The mutex has to be released temporarily to avoid a potential - * deadlock with the mm->mmap_lock which is obtained in the - * device_create() and debugfs_create_dir() callpath below as well as - * before the mmap() callback is called. - */ - mutex_unlock(&rdtgroup_mutex); - - if (!IS_ERR_OR_NULL(debugfs_resctrl)) { - plr->debugfs_dir = debugfs_create_dir(rdtgrp->kn->name, - debugfs_resctrl); - if (!IS_ERR_OR_NULL(plr->debugfs_dir)) - debugfs_create_file("pseudo_lock_measure", 0200, - plr->debugfs_dir, rdtgrp, - &pseudo_measure_fops); - } - - dev = device_create(&pseudo_lock_class, NULL, - MKDEV(pseudo_lock_major, new_minor), - rdtgrp, "%s", rdtgrp->kn->name); - - mutex_lock(&rdtgroup_mutex); - - if (IS_ERR(dev)) { - ret = PTR_ERR(dev); - rdt_last_cmd_printf("Failed to create character device: %d\n", - ret); - goto out_debugfs; - } - - /* We released the mutex - check if group was removed while we did so */ - if (rdtgrp->flags & RDT_DELETED) { - ret = -ENODEV; - goto out_device; - } - - plr->minor = new_minor; - - rdtgrp->mode = RDT_MODE_PSEUDO_LOCKED; - closid_free(rdtgrp->closid); - rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0444); - rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0444); - - ret = 0; - goto out; - -out_device: - device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, new_minor)); -out_debugfs: - debugfs_remove_recursive(plr->debugfs_dir); - pseudo_lock_minor_release(new_minor); -out_cstates: - pseudo_lock_cstates_relax(plr); -out_region: - pseudo_lock_region_clear(plr); -out: - return ret; -} - -/** - * rdtgroup_pseudo_lock_remove - Remove a pseudo-locked region - * @rdtgrp: resource group to which the pseudo-locked region belongs - * - * The removal of a pseudo-locked region can be initiated when the resource - * group is removed from user space via a "rmdir" from userspace or the - * unmount of the resctrl filesystem. On removal the resource group does - * not go back to pseudo-locksetup mode before it is removed, instead it is - * removed directly. There is thus asymmetry with the creation where the - * &struct pseudo_lock_region is removed here while it was not created in - * rdtgroup_pseudo_lock_create(). - * - * Return: void - */ -void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) -{ - struct pseudo_lock_region *plr = rdtgrp->plr; - - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - /* - * Default group cannot be a pseudo-locked region so we can - * free closid here. - */ - closid_free(rdtgrp->closid); - goto free; - } - - pseudo_lock_cstates_relax(plr); - debugfs_remove_recursive(rdtgrp->plr->debugfs_dir); - device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, plr->minor)); - pseudo_lock_minor_release(plr->minor); - -free: - pseudo_lock_free(rdtgrp); -} - -static int pseudo_lock_dev_open(struct inode *inode, struct file *filp) -{ - struct rdtgroup *rdtgrp; - - mutex_lock(&rdtgroup_mutex); - - rdtgrp = region_find_by_minor(iminor(inode)); - if (!rdtgrp) { - mutex_unlock(&rdtgroup_mutex); - return -ENODEV; - } - - filp->private_data = rdtgrp; - atomic_inc(&rdtgrp->waitcount); - /* Perform a non-seekable open - llseek is not supported */ - filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE); - - mutex_unlock(&rdtgroup_mutex); - - return 0; -} - -static int pseudo_lock_dev_release(struct inode *inode, struct file *filp) -{ - struct rdtgroup *rdtgrp; - - mutex_lock(&rdtgroup_mutex); - rdtgrp = filp->private_data; - WARN_ON(!rdtgrp); - if (!rdtgrp) { - mutex_unlock(&rdtgroup_mutex); - return -ENODEV; - } - filp->private_data = NULL; - atomic_dec(&rdtgrp->waitcount); - mutex_unlock(&rdtgroup_mutex); - return 0; -} - -static int pseudo_lock_dev_mremap(struct vm_area_struct *area) -{ - /* Not supported */ - return -EINVAL; -} - -static const struct vm_operations_struct pseudo_mmap_ops = { - .mremap = pseudo_lock_dev_mremap, -}; - -static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma) -{ - unsigned long vsize = vma->vm_end - vma->vm_start; - unsigned long off = vma->vm_pgoff << PAGE_SHIFT; - struct pseudo_lock_region *plr; - struct rdtgroup *rdtgrp; - unsigned long physical; - unsigned long psize; - - mutex_lock(&rdtgroup_mutex); - - rdtgrp = filp->private_data; - WARN_ON(!rdtgrp); - if (!rdtgrp) { - mutex_unlock(&rdtgroup_mutex); - return -ENODEV; - } - - plr = rdtgrp->plr; - - if (!plr->d) { - mutex_unlock(&rdtgroup_mutex); - return -ENODEV; - } - - /* - * Task is required to run with affinity to the cpus associated - * with the pseudo-locked region. If this is not the case the task - * may be scheduled elsewhere and invalidate entries in the - * pseudo-locked region. - */ - if (!cpumask_subset(current->cpus_ptr, &plr->d->hdr.cpu_mask)) { - mutex_unlock(&rdtgroup_mutex); - return -EINVAL; - } - - physical = __pa(plr->kmem) >> PAGE_SHIFT; - psize = plr->size - off; - - if (off > plr->size) { - mutex_unlock(&rdtgroup_mutex); - return -ENOSPC; - } - - /* - * Ensure changes are carried directly to the memory being mapped, - * do not allow copy-on-write mapping. - */ - if (!(vma->vm_flags & VM_SHARED)) { - mutex_unlock(&rdtgroup_mutex); - return -EINVAL; - } - - if (vsize > psize) { - mutex_unlock(&rdtgroup_mutex); - return -ENOSPC; - } - - memset(plr->kmem + off, 0, vsize); - - if (remap_pfn_range(vma, vma->vm_start, physical + vma->vm_pgoff, - vsize, vma->vm_page_prot)) { - mutex_unlock(&rdtgroup_mutex); - return -EAGAIN; - } - vma->vm_ops = &pseudo_mmap_ops; - mutex_unlock(&rdtgroup_mutex); - return 0; -} - -static const struct file_operations pseudo_lock_dev_fops = { - .owner = THIS_MODULE, - .read = NULL, - .write = NULL, - .open = pseudo_lock_dev_open, - .release = pseudo_lock_dev_release, - .mmap = pseudo_lock_dev_mmap, -}; - -int rdt_pseudo_lock_init(void) -{ - int ret; - - ret = register_chrdev(0, "pseudo_lock", &pseudo_lock_dev_fops); - if (ret < 0) - return ret; - - pseudo_lock_major = ret; - - ret = class_register(&pseudo_lock_class); - if (ret) { - unregister_chrdev(pseudo_lock_major, "pseudo_lock"); - return ret; - } - - return 0; -} - -void rdt_pseudo_lock_release(void) -{ - class_unregister(&pseudo_lock_class); - unregister_chrdev(pseudo_lock_major, "pseudo_lock"); - pseudo_lock_major = 0; -} diff --git a/arch/x86/kernel/cpu/resctrl/trace.h b/arch/x86/kernel/cpu/resctrl/pseudo_lock_trace.h index 2a506316b303..7c8aef08010f 100644 --- a/arch/x86/kernel/cpu/resctrl/trace.h +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock_trace.h @@ -2,8 +2,8 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM resctrl -#if !defined(_TRACE_RESCTRL_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_RESCTRL_H +#if !defined(_X86_RESCTRL_PSEUDO_LOCK_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _X86_RESCTRL_PSEUDO_LOCK_TRACE_H #include <linux/tracepoint.h> @@ -35,25 +35,11 @@ TRACE_EVENT(pseudo_lock_l3, TP_printk("hits=%llu miss=%llu", __entry->l3_hits, __entry->l3_miss)); -TRACE_EVENT(mon_llc_occupancy_limbo, - TP_PROTO(u32 ctrl_hw_id, u32 mon_hw_id, int domain_id, u64 llc_occupancy_bytes), - TP_ARGS(ctrl_hw_id, mon_hw_id, domain_id, llc_occupancy_bytes), - TP_STRUCT__entry(__field(u32, ctrl_hw_id) - __field(u32, mon_hw_id) - __field(int, domain_id) - __field(u64, llc_occupancy_bytes)), - TP_fast_assign(__entry->ctrl_hw_id = ctrl_hw_id; - __entry->mon_hw_id = mon_hw_id; - __entry->domain_id = domain_id; - __entry->llc_occupancy_bytes = llc_occupancy_bytes;), - TP_printk("ctrl_hw_id=%u mon_hw_id=%u domain_id=%d llc_occupancy_bytes=%llu", - __entry->ctrl_hw_id, __entry->mon_hw_id, __entry->domain_id, - __entry->llc_occupancy_bytes) - ); - -#endif /* _TRACE_RESCTRL_H */ +#endif /* _X86_RESCTRL_PSEUDO_LOCK_TRACE_H */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . -#define TRACE_INCLUDE_FILE trace + +#define TRACE_INCLUDE_FILE pseudo_lock_trace + #include <trace/define_trace.h> diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index d906a1cd8491..885026468440 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -18,6 +18,7 @@ #include <linux/fs_parser.h> #include <linux/sysfs.h> #include <linux/kernfs.h> +#include <linux/resctrl.h> #include <linux/seq_buf.h> #include <linux/seq_file.h> #include <linux/sched/signal.h> @@ -28,324 +29,28 @@ #include <uapi/linux/magic.h> -#include <asm/resctrl.h> +#include <asm/msr.h> #include "internal.h" DEFINE_STATIC_KEY_FALSE(rdt_enable_key); -DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key); -DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key); - -/* Mutex to protect rdtgroup access. */ -DEFINE_MUTEX(rdtgroup_mutex); - -static struct kernfs_root *rdt_root; -struct rdtgroup rdtgroup_default; -LIST_HEAD(rdt_all_groups); - -/* list of entries for the schemata file */ -LIST_HEAD(resctrl_schema_all); - -/* The filesystem can only be mounted once. */ -bool resctrl_mounted; - -/* Kernel fs node for "info" directory under root */ -static struct kernfs_node *kn_info; - -/* Kernel fs node for "mon_groups" directory under root */ -static struct kernfs_node *kn_mongrp; - -/* Kernel fs node for "mon_data" directory under root */ -static struct kernfs_node *kn_mondata; - -static struct seq_buf last_cmd_status; -static char last_cmd_status_buf[512]; - -static int rdtgroup_setup_root(struct rdt_fs_context *ctx); -static void rdtgroup_destroy_root(void); - -struct dentry *debugfs_resctrl; - -static bool resctrl_debug; - -void rdt_last_cmd_clear(void) -{ - lockdep_assert_held(&rdtgroup_mutex); - seq_buf_clear(&last_cmd_status); -} - -void rdt_last_cmd_puts(const char *s) -{ - lockdep_assert_held(&rdtgroup_mutex); - seq_buf_puts(&last_cmd_status, s); -} - -void rdt_last_cmd_printf(const char *fmt, ...) -{ - va_list ap; - - va_start(ap, fmt); - lockdep_assert_held(&rdtgroup_mutex); - seq_buf_vprintf(&last_cmd_status, fmt, ap); - va_end(ap); -} - -void rdt_staged_configs_clear(void) -{ - struct rdt_ctrl_domain *dom; - struct rdt_resource *r; - - lockdep_assert_held(&rdtgroup_mutex); - - for_each_alloc_capable_rdt_resource(r) { - list_for_each_entry(dom, &r->ctrl_domains, hdr.list) - memset(dom->staged_config, 0, sizeof(dom->staged_config)); - } -} - -/* - * Trivial allocator for CLOSIDs. Since h/w only supports a small number, - * we can keep a bitmap of free CLOSIDs in a single integer. - * - * Using a global CLOSID across all resources has some advantages and - * some drawbacks: - * + We can simply set current's closid to assign a task to a resource - * group. - * + Context switch code can avoid extra memory references deciding which - * CLOSID to load into the PQR_ASSOC MSR - * - We give up some options in configuring resource groups across multi-socket - * systems. - * - Our choices on how to configure each resource become progressively more - * limited as the number of resources grows. - */ -static unsigned long closid_free_map; -static int closid_free_map_len; - -int closids_supported(void) -{ - return closid_free_map_len; -} - -static void closid_init(void) -{ - struct resctrl_schema *s; - u32 rdt_min_closid = 32; - - /* Compute rdt_min_closid across all resources */ - list_for_each_entry(s, &resctrl_schema_all, list) - rdt_min_closid = min(rdt_min_closid, s->num_closid); - - closid_free_map = BIT_MASK(rdt_min_closid) - 1; - - /* RESCTRL_RESERVED_CLOSID is always reserved for the default group */ - __clear_bit(RESCTRL_RESERVED_CLOSID, &closid_free_map); - closid_free_map_len = rdt_min_closid; -} - -static int closid_alloc(void) -{ - int cleanest_closid; - u32 closid; - - lockdep_assert_held(&rdtgroup_mutex); - - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { - cleanest_closid = resctrl_find_cleanest_closid(); - if (cleanest_closid < 0) - return cleanest_closid; - closid = cleanest_closid; - } else { - closid = ffs(closid_free_map); - if (closid == 0) - return -ENOSPC; - closid--; - } - __clear_bit(closid, &closid_free_map); - - return closid; -} - -void closid_free(int closid) -{ - lockdep_assert_held(&rdtgroup_mutex); - - __set_bit(closid, &closid_free_map); -} - -/** - * closid_allocated - test if provided closid is in use - * @closid: closid to be tested - * - * Return: true if @closid is currently associated with a resource group, - * false if @closid is free - */ -bool closid_allocated(unsigned int closid) -{ - lockdep_assert_held(&rdtgroup_mutex); - - return !test_bit(closid, &closid_free_map); -} - -/** - * rdtgroup_mode_by_closid - Return mode of resource group with closid - * @closid: closid if the resource group - * - * Each resource group is associated with a @closid. Here the mode - * of a resource group can be queried by searching for it using its closid. - * - * Return: mode as &enum rdtgrp_mode of resource group with closid @closid - */ -enum rdtgrp_mode rdtgroup_mode_by_closid(int closid) -{ - struct rdtgroup *rdtgrp; - - list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { - if (rdtgrp->closid == closid) - return rdtgrp->mode; - } - - return RDT_NUM_MODES; -} - -static const char * const rdt_mode_str[] = { - [RDT_MODE_SHAREABLE] = "shareable", - [RDT_MODE_EXCLUSIVE] = "exclusive", - [RDT_MODE_PSEUDO_LOCKSETUP] = "pseudo-locksetup", - [RDT_MODE_PSEUDO_LOCKED] = "pseudo-locked", -}; - -/** - * rdtgroup_mode_str - Return the string representation of mode - * @mode: the resource group mode as &enum rdtgroup_mode - * - * Return: string representation of valid mode, "unknown" otherwise - */ -static const char *rdtgroup_mode_str(enum rdtgrp_mode mode) -{ - if (mode < RDT_MODE_SHAREABLE || mode >= RDT_NUM_MODES) - return "unknown"; - - return rdt_mode_str[mode]; -} - -/* set uid and gid of rdtgroup dirs and files to that of the creator */ -static int rdtgroup_kn_set_ugid(struct kernfs_node *kn) -{ - struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, - .ia_uid = current_fsuid(), - .ia_gid = current_fsgid(), }; - - if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) && - gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID)) - return 0; - - return kernfs_setattr(kn, &iattr); -} - -static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft) -{ - struct kernfs_node *kn; - int ret; - - kn = __kernfs_create_file(parent_kn, rft->name, rft->mode, - GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, - 0, rft->kf_ops, rft, NULL, NULL); - if (IS_ERR(kn)) - return PTR_ERR(kn); - - ret = rdtgroup_kn_set_ugid(kn); - if (ret) { - kernfs_remove(kn); - return ret; - } - - return 0; -} - -static int rdtgroup_seqfile_show(struct seq_file *m, void *arg) -{ - struct kernfs_open_file *of = m->private; - struct rftype *rft = of->kn->priv; - - if (rft->seq_show) - return rft->seq_show(of, m, arg); - return 0; -} - -static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf, - size_t nbytes, loff_t off) -{ - struct rftype *rft = of->kn->priv; - - if (rft->write) - return rft->write(of, buf, nbytes, off); - - return -EINVAL; -} -static const struct kernfs_ops rdtgroup_kf_single_ops = { - .atomic_write_len = PAGE_SIZE, - .write = rdtgroup_file_write, - .seq_show = rdtgroup_seqfile_show, -}; - -static const struct kernfs_ops kf_mondata_ops = { - .atomic_write_len = PAGE_SIZE, - .seq_show = rdtgroup_mondata_show, -}; - -static bool is_cpu_list(struct kernfs_open_file *of) -{ - struct rftype *rft = of->kn->priv; - - return rft->flags & RFTYPE_FLAGS_CPUS_LIST; -} - -static int rdtgroup_cpus_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct rdtgroup *rdtgrp; - struct cpumask *mask; - int ret = 0; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - - if (rdtgrp) { - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { - if (!rdtgrp->plr->d) { - rdt_last_cmd_clear(); - rdt_last_cmd_puts("Cache domain offline\n"); - ret = -ENODEV; - } else { - mask = &rdtgrp->plr->d->hdr.cpu_mask; - seq_printf(s, is_cpu_list(of) ? - "%*pbl\n" : "%*pb\n", - cpumask_pr_args(mask)); - } - } else { - seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n", - cpumask_pr_args(&rdtgrp->cpu_mask)); - } - } else { - ret = -ENOENT; - } - rdtgroup_kn_unlock(of->kn); +DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key); - return ret; -} +DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key); /* - * This is safe against resctrl_sched_in() called from __switch_to() + * This is safe against resctrl_arch_sched_in() called from __switch_to() * because __switch_to() is executed with interrupts disabled. A local call * from update_closid_rmid() is protected against __switch_to() because * preemption is disabled. */ -static void update_cpu_closid_rmid(void *info) +void resctrl_arch_sync_cpu_closid_rmid(void *info) { - struct rdtgroup *r = info; + struct resctrl_cpu_defaults *r = info; if (r) { this_cpu_write(pqr_state.default_closid, r->closid); - this_cpu_write(pqr_state.default_rmid, r->mon.rmid); + this_cpu_write(pqr_state.default_rmid, r->rmid); } /* @@ -353,1201 +58,9 @@ static void update_cpu_closid_rmid(void *info) * executing task might have its own closid selected. Just reuse * the context switch code. */ - resctrl_sched_in(current); -} - -/* - * Update the PGR_ASSOC MSR on all cpus in @cpu_mask, - * - * Per task closids/rmids must have been set up before calling this function. - */ -static void -update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r) -{ - on_each_cpu_mask(cpu_mask, update_cpu_closid_rmid, r, 1); -} - -static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, - cpumask_var_t tmpmask) -{ - struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp; - struct list_head *head; - - /* Check whether cpus belong to parent ctrl group */ - cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask); - if (!cpumask_empty(tmpmask)) { - rdt_last_cmd_puts("Can only add CPUs to mongroup that belong to parent\n"); - return -EINVAL; - } - - /* Check whether cpus are dropped from this group */ - cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); - if (!cpumask_empty(tmpmask)) { - /* Give any dropped cpus to parent rdtgroup */ - cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask); - update_closid_rmid(tmpmask, prgrp); - } - - /* - * If we added cpus, remove them from previous group that owned them - * and update per-cpu rmid - */ - cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); - if (!cpumask_empty(tmpmask)) { - head = &prgrp->mon.crdtgrp_list; - list_for_each_entry(crgrp, head, mon.crdtgrp_list) { - if (crgrp == rdtgrp) - continue; - cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask, - tmpmask); - } - update_closid_rmid(tmpmask, rdtgrp); - } - - /* Done pushing/pulling - update this group with new mask */ - cpumask_copy(&rdtgrp->cpu_mask, newmask); - - return 0; -} - -static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m) -{ - struct rdtgroup *crgrp; - - cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m); - /* update the child mon group masks as well*/ - list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list) - cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask); -} - -static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, - cpumask_var_t tmpmask, cpumask_var_t tmpmask1) -{ - struct rdtgroup *r, *crgrp; - struct list_head *head; - - /* Check whether cpus are dropped from this group */ - cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); - if (!cpumask_empty(tmpmask)) { - /* Can't drop from default group */ - if (rdtgrp == &rdtgroup_default) { - rdt_last_cmd_puts("Can't drop CPUs from default group\n"); - return -EINVAL; - } - - /* Give any dropped cpus to rdtgroup_default */ - cpumask_or(&rdtgroup_default.cpu_mask, - &rdtgroup_default.cpu_mask, tmpmask); - update_closid_rmid(tmpmask, &rdtgroup_default); - } - - /* - * If we added cpus, remove them from previous group and - * the prev group's child groups that owned them - * and update per-cpu closid/rmid. - */ - cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); - if (!cpumask_empty(tmpmask)) { - list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { - if (r == rdtgrp) - continue; - cpumask_and(tmpmask1, &r->cpu_mask, tmpmask); - if (!cpumask_empty(tmpmask1)) - cpumask_rdtgrp_clear(r, tmpmask1); - } - update_closid_rmid(tmpmask, rdtgrp); - } - - /* Done pushing/pulling - update this group with new mask */ - cpumask_copy(&rdtgrp->cpu_mask, newmask); - - /* - * Clear child mon group masks since there is a new parent mask - * now and update the rmid for the cpus the child lost. - */ - head = &rdtgrp->mon.crdtgrp_list; - list_for_each_entry(crgrp, head, mon.crdtgrp_list) { - cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask); - update_closid_rmid(tmpmask, rdtgrp); - cpumask_clear(&crgrp->cpu_mask); - } - - return 0; -} - -static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - cpumask_var_t tmpmask, newmask, tmpmask1; - struct rdtgroup *rdtgrp; - int ret; - - if (!buf) - return -EINVAL; - - if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) - return -ENOMEM; - if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) { - free_cpumask_var(tmpmask); - return -ENOMEM; - } - if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) { - free_cpumask_var(tmpmask); - free_cpumask_var(newmask); - return -ENOMEM; - } - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - ret = -ENOENT; - goto unlock; - } - - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED || - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - ret = -EINVAL; - rdt_last_cmd_puts("Pseudo-locking in progress\n"); - goto unlock; - } - - if (is_cpu_list(of)) - ret = cpulist_parse(buf, newmask); - else - ret = cpumask_parse(buf, newmask); - - if (ret) { - rdt_last_cmd_puts("Bad CPU list/mask\n"); - goto unlock; - } - - /* check that user didn't specify any offline cpus */ - cpumask_andnot(tmpmask, newmask, cpu_online_mask); - if (!cpumask_empty(tmpmask)) { - ret = -EINVAL; - rdt_last_cmd_puts("Can only assign online CPUs\n"); - goto unlock; - } - - if (rdtgrp->type == RDTCTRL_GROUP) - ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1); - else if (rdtgrp->type == RDTMON_GROUP) - ret = cpus_mon_write(rdtgrp, newmask, tmpmask); - else - ret = -EINVAL; - -unlock: - rdtgroup_kn_unlock(of->kn); - free_cpumask_var(tmpmask); - free_cpumask_var(newmask); - free_cpumask_var(tmpmask1); - - return ret ?: nbytes; -} - -/** - * rdtgroup_remove - the helper to remove resource group safely - * @rdtgrp: resource group to remove - * - * On resource group creation via a mkdir, an extra kernfs_node reference is - * taken to ensure that the rdtgroup structure remains accessible for the - * rdtgroup_kn_unlock() calls where it is removed. - * - * Drop the extra reference here, then free the rdtgroup structure. - * - * Return: void - */ -static void rdtgroup_remove(struct rdtgroup *rdtgrp) -{ - kernfs_put(rdtgrp->kn); - kfree(rdtgrp); -} - -static void _update_task_closid_rmid(void *task) -{ - /* - * If the task is still current on this CPU, update PQR_ASSOC MSR. - * Otherwise, the MSR is updated when the task is scheduled in. - */ - if (task == current) - resctrl_sched_in(task); -} - -static void update_task_closid_rmid(struct task_struct *t) -{ - if (IS_ENABLED(CONFIG_SMP) && task_curr(t)) - smp_call_function_single(task_cpu(t), _update_task_closid_rmid, t, 1); - else - _update_task_closid_rmid(t); -} - -static bool task_in_rdtgroup(struct task_struct *tsk, struct rdtgroup *rdtgrp) -{ - u32 closid, rmid = rdtgrp->mon.rmid; - - if (rdtgrp->type == RDTCTRL_GROUP) - closid = rdtgrp->closid; - else if (rdtgrp->type == RDTMON_GROUP) - closid = rdtgrp->mon.parent->closid; - else - return false; - - return resctrl_arch_match_closid(tsk, closid) && - resctrl_arch_match_rmid(tsk, closid, rmid); -} - -static int __rdtgroup_move_task(struct task_struct *tsk, - struct rdtgroup *rdtgrp) -{ - /* If the task is already in rdtgrp, no need to move the task. */ - if (task_in_rdtgroup(tsk, rdtgrp)) - return 0; - - /* - * Set the task's closid/rmid before the PQR_ASSOC MSR can be - * updated by them. - * - * For ctrl_mon groups, move both closid and rmid. - * For monitor groups, can move the tasks only from - * their parent CTRL group. - */ - if (rdtgrp->type == RDTMON_GROUP && - !resctrl_arch_match_closid(tsk, rdtgrp->mon.parent->closid)) { - rdt_last_cmd_puts("Can't move task to different control group\n"); - return -EINVAL; - } - - if (rdtgrp->type == RDTMON_GROUP) - resctrl_arch_set_closid_rmid(tsk, rdtgrp->mon.parent->closid, - rdtgrp->mon.rmid); - else - resctrl_arch_set_closid_rmid(tsk, rdtgrp->closid, - rdtgrp->mon.rmid); - - /* - * Ensure the task's closid and rmid are written before determining if - * the task is current that will decide if it will be interrupted. - * This pairs with the full barrier between the rq->curr update and - * resctrl_sched_in() during context switch. - */ - smp_mb(); - - /* - * By now, the task's closid and rmid are set. If the task is current - * on a CPU, the PQR_ASSOC MSR needs to be updated to make the resource - * group go into effect. If the task is not current, the MSR will be - * updated when the task is scheduled in. - */ - update_task_closid_rmid(tsk); - - return 0; -} - -static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) -{ - return (resctrl_arch_alloc_capable() && (r->type == RDTCTRL_GROUP) && - resctrl_arch_match_closid(t, r->closid)); -} - -static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r) -{ - return (resctrl_arch_mon_capable() && (r->type == RDTMON_GROUP) && - resctrl_arch_match_rmid(t, r->mon.parent->closid, - r->mon.rmid)); -} - -/** - * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group - * @r: Resource group - * - * Return: 1 if tasks have been assigned to @r, 0 otherwise - */ -int rdtgroup_tasks_assigned(struct rdtgroup *r) -{ - struct task_struct *p, *t; - int ret = 0; - - lockdep_assert_held(&rdtgroup_mutex); - - rcu_read_lock(); - for_each_process_thread(p, t) { - if (is_closid_match(t, r) || is_rmid_match(t, r)) { - ret = 1; - break; - } - } - rcu_read_unlock(); - - return ret; -} - -static int rdtgroup_task_write_permission(struct task_struct *task, - struct kernfs_open_file *of) -{ - const struct cred *tcred = get_task_cred(task); - const struct cred *cred = current_cred(); - int ret = 0; - - /* - * Even if we're attaching all tasks in the thread group, we only - * need to check permissions on one of them. - */ - if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && - !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->euid, tcred->suid)) { - rdt_last_cmd_printf("No permission to move task %d\n", task->pid); - ret = -EPERM; - } - - put_cred(tcred); - return ret; -} - -static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp, - struct kernfs_open_file *of) -{ - struct task_struct *tsk; - int ret; - - rcu_read_lock(); - if (pid) { - tsk = find_task_by_vpid(pid); - if (!tsk) { - rcu_read_unlock(); - rdt_last_cmd_printf("No task %d\n", pid); - return -ESRCH; - } - } else { - tsk = current; - } - - get_task_struct(tsk); - rcu_read_unlock(); - - ret = rdtgroup_task_write_permission(tsk, of); - if (!ret) - ret = __rdtgroup_move_task(tsk, rdtgrp); - - put_task_struct(tsk); - return ret; -} - -static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct rdtgroup *rdtgrp; - char *pid_str; - int ret = 0; - pid_t pid; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - rdtgroup_kn_unlock(of->kn); - return -ENOENT; - } - rdt_last_cmd_clear(); - - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED || - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - ret = -EINVAL; - rdt_last_cmd_puts("Pseudo-locking in progress\n"); - goto unlock; - } - - while (buf && buf[0] != '\0' && buf[0] != '\n') { - pid_str = strim(strsep(&buf, ",")); - - if (kstrtoint(pid_str, 0, &pid)) { - rdt_last_cmd_printf("Task list parsing error pid %s\n", pid_str); - ret = -EINVAL; - break; - } - - if (pid < 0) { - rdt_last_cmd_printf("Invalid pid %d\n", pid); - ret = -EINVAL; - break; - } - - ret = rdtgroup_move_task(pid, rdtgrp, of); - if (ret) { - rdt_last_cmd_printf("Error while processing task %d\n", pid); - break; - } - } - -unlock: - rdtgroup_kn_unlock(of->kn); - - return ret ?: nbytes; -} - -static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) -{ - struct task_struct *p, *t; - pid_t pid; - - rcu_read_lock(); - for_each_process_thread(p, t) { - if (is_closid_match(t, r) || is_rmid_match(t, r)) { - pid = task_pid_vnr(t); - if (pid) - seq_printf(s, "%d\n", pid); - } - } - rcu_read_unlock(); -} - -static int rdtgroup_tasks_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct rdtgroup *rdtgrp; - int ret = 0; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (rdtgrp) - show_rdt_tasks(rdtgrp, s); - else - ret = -ENOENT; - rdtgroup_kn_unlock(of->kn); - - return ret; -} - -static int rdtgroup_closid_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct rdtgroup *rdtgrp; - int ret = 0; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (rdtgrp) - seq_printf(s, "%u\n", rdtgrp->closid); - else - ret = -ENOENT; - rdtgroup_kn_unlock(of->kn); - - return ret; -} - -static int rdtgroup_rmid_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct rdtgroup *rdtgrp; - int ret = 0; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (rdtgrp) - seq_printf(s, "%u\n", rdtgrp->mon.rmid); - else - ret = -ENOENT; - rdtgroup_kn_unlock(of->kn); - - return ret; -} - -#ifdef CONFIG_PROC_CPU_RESCTRL - -/* - * A task can only be part of one resctrl control group and of one monitor - * group which is associated to that control group. - * - * 1) res: - * mon: - * - * resctrl is not available. - * - * 2) res:/ - * mon: - * - * Task is part of the root resctrl control group, and it is not associated - * to any monitor group. - * - * 3) res:/ - * mon:mon0 - * - * Task is part of the root resctrl control group and monitor group mon0. - * - * 4) res:group0 - * mon: - * - * Task is part of resctrl control group group0, and it is not associated - * to any monitor group. - * - * 5) res:group0 - * mon:mon1 - * - * Task is part of resctrl control group group0 and monitor group mon1. - */ -int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns, - struct pid *pid, struct task_struct *tsk) -{ - struct rdtgroup *rdtg; - int ret = 0; - - mutex_lock(&rdtgroup_mutex); - - /* Return empty if resctrl has not been mounted. */ - if (!resctrl_mounted) { - seq_puts(s, "res:\nmon:\n"); - goto unlock; - } - - list_for_each_entry(rdtg, &rdt_all_groups, rdtgroup_list) { - struct rdtgroup *crg; - - /* - * Task information is only relevant for shareable - * and exclusive groups. - */ - if (rdtg->mode != RDT_MODE_SHAREABLE && - rdtg->mode != RDT_MODE_EXCLUSIVE) - continue; - - if (!resctrl_arch_match_closid(tsk, rdtg->closid)) - continue; - - seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? "/" : "", - rdtg->kn->name); - seq_puts(s, "mon:"); - list_for_each_entry(crg, &rdtg->mon.crdtgrp_list, - mon.crdtgrp_list) { - if (!resctrl_arch_match_rmid(tsk, crg->mon.parent->closid, - crg->mon.rmid)) - continue; - seq_printf(s, "%s", crg->kn->name); - break; - } - seq_putc(s, '\n'); - goto unlock; - } - /* - * The above search should succeed. Otherwise return - * with an error. - */ - ret = -ENOENT; -unlock: - mutex_unlock(&rdtgroup_mutex); - - return ret; -} -#endif - -static int rdt_last_cmd_status_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - int len; - - mutex_lock(&rdtgroup_mutex); - len = seq_buf_used(&last_cmd_status); - if (len) - seq_printf(seq, "%.*s", len, last_cmd_status_buf); - else - seq_puts(seq, "ok\n"); - mutex_unlock(&rdtgroup_mutex); - return 0; -} - -static int rdt_num_closids_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = of->kn->parent->priv; - - seq_printf(seq, "%u\n", s->num_closid); - return 0; + resctrl_arch_sched_in(current); } -static int rdt_default_ctrl_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = of->kn->parent->priv; - struct rdt_resource *r = s->res; - - seq_printf(seq, "%x\n", r->default_ctrl); - return 0; -} - -static int rdt_min_cbm_bits_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = of->kn->parent->priv; - struct rdt_resource *r = s->res; - - seq_printf(seq, "%u\n", r->cache.min_cbm_bits); - return 0; -} - -static int rdt_shareable_bits_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = of->kn->parent->priv; - struct rdt_resource *r = s->res; - - seq_printf(seq, "%x\n", r->cache.shareable_bits); - return 0; -} - -/* - * rdt_bit_usage_show - Display current usage of resources - * - * A domain is a shared resource that can now be allocated differently. Here - * we display the current regions of the domain as an annotated bitmask. - * For each domain of this resource its allocation bitmask - * is annotated as below to indicate the current usage of the corresponding bit: - * 0 - currently unused - * X - currently available for sharing and used by software and hardware - * H - currently used by hardware only but available for software use - * S - currently used and shareable by software only - * E - currently used exclusively by one resource group - * P - currently pseudo-locked by one resource group - */ -static int rdt_bit_usage_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = of->kn->parent->priv; - /* - * Use unsigned long even though only 32 bits are used to ensure - * test_bit() is used safely. - */ - unsigned long sw_shareable = 0, hw_shareable = 0; - unsigned long exclusive = 0, pseudo_locked = 0; - struct rdt_resource *r = s->res; - struct rdt_ctrl_domain *dom; - int i, hwb, swb, excl, psl; - enum rdtgrp_mode mode; - bool sep = false; - u32 ctrl_val; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - hw_shareable = r->cache.shareable_bits; - list_for_each_entry(dom, &r->ctrl_domains, hdr.list) { - if (sep) - seq_putc(seq, ';'); - sw_shareable = 0; - exclusive = 0; - seq_printf(seq, "%d=", dom->hdr.id); - for (i = 0; i < closids_supported(); i++) { - if (!closid_allocated(i)) - continue; - ctrl_val = resctrl_arch_get_config(r, dom, i, - s->conf_type); - mode = rdtgroup_mode_by_closid(i); - switch (mode) { - case RDT_MODE_SHAREABLE: - sw_shareable |= ctrl_val; - break; - case RDT_MODE_EXCLUSIVE: - exclusive |= ctrl_val; - break; - case RDT_MODE_PSEUDO_LOCKSETUP: - /* - * RDT_MODE_PSEUDO_LOCKSETUP is possible - * here but not included since the CBM - * associated with this CLOSID in this mode - * is not initialized and no task or cpu can be - * assigned this CLOSID. - */ - break; - case RDT_MODE_PSEUDO_LOCKED: - case RDT_NUM_MODES: - WARN(1, - "invalid mode for closid %d\n", i); - break; - } - } - for (i = r->cache.cbm_len - 1; i >= 0; i--) { - pseudo_locked = dom->plr ? dom->plr->cbm : 0; - hwb = test_bit(i, &hw_shareable); - swb = test_bit(i, &sw_shareable); - excl = test_bit(i, &exclusive); - psl = test_bit(i, &pseudo_locked); - if (hwb && swb) - seq_putc(seq, 'X'); - else if (hwb && !swb) - seq_putc(seq, 'H'); - else if (!hwb && swb) - seq_putc(seq, 'S'); - else if (excl) - seq_putc(seq, 'E'); - else if (psl) - seq_putc(seq, 'P'); - else /* Unused bits remain */ - seq_putc(seq, '0'); - } - sep = true; - } - seq_putc(seq, '\n'); - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); - return 0; -} - -static int rdt_min_bw_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = of->kn->parent->priv; - struct rdt_resource *r = s->res; - - seq_printf(seq, "%u\n", r->membw.min_bw); - return 0; -} - -static int rdt_num_rmids_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct rdt_resource *r = of->kn->parent->priv; - - seq_printf(seq, "%d\n", r->num_rmid); - - return 0; -} - -static int rdt_mon_features_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct rdt_resource *r = of->kn->parent->priv; - struct mon_evt *mevt; - - list_for_each_entry(mevt, &r->evt_list, list) { - seq_printf(seq, "%s\n", mevt->name); - if (mevt->configurable) - seq_printf(seq, "%s_config\n", mevt->name); - } - - return 0; -} - -static int rdt_bw_gran_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = of->kn->parent->priv; - struct rdt_resource *r = s->res; - - seq_printf(seq, "%u\n", r->membw.bw_gran); - return 0; -} - -static int rdt_delay_linear_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = of->kn->parent->priv; - struct rdt_resource *r = s->res; - - seq_printf(seq, "%u\n", r->membw.delay_linear); - return 0; -} - -static int max_threshold_occ_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - seq_printf(seq, "%u\n", resctrl_rmid_realloc_threshold); - - return 0; -} - -static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = of->kn->parent->priv; - struct rdt_resource *r = s->res; - - if (r->membw.throttle_mode == THREAD_THROTTLE_PER_THREAD) - seq_puts(seq, "per-thread\n"); - else - seq_puts(seq, "max\n"); - - return 0; -} - -static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - unsigned int bytes; - int ret; - - ret = kstrtouint(buf, 0, &bytes); - if (ret) - return ret; - - if (bytes > resctrl_rmid_realloc_limit) - return -EINVAL; - - resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(bytes); - - return nbytes; -} - -/* - * rdtgroup_mode_show - Display mode of this resource group - */ -static int rdtgroup_mode_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct rdtgroup *rdtgrp; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - rdtgroup_kn_unlock(of->kn); - return -ENOENT; - } - - seq_printf(s, "%s\n", rdtgroup_mode_str(rdtgrp->mode)); - - rdtgroup_kn_unlock(of->kn); - return 0; -} - -static enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type) -{ - switch (my_type) { - case CDP_CODE: - return CDP_DATA; - case CDP_DATA: - return CDP_CODE; - default: - case CDP_NONE: - return CDP_NONE; - } -} - -static int rdt_has_sparse_bitmasks_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = of->kn->parent->priv; - struct rdt_resource *r = s->res; - - seq_printf(seq, "%u\n", r->cache.arch_has_sparse_bitmasks); - - return 0; -} - -/** - * __rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other - * @r: Resource to which domain instance @d belongs. - * @d: The domain instance for which @closid is being tested. - * @cbm: Capacity bitmask being tested. - * @closid: Intended closid for @cbm. - * @type: CDP type of @r. - * @exclusive: Only check if overlaps with exclusive resource groups - * - * Checks if provided @cbm intended to be used for @closid on domain - * @d overlaps with any other closids or other hardware usage associated - * with this domain. If @exclusive is true then only overlaps with - * resource groups in exclusive mode will be considered. If @exclusive - * is false then overlaps with any resource group or hardware entities - * will be considered. - * - * @cbm is unsigned long, even if only 32 bits are used, to make the - * bitmap functions work correctly. - * - * Return: false if CBM does not overlap, true if it does. - */ -static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_ctrl_domain *d, - unsigned long cbm, int closid, - enum resctrl_conf_type type, bool exclusive) -{ - enum rdtgrp_mode mode; - unsigned long ctrl_b; - int i; - - /* Check for any overlap with regions used by hardware directly */ - if (!exclusive) { - ctrl_b = r->cache.shareable_bits; - if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) - return true; - } - - /* Check for overlap with other resource groups */ - for (i = 0; i < closids_supported(); i++) { - ctrl_b = resctrl_arch_get_config(r, d, i, type); - mode = rdtgroup_mode_by_closid(i); - if (closid_allocated(i) && i != closid && - mode != RDT_MODE_PSEUDO_LOCKSETUP) { - if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) { - if (exclusive) { - if (mode == RDT_MODE_EXCLUSIVE) - return true; - continue; - } - return true; - } - } - } - - return false; -} - -/** - * rdtgroup_cbm_overlaps - Does CBM overlap with other use of hardware - * @s: Schema for the resource to which domain instance @d belongs. - * @d: The domain instance for which @closid is being tested. - * @cbm: Capacity bitmask being tested. - * @closid: Intended closid for @cbm. - * @exclusive: Only check if overlaps with exclusive resource groups - * - * Resources that can be allocated using a CBM can use the CBM to control - * the overlap of these allocations. rdtgroup_cmb_overlaps() is the test - * for overlap. Overlap test is not limited to the specific resource for - * which the CBM is intended though - when dealing with CDP resources that - * share the underlying hardware the overlap check should be performed on - * the CDP resource sharing the hardware also. - * - * Refer to description of __rdtgroup_cbm_overlaps() for the details of the - * overlap test. - * - * Return: true if CBM overlap detected, false if there is no overlap - */ -bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d, - unsigned long cbm, int closid, bool exclusive) -{ - enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type); - struct rdt_resource *r = s->res; - - if (__rdtgroup_cbm_overlaps(r, d, cbm, closid, s->conf_type, - exclusive)) - return true; - - if (!resctrl_arch_get_cdp_enabled(r->rid)) - return false; - return __rdtgroup_cbm_overlaps(r, d, cbm, closid, peer_type, exclusive); -} - -/** - * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive - * @rdtgrp: Resource group identified through its closid. - * - * An exclusive resource group implies that there should be no sharing of - * its allocated resources. At the time this group is considered to be - * exclusive this test can determine if its current schemata supports this - * setting by testing for overlap with all other resource groups. - * - * Return: true if resource group can be exclusive, false if there is overlap - * with allocations of other resource groups and thus this resource group - * cannot be exclusive. - */ -static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) -{ - int closid = rdtgrp->closid; - struct rdt_ctrl_domain *d; - struct resctrl_schema *s; - struct rdt_resource *r; - bool has_cache = false; - u32 ctrl; - - /* Walking r->domains, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - - list_for_each_entry(s, &resctrl_schema_all, list) { - r = s->res; - if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA) - continue; - has_cache = true; - list_for_each_entry(d, &r->ctrl_domains, hdr.list) { - ctrl = resctrl_arch_get_config(r, d, closid, - s->conf_type); - if (rdtgroup_cbm_overlaps(s, d, ctrl, closid, false)) { - rdt_last_cmd_puts("Schemata overlaps\n"); - return false; - } - } - } - - if (!has_cache) { - rdt_last_cmd_puts("Cannot be exclusive without CAT/CDP\n"); - return false; - } - - return true; -} - -/* - * rdtgroup_mode_write - Modify the resource group's mode - */ -static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct rdtgroup *rdtgrp; - enum rdtgrp_mode mode; - int ret = 0; - - /* Valid input requires a trailing newline */ - if (nbytes == 0 || buf[nbytes - 1] != '\n') - return -EINVAL; - buf[nbytes - 1] = '\0'; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - rdtgroup_kn_unlock(of->kn); - return -ENOENT; - } - - rdt_last_cmd_clear(); - - mode = rdtgrp->mode; - - if ((!strcmp(buf, "shareable") && mode == RDT_MODE_SHAREABLE) || - (!strcmp(buf, "exclusive") && mode == RDT_MODE_EXCLUSIVE) || - (!strcmp(buf, "pseudo-locksetup") && - mode == RDT_MODE_PSEUDO_LOCKSETUP) || - (!strcmp(buf, "pseudo-locked") && mode == RDT_MODE_PSEUDO_LOCKED)) - goto out; - - if (mode == RDT_MODE_PSEUDO_LOCKED) { - rdt_last_cmd_puts("Cannot change pseudo-locked group\n"); - ret = -EINVAL; - goto out; - } - - if (!strcmp(buf, "shareable")) { - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - ret = rdtgroup_locksetup_exit(rdtgrp); - if (ret) - goto out; - } - rdtgrp->mode = RDT_MODE_SHAREABLE; - } else if (!strcmp(buf, "exclusive")) { - if (!rdtgroup_mode_test_exclusive(rdtgrp)) { - ret = -EINVAL; - goto out; - } - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - ret = rdtgroup_locksetup_exit(rdtgrp); - if (ret) - goto out; - } - rdtgrp->mode = RDT_MODE_EXCLUSIVE; - } else if (!strcmp(buf, "pseudo-locksetup")) { - ret = rdtgroup_locksetup_enter(rdtgrp); - if (ret) - goto out; - rdtgrp->mode = RDT_MODE_PSEUDO_LOCKSETUP; - } else { - rdt_last_cmd_puts("Unknown or unsupported mode\n"); - ret = -EINVAL; - } - -out: - rdtgroup_kn_unlock(of->kn); - return ret ?: nbytes; -} - -/** - * rdtgroup_cbm_to_size - Translate CBM to size in bytes - * @r: RDT resource to which @d belongs. - * @d: RDT domain instance. - * @cbm: bitmask for which the size should be computed. - * - * The bitmask provided associated with the RDT domain instance @d will be - * translated into how many bytes it represents. The size in bytes is - * computed by first dividing the total cache size by the CBM length to - * determine how many bytes each bit in the bitmask represents. The result - * is multiplied with the number of bits set in the bitmask. - * - * @cbm is unsigned long, even if only 32 bits are used to make the - * bitmap functions work correctly. - */ -unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, - struct rdt_ctrl_domain *d, unsigned long cbm) -{ - unsigned int size = 0; - struct cacheinfo *ci; - int num_b; - - if (WARN_ON_ONCE(r->ctrl_scope != RESCTRL_L2_CACHE && r->ctrl_scope != RESCTRL_L3_CACHE)) - return size; - - num_b = bitmap_weight(&cbm, r->cache.cbm_len); - ci = get_cpu_cacheinfo_level(cpumask_any(&d->hdr.cpu_mask), r->ctrl_scope); - if (ci) - size = ci->size / r->cache.cbm_len * num_b; - - return size; -} - -/* - * rdtgroup_size_show - Display size in bytes of allocated regions - * - * The "size" file mirrors the layout of the "schemata" file, printing the - * size in bytes of each region instead of the capacity bitmask. - */ -static int rdtgroup_size_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct resctrl_schema *schema; - enum resctrl_conf_type type; - struct rdt_ctrl_domain *d; - struct rdtgroup *rdtgrp; - struct rdt_resource *r; - unsigned int size; - int ret = 0; - u32 closid; - bool sep; - u32 ctrl; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - rdtgroup_kn_unlock(of->kn); - return -ENOENT; - } - - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { - if (!rdtgrp->plr->d) { - rdt_last_cmd_clear(); - rdt_last_cmd_puts("Cache domain offline\n"); - ret = -ENODEV; - } else { - seq_printf(s, "%*s:", max_name_width, - rdtgrp->plr->s->name); - size = rdtgroup_cbm_to_size(rdtgrp->plr->s->res, - rdtgrp->plr->d, - rdtgrp->plr->cbm); - seq_printf(s, "%d=%u\n", rdtgrp->plr->d->hdr.id, size); - } - goto out; - } - - closid = rdtgrp->closid; - - list_for_each_entry(schema, &resctrl_schema_all, list) { - r = schema->res; - type = schema->conf_type; - sep = false; - seq_printf(s, "%*s:", max_name_width, schema->name); - list_for_each_entry(d, &r->ctrl_domains, hdr.list) { - if (sep) - seq_putc(s, ';'); - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - size = 0; - } else { - if (is_mba_sc(r)) - ctrl = d->mbps_val[closid]; - else - ctrl = resctrl_arch_get_config(r, d, - closid, - type); - if (r->rid == RDT_RESOURCE_MBA || - r->rid == RDT_RESOURCE_SMBA) - size = ctrl; - else - size = rdtgroup_cbm_to_size(r, d, ctrl); - } - seq_printf(s, "%d=%u", d->hdr.id, size); - sep = true; - } - seq_putc(s, '\n'); - } - -out: - rdtgroup_kn_unlock(of->kn); - - return ret; -} - -struct mon_config_info { - u32 evtid; - u32 mon_config; -}; - #define INVALID_CONFIG_INDEX UINT_MAX /** @@ -1572,685 +85,48 @@ static inline unsigned int mon_event_config_index_get(u32 evtid) } } -static void mon_event_config_read(void *info) +void resctrl_arch_mon_event_config_read(void *_config_info) { - struct mon_config_info *mon_info = info; + struct resctrl_mon_config_info *config_info = _config_info; unsigned int index; u64 msrval; - index = mon_event_config_index_get(mon_info->evtid); + index = mon_event_config_index_get(config_info->evtid); if (index == INVALID_CONFIG_INDEX) { - pr_warn_once("Invalid event id %d\n", mon_info->evtid); + pr_warn_once("Invalid event id %d\n", config_info->evtid); return; } - rdmsrl(MSR_IA32_EVT_CFG_BASE + index, msrval); + rdmsrq(MSR_IA32_EVT_CFG_BASE + index, msrval); /* Report only the valid event configuration bits */ - mon_info->mon_config = msrval & MAX_EVT_CONFIG_BITS; -} - -static void mondata_config_read(struct rdt_mon_domain *d, struct mon_config_info *mon_info) -{ - smp_call_function_any(&d->hdr.cpu_mask, mon_event_config_read, mon_info, 1); + config_info->mon_config = msrval & MAX_EVT_CONFIG_BITS; } -static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid) +void resctrl_arch_mon_event_config_write(void *_config_info) { - struct mon_config_info mon_info; - struct rdt_mon_domain *dom; - bool sep = false; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - list_for_each_entry(dom, &r->mon_domains, hdr.list) { - if (sep) - seq_puts(s, ";"); - - memset(&mon_info, 0, sizeof(struct mon_config_info)); - mon_info.evtid = evtid; - mondata_config_read(dom, &mon_info); - - seq_printf(s, "%d=0x%02x", dom->hdr.id, mon_info.mon_config); - sep = true; - } - seq_puts(s, "\n"); - - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); - - return 0; -} - -static int mbm_total_bytes_config_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct rdt_resource *r = of->kn->parent->priv; - - mbm_config_show(seq, r, QOS_L3_MBM_TOTAL_EVENT_ID); - - return 0; -} - -static int mbm_local_bytes_config_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct rdt_resource *r = of->kn->parent->priv; - - mbm_config_show(seq, r, QOS_L3_MBM_LOCAL_EVENT_ID); - - return 0; -} - -static void mon_event_config_write(void *info) -{ - struct mon_config_info *mon_info = info; + struct resctrl_mon_config_info *config_info = _config_info; unsigned int index; - index = mon_event_config_index_get(mon_info->evtid); + index = mon_event_config_index_get(config_info->evtid); if (index == INVALID_CONFIG_INDEX) { - pr_warn_once("Invalid event id %d\n", mon_info->evtid); - return; - } - wrmsr(MSR_IA32_EVT_CFG_BASE + index, mon_info->mon_config, 0); -} - -static void mbm_config_write_domain(struct rdt_resource *r, - struct rdt_mon_domain *d, u32 evtid, u32 val) -{ - struct mon_config_info mon_info = {0}; - - /* - * Read the current config value first. If both are the same then - * no need to write it again. - */ - mon_info.evtid = evtid; - mondata_config_read(d, &mon_info); - if (mon_info.mon_config == val) + pr_warn_once("Invalid event id %d\n", config_info->evtid); return; - - mon_info.mon_config = val; - - /* - * Update MSR_IA32_EVT_CFG_BASE MSR on one of the CPUs in the - * domain. The MSRs offset from MSR MSR_IA32_EVT_CFG_BASE - * are scoped at the domain level. Writing any of these MSRs - * on one CPU is observed by all the CPUs in the domain. - */ - smp_call_function_any(&d->hdr.cpu_mask, mon_event_config_write, - &mon_info, 1); - - /* - * When an Event Configuration is changed, the bandwidth counters - * for all RMIDs and Events will be cleared by the hardware. The - * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for - * every RMID on the next read to any event for every RMID. - * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62) - * cleared while it is tracked by the hardware. Clear the - * mbm_local and mbm_total counts for all the RMIDs. - */ - resctrl_arch_reset_rmid_all(r, d); -} - -static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid) -{ - struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); - char *dom_str = NULL, *id_str; - unsigned long dom_id, val; - struct rdt_mon_domain *d; - - /* Walking r->domains, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - -next: - if (!tok || tok[0] == '\0') - return 0; - - /* Start processing the strings for each domain */ - dom_str = strim(strsep(&tok, ";")); - id_str = strsep(&dom_str, "="); - - if (!id_str || kstrtoul(id_str, 10, &dom_id)) { - rdt_last_cmd_puts("Missing '=' or non-numeric domain id\n"); - return -EINVAL; - } - - if (!dom_str || kstrtoul(dom_str, 16, &val)) { - rdt_last_cmd_puts("Non-numeric event configuration value\n"); - return -EINVAL; - } - - /* Value from user cannot be more than the supported set of events */ - if ((val & hw_res->mbm_cfg_mask) != val) { - rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n", - hw_res->mbm_cfg_mask); - return -EINVAL; - } - - list_for_each_entry(d, &r->mon_domains, hdr.list) { - if (d->hdr.id == dom_id) { - mbm_config_write_domain(r, d, evtid, val); - goto next; - } - } - - return -EINVAL; -} - -static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, - loff_t off) -{ - struct rdt_resource *r = of->kn->parent->priv; - int ret; - - /* Valid input requires a trailing newline */ - if (nbytes == 0 || buf[nbytes - 1] != '\n') - return -EINVAL; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - rdt_last_cmd_clear(); - - buf[nbytes - 1] = '\0'; - - ret = mon_config_write(r, buf, QOS_L3_MBM_TOTAL_EVENT_ID); - - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); - - return ret ?: nbytes; -} - -static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, - loff_t off) -{ - struct rdt_resource *r = of->kn->parent->priv; - int ret; - - /* Valid input requires a trailing newline */ - if (nbytes == 0 || buf[nbytes - 1] != '\n') - return -EINVAL; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - rdt_last_cmd_clear(); - - buf[nbytes - 1] = '\0'; - - ret = mon_config_write(r, buf, QOS_L3_MBM_LOCAL_EVENT_ID); - - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); - - return ret ?: nbytes; -} - -/* rdtgroup information files for one cache resource. */ -static struct rftype res_common_files[] = { - { - .name = "last_cmd_status", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_last_cmd_status_show, - .fflags = RFTYPE_TOP_INFO, - }, - { - .name = "num_closids", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_num_closids_show, - .fflags = RFTYPE_CTRL_INFO, - }, - { - .name = "mon_features", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_mon_features_show, - .fflags = RFTYPE_MON_INFO, - }, - { - .name = "num_rmids", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_num_rmids_show, - .fflags = RFTYPE_MON_INFO, - }, - { - .name = "cbm_mask", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_default_ctrl_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, - }, - { - .name = "min_cbm_bits", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_min_cbm_bits_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, - }, - { - .name = "shareable_bits", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_shareable_bits_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, - }, - { - .name = "bit_usage", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_bit_usage_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, - }, - { - .name = "min_bandwidth", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_min_bw_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, - }, - { - .name = "bandwidth_gran", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_bw_gran_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, - }, - { - .name = "delay_linear", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_delay_linear_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, - }, - /* - * Platform specific which (if any) capabilities are provided by - * thread_throttle_mode. Defer "fflags" initialization to platform - * discovery. - */ - { - .name = "thread_throttle_mode", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_thread_throttle_mode_show, - }, - { - .name = "max_threshold_occupancy", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = max_threshold_occ_write, - .seq_show = max_threshold_occ_show, - .fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE, - }, - { - .name = "mbm_total_bytes_config", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = mbm_total_bytes_config_show, - .write = mbm_total_bytes_config_write, - }, - { - .name = "mbm_local_bytes_config", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = mbm_local_bytes_config_show, - .write = mbm_local_bytes_config_write, - }, - { - .name = "cpus", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_cpus_write, - .seq_show = rdtgroup_cpus_show, - .fflags = RFTYPE_BASE, - }, - { - .name = "cpus_list", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_cpus_write, - .seq_show = rdtgroup_cpus_show, - .flags = RFTYPE_FLAGS_CPUS_LIST, - .fflags = RFTYPE_BASE, - }, - { - .name = "tasks", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_tasks_write, - .seq_show = rdtgroup_tasks_show, - .fflags = RFTYPE_BASE, - }, - { - .name = "mon_hw_id", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdtgroup_rmid_show, - .fflags = RFTYPE_MON_BASE | RFTYPE_DEBUG, - }, - { - .name = "schemata", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_schemata_write, - .seq_show = rdtgroup_schemata_show, - .fflags = RFTYPE_CTRL_BASE, - }, - { - .name = "mode", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_mode_write, - .seq_show = rdtgroup_mode_show, - .fflags = RFTYPE_CTRL_BASE, - }, - { - .name = "size", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdtgroup_size_show, - .fflags = RFTYPE_CTRL_BASE, - }, - { - .name = "sparse_masks", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_has_sparse_bitmasks_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, - }, - { - .name = "ctrl_hw_id", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdtgroup_closid_show, - .fflags = RFTYPE_CTRL_BASE | RFTYPE_DEBUG, - }, - -}; - -static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) -{ - struct rftype *rfts, *rft; - int ret, len; - - rfts = res_common_files; - len = ARRAY_SIZE(res_common_files); - - lockdep_assert_held(&rdtgroup_mutex); - - if (resctrl_debug) - fflags |= RFTYPE_DEBUG; - - for (rft = rfts; rft < rfts + len; rft++) { - if (rft->fflags && ((fflags & rft->fflags) == rft->fflags)) { - ret = rdtgroup_add_file(kn, rft); - if (ret) - goto error; - } - } - - return 0; -error: - pr_warn("Failed to add %s, err=%d\n", rft->name, ret); - while (--rft >= rfts) { - if ((fflags & rft->fflags) == rft->fflags) - kernfs_remove_by_name(kn, rft->name); - } - return ret; -} - -static struct rftype *rdtgroup_get_rftype_by_name(const char *name) -{ - struct rftype *rfts, *rft; - int len; - - rfts = res_common_files; - len = ARRAY_SIZE(res_common_files); - - for (rft = rfts; rft < rfts + len; rft++) { - if (!strcmp(rft->name, name)) - return rft; - } - - return NULL; -} - -void __init thread_throttle_mode_init(void) -{ - struct rftype *rft; - - rft = rdtgroup_get_rftype_by_name("thread_throttle_mode"); - if (!rft) - return; - - rft->fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB; -} - -void __init mbm_config_rftype_init(const char *config) -{ - struct rftype *rft; - - rft = rdtgroup_get_rftype_by_name(config); - if (rft) - rft->fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE; -} - -/** - * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file - * @r: The resource group with which the file is associated. - * @name: Name of the file - * - * The permissions of named resctrl file, directory, or link are modified - * to not allow read, write, or execute by any user. - * - * WARNING: This function is intended to communicate to the user that the - * resctrl file has been locked down - that it is not relevant to the - * particular state the system finds itself in. It should not be relied - * on to protect from user access because after the file's permissions - * are restricted the user can still change the permissions using chmod - * from the command line. - * - * Return: 0 on success, <0 on failure. - */ -int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name) -{ - struct iattr iattr = {.ia_valid = ATTR_MODE,}; - struct kernfs_node *kn; - int ret = 0; - - kn = kernfs_find_and_get_ns(r->kn, name, NULL); - if (!kn) - return -ENOENT; - - switch (kernfs_type(kn)) { - case KERNFS_DIR: - iattr.ia_mode = S_IFDIR; - break; - case KERNFS_FILE: - iattr.ia_mode = S_IFREG; - break; - case KERNFS_LINK: - iattr.ia_mode = S_IFLNK; - break; - } - - ret = kernfs_setattr(kn, &iattr); - kernfs_put(kn); - return ret; -} - -/** - * rdtgroup_kn_mode_restore - Restore user access to named resctrl file - * @r: The resource group with which the file is associated. - * @name: Name of the file - * @mask: Mask of permissions that should be restored - * - * Restore the permissions of the named file. If @name is a directory the - * permissions of its parent will be used. - * - * Return: 0 on success, <0 on failure. - */ -int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, - umode_t mask) -{ - struct iattr iattr = {.ia_valid = ATTR_MODE,}; - struct kernfs_node *kn, *parent; - struct rftype *rfts, *rft; - int ret, len; - - rfts = res_common_files; - len = ARRAY_SIZE(res_common_files); - - for (rft = rfts; rft < rfts + len; rft++) { - if (!strcmp(rft->name, name)) - iattr.ia_mode = rft->mode & mask; - } - - kn = kernfs_find_and_get_ns(r->kn, name, NULL); - if (!kn) - return -ENOENT; - - switch (kernfs_type(kn)) { - case KERNFS_DIR: - parent = kernfs_get_parent(kn); - if (parent) { - iattr.ia_mode |= parent->mode; - kernfs_put(parent); - } - iattr.ia_mode |= S_IFDIR; - break; - case KERNFS_FILE: - iattr.ia_mode |= S_IFREG; - break; - case KERNFS_LINK: - iattr.ia_mode |= S_IFLNK; - break; - } - - ret = kernfs_setattr(kn, &iattr); - kernfs_put(kn); - return ret; -} - -static int rdtgroup_mkdir_info_resdir(void *priv, char *name, - unsigned long fflags) -{ - struct kernfs_node *kn_subdir; - int ret; - - kn_subdir = kernfs_create_dir(kn_info, name, - kn_info->mode, priv); - if (IS_ERR(kn_subdir)) - return PTR_ERR(kn_subdir); - - ret = rdtgroup_kn_set_ugid(kn_subdir); - if (ret) - return ret; - - ret = rdtgroup_add_files(kn_subdir, fflags); - if (!ret) - kernfs_activate(kn_subdir); - - return ret; -} - -static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) -{ - struct resctrl_schema *s; - struct rdt_resource *r; - unsigned long fflags; - char name[32]; - int ret; - - /* create the directory */ - kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL); - if (IS_ERR(kn_info)) - return PTR_ERR(kn_info); - - ret = rdtgroup_add_files(kn_info, RFTYPE_TOP_INFO); - if (ret) - goto out_destroy; - - /* loop over enabled controls, these are all alloc_capable */ - list_for_each_entry(s, &resctrl_schema_all, list) { - r = s->res; - fflags = r->fflags | RFTYPE_CTRL_INFO; - ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags); - if (ret) - goto out_destroy; - } - - for_each_mon_capable_rdt_resource(r) { - fflags = r->fflags | RFTYPE_MON_INFO; - sprintf(name, "%s_MON", r->name); - ret = rdtgroup_mkdir_info_resdir(r, name, fflags); - if (ret) - goto out_destroy; } - - ret = rdtgroup_kn_set_ugid(kn_info); - if (ret) - goto out_destroy; - - kernfs_activate(kn_info); - - return 0; - -out_destroy: - kernfs_remove(kn_info); - return ret; -} - -static int -mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp, - char *name, struct kernfs_node **dest_kn) -{ - struct kernfs_node *kn; - int ret; - - /* create the directory */ - kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); - if (IS_ERR(kn)) - return PTR_ERR(kn); - - if (dest_kn) - *dest_kn = kn; - - ret = rdtgroup_kn_set_ugid(kn); - if (ret) - goto out_destroy; - - kernfs_activate(kn); - - return 0; - -out_destroy: - kernfs_remove(kn); - return ret; + wrmsrq(MSR_IA32_EVT_CFG_BASE + index, config_info->mon_config); } static void l3_qos_cfg_update(void *arg) { bool *enable = arg; - wrmsrl(MSR_IA32_L3_QOS_CFG, *enable ? L3_QOS_CDP_ENABLE : 0ULL); + wrmsrq(MSR_IA32_L3_QOS_CFG, *enable ? L3_QOS_CDP_ENABLE : 0ULL); } static void l2_qos_cfg_update(void *arg) { bool *enable = arg; - wrmsrl(MSR_IA32_L2_QOS_CFG, *enable ? L2_QOS_CDP_ENABLE : 0ULL); -} - -static inline bool is_mba_linear(void) -{ - return rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl.membw.delay_linear; + wrmsrq(MSR_IA32_L2_QOS_CFG, *enable ? L2_QOS_CDP_ENABLE : 0ULL); } static int set_cache_qos_cfg(int level, bool enable) @@ -2308,70 +184,6 @@ void rdt_domain_reconfigure_cdp(struct rdt_resource *r) l3_qos_cfg_update(&hw_res->cdp_enabled); } -static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_ctrl_domain *d) -{ - u32 num_closid = resctrl_arch_get_num_closid(r); - int cpu = cpumask_any(&d->hdr.cpu_mask); - int i; - - d->mbps_val = kcalloc_node(num_closid, sizeof(*d->mbps_val), - GFP_KERNEL, cpu_to_node(cpu)); - if (!d->mbps_val) - return -ENOMEM; - - for (i = 0; i < num_closid; i++) - d->mbps_val[i] = MBA_MAX_MBPS; - - return 0; -} - -static void mba_sc_domain_destroy(struct rdt_resource *r, - struct rdt_ctrl_domain *d) -{ - kfree(d->mbps_val); - d->mbps_val = NULL; -} - -/* - * MBA software controller is supported only if - * MBM is supported and MBA is in linear scale, - * and the MBM monitor scope is the same as MBA - * control scope. - */ -static bool supports_mba_mbps(void) -{ - struct rdt_resource *rmbm = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; - struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl; - - return (is_mbm_local_enabled() && - r->alloc_capable && is_mba_linear() && - r->ctrl_scope == rmbm->mon_scope); -} - -/* - * Enable or disable the MBA software controller - * which helps user specify bandwidth in MBps. - */ -static int set_mba_sc(bool mba_sc) -{ - struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl; - u32 num_closid = resctrl_arch_get_num_closid(r); - struct rdt_ctrl_domain *d; - int i; - - if (!supports_mba_mbps() || mba_sc == is_mba_sc(r)) - return -EINVAL; - - r->membw.mba_sc = mba_sc; - - list_for_each_entry(d, &r->ctrl_domains, hdr.list) { - for (i = 0; i < num_closid; i++) - d->mbps_val[i] = MBA_MAX_MBPS; - } - - return 0; -} - static int cdp_enable(int level) { struct rdt_resource *r_l = &rdt_resources_all[level].r_resctrl; @@ -2412,407 +224,12 @@ int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable) return 0; } -/* - * We don't allow rdtgroup directories to be created anywhere - * except the root directory. Thus when looking for the rdtgroup - * structure for a kernfs node we are either looking at a directory, - * in which case the rdtgroup structure is pointed at by the "priv" - * field, otherwise we have a file, and need only look to the parent - * to find the rdtgroup. - */ -static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn) -{ - if (kernfs_type(kn) == KERNFS_DIR) { - /* - * All the resource directories use "kn->priv" - * to point to the "struct rdtgroup" for the - * resource. "info" and its subdirectories don't - * have rdtgroup structures, so return NULL here. - */ - if (kn == kn_info || kn->parent == kn_info) - return NULL; - else - return kn->priv; - } else { - return kn->parent->priv; - } -} - -static void rdtgroup_kn_get(struct rdtgroup *rdtgrp, struct kernfs_node *kn) -{ - atomic_inc(&rdtgrp->waitcount); - kernfs_break_active_protection(kn); -} - -static void rdtgroup_kn_put(struct rdtgroup *rdtgrp, struct kernfs_node *kn) -{ - if (atomic_dec_and_test(&rdtgrp->waitcount) && - (rdtgrp->flags & RDT_DELETED)) { - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) - rdtgroup_pseudo_lock_remove(rdtgrp); - kernfs_unbreak_active_protection(kn); - rdtgroup_remove(rdtgrp); - } else { - kernfs_unbreak_active_protection(kn); - } -} - -struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn) -{ - struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn); - - if (!rdtgrp) - return NULL; - - rdtgroup_kn_get(rdtgrp, kn); - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - /* Was this group deleted while we waited? */ - if (rdtgrp->flags & RDT_DELETED) - return NULL; - - return rdtgrp; -} - -void rdtgroup_kn_unlock(struct kernfs_node *kn) -{ - struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn); - - if (!rdtgrp) - return; - - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); - - rdtgroup_kn_put(rdtgrp, kn); -} - -static int mkdir_mondata_all(struct kernfs_node *parent_kn, - struct rdtgroup *prgrp, - struct kernfs_node **mon_data_kn); - -static void rdt_disable_ctx(void) -{ - resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); - resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); - set_mba_sc(false); - - resctrl_debug = false; -} - -static int rdt_enable_ctx(struct rdt_fs_context *ctx) -{ - int ret = 0; - - if (ctx->enable_cdpl2) { - ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, true); - if (ret) - goto out_done; - } - - if (ctx->enable_cdpl3) { - ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, true); - if (ret) - goto out_cdpl2; - } - - if (ctx->enable_mba_mbps) { - ret = set_mba_sc(true); - if (ret) - goto out_cdpl3; - } - - if (ctx->enable_debug) - resctrl_debug = true; - - return 0; - -out_cdpl3: - resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); -out_cdpl2: - resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); -out_done: - return ret; -} - -static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type) -{ - struct resctrl_schema *s; - const char *suffix = ""; - int ret, cl; - - s = kzalloc(sizeof(*s), GFP_KERNEL); - if (!s) - return -ENOMEM; - - s->res = r; - s->num_closid = resctrl_arch_get_num_closid(r); - if (resctrl_arch_get_cdp_enabled(r->rid)) - s->num_closid /= 2; - - s->conf_type = type; - switch (type) { - case CDP_CODE: - suffix = "CODE"; - break; - case CDP_DATA: - suffix = "DATA"; - break; - case CDP_NONE: - suffix = ""; - break; - } - - ret = snprintf(s->name, sizeof(s->name), "%s%s", r->name, suffix); - if (ret >= sizeof(s->name)) { - kfree(s); - return -EINVAL; - } - - cl = strlen(s->name); - - /* - * If CDP is supported by this resource, but not enabled, - * include the suffix. This ensures the tabular format of the - * schemata file does not change between mounts of the filesystem. - */ - if (r->cdp_capable && !resctrl_arch_get_cdp_enabled(r->rid)) - cl += 4; - - if (cl > max_name_width) - max_name_width = cl; - - INIT_LIST_HEAD(&s->list); - list_add(&s->list, &resctrl_schema_all); - - return 0; -} - -static int schemata_list_create(void) -{ - struct rdt_resource *r; - int ret = 0; - - for_each_alloc_capable_rdt_resource(r) { - if (resctrl_arch_get_cdp_enabled(r->rid)) { - ret = schemata_list_add(r, CDP_CODE); - if (ret) - break; - - ret = schemata_list_add(r, CDP_DATA); - } else { - ret = schemata_list_add(r, CDP_NONE); - } - - if (ret) - break; - } - - return ret; -} - -static void schemata_list_destroy(void) +bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l) { - struct resctrl_schema *s, *tmp; - - list_for_each_entry_safe(s, tmp, &resctrl_schema_all, list) { - list_del(&s->list); - kfree(s); - } + return rdt_resources_all[l].cdp_enabled; } -static int rdt_get_tree(struct fs_context *fc) -{ - struct rdt_fs_context *ctx = rdt_fc2context(fc); - unsigned long flags = RFTYPE_CTRL_BASE; - struct rdt_mon_domain *dom; - struct rdt_resource *r; - int ret; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - /* - * resctrl file system can only be mounted once. - */ - if (resctrl_mounted) { - ret = -EBUSY; - goto out; - } - - ret = rdtgroup_setup_root(ctx); - if (ret) - goto out; - - ret = rdt_enable_ctx(ctx); - if (ret) - goto out_root; - - ret = schemata_list_create(); - if (ret) { - schemata_list_destroy(); - goto out_ctx; - } - - closid_init(); - - if (resctrl_arch_mon_capable()) - flags |= RFTYPE_MON; - - ret = rdtgroup_add_files(rdtgroup_default.kn, flags); - if (ret) - goto out_schemata_free; - - kernfs_activate(rdtgroup_default.kn); - - ret = rdtgroup_create_info_dir(rdtgroup_default.kn); - if (ret < 0) - goto out_schemata_free; - - if (resctrl_arch_mon_capable()) { - ret = mongroup_create_dir(rdtgroup_default.kn, - &rdtgroup_default, "mon_groups", - &kn_mongrp); - if (ret < 0) - goto out_info; - - ret = mkdir_mondata_all(rdtgroup_default.kn, - &rdtgroup_default, &kn_mondata); - if (ret < 0) - goto out_mongrp; - rdtgroup_default.mon.mon_data_kn = kn_mondata; - } - - ret = rdt_pseudo_lock_init(); - if (ret) - goto out_mondata; - - ret = kernfs_get_tree(fc); - if (ret < 0) - goto out_psl; - - if (resctrl_arch_alloc_capable()) - resctrl_arch_enable_alloc(); - if (resctrl_arch_mon_capable()) - resctrl_arch_enable_mon(); - - if (resctrl_arch_alloc_capable() || resctrl_arch_mon_capable()) - resctrl_mounted = true; - - if (is_mbm_enabled()) { - r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; - list_for_each_entry(dom, &r->mon_domains, hdr.list) - mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL, - RESCTRL_PICK_ANY_CPU); - } - - goto out; - -out_psl: - rdt_pseudo_lock_release(); -out_mondata: - if (resctrl_arch_mon_capable()) - kernfs_remove(kn_mondata); -out_mongrp: - if (resctrl_arch_mon_capable()) - kernfs_remove(kn_mongrp); -out_info: - kernfs_remove(kn_info); -out_schemata_free: - schemata_list_destroy(); -out_ctx: - rdt_disable_ctx(); -out_root: - rdtgroup_destroy_root(); -out: - rdt_last_cmd_clear(); - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); - return ret; -} - -enum rdt_param { - Opt_cdp, - Opt_cdpl2, - Opt_mba_mbps, - Opt_debug, - nr__rdt_params -}; - -static const struct fs_parameter_spec rdt_fs_parameters[] = { - fsparam_flag("cdp", Opt_cdp), - fsparam_flag("cdpl2", Opt_cdpl2), - fsparam_flag("mba_MBps", Opt_mba_mbps), - fsparam_flag("debug", Opt_debug), - {} -}; - -static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param) -{ - struct rdt_fs_context *ctx = rdt_fc2context(fc); - struct fs_parse_result result; - const char *msg; - int opt; - - opt = fs_parse(fc, rdt_fs_parameters, param, &result); - if (opt < 0) - return opt; - - switch (opt) { - case Opt_cdp: - ctx->enable_cdpl3 = true; - return 0; - case Opt_cdpl2: - ctx->enable_cdpl2 = true; - return 0; - case Opt_mba_mbps: - msg = "mba_MBps requires local MBM and linear scale MBA at L3 scope"; - if (!supports_mba_mbps()) - return invalfc(fc, msg); - ctx->enable_mba_mbps = true; - return 0; - case Opt_debug: - ctx->enable_debug = true; - return 0; - } - - return -EINVAL; -} - -static void rdt_fs_context_free(struct fs_context *fc) -{ - struct rdt_fs_context *ctx = rdt_fc2context(fc); - - kernfs_free_fs_context(fc); - kfree(ctx); -} - -static const struct fs_context_operations rdt_fs_context_ops = { - .free = rdt_fs_context_free, - .parse_param = rdt_parse_param, - .get_tree = rdt_get_tree, -}; - -static int rdt_init_fs_context(struct fs_context *fc) -{ - struct rdt_fs_context *ctx; - - ctx = kzalloc(sizeof(struct rdt_fs_context), GFP_KERNEL); - if (!ctx) - return -ENOMEM; - - ctx->kfc.magic = RDTGROUP_SUPER_MAGIC; - fc->fs_private = &ctx->kfc; - fc->ops = &rdt_fs_context_ops; - put_user_ns(fc->user_ns); - fc->user_ns = get_user_ns(&init_user_ns); - fc->global = true; - return 0; -} - -static int reset_all_ctrls(struct rdt_resource *r) +void resctrl_arch_reset_all_ctrls(struct rdt_resource *r) { struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); struct rdt_hw_ctrl_domain *hw_dom; @@ -2836,1398 +253,10 @@ static int reset_all_ctrls(struct rdt_resource *r) hw_dom = resctrl_to_arch_ctrl_dom(d); for (i = 0; i < hw_res->num_closid; i++) - hw_dom->ctrl_val[i] = r->default_ctrl; + hw_dom->ctrl_val[i] = resctrl_get_default_ctrl(r); msr_param.dom = d; smp_call_function_any(&d->hdr.cpu_mask, rdt_ctrl_update, &msr_param, 1); } - return 0; -} - -/* - * Move tasks from one to the other group. If @from is NULL, then all tasks - * in the systems are moved unconditionally (used for teardown). - * - * If @mask is not NULL the cpus on which moved tasks are running are set - * in that mask so the update smp function call is restricted to affected - * cpus. - */ -static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, - struct cpumask *mask) -{ - struct task_struct *p, *t; - - read_lock(&tasklist_lock); - for_each_process_thread(p, t) { - if (!from || is_closid_match(t, from) || - is_rmid_match(t, from)) { - resctrl_arch_set_closid_rmid(t, to->closid, - to->mon.rmid); - - /* - * Order the closid/rmid stores above before the loads - * in task_curr(). This pairs with the full barrier - * between the rq->curr update and resctrl_sched_in() - * during context switch. - */ - smp_mb(); - - /* - * If the task is on a CPU, set the CPU in the mask. - * The detection is inaccurate as tasks might move or - * schedule before the smp function call takes place. - * In such a case the function call is pointless, but - * there is no other side effect. - */ - if (IS_ENABLED(CONFIG_SMP) && mask && task_curr(t)) - cpumask_set_cpu(task_cpu(t), mask); - } - } - read_unlock(&tasklist_lock); -} - -static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp) -{ - struct rdtgroup *sentry, *stmp; - struct list_head *head; - - head = &rdtgrp->mon.crdtgrp_list; - list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) { - free_rmid(sentry->closid, sentry->mon.rmid); - list_del(&sentry->mon.crdtgrp_list); - - if (atomic_read(&sentry->waitcount) != 0) - sentry->flags = RDT_DELETED; - else - rdtgroup_remove(sentry); - } -} - -/* - * Forcibly remove all of subdirectories under root. - */ -static void rmdir_all_sub(void) -{ - struct rdtgroup *rdtgrp, *tmp; - - /* Move all tasks to the default resource group */ - rdt_move_group_tasks(NULL, &rdtgroup_default, NULL); - - list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) { - /* Free any child rmids */ - free_all_child_rdtgrp(rdtgrp); - - /* Remove each rdtgroup other than root */ - if (rdtgrp == &rdtgroup_default) - continue; - - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) - rdtgroup_pseudo_lock_remove(rdtgrp); - - /* - * Give any CPUs back to the default group. We cannot copy - * cpu_online_mask because a CPU might have executed the - * offline callback already, but is still marked online. - */ - cpumask_or(&rdtgroup_default.cpu_mask, - &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); - - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); - - kernfs_remove(rdtgrp->kn); - list_del(&rdtgrp->rdtgroup_list); - - if (atomic_read(&rdtgrp->waitcount) != 0) - rdtgrp->flags = RDT_DELETED; - else - rdtgroup_remove(rdtgrp); - } - /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */ - update_closid_rmid(cpu_online_mask, &rdtgroup_default); - - kernfs_remove(kn_info); - kernfs_remove(kn_mongrp); - kernfs_remove(kn_mondata); -} - -static void rdt_kill_sb(struct super_block *sb) -{ - struct rdt_resource *r; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - rdt_disable_ctx(); - - /*Put everything back to default values. */ - for_each_alloc_capable_rdt_resource(r) - reset_all_ctrls(r); - rmdir_all_sub(); - rdt_pseudo_lock_release(); - rdtgroup_default.mode = RDT_MODE_SHAREABLE; - schemata_list_destroy(); - rdtgroup_destroy_root(); - if (resctrl_arch_alloc_capable()) - resctrl_arch_disable_alloc(); - if (resctrl_arch_mon_capable()) - resctrl_arch_disable_mon(); - resctrl_mounted = false; - kernfs_kill_sb(sb); - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); -} - -static struct file_system_type rdt_fs_type = { - .name = "resctrl", - .init_fs_context = rdt_init_fs_context, - .parameters = rdt_fs_parameters, - .kill_sb = rdt_kill_sb, -}; - -static int mon_addfile(struct kernfs_node *parent_kn, const char *name, - void *priv) -{ - struct kernfs_node *kn; - int ret = 0; - - kn = __kernfs_create_file(parent_kn, name, 0444, - GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0, - &kf_mondata_ops, priv, NULL, NULL); - if (IS_ERR(kn)) - return PTR_ERR(kn); - - ret = rdtgroup_kn_set_ugid(kn); - if (ret) { - kernfs_remove(kn); - return ret; - } - - return ret; -} - -static void mon_rmdir_one_subdir(struct kernfs_node *pkn, char *name, char *subname) -{ - struct kernfs_node *kn; - - kn = kernfs_find_and_get(pkn, name); - if (!kn) - return; - kernfs_put(kn); - - if (kn->dir.subdirs <= 1) - kernfs_remove(kn); - else - kernfs_remove_by_name(kn, subname); -} - -/* - * Remove all subdirectories of mon_data of ctrl_mon groups - * and monitor groups for the given domain. - * Remove files and directories containing "sum" of domain data - * when last domain being summed is removed. - */ -static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, - struct rdt_mon_domain *d) -{ - struct rdtgroup *prgrp, *crgrp; - char subname[32]; - bool snc_mode; - char name[32]; - - snc_mode = r->mon_scope == RESCTRL_L3_NODE; - sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id); - if (snc_mode) - sprintf(subname, "mon_sub_%s_%02d", r->name, d->hdr.id); - - list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { - mon_rmdir_one_subdir(prgrp->mon.mon_data_kn, name, subname); - - list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list) - mon_rmdir_one_subdir(crgrp->mon.mon_data_kn, name, subname); - } -} - -static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d, - struct rdt_resource *r, struct rdtgroup *prgrp, - bool do_sum) -{ - struct rmid_read rr = {0}; - union mon_data_bits priv; - struct mon_evt *mevt; - int ret; - - if (WARN_ON(list_empty(&r->evt_list))) - return -EPERM; - - priv.u.rid = r->rid; - priv.u.domid = do_sum ? d->ci->id : d->hdr.id; - priv.u.sum = do_sum; - list_for_each_entry(mevt, &r->evt_list, list) { - priv.u.evtid = mevt->evtid; - ret = mon_addfile(kn, mevt->name, priv.priv); - if (ret) - return ret; - - if (!do_sum && is_mbm_event(mevt->evtid)) - mon_event_read(&rr, r, d, prgrp, &d->hdr.cpu_mask, mevt->evtid, true); - } - - return 0; -} - -static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, - struct rdt_mon_domain *d, - struct rdt_resource *r, struct rdtgroup *prgrp) -{ - struct kernfs_node *kn, *ckn; - char name[32]; - bool snc_mode; - int ret = 0; - - lockdep_assert_held(&rdtgroup_mutex); - - snc_mode = r->mon_scope == RESCTRL_L3_NODE; - sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id); - kn = kernfs_find_and_get(parent_kn, name); - if (kn) { - /* - * rdtgroup_mutex will prevent this directory from being - * removed. No need to keep this hold. - */ - kernfs_put(kn); - } else { - kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); - if (IS_ERR(kn)) - return PTR_ERR(kn); - - ret = rdtgroup_kn_set_ugid(kn); - if (ret) - goto out_destroy; - ret = mon_add_all_files(kn, d, r, prgrp, snc_mode); - if (ret) - goto out_destroy; - } - - if (snc_mode) { - sprintf(name, "mon_sub_%s_%02d", r->name, d->hdr.id); - ckn = kernfs_create_dir(kn, name, parent_kn->mode, prgrp); - if (IS_ERR(ckn)) { - ret = -EINVAL; - goto out_destroy; - } - - ret = rdtgroup_kn_set_ugid(ckn); - if (ret) - goto out_destroy; - - ret = mon_add_all_files(ckn, d, r, prgrp, false); - if (ret) - goto out_destroy; - } - - kernfs_activate(kn); - return 0; - -out_destroy: - kernfs_remove(kn); - return ret; -} - -/* - * Add all subdirectories of mon_data for "ctrl_mon" groups - * and "monitor" groups with given domain id. - */ -static void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, - struct rdt_mon_domain *d) -{ - struct kernfs_node *parent_kn; - struct rdtgroup *prgrp, *crgrp; - struct list_head *head; - - list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { - parent_kn = prgrp->mon.mon_data_kn; - mkdir_mondata_subdir(parent_kn, d, r, prgrp); - - head = &prgrp->mon.crdtgrp_list; - list_for_each_entry(crgrp, head, mon.crdtgrp_list) { - parent_kn = crgrp->mon.mon_data_kn; - mkdir_mondata_subdir(parent_kn, d, r, crgrp); - } - } -} - -static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn, - struct rdt_resource *r, - struct rdtgroup *prgrp) -{ - struct rdt_mon_domain *dom; - int ret; - - /* Walking r->domains, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - - list_for_each_entry(dom, &r->mon_domains, hdr.list) { - ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp); - if (ret) - return ret; - } - - return 0; -} - -/* - * This creates a directory mon_data which contains the monitored data. - * - * mon_data has one directory for each domain which are named - * in the format mon_<domain_name>_<domain_id>. For ex: A mon_data - * with L3 domain looks as below: - * ./mon_data: - * mon_L3_00 - * mon_L3_01 - * mon_L3_02 - * ... - * - * Each domain directory has one file per event: - * ./mon_L3_00/: - * llc_occupancy - * - */ -static int mkdir_mondata_all(struct kernfs_node *parent_kn, - struct rdtgroup *prgrp, - struct kernfs_node **dest_kn) -{ - struct rdt_resource *r; - struct kernfs_node *kn; - int ret; - - /* - * Create the mon_data directory first. - */ - ret = mongroup_create_dir(parent_kn, prgrp, "mon_data", &kn); - if (ret) - return ret; - - if (dest_kn) - *dest_kn = kn; - - /* - * Create the subdirectories for each domain. Note that all events - * in a domain like L3 are grouped into a resource whose domain is L3 - */ - for_each_mon_capable_rdt_resource(r) { - ret = mkdir_mondata_subdir_alldom(kn, r, prgrp); - if (ret) - goto out_destroy; - } - - return 0; - -out_destroy: - kernfs_remove(kn); - return ret; -} - -/** - * cbm_ensure_valid - Enforce validity on provided CBM - * @_val: Candidate CBM - * @r: RDT resource to which the CBM belongs - * - * The provided CBM represents all cache portions available for use. This - * may be represented by a bitmap that does not consist of contiguous ones - * and thus be an invalid CBM. - * Here the provided CBM is forced to be a valid CBM by only considering - * the first set of contiguous bits as valid and clearing all bits. - * The intention here is to provide a valid default CBM with which a new - * resource group is initialized. The user can follow this with a - * modification to the CBM if the default does not satisfy the - * requirements. - */ -static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r) -{ - unsigned int cbm_len = r->cache.cbm_len; - unsigned long first_bit, zero_bit; - unsigned long val = _val; - - if (!val) - return 0; - - first_bit = find_first_bit(&val, cbm_len); - zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); - - /* Clear any remaining bits to ensure contiguous region */ - bitmap_clear(&val, zero_bit, cbm_len - zero_bit); - return (u32)val; -} - -/* - * Initialize cache resources per RDT domain - * - * Set the RDT domain up to start off with all usable allocations. That is, - * all shareable and unused bits. All-zero CBM is invalid. - */ -static int __init_one_rdt_domain(struct rdt_ctrl_domain *d, struct resctrl_schema *s, - u32 closid) -{ - enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type); - enum resctrl_conf_type t = s->conf_type; - struct resctrl_staged_config *cfg; - struct rdt_resource *r = s->res; - u32 used_b = 0, unused_b = 0; - unsigned long tmp_cbm; - enum rdtgrp_mode mode; - u32 peer_ctl, ctrl_val; - int i; - - cfg = &d->staged_config[t]; - cfg->have_new_ctrl = false; - cfg->new_ctrl = r->cache.shareable_bits; - used_b = r->cache.shareable_bits; - for (i = 0; i < closids_supported(); i++) { - if (closid_allocated(i) && i != closid) { - mode = rdtgroup_mode_by_closid(i); - if (mode == RDT_MODE_PSEUDO_LOCKSETUP) - /* - * ctrl values for locksetup aren't relevant - * until the schemata is written, and the mode - * becomes RDT_MODE_PSEUDO_LOCKED. - */ - continue; - /* - * If CDP is active include peer domain's - * usage to ensure there is no overlap - * with an exclusive group. - */ - if (resctrl_arch_get_cdp_enabled(r->rid)) - peer_ctl = resctrl_arch_get_config(r, d, i, - peer_type); - else - peer_ctl = 0; - ctrl_val = resctrl_arch_get_config(r, d, i, - s->conf_type); - used_b |= ctrl_val | peer_ctl; - if (mode == RDT_MODE_SHAREABLE) - cfg->new_ctrl |= ctrl_val | peer_ctl; - } - } - if (d->plr && d->plr->cbm > 0) - used_b |= d->plr->cbm; - unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1); - unused_b &= BIT_MASK(r->cache.cbm_len) - 1; - cfg->new_ctrl |= unused_b; - /* - * Force the initial CBM to be valid, user can - * modify the CBM based on system availability. - */ - cfg->new_ctrl = cbm_ensure_valid(cfg->new_ctrl, r); - /* - * Assign the u32 CBM to an unsigned long to ensure that - * bitmap_weight() does not access out-of-bound memory. - */ - tmp_cbm = cfg->new_ctrl; - if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) { - rdt_last_cmd_printf("No space on %s:%d\n", s->name, d->hdr.id); - return -ENOSPC; - } - cfg->have_new_ctrl = true; - - return 0; -} - -/* - * Initialize cache resources with default values. - * - * A new RDT group is being created on an allocation capable (CAT) - * supporting system. Set this group up to start off with all usable - * allocations. - * - * If there are no more shareable bits available on any domain then - * the entire allocation will fail. - */ -static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid) -{ - struct rdt_ctrl_domain *d; - int ret; - - list_for_each_entry(d, &s->res->ctrl_domains, hdr.list) { - ret = __init_one_rdt_domain(d, s, closid); - if (ret < 0) - return ret; - } - - return 0; -} - -/* Initialize MBA resource with default values. */ -static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid) -{ - struct resctrl_staged_config *cfg; - struct rdt_ctrl_domain *d; - - list_for_each_entry(d, &r->ctrl_domains, hdr.list) { - if (is_mba_sc(r)) { - d->mbps_val[closid] = MBA_MAX_MBPS; - continue; - } - - cfg = &d->staged_config[CDP_NONE]; - cfg->new_ctrl = r->default_ctrl; - cfg->have_new_ctrl = true; - } -} - -/* Initialize the RDT group's allocations. */ -static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) -{ - struct resctrl_schema *s; - struct rdt_resource *r; - int ret = 0; - - rdt_staged_configs_clear(); - - list_for_each_entry(s, &resctrl_schema_all, list) { - r = s->res; - if (r->rid == RDT_RESOURCE_MBA || - r->rid == RDT_RESOURCE_SMBA) { - rdtgroup_init_mba(r, rdtgrp->closid); - if (is_mba_sc(r)) - continue; - } else { - ret = rdtgroup_init_cat(s, rdtgrp->closid); - if (ret < 0) - goto out; - } - - ret = resctrl_arch_update_domains(r, rdtgrp->closid); - if (ret < 0) { - rdt_last_cmd_puts("Failed to initialize allocations\n"); - goto out; - } - - } - - rdtgrp->mode = RDT_MODE_SHAREABLE; - -out: - rdt_staged_configs_clear(); - return ret; -} - -static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp) -{ - int ret; - - if (!resctrl_arch_mon_capable()) - return 0; - - ret = alloc_rmid(rdtgrp->closid); - if (ret < 0) { - rdt_last_cmd_puts("Out of RMIDs\n"); - return ret; - } - rdtgrp->mon.rmid = ret; - - ret = mkdir_mondata_all(rdtgrp->kn, rdtgrp, &rdtgrp->mon.mon_data_kn); - if (ret) { - rdt_last_cmd_puts("kernfs subdir error\n"); - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); - return ret; - } - - return 0; -} - -static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp) -{ - if (resctrl_arch_mon_capable()) - free_rmid(rgrp->closid, rgrp->mon.rmid); -} - -static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, - const char *name, umode_t mode, - enum rdt_group_type rtype, struct rdtgroup **r) -{ - struct rdtgroup *prdtgrp, *rdtgrp; - unsigned long files = 0; - struct kernfs_node *kn; - int ret; - - prdtgrp = rdtgroup_kn_lock_live(parent_kn); - if (!prdtgrp) { - ret = -ENODEV; - goto out_unlock; - } - - if (rtype == RDTMON_GROUP && - (prdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || - prdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)) { - ret = -EINVAL; - rdt_last_cmd_puts("Pseudo-locking in progress\n"); - goto out_unlock; - } - - /* allocate the rdtgroup. */ - rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL); - if (!rdtgrp) { - ret = -ENOSPC; - rdt_last_cmd_puts("Kernel out of memory\n"); - goto out_unlock; - } - *r = rdtgrp; - rdtgrp->mon.parent = prdtgrp; - rdtgrp->type = rtype; - INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list); - - /* kernfs creates the directory for rdtgrp */ - kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp); - if (IS_ERR(kn)) { - ret = PTR_ERR(kn); - rdt_last_cmd_puts("kernfs create error\n"); - goto out_free_rgrp; - } - rdtgrp->kn = kn; - - /* - * kernfs_remove() will drop the reference count on "kn" which - * will free it. But we still need it to stick around for the - * rdtgroup_kn_unlock(kn) call. Take one extra reference here, - * which will be dropped by kernfs_put() in rdtgroup_remove(). - */ - kernfs_get(kn); - - ret = rdtgroup_kn_set_ugid(kn); - if (ret) { - rdt_last_cmd_puts("kernfs perm error\n"); - goto out_destroy; - } - - if (rtype == RDTCTRL_GROUP) { - files = RFTYPE_BASE | RFTYPE_CTRL; - if (resctrl_arch_mon_capable()) - files |= RFTYPE_MON; - } else { - files = RFTYPE_BASE | RFTYPE_MON; - } - - ret = rdtgroup_add_files(kn, files); - if (ret) { - rdt_last_cmd_puts("kernfs fill error\n"); - goto out_destroy; - } - - /* - * The caller unlocks the parent_kn upon success. - */ - return 0; - -out_destroy: - kernfs_put(rdtgrp->kn); - kernfs_remove(rdtgrp->kn); -out_free_rgrp: - kfree(rdtgrp); -out_unlock: - rdtgroup_kn_unlock(parent_kn); - return ret; -} - -static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp) -{ - kernfs_remove(rgrp->kn); - rdtgroup_remove(rgrp); -} - -/* - * Create a monitor group under "mon_groups" directory of a control - * and monitor group(ctrl_mon). This is a resource group - * to monitor a subset of tasks and cpus in its parent ctrl_mon group. - */ -static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn, - const char *name, umode_t mode) -{ - struct rdtgroup *rdtgrp, *prgrp; - int ret; - - ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTMON_GROUP, &rdtgrp); - if (ret) - return ret; - - prgrp = rdtgrp->mon.parent; - rdtgrp->closid = prgrp->closid; - - ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp); - if (ret) { - mkdir_rdt_prepare_clean(rdtgrp); - goto out_unlock; - } - - kernfs_activate(rdtgrp->kn); - - /* - * Add the rdtgrp to the list of rdtgrps the parent - * ctrl_mon group has to track. - */ - list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list); - -out_unlock: - rdtgroup_kn_unlock(parent_kn); - return ret; -} - -/* - * These are rdtgroups created under the root directory. Can be used - * to allocate and monitor resources. - */ -static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, - const char *name, umode_t mode) -{ - struct rdtgroup *rdtgrp; - struct kernfs_node *kn; - u32 closid; - int ret; - - ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTCTRL_GROUP, &rdtgrp); - if (ret) - return ret; - - kn = rdtgrp->kn; - ret = closid_alloc(); - if (ret < 0) { - rdt_last_cmd_puts("Out of CLOSIDs\n"); - goto out_common_fail; - } - closid = ret; - ret = 0; - - rdtgrp->closid = closid; - - ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp); - if (ret) - goto out_closid_free; - - kernfs_activate(rdtgrp->kn); - - ret = rdtgroup_init_alloc(rdtgrp); - if (ret < 0) - goto out_rmid_free; - - list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); - - if (resctrl_arch_mon_capable()) { - /* - * Create an empty mon_groups directory to hold the subset - * of tasks and cpus to monitor. - */ - ret = mongroup_create_dir(kn, rdtgrp, "mon_groups", NULL); - if (ret) { - rdt_last_cmd_puts("kernfs subdir error\n"); - goto out_del_list; - } - } - - goto out_unlock; - -out_del_list: - list_del(&rdtgrp->rdtgroup_list); -out_rmid_free: - mkdir_rdt_prepare_rmid_free(rdtgrp); -out_closid_free: - closid_free(closid); -out_common_fail: - mkdir_rdt_prepare_clean(rdtgrp); -out_unlock: - rdtgroup_kn_unlock(parent_kn); - return ret; -} - -/* - * We allow creating mon groups only with in a directory called "mon_groups" - * which is present in every ctrl_mon group. Check if this is a valid - * "mon_groups" directory. - * - * 1. The directory should be named "mon_groups". - * 2. The mon group itself should "not" be named "mon_groups". - * This makes sure "mon_groups" directory always has a ctrl_mon group - * as parent. - */ -static bool is_mon_groups(struct kernfs_node *kn, const char *name) -{ - return (!strcmp(kn->name, "mon_groups") && - strcmp(name, "mon_groups")); -} - -static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, - umode_t mode) -{ - /* Do not accept '\n' to avoid unparsable situation. */ - if (strchr(name, '\n')) - return -EINVAL; - - /* - * If the parent directory is the root directory and RDT - * allocation is supported, add a control and monitoring - * subdirectory - */ - if (resctrl_arch_alloc_capable() && parent_kn == rdtgroup_default.kn) - return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode); - - /* - * If RDT monitoring is supported and the parent directory is a valid - * "mon_groups" directory, add a monitoring subdirectory. - */ - if (resctrl_arch_mon_capable() && is_mon_groups(parent_kn, name)) - return rdtgroup_mkdir_mon(parent_kn, name, mode); - - return -EPERM; -} - -static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) -{ - struct rdtgroup *prdtgrp = rdtgrp->mon.parent; - int cpu; - - /* Give any tasks back to the parent group */ - rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask); - - /* Update per cpu rmid of the moved CPUs first */ - for_each_cpu(cpu, &rdtgrp->cpu_mask) - per_cpu(pqr_state.default_rmid, cpu) = prdtgrp->mon.rmid; - /* - * Update the MSR on moved CPUs and CPUs which have moved - * task running on them. - */ - cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); - update_closid_rmid(tmpmask, NULL); - - rdtgrp->flags = RDT_DELETED; - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); - - /* - * Remove the rdtgrp from the parent ctrl_mon group's list - */ - WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list)); - list_del(&rdtgrp->mon.crdtgrp_list); - - kernfs_remove(rdtgrp->kn); - - return 0; -} - -static int rdtgroup_ctrl_remove(struct rdtgroup *rdtgrp) -{ - rdtgrp->flags = RDT_DELETED; - list_del(&rdtgrp->rdtgroup_list); - - kernfs_remove(rdtgrp->kn); - return 0; -} - -static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) -{ - int cpu; - - /* Give any tasks back to the default group */ - rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask); - - /* Give any CPUs back to the default group */ - cpumask_or(&rdtgroup_default.cpu_mask, - &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); - - /* Update per cpu closid and rmid of the moved CPUs first */ - for_each_cpu(cpu, &rdtgrp->cpu_mask) { - per_cpu(pqr_state.default_closid, cpu) = rdtgroup_default.closid; - per_cpu(pqr_state.default_rmid, cpu) = rdtgroup_default.mon.rmid; - } - - /* - * Update the MSR on moved CPUs and CPUs which have moved - * task running on them. - */ - cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); - update_closid_rmid(tmpmask, NULL); - - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); - closid_free(rdtgrp->closid); - - rdtgroup_ctrl_remove(rdtgrp); - - /* - * Free all the child monitor group rmids. - */ - free_all_child_rdtgrp(rdtgrp); - - return 0; -} - -static int rdtgroup_rmdir(struct kernfs_node *kn) -{ - struct kernfs_node *parent_kn = kn->parent; - struct rdtgroup *rdtgrp; - cpumask_var_t tmpmask; - int ret = 0; - - if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) - return -ENOMEM; - - rdtgrp = rdtgroup_kn_lock_live(kn); - if (!rdtgrp) { - ret = -EPERM; - goto out; - } - - /* - * If the rdtgroup is a ctrl_mon group and parent directory - * is the root directory, remove the ctrl_mon group. - * - * If the rdtgroup is a mon group and parent directory - * is a valid "mon_groups" directory, remove the mon group. - */ - if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn && - rdtgrp != &rdtgroup_default) { - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { - ret = rdtgroup_ctrl_remove(rdtgrp); - } else { - ret = rdtgroup_rmdir_ctrl(rdtgrp, tmpmask); - } - } else if (rdtgrp->type == RDTMON_GROUP && - is_mon_groups(parent_kn, kn->name)) { - ret = rdtgroup_rmdir_mon(rdtgrp, tmpmask); - } else { - ret = -EPERM; - } - -out: - rdtgroup_kn_unlock(kn); - free_cpumask_var(tmpmask); - return ret; -} - -/** - * mongrp_reparent() - replace parent CTRL_MON group of a MON group - * @rdtgrp: the MON group whose parent should be replaced - * @new_prdtgrp: replacement parent CTRL_MON group for @rdtgrp - * @cpus: cpumask provided by the caller for use during this call - * - * Replaces the parent CTRL_MON group for a MON group, resulting in all member - * tasks' CLOSID immediately changing to that of the new parent group. - * Monitoring data for the group is unaffected by this operation. - */ -static void mongrp_reparent(struct rdtgroup *rdtgrp, - struct rdtgroup *new_prdtgrp, - cpumask_var_t cpus) -{ - struct rdtgroup *prdtgrp = rdtgrp->mon.parent; - - WARN_ON(rdtgrp->type != RDTMON_GROUP); - WARN_ON(new_prdtgrp->type != RDTCTRL_GROUP); - - /* Nothing to do when simply renaming a MON group. */ - if (prdtgrp == new_prdtgrp) - return; - - WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list)); - list_move_tail(&rdtgrp->mon.crdtgrp_list, - &new_prdtgrp->mon.crdtgrp_list); - - rdtgrp->mon.parent = new_prdtgrp; - rdtgrp->closid = new_prdtgrp->closid; - - /* Propagate updated closid to all tasks in this group. */ - rdt_move_group_tasks(rdtgrp, rdtgrp, cpus); - - update_closid_rmid(cpus, NULL); -} - -static int rdtgroup_rename(struct kernfs_node *kn, - struct kernfs_node *new_parent, const char *new_name) -{ - struct rdtgroup *new_prdtgrp; - struct rdtgroup *rdtgrp; - cpumask_var_t tmpmask; - int ret; - - rdtgrp = kernfs_to_rdtgroup(kn); - new_prdtgrp = kernfs_to_rdtgroup(new_parent); - if (!rdtgrp || !new_prdtgrp) - return -ENOENT; - - /* Release both kernfs active_refs before obtaining rdtgroup mutex. */ - rdtgroup_kn_get(rdtgrp, kn); - rdtgroup_kn_get(new_prdtgrp, new_parent); - - mutex_lock(&rdtgroup_mutex); - - rdt_last_cmd_clear(); - - /* - * Don't allow kernfs_to_rdtgroup() to return a parent rdtgroup if - * either kernfs_node is a file. - */ - if (kernfs_type(kn) != KERNFS_DIR || - kernfs_type(new_parent) != KERNFS_DIR) { - rdt_last_cmd_puts("Source and destination must be directories"); - ret = -EPERM; - goto out; - } - - if ((rdtgrp->flags & RDT_DELETED) || (new_prdtgrp->flags & RDT_DELETED)) { - ret = -ENOENT; - goto out; - } - - if (rdtgrp->type != RDTMON_GROUP || !kn->parent || - !is_mon_groups(kn->parent, kn->name)) { - rdt_last_cmd_puts("Source must be a MON group\n"); - ret = -EPERM; - goto out; - } - - if (!is_mon_groups(new_parent, new_name)) { - rdt_last_cmd_puts("Destination must be a mon_groups subdirectory\n"); - ret = -EPERM; - goto out; - } - - /* - * If the MON group is monitoring CPUs, the CPUs must be assigned to the - * current parent CTRL_MON group and therefore cannot be assigned to - * the new parent, making the move illegal. - */ - if (!cpumask_empty(&rdtgrp->cpu_mask) && - rdtgrp->mon.parent != new_prdtgrp) { - rdt_last_cmd_puts("Cannot move a MON group that monitors CPUs\n"); - ret = -EPERM; - goto out; - } - - /* - * Allocate the cpumask for use in mongrp_reparent() to avoid the - * possibility of failing to allocate it after kernfs_rename() has - * succeeded. - */ - if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) { - ret = -ENOMEM; - goto out; - } - - /* - * Perform all input validation and allocations needed to ensure - * mongrp_reparent() will succeed before calling kernfs_rename(), - * otherwise it would be necessary to revert this call if - * mongrp_reparent() failed. - */ - ret = kernfs_rename(kn, new_parent, new_name); - if (!ret) - mongrp_reparent(rdtgrp, new_prdtgrp, tmpmask); - - free_cpumask_var(tmpmask); - -out: - mutex_unlock(&rdtgroup_mutex); - rdtgroup_kn_put(rdtgrp, kn); - rdtgroup_kn_put(new_prdtgrp, new_parent); - return ret; -} - -static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) -{ - if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3)) - seq_puts(seq, ",cdp"); - - if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) - seq_puts(seq, ",cdpl2"); - - if (is_mba_sc(&rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl)) - seq_puts(seq, ",mba_MBps"); - - if (resctrl_debug) - seq_puts(seq, ",debug"); - - return 0; -} - -static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = { - .mkdir = rdtgroup_mkdir, - .rmdir = rdtgroup_rmdir, - .rename = rdtgroup_rename, - .show_options = rdtgroup_show_options, -}; - -static int rdtgroup_setup_root(struct rdt_fs_context *ctx) -{ - rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops, - KERNFS_ROOT_CREATE_DEACTIVATED | - KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK, - &rdtgroup_default); - if (IS_ERR(rdt_root)) - return PTR_ERR(rdt_root); - - ctx->kfc.root = rdt_root; - rdtgroup_default.kn = kernfs_root_to_node(rdt_root); - - return 0; -} - -static void rdtgroup_destroy_root(void) -{ - kernfs_destroy_root(rdt_root); - rdtgroup_default.kn = NULL; -} - -static void __init rdtgroup_setup_default(void) -{ - mutex_lock(&rdtgroup_mutex); - - rdtgroup_default.closid = RESCTRL_RESERVED_CLOSID; - rdtgroup_default.mon.rmid = RESCTRL_RESERVED_RMID; - rdtgroup_default.type = RDTCTRL_GROUP; - INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list); - - list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups); - - mutex_unlock(&rdtgroup_mutex); -} - -static void domain_destroy_mon_state(struct rdt_mon_domain *d) -{ - bitmap_free(d->rmid_busy_llc); - kfree(d->mbm_total); - kfree(d->mbm_local); -} - -void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d) -{ - mutex_lock(&rdtgroup_mutex); - - if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) - mba_sc_domain_destroy(r, d); - - mutex_unlock(&rdtgroup_mutex); -} - -void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) -{ - mutex_lock(&rdtgroup_mutex); - - /* - * If resctrl is mounted, remove all the - * per domain monitor data directories. - */ - if (resctrl_mounted && resctrl_arch_mon_capable()) - rmdir_mondata_subdir_allrdtgrp(r, d); - - if (is_mbm_enabled()) - cancel_delayed_work(&d->mbm_over); - if (is_llc_occupancy_enabled() && has_busy_rmid(d)) { - /* - * When a package is going down, forcefully - * decrement rmid->ebusy. There is no way to know - * that the L3 was flushed and hence may lead to - * incorrect counts in rare scenarios, but leaving - * the RMID as busy creates RMID leaks if the - * package never comes back. - */ - __check_limbo(d, true); - cancel_delayed_work(&d->cqm_limbo); - } - - domain_destroy_mon_state(d); - - mutex_unlock(&rdtgroup_mutex); -} - -static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain *d) -{ - u32 idx_limit = resctrl_arch_system_num_rmid_idx(); - size_t tsize; - - if (is_llc_occupancy_enabled()) { - d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL); - if (!d->rmid_busy_llc) - return -ENOMEM; - } - if (is_mbm_total_enabled()) { - tsize = sizeof(*d->mbm_total); - d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL); - if (!d->mbm_total) { - bitmap_free(d->rmid_busy_llc); - return -ENOMEM; - } - } - if (is_mbm_local_enabled()) { - tsize = sizeof(*d->mbm_local); - d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL); - if (!d->mbm_local) { - bitmap_free(d->rmid_busy_llc); - kfree(d->mbm_total); - return -ENOMEM; - } - } - - return 0; -} - -int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d) -{ - int err = 0; - - mutex_lock(&rdtgroup_mutex); - - if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) { - /* RDT_RESOURCE_MBA is never mon_capable */ - err = mba_sc_domain_allocate(r, d); - } - - mutex_unlock(&rdtgroup_mutex); - - return err; -} - -int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) -{ - int err; - - mutex_lock(&rdtgroup_mutex); - - err = domain_setup_mon_state(r, d); - if (err) - goto out_unlock; - - if (is_mbm_enabled()) { - INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow); - mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL, - RESCTRL_PICK_ANY_CPU); - } - - if (is_llc_occupancy_enabled()) - INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); - - /* - * If the filesystem is not mounted then only the default resource group - * exists. Creation of its directories is deferred until mount time - * by rdt_get_tree() calling mkdir_mondata_all(). - * If resctrl is mounted, add per domain monitor data directories. - */ - if (resctrl_mounted && resctrl_arch_mon_capable()) - mkdir_mondata_subdir_allrdtgrp(r, d); - -out_unlock: - mutex_unlock(&rdtgroup_mutex); - - return err; -} - -void resctrl_online_cpu(unsigned int cpu) -{ - mutex_lock(&rdtgroup_mutex); - /* The CPU is set in default rdtgroup after online. */ - cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); - mutex_unlock(&rdtgroup_mutex); -} - -static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) -{ - struct rdtgroup *cr; - - list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) { - if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) - break; - } -} - -void resctrl_offline_cpu(unsigned int cpu) -{ - struct rdt_resource *l3 = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; - struct rdt_mon_domain *d; - struct rdtgroup *rdtgrp; - - mutex_lock(&rdtgroup_mutex); - list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { - if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) { - clear_childcpus(rdtgrp, cpu); - break; - } - } - - if (!l3->mon_capable) - goto out_unlock; - - d = get_mon_domain_from_cpu(cpu, l3); - if (d) { - if (is_mbm_enabled() && cpu == d->mbm_work_cpu) { - cancel_delayed_work(&d->mbm_over); - mbm_setup_overflow_handler(d, 0, cpu); - } - if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu && - has_busy_rmid(d)) { - cancel_delayed_work(&d->cqm_limbo); - cqm_setup_limbo_handler(d, 0, cpu); - } - } - -out_unlock: - mutex_unlock(&rdtgroup_mutex); -} - -/* - * rdtgroup_init - rdtgroup initialization - * - * Setup resctrl file system including set up root, create mount point, - * register rdtgroup filesystem, and initialize files under root directory. - * - * Return: 0 on success or -errno - */ -int __init rdtgroup_init(void) -{ - int ret = 0; - - seq_buf_init(&last_cmd_status, last_cmd_status_buf, - sizeof(last_cmd_status_buf)); - - rdtgroup_setup_default(); - - ret = sysfs_create_mount_point(fs_kobj, "resctrl"); - if (ret) - return ret; - - ret = register_filesystem(&rdt_fs_type); - if (ret) - goto cleanup_mountpoint; - - /* - * Adding the resctrl debugfs directory here may not be ideal since - * it would let the resctrl debugfs directory appear on the debugfs - * filesystem before the resctrl filesystem is mounted. - * It may also be ok since that would enable debugging of RDT before - * resctrl is mounted. - * The reason why the debugfs directory is created here and not in - * rdt_get_tree() is because rdt_get_tree() takes rdtgroup_mutex and - * during the debugfs directory creation also &sb->s_type->i_mutex_key - * (the lockdep class of inode->i_rwsem). Other filesystem - * interactions (eg. SyS_getdents) have the lock ordering: - * &sb->s_type->i_mutex_key --> &mm->mmap_lock - * During mmap(), called with &mm->mmap_lock, the rdtgroup_mutex - * is taken, thus creating dependency: - * &mm->mmap_lock --> rdtgroup_mutex for the latter that can cause - * issues considering the other two lock dependencies. - * By creating the debugfs directory here we avoid a dependency - * that may cause deadlock (even though file operations cannot - * occur until the filesystem is mounted, but I do not know how to - * tell lockdep that). - */ - debugfs_resctrl = debugfs_create_dir("resctrl", NULL); - - return 0; - -cleanup_mountpoint: - sysfs_remove_mount_point(fs_kobj, "resctrl"); - - return ret; -} - -void __exit rdtgroup_exit(void) -{ - debugfs_remove_recursive(debugfs_resctrl); - unregister_filesystem(&rdt_fs_type); - sysfs_remove_mount_point(fs_kobj, "resctrl"); + return; } diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 16f3ca30626a..42c7eac0c387 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -27,6 +27,8 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, { X86_FEATURE_INTEL_PPIN, CPUID_EBX, 0, 0x00000007, 1 }, + { X86_FEATURE_MSR_IMM, CPUID_ECX, 5, 0x00000007, 1 }, + { X86_FEATURE_APX, CPUID_EDX, 21, 0x00000007, 1 }, { X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 }, { X86_FEATURE_BHI_CTRL, CPUID_EDX, 4, 0x00000007, 2 }, { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 }, @@ -41,19 +43,29 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 }, { X86_FEATURE_SGX1, CPUID_EAX, 0, 0x00000012, 0 }, { X86_FEATURE_SGX2, CPUID_EAX, 1, 0x00000012, 0 }, + { X86_FEATURE_SGX_EUPDATESVN, CPUID_EAX, 10, 0x00000012, 0 }, { X86_FEATURE_SGX_EDECCSSA, CPUID_EAX, 11, 0x00000012, 0 }, + { X86_FEATURE_OVERFLOW_RECOV, CPUID_EBX, 0, 0x80000007, 0 }, + { X86_FEATURE_SUCCOR, CPUID_EBX, 1, 0x80000007, 0 }, + { X86_FEATURE_SMCA, CPUID_EBX, 3, 0x80000007, 0 }, { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, { X86_FEATURE_AMD_FAST_CPPC, CPUID_EDX, 15, 0x80000007, 0 }, { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, + { X86_FEATURE_X2AVIC_EXT, CPUID_ECX, 6, 0x8000000a, 0 }, + { X86_FEATURE_COHERENCY_SFW_NO, CPUID_EBX, 31, 0x8000001f, 0 }, { X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 }, { X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 }, + { X86_FEATURE_ABMC, CPUID_EBX, 5, 0x80000020, 0 }, + { X86_FEATURE_SDCIAE, CPUID_EBX, 6, 0x80000020, 0 }, + { X86_FEATURE_TSA_SQ_NO, CPUID_ECX, 1, 0x80000021, 0 }, + { X86_FEATURE_TSA_L1_NO, CPUID_ECX, 2, 0x80000021, 0 }, { X86_FEATURE_AMD_WORKLOAD_CLASS, CPUID_EAX, 22, 0x80000021, 0 }, { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 }, { X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 }, { X86_FEATURE_AMD_LBR_PMC_FREEZE, CPUID_EAX, 2, 0x80000022, 0 }, - { X86_FEATURE_AMD_HETEROGENEOUS_CORES, CPUID_EAX, 30, 0x80000026, 0 }, + { X86_FEATURE_AMD_HTR_CORES, CPUID_EAX, 30, 0x80000026, 0 }, { 0, 0, 0, 0, 0 } }; diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c index 22b65a5f5ec6..a42c7180900b 100644 --- a/arch/x86/kernel/cpu/sgx/driver.c +++ b/arch/x86/kernel/cpu/sgx/driver.c @@ -14,7 +14,7 @@ u64 sgx_attributes_reserved_mask; u64 sgx_xfrm_reserved_mask = ~0x3; u32 sgx_misc_reserved_mask; -static int sgx_open(struct inode *inode, struct file *file) +static int __sgx_open(struct inode *inode, struct file *file) { struct sgx_encl *encl; int ret; @@ -41,6 +41,23 @@ static int sgx_open(struct inode *inode, struct file *file) return 0; } +static int sgx_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = sgx_inc_usage_count(); + if (ret) + return ret; + + ret = __sgx_open(inode, file); + if (ret) { + sgx_dec_usage_count(); + return ret; + } + + return 0; +} + static int sgx_release(struct inode *inode, struct file *file) { struct sgx_encl *encl = file->private_data; @@ -113,7 +130,7 @@ static unsigned long sgx_get_unmapped_area(struct file *file, if (flags & MAP_FIXED) return addr; - return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags); + return mm_get_unmapped_area(file, addr, len, pgoff, flags); } #ifdef CONFIG_COMPAT @@ -150,13 +167,15 @@ int __init sgx_drv_init(void) u64 xfrm_mask; int ret; - if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) + if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) { + pr_info("SGX disabled: SGX launch control CPU feature is not available, /dev/sgx_enclave disabled.\n"); return -ENODEV; + } cpuid_count(SGX_CPUID, 0, &eax, &ebx, &ecx, &edx); if (!(eax & 1)) { - pr_err("SGX disabled: SGX1 instruction support not available.\n"); + pr_info("SGX disabled: SGX1 instruction support not available, /dev/sgx_enclave disabled.\n"); return -ENODEV; } @@ -173,8 +192,10 @@ int __init sgx_drv_init(void) } ret = misc_register(&sgx_dev_enclave); - if (ret) + if (ret) { + pr_info("SGX disabled: Unable to register the /dev/sgx_enclave driver (%d).\n", ret); return ret; + } return 0; } diff --git a/arch/x86/kernel/cpu/sgx/driver.h b/arch/x86/kernel/cpu/sgx/driver.h index 4eddb4d571ef..30f39f92c98f 100644 --- a/arch/x86/kernel/cpu/sgx/driver.h +++ b/arch/x86/kernel/cpu/sgx/driver.h @@ -2,7 +2,6 @@ #ifndef __ARCH_SGX_DRIVER_H__ #define __ARCH_SGX_DRIVER_H__ -#include <crypto/hash.h> #include <linux/kref.h> #include <linux/mmu_notifier.h> #include <linux/radix-tree.h> diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 279148e72459..cf149b9f4916 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -279,7 +279,7 @@ static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl, static struct sgx_encl_page *sgx_encl_load_page_in_vma(struct sgx_encl *encl, unsigned long addr, - unsigned long vm_flags) + vm_flags_t vm_flags) { unsigned long vm_prot_bits = vm_flags & VM_ACCESS_FLAGS; struct sgx_encl_page *entry; @@ -520,9 +520,9 @@ static void sgx_vma_open(struct vm_area_struct *vma) * Return: 0 on success, -EACCES otherwise */ int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start, - unsigned long end, unsigned long vm_flags) + unsigned long end, vm_flags_t vm_flags) { - unsigned long vm_prot_bits = vm_flags & VM_ACCESS_FLAGS; + vm_flags_t vm_prot_bits = vm_flags & VM_ACCESS_FLAGS; struct sgx_encl_page *page; unsigned long count = 0; int ret = 0; @@ -605,7 +605,7 @@ static int sgx_encl_debug_write(struct sgx_encl *encl, struct sgx_encl_page *pag */ static struct sgx_encl_page *sgx_encl_reserve_page(struct sgx_encl *encl, unsigned long addr, - unsigned long vm_flags) + vm_flags_t vm_flags) { struct sgx_encl_page *entry; @@ -765,6 +765,7 @@ void sgx_encl_release(struct kref *ref) WARN_ON_ONCE(encl->secs.epc_page); kfree(encl); + sgx_dec_usage_count(); } /* diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h index f94ff14c9486..8ff47f6652b9 100644 --- a/arch/x86/kernel/cpu/sgx/encl.h +++ b/arch/x86/kernel/cpu/sgx/encl.h @@ -101,7 +101,7 @@ static inline int sgx_encl_find(struct mm_struct *mm, unsigned long addr, } int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start, - unsigned long end, unsigned long vm_flags); + unsigned long end, vm_flags_t vm_flags); bool current_is_ksgxd(void); void sgx_encl_release(struct kref *ref); diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h index 99004b02e2ed..74be751199a4 100644 --- a/arch/x86/kernel/cpu/sgx/encls.h +++ b/arch/x86/kernel/cpu/sgx/encls.h @@ -68,7 +68,7 @@ static inline bool encls_failed(int ret) ({ \ int ret; \ asm volatile( \ - "1: .byte 0x0f, 0x01, 0xcf;\n\t" \ + "1: encls\n" \ "2:\n" \ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_SGX) \ : "=a"(ret) \ @@ -111,8 +111,8 @@ static inline bool encls_failed(int ret) ({ \ int ret; \ asm volatile( \ - "1: .byte 0x0f, 0x01, 0xcf;\n\t" \ - " xor %%eax,%%eax;\n" \ + "1: encls\n\t" \ + "xor %%eax,%%eax\n" \ "2:\n" \ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_SGX) \ : "=a"(ret), "=b"(rbx_out) \ @@ -233,4 +233,9 @@ static inline int __eaug(struct sgx_pageinfo *pginfo, void *addr) return __encls_2(EAUG, pginfo, addr); } +/* Attempt to update CPUSVN at runtime. */ +static inline int __eupdatesvn(void) +{ + return __encls_ret_1(EUPDATESVN, ""); +} #endif /* _X86_ENCLS_H */ diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index b65ab214bdf5..66f1efa16fbb 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -3,6 +3,7 @@ #include <asm/mman.h> #include <asm/sgx.h> +#include <crypto/sha2.h> #include <linux/mman.h> #include <linux/delay.h> #include <linux/file.h> @@ -64,6 +65,13 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs) struct file *backing; long ret; + /* + * ECREATE would detect this too, but checking here also ensures + * that the 'encl_size' calculations below can never overflow. + */ + if (!is_power_of_2(secs->size)) + return -EINVAL; + va_page = sgx_encl_grow(encl, true); if (IS_ERR(va_page)) return PTR_ERR(va_page); @@ -456,31 +464,6 @@ static long sgx_ioc_enclave_add_pages(struct sgx_encl *encl, void __user *arg) return ret; } -static int __sgx_get_key_hash(struct crypto_shash *tfm, const void *modulus, - void *hash) -{ - SHASH_DESC_ON_STACK(shash, tfm); - - shash->tfm = tfm; - - return crypto_shash_digest(shash, modulus, SGX_MODULUS_SIZE, hash); -} - -static int sgx_get_key_hash(const void *modulus, void *hash) -{ - struct crypto_shash *tfm; - int ret; - - tfm = crypto_alloc_shash("sha256", 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(tfm)) - return PTR_ERR(tfm); - - ret = __sgx_get_key_hash(tfm, modulus, hash); - - crypto_free_shash(tfm); - return ret; -} - static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct, void *token) { @@ -516,9 +499,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct, sgx_xfrm_reserved_mask) return -EINVAL; - ret = sgx_get_key_hash(sigstruct->modulus, mrsigner); - if (ret) - return ret; + sha256(sigstruct->modulus, SGX_MODULUS_SIZE, (u8 *)mrsigner); mutex_lock(&encl->lock); diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 8ce352fc72ac..dc73194416ac 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -5,6 +5,7 @@ #include <linux/freezer.h> #include <linux/highmem.h> #include <linux/kthread.h> +#include <linux/kvm_types.h> #include <linux/miscdevice.h> #include <linux/node.h> #include <linux/pagemap.h> @@ -14,7 +15,9 @@ #include <linux/slab.h> #include <linux/sysfs.h> #include <linux/vmalloc.h> +#include <asm/msr.h> #include <asm/sgx.h> +#include <asm/archrandom.h> #include "driver.h" #include "encl.h" #include "encls.h" @@ -719,6 +722,8 @@ int arch_memory_failure(unsigned long pfn, int flags) goto out; } + sgx_unmark_page_reclaimable(page); + /* * TBD: Add additional plumbing to enable pre-emptive * action for asynchronous poison notification. Until @@ -871,7 +876,7 @@ void sgx_update_lepubkeyhash(u64 *lepubkeyhash) WARN_ON_ONCE(preemptible()); for (i = 0; i < 4; i++) - wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]); + wrmsrq(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]); } const struct file_operations sgx_provision_fops = { @@ -912,7 +917,107 @@ int sgx_set_attribute(unsigned long *allowed_attributes, *allowed_attributes |= SGX_ATTR_PROVISIONKEY; return 0; } -EXPORT_SYMBOL_GPL(sgx_set_attribute); +EXPORT_SYMBOL_FOR_KVM(sgx_set_attribute); + +/* Counter to count the active SGX users */ +static int sgx_usage_count; + +/** + * sgx_update_svn() - Attempt to call ENCLS[EUPDATESVN]. + * + * This instruction attempts to update CPUSVN to the + * currently loaded microcode update SVN and generate new + * cryptographic assets. + * + * Return: + * * %0: - Success or not supported + * * %-EAGAIN: - Can be safely retried, failure is due to lack of + * * entropy in RNG + * * %-EIO: - Unexpected error, retries are not advisable + */ +static int sgx_update_svn(void) +{ + int ret; + + /* + * If EUPDATESVN is not available, it is ok to + * silently skip it to comply with legacy behavior. + */ + if (!cpu_feature_enabled(X86_FEATURE_SGX_EUPDATESVN)) + return 0; + + /* + * EPC is guaranteed to be empty when there are no users. + * Ensure we are on our first user before proceeding further. + */ + WARN(sgx_usage_count, "Elevated usage count when calling EUPDATESVN\n"); + + for (int i = 0; i < RDRAND_RETRY_LOOPS; i++) { + ret = __eupdatesvn(); + + /* Stop on success or unexpected errors: */ + if (ret != SGX_INSUFFICIENT_ENTROPY) + break; + } + + switch (ret) { + case 0: + /* + * SVN successfully updated. + * Let users know when the update was successful. + */ + pr_info("SVN updated successfully\n"); + return 0; + case SGX_NO_UPDATE: + /* + * SVN update failed since the current SVN is + * not newer than CPUSVN. This is the most + * common case and indicates no harm. + */ + return 0; + case SGX_INSUFFICIENT_ENTROPY: + /* + * SVN update failed due to lack of entropy in DRNG. + * Indicate to userspace that it should retry. + */ + return -EAGAIN; + default: + break; + } + + /* + * EUPDATESVN was called when EPC is empty, all other error + * codes are unexpected. + */ + ENCLS_WARN(ret, "EUPDATESVN"); + return -EIO; +} + +/* Mutex to ensure no concurrent EPC accesses during EUPDATESVN */ +static DEFINE_MUTEX(sgx_svn_lock); + +int sgx_inc_usage_count(void) +{ + int ret; + + guard(mutex)(&sgx_svn_lock); + + if (!sgx_usage_count) { + ret = sgx_update_svn(); + if (ret) + return ret; + } + + sgx_usage_count++; + + return 0; +} + +void sgx_dec_usage_count(void) +{ + guard(mutex)(&sgx_svn_lock); + sgx_usage_count--; +} static int __init sgx_init(void) { diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h index d2dad21259a8..f5940393d9bd 100644 --- a/arch/x86/kernel/cpu/sgx/sgx.h +++ b/arch/x86/kernel/cpu/sgx/sgx.h @@ -102,6 +102,9 @@ static inline int __init sgx_vepc_init(void) } #endif +int sgx_inc_usage_count(void); +void sgx_dec_usage_count(void); + void sgx_update_lepubkeyhash(u64 *lepubkeyhash); #endif /* _X86_SGX_H */ diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c index 7aaa3652e31d..8de1f1a755f2 100644 --- a/arch/x86/kernel/cpu/sgx/virt.c +++ b/arch/x86/kernel/cpu/sgx/virt.c @@ -5,6 +5,7 @@ * Copyright(c) 2021 Intel Corporation. */ +#include <linux/kvm_types.h> #include <linux/miscdevice.h> #include <linux/mm.h> #include <linux/mman.h> @@ -255,10 +256,11 @@ static int sgx_vepc_release(struct inode *inode, struct file *file) xa_destroy(&vepc->page_array); kfree(vepc); + sgx_dec_usage_count(); return 0; } -static int sgx_vepc_open(struct inode *inode, struct file *file) +static int __sgx_vepc_open(struct inode *inode, struct file *file) { struct sgx_vepc *vepc; @@ -273,6 +275,23 @@ static int sgx_vepc_open(struct inode *inode, struct file *file) return 0; } +static int sgx_vepc_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = sgx_inc_usage_count(); + if (ret) + return ret; + + ret = __sgx_vepc_open(inode, file); + if (ret) { + sgx_dec_usage_count(); + return ret; + } + + return 0; +} + static long sgx_vepc_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -363,7 +382,7 @@ int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs, WARN_ON_ONCE(ret); return 0; } -EXPORT_SYMBOL_GPL(sgx_virt_ecreate); +EXPORT_SYMBOL_FOR_KVM(sgx_virt_ecreate); static int __sgx_virt_einit(void __user *sigstruct, void __user *token, void __user *secs) @@ -432,4 +451,4 @@ int sgx_virt_einit(void __user *sigstruct, void __user *token, return ret; } -EXPORT_SYMBOL_GPL(sgx_virt_einit); +EXPORT_SYMBOL_FOR_KVM(sgx_virt_einit); diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index 621a151ccf7d..f55ea3cdbf88 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -30,6 +30,7 @@ #include <asm/hypervisor.h> #include <asm/io_apic.h> #include <asm/mpspec.h> +#include <asm/msr.h> #include <asm/smp.h> #include "cpu.h" @@ -74,15 +75,11 @@ bool arch_match_cpu_phys_id(int cpu, u64 phys_id) return phys_id == (u64)cpuid_to_apicid[cpu]; } -#ifdef CONFIG_SMP static void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid) { if (!(apicid & (__max_threads_per_core - 1))) cpumask_set_cpu(cpu, &__cpu_primary_thread_mask); } -#else -static inline void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid) { } -#endif /* * Convert the APIC ID to a domain level ID by masking out the low bits @@ -154,7 +151,7 @@ static __init bool check_for_real_bsp(u32 apic_id) * kernel must rely on the firmware enumeration order. */ if (has_apic_base) { - rdmsrl(MSR_IA32_APICBASE, msr); + rdmsrq(MSR_IA32_APICBASE, msr); is_bsp = !!(msr & MSR_IA32_APICBASE_BSP); } @@ -371,6 +368,19 @@ unsigned int topology_unit_count(u32 apicid, enum x86_topology_domains which_uni return topo_unit_count(lvlid, at_level, apic_maps[which_units].map); } +#ifdef CONFIG_SMP +int topology_get_primary_thread(unsigned int cpu) +{ + u32 apic_id = cpuid_to_apicid[cpu]; + + /* + * Get the core domain level APIC id, which is the primary thread + * and return the CPU number assigned to it. + */ + return topo_lookup_cpuid(topo_apicid(apic_id, TOPO_CORE_DOMAIN)); +} +#endif + #ifdef CONFIG_ACPI_HOTPLUG_CPU /** * topology_hotplug_apic - Handle a physical hotplugged APIC after boot @@ -428,8 +438,8 @@ void __init topology_apply_cmdline_limits_early(void) { unsigned int possible = nr_cpu_ids; - /* 'maxcpus=0' 'nosmp' 'nolapic' 'disableapic' 'noapic' */ - if (!setup_max_cpus || ioapic_is_disabled || apic_is_disabled) + /* 'maxcpus=0' 'nosmp' 'nolapic' */ + if (!setup_max_cpus || apic_is_disabled) possible = 1; /* 'possible_cpus=N' */ @@ -443,7 +453,7 @@ void __init topology_apply_cmdline_limits_early(void) static __init bool restrict_to_up(void) { - if (!smp_found_config || ioapic_is_disabled) + if (!smp_found_config) return true; /* * XEN PV is special as it does not advertise the local APIC diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c index 03b3c9c3a45e..6ac097e13106 100644 --- a/arch/x86/kernel/cpu/topology_amd.c +++ b/arch/x86/kernel/cpu/topology_amd.c @@ -3,6 +3,7 @@ #include <asm/apic.h> #include <asm/memtype.h> +#include <asm/msr.h> #include <asm/processor.h> #include "cpu.h" @@ -58,7 +59,7 @@ static void store_node(struct topo_scan *tscan, u16 nr_nodes, u16 node_id) tscan->amd_node_id = node_id; } -static bool parse_8000_001e(struct topo_scan *tscan, bool has_topoext) +static bool parse_8000_001e(struct topo_scan *tscan) { struct { // eax @@ -80,20 +81,25 @@ static bool parse_8000_001e(struct topo_scan *tscan, bool has_topoext) cpuid_leaf(0x8000001e, &leaf); - tscan->c->topo.initial_apicid = leaf.ext_apic_id; - /* - * If leaf 0xb is available, then the domain shifts are set - * already and nothing to do here. Only valid for family >= 0x17. + * If leaf 0xb/0x26 is available, then the APIC ID and the domain + * shifts are set already. */ - if (!has_topoext && tscan->c->x86 >= 0x17) { + if (!cpu_feature_enabled(X86_FEATURE_XTOPOLOGY)) { + tscan->c->topo.initial_apicid = leaf.ext_apic_id; + /* - * Leaf 0x80000008 set the CORE domain shift already. - * Update the SMT domain, but do not propagate it. + * Leaf 0x8000008 sets the CORE domain shift but not the + * SMT domain shift. On CPUs with family >= 0x17, there + * might be hyperthreads. */ - unsigned int nthreads = leaf.core_nthreads + 1; + if (tscan->c->x86 >= 0x17) { + /* Update the SMT domain, but do not propagate it. */ + unsigned int nthreads = leaf.core_nthreads + 1; - topology_update_dom(tscan, TOPO_SMT_DOMAIN, get_count_order(nthreads), nthreads); + topology_update_dom(tscan, TOPO_SMT_DOMAIN, + get_count_order(nthreads), nthreads); + } } store_node(tscan, leaf.nnodes_per_socket + 1, leaf.node_id); @@ -133,7 +139,7 @@ static void parse_fam10h_node_id(struct topo_scan *tscan) if (!boot_cpu_has(X86_FEATURE_NODEID_MSR)) return; - rdmsrl(MSR_FAM10H_NODE_ID, nid.msr); + rdmsrq(MSR_FAM10H_NODE_ID, nid.msr); store_node(tscan, nid.nodes_per_pkg + 1, nid.node_id); tscan->c->topo.llc_id = nid.node_id; } @@ -157,11 +163,12 @@ static void topoext_fixup(struct topo_scan *tscan) c->x86 != 0x15 || c->x86_model < 0x10 || c->x86_model > 0x6f) return; - if (msr_set_bit(0xc0011005, 54) <= 0) + if (msr_set_bit(MSR_AMD64_CPUID_EXT_FEAT, + MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT_BIT) <= 0) return; - rdmsrl(0xc0011005, msrval); - if (msrval & BIT_64(54)) { + rdmsrq(MSR_AMD64_CPUID_EXT_FEAT, msrval); + if (msrval & MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT) { set_cpu_cap(c, X86_FEATURE_TOPOEXT); pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n"); } @@ -169,27 +176,27 @@ static void topoext_fixup(struct topo_scan *tscan) static void parse_topology_amd(struct topo_scan *tscan) { - bool has_topoext = false; + if (cpu_feature_enabled(X86_FEATURE_AMD_HTR_CORES)) + tscan->c->topo.cpu_type = cpuid_ebx(0x80000026); /* - * If the extended topology leaf 0x8000_001e is available - * try to get SMT, CORE, TILE, and DIE shifts from extended + * Try to get SMT, CORE, TILE, and DIE shifts from extended * CPUID leaf 0x8000_0026 on supported processors first. If * extended CPUID leaf 0x8000_0026 is not supported, try to - * get SMT and CORE shift from leaf 0xb first, then try to - * get the CORE shift from leaf 0x8000_0008. + * get SMT and CORE shift from leaf 0xb. If either leaf is + * available, cpu_parse_topology_ext() will return true. + * + * If XTOPOLOGY leaves (0x26/0xb) are not available, try to + * get the CORE shift from leaf 0x8000_0008 first. */ - if (cpu_feature_enabled(X86_FEATURE_TOPOEXT)) - has_topoext = cpu_parse_topology_ext(tscan); - - if (cpu_feature_enabled(X86_FEATURE_AMD_HETEROGENEOUS_CORES)) - tscan->c->topo.cpu_type = cpuid_ebx(0x80000026); - - if (!has_topoext && !parse_8000_0008(tscan)) + if (!cpu_parse_topology_ext(tscan) && !parse_8000_0008(tscan)) return; - /* Prefer leaf 0x8000001e if available */ - if (parse_8000_001e(tscan, has_topoext)) + /* + * Prefer leaf 0x8000001e if available to get the SMT shift and + * the initial APIC ID if XTOPOLOGY leaves are not available. + */ + if (parse_8000_001e(tscan)) return; /* Try the NODEID MSR */ diff --git a/arch/x86/kernel/cpu/topology_common.c b/arch/x86/kernel/cpu/topology_common.c index 8277c64f88db..71625795d711 100644 --- a/arch/x86/kernel/cpu/topology_common.c +++ b/arch/x86/kernel/cpu/topology_common.c @@ -16,6 +16,9 @@ EXPORT_SYMBOL_GPL(x86_topo_system); unsigned int __amd_nodes_per_pkg __ro_after_init; EXPORT_SYMBOL_GPL(__amd_nodes_per_pkg); +/* CPUs which are the primary SMT threads */ +struct cpumask __cpu_primary_thread_mask __read_mostly; + void topology_set_dom(struct topo_scan *tscan, enum x86_topology_domains dom, unsigned int shift, unsigned int ncpus) { @@ -185,6 +188,7 @@ static void topo_set_ids(struct topo_scan *tscan, bool early) if (!early) { c->topo.logical_pkg_id = topology_get_logical_id(apicid, TOPO_PKG_DOMAIN); c->topo.logical_die_id = topology_get_logical_id(apicid, TOPO_DIE_DOMAIN); + c->topo.logical_core_id = topology_get_logical_id(apicid, TOPO_CORE_DOMAIN); } /* Package relative core ID */ diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c index b31ee4f1657a..209b5a22d880 100644 --- a/arch/x86/kernel/cpu/tsx.c +++ b/arch/x86/kernel/cpu/tsx.c @@ -12,19 +12,30 @@ #include <asm/cmdline.h> #include <asm/cpu.h> +#include <asm/msr.h> #include "cpu.h" #undef pr_fmt #define pr_fmt(fmt) "tsx: " fmt -enum tsx_ctrl_states tsx_ctrl_state __ro_after_init = TSX_CTRL_NOT_SUPPORTED; +enum tsx_ctrl_states { + TSX_CTRL_AUTO, + TSX_CTRL_ENABLE, + TSX_CTRL_DISABLE, + TSX_CTRL_RTM_ALWAYS_ABORT, + TSX_CTRL_NOT_SUPPORTED, +}; + +static enum tsx_ctrl_states tsx_ctrl_state __ro_after_init = + IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_AUTO) ? TSX_CTRL_AUTO : + IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_OFF) ? TSX_CTRL_DISABLE : TSX_CTRL_ENABLE; static void tsx_disable(void) { u64 tsx; - rdmsrl(MSR_IA32_TSX_CTRL, tsx); + rdmsrq(MSR_IA32_TSX_CTRL, tsx); /* Force all transactions to immediately abort */ tsx |= TSX_CTRL_RTM_DISABLE; @@ -37,14 +48,14 @@ static void tsx_disable(void) */ tsx |= TSX_CTRL_CPUID_CLEAR; - wrmsrl(MSR_IA32_TSX_CTRL, tsx); + wrmsrq(MSR_IA32_TSX_CTRL, tsx); } static void tsx_enable(void) { u64 tsx; - rdmsrl(MSR_IA32_TSX_CTRL, tsx); + rdmsrq(MSR_IA32_TSX_CTRL, tsx); /* Enable the RTM feature in the cpu */ tsx &= ~TSX_CTRL_RTM_DISABLE; @@ -56,7 +67,7 @@ static void tsx_enable(void) */ tsx &= ~TSX_CTRL_CPUID_CLEAR; - wrmsrl(MSR_IA32_TSX_CTRL, tsx); + wrmsrq(MSR_IA32_TSX_CTRL, tsx); } static enum tsx_ctrl_states x86_get_tsx_auto_mode(void) @@ -115,13 +126,13 @@ static void tsx_clear_cpuid(void) */ if (boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT) && boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)) { - rdmsrl(MSR_TSX_FORCE_ABORT, msr); + rdmsrq(MSR_TSX_FORCE_ABORT, msr); msr |= MSR_TFA_TSX_CPUID_CLEAR; - wrmsrl(MSR_TSX_FORCE_ABORT, msr); + wrmsrq(MSR_TSX_FORCE_ABORT, msr); } else if (cpu_feature_enabled(X86_FEATURE_MSR_TSX_CTRL)) { - rdmsrl(MSR_IA32_TSX_CTRL, msr); + rdmsrq(MSR_IA32_TSX_CTRL, msr); msr |= TSX_CTRL_CPUID_CLEAR; - wrmsrl(MSR_IA32_TSX_CTRL, msr); + wrmsrq(MSR_IA32_TSX_CTRL, msr); } } @@ -146,20 +157,37 @@ static void tsx_dev_mode_disable(void) !cpu_feature_enabled(X86_FEATURE_SRBDS_CTRL)) return; - rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl); + rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl); if (mcu_opt_ctrl & RTM_ALLOW) { mcu_opt_ctrl &= ~RTM_ALLOW; - wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl); + wrmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl); setup_force_cpu_cap(X86_FEATURE_RTM_ALWAYS_ABORT); } } -void __init tsx_init(void) +static int __init tsx_parse_cmdline(char *str) { - char arg[5] = {}; - int ret; + if (!str) + return -EINVAL; + + if (!strcmp(str, "on")) { + tsx_ctrl_state = TSX_CTRL_ENABLE; + } else if (!strcmp(str, "off")) { + tsx_ctrl_state = TSX_CTRL_DISABLE; + } else if (!strcmp(str, "auto")) { + tsx_ctrl_state = TSX_CTRL_AUTO; + } else { + tsx_ctrl_state = TSX_CTRL_DISABLE; + pr_err("invalid option, defaulting to off\n"); + } + + return 0; +} +early_param("tsx", tsx_parse_cmdline); +void __init tsx_init(void) +{ tsx_dev_mode_disable(); /* @@ -193,27 +221,8 @@ void __init tsx_init(void) return; } - ret = cmdline_find_option(boot_command_line, "tsx", arg, sizeof(arg)); - if (ret >= 0) { - if (!strcmp(arg, "on")) { - tsx_ctrl_state = TSX_CTRL_ENABLE; - } else if (!strcmp(arg, "off")) { - tsx_ctrl_state = TSX_CTRL_DISABLE; - } else if (!strcmp(arg, "auto")) { - tsx_ctrl_state = x86_get_tsx_auto_mode(); - } else { - tsx_ctrl_state = TSX_CTRL_DISABLE; - pr_err("invalid option, defaulting to off\n"); - } - } else { - /* tsx= not provided */ - if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_AUTO)) - tsx_ctrl_state = x86_get_tsx_auto_mode(); - else if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_OFF)) - tsx_ctrl_state = TSX_CTRL_DISABLE; - else - tsx_ctrl_state = TSX_CTRL_ENABLE; - } + if (tsx_ctrl_state == TSX_CTRL_AUTO) + tsx_ctrl_state = x86_get_tsx_auto_mode(); if (tsx_ctrl_state == TSX_CTRL_DISABLE) { tsx_disable(); diff --git a/arch/x86/kernel/cpu/umwait.c b/arch/x86/kernel/cpu/umwait.c index 2293efd6ffa6..e4a31c536642 100644 --- a/arch/x86/kernel/cpu/umwait.c +++ b/arch/x86/kernel/cpu/umwait.c @@ -33,7 +33,7 @@ static DEFINE_MUTEX(umwait_lock); static void umwait_update_control_msr(void * unused) { lockdep_assert_irqs_disabled(); - wrmsr(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached), 0); + wrmsrq(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached)); } /* @@ -71,7 +71,7 @@ static int umwait_cpu_offline(unsigned int cpu) * the original control MSR value in umwait_init(). So there * is no race condition here. */ - wrmsr(MSR_IA32_UMWAIT_CONTROL, orig_umwait_control_cached, 0); + wrmsrq(MSR_IA32_UMWAIT_CONTROL, orig_umwait_control_cached); return 0; } @@ -86,15 +86,19 @@ static int umwait_cpu_offline(unsigned int cpu) * trust the firmware nor does it matter if the same value is written * again. */ -static void umwait_syscore_resume(void) +static void umwait_syscore_resume(void *data) { umwait_update_control_msr(NULL); } -static struct syscore_ops umwait_syscore_ops = { +static const struct syscore_ops umwait_syscore_ops = { .resume = umwait_syscore_resume, }; +static struct syscore umwait_syscore = { + .ops = &umwait_syscore_ops, +}; + /* sysfs interface */ /* @@ -214,7 +218,7 @@ static int __init umwait_init(void) * changed. This is the only place where orig_umwait_control_cached * is modified. */ - rdmsrl(MSR_IA32_UMWAIT_CONTROL, orig_umwait_control_cached); + rdmsrq(MSR_IA32_UMWAIT_CONTROL, orig_umwait_control_cached); ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "umwait:online", umwait_cpu_online, umwait_cpu_offline); @@ -226,7 +230,7 @@ static int __init umwait_init(void) return ret; } - register_syscore_ops(&umwait_syscore_ops); + register_syscore(&umwait_syscore); /* * Add umwait control interface. Ignore failure, so at least the diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 00189cdeb775..cb3f900c46fc 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -26,6 +26,7 @@ #include <linux/export.h> #include <linux/clocksource.h> #include <linux/cpu.h> +#include <linux/efi.h> #include <linux/reboot.h> #include <linux/static_call.h> #include <asm/div64.h> @@ -429,6 +430,9 @@ static void __init vmware_platform_setup(void) pr_warn("Failed to get TSC freq from the hypervisor\n"); } + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP) && !efi_enabled(EFI_BOOT)) + x86_init.mpparse.find_mptable = mpparse_find_mptable; + vmware_paravirt_ops_setup(); #ifdef CONFIG_X86_IO_APIC diff --git a/arch/x86/kernel/cpu/zhaoxin.c b/arch/x86/kernel/cpu/zhaoxin.c index 90eba7eb5335..89b1c8a70fe8 100644 --- a/arch/x86/kernel/cpu/zhaoxin.c +++ b/arch/x86/kernel/cpu/zhaoxin.c @@ -4,6 +4,7 @@ #include <asm/cpu.h> #include <asm/cpufeature.h> +#include <asm/msr.h> #include "cpu.h" |
