diff options
Diffstat (limited to 'arch/x86/kernel/cpu')
32 files changed, 2334 insertions, 1060 deletions
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 93eabf544031..eb4dbcdf41f1 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -17,7 +17,8 @@ KMSAN_SANITIZE_common.o := n # As above, instrumenting secondary CPU boot code causes boot hangs. KCSAN_SANITIZE_common.o := n -obj-y := cacheinfo.o scattered.o topology.o +obj-y := cacheinfo.o scattered.o +obj-y += topology_common.o topology_ext.o topology_amd.o obj-y += common.o obj-y += rdrand.o obj-y += match.o @@ -25,14 +26,16 @@ obj-y += bugs.o obj-y += aperfmperf.o obj-y += cpuid-deps.o obj-y += umwait.o +obj-y += capflags.o powerflags.o -obj-$(CONFIG_PROC_FS) += proc.o -obj-y += capflags.o powerflags.o +obj-$(CONFIG_X86_LOCAL_APIC) += topology.o -obj-$(CONFIG_IA32_FEAT_CTL) += feat_ctl.o +obj-$(CONFIG_PROC_FS) += proc.o + +obj-$(CONFIG_IA32_FEAT_CTL) += feat_ctl.o ifdef CONFIG_CPU_SUP_INTEL -obj-y += intel.o intel_pconfig.o tsx.o -obj-$(CONFIG_PM) += intel_epb.o +obj-y += intel.o intel_pconfig.o tsx.o +obj-$(CONFIG_PM) += intel_epb.o endif obj-$(CONFIG_CPU_SUP_AMD) += amd.o obj-$(CONFIG_CPU_SUP_HYGON) += hygon.o diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c index bfeb18fad63f..2c5b51aad91a 100644 --- a/arch/x86/kernel/cpu/acrn.c +++ b/arch/x86/kernel/cpu/acrn.c @@ -26,8 +26,8 @@ static u32 __init acrn_detect(void) static void __init acrn_init_platform(void) { - /* Setup the IDT for ACRN hypervisor callback */ - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_acrn_hv_callback); + /* Install system interrupt handler for ACRN hypervisor callback */ + sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_acrn_hv_callback); x86_platform.calibrate_tsc = acrn_get_tsc_khz; x86_platform.calibrate_cpu = acrn_get_tsc_khz; diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 9f42d1c59e09..3282a747b645 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -20,6 +20,7 @@ #include <asm/delay.h> #include <asm/debugreg.h> #include <asm/resctrl.h> +#include <asm/sev.h> #ifdef CONFIG_X86_64 # include <asm/mmconfig.h> @@ -27,13 +28,6 @@ #include "cpu.h" -/* - * nodes_per_socket: Stores the number of nodes per socket. - * Refer to Fam15h Models 00-0fh BKDG - CPUID Fn8000_001E_ECX - * Node Identifiers[10:8] - */ -static u32 nodes_per_socket = 1; - static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) { u32 gprs[8] = { 0 }; @@ -300,97 +294,6 @@ static int nearby_node(int apicid) } #endif -/* - * Fix up topo::core_id for pre-F17h systems to be in the - * [0 .. cores_per_node - 1] range. Not really needed but - * kept so as not to break existing setups. - */ -static void legacy_fixup_core_id(struct cpuinfo_x86 *c) -{ - u32 cus_per_node; - - if (c->x86 >= 0x17) - return; - - cus_per_node = c->x86_max_cores / nodes_per_socket; - c->topo.core_id %= cus_per_node; -} - -/* - * Fixup core topology information for - * (1) AMD multi-node processors - * Assumption: Number of cores in each internal node is the same. - * (2) AMD processors supporting compute units - */ -static void amd_get_topology(struct cpuinfo_x86 *c) -{ - /* get information required for multi-node processors */ - if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { - int err; - u32 eax, ebx, ecx, edx; - - cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); - - c->topo.die_id = ecx & 0xff; - - if (c->x86 == 0x15) - c->topo.cu_id = ebx & 0xff; - - if (c->x86 >= 0x17) { - c->topo.core_id = ebx & 0xff; - - if (smp_num_siblings > 1) - c->x86_max_cores /= smp_num_siblings; - } - - /* - * In case leaf B is available, use it to derive - * topology information. - */ - err = detect_extended_topology(c); - if (!err) - c->x86_coreid_bits = get_count_order(c->x86_max_cores); - - cacheinfo_amd_init_llc_id(c); - - } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { - u64 value; - - rdmsrl(MSR_FAM10H_NODE_ID, value); - c->topo.die_id = value & 7; - c->topo.llc_id = c->topo.die_id; - } else - return; - - if (nodes_per_socket > 1) { - set_cpu_cap(c, X86_FEATURE_AMD_DCM); - legacy_fixup_core_id(c); - } -} - -/* - * On a AMD dual core setup the lower bits of the APIC id distinguish the cores. - * Assumes number of cores is a power of two. - */ -static void amd_detect_cmp(struct cpuinfo_x86 *c) -{ - unsigned bits; - - bits = c->x86_coreid_bits; - /* Low order bits define the core id (index of core in socket) */ - c->topo.core_id = c->topo.initial_apicid & ((1 << bits)-1); - /* Convert the initial APIC ID into the socket ID */ - c->topo.pkg_id = c->topo.initial_apicid >> bits; - /* use socket ID also for last level cache */ - c->topo.llc_id = c->topo.die_id = c->topo.pkg_id; -} - -u32 amd_get_nodes_per_socket(void) -{ - return nodes_per_socket; -} -EXPORT_SYMBOL_GPL(amd_get_nodes_per_socket); - static void srat_detect_node(struct cpuinfo_x86 *c) { #ifdef CONFIG_NUMA @@ -442,32 +345,6 @@ static void srat_detect_node(struct cpuinfo_x86 *c) #endif } -static void early_init_amd_mc(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_SMP - unsigned bits, ecx; - - /* Multi core CPU? */ - if (c->extended_cpuid_level < 0x80000008) - return; - - ecx = cpuid_ecx(0x80000008); - - c->x86_max_cores = (ecx & 0xff) + 1; - - /* CPU telling us the core id bits shift? */ - bits = (ecx >> 12) & 0xF; - - /* Otherwise recompute */ - if (bits == 0) { - while ((1 << bits) < c->x86_max_cores) - bits++; - } - - c->x86_coreid_bits = bits; -#endif -} - static void bsp_init_amd(struct cpuinfo_x86 *c) { if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { @@ -500,18 +377,6 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_MWAITX)) use_mwaitx_delay(); - if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { - u32 ecx; - - ecx = cpuid_ecx(0x8000001e); - __max_die_per_package = nodes_per_socket = ((ecx >> 8) & 7) + 1; - } else if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) { - u64 value; - - rdmsrl(MSR_FAM10H_NODE_ID, value); - __max_die_per_package = nodes_per_socket = ((value >> 3) & 7) + 1; - } - if (!boot_cpu_has(X86_FEATURE_AMD_SSBD) && !boot_cpu_has(X86_FEATURE_VIRT_SSBD) && c->x86 >= 0x15 && c->x86 <= 0x17) { @@ -538,7 +403,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) /* Figure out Zen generations: */ switch (c->x86) { - case 0x17: { + case 0x17: switch (c->x86_model) { case 0x00 ... 0x2f: case 0x50 ... 0x5f: @@ -554,8 +419,8 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) goto warn; } break; - } - case 0x19: { + + case 0x19: switch (c->x86_model) { case 0x00 ... 0x0f: case 0x20 ... 0x5f: @@ -569,11 +434,39 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) goto warn; } break; - } + + case 0x1a: + switch (c->x86_model) { + case 0x00 ... 0x0f: + case 0x20 ... 0x2f: + case 0x40 ... 0x4f: + case 0x70 ... 0x7f: + setup_force_cpu_cap(X86_FEATURE_ZEN5); + break; + default: + goto warn; + } + break; + default: break; } + if (cpu_has(c, X86_FEATURE_SEV_SNP)) { + /* + * RMP table entry format is not architectural and it can vary by processor + * and is defined by the per-processor PPR. Restrict SNP support on the + * known CPU model and family for which the RMP table entry format is + * currently defined for. + */ + if (!boot_cpu_has(X86_FEATURE_ZEN3) && + !boot_cpu_has(X86_FEATURE_ZEN4) && + !boot_cpu_has(X86_FEATURE_ZEN5)) + setup_clear_cpu_cap(X86_FEATURE_SEV_SNP); + else if (!snp_probe_rmptable_info()) + setup_clear_cpu_cap(X86_FEATURE_SEV_SNP); + } + return; warn: @@ -592,8 +485,8 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) * SME feature (set in scattered.c). * If the kernel has not enabled SME via any means then * don't advertise the SME feature. - * For SEV: If BIOS has not enabled SEV then don't advertise the - * SEV and SEV_ES feature (set in scattered.c). + * For SEV: If BIOS has not enabled SEV then don't advertise SEV and + * any additional functionality based on it. * * In all cases, since support for SME and SEV requires long mode, * don't advertise the feature under CONFIG_X86_32. @@ -628,6 +521,7 @@ clear_all: clear_sev: setup_clear_cpu_cap(X86_FEATURE_SEV); setup_clear_cpu_cap(X86_FEATURE_SEV_ES); + setup_clear_cpu_cap(X86_FEATURE_SEV_SNP); } } @@ -636,8 +530,6 @@ static void early_init_amd(struct cpuinfo_x86 *c) u64 value; u32 dummy; - early_init_amd_mc(c); - if (c->x86 >= 0xf) set_cpu_cap(c, X86_FEATURE_K8); @@ -717,9 +609,6 @@ static void early_init_amd(struct cpuinfo_x86 *c) } } - if (cpu_has(c, X86_FEATURE_TOPOEXT)) - smp_num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1; - if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && !cpu_has(c, X86_FEATURE_IBPB_BRTYPE)) { if (c->x86 == 0x17 && boot_cpu_has(X86_FEATURE_AMD_IBPB)) setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE); @@ -928,7 +817,7 @@ static void fix_erratum_1386(struct cpuinfo_x86 *c) void init_spectral_chicken(struct cpuinfo_x86 *c) { -#ifdef CONFIG_CPU_UNRET_ENTRY +#ifdef CONFIG_MITIGATION_UNRET_ENTRY u64 value; /* @@ -956,7 +845,6 @@ static void init_amd_zen_common(void) static void init_amd_zen1(struct cpuinfo_x86 *c) { - init_amd_zen_common(); fix_erratum_1386(c); /* Fix up CPUID bits, but only if not virtualised. */ @@ -1010,7 +898,6 @@ static void zen2_zenbleed_check(struct cpuinfo_x86 *c) static void init_amd_zen2(struct cpuinfo_x86 *c) { - init_amd_zen_common(); init_spectral_chicken(c); fix_erratum_1386(c); zen2_zenbleed_check(c); @@ -1018,8 +905,6 @@ static void init_amd_zen2(struct cpuinfo_x86 *c) static void init_amd_zen3(struct cpuinfo_x86 *c) { - init_amd_zen_common(); - if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) { /* * Zen3 (Fam19 model < 0x10) parts are not susceptible to @@ -1033,12 +918,14 @@ static void init_amd_zen3(struct cpuinfo_x86 *c) static void init_amd_zen4(struct cpuinfo_x86 *c) { - init_amd_zen_common(); - if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT); } +static void init_amd_zen5(struct cpuinfo_x86 *c) +{ +} + static void init_amd(struct cpuinfo_x86 *c) { u64 vm_cr; @@ -1058,9 +945,6 @@ static void init_amd(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_FSRM)) set_cpu_cap(c, X86_FEATURE_FSRS); - /* get apicid instead of initial apic id from cpuid */ - c->topo.apicid = read_apic_id(); - /* K6s reports MCEs but don't actually have all the MSRs */ if (c->x86 < 6) clear_cpu_cap(c, X86_FEATURE_MCE); @@ -1076,6 +960,13 @@ static void init_amd(struct cpuinfo_x86 *c) case 0x16: init_amd_jg(c); break; } + /* + * Save up on some future enablement work and do common Zen + * settings. + */ + if (c->x86 >= 0x17) + init_amd_zen_common(); + if (boot_cpu_has(X86_FEATURE_ZEN1)) init_amd_zen1(c); else if (boot_cpu_has(X86_FEATURE_ZEN2)) @@ -1084,6 +975,8 @@ static void init_amd(struct cpuinfo_x86 *c) init_amd_zen3(c); else if (boot_cpu_has(X86_FEATURE_ZEN4)) init_amd_zen4(c); + else if (boot_cpu_has(X86_FEATURE_ZEN5)) + init_amd_zen5(c); /* * Enable workaround for FXSAVE leak on CPUs @@ -1094,8 +987,6 @@ static void init_amd(struct cpuinfo_x86 *c) cpu_detect_cache_sizes(c); - amd_detect_cmp(c); - amd_get_topology(c); srat_detect_node(c); init_amd_cacheinfo(c); diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index bb0ab8466b91..e7ba936d798b 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -56,7 +56,7 @@ EXPORT_SYMBOL_GPL(x86_spec_ctrl_base); /* The current value of the SPEC_CTRL MSR with task-specific bits set */ DEFINE_PER_CPU(u64, x86_spec_ctrl_current); -EXPORT_SYMBOL_GPL(x86_spec_ctrl_current); +EXPORT_PER_CPU_SYMBOL_GPL(x86_spec_ctrl_current); u64 x86_pred_cmd __ro_after_init = PRED_CMD_IBPB; EXPORT_SYMBOL_GPL(x86_pred_cmd); @@ -111,9 +111,6 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); /* Control unconditional IBPB in switch_mm() */ DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb); -/* Control MDS CPU buffer clear before returning to user space */ -DEFINE_STATIC_KEY_FALSE(mds_user_clear); -EXPORT_SYMBOL_GPL(mds_user_clear); /* Control MDS CPU buffer clear before idling (halt, mwait) */ DEFINE_STATIC_KEY_FALSE(mds_idle_clear); EXPORT_SYMBOL_GPL(mds_idle_clear); @@ -252,7 +249,7 @@ static void __init mds_select_mitigation(void) if (!boot_cpu_has(X86_FEATURE_MD_CLEAR)) mds_mitigation = MDS_MITIGATION_VMWERV; - static_branch_enable(&mds_user_clear); + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) && (mds_nosmt || cpu_mitigations_auto_nosmt())) @@ -356,7 +353,7 @@ static void __init taa_select_mitigation(void) * For guests that can't determine whether the correct microcode is * present on host, enable the mitigation for UCODE_NEEDED as well. */ - static_branch_enable(&mds_user_clear); + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); if (taa_nosmt || cpu_mitigations_auto_nosmt()) cpu_smt_disable(false); @@ -424,7 +421,14 @@ static void __init mmio_select_mitigation(void) */ if (boot_cpu_has_bug(X86_BUG_MDS) || (boot_cpu_has_bug(X86_BUG_TAA) && boot_cpu_has(X86_FEATURE_RTM))) - static_branch_enable(&mds_user_clear); + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + + /* + * X86_FEATURE_CLEAR_CPU_BUF could be enabled by other VERW based + * mitigations, disable KVM-only mitigation in that case. + */ + if (boot_cpu_has(X86_FEATURE_CLEAR_CPU_BUF)) + static_branch_disable(&mmio_stale_data_clear); else static_branch_enable(&mmio_stale_data_clear); @@ -477,6 +481,57 @@ static int __init mmio_stale_data_parse_cmdline(char *str) early_param("mmio_stale_data", mmio_stale_data_parse_cmdline); #undef pr_fmt +#define pr_fmt(fmt) "Register File Data Sampling: " fmt + +enum rfds_mitigations { + RFDS_MITIGATION_OFF, + RFDS_MITIGATION_VERW, + RFDS_MITIGATION_UCODE_NEEDED, +}; + +/* Default mitigation for Register File Data Sampling */ +static enum rfds_mitigations rfds_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_RFDS) ? RFDS_MITIGATION_VERW : RFDS_MITIGATION_OFF; + +static const char * const rfds_strings[] = { + [RFDS_MITIGATION_OFF] = "Vulnerable", + [RFDS_MITIGATION_VERW] = "Mitigation: Clear Register File", + [RFDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode", +}; + +static void __init rfds_select_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_RFDS) || cpu_mitigations_off()) { + rfds_mitigation = RFDS_MITIGATION_OFF; + return; + } + if (rfds_mitigation == RFDS_MITIGATION_OFF) + return; + + if (x86_read_arch_cap_msr() & ARCH_CAP_RFDS_CLEAR) + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + else + rfds_mitigation = RFDS_MITIGATION_UCODE_NEEDED; +} + +static __init int rfds_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!boot_cpu_has_bug(X86_BUG_RFDS)) + return 0; + + if (!strcmp(str, "off")) + rfds_mitigation = RFDS_MITIGATION_OFF; + else if (!strcmp(str, "on")) + rfds_mitigation = RFDS_MITIGATION_VERW; + + return 0; +} +early_param("reg_file_data_sampling", rfds_parse_cmdline); + +#undef pr_fmt #define pr_fmt(fmt) "" fmt static void __init md_clear_update_mitigation(void) @@ -484,12 +539,12 @@ static void __init md_clear_update_mitigation(void) if (cpu_mitigations_off()) return; - if (!static_key_enabled(&mds_user_clear)) + if (!boot_cpu_has(X86_FEATURE_CLEAR_CPU_BUF)) goto out; /* - * mds_user_clear is now enabled. Update MDS, TAA and MMIO Stale Data - * mitigation, if necessary. + * X86_FEATURE_CLEAR_CPU_BUF is now enabled. Update MDS, TAA and MMIO + * Stale Data mitigation, if necessary. */ if (mds_mitigation == MDS_MITIGATION_OFF && boot_cpu_has_bug(X86_BUG_MDS)) { @@ -501,11 +556,19 @@ static void __init md_clear_update_mitigation(void) taa_mitigation = TAA_MITIGATION_VERW; taa_select_mitigation(); } - if (mmio_mitigation == MMIO_MITIGATION_OFF && - boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) { + /* + * MMIO_MITIGATION_OFF is not checked here so that mmio_stale_data_clear + * gets updated correctly as per X86_FEATURE_CLEAR_CPU_BUF state. + */ + if (boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) { mmio_mitigation = MMIO_MITIGATION_VERW; mmio_select_mitigation(); } + if (rfds_mitigation == RFDS_MITIGATION_OFF && + boot_cpu_has_bug(X86_BUG_RFDS)) { + rfds_mitigation = RFDS_MITIGATION_VERW; + rfds_select_mitigation(); + } out: if (boot_cpu_has_bug(X86_BUG_MDS)) pr_info("MDS: %s\n", mds_strings[mds_mitigation]); @@ -515,6 +578,8 @@ out: pr_info("MMIO Stale Data: %s\n", mmio_strings[mmio_mitigation]); else if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) pr_info("MMIO Stale Data: Unknown: No mitigations\n"); + if (boot_cpu_has_bug(X86_BUG_RFDS)) + pr_info("Register File Data Sampling: %s\n", rfds_strings[rfds_mitigation]); } static void __init md_clear_select_mitigation(void) @@ -522,11 +587,12 @@ static void __init md_clear_select_mitigation(void) mds_select_mitigation(); taa_select_mitigation(); mmio_select_mitigation(); + rfds_select_mitigation(); /* - * As MDS, TAA and MMIO Stale Data mitigations are inter-related, update - * and print their mitigation after MDS, TAA and MMIO Stale Data - * mitigation selection is done. + * As these mitigations are inter-related and rely on VERW instruction + * to clear the microarchitural buffers, update and print their status + * after mitigation selection is done for each of these vulnerabilities. */ md_clear_update_mitigation(); } @@ -671,7 +737,7 @@ enum gds_mitigations { GDS_MITIGATION_HYPERVISOR, }; -#if IS_ENABLED(CONFIG_GDS_FORCE_MITIGATION) +#if IS_ENABLED(CONFIG_MITIGATION_GDS_FORCE) static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FORCE; #else static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FULL; @@ -982,10 +1048,10 @@ static void __init retbleed_select_mitigation(void) return; case RETBLEED_CMD_UNRET: - if (IS_ENABLED(CONFIG_CPU_UNRET_ENTRY)) { + if (IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY)) { retbleed_mitigation = RETBLEED_MITIGATION_UNRET; } else { - pr_err("WARNING: kernel not compiled with CPU_UNRET_ENTRY.\n"); + pr_err("WARNING: kernel not compiled with MITIGATION_UNRET_ENTRY.\n"); goto do_cmd_auto; } break; @@ -994,24 +1060,24 @@ static void __init retbleed_select_mitigation(void) if (!boot_cpu_has(X86_FEATURE_IBPB)) { pr_err("WARNING: CPU does not support IBPB.\n"); goto do_cmd_auto; - } else if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY)) { + } else if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { retbleed_mitigation = RETBLEED_MITIGATION_IBPB; } else { - pr_err("WARNING: kernel not compiled with CPU_IBPB_ENTRY.\n"); + pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n"); goto do_cmd_auto; } break; case RETBLEED_CMD_STUFF: - if (IS_ENABLED(CONFIG_CALL_DEPTH_TRACKING) && + if (IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING) && spectre_v2_enabled == SPECTRE_V2_RETPOLINE) { retbleed_mitigation = RETBLEED_MITIGATION_STUFF; } else { - if (IS_ENABLED(CONFIG_CALL_DEPTH_TRACKING)) + if (IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING)) pr_err("WARNING: retbleed=stuff depends on spectre_v2=retpoline\n"); else - pr_err("WARNING: kernel not compiled with CALL_DEPTH_TRACKING.\n"); + pr_err("WARNING: kernel not compiled with MITIGATION_CALL_DEPTH_TRACKING.\n"); goto do_cmd_auto; } @@ -1021,9 +1087,10 @@ do_cmd_auto: case RETBLEED_CMD_AUTO: if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { - if (IS_ENABLED(CONFIG_CPU_UNRET_ENTRY)) + if (IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY)) retbleed_mitigation = RETBLEED_MITIGATION_UNRET; - else if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY) && boot_cpu_has(X86_FEATURE_IBPB)) + else if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY) && + boot_cpu_has(X86_FEATURE_IBPB)) retbleed_mitigation = RETBLEED_MITIGATION_IBPB; } @@ -1102,7 +1169,7 @@ static enum spectre_v2_user_mitigation spectre_v2_user_stibp __ro_after_init = static enum spectre_v2_user_mitigation spectre_v2_user_ibpb __ro_after_init = SPECTRE_V2_USER_NONE; -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE static bool spectre_v2_bad_module; bool retpoline_module_ok(bool has_retpoline) @@ -1415,7 +1482,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC || cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && - !IS_ENABLED(CONFIG_RETPOLINE)) { + !IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)) { pr_err("%s selected but not compiled in. Switching to AUTO select\n", mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; @@ -1438,7 +1505,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) return SPECTRE_V2_CMD_AUTO; } - if (cmd == SPECTRE_V2_CMD_IBRS && !IS_ENABLED(CONFIG_CPU_IBRS_ENTRY)) { + if (cmd == SPECTRE_V2_CMD_IBRS && !IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY)) { pr_err("%s selected but not compiled in. Switching to AUTO select\n", mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; @@ -1469,7 +1536,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void) { - if (!IS_ENABLED(CONFIG_RETPOLINE)) { + if (!IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)) { pr_err("Kernel not compiled with retpoline; no mitigation available!"); return SPECTRE_V2_NONE; } @@ -1564,7 +1631,7 @@ static void __init spectre_v2_select_mitigation(void) break; } - if (IS_ENABLED(CONFIG_CPU_IBRS_ENTRY) && + if (IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY) && boot_cpu_has_bug(X86_BUG_RETBLEED) && retbleed_cmd != RETBLEED_CMD_OFF && retbleed_cmd != RETBLEED_CMD_STUFF && @@ -2457,7 +2524,7 @@ static void __init srso_select_mitigation(void) break; case SRSO_CMD_SAFE_RET: - if (IS_ENABLED(CONFIG_CPU_SRSO)) { + if (IS_ENABLED(CONFIG_MITIGATION_SRSO)) { /* * Enable the return thunk for generated code * like ftrace, static_call, etc. @@ -2477,29 +2544,29 @@ static void __init srso_select_mitigation(void) else srso_mitigation = SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED; } else { - pr_err("WARNING: kernel not compiled with CPU_SRSO.\n"); + pr_err("WARNING: kernel not compiled with MITIGATION_SRSO.\n"); } break; case SRSO_CMD_IBPB: - if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY)) { + if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { if (has_microcode) { setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); srso_mitigation = SRSO_MITIGATION_IBPB; } } else { - pr_err("WARNING: kernel not compiled with CPU_IBPB_ENTRY.\n"); + pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n"); } break; case SRSO_CMD_IBPB_ON_VMEXIT: - if (IS_ENABLED(CONFIG_CPU_SRSO)) { + if (IS_ENABLED(CONFIG_MITIGATION_SRSO)) { if (!boot_cpu_has(X86_FEATURE_ENTRY_IBPB) && has_microcode) { setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT; } } else { - pr_err("WARNING: kernel not compiled with CPU_SRSO.\n"); + pr_err("WARNING: kernel not compiled with MITIGATION_SRSO.\n"); } break; } @@ -2615,6 +2682,11 @@ static ssize_t mmio_stale_data_show_state(char *buf) sched_smt_active() ? "vulnerable" : "disabled"); } +static ssize_t rfds_show_state(char *buf) +{ + return sysfs_emit(buf, "%s\n", rfds_strings[rfds_mitigation]); +} + static char *stibp_state(void) { if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && @@ -2774,6 +2846,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_GDS: return gds_show_state(buf); + case X86_BUG_RFDS: + return rfds_show_state(buf); + default: break; } @@ -2848,4 +2923,14 @@ ssize_t cpu_show_gds(struct device *dev, struct device_attribute *attr, char *bu { return cpu_show_common(dev, attr, buf, X86_BUG_GDS); } + +ssize_t cpu_show_reg_file_data_sampling(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_RFDS); +} #endif + +void __warn_thunk(void) +{ + WARN_ONCE(1, "Unpatched return thunk in use. This should not happen!\n"); +} diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index c131c412db89..392d09c936d6 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -301,7 +301,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, eax->split.type = types[leaf]; eax->split.level = levels[leaf]; eax->split.num_threads_sharing = 0; - eax->split.num_cores_on_die = __this_cpu_read(cpu_info.x86_max_cores) - 1; + eax->split.num_cores_on_die = topology_num_cores_per_package(); if (assoc == 0xffff) @@ -595,7 +595,7 @@ static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index) if (index < 3) return; - node = topology_die_id(smp_processor_id()); + node = topology_amd_node_id(smp_processor_id()); this_leaf->nb = node_to_amd_nb(node); if (this_leaf->nb && !this_leaf->nb->l3_cache.indices) amd_calc_l3_indices(this_leaf->nb); @@ -661,7 +661,7 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c) return i; } -void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c) +void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, u16 die_id) { /* * We may have multiple LLCs if L3 caches exist, so check if we @@ -672,7 +672,7 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c) if (c->x86 < 0x17) { /* LLC is at the node level. */ - c->topo.llc_id = c->topo.die_id; + c->topo.llc_id = die_id; } else if (c->x86 == 0x17 && c->x86_model <= 0x1F) { /* * LLC is at the core complex level. @@ -1118,15 +1118,16 @@ static void cache_cpu_init(void) unsigned long flags; local_irq_save(flags); - cache_disable(); - if (memory_caching_control & CACHE_MTRR) + if (memory_caching_control & CACHE_MTRR) { + cache_disable(); mtrr_generic_set_state(); + cache_enable(); + } if (memory_caching_control & CACHE_PAT) pat_cpu_init(); - cache_enable(); local_irq_restore(flags); } diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index 345f7d905db6..a3b55db35c96 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -128,10 +128,6 @@ static void init_centaur(struct cpuinfo_x86 *c) #endif early_init_centaur(c); init_intel_cacheinfo(c); - detect_num_cpu_cores(c); -#ifdef CONFIG_X86_32 - detect_ht(c); -#endif if (c->cpuid_level > 9) { unsigned int eax = cpuid_eax(10); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 0b97bcde70c6..ba8cf5e9ce56 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -61,6 +61,7 @@ #include <asm/microcode.h> #include <asm/intel-family.h> #include <asm/cpu_device_id.h> +#include <asm/fred.h> #include <asm/uv/uv.h> #include <asm/ia32.h> #include <asm/set_memory.h> @@ -70,11 +71,26 @@ #include "cpu.h" +DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); +EXPORT_PER_CPU_SYMBOL(cpu_info); + u32 elf_hwcap2 __read_mostly; /* Number of siblings per CPU package */ -int smp_num_siblings = 1; -EXPORT_SYMBOL(smp_num_siblings); +unsigned int __max_threads_per_core __ro_after_init = 1; +EXPORT_SYMBOL(__max_threads_per_core); + +unsigned int __max_dies_per_package __ro_after_init = 1; +EXPORT_SYMBOL(__max_dies_per_package); + +unsigned int __max_logical_packages __ro_after_init = 1; +EXPORT_SYMBOL(__max_logical_packages); + +unsigned int __num_cores_per_package __ro_after_init = 1; +EXPORT_SYMBOL(__num_cores_per_package); + +unsigned int __num_threads_per_package __ro_after_init = 1; +EXPORT_SYMBOL(__num_threads_per_package); static struct ppin_info { int feature; @@ -382,9 +398,8 @@ out: } /* These bits should not change their value after CPU init is finished. */ -static const unsigned long cr4_pinned_mask = - X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP | - X86_CR4_FSGSBASE | X86_CR4_CET; +static const unsigned long cr4_pinned_mask = X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP | + X86_CR4_FSGSBASE | X86_CR4_CET | X86_CR4_FRED; static DEFINE_STATIC_KEY_FALSE_RO(cr_pinning); static unsigned long cr4_pinned_bits __ro_after_init; @@ -790,19 +805,6 @@ static void get_model_name(struct cpuinfo_x86 *c) *(s + 1) = '\0'; } -void detect_num_cpu_cores(struct cpuinfo_x86 *c) -{ - unsigned int eax, ebx, ecx, edx; - - c->x86_max_cores = 1; - if (!IS_ENABLED(CONFIG_SMP) || c->cpuid_level < 4) - return; - - cpuid_count(4, 0, &eax, &ebx, &ecx, &edx); - if (eax & 0x1f) - c->x86_max_cores = (eax >> 26) + 1; -} - void cpu_detect_cache_sizes(struct cpuinfo_x86 *c) { unsigned int n, dummy, ebx, ecx, edx, l2size; @@ -864,51 +866,6 @@ static void cpu_detect_tlb(struct cpuinfo_x86 *c) tlb_lld_4m[ENTRIES], tlb_lld_1g[ENTRIES]); } -int detect_ht_early(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_SMP - u32 eax, ebx, ecx, edx; - - if (!cpu_has(c, X86_FEATURE_HT)) - return -1; - - if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) - return -1; - - if (cpu_has(c, X86_FEATURE_XTOPOLOGY)) - return -1; - - cpuid(1, &eax, &ebx, &ecx, &edx); - - smp_num_siblings = (ebx & 0xff0000) >> 16; - if (smp_num_siblings == 1) - pr_info_once("CPU0: Hyper-Threading is disabled\n"); -#endif - return 0; -} - -void detect_ht(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_SMP - int index_msb, core_bits; - - if (detect_ht_early(c) < 0) - return; - - index_msb = get_count_order(smp_num_siblings); - c->topo.pkg_id = apic->phys_pkg_id(c->topo.initial_apicid, index_msb); - - smp_num_siblings = smp_num_siblings / c->x86_max_cores; - - index_msb = get_count_order(smp_num_siblings); - - core_bits = get_count_order(c->x86_max_cores); - - c->topo.core_id = apic->phys_pkg_id(c->topo.initial_apicid, index_msb) & - ((1 << core_bits) - 1); -#endif -} - static void get_cpu_vendor(struct cpuinfo_x86 *c) { char *v = c->x86_vendor_id; @@ -1267,6 +1224,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { #define SRSO BIT(5) /* CPU is affected by GDS */ #define GDS BIT(6) +/* CPU is affected by Register File Data Sampling */ +#define RFDS BIT(7) static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS), @@ -1294,9 +1253,18 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_INTEL_STEPPINGS(TIGERLAKE, X86_STEPPING_ANY, GDS), VULNBL_INTEL_STEPPINGS(LAKEFIELD, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS), - VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS), - VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO), - VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS), + VULNBL_INTEL_STEPPINGS(ALDERLAKE, X86_STEPPING_ANY, RFDS), + VULNBL_INTEL_STEPPINGS(ALDERLAKE_L, X86_STEPPING_ANY, RFDS), + VULNBL_INTEL_STEPPINGS(RAPTORLAKE, X86_STEPPING_ANY, RFDS), + VULNBL_INTEL_STEPPINGS(RAPTORLAKE_P, X86_STEPPING_ANY, RFDS), + VULNBL_INTEL_STEPPINGS(RAPTORLAKE_S, X86_STEPPING_ANY, RFDS), + VULNBL_INTEL_STEPPINGS(ATOM_GRACEMONT, X86_STEPPING_ANY, RFDS), + VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RFDS), + VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO | RFDS), + VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RFDS), + VULNBL_INTEL_STEPPINGS(ATOM_GOLDMONT, X86_STEPPING_ANY, RFDS), + VULNBL_INTEL_STEPPINGS(ATOM_GOLDMONT_D, X86_STEPPING_ANY, RFDS), + VULNBL_INTEL_STEPPINGS(ATOM_GOLDMONT_PLUS, X86_STEPPING_ANY, RFDS), VULNBL_AMD(0x15, RETBLEED), VULNBL_AMD(0x16, RETBLEED), @@ -1330,6 +1298,24 @@ static bool arch_cap_mmio_immune(u64 ia32_cap) ia32_cap & ARCH_CAP_SBDR_SSDP_NO); } +static bool __init vulnerable_to_rfds(u64 ia32_cap) +{ + /* The "immunity" bit trumps everything else: */ + if (ia32_cap & ARCH_CAP_RFDS_NO) + return false; + + /* + * VMMs set ARCH_CAP_RFDS_CLEAR for processors not in the blacklist to + * indicate that mitigation is needed because guest is running on a + * vulnerable hardware or may migrate to such hardware: + */ + if (ia32_cap & ARCH_CAP_RFDS_CLEAR) + return true; + + /* Only consult the blacklist when there is no enumeration: */ + return cpu_matches(cpu_vuln_blacklist, RFDS); +} + static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) { u64 ia32_cap = x86_read_arch_cap_msr(); @@ -1355,8 +1341,13 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) /* * AMD's AutoIBRS is equivalent to Intel's eIBRS - use the Intel feature * flag and protect from vendor-specific bugs via the whitelist. + * + * Don't use AutoIBRS when SNP is enabled because it degrades host + * userspace indirect branch performance. */ - if ((ia32_cap & ARCH_CAP_IBRS_ALL) || cpu_has(c, X86_FEATURE_AUTOIBRS)) { + if ((ia32_cap & ARCH_CAP_IBRS_ALL) || + (cpu_has(c, X86_FEATURE_AUTOIBRS) && + !cpu_feature_enabled(X86_FEATURE_SEV_SNP))) { setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED); if (!cpu_matches(cpu_vuln_whitelist, NO_EIBRS_PBRSB) && !(ia32_cap & ARCH_CAP_PBRSB_NO)) @@ -1441,6 +1432,9 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) boot_cpu_has(X86_FEATURE_AVX)) setup_force_cpu_bug(X86_BUG_GDS); + if (vulnerable_to_rfds(ia32_cap)) + setup_force_cpu_bug(X86_BUG_RFDS); + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; @@ -1589,8 +1583,11 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) get_cpu_vendor(c); get_cpu_cap(c); setup_force_cpu_cap(X86_FEATURE_CPUID); + get_cpu_address_sizes(c); cpu_parse_early_param(); + cpu_init_topology(c); + if (this_cpu->c_early_init) this_cpu->c_early_init(c); @@ -1601,10 +1598,10 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) this_cpu->c_bsp_init(c); } else { setup_clear_cpu_cap(X86_FEATURE_CPUID); + get_cpu_address_sizes(c); + cpu_init_topology(c); } - get_cpu_address_sizes(c); - setup_force_cpu_cap(X86_FEATURE_ALWAYS); cpu_set_bug_bits(c); @@ -1748,18 +1745,6 @@ static void generic_identify(struct cpuinfo_x86 *c) get_cpu_address_sizes(c); - if (c->cpuid_level >= 0x00000001) { - c->topo.initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF; -#ifdef CONFIG_X86_32 -# ifdef CONFIG_SMP - c->topo.apicid = apic->phys_pkg_id(c->topo.initial_apicid, 0); -# else - c->topo.apicid = c->topo.initial_apicid; -# endif -#endif - c->topo.pkg_id = c->topo.initial_apicid; - } - get_model_name(c); /* Default name */ /* @@ -1781,29 +1766,6 @@ static void generic_identify(struct cpuinfo_x86 *c) } /* - * Validate that ACPI/mptables have the same information about the - * effective APIC id and update the package map. - */ -static void validate_apic_and_package_id(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_SMP - unsigned int cpu = smp_processor_id(); - u32 apicid; - - apicid = apic->cpu_present_to_apicid(cpu); - - if (apicid != c->topo.apicid) { - pr_err(FW_BUG "CPU%u: APIC id mismatch. Firmware: %x APIC: %x\n", - cpu, apicid, c->topo.initial_apicid); - } - BUG_ON(topology_update_package_map(c->topo.pkg_id, cpu)); - BUG_ON(topology_update_die_map(c->topo.die_id, cpu)); -#else - c->topo.logical_pkg_id = 0; -#endif -} - -/* * This does the hard work of actually picking apart the CPU stuff... */ static void identify_cpu(struct cpuinfo_x86 *c) @@ -1816,11 +1778,6 @@ static void identify_cpu(struct cpuinfo_x86 *c) c->x86_model = c->x86_stepping = 0; /* So far unknown... */ c->x86_vendor_id[0] = '\0'; /* Unset */ c->x86_model_id[0] = '\0'; /* Unset */ - c->x86_max_cores = 1; - c->x86_coreid_bits = 0; - c->topo.cu_id = 0xff; - c->topo.llc_id = BAD_APICID; - c->topo.l2c_id = BAD_APICID; #ifdef CONFIG_X86_64 c->x86_clflush_size = 64; c->x86_phys_bits = 36; @@ -1839,17 +1796,14 @@ static void identify_cpu(struct cpuinfo_x86 *c) generic_identify(c); + cpu_parse_topology(c); + if (this_cpu->c_identify) this_cpu->c_identify(c); /* Clear/Set all flags overridden by options, after probe */ apply_forced_caps(c); -#ifdef CONFIG_X86_64 - c->topo.apicid = apic->phys_pkg_id(c->topo.initial_apicid, 0); -#endif - - /* * Set default APIC and TSC_DEADLINE MSR fencing flag. AMD and * Hygon will clear it in ->c_init() below. @@ -1903,10 +1857,6 @@ static void identify_cpu(struct cpuinfo_x86 *c) c->x86, c->x86_model); } -#ifdef CONFIG_X86_64 - detect_ht(c); -#endif - x86_init_rdrand(c); setup_pku(c); setup_cet(c); @@ -1938,8 +1888,6 @@ static void identify_cpu(struct cpuinfo_x86 *c) /* Init Machine Check Exception if available. */ mcheck_cpu_init(c); - select_idle_routine(c); - #ifdef CONFIG_NUMA numa_add_cpu(smp_processor_id()); #endif @@ -1998,7 +1946,6 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_32 enable_sep_cpu(); #endif - validate_apic_and_package_id(c); x86_spec_ctrl_setup_ap(); update_srbds_msr(); if (boot_cpu_has_bug(X86_BUG_GDS)) @@ -2050,6 +1997,7 @@ DEFINE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot) = { .top_of_stack = TOP_OF_INIT_STACK, }; EXPORT_PER_CPU_SYMBOL(pcpu_hot); +EXPORT_PER_CPU_SYMBOL(const_pcpu_hot); #ifdef CONFIG_X86_64 DEFINE_PER_CPU_FIRST(struct fixed_percpu_data, @@ -2067,10 +2015,8 @@ static void wrmsrl_cstar(unsigned long val) wrmsrl(MSR_CSTAR, val); } -/* May not be marked __init: used by software suspend */ -void syscall_init(void) +static inline void idt_syscall_init(void) { - wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); if (ia32_enabled()) { @@ -2104,6 +2050,23 @@ void syscall_init(void) X86_EFLAGS_AC|X86_EFLAGS_ID); } +/* May not be marked __init: used by software suspend */ +void syscall_init(void) +{ + /* The default user and kernel segments */ + wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); + + /* + * Except the IA32_STAR MSR, there is NO need to setup SYSCALL and + * SYSENTER MSRs for FRED, because FRED uses the ring 3 FRED + * entrypoint for SYSCALL and SYSENTER, and ERETU is the only legit + * instruction to return to ring 3 (both sysexit and sysret cause + * #UD when FRED is enabled). + */ + if (!cpu_feature_enabled(X86_FEATURE_FRED)) + idt_syscall_init(); +} + #else /* CONFIG_X86_64 */ #ifdef CONFIG_STACKPROTECTOR @@ -2207,8 +2170,9 @@ void cpu_init_exception_handling(void) /* paranoid_entry() gets the CPU number from the GDT */ setup_getcpu(cpu); - /* IST vectors need TSS to be set up. */ - tss_setup_ist(tss); + /* For IDT mode, IST vectors need to be set in TSS. */ + if (!cpu_feature_enabled(X86_FEATURE_FRED)) + tss_setup_ist(tss); tss_setup_io_bitmap(tss); set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); @@ -2217,8 +2181,10 @@ void cpu_init_exception_handling(void) /* GHCB needs to be setup to handle #VC. */ setup_ghcb(); - /* Finally load the IDT */ - load_current_idt(); + if (cpu_feature_enabled(X86_FEATURE_FRED)) + cpu_init_fred_exceptions(); + else + load_current_idt(); } /* @@ -2343,11 +2309,13 @@ void __init arch_cpu_finalize_init(void) { identify_boot_cpu(); + select_idle_routine(); + /* * identify_boot_cpu() initialized SMT support information, let the * core code know. */ - cpu_smt_set_num_threads(smp_num_siblings, smp_num_siblings); + cpu_smt_set_num_threads(__max_threads_per_core, __max_threads_per_core); if (!IS_ENABLED(CONFIG_SMP)) { pr_info("CPU: "); diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 885281ae79a5..ea9e07d57c8d 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -2,6 +2,11 @@ #ifndef ARCH_X86_CPU_H #define ARCH_X86_CPU_H +#include <asm/cpu.h> +#include <asm/topology.h> + +#include "topology.h" + /* attempt to consolidate cpu attributes */ struct cpu_dev { const char *c_vendor; @@ -71,14 +76,9 @@ extern void init_intel_cacheinfo(struct cpuinfo_x86 *c); extern void init_amd_cacheinfo(struct cpuinfo_x86 *c); extern void init_hygon_cacheinfo(struct cpuinfo_x86 *c); -extern void detect_num_cpu_cores(struct cpuinfo_x86 *c); -extern int detect_extended_topology_early(struct cpuinfo_x86 *c); -extern int detect_extended_topology(struct cpuinfo_x86 *c); -extern int detect_ht_early(struct cpuinfo_x86 *c); -extern void detect_ht(struct cpuinfo_x86 *c); extern void check_null_seg_clears_base(struct cpuinfo_x86 *c); -void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c); +void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, u16 die_id); void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c); unsigned int aperfmperf_get_khz(int cpu); @@ -96,4 +96,5 @@ static inline bool spectre_v2_in_eibrs_mode(enum spectre_v2_mitigation mode) mode == SPECTRE_V2_EIBRS_RETPOLINE || mode == SPECTRE_V2_EIBRS_LFENCE; } + #endif /* ARCH_X86_CPU_H */ diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index e462c1d3800a..b7174209d855 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -82,6 +82,8 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_XFD, X86_FEATURE_XGETBV1 }, { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD }, { X86_FEATURE_SHSTK, X86_FEATURE_XSAVES }, + { X86_FEATURE_FRED, X86_FEATURE_LKGS }, + { X86_FEATURE_FRED, X86_FEATURE_WRMSRNS }, {} }; diff --git a/arch/x86/kernel/cpu/debugfs.c b/arch/x86/kernel/cpu/debugfs.c index 0c179d684b3b..3baf3e435834 100644 --- a/arch/x86/kernel/cpu/debugfs.c +++ b/arch/x86/kernel/cpu/debugfs.c @@ -5,6 +5,8 @@ #include <asm/apic.h> #include <asm/processor.h> +#include "cpu.h" + static int cpu_debug_show(struct seq_file *m, void *p) { unsigned long cpu = (unsigned long)m->private; @@ -24,9 +26,12 @@ static int cpu_debug_show(struct seq_file *m, void *p) seq_printf(m, "logical_die_id: %u\n", c->topo.logical_die_id); seq_printf(m, "llc_id: %u\n", c->topo.llc_id); seq_printf(m, "l2c_id: %u\n", c->topo.l2c_id); - seq_printf(m, "max_cores: %u\n", c->x86_max_cores); - seq_printf(m, "max_die_per_pkg: %u\n", __max_die_per_package); - seq_printf(m, "smp_num_siblings: %u\n", smp_num_siblings); + seq_printf(m, "amd_node_id: %u\n", c->topo.amd_node_id); + seq_printf(m, "amd_nodes_per_pkg: %u\n", topology_amd_nodes_per_pkg()); + seq_printf(m, "num_threads: %u\n", __num_threads_per_package); + seq_printf(m, "num_cores: %u\n", __num_cores_per_package); + seq_printf(m, "max_dies_per_pkg: %u\n", __max_dies_per_package); + seq_printf(m, "max_threads_per_core:%u\n", __max_threads_per_core); return 0; } @@ -42,12 +47,48 @@ static const struct file_operations dfs_cpu_ops = { .release = single_release, }; +static int dom_debug_show(struct seq_file *m, void *p) +{ + static const char *domain_names[TOPO_MAX_DOMAIN] = { + [TOPO_SMT_DOMAIN] = "Thread", + [TOPO_CORE_DOMAIN] = "Core", + [TOPO_MODULE_DOMAIN] = "Module", + [TOPO_TILE_DOMAIN] = "Tile", + [TOPO_DIE_DOMAIN] = "Die", + [TOPO_DIEGRP_DOMAIN] = "DieGrp", + [TOPO_PKG_DOMAIN] = "Package", + }; + unsigned int dom, nthreads = 1; + + for (dom = 0; dom < TOPO_MAX_DOMAIN; dom++) { + nthreads *= x86_topo_system.dom_size[dom]; + seq_printf(m, "domain: %-10s shift: %u dom_size: %5u max_threads: %5u\n", + domain_names[dom], x86_topo_system.dom_shifts[dom], + x86_topo_system.dom_size[dom], nthreads); + } + return 0; +} + +static int dom_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, dom_debug_show, inode->i_private); +} + +static const struct file_operations dfs_dom_ops = { + .open = dom_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static __init int cpu_init_debugfs(void) { struct dentry *dir, *base = debugfs_create_dir("topo", arch_debugfs_dir); unsigned long id; char name[24]; + debugfs_create_file("domains", 0444, base, NULL, &dfs_dom_ops); + dir = debugfs_create_dir("cpus", base); for_each_possible_cpu(id) { sprintf(name, "%lu", id); diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index f0cd95502faa..c5191b06f9f2 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -18,14 +18,6 @@ #include "cpu.h" -#define APICID_SOCKET_ID_BIT 6 - -/* - * nodes_per_socket: Stores the number of nodes per socket. - * Refer to CPUID Fn8000_001E_ECX Node Identifiers[10:8] - */ -static u32 nodes_per_socket = 1; - #ifdef CONFIG_NUMA /* * To workaround broken NUMA config. Read the comment in @@ -49,80 +41,6 @@ static int nearby_node(int apicid) } #endif -static void hygon_get_topology_early(struct cpuinfo_x86 *c) -{ - if (cpu_has(c, X86_FEATURE_TOPOEXT)) - smp_num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1; -} - -/* - * Fixup core topology information for - * (1) Hygon multi-node processors - * Assumption: Number of cores in each internal node is the same. - * (2) Hygon processors supporting compute units - */ -static void hygon_get_topology(struct cpuinfo_x86 *c) -{ - /* get information required for multi-node processors */ - if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { - int err; - u32 eax, ebx, ecx, edx; - - cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); - - c->topo.die_id = ecx & 0xff; - - c->topo.core_id = ebx & 0xff; - - if (smp_num_siblings > 1) - c->x86_max_cores /= smp_num_siblings; - - /* - * In case leaf B is available, use it to derive - * topology information. - */ - err = detect_extended_topology(c); - if (!err) - c->x86_coreid_bits = get_count_order(c->x86_max_cores); - - /* - * Socket ID is ApicId[6] for the processors with model <= 0x3 - * when running on host. - */ - if (!boot_cpu_has(X86_FEATURE_HYPERVISOR) && c->x86_model <= 0x3) - c->topo.pkg_id = c->topo.apicid >> APICID_SOCKET_ID_BIT; - - cacheinfo_hygon_init_llc_id(c); - } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { - u64 value; - - rdmsrl(MSR_FAM10H_NODE_ID, value); - c->topo.die_id = value & 7; - c->topo.llc_id = c->topo.die_id; - } else - return; - - if (nodes_per_socket > 1) - set_cpu_cap(c, X86_FEATURE_AMD_DCM); -} - -/* - * On Hygon setup the lower bits of the APIC id distinguish the cores. - * Assumes number of cores is a power of two. - */ -static void hygon_detect_cmp(struct cpuinfo_x86 *c) -{ - unsigned int bits; - - bits = c->x86_coreid_bits; - /* Low order bits define the core id (index of core in socket) */ - c->topo.core_id = c->topo.initial_apicid & ((1 << bits)-1); - /* Convert the initial APIC ID into the socket ID */ - c->topo.pkg_id = c->topo.initial_apicid >> bits; - /* Use package ID also for last level cache */ - c->topo.llc_id = c->topo.die_id = c->topo.pkg_id; -} - static void srat_detect_node(struct cpuinfo_x86 *c) { #ifdef CONFIG_NUMA @@ -173,32 +91,6 @@ static void srat_detect_node(struct cpuinfo_x86 *c) #endif } -static void early_init_hygon_mc(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_SMP - unsigned int bits, ecx; - - /* Multi core CPU? */ - if (c->extended_cpuid_level < 0x80000008) - return; - - ecx = cpuid_ecx(0x80000008); - - c->x86_max_cores = (ecx & 0xff) + 1; - - /* CPU telling us the core id bits shift? */ - bits = (ecx >> 12) & 0xF; - - /* Otherwise recompute */ - if (bits == 0) { - while ((1 << bits) < c->x86_max_cores) - bits++; - } - - c->x86_coreid_bits = bits; -#endif -} - static void bsp_init_hygon(struct cpuinfo_x86 *c) { if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { @@ -212,18 +104,6 @@ static void bsp_init_hygon(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_MWAITX)) use_mwaitx_delay(); - if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { - u32 ecx; - - ecx = cpuid_ecx(0x8000001e); - __max_die_per_package = nodes_per_socket = ((ecx >> 8) & 7) + 1; - } else if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) { - u64 value; - - rdmsrl(MSR_FAM10H_NODE_ID, value); - __max_die_per_package = nodes_per_socket = ((value >> 3) & 7) + 1; - } - if (!boot_cpu_has(X86_FEATURE_AMD_SSBD) && !boot_cpu_has(X86_FEATURE_VIRT_SSBD)) { /* @@ -242,8 +122,6 @@ static void early_init_hygon(struct cpuinfo_x86 *c) { u32 dummy; - early_init_hygon_mc(c); - set_cpu_cap(c, X86_FEATURE_K8); rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); @@ -284,8 +162,6 @@ static void early_init_hygon(struct cpuinfo_x86 *c) * we can set it unconditionally. */ set_cpu_cap(c, X86_FEATURE_VMMCALL); - - hygon_get_topology_early(c); } static void init_hygon(struct cpuinfo_x86 *c) @@ -302,9 +178,6 @@ static void init_hygon(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_REP_GOOD); - /* get apicid instead of initial apic id from cpuid */ - c->topo.apicid = read_apic_id(); - /* * XXX someone from Hygon needs to confirm this DTRT * @@ -316,8 +189,6 @@ static void init_hygon(struct cpuinfo_x86 *c) cpu_detect_cache_sizes(c); - hygon_detect_cmp(c); - hygon_get_topology(c); srat_detect_node(c); init_hygon_cacheinfo(c); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index a927a8fc9624..be30d7fa2e66 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -184,6 +184,90 @@ static bool bad_spectre_microcode(struct cpuinfo_x86 *c) return false; } +#define MSR_IA32_TME_ACTIVATE 0x982 + +/* Helpers to access TME_ACTIVATE MSR */ +#define TME_ACTIVATE_LOCKED(x) (x & 0x1) +#define TME_ACTIVATE_ENABLED(x) (x & 0x2) + +#define TME_ACTIVATE_POLICY(x) ((x >> 4) & 0xf) /* Bits 7:4 */ +#define TME_ACTIVATE_POLICY_AES_XTS_128 0 + +#define TME_ACTIVATE_KEYID_BITS(x) ((x >> 32) & 0xf) /* Bits 35:32 */ + +#define TME_ACTIVATE_CRYPTO_ALGS(x) ((x >> 48) & 0xffff) /* Bits 63:48 */ +#define TME_ACTIVATE_CRYPTO_AES_XTS_128 1 + +/* Values for mktme_status (SW only construct) */ +#define MKTME_ENABLED 0 +#define MKTME_DISABLED 1 +#define MKTME_UNINITIALIZED 2 +static int mktme_status = MKTME_UNINITIALIZED; + +static void detect_tme_early(struct cpuinfo_x86 *c) +{ + u64 tme_activate, tme_policy, tme_crypto_algs; + int keyid_bits = 0, nr_keyids = 0; + static u64 tme_activate_cpu0 = 0; + + rdmsrl(MSR_IA32_TME_ACTIVATE, tme_activate); + + if (mktme_status != MKTME_UNINITIALIZED) { + if (tme_activate != tme_activate_cpu0) { + /* Broken BIOS? */ + pr_err_once("x86/tme: configuration is inconsistent between CPUs\n"); + pr_err_once("x86/tme: MKTME is not usable\n"); + mktme_status = MKTME_DISABLED; + + /* Proceed. We may need to exclude bits from x86_phys_bits. */ + } + } else { + tme_activate_cpu0 = tme_activate; + } + + if (!TME_ACTIVATE_LOCKED(tme_activate) || !TME_ACTIVATE_ENABLED(tme_activate)) { + pr_info_once("x86/tme: not enabled by BIOS\n"); + mktme_status = MKTME_DISABLED; + return; + } + + if (mktme_status != MKTME_UNINITIALIZED) + goto detect_keyid_bits; + + pr_info("x86/tme: enabled by BIOS\n"); + + tme_policy = TME_ACTIVATE_POLICY(tme_activate); + if (tme_policy != TME_ACTIVATE_POLICY_AES_XTS_128) + pr_warn("x86/tme: Unknown policy is active: %#llx\n", tme_policy); + + tme_crypto_algs = TME_ACTIVATE_CRYPTO_ALGS(tme_activate); + if (!(tme_crypto_algs & TME_ACTIVATE_CRYPTO_AES_XTS_128)) { + pr_err("x86/mktme: No known encryption algorithm is supported: %#llx\n", + tme_crypto_algs); + mktme_status = MKTME_DISABLED; + } +detect_keyid_bits: + keyid_bits = TME_ACTIVATE_KEYID_BITS(tme_activate); + nr_keyids = (1UL << keyid_bits) - 1; + if (nr_keyids) { + pr_info_once("x86/mktme: enabled by BIOS\n"); + pr_info_once("x86/mktme: %d KeyIDs available\n", nr_keyids); + } else { + pr_info_once("x86/mktme: disabled by BIOS\n"); + } + + if (mktme_status == MKTME_UNINITIALIZED) { + /* MKTME is usable */ + mktme_status = MKTME_ENABLED; + } + + /* + * KeyID bits effectively lower the number of physical address + * bits. Update cpuinfo_x86::x86_phys_bits accordingly. + */ + c->x86_phys_bits -= keyid_bits; +} + static void early_init_intel(struct cpuinfo_x86 *c) { u64 misc_enable; @@ -317,11 +401,11 @@ static void early_init_intel(struct cpuinfo_x86 *c) check_memory_type_self_snoop_errata(c); /* - * Get the number of SMT siblings early from the extended topology - * leaf, if available. Otherwise try the legacy SMT detection. + * Adjust the number of physical bits early because it affects the + * valid bits of the MTRR mask registers. */ - if (detect_extended_topology_early(c) < 0) - detect_ht_early(c); + if (cpu_has(c, X86_FEATURE_TME)) + detect_tme_early(c); } static void bsp_init_intel(struct cpuinfo_x86 *c) @@ -482,90 +566,6 @@ static void srat_detect_node(struct cpuinfo_x86 *c) #endif } -#define MSR_IA32_TME_ACTIVATE 0x982 - -/* Helpers to access TME_ACTIVATE MSR */ -#define TME_ACTIVATE_LOCKED(x) (x & 0x1) -#define TME_ACTIVATE_ENABLED(x) (x & 0x2) - -#define TME_ACTIVATE_POLICY(x) ((x >> 4) & 0xf) /* Bits 7:4 */ -#define TME_ACTIVATE_POLICY_AES_XTS_128 0 - -#define TME_ACTIVATE_KEYID_BITS(x) ((x >> 32) & 0xf) /* Bits 35:32 */ - -#define TME_ACTIVATE_CRYPTO_ALGS(x) ((x >> 48) & 0xffff) /* Bits 63:48 */ -#define TME_ACTIVATE_CRYPTO_AES_XTS_128 1 - -/* Values for mktme_status (SW only construct) */ -#define MKTME_ENABLED 0 -#define MKTME_DISABLED 1 -#define MKTME_UNINITIALIZED 2 -static int mktme_status = MKTME_UNINITIALIZED; - -static void detect_tme(struct cpuinfo_x86 *c) -{ - u64 tme_activate, tme_policy, tme_crypto_algs; - int keyid_bits = 0, nr_keyids = 0; - static u64 tme_activate_cpu0 = 0; - - rdmsrl(MSR_IA32_TME_ACTIVATE, tme_activate); - - if (mktme_status != MKTME_UNINITIALIZED) { - if (tme_activate != tme_activate_cpu0) { - /* Broken BIOS? */ - pr_err_once("x86/tme: configuration is inconsistent between CPUs\n"); - pr_err_once("x86/tme: MKTME is not usable\n"); - mktme_status = MKTME_DISABLED; - - /* Proceed. We may need to exclude bits from x86_phys_bits. */ - } - } else { - tme_activate_cpu0 = tme_activate; - } - - if (!TME_ACTIVATE_LOCKED(tme_activate) || !TME_ACTIVATE_ENABLED(tme_activate)) { - pr_info_once("x86/tme: not enabled by BIOS\n"); - mktme_status = MKTME_DISABLED; - return; - } - - if (mktme_status != MKTME_UNINITIALIZED) - goto detect_keyid_bits; - - pr_info("x86/tme: enabled by BIOS\n"); - - tme_policy = TME_ACTIVATE_POLICY(tme_activate); - if (tme_policy != TME_ACTIVATE_POLICY_AES_XTS_128) - pr_warn("x86/tme: Unknown policy is active: %#llx\n", tme_policy); - - tme_crypto_algs = TME_ACTIVATE_CRYPTO_ALGS(tme_activate); - if (!(tme_crypto_algs & TME_ACTIVATE_CRYPTO_AES_XTS_128)) { - pr_err("x86/mktme: No known encryption algorithm is supported: %#llx\n", - tme_crypto_algs); - mktme_status = MKTME_DISABLED; - } -detect_keyid_bits: - keyid_bits = TME_ACTIVATE_KEYID_BITS(tme_activate); - nr_keyids = (1UL << keyid_bits) - 1; - if (nr_keyids) { - pr_info_once("x86/mktme: enabled by BIOS\n"); - pr_info_once("x86/mktme: %d KeyIDs available\n", nr_keyids); - } else { - pr_info_once("x86/mktme: disabled by BIOS\n"); - } - - if (mktme_status == MKTME_UNINITIALIZED) { - /* MKTME is usable */ - mktme_status = MKTME_ENABLED; - } - - /* - * KeyID bits effectively lower the number of physical address - * bits. Update cpuinfo_x86::x86_phys_bits accordingly. - */ - c->x86_phys_bits -= keyid_bits; -} - static void init_cpuid_fault(struct cpuinfo_x86 *c) { u64 msr; @@ -603,24 +603,6 @@ static void init_intel(struct cpuinfo_x86 *c) intel_workarounds(c); - /* - * Detect the extended topology information if available. This - * will reinitialise the initial_apicid which will be used - * in init_intel_cacheinfo() - */ - detect_extended_topology(c); - - if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) { - /* - * let's use the legacy cpuid vector 0x1 and 0x4 for topology - * detection. - */ - detect_num_cpu_cores(c); -#ifdef CONFIG_X86_32 - detect_ht(c); -#endif - } - init_intel_cacheinfo(c); if (c->cpuid_level > 9) { @@ -702,9 +684,6 @@ static void init_intel(struct cpuinfo_x86 *c) init_ia32_feat_ctl(c); - if (cpu_has(c, X86_FEATURE_TME)) - detect_tme(c); - init_intel_misc_features(c); split_lock_init(); diff --git a/arch/x86/kernel/cpu/intel_pconfig.c b/arch/x86/kernel/cpu/intel_pconfig.c index 0771a905b286..5be2b1790282 100644 --- a/arch/x86/kernel/cpu/intel_pconfig.c +++ b/arch/x86/kernel/cpu/intel_pconfig.c @@ -7,6 +7,8 @@ * Author: * Kirill A. Shutemov <kirill.shutemov@linux.intel.com> */ +#include <linux/bug.h> +#include <linux/limits.h> #include <asm/cpufeature.h> #include <asm/intel_pconfig.h> diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 2b46eb0fdf3a..9a0133ef7e20 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -1231,7 +1231,7 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu, return -ENODEV; if (is_shared_bank(bank)) { - nb = node_to_amd_nb(topology_die_id(cpu)); + nb = node_to_amd_nb(topology_amd_node_id(cpu)); /* threshold descriptor already initialized on this node? */ if (nb && nb->bank4) { @@ -1335,7 +1335,7 @@ static void threshold_remove_bank(struct threshold_bank *bank) * The last CPU on this node using the shared bank is going * away, remove that bank now. */ - nb = node_to_amd_nb(topology_die_id(smp_processor_id())); + nb = node_to_amd_nb(topology_amd_node_id(smp_processor_id())); nb->bank4 = NULL; } diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index bc39252bc54f..b5cc557cfc37 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -46,6 +46,7 @@ #include <linux/hardirq.h> #include <linux/kexec.h> +#include <asm/fred.h> #include <asm/intel-family.h> #include <asm/processor.h> #include <asm/traps.h> @@ -2166,6 +2167,31 @@ DEFINE_IDTENTRY_MCE_USER(exc_machine_check) exc_machine_check_user(regs); local_db_restore(dr7); } + +#ifdef CONFIG_X86_FRED +/* + * When occurred on different ring level, i.e., from user or kernel + * context, #MCE needs to be handled on different stack: User #MCE + * on current task stack, while kernel #MCE on a dedicated stack. + * + * This is exactly how FRED event delivery invokes an exception + * handler: ring 3 event on level 0 stack, i.e., current task stack; + * ring 0 event on the #MCE dedicated stack specified in the + * IA32_FRED_STKLVLS MSR. So unlike IDT, the FRED machine check entry + * stub doesn't do stack switch. + */ +DEFINE_FREDENTRY_MCE(exc_machine_check) +{ + unsigned long dr7; + + dr7 = local_db_save(); + if (user_mode(regs)) + exc_machine_check_user(regs); + else + exc_machine_check_kernel(regs); + local_db_restore(dr7); +} +#endif #else /* 32bit unified entry point */ DEFINE_IDTENTRY_RAW(exc_machine_check) @@ -2431,7 +2457,7 @@ static void mce_enable_ce(void *all) __mcheck_cpu_init_timer(); } -static struct bus_type mce_subsys = { +static const struct bus_type mce_subsys = { .name = "machinecheck", .dev_name = "machinecheck", }; diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 72f0695c3dc1..94953d749475 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -430,11 +430,9 @@ static void trigger_thr_int(void *info) static u32 get_nbc_for_node(int node_id) { - struct cpuinfo_x86 *c = &boot_cpu_data; u32 cores_per_node; - cores_per_node = (c->x86_max_cores * smp_num_siblings) / amd_get_nodes_per_socket(); - + cores_per_node = topology_num_threads_per_package() / topology_amd_nodes_per_pkg(); return cores_per_node * node_id; } @@ -543,8 +541,8 @@ static void do_inject(void) if (boot_cpu_has(X86_FEATURE_AMD_DCM) && b == 4 && boot_cpu_data.x86 < 0x17) { - toggle_nb_mca_mst_cpu(topology_die_id(cpu)); - cpu = get_nbc_for_node(topology_die_id(cpu)); + toggle_nb_mca_mst_cpu(topology_amd_node_id(cpu)); + cpu = get_nbc_for_node(topology_amd_node_id(cpu)); } cpus_read_lock(); diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 857e608af641..5f0414452b67 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -641,7 +641,7 @@ static __init void calc_llc_size_per_core(struct cpuinfo_x86 *c) { u64 llc_size = c->x86_cache_size * 1024ULL; - do_div(llc_size, c->x86_max_cores); + do_div(llc_size, topology_num_cores_per_package()); llc_size_per_core = (unsigned int)llc_size; } diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 01fa06dd06b6..45e0e70e238c 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -539,19 +539,18 @@ static void __init ms_hyperv_init_platform(void) */ x86_platform.apic_post_init = hyperv_init; hyperv_setup_mmu_ops(); - /* Setup the IDT for hypervisor callback */ - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_hyperv_callback); - /* Setup the IDT for reenlightenment notifications */ + /* Install system interrupt handler for hypervisor callback */ + sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_hyperv_callback); + + /* Install system interrupt handler for reenlightenment notifications */ if (ms_hyperv.features & HV_ACCESS_REENLIGHTENMENT) { - alloc_intr_gate(HYPERV_REENLIGHTENMENT_VECTOR, - asm_sysvec_hyperv_reenlightenment); + sysvec_install(HYPERV_REENLIGHTENMENT_VECTOR, sysvec_hyperv_reenlightenment); } - /* Setup the IDT for stimer0 */ + /* Install system interrupt handler for stimer0 */ if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE) { - alloc_intr_gate(HYPERV_STIMER0_VECTOR, - asm_sysvec_hyperv_stimer0); + sysvec_install(HYPERV_STIMER0_VECTOR, sysvec_hyperv_stimer0); } # ifdef CONFIG_SMP diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index d3524778a545..422a4ddc2ab7 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -108,6 +108,9 @@ static inline void k8_check_syscfg_dram_mod_en(void) (boot_cpu_data.x86 >= 0x0f))) return; + if (cpu_feature_enabled(X86_FEATURE_SEV_SNP)) + return; + rdmsr(MSR_AMD64_SYSCFG, lo, hi); if (lo & K8_MTRRFIXRANGE_DRAM_MODIFY) { pr_err(FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]" diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c index 26a427fa84ea..eeac00d20926 100644 --- a/arch/x86/kernel/cpu/rdrand.c +++ b/arch/x86/kernel/cpu/rdrand.c @@ -6,6 +6,7 @@ * Authors: Fenghua Yu <fenghua.yu@intel.com>, * H. Peter Anvin <hpa@linux.intel.com> */ +#include <linux/printk.h> #include <asm/processor.h> #include <asm/archrandom.h> diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 19e0681f0435..83e40341583e 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -16,6 +16,7 @@ #define pr_fmt(fmt) "resctrl: " fmt +#include <linux/cpu.h> #include <linux/slab.h> #include <linux/err.h> #include <linux/cacheinfo.h> @@ -25,8 +26,15 @@ #include <asm/resctrl.h> #include "internal.h" -/* Mutex to protect rdtgroup access. */ -DEFINE_MUTEX(rdtgroup_mutex); +/* + * rdt_domain structures are kfree()d when their last CPU goes offline, + * and allocated when the first CPU in a new domain comes online. + * The rdt_resource's domain list is updated when this happens. Readers of + * the domain list must either take cpus_read_lock(), or rely on an RCU + * read-side critical section, to avoid observing concurrent modification. + * All writers take this mutex: + */ +static DEFINE_MUTEX(domain_list_lock); /* * The cached resctrl_pqr_state is strictly per CPU and can never be @@ -136,15 +144,15 @@ static inline void cache_alloc_hsw_probe(void) { struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_L3]; struct rdt_resource *r = &hw_res->r_resctrl; - u32 l, h, max_cbm = BIT_MASK(20) - 1; + u64 max_cbm = BIT_ULL_MASK(20) - 1, l3_cbm_0; - if (wrmsr_safe(MSR_IA32_L3_CBM_BASE, max_cbm, 0)) + if (wrmsrl_safe(MSR_IA32_L3_CBM_BASE, max_cbm)) return; - rdmsr(MSR_IA32_L3_CBM_BASE, l, h); + rdmsrl(MSR_IA32_L3_CBM_BASE, l3_cbm_0); /* If all the bits were set in MSR, return success */ - if (l != max_cbm) + if (l3_cbm_0 != max_cbm) return; hw_res->num_closid = 4; @@ -231,9 +239,7 @@ static bool __get_mem_config_intel(struct rdt_resource *r) static bool __rdt_get_mem_config_amd(struct rdt_resource *r) { struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); - union cpuid_0x10_3_eax eax; - union cpuid_0x10_x_edx edx; - u32 ebx, ecx, subleaf; + u32 eax, ebx, ecx, edx, subleaf; /* * Query CPUID_Fn80000020_EDX_x01 for MBA and @@ -241,9 +247,9 @@ static bool __rdt_get_mem_config_amd(struct rdt_resource *r) */ subleaf = (r->rid == RDT_RESOURCE_SMBA) ? 2 : 1; - cpuid_count(0x80000020, subleaf, &eax.full, &ebx, &ecx, &edx.full); - hw_res->num_closid = edx.split.cos_max + 1; - r->default_ctrl = MAX_MBA_BW_AMD; + cpuid_count(0x80000020, subleaf, &eax, &ebx, &ecx, &edx); + hw_res->num_closid = edx + 1; + r->default_ctrl = 1 << eax; /* AMD does not use delay */ r->membw.delay_linear = false; @@ -512,6 +518,8 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) struct rdt_domain *d; int err; + lockdep_assert_held(&domain_list_lock); + d = rdt_find_domain(r, id, &add_pos); if (IS_ERR(d)) { pr_warn("Couldn't find cache id for CPU %d\n", cpu); @@ -545,11 +553,12 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) return; } - list_add_tail(&d->list, add_pos); + list_add_tail_rcu(&d->list, add_pos); err = resctrl_online_domain(r, d); if (err) { - list_del(&d->list); + list_del_rcu(&d->list); + synchronize_rcu(); domain_free(hw_dom); } } @@ -560,6 +569,8 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) struct rdt_hw_domain *hw_dom; struct rdt_domain *d; + lockdep_assert_held(&domain_list_lock); + d = rdt_find_domain(r, id, NULL); if (IS_ERR_OR_NULL(d)) { pr_warn("Couldn't find cache id for CPU %d\n", cpu); @@ -570,7 +581,8 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) cpumask_clear_cpu(cpu, &d->cpu_mask); if (cpumask_empty(&d->cpu_mask)) { resctrl_offline_domain(r, d); - list_del(&d->list); + list_del_rcu(&d->list); + synchronize_rcu(); /* * rdt_domain "d" is going to be freed below, so clear @@ -582,73 +594,47 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) return; } - - if (r == &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl) { - if (is_mbm_enabled() && cpu == d->mbm_work_cpu) { - cancel_delayed_work(&d->mbm_over); - mbm_setup_overflow_handler(d, 0); - } - if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu && - has_busy_rmid(r, d)) { - cancel_delayed_work(&d->cqm_limbo); - cqm_setup_limbo_handler(d, 0); - } - } } static void clear_closid_rmid(int cpu) { struct resctrl_pqr_state *state = this_cpu_ptr(&pqr_state); - state->default_closid = 0; - state->default_rmid = 0; - state->cur_closid = 0; - state->cur_rmid = 0; - wrmsr(MSR_IA32_PQR_ASSOC, 0, 0); + state->default_closid = RESCTRL_RESERVED_CLOSID; + state->default_rmid = RESCTRL_RESERVED_RMID; + state->cur_closid = RESCTRL_RESERVED_CLOSID; + state->cur_rmid = RESCTRL_RESERVED_RMID; + wrmsr(MSR_IA32_PQR_ASSOC, RESCTRL_RESERVED_RMID, + RESCTRL_RESERVED_CLOSID); } -static int resctrl_online_cpu(unsigned int cpu) +static int resctrl_arch_online_cpu(unsigned int cpu) { struct rdt_resource *r; - mutex_lock(&rdtgroup_mutex); + mutex_lock(&domain_list_lock); for_each_capable_rdt_resource(r) domain_add_cpu(cpu, r); - /* The cpu is set in default rdtgroup after online. */ - cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); + mutex_unlock(&domain_list_lock); + clear_closid_rmid(cpu); - mutex_unlock(&rdtgroup_mutex); + resctrl_online_cpu(cpu); return 0; } -static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) -{ - struct rdtgroup *cr; - - list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) { - if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) { - break; - } - } -} - -static int resctrl_offline_cpu(unsigned int cpu) +static int resctrl_arch_offline_cpu(unsigned int cpu) { - struct rdtgroup *rdtgrp; struct rdt_resource *r; - mutex_lock(&rdtgroup_mutex); + resctrl_offline_cpu(cpu); + + mutex_lock(&domain_list_lock); for_each_capable_rdt_resource(r) domain_remove_cpu(cpu, r); - list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { - if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) { - clear_childcpus(rdtgrp, cpu); - break; - } - } + mutex_unlock(&domain_list_lock); + clear_closid_rmid(cpu); - mutex_unlock(&rdtgroup_mutex); return 0; } @@ -968,7 +954,8 @@ static int __init resctrl_late_init(void) state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/resctrl/cat:online:", - resctrl_online_cpu, resctrl_offline_cpu); + resctrl_arch_online_cpu, + resctrl_arch_offline_cpu); if (state < 0) return state; @@ -992,8 +979,14 @@ late_initcall(resctrl_late_init); static void __exit resctrl_exit(void) { + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + cpuhp_remove_state(rdt_online); + rdtgroup_exit(); + + if (r->mon_capable) + rdt_put_mon_l3_config(); } __exitcall(resctrl_exit); diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index beccb0e87ba7..7997b47743a2 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -19,6 +19,8 @@ #include <linux/kernfs.h> #include <linux/seq_file.h> #include <linux/slab.h> +#include <linux/tick.h> + #include "internal.h" /* @@ -210,6 +212,9 @@ static int parse_line(char *line, struct resctrl_schema *s, struct rdt_domain *d; unsigned long dom_id; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)) { rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n"); @@ -314,6 +319,9 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) struct rdt_domain *d; u32 idx; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) return -ENOMEM; @@ -379,11 +387,9 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, return -EINVAL; buf[nbytes - 1] = '\0'; - cpus_read_lock(); rdtgrp = rdtgroup_kn_lock_live(of->kn); if (!rdtgrp) { rdtgroup_kn_unlock(of->kn); - cpus_read_unlock(); return -ENOENT; } rdt_last_cmd_clear(); @@ -445,7 +451,6 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, out: rdt_staged_configs_clear(); rdtgroup_kn_unlock(of->kn); - cpus_read_unlock(); return ret ?: nbytes; } @@ -465,6 +470,9 @@ static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int clo bool sep = false; u32 ctrl_val; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + seq_printf(s, "%*s:", max_name_width, schema->name); list_for_each_entry(dom, &r->domains, list) { if (sep) @@ -522,12 +530,24 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, return ret; } +static int smp_mon_event_count(void *arg) +{ + mon_event_count(arg); + + return 0; +} + void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, struct rdt_domain *d, struct rdtgroup *rdtgrp, int evtid, int first) { + int cpu; + + /* When picking a CPU from cpu_mask, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + /* - * setup the parameters to send to the IPI to read the data. + * Setup the parameters to pass to mon_event_count() to read the data. */ rr->rgrp = rdtgrp; rr->evtid = evtid; @@ -535,8 +555,26 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, rr->d = d; rr->val = 0; rr->first = first; + rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid); + if (IS_ERR(rr->arch_mon_ctx)) { + rr->err = -EINVAL; + return; + } + + cpu = cpumask_any_housekeeping(&d->cpu_mask, RESCTRL_PICK_ANY_CPU); + + /* + * cpumask_any_housekeeping() prefers housekeeping CPUs, but + * are all the CPUs nohz_full? If yes, pick a CPU to IPI. + * MPAM's resctrl_arch_rmid_read() is unable to read the + * counters on some platforms if its called in IRQ context. + */ + if (tick_nohz_full_cpu(cpu)) + smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1); + else + smp_call_on_cpu(cpu, smp_mon_event_count, rr, false); - smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1); + resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx); } int rdtgroup_mondata_show(struct seq_file *m, void *arg) diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index a4f1aa15f0a2..c99f26ebe7a6 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -7,6 +7,9 @@ #include <linux/kernfs.h> #include <linux/fs_context.h> #include <linux/jump_label.h> +#include <linux/tick.h> + +#include <asm/resctrl.h> #define L3_QOS_CDP_ENABLE 0x01ULL @@ -18,7 +21,6 @@ #define MBM_OVERFLOW_INTERVAL 1000 #define MAX_MBA_BW 100u #define MBA_IS_LINEAR 0x4 -#define MAX_MBA_BW_AMD 0x800 #define MBM_CNTR_WIDTH_OFFSET_AMD 20 #define RMID_VAL_ERROR BIT_ULL(63) @@ -54,6 +56,46 @@ /* Max event bits supported */ #define MAX_EVT_CONFIG_BITS GENMASK(6, 0) +/** + * cpumask_any_housekeeping() - Choose any CPU in @mask, preferring those that + * aren't marked nohz_full + * @mask: The mask to pick a CPU from. + * @exclude_cpu:The CPU to avoid picking. + * + * Returns a CPU from @mask, but not @exclude_cpu. If there are housekeeping + * CPUs that don't use nohz_full, these are preferred. Pass + * RESCTRL_PICK_ANY_CPU to avoid excluding any CPUs. + * + * When a CPU is excluded, returns >= nr_cpu_ids if no CPUs are available. + */ +static inline unsigned int +cpumask_any_housekeeping(const struct cpumask *mask, int exclude_cpu) +{ + unsigned int cpu, hk_cpu; + + if (exclude_cpu == RESCTRL_PICK_ANY_CPU) + cpu = cpumask_any(mask); + else + cpu = cpumask_any_but(mask, exclude_cpu); + + if (!IS_ENABLED(CONFIG_NO_HZ_FULL)) + return cpu; + + /* If the CPU picked isn't marked nohz_full nothing more needs doing. */ + if (cpu < nr_cpu_ids && !tick_nohz_full_cpu(cpu)) + return cpu; + + /* Try to find a CPU that isn't nohz_full to use in preference */ + hk_cpu = cpumask_nth_andnot(0, mask, tick_nohz_full_mask); + if (hk_cpu == exclude_cpu) + hk_cpu = cpumask_nth_andnot(1, mask, tick_nohz_full_mask); + + if (hk_cpu < nr_cpu_ids) + cpu = hk_cpu; + + return cpu; +} + struct rdt_fs_context { struct kernfs_fs_context kfc; bool enable_cdpl2; @@ -69,9 +111,6 @@ static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) return container_of(kfc, struct rdt_fs_context, kfc); } -DECLARE_STATIC_KEY_FALSE(rdt_enable_key); -DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key); - /** * struct mon_evt - Entry in the event list of a resource * @evtid: event id @@ -112,12 +151,12 @@ struct rmid_read { bool first; int err; u64 val; + void *arch_mon_ctx; }; -extern bool rdt_alloc_capable; -extern bool rdt_mon_capable; extern unsigned int rdt_mon_features; extern struct list_head resctrl_schema_all; +extern bool resctrl_mounted; enum rdt_group_type { RDTCTRL_GROUP = 0, @@ -296,14 +335,10 @@ struct rftype { * struct mbm_state - status for each MBM counter in each domain * @prev_bw_bytes: Previous bytes value read for bandwidth calculation * @prev_bw: The most recent bandwidth in MBps - * @delta_bw: Difference between the current and previous bandwidth - * @delta_comp: Indicates whether to compute the delta_bw */ struct mbm_state { u64 prev_bw_bytes; u32 prev_bw; - u32 delta_bw; - bool delta_comp; }; /** @@ -395,6 +430,8 @@ struct rdt_parse_data { * @msr_update: Function pointer to update QOS MSRs * @mon_scale: cqm counter * mon_scale = occupancy in bytes * @mbm_width: Monitor width, to detect and correct for overflow. + * @mbm_cfg_mask: Bandwidth sources that can be tracked when Bandwidth + * Monitoring Event Configuration (BMEC) is supported. * @cdp_enabled: CDP state of this resource * * Members of this structure are either private to the architecture @@ -409,6 +446,7 @@ struct rdt_hw_resource { struct rdt_resource *r); unsigned int mon_scale; unsigned int mbm_width; + unsigned int mbm_cfg_mask; bool cdp_enabled; }; @@ -426,8 +464,6 @@ extern struct mutex rdtgroup_mutex; extern struct rdt_hw_resource rdt_resources_all[]; extern struct rdtgroup rdtgroup_default; -DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); - extern struct dentry *debugfs_resctrl; enum resctrl_res_level { @@ -543,9 +579,10 @@ void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp); struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r); int closids_supported(void); void closid_free(int closid); -int alloc_rmid(void); -void free_rmid(u32 rmid); +int alloc_rmid(u32 closid); +void free_rmid(u32 closid, u32 rmid); int rdt_get_mon_l3_config(struct rdt_resource *r); +void __exit rdt_put_mon_l3_config(void); bool __init rdt_cpu_has(int flag); void mon_event_count(void *info); int rdtgroup_mondata_show(struct seq_file *m, void *arg); @@ -553,17 +590,21 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, struct rdt_domain *d, struct rdtgroup *rdtgrp, int evtid, int first); void mbm_setup_overflow_handler(struct rdt_domain *dom, - unsigned long delay_ms); + unsigned long delay_ms, + int exclude_cpu); void mbm_handle_overflow(struct work_struct *work); void __init intel_rdt_mbm_apply_quirk(void); bool is_mba_sc(struct rdt_resource *r); -void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms); +void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms, + int exclude_cpu); void cqm_handle_limbo(struct work_struct *work); -bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d); +bool has_busy_rmid(struct rdt_domain *d); void __check_limbo(struct rdt_domain *d, bool force_free); void rdt_domain_reconfigure_cdp(struct rdt_resource *r); void __init thread_throttle_mode_init(void); void __init mbm_config_rftype_init(const char *config); void rdt_staged_configs_clear(void); +bool closid_allocated(unsigned int closid); +int resctrl_find_cleanest_closid(void); #endif /* _ASM_X86_RESCTRL_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index f136ac046851..c34a35ec0f03 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -15,6 +15,7 @@ * Software Developer Manual June 2016, volume 3, section 17.17. */ +#include <linux/cpu.h> #include <linux/module.h> #include <linux/sizes.h> #include <linux/slab.h> @@ -24,7 +25,20 @@ #include "internal.h" +/** + * struct rmid_entry - dirty tracking for all RMID. + * @closid: The CLOSID for this entry. + * @rmid: The RMID for this entry. + * @busy: The number of domains with cached data using this RMID. + * @list: Member of the rmid_free_lru list when busy == 0. + * + * Depending on the architecture the correct monitor is accessed using + * both @closid and @rmid, or @rmid only. + * + * Take the rdtgroup_mutex when accessing. + */ struct rmid_entry { + u32 closid; u32 rmid; int busy; struct list_head list; @@ -38,6 +52,13 @@ struct rmid_entry { static LIST_HEAD(rmid_free_lru); /* + * @closid_num_dirty_rmid The number of dirty RMID each CLOSID has. + * Only allocated when CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID is defined. + * Indexed by CLOSID. Protected by rdtgroup_mutex. + */ +static u32 *closid_num_dirty_rmid; + +/* * @rmid_limbo_count - count of currently unused but (potentially) * dirty RMIDs. * This counts RMIDs that no one is currently using but that @@ -136,12 +157,29 @@ static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val) return val; } -static inline struct rmid_entry *__rmid_entry(u32 rmid) +/* + * x86 and arm64 differ in their handling of monitoring. + * x86's RMID are independent numbers, there is only one source of traffic + * with an RMID value of '1'. + * arm64's PMG extends the PARTID/CLOSID space, there are multiple sources of + * traffic with a PMG value of '1', one for each CLOSID, meaning the RMID + * value is no longer unique. + * To account for this, resctrl uses an index. On x86 this is just the RMID, + * on arm64 it encodes the CLOSID and RMID. This gives a unique number. + * + * The domain's rmid_busy_llc and rmid_ptrs[] are sized by index. The arch code + * must accept an attempt to read every index. + */ +static inline struct rmid_entry *__rmid_entry(u32 idx) { struct rmid_entry *entry; + u32 closid, rmid; + + entry = &rmid_ptrs[idx]; + resctrl_arch_rmid_idx_decode(idx, &closid, &rmid); - entry = &rmid_ptrs[rmid]; - WARN_ON(entry->rmid != rmid); + WARN_ON_ONCE(entry->closid != closid); + WARN_ON_ONCE(entry->rmid != rmid); return entry; } @@ -190,7 +228,8 @@ static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_domain *hw_dom, } void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d, - u32 rmid, enum resctrl_event_id eventid) + u32 unused, u32 rmid, + enum resctrl_event_id eventid) { struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); struct arch_mbm_state *am; @@ -230,7 +269,8 @@ static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) } int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, - u32 rmid, enum resctrl_event_id eventid, u64 *val) + u32 unused, u32 rmid, enum resctrl_event_id eventid, + u64 *val, void *ignored) { struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); @@ -238,6 +278,8 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, u64 msr_val, chunks; int ret; + resctrl_arch_rmid_read_context_check(); + if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask)) return -EINVAL; @@ -260,6 +302,17 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, return 0; } +static void limbo_release_entry(struct rmid_entry *entry) +{ + lockdep_assert_held(&rdtgroup_mutex); + + rmid_limbo_count--; + list_add_tail(&entry->list, &rmid_free_lru); + + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) + closid_num_dirty_rmid[entry->closid]--; +} + /* * Check the RMIDs that are marked as busy for this domain. If the * reported LLC occupancy is below the threshold clear the busy bit and @@ -269,11 +322,20 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, void __check_limbo(struct rdt_domain *d, bool force_free) { struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); struct rmid_entry *entry; - u32 crmid = 1, nrmid; + u32 idx, cur_idx = 1; + void *arch_mon_ctx; bool rmid_dirty; u64 val = 0; + arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, QOS_L3_OCCUP_EVENT_ID); + if (IS_ERR(arch_mon_ctx)) { + pr_warn_ratelimited("Failed to allocate monitor context: %ld", + PTR_ERR(arch_mon_ctx)); + return; + } + /* * Skip RMID 0 and start from RMID 1 and check all the RMIDs that * are marked as busy for occupancy < threshold. If the occupancy @@ -281,53 +343,125 @@ void __check_limbo(struct rdt_domain *d, bool force_free) * RMID and move it to the free list when the counter reaches 0. */ for (;;) { - nrmid = find_next_bit(d->rmid_busy_llc, r->num_rmid, crmid); - if (nrmid >= r->num_rmid) + idx = find_next_bit(d->rmid_busy_llc, idx_limit, cur_idx); + if (idx >= idx_limit) break; - entry = __rmid_entry(nrmid); - - if (resctrl_arch_rmid_read(r, d, entry->rmid, - QOS_L3_OCCUP_EVENT_ID, &val)) { + entry = __rmid_entry(idx); + if (resctrl_arch_rmid_read(r, d, entry->closid, entry->rmid, + QOS_L3_OCCUP_EVENT_ID, &val, + arch_mon_ctx)) { rmid_dirty = true; } else { rmid_dirty = (val >= resctrl_rmid_realloc_threshold); } if (force_free || !rmid_dirty) { - clear_bit(entry->rmid, d->rmid_busy_llc); - if (!--entry->busy) { - rmid_limbo_count--; - list_add_tail(&entry->list, &rmid_free_lru); - } + clear_bit(idx, d->rmid_busy_llc); + if (!--entry->busy) + limbo_release_entry(entry); } - crmid = nrmid + 1; + cur_idx = idx + 1; } + + resctrl_arch_mon_ctx_free(r, QOS_L3_OCCUP_EVENT_ID, arch_mon_ctx); } -bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d) +bool has_busy_rmid(struct rdt_domain *d) { - return find_first_bit(d->rmid_busy_llc, r->num_rmid) != r->num_rmid; + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); + + return find_first_bit(d->rmid_busy_llc, idx_limit) != idx_limit; +} + +static struct rmid_entry *resctrl_find_free_rmid(u32 closid) +{ + struct rmid_entry *itr; + u32 itr_idx, cmp_idx; + + if (list_empty(&rmid_free_lru)) + return rmid_limbo_count ? ERR_PTR(-EBUSY) : ERR_PTR(-ENOSPC); + + list_for_each_entry(itr, &rmid_free_lru, list) { + /* + * Get the index of this free RMID, and the index it would need + * to be if it were used with this CLOSID. + * If the CLOSID is irrelevant on this architecture, the two + * index values are always the same on every entry and thus the + * very first entry will be returned. + */ + itr_idx = resctrl_arch_rmid_idx_encode(itr->closid, itr->rmid); + cmp_idx = resctrl_arch_rmid_idx_encode(closid, itr->rmid); + + if (itr_idx == cmp_idx) + return itr; + } + + return ERR_PTR(-ENOSPC); +} + +/** + * resctrl_find_cleanest_closid() - Find a CLOSID where all the associated + * RMID are clean, or the CLOSID that has + * the most clean RMID. + * + * MPAM's equivalent of RMID are per-CLOSID, meaning a freshly allocated CLOSID + * may not be able to allocate clean RMID. To avoid this the allocator will + * choose the CLOSID with the most clean RMID. + * + * When the CLOSID and RMID are independent numbers, the first free CLOSID will + * be returned. + */ +int resctrl_find_cleanest_closid(void) +{ + u32 cleanest_closid = ~0; + int i = 0; + + lockdep_assert_held(&rdtgroup_mutex); + + if (!IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) + return -EIO; + + for (i = 0; i < closids_supported(); i++) { + int num_dirty; + + if (closid_allocated(i)) + continue; + + num_dirty = closid_num_dirty_rmid[i]; + if (num_dirty == 0) + return i; + + if (cleanest_closid == ~0) + cleanest_closid = i; + + if (num_dirty < closid_num_dirty_rmid[cleanest_closid]) + cleanest_closid = i; + } + + if (cleanest_closid == ~0) + return -ENOSPC; + + return cleanest_closid; } /* - * As of now the RMIDs allocation is global. - * However we keep track of which packages the RMIDs - * are used to optimize the limbo list management. + * For MPAM the RMID value is not unique, and has to be considered with + * the CLOSID. The (CLOSID, RMID) pair is allocated on all domains, which + * allows all domains to be managed by a single free list. + * Each domain also has a rmid_busy_llc to reduce the work of the limbo handler. */ -int alloc_rmid(void) +int alloc_rmid(u32 closid) { struct rmid_entry *entry; lockdep_assert_held(&rdtgroup_mutex); - if (list_empty(&rmid_free_lru)) - return rmid_limbo_count ? -EBUSY : -ENOSPC; + entry = resctrl_find_free_rmid(closid); + if (IS_ERR(entry)) + return PTR_ERR(entry); - entry = list_first_entry(&rmid_free_lru, - struct rmid_entry, list); list_del(&entry->list); - return entry->rmid; } @@ -335,47 +469,50 @@ static void add_rmid_to_limbo(struct rmid_entry *entry) { struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; struct rdt_domain *d; - int cpu, err; - u64 val = 0; + u32 idx; + + lockdep_assert_held(&rdtgroup_mutex); + + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + + idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid); entry->busy = 0; - cpu = get_cpu(); list_for_each_entry(d, &r->domains, list) { - if (cpumask_test_cpu(cpu, &d->cpu_mask)) { - err = resctrl_arch_rmid_read(r, d, entry->rmid, - QOS_L3_OCCUP_EVENT_ID, - &val); - if (err || val <= resctrl_rmid_realloc_threshold) - continue; - } - /* * For the first limbo RMID in the domain, * setup up the limbo worker. */ - if (!has_busy_rmid(r, d)) - cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL); - set_bit(entry->rmid, d->rmid_busy_llc); + if (!has_busy_rmid(d)) + cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL, + RESCTRL_PICK_ANY_CPU); + set_bit(idx, d->rmid_busy_llc); entry->busy++; } - put_cpu(); - if (entry->busy) - rmid_limbo_count++; - else - list_add_tail(&entry->list, &rmid_free_lru); + rmid_limbo_count++; + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) + closid_num_dirty_rmid[entry->closid]++; } -void free_rmid(u32 rmid) +void free_rmid(u32 closid, u32 rmid) { + u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); struct rmid_entry *entry; - if (!rmid) - return; - lockdep_assert_held(&rdtgroup_mutex); - entry = __rmid_entry(rmid); + /* + * Do not allow the default rmid to be free'd. Comparing by index + * allows architectures that ignore the closid parameter to avoid an + * unnecessary check. + */ + if (idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, + RESCTRL_RESERVED_RMID)) + return; + + entry = __rmid_entry(idx); if (is_llc_occupancy_enabled()) add_rmid_to_limbo(entry); @@ -383,33 +520,36 @@ void free_rmid(u32 rmid) list_add_tail(&entry->list, &rmid_free_lru); } -static struct mbm_state *get_mbm_state(struct rdt_domain *d, u32 rmid, - enum resctrl_event_id evtid) +static struct mbm_state *get_mbm_state(struct rdt_domain *d, u32 closid, + u32 rmid, enum resctrl_event_id evtid) { + u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); + switch (evtid) { case QOS_L3_MBM_TOTAL_EVENT_ID: - return &d->mbm_total[rmid]; + return &d->mbm_total[idx]; case QOS_L3_MBM_LOCAL_EVENT_ID: - return &d->mbm_local[rmid]; + return &d->mbm_local[idx]; default: return NULL; } } -static int __mon_event_count(u32 rmid, struct rmid_read *rr) +static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) { struct mbm_state *m; u64 tval = 0; if (rr->first) { - resctrl_arch_reset_rmid(rr->r, rr->d, rmid, rr->evtid); - m = get_mbm_state(rr->d, rmid, rr->evtid); + resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid); + m = get_mbm_state(rr->d, closid, rmid, rr->evtid); if (m) memset(m, 0, sizeof(struct mbm_state)); return 0; } - rr->err = resctrl_arch_rmid_read(rr->r, rr->d, rmid, rr->evtid, &tval); + rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, rr->evtid, + &tval, rr->arch_mon_ctx); if (rr->err) return rr->err; @@ -421,6 +561,7 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr) /* * mbm_bw_count() - Update bw count from values previously read by * __mon_event_count(). + * @closid: The closid used to identify the cached mbm_state. * @rmid: The rmid used to identify the cached mbm_state. * @rr: The struct rmid_read populated by __mon_event_count(). * @@ -429,9 +570,10 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr) * __mon_event_count() is compared with the chunks value from the previous * invocation. This must be called once per second to maintain values in MBps. */ -static void mbm_bw_count(u32 rmid, struct rmid_read *rr) +static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr) { - struct mbm_state *m = &rr->d->mbm_local[rmid]; + u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); + struct mbm_state *m = &rr->d->mbm_local[idx]; u64 cur_bw, bytes, cur_bytes; cur_bytes = rr->val; @@ -440,14 +582,11 @@ static void mbm_bw_count(u32 rmid, struct rmid_read *rr) cur_bw = bytes / SZ_1M; - if (m->delta_comp) - m->delta_bw = abs(cur_bw - m->prev_bw); - m->delta_comp = false; m->prev_bw = cur_bw; } /* - * This is called via IPI to read the CQM/MBM counters + * This is scheduled by mon_event_read() to read the CQM/MBM counters * on a domain. */ void mon_event_count(void *info) @@ -459,7 +598,7 @@ void mon_event_count(void *info) rdtgrp = rr->rgrp; - ret = __mon_event_count(rdtgrp->mon.rmid, rr); + ret = __mon_event_count(rdtgrp->closid, rdtgrp->mon.rmid, rr); /* * For Ctrl groups read data from child monitor groups and @@ -470,7 +609,8 @@ void mon_event_count(void *info) if (rdtgrp->type == RDTCTRL_GROUP) { list_for_each_entry(entry, head, mon.crdtgrp_list) { - if (__mon_event_count(entry->mon.rmid, rr) == 0) + if (__mon_event_count(entry->closid, entry->mon.rmid, + rr) == 0) ret = 0; } } @@ -520,9 +660,9 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) { u32 closid, rmid, cur_msr_val, new_msr_val; struct mbm_state *pmbm_data, *cmbm_data; - u32 cur_bw, delta_bw, user_bw; struct rdt_resource *r_mba; struct rdt_domain *dom_mba; + u32 cur_bw, user_bw, idx; struct list_head *head; struct rdtgroup *entry; @@ -533,7 +673,8 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) closid = rgrp->closid; rmid = rgrp->mon.rmid; - pmbm_data = &dom_mbm->mbm_local[rmid]; + idx = resctrl_arch_rmid_idx_encode(closid, rmid); + pmbm_data = &dom_mbm->mbm_local[idx]; dom_mba = get_domain_from_cpu(smp_processor_id(), r_mba); if (!dom_mba) { @@ -543,7 +684,6 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) cur_bw = pmbm_data->prev_bw; user_bw = dom_mba->mbps_val[closid]; - delta_bw = pmbm_data->delta_bw; /* MBA resource doesn't support CDP */ cur_msr_val = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE); @@ -555,52 +695,35 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) list_for_each_entry(entry, head, mon.crdtgrp_list) { cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid]; cur_bw += cmbm_data->prev_bw; - delta_bw += cmbm_data->delta_bw; } /* * Scale up/down the bandwidth linearly for the ctrl group. The * bandwidth step is the bandwidth granularity specified by the * hardware. - * - * The delta_bw is used when increasing the bandwidth so that we - * dont alternately increase and decrease the control values - * continuously. - * - * For ex: consider cur_bw = 90MBps, user_bw = 100MBps and if - * bandwidth step is 20MBps(> user_bw - cur_bw), we would keep - * switching between 90 and 110 continuously if we only check - * cur_bw < user_bw. + * Always increase throttling if current bandwidth is above the + * target set by user. + * But avoid thrashing up and down on every poll by checking + * whether a decrease in throttling is likely to push the group + * back over target. E.g. if currently throttling to 30% of bandwidth + * on a system with 10% granularity steps, check whether moving to + * 40% would go past the limit by multiplying current bandwidth by + * "(30 + 10) / 30". */ if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) { new_msr_val = cur_msr_val - r_mba->membw.bw_gran; } else if (cur_msr_val < MAX_MBA_BW && - (user_bw > (cur_bw + delta_bw))) { + (user_bw > (cur_bw * (cur_msr_val + r_mba->membw.min_bw) / cur_msr_val))) { new_msr_val = cur_msr_val + r_mba->membw.bw_gran; } else { return; } resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val); - - /* - * Delta values are updated dynamically package wise for each - * rdtgrp every time the throttle MSR changes value. - * - * This is because (1)the increase in bandwidth is not perfectly - * linear and only "approximately" linear even when the hardware - * says it is linear.(2)Also since MBA is a core specific - * mechanism, the delta values vary based on number of cores used - * by the rdtgrp. - */ - pmbm_data->delta_comp = true; - list_for_each_entry(entry, head, mon.crdtgrp_list) { - cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid]; - cmbm_data->delta_comp = true; - } } -static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid) +static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, + u32 closid, u32 rmid) { struct rmid_read rr; @@ -615,12 +738,28 @@ static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid) if (is_mbm_total_enabled()) { rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID; rr.val = 0; - __mon_event_count(rmid, &rr); + rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); + if (IS_ERR(rr.arch_mon_ctx)) { + pr_warn_ratelimited("Failed to allocate monitor context: %ld", + PTR_ERR(rr.arch_mon_ctx)); + return; + } + + __mon_event_count(closid, rmid, &rr); + + resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); } if (is_mbm_local_enabled()) { rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID; rr.val = 0; - __mon_event_count(rmid, &rr); + rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); + if (IS_ERR(rr.arch_mon_ctx)) { + pr_warn_ratelimited("Failed to allocate monitor context: %ld", + PTR_ERR(rr.arch_mon_ctx)); + return; + } + + __mon_event_count(closid, rmid, &rr); /* * Call the MBA software controller only for the @@ -628,7 +767,9 @@ static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid) * the software controller explicitly. */ if (is_mba_sc(NULL)) - mbm_bw_count(rmid, &rr); + mbm_bw_count(closid, rmid, &rr); + + resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); } } @@ -639,106 +780,193 @@ static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid) void cqm_handle_limbo(struct work_struct *work) { unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL); - int cpu = smp_processor_id(); - struct rdt_resource *r; struct rdt_domain *d; + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); - r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; d = container_of(work, struct rdt_domain, cqm_limbo.work); __check_limbo(d, false); - if (has_busy_rmid(r, d)) - schedule_delayed_work_on(cpu, &d->cqm_limbo, delay); + if (has_busy_rmid(d)) { + d->cqm_work_cpu = cpumask_any_housekeeping(&d->cpu_mask, + RESCTRL_PICK_ANY_CPU); + schedule_delayed_work_on(d->cqm_work_cpu, &d->cqm_limbo, + delay); + } mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); } -void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms) +/** + * cqm_setup_limbo_handler() - Schedule the limbo handler to run for this + * domain. + * @dom: The domain the limbo handler should run for. + * @delay_ms: How far in the future the handler should run. + * @exclude_cpu: Which CPU the handler should not run on, + * RESCTRL_PICK_ANY_CPU to pick any CPU. + */ +void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms, + int exclude_cpu) { unsigned long delay = msecs_to_jiffies(delay_ms); int cpu; - cpu = cpumask_any(&dom->cpu_mask); + cpu = cpumask_any_housekeeping(&dom->cpu_mask, exclude_cpu); dom->cqm_work_cpu = cpu; - schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay); + if (cpu < nr_cpu_ids) + schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay); } void mbm_handle_overflow(struct work_struct *work) { unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL); struct rdtgroup *prgrp, *crgrp; - int cpu = smp_processor_id(); struct list_head *head; struct rdt_resource *r; struct rdt_domain *d; + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); - if (!static_branch_likely(&rdt_mon_enable_key)) + /* + * If the filesystem has been unmounted this work no longer needs to + * run. + */ + if (!resctrl_mounted || !resctrl_arch_mon_capable()) goto out_unlock; r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; d = container_of(work, struct rdt_domain, mbm_over.work); list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { - mbm_update(r, d, prgrp->mon.rmid); + mbm_update(r, d, prgrp->closid, prgrp->mon.rmid); head = &prgrp->mon.crdtgrp_list; list_for_each_entry(crgrp, head, mon.crdtgrp_list) - mbm_update(r, d, crgrp->mon.rmid); + mbm_update(r, d, crgrp->closid, crgrp->mon.rmid); if (is_mba_sc(NULL)) update_mba_bw(prgrp, d); } - schedule_delayed_work_on(cpu, &d->mbm_over, delay); + /* + * Re-check for housekeeping CPUs. This allows the overflow handler to + * move off a nohz_full CPU quickly. + */ + d->mbm_work_cpu = cpumask_any_housekeeping(&d->cpu_mask, + RESCTRL_PICK_ANY_CPU); + schedule_delayed_work_on(d->mbm_work_cpu, &d->mbm_over, delay); out_unlock: mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); } -void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms) +/** + * mbm_setup_overflow_handler() - Schedule the overflow handler to run for this + * domain. + * @dom: The domain the overflow handler should run for. + * @delay_ms: How far in the future the handler should run. + * @exclude_cpu: Which CPU the handler should not run on, + * RESCTRL_PICK_ANY_CPU to pick any CPU. + */ +void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms, + int exclude_cpu) { unsigned long delay = msecs_to_jiffies(delay_ms); int cpu; - if (!static_branch_likely(&rdt_mon_enable_key)) + /* + * When a domain comes online there is no guarantee the filesystem is + * mounted. If not, there is no need to catch counter overflow. + */ + if (!resctrl_mounted || !resctrl_arch_mon_capable()) return; - cpu = cpumask_any(&dom->cpu_mask); + cpu = cpumask_any_housekeeping(&dom->cpu_mask, exclude_cpu); dom->mbm_work_cpu = cpu; - schedule_delayed_work_on(cpu, &dom->mbm_over, delay); + + if (cpu < nr_cpu_ids) + schedule_delayed_work_on(cpu, &dom->mbm_over, delay); } static int dom_data_init(struct rdt_resource *r) { + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); + u32 num_closid = resctrl_arch_get_num_closid(r); struct rmid_entry *entry = NULL; - int i, nr_rmids; + int err = 0, i; + u32 idx; + + mutex_lock(&rdtgroup_mutex); + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { + u32 *tmp; + + /* + * If the architecture hasn't provided a sanitised value here, + * this may result in larger arrays than necessary. Resctrl will + * use a smaller system wide value based on the resources in + * use. + */ + tmp = kcalloc(num_closid, sizeof(*tmp), GFP_KERNEL); + if (!tmp) { + err = -ENOMEM; + goto out_unlock; + } - nr_rmids = r->num_rmid; - rmid_ptrs = kcalloc(nr_rmids, sizeof(struct rmid_entry), GFP_KERNEL); - if (!rmid_ptrs) - return -ENOMEM; + closid_num_dirty_rmid = tmp; + } + + rmid_ptrs = kcalloc(idx_limit, sizeof(struct rmid_entry), GFP_KERNEL); + if (!rmid_ptrs) { + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { + kfree(closid_num_dirty_rmid); + closid_num_dirty_rmid = NULL; + } + err = -ENOMEM; + goto out_unlock; + } - for (i = 0; i < nr_rmids; i++) { + for (i = 0; i < idx_limit; i++) { entry = &rmid_ptrs[i]; INIT_LIST_HEAD(&entry->list); - entry->rmid = i; + resctrl_arch_rmid_idx_decode(i, &entry->closid, &entry->rmid); list_add_tail(&entry->list, &rmid_free_lru); } /* - * RMID 0 is special and is always allocated. It's used for all - * tasks that are not monitored. + * RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID are special and + * are always allocated. These are used for the rdtgroup_default + * control group, which will be setup later in rdtgroup_init(). */ - entry = __rmid_entry(0); + idx = resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, + RESCTRL_RESERVED_RMID); + entry = __rmid_entry(idx); list_del(&entry->list); - return 0; +out_unlock: + mutex_unlock(&rdtgroup_mutex); + + return err; +} + +static void __exit dom_data_exit(void) +{ + mutex_lock(&rdtgroup_mutex); + + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { + kfree(closid_num_dirty_rmid); + closid_num_dirty_rmid = NULL; + } + + kfree(rmid_ptrs); + rmid_ptrs = NULL; + + mutex_unlock(&rdtgroup_mutex); } static struct mon_evt llc_occupancy_event = { @@ -813,6 +1041,12 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) return ret; if (rdt_cpu_has(X86_FEATURE_BMEC)) { + u32 eax, ebx, ecx, edx; + + /* Detect list of bandwidth sources that can be tracked */ + cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx); + hw_res->mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS; + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) { mbm_total_event.configurable = true; mbm_config_rftype_init("mbm_total_bytes_config"); @@ -830,6 +1064,11 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) return 0; } +void __exit rdt_put_mon_l3_config(void) +{ + dom_data_exit(); +} + void __init intel_rdt_mbm_apply_quirk(void) { int cf_index; diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 8f559eeae08e..884b88e25141 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -581,7 +581,7 @@ static int rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp) if (ret) goto err_cpus; - if (rdt_mon_capable) { + if (resctrl_arch_mon_capable()) { ret = rdtgroup_kn_mode_restrict(rdtgrp, "mon_groups"); if (ret) goto err_cpus_list; @@ -628,7 +628,7 @@ static int rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp) if (ret) goto err_cpus; - if (rdt_mon_capable) { + if (resctrl_arch_mon_capable()) { ret = rdtgroup_kn_mode_restore(rdtgrp, "mon_groups", 0777); if (ret) goto err_cpus_list; @@ -752,7 +752,7 @@ int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp) * anymore when this group would be used for pseudo-locking. This * is safe to call on platforms not capable of monitoring. */ - free_rmid(rdtgrp->mon.rmid); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); ret = 0; goto out; @@ -776,8 +776,8 @@ int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) { int ret; - if (rdt_mon_capable) { - ret = alloc_rmid(); + if (resctrl_arch_mon_capable()) { + ret = alloc_rmid(rdtgrp->closid); if (ret < 0) { rdt_last_cmd_puts("Out of RMIDs\n"); return ret; @@ -787,7 +787,7 @@ int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) ret = rdtgroup_locksetup_user_restore(rdtgrp); if (ret) { - free_rmid(rdtgrp->mon.rmid); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); return ret; } @@ -844,6 +844,9 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d) struct rdt_domain *d_i; bool ret = false; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + if (!zalloc_cpumask_var(&cpu_with_psl, GFP_KERNEL)) return true; diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 69a1de92384a..011e17efb1a6 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -35,6 +35,10 @@ DEFINE_STATIC_KEY_FALSE(rdt_enable_key); DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key); DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key); + +/* Mutex to protect rdtgroup access. */ +DEFINE_MUTEX(rdtgroup_mutex); + static struct kernfs_root *rdt_root; struct rdtgroup rdtgroup_default; LIST_HEAD(rdt_all_groups); @@ -42,6 +46,9 @@ LIST_HEAD(rdt_all_groups); /* list of entries for the schemata file */ LIST_HEAD(resctrl_schema_all); +/* The filesystem can only be mounted once. */ +bool resctrl_mounted; + /* Kernel fs node for "info" directory under root */ static struct kernfs_node *kn_info; @@ -102,7 +109,7 @@ void rdt_staged_configs_clear(void) * * Using a global CLOSID across all resources has some advantages and * some drawbacks: - * + We can simply set "current->closid" to assign a task to a resource + * + We can simply set current's closid to assign a task to a resource * group. * + Context switch code can avoid extra memory references deciding which * CLOSID to load into the PQR_ASSOC MSR @@ -111,7 +118,7 @@ void rdt_staged_configs_clear(void) * - Our choices on how to configure each resource become progressively more * limited as the number of resources grows. */ -static int closid_free_map; +static unsigned long closid_free_map; static int closid_free_map_len; int closids_supported(void) @@ -130,26 +137,39 @@ static void closid_init(void) closid_free_map = BIT_MASK(rdt_min_closid) - 1; - /* CLOSID 0 is always reserved for the default group */ - closid_free_map &= ~1; + /* RESCTRL_RESERVED_CLOSID is always reserved for the default group */ + __clear_bit(RESCTRL_RESERVED_CLOSID, &closid_free_map); closid_free_map_len = rdt_min_closid; } static int closid_alloc(void) { - u32 closid = ffs(closid_free_map); + int cleanest_closid; + u32 closid; - if (closid == 0) - return -ENOSPC; - closid--; - closid_free_map &= ~(1 << closid); + lockdep_assert_held(&rdtgroup_mutex); + + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { + cleanest_closid = resctrl_find_cleanest_closid(); + if (cleanest_closid < 0) + return cleanest_closid; + closid = cleanest_closid; + } else { + closid = ffs(closid_free_map); + if (closid == 0) + return -ENOSPC; + closid--; + } + __clear_bit(closid, &closid_free_map); return closid; } void closid_free(int closid) { - closid_free_map |= 1 << closid; + lockdep_assert_held(&rdtgroup_mutex); + + __set_bit(closid, &closid_free_map); } /** @@ -159,9 +179,11 @@ void closid_free(int closid) * Return: true if @closid is currently associated with a resource group, * false if @closid is free */ -static bool closid_allocated(unsigned int closid) +bool closid_allocated(unsigned int closid) { - return (closid_free_map & (1 << closid)) == 0; + lockdep_assert_held(&rdtgroup_mutex); + + return !test_bit(closid, &closid_free_map); } /** @@ -559,14 +581,26 @@ static void update_task_closid_rmid(struct task_struct *t) _update_task_closid_rmid(t); } +static bool task_in_rdtgroup(struct task_struct *tsk, struct rdtgroup *rdtgrp) +{ + u32 closid, rmid = rdtgrp->mon.rmid; + + if (rdtgrp->type == RDTCTRL_GROUP) + closid = rdtgrp->closid; + else if (rdtgrp->type == RDTMON_GROUP) + closid = rdtgrp->mon.parent->closid; + else + return false; + + return resctrl_arch_match_closid(tsk, closid) && + resctrl_arch_match_rmid(tsk, closid, rmid); +} + static int __rdtgroup_move_task(struct task_struct *tsk, struct rdtgroup *rdtgrp) { /* If the task is already in rdtgrp, no need to move the task. */ - if ((rdtgrp->type == RDTCTRL_GROUP && tsk->closid == rdtgrp->closid && - tsk->rmid == rdtgrp->mon.rmid) || - (rdtgrp->type == RDTMON_GROUP && tsk->rmid == rdtgrp->mon.rmid && - tsk->closid == rdtgrp->mon.parent->closid)) + if (task_in_rdtgroup(tsk, rdtgrp)) return 0; /* @@ -577,19 +611,19 @@ static int __rdtgroup_move_task(struct task_struct *tsk, * For monitor groups, can move the tasks only from * their parent CTRL group. */ - - if (rdtgrp->type == RDTCTRL_GROUP) { - WRITE_ONCE(tsk->closid, rdtgrp->closid); - WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid); - } else if (rdtgrp->type == RDTMON_GROUP) { - if (rdtgrp->mon.parent->closid == tsk->closid) { - WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid); - } else { - rdt_last_cmd_puts("Can't move task to different control group\n"); - return -EINVAL; - } + if (rdtgrp->type == RDTMON_GROUP && + !resctrl_arch_match_closid(tsk, rdtgrp->mon.parent->closid)) { + rdt_last_cmd_puts("Can't move task to different control group\n"); + return -EINVAL; } + if (rdtgrp->type == RDTMON_GROUP) + resctrl_arch_set_closid_rmid(tsk, rdtgrp->mon.parent->closid, + rdtgrp->mon.rmid); + else + resctrl_arch_set_closid_rmid(tsk, rdtgrp->closid, + rdtgrp->mon.rmid); + /* * Ensure the task's closid and rmid are written before determining if * the task is current that will decide if it will be interrupted. @@ -611,14 +645,15 @@ static int __rdtgroup_move_task(struct task_struct *tsk, static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) { - return (rdt_alloc_capable && - (r->type == RDTCTRL_GROUP) && (t->closid == r->closid)); + return (resctrl_arch_alloc_capable() && (r->type == RDTCTRL_GROUP) && + resctrl_arch_match_closid(t, r->closid)); } static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r) { - return (rdt_mon_capable && - (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid)); + return (resctrl_arch_mon_capable() && (r->type == RDTMON_GROUP) && + resctrl_arch_match_rmid(t, r->mon.parent->closid, + r->mon.rmid)); } /** @@ -853,7 +888,7 @@ int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns, mutex_lock(&rdtgroup_mutex); /* Return empty if resctrl has not been mounted. */ - if (!static_branch_unlikely(&rdt_enable_key)) { + if (!resctrl_mounted) { seq_puts(s, "res:\nmon:\n"); goto unlock; } @@ -869,7 +904,7 @@ int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns, rdtg->mode != RDT_MODE_EXCLUSIVE) continue; - if (rdtg->closid != tsk->closid) + if (!resctrl_arch_match_closid(tsk, rdtg->closid)) continue; seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? "/" : "", @@ -877,7 +912,8 @@ int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns, seq_puts(s, "mon:"); list_for_each_entry(crg, &rdtg->mon.crdtgrp_list, mon.crdtgrp_list) { - if (tsk->rmid != crg->mon.rmid) + if (!resctrl_arch_match_rmid(tsk, crg->mon.parent->closid, + crg->mon.rmid)) continue; seq_printf(s, "%s", crg->kn->name); break; @@ -982,6 +1018,7 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, bool sep = false; u32 ctrl_val; + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); hw_shareable = r->cache.shareable_bits; list_for_each_entry(dom, &r->domains, list) { @@ -1042,6 +1079,7 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, } seq_putc(seq, '\n'); mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); return 0; } @@ -1297,6 +1335,9 @@ static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) struct rdt_domain *d; u32 ctrl; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + list_for_each_entry(s, &resctrl_schema_all, list) { r = s->res; if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA) @@ -1561,6 +1602,7 @@ static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid struct rdt_domain *dom; bool sep = false; + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); list_for_each_entry(dom, &r->domains, list) { @@ -1577,6 +1619,7 @@ static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid seq_puts(s, "\n"); mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); return 0; } @@ -1614,17 +1657,10 @@ static void mon_event_config_write(void *info) wrmsr(MSR_IA32_EVT_CFG_BASE + index, mon_info->mon_config, 0); } -static int mbm_config_write_domain(struct rdt_resource *r, - struct rdt_domain *d, u32 evtid, u32 val) +static void mbm_config_write_domain(struct rdt_resource *r, + struct rdt_domain *d, u32 evtid, u32 val) { struct mon_config_info mon_info = {0}; - int ret = 0; - - /* mon_config cannot be more than the supported set of events */ - if (val > MAX_EVT_CONFIG_BITS) { - rdt_last_cmd_puts("Invalid event configuration\n"); - return -EINVAL; - } /* * Read the current config value first. If both are the same then @@ -1633,7 +1669,7 @@ static int mbm_config_write_domain(struct rdt_resource *r, mon_info.evtid = evtid; mondata_config_read(d, &mon_info); if (mon_info.mon_config == val) - goto out; + return; mon_info.mon_config = val; @@ -1656,17 +1692,17 @@ static int mbm_config_write_domain(struct rdt_resource *r, * mbm_local and mbm_total counts for all the RMIDs. */ resctrl_arch_reset_rmid_all(r, d); - -out: - return ret; } static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid) { + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); char *dom_str = NULL, *id_str; unsigned long dom_id, val; struct rdt_domain *d; - int ret = 0; + + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); next: if (!tok || tok[0] == '\0') @@ -1686,11 +1722,16 @@ next: return -EINVAL; } + /* Value from user cannot be more than the supported set of events */ + if ((val & hw_res->mbm_cfg_mask) != val) { + rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n", + hw_res->mbm_cfg_mask); + return -EINVAL; + } + list_for_each_entry(d, &r->domains, list) { if (d->id == dom_id) { - ret = mbm_config_write_domain(r, d, evtid, val); - if (ret) - return -EINVAL; + mbm_config_write_domain(r, d, evtid, val); goto next; } } @@ -1709,6 +1750,7 @@ static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of, if (nbytes == 0 || buf[nbytes - 1] != '\n') return -EINVAL; + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); rdt_last_cmd_clear(); @@ -1718,6 +1760,7 @@ static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of, ret = mon_config_write(r, buf, QOS_L3_MBM_TOTAL_EVENT_ID); mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); return ret ?: nbytes; } @@ -1733,6 +1776,7 @@ static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of, if (nbytes == 0 || buf[nbytes - 1] != '\n') return -EINVAL; + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); rdt_last_cmd_clear(); @@ -1742,6 +1786,7 @@ static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of, ret = mon_config_write(r, buf, QOS_L3_MBM_LOCAL_EVENT_ID); mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); return ret ?: nbytes; } @@ -2218,6 +2263,9 @@ static int set_cache_qos_cfg(int level, bool enable) struct rdt_domain *d; int cpu; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + if (level == RDT_RESOURCE_L3) update = l3_qos_cfg_update; else if (level == RDT_RESOURCE_L2) @@ -2417,6 +2465,7 @@ struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn) rdtgroup_kn_get(rdtgrp, kn); + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); /* Was this group deleted while we waited? */ @@ -2434,6 +2483,8 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn) return; mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + rdtgroup_kn_put(rdtgrp, kn); } @@ -2584,7 +2635,7 @@ static int rdt_get_tree(struct fs_context *fc) /* * resctrl file system can only be mounted once. */ - if (static_branch_unlikely(&rdt_enable_key)) { + if (resctrl_mounted) { ret = -EBUSY; goto out; } @@ -2605,7 +2656,7 @@ static int rdt_get_tree(struct fs_context *fc) closid_init(); - if (rdt_mon_capable) + if (resctrl_arch_mon_capable()) flags |= RFTYPE_MON; ret = rdtgroup_add_files(rdtgroup_default.kn, flags); @@ -2618,7 +2669,7 @@ static int rdt_get_tree(struct fs_context *fc) if (ret < 0) goto out_schemata_free; - if (rdt_mon_capable) { + if (resctrl_arch_mon_capable()) { ret = mongroup_create_dir(rdtgroup_default.kn, &rdtgroup_default, "mon_groups", &kn_mongrp); @@ -2640,18 +2691,19 @@ static int rdt_get_tree(struct fs_context *fc) if (ret < 0) goto out_psl; - if (rdt_alloc_capable) - static_branch_enable_cpuslocked(&rdt_alloc_enable_key); - if (rdt_mon_capable) - static_branch_enable_cpuslocked(&rdt_mon_enable_key); + if (resctrl_arch_alloc_capable()) + resctrl_arch_enable_alloc(); + if (resctrl_arch_mon_capable()) + resctrl_arch_enable_mon(); - if (rdt_alloc_capable || rdt_mon_capable) - static_branch_enable_cpuslocked(&rdt_enable_key); + if (resctrl_arch_alloc_capable() || resctrl_arch_mon_capable()) + resctrl_mounted = true; if (is_mbm_enabled()) { r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; list_for_each_entry(dom, &r->domains, list) - mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL); + mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL, + RESCTRL_PICK_ANY_CPU); } goto out; @@ -2659,10 +2711,10 @@ static int rdt_get_tree(struct fs_context *fc) out_psl: rdt_pseudo_lock_release(); out_mondata: - if (rdt_mon_capable) + if (resctrl_arch_mon_capable()) kernfs_remove(kn_mondata); out_mongrp: - if (rdt_mon_capable) + if (resctrl_arch_mon_capable()) kernfs_remove(kn_mongrp); out_info: kernfs_remove(kn_info); @@ -2765,6 +2817,9 @@ static int reset_all_ctrls(struct rdt_resource *r) struct rdt_domain *d; int i; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) return -ENOMEM; @@ -2810,8 +2865,8 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, for_each_process_thread(p, t) { if (!from || is_closid_match(t, from) || is_rmid_match(t, from)) { - WRITE_ONCE(t->closid, to->closid); - WRITE_ONCE(t->rmid, to->mon.rmid); + resctrl_arch_set_closid_rmid(t, to->closid, + to->mon.rmid); /* * Order the closid/rmid stores above before the loads @@ -2842,7 +2897,7 @@ static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp) head = &rdtgrp->mon.crdtgrp_list; list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) { - free_rmid(sentry->mon.rmid); + free_rmid(sentry->closid, sentry->mon.rmid); list_del(&sentry->mon.crdtgrp_list); if (atomic_read(&sentry->waitcount) != 0) @@ -2882,7 +2937,7 @@ static void rmdir_all_sub(void) cpumask_or(&rdtgroup_default.cpu_mask, &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); - free_rmid(rdtgrp->mon.rmid); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); kernfs_remove(rdtgrp->kn); list_del(&rdtgrp->rdtgroup_list); @@ -2917,9 +2972,11 @@ static void rdt_kill_sb(struct super_block *sb) rdtgroup_default.mode = RDT_MODE_SHAREABLE; schemata_list_destroy(); rdtgroup_destroy_root(); - static_branch_disable_cpuslocked(&rdt_alloc_enable_key); - static_branch_disable_cpuslocked(&rdt_mon_enable_key); - static_branch_disable_cpuslocked(&rdt_enable_key); + if (resctrl_arch_alloc_capable()) + resctrl_arch_disable_alloc(); + if (resctrl_arch_mon_capable()) + resctrl_arch_disable_mon(); + resctrl_mounted = false; kernfs_kill_sb(sb); mutex_unlock(&rdtgroup_mutex); cpus_read_unlock(); @@ -3047,6 +3104,9 @@ static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn, struct rdt_domain *dom; int ret; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + list_for_each_entry(dom, &r->domains, list) { ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp); if (ret) @@ -3293,6 +3353,36 @@ out: return ret; } +static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp) +{ + int ret; + + if (!resctrl_arch_mon_capable()) + return 0; + + ret = alloc_rmid(rdtgrp->closid); + if (ret < 0) { + rdt_last_cmd_puts("Out of RMIDs\n"); + return ret; + } + rdtgrp->mon.rmid = ret; + + ret = mkdir_mondata_all(rdtgrp->kn, rdtgrp, &rdtgrp->mon.mon_data_kn); + if (ret) { + rdt_last_cmd_puts("kernfs subdir error\n"); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); + return ret; + } + + return 0; +} + +static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp) +{ + if (resctrl_arch_mon_capable()) + free_rmid(rgrp->closid, rgrp->mon.rmid); +} + static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, const char *name, umode_t mode, enum rdt_group_type rtype, struct rdtgroup **r) @@ -3353,7 +3443,7 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, if (rtype == RDTCTRL_GROUP) { files = RFTYPE_BASE | RFTYPE_CTRL; - if (rdt_mon_capable) + if (resctrl_arch_mon_capable()) files |= RFTYPE_MON; } else { files = RFTYPE_BASE | RFTYPE_MON; @@ -3365,29 +3455,11 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, goto out_destroy; } - if (rdt_mon_capable) { - ret = alloc_rmid(); - if (ret < 0) { - rdt_last_cmd_puts("Out of RMIDs\n"); - goto out_destroy; - } - rdtgrp->mon.rmid = ret; - - ret = mkdir_mondata_all(kn, rdtgrp, &rdtgrp->mon.mon_data_kn); - if (ret) { - rdt_last_cmd_puts("kernfs subdir error\n"); - goto out_idfree; - } - } - kernfs_activate(kn); - /* * The caller unlocks the parent_kn upon success. */ return 0; -out_idfree: - free_rmid(rdtgrp->mon.rmid); out_destroy: kernfs_put(rdtgrp->kn); kernfs_remove(rdtgrp->kn); @@ -3401,7 +3473,6 @@ out_unlock: static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp) { kernfs_remove(rgrp->kn); - free_rmid(rgrp->mon.rmid); rdtgroup_remove(rgrp); } @@ -3423,12 +3494,21 @@ static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn, prgrp = rdtgrp->mon.parent; rdtgrp->closid = prgrp->closid; + ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp); + if (ret) { + mkdir_rdt_prepare_clean(rdtgrp); + goto out_unlock; + } + + kernfs_activate(rdtgrp->kn); + /* * Add the rdtgrp to the list of rdtgrps the parent * ctrl_mon group has to track. */ list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list); +out_unlock: rdtgroup_kn_unlock(parent_kn); return ret; } @@ -3459,13 +3539,20 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, ret = 0; rdtgrp->closid = closid; + + ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp); + if (ret) + goto out_closid_free; + + kernfs_activate(rdtgrp->kn); + ret = rdtgroup_init_alloc(rdtgrp); if (ret < 0) - goto out_id_free; + goto out_rmid_free; list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); - if (rdt_mon_capable) { + if (resctrl_arch_mon_capable()) { /* * Create an empty mon_groups directory to hold the subset * of tasks and cpus to monitor. @@ -3481,7 +3568,9 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, out_del_list: list_del(&rdtgrp->rdtgroup_list); -out_id_free: +out_rmid_free: + mkdir_rdt_prepare_rmid_free(rdtgrp); +out_closid_free: closid_free(closid); out_common_fail: mkdir_rdt_prepare_clean(rdtgrp); @@ -3518,14 +3607,14 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, * allocation is supported, add a control and monitoring * subdirectory */ - if (rdt_alloc_capable && parent_kn == rdtgroup_default.kn) + if (resctrl_arch_alloc_capable() && parent_kn == rdtgroup_default.kn) return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode); /* * If RDT monitoring is supported and the parent directory is a valid * "mon_groups" directory, add a monitoring subdirectory. */ - if (rdt_mon_capable && is_mon_groups(parent_kn, name)) + if (resctrl_arch_mon_capable() && is_mon_groups(parent_kn, name)) return rdtgroup_mkdir_mon(parent_kn, name, mode); return -EPERM; @@ -3550,7 +3639,7 @@ static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) update_closid_rmid(tmpmask, NULL); rdtgrp->flags = RDT_DELETED; - free_rmid(rdtgrp->mon.rmid); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); /* * Remove the rdtgrp from the parent ctrl_mon group's list @@ -3596,8 +3685,8 @@ static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); update_closid_rmid(tmpmask, NULL); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); closid_free(rdtgrp->closid); - free_rmid(rdtgrp->mon.rmid); rdtgroup_ctrl_remove(rdtgrp); @@ -3829,8 +3918,8 @@ static void __init rdtgroup_setup_default(void) { mutex_lock(&rdtgroup_mutex); - rdtgroup_default.closid = 0; - rdtgroup_default.mon.rmid = 0; + rdtgroup_default.closid = RESCTRL_RESERVED_CLOSID; + rdtgroup_default.mon.rmid = RESCTRL_RESERVED_RMID; rdtgroup_default.type = RDTCTRL_GROUP; INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list); @@ -3848,24 +3937,24 @@ static void domain_destroy_mon_state(struct rdt_domain *d) void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d) { - lockdep_assert_held(&rdtgroup_mutex); + mutex_lock(&rdtgroup_mutex); if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) mba_sc_domain_destroy(r, d); if (!r->mon_capable) - return; + goto out_unlock; /* * If resctrl is mounted, remove all the * per domain monitor data directories. */ - if (static_branch_unlikely(&rdt_mon_enable_key)) + if (resctrl_mounted && resctrl_arch_mon_capable()) rmdir_mondata_subdir_allrdtgrp(r, d->id); if (is_mbm_enabled()) cancel_delayed_work(&d->mbm_over); - if (is_llc_occupancy_enabled() && has_busy_rmid(r, d)) { + if (is_llc_occupancy_enabled() && has_busy_rmid(d)) { /* * When a package is going down, forcefully * decrement rmid->ebusy. There is no way to know @@ -3879,20 +3968,24 @@ void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d) } domain_destroy_mon_state(d); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); } static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) { + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); size_t tsize; if (is_llc_occupancy_enabled()) { - d->rmid_busy_llc = bitmap_zalloc(r->num_rmid, GFP_KERNEL); + d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL); if (!d->rmid_busy_llc) return -ENOMEM; } if (is_mbm_total_enabled()) { tsize = sizeof(*d->mbm_total); - d->mbm_total = kcalloc(r->num_rmid, tsize, GFP_KERNEL); + d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL); if (!d->mbm_total) { bitmap_free(d->rmid_busy_llc); return -ENOMEM; @@ -3900,7 +3993,7 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) } if (is_mbm_local_enabled()) { tsize = sizeof(*d->mbm_local); - d->mbm_local = kcalloc(r->num_rmid, tsize, GFP_KERNEL); + d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL); if (!d->mbm_local) { bitmap_free(d->rmid_busy_llc); kfree(d->mbm_total); @@ -3913,34 +4006,97 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d) { - int err; + int err = 0; - lockdep_assert_held(&rdtgroup_mutex); + mutex_lock(&rdtgroup_mutex); - if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) + if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) { /* RDT_RESOURCE_MBA is never mon_capable */ - return mba_sc_domain_allocate(r, d); + err = mba_sc_domain_allocate(r, d); + goto out_unlock; + } if (!r->mon_capable) - return 0; + goto out_unlock; err = domain_setup_mon_state(r, d); if (err) - return err; + goto out_unlock; if (is_mbm_enabled()) { INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow); - mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL); + mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL, + RESCTRL_PICK_ANY_CPU); } if (is_llc_occupancy_enabled()) INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); - /* If resctrl is mounted, add per domain monitor data directories. */ - if (static_branch_unlikely(&rdt_mon_enable_key)) + /* + * If the filesystem is not mounted then only the default resource group + * exists. Creation of its directories is deferred until mount time + * by rdt_get_tree() calling mkdir_mondata_all(). + * If resctrl is mounted, add per domain monitor data directories. + */ + if (resctrl_mounted && resctrl_arch_mon_capable()) mkdir_mondata_subdir_allrdtgrp(r, d); - return 0; +out_unlock: + mutex_unlock(&rdtgroup_mutex); + + return err; +} + +void resctrl_online_cpu(unsigned int cpu) +{ + mutex_lock(&rdtgroup_mutex); + /* The CPU is set in default rdtgroup after online. */ + cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); + mutex_unlock(&rdtgroup_mutex); +} + +static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) +{ + struct rdtgroup *cr; + + list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) { + if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) + break; + } +} + +void resctrl_offline_cpu(unsigned int cpu) +{ + struct rdt_resource *l3 = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + struct rdtgroup *rdtgrp; + struct rdt_domain *d; + + mutex_lock(&rdtgroup_mutex); + list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { + if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) { + clear_childcpus(rdtgrp, cpu); + break; + } + } + + if (!l3->mon_capable) + goto out_unlock; + + d = get_domain_from_cpu(cpu, l3); + if (d) { + if (is_mbm_enabled() && cpu == d->mbm_work_cpu) { + cancel_delayed_work(&d->mbm_over); + mbm_setup_overflow_handler(d, 0, cpu); + } + if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu && + has_busy_rmid(d)) { + cancel_delayed_work(&d->cqm_limbo); + cqm_setup_limbo_handler(d, 0, cpu); + } + } + +out_unlock: + mutex_unlock(&rdtgroup_mutex); } /* diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index dc136703566f..3259b1d4fefe 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -1,167 +1,510 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: GPL-2.0-only /* - * Check for extended topology enumeration cpuid leaf 0xb and if it - * exists, use it for populating initial_apicid and cpu topology - * detection. + * CPU/APIC topology + * + * The APIC IDs describe the system topology in multiple domain levels. + * The CPUID topology parser provides the information which part of the + * APIC ID is associated to the individual levels: + * + * [PACKAGE][DIEGRP][DIE][TILE][MODULE][CORE][THREAD] + * + * The root space contains the package (socket) IDs. + * + * Not enumerated levels consume 0 bits space, but conceptually they are + * always represented. If e.g. only CORE and THREAD levels are enumerated + * then the DIE, MODULE and TILE have the same physical ID as the PACKAGE. + * + * If SMT is not supported, then the THREAD domain is still used. It then + * has the same physical ID as the CORE domain and is the only child of + * the core domain. + * + * This allows a unified view on the system independent of the enumerated + * domain levels without requiring any conditionals in the code. */ - +#define pr_fmt(fmt) "CPU topo: " fmt #include <linux/cpu.h> + +#include <xen/xen.h> + #include <asm/apic.h> -#include <asm/memtype.h> -#include <asm/processor.h> +#include <asm/hypervisor.h> +#include <asm/io_apic.h> +#include <asm/mpspec.h> +#include <asm/smp.h> #include "cpu.h" -/* leaf 0xb SMT level */ -#define SMT_LEVEL 0 +/* + * Map cpu index to physical APIC ID + */ +DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_apicid, BAD_APICID); +DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid, CPU_ACPIID_INVALID); +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_acpiid); -/* extended topology sub-leaf types */ -#define INVALID_TYPE 0 -#define SMT_TYPE 1 -#define CORE_TYPE 2 -#define DIE_TYPE 5 +/* Bitmap of physically present CPUs. */ +DECLARE_BITMAP(phys_cpu_present_map, MAX_LOCAL_APIC) __read_mostly; -#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff) -#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f) -#define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff) +/* Used for CPU number allocation and parallel CPU bringup */ +u32 cpuid_to_apicid[] __ro_after_init = { [0 ... NR_CPUS - 1] = BAD_APICID, }; + +/* Bitmaps to mark registered APICs at each topology domain */ +static struct { DECLARE_BITMAP(map, MAX_LOCAL_APIC); } apic_maps[TOPO_MAX_DOMAIN] __ro_after_init; + +/* + * Keep track of assigned, disabled and rejected CPUs. Present assigned + * with 1 as CPU #0 is reserved for the boot CPU. + */ +static struct { + unsigned int nr_assigned_cpus; + unsigned int nr_disabled_cpus; + unsigned int nr_rejected_cpus; + u32 boot_cpu_apic_id; + u32 real_bsp_apic_id; +} topo_info __ro_after_init = { + .nr_assigned_cpus = 1, + .boot_cpu_apic_id = BAD_APICID, + .real_bsp_apic_id = BAD_APICID, +}; -unsigned int __max_die_per_package __read_mostly = 1; -EXPORT_SYMBOL(__max_die_per_package); +#define domain_weight(_dom) bitmap_weight(apic_maps[_dom].map, MAX_LOCAL_APIC) + +bool arch_match_cpu_phys_id(int cpu, u64 phys_id) +{ + return phys_id == (u64)cpuid_to_apicid[cpu]; +} #ifdef CONFIG_SMP +static void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid) +{ + if (!(apicid & (__max_threads_per_core - 1))) + cpumask_set_cpu(cpu, &__cpu_primary_thread_mask); +} +#else +static inline void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid) { } +#endif + /* - * Check if given CPUID extended topology "leaf" is implemented + * Convert the APIC ID to a domain level ID by masking out the low bits + * below the domain level @dom. */ -static int check_extended_topology_leaf(int leaf) +static inline u32 topo_apicid(u32 apicid, enum x86_topology_domains dom) +{ + if (dom == TOPO_SMT_DOMAIN) + return apicid; + return apicid & (UINT_MAX << x86_topo_system.dom_shifts[dom - 1]); +} + +static int topo_lookup_cpuid(u32 apic_id) { - unsigned int eax, ebx, ecx, edx; + int i; - cpuid_count(leaf, SMT_LEVEL, &eax, &ebx, &ecx, &edx); + /* CPU# to APICID mapping is persistent once it is established */ + for (i = 0; i < topo_info.nr_assigned_cpus; i++) { + if (cpuid_to_apicid[i] == apic_id) + return i; + } + return -ENODEV; +} - if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE)) - return -1; +static __init int topo_get_cpunr(u32 apic_id) +{ + int cpu = topo_lookup_cpuid(apic_id); - return 0; + if (cpu >= 0) + return cpu; + + return topo_info.nr_assigned_cpus++; } -/* - * Return best CPUID Extended Topology Leaf supported + +static void topo_set_cpuids(unsigned int cpu, u32 apic_id, u32 acpi_id) +{ +#if defined(CONFIG_SMP) || defined(CONFIG_X86_64) + early_per_cpu(x86_cpu_to_apicid, cpu) = apic_id; + early_per_cpu(x86_cpu_to_acpiid, cpu) = acpi_id; +#endif + set_cpu_possible(cpu, true); + set_cpu_present(cpu, true); +} + +static __init bool check_for_real_bsp(u32 apic_id) +{ + /* + * There is no real good way to detect whether this a kdump() + * kernel, but except on the Voyager SMP monstrosity which is not + * longer supported, the real BSP APIC ID is the first one which is + * enumerated by firmware. That allows to detect whether the boot + * CPU is the real BSP. If it is not, then do not register the APIC + * because sending INIT to the real BSP would reset the whole + * system. + * + * The first APIC ID which is enumerated by firmware is detectable + * because the boot CPU APIC ID is registered before that without + * invoking this code. + */ + if (topo_info.real_bsp_apic_id != BAD_APICID) + return false; + + if (apic_id == topo_info.boot_cpu_apic_id) { + topo_info.real_bsp_apic_id = apic_id; + return false; + } + + pr_warn("Boot CPU APIC ID not the first enumerated APIC ID: %x > %x\n", + topo_info.boot_cpu_apic_id, apic_id); + pr_warn("Crash kernel detected. Disabling real BSP to prevent machine INIT\n"); + + topo_info.real_bsp_apic_id = apic_id; + return true; +} + +static unsigned int topo_unit_count(u32 lvlid, enum x86_topology_domains at_level, + unsigned long *map) +{ + unsigned int id, end, cnt = 0; + + /* Calculate the exclusive end */ + end = lvlid + (1U << x86_topo_system.dom_shifts[at_level]); + + /* Unfortunately there is no bitmap_weight_range() */ + for (id = find_next_bit(map, end, lvlid); id < end; id = find_next_bit(map, end, ++id)) + cnt++; + return cnt; +} + +static __init void topo_register_apic(u32 apic_id, u32 acpi_id, bool present) +{ + int cpu, dom; + + if (present) { + set_bit(apic_id, phys_cpu_present_map); + + /* + * Double registration is valid in case of the boot CPU + * APIC because that is registered before the enumeration + * of the APICs via firmware parsers or VM guest + * mechanisms. + */ + if (apic_id == topo_info.boot_cpu_apic_id) + cpu = 0; + else + cpu = topo_get_cpunr(apic_id); + + cpuid_to_apicid[cpu] = apic_id; + topo_set_cpuids(cpu, apic_id, acpi_id); + } else { + u32 pkgid = topo_apicid(apic_id, TOPO_PKG_DOMAIN); + + /* + * Check for present APICs in the same package when running + * on bare metal. Allow the bogosity in a guest. + */ + if (hypervisor_is_type(X86_HYPER_NATIVE) && + topo_unit_count(pkgid, TOPO_PKG_DOMAIN, phys_cpu_present_map)) { + pr_info_once("Ignoring hot-pluggable APIC ID %x in present package.\n", + apic_id); + topo_info.nr_rejected_cpus++; + return; + } + + topo_info.nr_disabled_cpus++; + } + + /* Register present and possible CPUs in the domain maps */ + for (dom = TOPO_SMT_DOMAIN; dom < TOPO_MAX_DOMAIN; dom++) + set_bit(topo_apicid(apic_id, dom), apic_maps[dom].map); +} + +/** + * topology_register_apic - Register an APIC in early topology maps + * @apic_id: The APIC ID to set up + * @acpi_id: The ACPI ID associated to the APIC + * @present: True if the corresponding CPU is present */ -static int detect_extended_topology_leaf(struct cpuinfo_x86 *c) +void __init topology_register_apic(u32 apic_id, u32 acpi_id, bool present) { - if (c->cpuid_level >= 0x1f) { - if (check_extended_topology_leaf(0x1f) == 0) - return 0x1f; + if (apic_id >= MAX_LOCAL_APIC) { + pr_err_once("APIC ID %x exceeds kernel limit of: %x\n", apic_id, MAX_LOCAL_APIC - 1); + topo_info.nr_rejected_cpus++; + return; + } + + if (check_for_real_bsp(apic_id)) { + topo_info.nr_rejected_cpus++; + return; } - if (c->cpuid_level >= 0xb) { - if (check_extended_topology_leaf(0xb) == 0) - return 0xb; + /* CPU numbers exhausted? */ + if (apic_id != topo_info.boot_cpu_apic_id && topo_info.nr_assigned_cpus >= nr_cpu_ids) { + pr_warn_once("CPU limit of %d reached. Ignoring further CPUs\n", nr_cpu_ids); + topo_info.nr_rejected_cpus++; + return; } - return -1; + topo_register_apic(apic_id, acpi_id, present); +} + +/** + * topology_register_boot_apic - Register the boot CPU APIC + * @apic_id: The APIC ID to set up + * + * Separate so CPU #0 can be assigned + */ +void __init topology_register_boot_apic(u32 apic_id) +{ + WARN_ON_ONCE(topo_info.boot_cpu_apic_id != BAD_APICID); + + topo_info.boot_cpu_apic_id = apic_id; + topo_register_apic(apic_id, CPU_ACPIID_INVALID, true); +} + +/** + * topology_get_logical_id - Retrieve the logical ID at a given topology domain level + * @apicid: The APIC ID for which to lookup the logical ID + * @at_level: The topology domain level to use + * + * @apicid must be a full APIC ID, not the normalized variant. It's valid to have + * all bits below the domain level specified by @at_level to be clear. So both + * real APIC IDs and backshifted normalized APIC IDs work correctly. + * + * Returns: + * - >= 0: The requested logical ID + * - -ERANGE: @apicid is out of range + * - -ENODEV: @apicid is not registered + */ +int topology_get_logical_id(u32 apicid, enum x86_topology_domains at_level) +{ + /* Remove the bits below @at_level to get the proper level ID of @apicid */ + unsigned int lvlid = topo_apicid(apicid, at_level); + + if (lvlid >= MAX_LOCAL_APIC) + return -ERANGE; + if (!test_bit(lvlid, apic_maps[at_level].map)) + return -ENODEV; + /* Get the number of set bits before @lvlid. */ + return bitmap_weight(apic_maps[at_level].map, lvlid); +} +EXPORT_SYMBOL_GPL(topology_get_logical_id); + +/** + * topology_unit_count - Retrieve the count of specified units at a given topology domain level + * @apicid: The APIC ID which specifies the search range + * @which_units: The domain level specifying the units to count + * @at_level: The domain level at which @which_units have to be counted + * + * This returns the number of possible units according to the enumerated + * information. + * + * E.g. topology_count_units(apicid, TOPO_CORE_DOMAIN, TOPO_PKG_DOMAIN) + * counts the number of possible cores in the package to which @apicid + * belongs. + * + * @at_level must obviously be greater than @which_level to produce useful + * results. If @at_level is equal to @which_units the result is + * unsurprisingly 1. If @at_level is less than @which_units the results + * is by definition undefined and the function returns 0. + */ +unsigned int topology_unit_count(u32 apicid, enum x86_topology_domains which_units, + enum x86_topology_domains at_level) +{ + /* Remove the bits below @at_level to get the proper level ID of @apicid */ + unsigned int lvlid = topo_apicid(apicid, at_level); + + if (lvlid >= MAX_LOCAL_APIC) + return 0; + if (!test_bit(lvlid, apic_maps[at_level].map)) + return 0; + if (which_units > at_level) + return 0; + if (which_units == at_level) + return 1; + return topo_unit_count(lvlid, at_level, apic_maps[which_units].map); +} + +#ifdef CONFIG_ACPI_HOTPLUG_CPU +/** + * topology_hotplug_apic - Handle a physical hotplugged APIC after boot + * @apic_id: The APIC ID to set up + * @acpi_id: The ACPI ID associated to the APIC + */ +int topology_hotplug_apic(u32 apic_id, u32 acpi_id) +{ + int cpu; + + if (apic_id >= MAX_LOCAL_APIC) + return -EINVAL; + + /* Reject if the APIC ID was not registered during enumeration. */ + if (!test_bit(apic_id, apic_maps[TOPO_SMT_DOMAIN].map)) + return -ENODEV; + + cpu = topo_lookup_cpuid(apic_id); + if (cpu < 0) + return -ENOSPC; + + set_bit(apic_id, phys_cpu_present_map); + topo_set_cpuids(cpu, apic_id, acpi_id); + cpu_mark_primary_thread(cpu, apic_id); + return cpu; +} + +/** + * topology_hotunplug_apic - Remove a physical hotplugged APIC after boot + * @cpu: The CPU number for which the APIC ID is removed + */ +void topology_hotunplug_apic(unsigned int cpu) +{ + u32 apic_id = cpuid_to_apicid[cpu]; + + if (apic_id == BAD_APICID) + return; + + per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID; + clear_bit(apic_id, phys_cpu_present_map); + set_cpu_present(cpu, false); } #endif -int detect_extended_topology_early(struct cpuinfo_x86 *c) +#ifdef CONFIG_X86_LOCAL_APIC +static unsigned int max_possible_cpus __initdata = NR_CPUS; + +/** + * topology_apply_cmdline_limits_early - Apply topology command line limits early + * + * Ensure that command line limits are in effect before firmware parsing + * takes place. + */ +void __init topology_apply_cmdline_limits_early(void) { -#ifdef CONFIG_SMP - unsigned int eax, ebx, ecx, edx; - int leaf; + unsigned int possible = nr_cpu_ids; - leaf = detect_extended_topology_leaf(c); - if (leaf < 0) - return -1; + /* 'maxcpus=0' 'nosmp' 'nolapic' 'disableapic' 'noapic' */ + if (!setup_max_cpus || ioapic_is_disabled || apic_is_disabled) + possible = 1; - set_cpu_cap(c, X86_FEATURE_XTOPOLOGY); + /* 'possible_cpus=N' */ + possible = min_t(unsigned int, max_possible_cpus, possible); + + if (possible < nr_cpu_ids) { + pr_info("Limiting to %u possible CPUs\n", possible); + set_nr_cpu_ids(possible); + } +} - cpuid_count(leaf, SMT_LEVEL, &eax, &ebx, &ecx, &edx); +static __init bool restrict_to_up(void) +{ + if (!smp_found_config || ioapic_is_disabled) + return true; /* - * initial apic id, which also represents 32-bit extended x2apic id. + * XEN PV is special as it does not advertise the local APIC + * properly, but provides a fake topology for it so that the + * infrastructure works. So don't apply the restrictions vs. APIC + * here. */ - c->topo.initial_apicid = edx; - smp_num_siblings = max_t(int, smp_num_siblings, LEVEL_MAX_SIBLINGS(ebx)); -#endif - return 0; + if (xen_pv_domain()) + return false; + + return apic_is_disabled; } -/* - * Check for extended topology enumeration cpuid leaf, and if it - * exists, use it for populating initial_apicid and cpu topology - * detection. - */ -int detect_extended_topology(struct cpuinfo_x86 *c) +void __init topology_init_possible_cpus(void) { -#ifdef CONFIG_SMP - unsigned int eax, ebx, ecx, edx, sub_index; - unsigned int ht_mask_width, core_plus_mask_width, die_plus_mask_width; - unsigned int core_select_mask, core_level_siblings; - unsigned int die_select_mask, die_level_siblings; - unsigned int pkg_mask_width; - bool die_level_present = false; - int leaf; - - leaf = detect_extended_topology_leaf(c); - if (leaf < 0) - return -1; + unsigned int assigned = topo_info.nr_assigned_cpus; + unsigned int disabled = topo_info.nr_disabled_cpus; + unsigned int cnta, cntb, cpu, allowed = 1; + unsigned int total = assigned + disabled; + u32 apicid, firstid; + + if (!restrict_to_up()) { + if (WARN_ON_ONCE(assigned > nr_cpu_ids)) { + disabled += assigned - nr_cpu_ids; + assigned = nr_cpu_ids; + } + allowed = min_t(unsigned int, total, nr_cpu_ids); + } + + if (total > allowed) + pr_warn("%u possible CPUs exceed the limit of %u\n", total, allowed); + + assigned = min_t(unsigned int, allowed, assigned); + disabled = allowed - assigned; + topo_info.nr_assigned_cpus = assigned; + topo_info.nr_disabled_cpus = disabled; + + total_cpus = allowed; + set_nr_cpu_ids(allowed); + + cnta = domain_weight(TOPO_PKG_DOMAIN); + cntb = domain_weight(TOPO_DIE_DOMAIN); + __max_logical_packages = cnta; + __max_dies_per_package = 1U << (get_count_order(cntb) - get_count_order(cnta)); + + pr_info("Max. logical packages: %3u\n", cnta); + pr_info("Max. logical dies: %3u\n", cntb); + pr_info("Max. dies per package: %3u\n", __max_dies_per_package); + + cnta = domain_weight(TOPO_CORE_DOMAIN); + cntb = domain_weight(TOPO_SMT_DOMAIN); /* - * Populate HT related information from sub-leaf level 0. + * Can't use order delta here as order(cnta) can be equal + * order(cntb) even if cnta != cntb. */ - cpuid_count(leaf, SMT_LEVEL, &eax, &ebx, &ecx, &edx); - c->topo.initial_apicid = edx; - core_level_siblings = LEVEL_MAX_SIBLINGS(ebx); - smp_num_siblings = max_t(int, smp_num_siblings, LEVEL_MAX_SIBLINGS(ebx)); - core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); - die_level_siblings = LEVEL_MAX_SIBLINGS(ebx); - pkg_mask_width = die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); - - sub_index = 1; - while (true) { - cpuid_count(leaf, sub_index, &eax, &ebx, &ecx, &edx); + __max_threads_per_core = DIV_ROUND_UP(cntb, cnta); + pr_info("Max. threads per core: %3u\n", __max_threads_per_core); - /* - * Check for the Core type in the implemented sub leaves. - */ - if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) { - core_level_siblings = LEVEL_MAX_SIBLINGS(ebx); - core_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); - die_level_siblings = core_level_siblings; - die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); - } - if (LEAFB_SUBTYPE(ecx) == DIE_TYPE) { - die_level_present = true; - die_level_siblings = LEVEL_MAX_SIBLINGS(ebx); - die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); - } + firstid = find_first_bit(apic_maps[TOPO_SMT_DOMAIN].map, MAX_LOCAL_APIC); + __num_cores_per_package = topology_unit_count(firstid, TOPO_CORE_DOMAIN, TOPO_PKG_DOMAIN); + pr_info("Num. cores per package: %3u\n", __num_cores_per_package); + __num_threads_per_package = topology_unit_count(firstid, TOPO_SMT_DOMAIN, TOPO_PKG_DOMAIN); + pr_info("Num. threads per package: %3u\n", __num_threads_per_package); - if (LEAFB_SUBTYPE(ecx) != INVALID_TYPE) - pkg_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); - else - break; + pr_info("Allowing %u present CPUs plus %u hotplug CPUs\n", assigned, disabled); + if (topo_info.nr_rejected_cpus) + pr_info("Rejected CPUs %u\n", topo_info.nr_rejected_cpus); - sub_index++; + init_cpu_present(cpumask_of(0)); + init_cpu_possible(cpumask_of(0)); + + /* Assign CPU numbers to non-present CPUs */ + for (apicid = 0; disabled; disabled--, apicid++) { + apicid = find_next_andnot_bit(apic_maps[TOPO_SMT_DOMAIN].map, phys_cpu_present_map, + MAX_LOCAL_APIC, apicid); + if (apicid >= MAX_LOCAL_APIC) + break; + cpuid_to_apicid[topo_info.nr_assigned_cpus++] = apicid; } - core_select_mask = (~(-1 << pkg_mask_width)) >> ht_mask_width; - die_select_mask = (~(-1 << die_plus_mask_width)) >> - core_plus_mask_width; + for (cpu = 0; cpu < allowed; cpu++) { + apicid = cpuid_to_apicid[cpu]; - c->topo.core_id = apic->phys_pkg_id(c->topo.initial_apicid, - ht_mask_width) & core_select_mask; + set_cpu_possible(cpu, true); - if (die_level_present) { - c->topo.die_id = apic->phys_pkg_id(c->topo.initial_apicid, - core_plus_mask_width) & die_select_mask; + if (apicid == BAD_APICID) + continue; + + cpu_mark_primary_thread(cpu, apicid); + set_cpu_present(cpu, test_bit(apicid, phys_cpu_present_map)); } +} - c->topo.pkg_id = apic->phys_pkg_id(c->topo.initial_apicid, pkg_mask_width); - /* - * Reinit the apicid, now that we have extended initial_apicid. - */ - c->topo.apicid = apic->phys_pkg_id(c->topo.initial_apicid, 0); +/* + * Late SMP disable after sizing CPU masks when APIC/IOAPIC setup failed. + */ +void __init topology_reset_possible_cpus_up(void) +{ + init_cpu_present(cpumask_of(0)); + init_cpu_possible(cpumask_of(0)); - c->x86_max_cores = (core_level_siblings / smp_num_siblings); - __max_die_per_package = (die_level_siblings / core_level_siblings); -#endif + bitmap_zero(phys_cpu_present_map, MAX_LOCAL_APIC); + if (topo_info.boot_cpu_apic_id != BAD_APICID) + set_bit(topo_info.boot_cpu_apic_id, phys_cpu_present_map); +} + +static int __init setup_possible_cpus(char *str) +{ + get_option(&str, &max_possible_cpus); return 0; } +early_param("possible_cpus", setup_possible_cpus); +#endif diff --git a/arch/x86/kernel/cpu/topology.h b/arch/x86/kernel/cpu/topology.h new file mode 100644 index 000000000000..37326297f80c --- /dev/null +++ b/arch/x86/kernel/cpu/topology.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef ARCH_X86_TOPOLOGY_H +#define ARCH_X86_TOPOLOGY_H + +struct topo_scan { + struct cpuinfo_x86 *c; + unsigned int dom_shifts[TOPO_MAX_DOMAIN]; + unsigned int dom_ncpus[TOPO_MAX_DOMAIN]; + + /* Legacy CPUID[1]:EBX[23:16] number of logical processors */ + unsigned int ebx1_nproc_shift; + + /* AMD specific node ID which cannot be mapped into APIC space. */ + u16 amd_nodes_per_pkg; + u16 amd_node_id; +}; + +void cpu_init_topology(struct cpuinfo_x86 *c); +void cpu_parse_topology(struct cpuinfo_x86 *c); +void topology_set_dom(struct topo_scan *tscan, enum x86_topology_domains dom, + unsigned int shift, unsigned int ncpus); +bool cpu_parse_topology_ext(struct topo_scan *tscan); +void cpu_parse_topology_amd(struct topo_scan *tscan); +void cpu_topology_fixup_amd(struct topo_scan *tscan); + +static inline u32 topo_shift_apicid(u32 apicid, enum x86_topology_domains dom) +{ + if (dom == TOPO_SMT_DOMAIN) + return apicid; + return apicid >> x86_topo_system.dom_shifts[dom - 1]; +} + +static inline u32 topo_relative_domain_id(u32 apicid, enum x86_topology_domains dom) +{ + if (dom != TOPO_SMT_DOMAIN) + apicid >>= x86_topo_system.dom_shifts[dom - 1]; + return apicid & (x86_topo_system.dom_size[dom] - 1); +} + +static inline u32 topo_domain_mask(enum x86_topology_domains dom) +{ + return (1U << x86_topo_system.dom_shifts[dom]) - 1; +} + +/* + * Update a domain level after the fact without propagating. Used to fixup + * broken CPUID enumerations. + */ +static inline void topology_update_dom(struct topo_scan *tscan, enum x86_topology_domains dom, + unsigned int shift, unsigned int ncpus) +{ + tscan->dom_shifts[dom] = shift; + tscan->dom_ncpus[dom] = ncpus; +} + +#ifdef CONFIG_X86_LOCAL_APIC +unsigned int topology_unit_count(u32 apicid, enum x86_topology_domains which_units, + enum x86_topology_domains at_level); +#else +static inline unsigned int topology_unit_count(u32 apicid, enum x86_topology_domains which_units, + enum x86_topology_domains at_level) +{ + return 1; +} +#endif + +#endif /* ARCH_X86_TOPOLOGY_H */ diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c new file mode 100644 index 000000000000..1a8b3ad493af --- /dev/null +++ b/arch/x86/kernel/cpu/topology_amd.c @@ -0,0 +1,183 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/cpu.h> + +#include <asm/apic.h> +#include <asm/memtype.h> +#include <asm/processor.h> + +#include "cpu.h" + +static bool parse_8000_0008(struct topo_scan *tscan) +{ + struct { + // ecx + u32 cpu_nthreads : 8, // Number of physical threads - 1 + : 4, // Reserved + apicid_coreid_len : 4, // Number of thread core ID bits (shift) in APIC ID + perf_tsc_len : 2, // Performance time-stamp counter size + : 14; // Reserved + } ecx; + unsigned int sft; + + if (tscan->c->extended_cpuid_level < 0x80000008) + return false; + + cpuid_leaf_reg(0x80000008, CPUID_ECX, &ecx); + + /* If the thread bits are 0, then get the shift value from ecx.cpu_nthreads */ + sft = ecx.apicid_coreid_len; + if (!sft) + sft = get_count_order(ecx.cpu_nthreads + 1); + + topology_set_dom(tscan, TOPO_SMT_DOMAIN, sft, ecx.cpu_nthreads + 1); + return true; +} + +static void store_node(struct topo_scan *tscan, unsigned int nr_nodes, u16 node_id) +{ + /* + * Starting with Fam 17h the DIE domain could probably be used to + * retrieve the node info on AMD/HYGON. Analysis of CPUID dumps + * suggests it's the topmost bit(s) of the CPU cores area, but + * that's guess work and neither enumerated nor documented. + * + * Up to Fam 16h this does not work at all and the legacy node ID + * has to be used. + */ + tscan->amd_nodes_per_pkg = nr_nodes; + tscan->amd_node_id = node_id; +} + +static bool parse_8000_001e(struct topo_scan *tscan, bool has_0xb) +{ + struct { + // eax + u32 ext_apic_id : 32; // Extended APIC ID + // ebx + u32 core_id : 8, // Unique per-socket logical core unit ID + core_nthreads : 8, // #Threads per core (zero-based) + : 16; // Reserved + // ecx + u32 node_id : 8, // Node (die) ID of invoking logical CPU + nnodes_per_socket : 3, // #nodes in invoking logical CPU's package/socket + : 21; // Reserved + // edx + u32 : 32; // Reserved + } leaf; + + if (!boot_cpu_has(X86_FEATURE_TOPOEXT)) + return false; + + cpuid_leaf(0x8000001e, &leaf); + + tscan->c->topo.initial_apicid = leaf.ext_apic_id; + + /* + * If leaf 0xb is available, then SMT shift is set already. If not + * take it from ecx.threads_per_core and use topo_update_dom() - + * topology_set_dom() would propagate and overwrite the already + * propagated CORE level. + */ + if (!has_0xb) { + unsigned int nthreads = leaf.core_nthreads + 1; + + topology_update_dom(tscan, TOPO_SMT_DOMAIN, get_count_order(nthreads), nthreads); + } + + store_node(tscan, leaf.nnodes_per_socket + 1, leaf.node_id); + + if (tscan->c->x86_vendor == X86_VENDOR_AMD) { + if (tscan->c->x86 == 0x15) + tscan->c->topo.cu_id = leaf.core_id; + + cacheinfo_amd_init_llc_id(tscan->c, leaf.node_id); + } else { + /* + * Package ID is ApicId[6..] on certain Hygon CPUs. See + * commit e0ceeae708ce for explanation. The topology info + * is screwed up: The package shift is always 6 and the + * node ID is bit [4:5]. + */ + if (!boot_cpu_has(X86_FEATURE_HYPERVISOR) && tscan->c->x86_model <= 0x3) { + topology_set_dom(tscan, TOPO_CORE_DOMAIN, 6, + tscan->dom_ncpus[TOPO_CORE_DOMAIN]); + } + cacheinfo_hygon_init_llc_id(tscan->c); + } + return true; +} + +static bool parse_fam10h_node_id(struct topo_scan *tscan) +{ + struct { + union { + u64 node_id : 3, + nodes_per_pkg : 3, + unused : 58; + u64 msr; + }; + } nid; + + if (!boot_cpu_has(X86_FEATURE_NODEID_MSR)) + return false; + + rdmsrl(MSR_FAM10H_NODE_ID, nid.msr); + store_node(tscan, nid.nodes_per_pkg + 1, nid.node_id); + tscan->c->topo.llc_id = nid.node_id; + return true; +} + +static void legacy_set_llc(struct topo_scan *tscan) +{ + unsigned int apicid = tscan->c->topo.initial_apicid; + + /* parse_8000_0008() set everything up except llc_id */ + tscan->c->topo.llc_id = apicid >> tscan->dom_shifts[TOPO_CORE_DOMAIN]; +} + +static void parse_topology_amd(struct topo_scan *tscan) +{ + bool has_0xb = false; + + /* + * If the extended topology leaf 0x8000_001e is available + * try to get SMT and CORE shift from leaf 0xb first, then + * try to get the CORE shift from leaf 0x8000_0008. + */ + if (cpu_feature_enabled(X86_FEATURE_TOPOEXT)) + has_0xb = cpu_parse_topology_ext(tscan); + + if (!has_0xb && !parse_8000_0008(tscan)) + return; + + /* Prefer leaf 0x8000001e if available */ + if (parse_8000_001e(tscan, has_0xb)) + return; + + /* Try the NODEID MSR */ + if (parse_fam10h_node_id(tscan)) + return; + + legacy_set_llc(tscan); +} + +void cpu_parse_topology_amd(struct topo_scan *tscan) +{ + tscan->amd_nodes_per_pkg = 1; + parse_topology_amd(tscan); + + if (tscan->amd_nodes_per_pkg > 1) + set_cpu_cap(tscan->c, X86_FEATURE_AMD_DCM); +} + +void cpu_topology_fixup_amd(struct topo_scan *tscan) +{ + struct cpuinfo_x86 *c = tscan->c; + + /* + * Adjust the core_id relative to the node when there is more than + * one node. + */ + if (tscan->c->x86 < 0x17 && tscan->amd_nodes_per_pkg > 1) + c->topo.core_id %= tscan->dom_ncpus[TOPO_CORE_DOMAIN] / tscan->amd_nodes_per_pkg; +} diff --git a/arch/x86/kernel/cpu/topology_common.c b/arch/x86/kernel/cpu/topology_common.c new file mode 100644 index 000000000000..a50ae8d63d1c --- /dev/null +++ b/arch/x86/kernel/cpu/topology_common.c @@ -0,0 +1,218 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/cpu.h> + +#include <xen/xen.h> + +#include <asm/apic.h> +#include <asm/processor.h> +#include <asm/smp.h> + +#include "cpu.h" + +struct x86_topology_system x86_topo_system __ro_after_init; +EXPORT_SYMBOL_GPL(x86_topo_system); + +unsigned int __amd_nodes_per_pkg __ro_after_init; +EXPORT_SYMBOL_GPL(__amd_nodes_per_pkg); + +void topology_set_dom(struct topo_scan *tscan, enum x86_topology_domains dom, + unsigned int shift, unsigned int ncpus) +{ + topology_update_dom(tscan, dom, shift, ncpus); + + /* Propagate to the upper levels */ + for (dom++; dom < TOPO_MAX_DOMAIN; dom++) { + tscan->dom_shifts[dom] = tscan->dom_shifts[dom - 1]; + tscan->dom_ncpus[dom] = tscan->dom_ncpus[dom - 1]; + } +} + +static unsigned int __maybe_unused parse_num_cores_legacy(struct cpuinfo_x86 *c) +{ + struct { + u32 cache_type : 5, + unused : 21, + ncores : 6; + } eax; + + if (c->cpuid_level < 4) + return 1; + + cpuid_subleaf_reg(4, 0, CPUID_EAX, &eax); + if (!eax.cache_type) + return 1; + + return eax.ncores + 1; +} + +static void parse_legacy(struct topo_scan *tscan) +{ + unsigned int cores, core_shift, smt_shift = 0; + struct cpuinfo_x86 *c = tscan->c; + + cores = parse_num_cores_legacy(c); + core_shift = get_count_order(cores); + + if (cpu_has(c, X86_FEATURE_HT)) { + if (!WARN_ON_ONCE(tscan->ebx1_nproc_shift < core_shift)) + smt_shift = tscan->ebx1_nproc_shift - core_shift; + /* + * The parser expects leaf 0xb/0x1f format, which means + * the number of logical processors at core level is + * counting threads. + */ + core_shift += smt_shift; + cores <<= smt_shift; + } + + topology_set_dom(tscan, TOPO_SMT_DOMAIN, smt_shift, 1U << smt_shift); + topology_set_dom(tscan, TOPO_CORE_DOMAIN, core_shift, cores); +} + +static bool fake_topology(struct topo_scan *tscan) +{ + /* + * Preset the CORE level shift for CPUID less systems and XEN_PV, + * which has useless CPUID information. + */ + topology_set_dom(tscan, TOPO_SMT_DOMAIN, 0, 1); + topology_set_dom(tscan, TOPO_CORE_DOMAIN, 0, 1); + + return tscan->c->cpuid_level < 1; +} + +static void parse_topology(struct topo_scan *tscan, bool early) +{ + const struct cpuinfo_topology topo_defaults = { + .cu_id = 0xff, + .llc_id = BAD_APICID, + .l2c_id = BAD_APICID, + }; + struct cpuinfo_x86 *c = tscan->c; + struct { + u32 unused0 : 16, + nproc : 8, + apicid : 8; + } ebx; + + c->topo = topo_defaults; + + if (fake_topology(tscan)) + return; + + /* Preset Initial APIC ID from CPUID leaf 1 */ + cpuid_leaf_reg(1, CPUID_EBX, &ebx); + c->topo.initial_apicid = ebx.apicid; + + /* + * The initial invocation from early_identify_cpu() happens before + * the APIC is mapped or X2APIC enabled. For establishing the + * topology, that's not required. Use the initial APIC ID. + */ + if (early) + c->topo.apicid = c->topo.initial_apicid; + else + c->topo.apicid = read_apic_id(); + + /* The above is sufficient for UP */ + if (!IS_ENABLED(CONFIG_SMP)) + return; + + tscan->ebx1_nproc_shift = get_count_order(ebx.nproc); + + switch (c->x86_vendor) { + case X86_VENDOR_AMD: + if (IS_ENABLED(CONFIG_CPU_SUP_AMD)) + cpu_parse_topology_amd(tscan); + break; + case X86_VENDOR_CENTAUR: + case X86_VENDOR_ZHAOXIN: + parse_legacy(tscan); + break; + case X86_VENDOR_INTEL: + if (!IS_ENABLED(CONFIG_CPU_SUP_INTEL) || !cpu_parse_topology_ext(tscan)) + parse_legacy(tscan); + break; + case X86_VENDOR_HYGON: + if (IS_ENABLED(CONFIG_CPU_SUP_HYGON)) + cpu_parse_topology_amd(tscan); + break; + } +} + +static void topo_set_ids(struct topo_scan *tscan) +{ + struct cpuinfo_x86 *c = tscan->c; + u32 apicid = c->topo.apicid; + + c->topo.pkg_id = topo_shift_apicid(apicid, TOPO_PKG_DOMAIN); + c->topo.die_id = topo_shift_apicid(apicid, TOPO_DIE_DOMAIN); + + c->topo.logical_pkg_id = topology_get_logical_id(apicid, TOPO_PKG_DOMAIN); + c->topo.logical_die_id = topology_get_logical_id(apicid, TOPO_DIE_DOMAIN); + + /* Package relative core ID */ + c->topo.core_id = (apicid & topo_domain_mask(TOPO_PKG_DOMAIN)) >> + x86_topo_system.dom_shifts[TOPO_SMT_DOMAIN]; + + c->topo.amd_node_id = tscan->amd_node_id; + + if (c->x86_vendor == X86_VENDOR_AMD) + cpu_topology_fixup_amd(tscan); +} + +void cpu_parse_topology(struct cpuinfo_x86 *c) +{ + unsigned int dom, cpu = smp_processor_id(); + struct topo_scan tscan = { .c = c, }; + + parse_topology(&tscan, false); + + if (IS_ENABLED(CONFIG_X86_LOCAL_APIC)) { + if (c->topo.initial_apicid != c->topo.apicid) { + pr_err(FW_BUG "CPU%4u: APIC ID mismatch. CPUID: 0x%04x APIC: 0x%04x\n", + cpu, c->topo.initial_apicid, c->topo.apicid); + } + + if (c->topo.apicid != cpuid_to_apicid[cpu]) { + pr_err(FW_BUG "CPU%4u: APIC ID mismatch. Firmware: 0x%04x APIC: 0x%04x\n", + cpu, cpuid_to_apicid[cpu], c->topo.apicid); + } + } + + for (dom = TOPO_SMT_DOMAIN; dom < TOPO_MAX_DOMAIN; dom++) { + if (tscan.dom_shifts[dom] == x86_topo_system.dom_shifts[dom]) + continue; + pr_err(FW_BUG "CPU%d: Topology domain %u shift %u != %u\n", cpu, dom, + tscan.dom_shifts[dom], x86_topo_system.dom_shifts[dom]); + } + + topo_set_ids(&tscan); +} + +void __init cpu_init_topology(struct cpuinfo_x86 *c) +{ + struct topo_scan tscan = { .c = c, }; + unsigned int dom, sft; + + parse_topology(&tscan, true); + + /* Copy the shift values and calculate the unit sizes. */ + memcpy(x86_topo_system.dom_shifts, tscan.dom_shifts, sizeof(x86_topo_system.dom_shifts)); + + dom = TOPO_SMT_DOMAIN; + x86_topo_system.dom_size[dom] = 1U << x86_topo_system.dom_shifts[dom]; + + for (dom++; dom < TOPO_MAX_DOMAIN; dom++) { + sft = x86_topo_system.dom_shifts[dom] - x86_topo_system.dom_shifts[dom - 1]; + x86_topo_system.dom_size[dom] = 1U << sft; + } + + topo_set_ids(&tscan); + + /* + * AMD systems have Nodes per package which cannot be mapped to + * APIC ID. + */ + __amd_nodes_per_pkg = tscan.amd_nodes_per_pkg; +} diff --git a/arch/x86/kernel/cpu/topology_ext.c b/arch/x86/kernel/cpu/topology_ext.c new file mode 100644 index 000000000000..e477228cd5b2 --- /dev/null +++ b/arch/x86/kernel/cpu/topology_ext.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/cpu.h> + +#include <asm/apic.h> +#include <asm/memtype.h> +#include <asm/processor.h> + +#include "cpu.h" + +enum topo_types { + INVALID_TYPE = 0, + SMT_TYPE = 1, + CORE_TYPE = 2, + MAX_TYPE_0B = 3, + MODULE_TYPE = 3, + TILE_TYPE = 4, + DIE_TYPE = 5, + DIEGRP_TYPE = 6, + MAX_TYPE_1F = 7, +}; + +/* + * Use a lookup table for the case that there are future types > 6 which + * describe an intermediate domain level which does not exist today. + */ +static const unsigned int topo_domain_map_0b_1f[MAX_TYPE_1F] = { + [SMT_TYPE] = TOPO_SMT_DOMAIN, + [CORE_TYPE] = TOPO_CORE_DOMAIN, + [MODULE_TYPE] = TOPO_MODULE_DOMAIN, + [TILE_TYPE] = TOPO_TILE_DOMAIN, + [DIE_TYPE] = TOPO_DIE_DOMAIN, + [DIEGRP_TYPE] = TOPO_DIEGRP_DOMAIN, +}; + +static inline bool topo_subleaf(struct topo_scan *tscan, u32 leaf, u32 subleaf, + unsigned int *last_dom) +{ + unsigned int dom, maxtype; + const unsigned int *map; + struct { + // eax + u32 x2apic_shift : 5, // Number of bits to shift APIC ID right + // for the topology ID at the next level + : 27; // Reserved + // ebx + u32 num_processors : 16, // Number of processors at current level + : 16; // Reserved + // ecx + u32 level : 8, // Current topology level. Same as sub leaf number + type : 8, // Level type. If 0, invalid + : 16; // Reserved + // edx + u32 x2apic_id : 32; // X2APIC ID of the current logical processor + } sl; + + switch (leaf) { + case 0x0b: maxtype = MAX_TYPE_0B; map = topo_domain_map_0b_1f; break; + case 0x1f: maxtype = MAX_TYPE_1F; map = topo_domain_map_0b_1f; break; + default: return false; + } + + cpuid_subleaf(leaf, subleaf, &sl); + + if (!sl.num_processors || sl.type == INVALID_TYPE) + return false; + + if (sl.type >= maxtype) { + pr_err_once("Topology: leaf 0x%x:%d Unknown domain type %u\n", + leaf, subleaf, sl.type); + /* + * It really would have been too obvious to make the domain + * type space sparse and leave a few reserved types between + * the points which might change instead of following the + * usual "this can be fixed in software" principle. + */ + dom = *last_dom + 1; + } else { + dom = map[sl.type]; + *last_dom = dom; + } + + if (!dom) { + tscan->c->topo.initial_apicid = sl.x2apic_id; + } else if (tscan->c->topo.initial_apicid != sl.x2apic_id) { + pr_warn_once(FW_BUG "CPUID leaf 0x%x subleaf %d APIC ID mismatch %x != %x\n", + leaf, subleaf, tscan->c->topo.initial_apicid, sl.x2apic_id); + } + + topology_set_dom(tscan, dom, sl.x2apic_shift, sl.num_processors); + return true; +} + +static bool parse_topology_leaf(struct topo_scan *tscan, u32 leaf) +{ + unsigned int last_dom; + u32 subleaf; + + /* Read all available subleafs and populate the levels */ + for (subleaf = 0, last_dom = 0; topo_subleaf(tscan, leaf, subleaf, &last_dom); subleaf++); + + /* If subleaf 0 failed to parse, give up */ + if (!subleaf) + return false; + + /* + * There are machines in the wild which have shift 0 in the subleaf + * 0, but advertise 2 logical processors at that level. They are + * truly SMT. + */ + if (!tscan->dom_shifts[TOPO_SMT_DOMAIN] && tscan->dom_ncpus[TOPO_SMT_DOMAIN] > 1) { + unsigned int sft = get_count_order(tscan->dom_ncpus[TOPO_SMT_DOMAIN]); + + pr_warn_once(FW_BUG "CPUID leaf 0x%x subleaf 0 has shift level 0 but %u CPUs. Fixing it up.\n", + leaf, tscan->dom_ncpus[TOPO_SMT_DOMAIN]); + topology_update_dom(tscan, TOPO_SMT_DOMAIN, sft, tscan->dom_ncpus[TOPO_SMT_DOMAIN]); + } + + set_cpu_cap(tscan->c, X86_FEATURE_XTOPOLOGY); + return true; +} + +bool cpu_parse_topology_ext(struct topo_scan *tscan) +{ + /* Intel: Try leaf 0x1F first. */ + if (tscan->c->cpuid_level >= 0x1f && parse_topology_leaf(tscan, 0x1f)) + return true; + + /* Intel/AMD: Fall back to leaf 0xB if available */ + return tscan->c->cpuid_level >= 0x0b && parse_topology_leaf(tscan, 0x0b); +} diff --git a/arch/x86/kernel/cpu/zhaoxin.c b/arch/x86/kernel/cpu/zhaoxin.c index 415564a6523b..90eba7eb5335 100644 --- a/arch/x86/kernel/cpu/zhaoxin.c +++ b/arch/x86/kernel/cpu/zhaoxin.c @@ -71,10 +71,6 @@ static void init_zhaoxin(struct cpuinfo_x86 *c) { early_init_zhaoxin(c); init_intel_cacheinfo(c); - detect_num_cpu_cores(c); -#ifdef CONFIG_X86_32 - detect_ht(c); -#endif if (c->cpuid_level > 9) { unsigned int eax = cpuid_eax(10); |