Diffstat (limited to 'arch/x86/kernel'): 171 files changed, 10796 insertions, 10724 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0000325ab98f..b43eb7e384eb 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -17,7 +17,6 @@ CFLAGS_REMOVE_ftrace.o = -pg CFLAGS_REMOVE_early_printk.o = -pg CFLAGS_REMOVE_head64.o = -pg CFLAGS_REMOVE_head32.o = -pg -CFLAGS_REMOVE_sev.o = -pg CFLAGS_REMOVE_rethook.o = -pg endif @@ -26,7 +25,6 @@ KASAN_SANITIZE_dumpstack.o := n KASAN_SANITIZE_dumpstack_$(BITS).o := n KASAN_SANITIZE_stacktrace.o := n KASAN_SANITIZE_paravirt.o := n -KASAN_SANITIZE_sev.o := n # With some compiler versions the generated code results in boot hangs, caused # by several compilation units. To be safe, disable all instrumentation. @@ -37,9 +35,16 @@ KMSAN_SANITIZE_nmi.o := n # If instrumentation of the following files is enabled, boot hangs during # first second. KCOV_INSTRUMENT_head$(BITS).o := n -KCOV_INSTRUMENT_sev.o := n +# These are called from save_stack_trace() on debug paths, +# and produce large amounts of uninteresting coverage. +KCOV_INSTRUMENT_stacktrace.o := n +KCOV_INSTRUMENT_dumpstack.o := n +KCOV_INSTRUMENT_dumpstack_$(BITS).o := n +KCOV_INSTRUMENT_unwind_orc.o := n +KCOV_INSTRUMENT_unwind_frame.o := n +KCOV_INSTRUMENT_unwind_guess.o := n -CFLAGS_irq.o := -I $(srctree)/$(src)/../include/asm/trace +CFLAGS_irq.o := -I $(src)/../include/asm/trace obj-y += head_$(BITS).o obj-y += head$(BITS).o @@ -48,6 +53,7 @@ obj-y += platform-quirks.o obj-y += process_$(BITS).o signal.o signal_$(BITS).o obj-y += traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o dumpstack.o nmi.o +obj-$(CONFIG_X86_FRED) += fred.o obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o obj-$(CONFIG_X86_KERNEL_IBT) += ibt_selftest.o obj-y += setup.o x86_init.o i8259.o irqinit.o @@ -60,7 +66,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o obj-$(CONFIG_SYSFS) += ksysfs.o obj-y += bootflag.o e820.o -obj-y += pci-dma.o quirks.o topology.o kdebugfs.o +obj-y += pci-dma.o quirks.o kdebugfs.o obj-y += alternative.o i8253.o hw_breakpoint.o obj-y += tsc.o tsc_msr.o io_delay.o rtc.o obj-y += resource.o @@ -98,11 +104,11 @@ obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o obj-$(CONFIG_X86_TSC) += trace_clock.o obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_RETHOOK) += rethook.o -obj-$(CONFIG_CRASH_CORE) += crash_core_$(BITS).o +obj-$(CONFIG_VMCORE_INFO) += vmcore_info_$(BITS).o obj-$(CONFIG_KEXEC_CORE) += machine_kexec_$(BITS).o -obj-$(CONFIG_KEXEC_CORE) += relocate_kernel_$(BITS).o crash.o +obj-$(CONFIG_KEXEC_CORE) += relocate_kernel_$(BITS).o obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o -obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o +obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o crash.o obj-y += kprobes/ obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_X86_32) += doublefault_32.o @@ -113,6 +119,7 @@ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_AMD_NB) += amd_nb.o +obj-$(CONFIG_AMD_NODE) += amd_node.o obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o @@ -140,8 +147,6 @@ obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o -obj-$(CONFIG_AMD_MEM_ENCRYPT) += sev.o - obj-$(CONFIG_CFI_CLANG) += cfi.o obj-$(CONFIG_CALL_THUNKS) += callthunks.o diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile index fc17b3f136fe..842a5f449404 100644 --- a/arch/x86/kernel/acpi/Makefile +++ b/arch/x86/kernel/acpi/Makefile @@ -4,6 
+4,7 @@ obj-$(CONFIG_ACPI) += boot.o obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o obj-$(CONFIG_ACPI_APEI) += apei.o obj-$(CONFIG_ACPI_CPPC_LIB) += cppc.o +obj-$(CONFIG_ACPI_MADT_WAKEUP) += madt_wakeup.o madt_playdead.o ifneq ($(CONFIG_ACPI_PROCESSOR),) obj-y += cstate.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 85a3ce2a3666..dae6a73be40e 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -67,13 +67,6 @@ static bool has_lapic_cpus __initdata; static bool acpi_support_online_capable; #endif -#ifdef CONFIG_X86_64 -/* Physical address of the Multiprocessor Wakeup Structure mailbox */ -static u64 acpi_mp_wake_mailbox_paddr; -/* Virtual address of the Multiprocessor Wakeup Structure mailbox */ -static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox; -#endif - #ifdef CONFIG_X86_IO_APIC /* * Locks related to IOAPIC hotplug @@ -164,35 +157,6 @@ static int __init acpi_parse_madt(struct acpi_table_header *table) return 0; } -/** - * acpi_register_lapic - register a local apic and generates a logic cpu number - * @id: local apic id to register - * @acpiid: ACPI id to register - * @enabled: this cpu is enabled or not - * - * Returns the logic cpu number which maps to the local apic - */ -static int acpi_register_lapic(int id, u32 acpiid, u8 enabled) -{ - int cpu; - - if (id >= MAX_LOCAL_APIC) { - pr_info("skipped apicid that is too big\n"); - return -EINVAL; - } - - if (!enabled) { - ++disabled_cpus; - return -EINVAL; - } - - cpu = generic_processor_info(id); - if (cpu >= 0) - early_per_cpu(x86_cpu_to_acpiid, cpu) = acpiid; - - return cpu; -} - static bool __init acpi_is_processor_usable(u32 lapic_flags) { if (lapic_flags & ACPI_MADT_ENABLED) @@ -254,7 +218,7 @@ acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end) return 0; } - acpi_register_lapic(apic_id, processor->uid, enabled); + topology_register_apic(apic_id, processor->uid, enabled); #else pr_warn("x2apic entry ignored\n"); #endif @@ -263,6 +227,28 @@ acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end) } static int __init +acpi_check_lapic(union acpi_subtable_headers *header, const unsigned long end) +{ + struct acpi_madt_local_apic *processor = NULL; + + processor = (struct acpi_madt_local_apic *)header; + + if (BAD_MADT_ENTRY(processor, end)) + return -EINVAL; + + /* Ignore invalid ID */ + if (processor->id == 0xff) + return 0; + + /* Ignore processors that can not be onlined */ + if (!acpi_is_processor_usable(processor->lapic_flags)) + return 0; + + has_lapic_cpus = true; + return 0; +} + +static int __init acpi_parse_lapic(union acpi_subtable_headers * header, const unsigned long end) { struct acpi_madt_local_apic *processor = NULL; @@ -289,11 +275,10 @@ acpi_parse_lapic(union acpi_subtable_headers * header, const unsigned long end) * to not preallocating memory for all NR_CPUS * when we use CPU hotplug. 
*/ - acpi_register_lapic(processor->id, /* APIC ID */ - processor->processor_id, /* ACPI ID */ - processor->lapic_flags & ACPI_MADT_ENABLED); + topology_register_apic(processor->id, /* APIC ID */ + processor->processor_id, /* ACPI ID */ + processor->lapic_flags & ACPI_MADT_ENABLED); - has_lapic_cpus = true; return 0; } @@ -309,9 +294,9 @@ acpi_parse_sapic(union acpi_subtable_headers *header, const unsigned long end) acpi_table_print_madt_entry(&header->common); - acpi_register_lapic((processor->id << 8) | processor->eid,/* APIC ID */ - processor->processor_id, /* ACPI ID */ - processor->lapic_flags & ACPI_MADT_ENABLED); + topology_register_apic((processor->id << 8) | processor->eid,/* APIC ID */ + processor->processor_id, /* ACPI ID */ + processor->lapic_flags & ACPI_MADT_ENABLED); return 0; } @@ -370,60 +355,6 @@ acpi_parse_lapic_nmi(union acpi_subtable_headers * header, const unsigned long e return 0; } - -#ifdef CONFIG_X86_64 -static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip) -{ - /* - * Remap mailbox memory only for the first call to acpi_wakeup_cpu(). - * - * Wakeup of secondary CPUs is fully serialized in the core code. - * No need to protect acpi_mp_wake_mailbox from concurrent accesses. - */ - if (!acpi_mp_wake_mailbox) { - acpi_mp_wake_mailbox = memremap(acpi_mp_wake_mailbox_paddr, - sizeof(*acpi_mp_wake_mailbox), - MEMREMAP_WB); - } - - /* - * Mailbox memory is shared between the firmware and OS. Firmware will - * listen on mailbox command address, and once it receives the wakeup - * command, the CPU associated with the given apicid will be booted. - * - * The value of 'apic_id' and 'wakeup_vector' must be visible to the - * firmware before the wakeup command is visible. smp_store_release() - * ensures ordering and visibility. - */ - acpi_mp_wake_mailbox->apic_id = apicid; - acpi_mp_wake_mailbox->wakeup_vector = start_ip; - smp_store_release(&acpi_mp_wake_mailbox->command, - ACPI_MP_WAKE_COMMAND_WAKEUP); - - /* - * Wait for the CPU to wake up. - * - * The CPU being woken up is essentially in a spin loop waiting to be - * woken up. It should not take long for it wake up and acknowledge by - * zeroing out ->command. - * - * ACPI specification doesn't provide any guidance on how long kernel - * has to wait for a wake up acknowledgement. It also doesn't provide - * a way to cancel a wake up request if it takes too long. - * - * In TDX environment, the VMM has control over how long it takes to - * wake up secondary. It can postpone scheduling secondary vCPU - * indefinitely. Giving up on wake up request and reporting error opens - * possible attack vector for VMM: it can wake up a secondary CPU when - * kernel doesn't expect it. Wait until positive result of the wake up - * request. 
- */ - while (READ_ONCE(acpi_mp_wake_mailbox->command)) - cpu_relax(); - - return 0; -} -#endif /* CONFIG_X86_64 */ #endif /* CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_IO_APIC @@ -844,12 +775,10 @@ static int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) return 0; } -int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 acpi_id, - int *pcpu) +int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 acpi_id, int *pcpu) { - int cpu; + int cpu = topology_hotplug_apic(physid, acpi_id); - cpu = acpi_register_lapic(physid, acpi_id, ACPI_MADT_ENABLED); if (cpu < 0) { pr_info("Unable to map lapic to logical cpu number\n"); return cpu; @@ -868,15 +797,11 @@ int acpi_unmap_cpu(int cpu) #ifdef CONFIG_ACPI_NUMA set_apicid_to_node(per_cpu(x86_cpu_to_apicid, cpu), NUMA_NO_NODE); #endif - - per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID; - set_cpu_present(cpu, false); - num_processors--; - - return (0); + topology_hotunplug_apic(cpu); + return 0; } EXPORT_SYMBOL(acpi_unmap_cpu); -#endif /* CONFIG_ACPI_HOTPLUG_CPU */ +#endif /* CONFIG_ACPI_HOTPLUG_CPU */ int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base) { @@ -1007,11 +932,8 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table) * the resource tree during the lateinit timeframe. */ #define HPET_RESOURCE_NAME_SIZE 9 - hpet_res = memblock_alloc(sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE, + hpet_res = memblock_alloc_or_panic(sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE, SMP_CACHE_BYTES); - if (!hpet_res) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE); hpet_res->name = (void *)&hpet_res[1]; hpet_res->flags = IORESOURCE_MEM; @@ -1125,6 +1047,8 @@ static int __init early_acpi_parse_madt_lapic_addr_ovr(void) static int __init acpi_parse_madt_lapic_entries(void) { int count, x2count = 0; + struct acpi_subtable_proc madt_proc[2]; + int ret; if (!boot_cpu_has(X86_FEATURE_APIC)) return -ENODEV; @@ -1133,10 +1057,27 @@ static int __init acpi_parse_madt_lapic_entries(void) acpi_parse_sapic, MAX_LOCAL_APIC); if (!count) { - count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC, - acpi_parse_lapic, MAX_LOCAL_APIC); - x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC, - acpi_parse_x2apic, MAX_LOCAL_APIC); + /* Check if there are valid LAPIC entries */ + acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC, acpi_check_lapic, MAX_LOCAL_APIC); + + /* + * Enumerate the APIC IDs in the order that they appear in the + * MADT, no matter LAPIC entry or x2APIC entry is used. 
+ */ + memset(madt_proc, 0, sizeof(madt_proc)); + madt_proc[0].id = ACPI_MADT_TYPE_LOCAL_APIC; + madt_proc[0].handler = acpi_parse_lapic; + madt_proc[1].id = ACPI_MADT_TYPE_LOCAL_X2APIC; + madt_proc[1].handler = acpi_parse_x2apic; + ret = acpi_table_parse_entries_array(ACPI_SIG_MADT, + sizeof(struct acpi_table_madt), + madt_proc, ARRAY_SIZE(madt_proc), MAX_LOCAL_APIC); + if (ret < 0) { + pr_err("Error parsing LAPIC/X2APIC entries\n"); + return ret; + } + count = madt_proc[0].count; + x2count = madt_proc[1].count; } if (!count && !x2count) { pr_err("No LAPIC entries present\n"); @@ -1159,29 +1100,6 @@ static int __init acpi_parse_madt_lapic_entries(void) } return 0; } - -#ifdef CONFIG_X86_64 -static int __init acpi_parse_mp_wake(union acpi_subtable_headers *header, - const unsigned long end) -{ - struct acpi_madt_multiproc_wakeup *mp_wake; - - if (!IS_ENABLED(CONFIG_SMP)) - return -ENODEV; - - mp_wake = (struct acpi_madt_multiproc_wakeup *)header; - if (BAD_MADT_ENTRY(mp_wake, end)) - return -EINVAL; - - acpi_table_print_madt_entry(&header->common); - - acpi_mp_wake_mailbox_paddr = mp_wake->base_address; - - apic_update_callback(wakeup_secondary_cpu_64, acpi_wakeup_cpu); - - return 0; -} -#endif /* CONFIG_X86_64 */ #endif /* CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_IO_APIC @@ -1290,7 +1208,8 @@ static int __init acpi_parse_madt_ioapic_entries(void) } count = acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, - acpi_parse_int_src_ovr, nr_irqs); + acpi_parse_int_src_ovr, + irq_get_nr_irqs()); if (count < 0) { pr_err("Error parsing interrupt source overrides entry\n"); /* TBD: Cleanup to allow fallback to MPS */ @@ -1310,7 +1229,8 @@ static int __init acpi_parse_madt_ioapic_entries(void) mp_config_acpi_legacy_irqs(); count = acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, - acpi_parse_nmi_src, nr_irqs); + acpi_parse_nmi_src, + irq_get_nr_irqs()); if (count < 0) { pr_err("Error parsing NMI SRC entry\n"); /* TBD: Cleanup to allow fallback to MPS */ @@ -1378,7 +1298,7 @@ static void __init acpi_process_madt(void) smp_found_config = 1; } -#ifdef CONFIG_X86_64 +#ifdef CONFIG_ACPI_MADT_WAKEUP /* * Parse MADT MP Wake entry. 
*/ @@ -1897,3 +1817,14 @@ u64 x86_default_get_root_pointer(void) { return boot_params.acpi_rsdp_addr; } + +#ifdef CONFIG_XEN_PV +void __iomem *x86_acpi_os_ioremap(acpi_physical_address phys, acpi_size size) +{ + return ioremap_cache(phys, size); +} + +void __iomem * (*acpi_os_ioremap)(acpi_physical_address phys, acpi_size size) = + x86_acpi_os_ioremap; +EXPORT_SYMBOL_GPL(acpi_os_ioremap); +#endif diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c index 8d8752b44f11..d745dd586303 100644 --- a/arch/x86/kernel/acpi/cppc.c +++ b/arch/x86/kernel/acpi/cppc.c @@ -9,6 +9,17 @@ #include <asm/processor.h> #include <asm/topology.h> +#define CPPC_HIGHEST_PERF_PERFORMANCE 196 +#define CPPC_HIGHEST_PERF_PREFCORE 166 + +enum amd_pref_core { + AMD_PREF_CORE_UNKNOWN = 0, + AMD_PREF_CORE_SUPPORTED, + AMD_PREF_CORE_UNSUPPORTED, +}; +static enum amd_pref_core amd_pref_core_detected; +static u64 boost_numerator; + /* Refer to drivers/acpi/cppc_acpi.c for the description of functions */ bool cpc_supported_by_cpu(void) @@ -20,7 +31,7 @@ bool cpc_supported_by_cpu(void) (boot_cpu_data.x86_model >= 0x20 && boot_cpu_data.x86_model <= 0x2f))) return true; else if (boot_cpu_data.x86 == 0x17 && - boot_cpu_data.x86_model >= 0x70 && boot_cpu_data.x86_model <= 0x7f) + boot_cpu_data.x86_model >= 0x30 && boot_cpu_data.x86_model <= 0x7f) return true; return boot_cpu_has(X86_FEATURE_CPPC); } @@ -69,38 +80,37 @@ int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) static void amd_set_max_freq_ratio(void) { struct cppc_perf_caps perf_caps; - u64 highest_perf, nominal_perf; + u64 numerator, nominal_perf; u64 perf_ratio; int rc; rc = cppc_get_perf_caps(0, &perf_caps); if (rc) { - pr_debug("Could not retrieve perf counters (%d)\n", rc); + pr_warn("Could not retrieve perf counters (%d)\n", rc); return; } - highest_perf = amd_get_highest_perf(); + rc = amd_get_boost_ratio_numerator(0, &numerator); + if (rc) { + pr_warn("Could not retrieve highest performance (%d)\n", rc); + return; + } nominal_perf = perf_caps.nominal_perf; - if (!highest_perf || !nominal_perf) { - pr_debug("Could not retrieve highest or nominal performance\n"); + if (!nominal_perf) { + pr_warn("Could not retrieve nominal performance\n"); return; } - perf_ratio = div_u64(highest_perf * SCHED_CAPACITY_SCALE, nominal_perf); /* midpoint between max_boost and max_P */ - perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1; - if (!perf_ratio) { - pr_debug("Non-zero highest/nominal perf values led to a 0 ratio\n"); - return; - } + perf_ratio = (div_u64(numerator * SCHED_CAPACITY_SCALE, nominal_perf) + SCHED_CAPACITY_SCALE) >> 1; freq_invariance_set_perf_ratio(perf_ratio, false); } static DEFINE_MUTEX(freq_invariance_lock); -void init_freq_invariance_cppc(void) +static inline void init_freq_invariance_cppc(void) { static bool init_done; @@ -116,3 +126,171 @@ void init_freq_invariance_cppc(void) init_done = true; mutex_unlock(&freq_invariance_lock); } + +void acpi_processor_init_invariance_cppc(void) +{ + init_freq_invariance_cppc(); +} + +/* + * Get the highest performance register value. + * @cpu: CPU from which to get highest performance. + * @highest_perf: Return address for highest performance value. + * + * Return: 0 for success, negative error code otherwise. 
+ */ +int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf) +{ + u64 val; + int ret; + + if (cpu_feature_enabled(X86_FEATURE_CPPC)) { + ret = rdmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &val); + if (ret) + goto out; + + val = AMD_CPPC_HIGHEST_PERF(val); + } else { + ret = cppc_get_highest_perf(cpu, &val); + if (ret) + goto out; + } + + WRITE_ONCE(*highest_perf, (u32)val); +out: + return ret; +} +EXPORT_SYMBOL_GPL(amd_get_highest_perf); + +/** + * amd_detect_prefcore: Detect if CPUs in the system support preferred cores + * @detected: Output variable for the result of the detection. + * + * Determine whether CPUs in the system support preferred cores. On systems + * that support preferred cores, different highest perf values will be found + * on different cores. On other systems, the highest perf value will be the + * same on all cores. + * + * The result of the detection will be stored in the 'detected' parameter. + * + * Return: 0 for success, negative error code otherwise + */ +int amd_detect_prefcore(bool *detected) +{ + int cpu, count = 0; + u64 highest_perf[2] = {0}; + + if (WARN_ON(!detected)) + return -EINVAL; + + switch (amd_pref_core_detected) { + case AMD_PREF_CORE_SUPPORTED: + *detected = true; + return 0; + case AMD_PREF_CORE_UNSUPPORTED: + *detected = false; + return 0; + default: + break; + } + + for_each_present_cpu(cpu) { + u32 tmp; + int ret; + + ret = amd_get_highest_perf(cpu, &tmp); + if (ret) + return ret; + + if (!count || (count == 1 && tmp != highest_perf[0])) + highest_perf[count++] = tmp; + + if (count == 2) + break; + } + + *detected = (count == 2); + boost_numerator = highest_perf[0]; + + amd_pref_core_detected = *detected ? AMD_PREF_CORE_SUPPORTED : + AMD_PREF_CORE_UNSUPPORTED; + + pr_debug("AMD CPPC preferred core is %ssupported (highest perf: 0x%llx)\n", + *detected ? "" : "un", highest_perf[0]); + + return 0; +} +EXPORT_SYMBOL_GPL(amd_detect_prefcore); + +/** + * amd_get_boost_ratio_numerator: Get the numerator to use for boost ratio calculation + * @cpu: CPU to get numerator for. + * @numerator: Output variable for numerator. + * + * Determine the numerator to use for calculating the boost ratio on + * a CPU. On systems that support preferred cores, this will be a hardcoded + * value. On other systems this will the highest performance register value. + * + * If booting the system with amd-pstate enabled but preferred cores disabled then + * the correct boost numerator will be returned to match hardware capabilities + * even if the preferred cores scheduling hints are not enabled. + * + * Return: 0 for success, negative error code otherwise. + */ +int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator) +{ + enum x86_topology_cpu_type core_type = get_topology_cpu_type(&cpu_data(cpu)); + bool prefcore; + int ret; + u32 tmp; + + ret = amd_detect_prefcore(&prefcore); + if (ret) + return ret; + + /* without preferred cores, return the highest perf register value */ + if (!prefcore) { + *numerator = boost_numerator; + return 0; + } + + /* + * For AMD CPUs with Family ID 19H and Model ID range 0x70 to 0x7f, + * the highest performance level is set to 196. + * https://bugzilla.kernel.org/show_bug.cgi?id=218759 + */ + if (cpu_feature_enabled(X86_FEATURE_ZEN4)) { + switch (boot_cpu_data.x86_model) { + case 0x70 ... 
0x7f: + *numerator = CPPC_HIGHEST_PERF_PERFORMANCE; + return 0; + default: + break; + } + } + + /* detect if running on heterogeneous design */ + if (cpu_feature_enabled(X86_FEATURE_AMD_HETEROGENEOUS_CORES)) { + switch (core_type) { + case TOPO_CPU_TYPE_UNKNOWN: + pr_warn("Undefined core type found for cpu %d\n", cpu); + break; + case TOPO_CPU_TYPE_PERFORMANCE: + /* use the max scale for performance cores */ + *numerator = CPPC_HIGHEST_PERF_PERFORMANCE; + return 0; + case TOPO_CPU_TYPE_EFFICIENCY: + /* use the highest perf value for efficiency cores */ + ret = amd_get_highest_perf(cpu, &tmp); + if (ret) + return ret; + *numerator = tmp; + return 0; + } + } + + *numerator = CPPC_HIGHEST_PERF_PREFCORE; + + return 0; +} +EXPORT_SYMBOL_GPL(amd_get_boost_ratio_numerator); diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 401808b47af3..5854f0b8f0f1 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -13,6 +13,7 @@ #include <linux/sched.h> #include <acpi/processor.h> +#include <asm/cpuid.h> #include <asm/mwait.h> #include <asm/special_insns.h> @@ -128,11 +129,11 @@ static long acpi_processor_ffh_cstate_probe_cpu(void *_cx) unsigned int cstate_type; /* C-state type and not ACPI C-state type */ unsigned int num_cstate_subtype; - cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx); + cpuid(CPUID_LEAF_MWAIT, &eax, &ebx, &ecx, &edx); /* Check whether this particular cx_type (in CST) is supported or not */ - cstate_type = ((cx->address >> MWAIT_SUBSTATE_SIZE) & - MWAIT_CSTATE_MASK) + 1; + cstate_type = (((cx->address >> MWAIT_SUBSTATE_SIZE) & + MWAIT_CSTATE_MASK) + 1) & MWAIT_CSTATE_MASK; edx_part = edx >> (cstate_type * MWAIT_SUBSTATE_SIZE); num_cstate_subtype = edx_part & MWAIT_SUBSTATE_MASK; @@ -172,7 +173,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu, struct cpuinfo_x86 *c = &cpu_data(cpu); long retval; - if (!cpu_cstate_entry || c->cpuid_level < CPUID_MWAIT_LEAF) + if (!cpu_cstate_entry || c->cpuid_level < CPUID_LEAF_MWAIT) return -1; if (reg->bit_offset != NATIVE_CSTATE_BEYOND_HALT) diff --git a/arch/x86/kernel/acpi/madt_playdead.S b/arch/x86/kernel/acpi/madt_playdead.S new file mode 100644 index 000000000000..4e498d28cdc8 --- /dev/null +++ b/arch/x86/kernel/acpi/madt_playdead.S @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/linkage.h> +#include <asm/nospec-branch.h> +#include <asm/page_types.h> +#include <asm/processor-flags.h> + + .text + .align PAGE_SIZE + +/* + * asm_acpi_mp_play_dead() - Hand over control of the CPU to the BIOS + * + * rdi: Address of the ACPI MADT MPWK ResetVector + * rsi: PGD of the identity mapping + */ +SYM_FUNC_START(asm_acpi_mp_play_dead) + /* Turn off global entries. Following CR3 write will flush them. 
*/ + movq %cr4, %rdx + andq $~(X86_CR4_PGE), %rdx + movq %rdx, %cr4 + + /* Switch to identity mapping */ + movq %rsi, %cr3 + + /* Jump to reset vector */ + ANNOTATE_RETPOLINE_SAFE + jmp *%rdi +SYM_FUNC_END(asm_acpi_mp_play_dead) diff --git a/arch/x86/kernel/acpi/madt_wakeup.c b/arch/x86/kernel/acpi/madt_wakeup.c new file mode 100644 index 000000000000..d5ef6215583b --- /dev/null +++ b/arch/x86/kernel/acpi/madt_wakeup.c @@ -0,0 +1,292 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/acpi.h> +#include <linux/cpu.h> +#include <linux/delay.h> +#include <linux/io.h> +#include <linux/kexec.h> +#include <linux/memblock.h> +#include <linux/pgtable.h> +#include <linux/sched/hotplug.h> +#include <asm/apic.h> +#include <asm/barrier.h> +#include <asm/init.h> +#include <asm/intel_pt.h> +#include <asm/nmi.h> +#include <asm/processor.h> +#include <asm/reboot.h> + +/* Physical address of the Multiprocessor Wakeup Structure mailbox */ +static u64 acpi_mp_wake_mailbox_paddr __ro_after_init; + +/* Virtual address of the Multiprocessor Wakeup Structure mailbox */ +static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox; + +static u64 acpi_mp_pgd __ro_after_init; +static u64 acpi_mp_reset_vector_paddr __ro_after_init; + +static void acpi_mp_stop_this_cpu(void) +{ + asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr, acpi_mp_pgd); +} + +static void acpi_mp_play_dead(void) +{ + play_dead_common(); + asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr, acpi_mp_pgd); +} + +static void acpi_mp_cpu_die(unsigned int cpu) +{ + u32 apicid = per_cpu(x86_cpu_to_apicid, cpu); + unsigned long timeout; + + /* + * Use TEST mailbox command to prove that BIOS got control over + * the CPU before declaring it dead. + * + * BIOS has to clear 'command' field of the mailbox. + */ + acpi_mp_wake_mailbox->apic_id = apicid; + smp_store_release(&acpi_mp_wake_mailbox->command, + ACPI_MP_WAKE_COMMAND_TEST); + + /* Don't wait longer than a second. */ + timeout = USEC_PER_SEC; + while (READ_ONCE(acpi_mp_wake_mailbox->command) && --timeout) + udelay(1); + + if (!timeout) + pr_err("Failed to hand over CPU %d to BIOS\n", cpu); +} + +/* The argument is required to match type of x86_mapping_info::alloc_pgt_page */ +static void __init *alloc_pgt_page(void *dummy) +{ + return memblock_alloc(PAGE_SIZE, PAGE_SIZE); +} + +static void __init free_pgt_page(void *pgt, void *dummy) +{ + return memblock_free(pgt, PAGE_SIZE); +} + +/* + * Make sure asm_acpi_mp_play_dead() is present in the identity mapping at + * the same place as in the kernel page tables. asm_acpi_mp_play_dead() switches + * to the identity mapping and the function has be present at the same spot in + * the virtual address space before and after switching page tables. 
+ */ +static int __init init_transition_pgtable(pgd_t *pgd) +{ + pgprot_t prot = PAGE_KERNEL_EXEC_NOENC; + unsigned long vaddr, paddr; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + vaddr = (unsigned long)asm_acpi_mp_play_dead; + pgd += pgd_index(vaddr); + if (!pgd_present(*pgd)) { + p4d = (p4d_t *)alloc_pgt_page(NULL); + if (!p4d) + return -ENOMEM; + set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE)); + } + p4d = p4d_offset(pgd, vaddr); + if (!p4d_present(*p4d)) { + pud = (pud_t *)alloc_pgt_page(NULL); + if (!pud) + return -ENOMEM; + set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); + } + pud = pud_offset(p4d, vaddr); + if (!pud_present(*pud)) { + pmd = (pmd_t *)alloc_pgt_page(NULL); + if (!pmd) + return -ENOMEM; + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); + } + pmd = pmd_offset(pud, vaddr); + if (!pmd_present(*pmd)) { + pte = (pte_t *)alloc_pgt_page(NULL); + if (!pte) + return -ENOMEM; + set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); + } + pte = pte_offset_kernel(pmd, vaddr); + + paddr = __pa(vaddr); + set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot)); + + return 0; +} + +static int __init acpi_mp_setup_reset(u64 reset_vector) +{ + struct x86_mapping_info info = { + .alloc_pgt_page = alloc_pgt_page, + .free_pgt_page = free_pgt_page, + .page_flag = __PAGE_KERNEL_LARGE_EXEC, + .kernpg_flag = _KERNPG_TABLE_NOENC, + }; + pgd_t *pgd; + + pgd = alloc_pgt_page(NULL); + if (!pgd) + return -ENOMEM; + + for (int i = 0; i < nr_pfn_mapped; i++) { + unsigned long mstart, mend; + + mstart = pfn_mapped[i].start << PAGE_SHIFT; + mend = pfn_mapped[i].end << PAGE_SHIFT; + if (kernel_ident_mapping_init(&info, pgd, mstart, mend)) { + kernel_ident_mapping_free(&info, pgd); + return -ENOMEM; + } + } + + if (kernel_ident_mapping_init(&info, pgd, + PAGE_ALIGN_DOWN(reset_vector), + PAGE_ALIGN(reset_vector + 1))) { + kernel_ident_mapping_free(&info, pgd); + return -ENOMEM; + } + + if (init_transition_pgtable(pgd)) { + kernel_ident_mapping_free(&info, pgd); + return -ENOMEM; + } + + smp_ops.play_dead = acpi_mp_play_dead; + smp_ops.stop_this_cpu = acpi_mp_stop_this_cpu; + smp_ops.cpu_die = acpi_mp_cpu_die; + + acpi_mp_reset_vector_paddr = reset_vector; + acpi_mp_pgd = __pa(pgd); + + return 0; +} + +static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip) +{ + if (!acpi_mp_wake_mailbox_paddr) { + pr_warn_once("No MADT mailbox: cannot bringup secondary CPUs. Booting with kexec?\n"); + return -EOPNOTSUPP; + } + + /* + * Remap mailbox memory only for the first call to acpi_wakeup_cpu(). + * + * Wakeup of secondary CPUs is fully serialized in the core code. + * No need to protect acpi_mp_wake_mailbox from concurrent accesses. + */ + if (!acpi_mp_wake_mailbox) { + acpi_mp_wake_mailbox = memremap(acpi_mp_wake_mailbox_paddr, + sizeof(*acpi_mp_wake_mailbox), + MEMREMAP_WB); + } + + /* + * Mailbox memory is shared between the firmware and OS. Firmware will + * listen on mailbox command address, and once it receives the wakeup + * command, the CPU associated with the given apicid will be booted. + * + * The value of 'apic_id' and 'wakeup_vector' must be visible to the + * firmware before the wakeup command is visible. smp_store_release() + * ensures ordering and visibility. + */ + acpi_mp_wake_mailbox->apic_id = apicid; + acpi_mp_wake_mailbox->wakeup_vector = start_ip; + smp_store_release(&acpi_mp_wake_mailbox->command, + ACPI_MP_WAKE_COMMAND_WAKEUP); + + /* + * Wait for the CPU to wake up. + * + * The CPU being woken up is essentially in a spin loop waiting to be + * woken up. 
It should not take long for it wake up and acknowledge by + * zeroing out ->command. + * + * ACPI specification doesn't provide any guidance on how long kernel + * has to wait for a wake up acknowledgment. It also doesn't provide + * a way to cancel a wake up request if it takes too long. + * + * In TDX environment, the VMM has control over how long it takes to + * wake up secondary. It can postpone scheduling secondary vCPU + * indefinitely. Giving up on wake up request and reporting error opens + * possible attack vector for VMM: it can wake up a secondary CPU when + * kernel doesn't expect it. Wait until positive result of the wake up + * request. + */ + while (READ_ONCE(acpi_mp_wake_mailbox->command)) + cpu_relax(); + + return 0; +} + +static void acpi_mp_disable_offlining(struct acpi_madt_multiproc_wakeup *mp_wake) +{ + cpu_hotplug_disable_offlining(); + + /* + * ACPI MADT doesn't allow to offline a CPU after it was onlined. This + * limits kexec: the second kernel won't be able to use more than one CPU. + * + * To prevent a kexec kernel from onlining secondary CPUs invalidate the + * mailbox address in the ACPI MADT wakeup structure which prevents a + * kexec kernel to use it. + * + * This is safe as the booting kernel has the mailbox address cached + * already and acpi_wakeup_cpu() uses the cached value to bring up the + * secondary CPUs. + * + * Note: This is a Linux specific convention and not covered by the + * ACPI specification. + */ + mp_wake->mailbox_address = 0; +} + +int __init acpi_parse_mp_wake(union acpi_subtable_headers *header, + const unsigned long end) +{ + struct acpi_madt_multiproc_wakeup *mp_wake; + + mp_wake = (struct acpi_madt_multiproc_wakeup *)header; + + /* + * Cannot use the standard BAD_MADT_ENTRY() to sanity check the @mp_wake + * entry. 'sizeof (struct acpi_madt_multiproc_wakeup)' can be larger + * than the actual size of the MP wakeup entry in ACPI table because the + * 'reset_vector' is only available in the V1 MP wakeup structure. + */ + if (!mp_wake) + return -EINVAL; + if (end - (unsigned long)mp_wake < ACPI_MADT_MP_WAKEUP_SIZE_V0) + return -EINVAL; + if (mp_wake->header.length < ACPI_MADT_MP_WAKEUP_SIZE_V0) + return -EINVAL; + + acpi_table_print_madt_entry(&header->common); + + acpi_mp_wake_mailbox_paddr = mp_wake->mailbox_address; + + if (mp_wake->version >= ACPI_MADT_MP_WAKEUP_VERSION_V1 && + mp_wake->header.length >= ACPI_MADT_MP_WAKEUP_SIZE_V1) { + if (acpi_mp_setup_reset(mp_wake->reset_vector)) { + pr_warn("Failed to setup MADT reset vector\n"); + acpi_mp_disable_offlining(mp_wake); + } + } else { + /* + * CPU offlining requires version 1 of the ACPI MADT wakeup + * structure. 
+ */ + acpi_mp_disable_offlining(mp_wake); + } + + apic_update_callback(wakeup_secondary_cpu_64, acpi_wakeup_cpu); + + return 0; +} diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index d5d8a352eafa..b200a193beeb 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -17,7 +17,7 @@ * Hooray, we are in Long 64-bit mode (but still running in low memory) */ SYM_FUNC_START(wakeup_long64) - movq saved_magic, %rax + movq saved_magic(%rip), %rax movq $0x123456789abcdef0, %rdx cmpq %rdx, %rax je 2f @@ -33,14 +33,14 @@ SYM_FUNC_START(wakeup_long64) movw %ax, %es movw %ax, %fs movw %ax, %gs - movq saved_rsp, %rsp + movq saved_rsp(%rip), %rsp - movq saved_rbx, %rbx - movq saved_rdi, %rdi - movq saved_rsi, %rsi - movq saved_rbp, %rbp + movq saved_rbx(%rip), %rbx + movq saved_rdi(%rip), %rdi + movq saved_rsi(%rip), %rsi + movq saved_rbp(%rip), %rbp - movq saved_rip, %rax + movq saved_rip(%rip), %rax ANNOTATE_RETPOLINE_SAFE jmp *%rax SYM_FUNC_END(wakeup_long64) @@ -72,11 +72,11 @@ SYM_FUNC_START(do_suspend_lowlevel) movq $.Lresume_point, saved_rip(%rip) - movq %rsp, saved_rsp - movq %rbp, saved_rbp - movq %rbx, saved_rbx - movq %rdi, saved_rdi - movq %rsi, saved_rsi + movq %rsp, saved_rsp(%rip) + movq %rbp, saved_rbp(%rip) + movq %rbx, saved_rbx(%rip) + movq %rdi, saved_rdi(%rip) + movq %rsi, saved_rsi(%rip) addq $8, %rsp movl $3, %edi @@ -87,6 +87,7 @@ SYM_FUNC_START(do_suspend_lowlevel) .align 4 .Lresume_point: + ANNOTATE_NOENDBR /* We don't restore %rax, it must be 0 anyway */ movq $saved_context, %rax movq saved_context_cr4(%rax), %rbx diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index aae7456ece07..c71b575bf229 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -30,6 +30,7 @@ #include <asm/fixmap.h> #include <asm/paravirt.h> #include <asm/asm-prototypes.h> +#include <asm/cfi.h> int __read_mostly alternatives_patched; @@ -44,7 +45,7 @@ EXPORT_SYMBOL_GPL(alternatives_patched); #define DA_ENDBR 0x08 #define DA_SMP 0x10 -static unsigned int __initdata_or_module debug_alternative; +static unsigned int debug_alternative; static int __init debug_alt(char *str) { @@ -124,6 +125,20 @@ const unsigned char * const x86_nops[ASM_NOP_MAX+1] = }; /* + * Nomenclature for variable names to simplify and clarify this code and ease + * any potential staring at it: + * + * @instr: source address of the original instructions in the kernel text as + * generated by the compiler. + * + * @buf: temporary buffer on which the patching operates. This buffer is + * eventually text-poked into the kernel image. + * + * @replacement/@repl: pointer to the opcodes which are replacing @instr, located + * in the .altinstr_replacement section. + */ + +/* * Fill the buffer with a single effective instruction of size @len. * * In order not to issue an ORC stack depth tracking CFI entry (Call Frame Info) @@ -132,35 +147,34 @@ const unsigned char * const x86_nops[ASM_NOP_MAX+1] = * each single-byte NOPs). If @len to fill out is > ASM_NOP_MAX, pad with INT3 and * *jump* over instead of executing long and daft NOPs. 
*/ -static void __init_or_module add_nop(u8 *instr, unsigned int len) +static void add_nop(u8 *buf, unsigned int len) { - u8 *target = instr + len; + u8 *target = buf + len; if (!len) return; if (len <= ASM_NOP_MAX) { - memcpy(instr, x86_nops[len], len); + memcpy(buf, x86_nops[len], len); return; } if (len < 128) { - __text_gen_insn(instr, JMP8_INSN_OPCODE, instr, target, JMP8_INSN_SIZE); - instr += JMP8_INSN_SIZE; + __text_gen_insn(buf, JMP8_INSN_OPCODE, buf, target, JMP8_INSN_SIZE); + buf += JMP8_INSN_SIZE; } else { - __text_gen_insn(instr, JMP32_INSN_OPCODE, instr, target, JMP32_INSN_SIZE); - instr += JMP32_INSN_SIZE; + __text_gen_insn(buf, JMP32_INSN_OPCODE, buf, target, JMP32_INSN_SIZE); + buf += JMP32_INSN_SIZE; } - for (;instr < target; instr++) - *instr = INT3_INSN_OPCODE; + for (;buf < target; buf++) + *buf = INT3_INSN_OPCODE; } extern s32 __retpoline_sites[], __retpoline_sites_end[]; extern s32 __return_sites[], __return_sites_end[]; extern s32 __cfi_sites[], __cfi_sites_end[]; extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[]; -extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern s32 __smp_locks[], __smp_locks_end[]; void text_poke_early(void *addr, const void *opcode, size_t len); @@ -187,12 +201,12 @@ static bool insn_is_nop(struct insn *insn) * Find the offset of the first non-NOP instruction starting at @offset * but no further than @len. */ -static int skip_nops(u8 *instr, int offset, int len) +static int skip_nops(u8 *buf, int offset, int len) { struct insn insn; for (; offset < len; offset += insn.length) { - if (insn_decode_kernel(&insn, &instr[offset])) + if (insn_decode_kernel(&insn, &buf[offset])) break; if (!insn_is_nop(&insn)) @@ -203,66 +217,32 @@ static int skip_nops(u8 *instr, int offset, int len) } /* - * Optimize a sequence of NOPs, possibly preceded by an unconditional jump - * to the end of the NOP sequence into a single NOP. - */ -static bool __init_or_module -__optimize_nops(u8 *instr, size_t len, struct insn *insn, int *next, int *prev, int *target) -{ - int i = *next - insn->length; - - switch (insn->opcode.bytes[0]) { - case JMP8_INSN_OPCODE: - case JMP32_INSN_OPCODE: - *prev = i; - *target = *next + insn->immediate.value; - return false; - } - - if (insn_is_nop(insn)) { - int nop = i; - - *next = skip_nops(instr, *next, len); - if (*target && *next == *target) - nop = *prev; - - add_nop(instr + nop, *next - nop); - DUMP_BYTES(ALT, instr, len, "%px: [%d:%d) optimized NOPs: ", instr, nop, *next); - return true; - } - - *target = 0; - return false; -} - -/* * "noinline" to cause control flow change and thus invalidate I$ and * cause refetch after modification. */ -static void __init_or_module noinline optimize_nops(u8 *instr, size_t len) +static void noinline optimize_nops(const u8 * const instr, u8 *buf, size_t len) { - int prev, target = 0; - for (int next, i = 0; i < len; i = next) { struct insn insn; - if (insn_decode_kernel(&insn, &instr[i])) + if (insn_decode_kernel(&insn, &buf[i])) return; next = i + insn.length; - __optimize_nops(instr, len, &insn, &next, &prev, &target); - } -} + if (insn_is_nop(&insn)) { + int nop = i; -static void __init_or_module noinline optimize_nops_inplace(u8 *instr, size_t len) -{ - unsigned long flags; + /* Has the NOP already been optimized? 
*/ + if (i + insn.length == len) + return; - local_irq_save(flags); - optimize_nops(instr, len); - sync_core(); - local_irq_restore(flags); + next = skip_nops(buf, next, len); + + add_nop(buf + nop, next - nop); + DUMP_BYTES(ALT, buf, len, "%px: [%d:%d) optimized NOPs: ", instr, nop, next); + } + } } /* @@ -335,12 +315,9 @@ bool need_reloc(unsigned long offset, u8 *src, size_t src_len) return (target < src || target > src + src_len); } -static void __init_or_module noinline -apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len) +static void __apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len) { - int prev, target = 0; - - for (int next, i = 0; i < len; i = next) { + for (int next, i = 0; i < instrlen; i = next) { struct insn insn; if (WARN_ON_ONCE(insn_decode_kernel(&insn, &buf[i]))) @@ -348,9 +325,6 @@ apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len) next = i + insn.length; - if (__optimize_nops(buf, len, &insn, &next, &prev, &target)) - continue; - switch (insn.opcode.bytes[0]) { case 0x0f: if (insn.opcode.bytes[1] < 0x80 || @@ -362,10 +336,10 @@ apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len) case JMP8_INSN_OPCODE: case JMP32_INSN_OPCODE: case CALL_INSN_OPCODE: - if (need_reloc(next + insn.immediate.value, src, src_len)) { + if (need_reloc(next + insn.immediate.value, repl, repl_len)) { apply_reloc(insn.immediate.nbytes, buf + i + insn_offset_immediate(&insn), - src - dest); + repl - instr); } /* @@ -373,7 +347,7 @@ apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len) */ if (insn.opcode.bytes[0] == JMP32_INSN_OPCODE) { s32 imm = insn.immediate.value; - imm += src - dest; + imm += repl - instr; imm += JMP32_INSN_SIZE - JMP8_INSN_SIZE; if ((imm >> 31) == (imm >> 7)) { buf[i+0] = JMP8_INSN_OPCODE; @@ -386,15 +360,85 @@ apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len) } if (insn_rip_relative(&insn)) { - if (need_reloc(next + insn.displacement.value, src, src_len)) { + if (need_reloc(next + insn.displacement.value, repl, repl_len)) { apply_reloc(insn.displacement.nbytes, buf + i + insn_offset_displacement(&insn), - src - dest); + repl - instr); } } } } +void apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len) +{ + __apply_relocation(buf, instr, instrlen, repl, repl_len); + optimize_nops(instr, buf, instrlen); +} + +/* Low-level backend functions usable from alternative code replacements. */ +DEFINE_ASM_FUNC(nop_func, "", .entry.text); +EXPORT_SYMBOL_GPL(nop_func); + +noinstr void BUG_func(void) +{ + BUG(); +} +EXPORT_SYMBOL(BUG_func); + +#define CALL_RIP_REL_OPCODE 0xff +#define CALL_RIP_REL_MODRM 0x15 + +/* + * Rewrite the "call BUG_func" replacement to point to the target of the + * indirect pv_ops call "call *disp(%ip)". 
+ */ +static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a, + struct module *mod) +{ + u8 *wr_instr = module_writable_address(mod, instr); + void *target, *bug = &BUG_func; + s32 disp; + + if (a->replacementlen != 5 || insn_buff[0] != CALL_INSN_OPCODE) { + pr_err("ALT_FLAG_DIRECT_CALL set for a non-call replacement instruction\n"); + BUG(); + } + + if (a->instrlen != 6 || + wr_instr[0] != CALL_RIP_REL_OPCODE || + wr_instr[1] != CALL_RIP_REL_MODRM) { + pr_err("ALT_FLAG_DIRECT_CALL set for unrecognized indirect call\n"); + BUG(); + } + + /* Skip CALL_RIP_REL_OPCODE and CALL_RIP_REL_MODRM */ + disp = *(s32 *)(wr_instr + 2); +#ifdef CONFIG_X86_64 + /* ff 15 00 00 00 00 call *0x0(%rip) */ + /* target address is stored at "next instruction + disp". */ + target = *(void **)(instr + a->instrlen + disp); +#else + /* ff 15 00 00 00 00 call *0x0 */ + /* target address is stored at disp. */ + target = *(void **)disp; +#endif + if (!target) + target = bug; + + /* (BUG_func - .) + (target - BUG_func) := target - . */ + *(s32 *)(insn_buff + 1) += target - bug; + + if (target == &nop_func) + return 0; + + return 5; +} + +static inline u8 * instr_va(struct alt_instr *i) +{ + return (u8 *)&i->instr_offset + i->instr_offset; +} + /* * Replace instructions with better alternatives for this CPU type. This runs * before SMP is initialized to avoid SMP problems with self modifying code. @@ -406,11 +450,12 @@ apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len) * to refetch changed I$ lines. */ void __init_or_module noinline apply_alternatives(struct alt_instr *start, - struct alt_instr *end) + struct alt_instr *end, + struct module *mod) { - struct alt_instr *a; - u8 *instr, *replacement; u8 insn_buff[MAX_PATCH_LEN]; + u8 *instr, *replacement; + struct alt_instr *a, *b; DPRINTK(ALT, "alt table %px, -> %px", start, end); @@ -435,9 +480,25 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, */ for (a = start; a < end; a++) { int insn_buff_sz = 0; + u8 *wr_instr, *wr_replacement; + + /* + * In case of nested ALTERNATIVE()s the outer alternative might + * add more padding. To ensure consistent patching find the max + * padding for all alt_instr entries for this site (nested + * alternatives result in consecutive entries). + */ + for (b = a+1; b < end && instr_va(b) == instr_va(a); b++) { + u8 len = max(a->instrlen, b->instrlen); + a->instrlen = b->instrlen = len; + } + + instr = instr_va(a); + wr_instr = module_writable_address(mod, instr); - instr = (u8 *)&a->instr_offset + a->instr_offset; replacement = (u8 *)&a->repl_offset + a->repl_offset; + wr_replacement = module_writable_address(mod, replacement); + BUG_ON(a->instrlen > sizeof(insn_buff)); BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); @@ -448,30 +509,38 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, * patch if feature is *NOT* present. */ if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) { - optimize_nops_inplace(instr, a->instrlen); + memcpy(insn_buff, wr_instr, a->instrlen); + optimize_nops(instr, insn_buff, a->instrlen); + text_poke_early(wr_instr, insn_buff, a->instrlen); continue; } - DPRINTK(ALT, "feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)", - (a->flags & ALT_FLAG_NOT) ? "!" 
: "", + DPRINTK(ALT, "feat: %d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d) flags: 0x%x", a->cpuid >> 5, a->cpuid & 0x1f, instr, instr, a->instrlen, - replacement, a->replacementlen); + replacement, a->replacementlen, a->flags); - memcpy(insn_buff, replacement, a->replacementlen); + memcpy(insn_buff, wr_replacement, a->replacementlen); insn_buff_sz = a->replacementlen; + if (a->flags & ALT_FLAG_DIRECT_CALL) { + insn_buff_sz = alt_replace_call(instr, insn_buff, a, + mod); + if (insn_buff_sz < 0) + continue; + } + for (; insn_buff_sz < a->instrlen; insn_buff_sz++) insn_buff[insn_buff_sz] = 0x90; - apply_relocation(insn_buff, a->instrlen, instr, replacement, a->replacementlen); + apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen); - DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr); + DUMP_BYTES(ALT, wr_instr, a->instrlen, "%px: old_insn: ", instr); DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement); DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr); - text_poke_early(instr, insn_buff, insn_buff_sz); + text_poke_early(wr_instr, insn_buff, insn_buff_sz); } kasan_enable_current(); @@ -483,7 +552,7 @@ static inline bool is_jcc32(struct insn *insn) return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80; } -#if defined(CONFIG_RETPOLINE) && defined(CONFIG_OBJTOOL) +#if defined(CONFIG_MITIGATION_RETPOLINE) && defined(CONFIG_OBJTOOL) /* * CALL/JMP *%\reg @@ -647,8 +716,8 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) /* * The compiler is supposed to EMIT an INT3 after every unconditional * JMP instruction due to AMD BTC. However, if the compiler is too old - * or SLS isn't enabled, we still need an INT3 after indirect JMPs - * even on Intel. + * or MITIGATION_SLS isn't enabled, we still need an INT3 after + * indirect JMPs even on Intel. */ if (op == JMP32_INSN_OPCODE && i < insn->length) bytes[i++] = INT3_INSN_OPCODE; @@ -662,18 +731,20 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) /* * Generated by 'objtool --retpoline'. */ -void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) +void __init_or_module noinline apply_retpolines(s32 *start, s32 *end, + struct module *mod) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; + void *wr_addr = module_writable_address(mod, addr); struct insn insn; int len, ret; u8 bytes[16]; u8 op1, op2; - ret = insn_decode_kernel(&insn, addr); + ret = insn_decode_kernel(&insn, wr_addr); if (WARN_ON_ONCE(ret < 0)) continue; @@ -700,15 +771,15 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) len = patch_retpoline(addr, &insn, bytes); if (len == insn.length) { - optimize_nops(bytes, len); - DUMP_BYTES(RETPOLINE, ((u8*)addr), len, "%px: orig: ", addr); + optimize_nops(addr, bytes, len); + DUMP_BYTES(RETPOLINE, ((u8*)wr_addr), len, "%px: orig: ", addr); DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr); - text_poke_early(addr, bytes, len); + text_poke_early(wr_addr, bytes, len); } } } -#ifdef CONFIG_RETHUNK +#ifdef CONFIG_MITIGATION_RETHUNK /* * Rewrite the compiler generated return thunk tail-calls. 
@@ -739,7 +810,8 @@ static int patch_return(void *addr, struct insn *insn, u8 *bytes) return i; } -void __init_or_module noinline apply_returns(s32 *start, s32 *end) +void __init_or_module noinline apply_returns(s32 *start, s32 *end, + struct module *mod) { s32 *s; @@ -748,12 +820,13 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) for (s = start; s < end; s++) { void *dest = NULL, *addr = (void *)s + *s; + void *wr_addr = module_writable_address(mod, addr); struct insn insn; int len, ret; u8 bytes[16]; u8 op; - ret = insn_decode_kernel(&insn, addr); + ret = insn_decode_kernel(&insn, wr_addr); if (WARN_ON_ONCE(ret < 0)) continue; @@ -773,32 +846,35 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) len = patch_return(addr, &insn, bytes); if (len == insn.length) { - DUMP_BYTES(RET, ((u8*)addr), len, "%px: orig: ", addr); + DUMP_BYTES(RET, ((u8*)wr_addr), len, "%px: orig: ", addr); DUMP_BYTES(RET, ((u8*)bytes), len, "%px: repl: ", addr); - text_poke_early(addr, bytes, len); + text_poke_early(wr_addr, bytes, len); } } } #else -void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } -#endif /* CONFIG_RETHUNK */ +void __init_or_module noinline apply_returns(s32 *start, s32 *end, + struct module *mod) { } +#endif /* CONFIG_MITIGATION_RETHUNK */ -#else /* !CONFIG_RETPOLINE || !CONFIG_OBJTOOL */ +#else /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */ -void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { } -void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } +void __init_or_module noinline apply_retpolines(s32 *start, s32 *end, + struct module *mod) { } +void __init_or_module noinline apply_returns(s32 *start, s32 *end, + struct module *mod) { } -#endif /* CONFIG_RETPOLINE && CONFIG_OBJTOOL */ +#endif /* CONFIG_MITIGATION_RETPOLINE && CONFIG_OBJTOOL */ #ifdef CONFIG_X86_KERNEL_IBT -static void poison_cfi(void *addr); +static void poison_cfi(void *addr, void *wr_addr); -static void __init_or_module poison_endbr(void *addr, bool warn) +static void __init_or_module poison_endbr(void *addr, void *wr_addr, bool warn) { u32 endbr, poison = gen_endbr_poison(); - if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr))) + if (WARN_ON_ONCE(get_kernel_nofault(endbr, wr_addr))) return; if (!is_endbr(endbr)) { @@ -813,7 +889,7 @@ static void __init_or_module poison_endbr(void *addr, bool warn) */ DUMP_BYTES(ENDBR, ((u8*)addr), 4, "%px: orig: ", addr); DUMP_BYTES(ENDBR, ((u8*)&poison), 4, "%px: repl: ", addr); - text_poke_early(addr, &poison, 4); + text_poke_early(wr_addr, &poison, 4); } /* @@ -822,35 +898,103 @@ static void __init_or_module poison_endbr(void *addr, bool warn) * Seal the functions for indirect calls by clobbering the ENDBR instructions * and the kCFI hash value. 
*/ -void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end) +void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end, struct module *mod) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; + void *wr_addr = module_writable_address(mod, addr); - poison_endbr(addr, true); + poison_endbr(addr, wr_addr, true); if (IS_ENABLED(CONFIG_FINEIBT)) - poison_cfi(addr - 16); + poison_cfi(addr - 16, wr_addr - 16); } } #else -void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { } +void __init_or_module apply_seal_endbr(s32 *start, s32 *end, struct module *mod) { } #endif /* CONFIG_X86_KERNEL_IBT */ -#ifdef CONFIG_FINEIBT +#ifdef CONFIG_CFI_AUTO_DEFAULT +#define __CFI_DEFAULT CFI_AUTO +#elif defined(CONFIG_CFI_CLANG) +#define __CFI_DEFAULT CFI_KCFI +#else +#define __CFI_DEFAULT CFI_OFF +#endif -enum cfi_mode { - CFI_DEFAULT, - CFI_OFF, - CFI_KCFI, - CFI_FINEIBT, -}; +enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT; + +#ifdef CONFIG_CFI_CLANG +struct bpf_insn; + +/* Must match bpf_func_t / DEFINE_BPF_PROG_RUN() */ +extern unsigned int __bpf_prog_runX(const void *ctx, + const struct bpf_insn *insn); + +/* + * Force a reference to the external symbol so the compiler generates + * __kcfi_typid. + */ +__ADDRESSABLE(__bpf_prog_runX); + +/* u32 __ro_after_init cfi_bpf_hash = __kcfi_typeid___bpf_prog_runX; */ +asm ( +" .pushsection .data..ro_after_init,\"aw\",@progbits \n" +" .type cfi_bpf_hash,@object \n" +" .globl cfi_bpf_hash \n" +" .p2align 2, 0x0 \n" +"cfi_bpf_hash: \n" +" .long __kcfi_typeid___bpf_prog_runX \n" +" .size cfi_bpf_hash, 4 \n" +" .popsection \n" +); + +/* Must match bpf_callback_t */ +extern u64 __bpf_callback_fn(u64, u64, u64, u64, u64); + +__ADDRESSABLE(__bpf_callback_fn); + +/* u32 __ro_after_init cfi_bpf_subprog_hash = __kcfi_typeid___bpf_callback_fn; */ +asm ( +" .pushsection .data..ro_after_init,\"aw\",@progbits \n" +" .type cfi_bpf_subprog_hash,@object \n" +" .globl cfi_bpf_subprog_hash \n" +" .p2align 2, 0x0 \n" +"cfi_bpf_subprog_hash: \n" +" .long __kcfi_typeid___bpf_callback_fn \n" +" .size cfi_bpf_subprog_hash, 4 \n" +" .popsection \n" +); + +u32 cfi_get_func_hash(void *func) +{ + u32 hash; + + func -= cfi_get_offset(); + switch (cfi_mode) { + case CFI_FINEIBT: + func += 7; + break; + case CFI_KCFI: + func += 1; + break; + default: + return 0; + } + + if (get_kernel_nofault(hash, func)) + return 0; + + return hash; +} +#endif + +#ifdef CONFIG_FINEIBT -static enum cfi_mode cfi_mode __ro_after_init = CFI_DEFAULT; static bool cfi_rand __ro_after_init = true; static u32 cfi_seed __ro_after_init; @@ -883,7 +1027,7 @@ static __init int cfi_parse_cmdline(char *str) } if (!strcmp(str, "auto")) { - cfi_mode = CFI_DEFAULT; + cfi_mode = CFI_AUTO; } else if (!strcmp(str, "off")) { cfi_mode = CFI_OFF; cfi_rand = false; @@ -992,7 +1136,7 @@ static u32 decode_caller_hash(void *addr) } /* .retpoline_sites */ -static int cfi_disable_callers(s32 *start, s32 *end) +static int cfi_disable_callers(s32 *start, s32 *end, struct module *mod) { /* * Disable kCFI by patching in a JMP.d8, this leaves the hash immediate @@ -1004,20 +1148,23 @@ static int cfi_disable_callers(s32 *start, s32 *end) for (s = start; s < end; s++) { void *addr = (void *)s + *s; + void *wr_addr; u32 hash; addr -= fineibt_caller_size; - hash = decode_caller_hash(addr); + wr_addr = module_writable_address(mod, addr); + hash = decode_caller_hash(wr_addr); + if (!hash) /* nocfi callers */ continue; - text_poke_early(addr, jmp, 2); + text_poke_early(wr_addr, jmp, 2); } return 
0; } -static int cfi_enable_callers(s32 *start, s32 *end) +static int cfi_enable_callers(s32 *start, s32 *end, struct module *mod) { /* * Re-enable kCFI, undo what cfi_disable_callers() did. @@ -1027,106 +1174,115 @@ static int cfi_enable_callers(s32 *start, s32 *end) for (s = start; s < end; s++) { void *addr = (void *)s + *s; + void *wr_addr; u32 hash; addr -= fineibt_caller_size; - hash = decode_caller_hash(addr); + wr_addr = module_writable_address(mod, addr); + hash = decode_caller_hash(wr_addr); if (!hash) /* nocfi callers */ continue; - text_poke_early(addr, mov, 2); + text_poke_early(wr_addr, mov, 2); } return 0; } /* .cfi_sites */ -static int cfi_rand_preamble(s32 *start, s32 *end) +static int cfi_rand_preamble(s32 *start, s32 *end, struct module *mod) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; + void *wr_addr = module_writable_address(mod, addr); u32 hash; - hash = decode_preamble_hash(addr); + hash = decode_preamble_hash(wr_addr); if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", addr, addr, 5, addr)) return -EINVAL; hash = cfi_rehash(hash); - text_poke_early(addr + 1, &hash, 4); + text_poke_early(wr_addr + 1, &hash, 4); } return 0; } -static int cfi_rewrite_preamble(s32 *start, s32 *end) +static int cfi_rewrite_preamble(s32 *start, s32 *end, struct module *mod) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; + void *wr_addr = module_writable_address(mod, addr); u32 hash; - hash = decode_preamble_hash(addr); + hash = decode_preamble_hash(wr_addr); if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", addr, addr, 5, addr)) return -EINVAL; - text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size); - WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678); - text_poke_early(addr + fineibt_preamble_hash, &hash, 4); + text_poke_early(wr_addr, fineibt_preamble_start, fineibt_preamble_size); + WARN_ON(*(u32 *)(wr_addr + fineibt_preamble_hash) != 0x12345678); + text_poke_early(wr_addr + fineibt_preamble_hash, &hash, 4); } return 0; } -static void cfi_rewrite_endbr(s32 *start, s32 *end) +static void cfi_rewrite_endbr(s32 *start, s32 *end, struct module *mod) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; + void *wr_addr = module_writable_address(mod, addr); - poison_endbr(addr+16, false); + poison_endbr(addr + 16, wr_addr + 16, false); } } /* .retpoline_sites */ -static int cfi_rand_callers(s32 *start, s32 *end) +static int cfi_rand_callers(s32 *start, s32 *end, struct module *mod) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; + void *wr_addr; u32 hash; addr -= fineibt_caller_size; - hash = decode_caller_hash(addr); + wr_addr = module_writable_address(mod, addr); + hash = decode_caller_hash(wr_addr); if (hash) { hash = -cfi_rehash(hash); - text_poke_early(addr + 2, &hash, 4); + text_poke_early(wr_addr + 2, &hash, 4); } } return 0; } -static int cfi_rewrite_callers(s32 *start, s32 *end) +static int cfi_rewrite_callers(s32 *start, s32 *end, struct module *mod) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; + void *wr_addr; u32 hash; addr -= fineibt_caller_size; - hash = decode_caller_hash(addr); + wr_addr = module_writable_address(mod, addr); + hash = decode_caller_hash(wr_addr); if (hash) { - text_poke_early(addr, fineibt_caller_start, fineibt_caller_size); - WARN_ON(*(u32 *)(addr + fineibt_caller_hash) != 0x12345678); - text_poke_early(addr + fineibt_caller_hash, &hash, 4); + text_poke_early(wr_addr, fineibt_caller_start, 
fineibt_caller_size); + WARN_ON(*(u32 *)(wr_addr + fineibt_caller_hash) != 0x12345678); + text_poke_early(wr_addr + fineibt_caller_hash, &hash, 4); } /* rely on apply_retpolines() */ } @@ -1135,15 +1291,16 @@ static int cfi_rewrite_callers(s32 *start, s32 *end) } static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, - s32 *start_cfi, s32 *end_cfi, bool builtin) + s32 *start_cfi, s32 *end_cfi, struct module *mod) { + bool builtin = mod ? false : true; int ret; if (WARN_ONCE(fineibt_preamble_size != 16, "FineIBT preamble wrong size: %ld", fineibt_preamble_size)) return; - if (cfi_mode == CFI_DEFAULT) { + if (cfi_mode == CFI_AUTO) { cfi_mode = CFI_KCFI; if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT)) cfi_mode = CFI_FINEIBT; @@ -1154,19 +1311,22 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, * rewrite them. This disables all CFI. If this succeeds but any of the * later stages fails, we're without CFI. */ - ret = cfi_disable_callers(start_retpoline, end_retpoline); + ret = cfi_disable_callers(start_retpoline, end_retpoline, mod); if (ret) goto err; if (cfi_rand) { - if (builtin) + if (builtin) { cfi_seed = get_random_u32(); + cfi_bpf_hash = cfi_rehash(cfi_bpf_hash); + cfi_bpf_subprog_hash = cfi_rehash(cfi_bpf_subprog_hash); + } - ret = cfi_rand_preamble(start_cfi, end_cfi); + ret = cfi_rand_preamble(start_cfi, end_cfi, mod); if (ret) goto err; - ret = cfi_rand_callers(start_retpoline, end_retpoline); + ret = cfi_rand_callers(start_retpoline, end_retpoline, mod); if (ret) goto err; } @@ -1178,7 +1338,7 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, return; case CFI_KCFI: - ret = cfi_enable_callers(start_retpoline, end_retpoline); + ret = cfi_enable_callers(start_retpoline, end_retpoline, mod); if (ret) goto err; @@ -1188,17 +1348,17 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, case CFI_FINEIBT: /* place the FineIBT preamble at func()-16 */ - ret = cfi_rewrite_preamble(start_cfi, end_cfi); + ret = cfi_rewrite_preamble(start_cfi, end_cfi, mod); if (ret) goto err; /* rewrite the callers to target func()-16 */ - ret = cfi_rewrite_callers(start_retpoline, end_retpoline); + ret = cfi_rewrite_callers(start_retpoline, end_retpoline, mod); if (ret) goto err; /* now that nobody targets func()+0, remove ENDBR there */ - cfi_rewrite_endbr(start_cfi, end_cfi); + cfi_rewrite_endbr(start_cfi, end_cfi, mod); if (builtin) pr_info("Using FineIBT CFI\n"); @@ -1217,7 +1377,7 @@ static inline void poison_hash(void *addr) *(u32 *)addr = 0; } -static void poison_cfi(void *addr) +static void poison_cfi(void *addr, void *wr_addr) { switch (cfi_mode) { case CFI_FINEIBT: @@ -1229,8 +1389,8 @@ static void poison_cfi(void *addr) * ud2 * 1: nop */ - poison_endbr(addr, false); - poison_hash(addr + fineibt_preamble_hash); + poison_endbr(addr, wr_addr, false); + poison_hash(wr_addr + fineibt_preamble_hash); break; case CFI_KCFI: @@ -1239,7 +1399,7 @@ static void poison_cfi(void *addr) * movl $0, %eax * .skip 11, 0x90 */ - poison_hash(addr + 1); + poison_hash(wr_addr + 1); break; default: @@ -1250,22 +1410,21 @@ static void poison_cfi(void *addr) #else static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, - s32 *start_cfi, s32 *end_cfi, bool builtin) + s32 *start_cfi, s32 *end_cfi, struct module *mod) { } #ifdef CONFIG_X86_KERNEL_IBT -static void poison_cfi(void *addr) { } +static void poison_cfi(void *addr, void *wr_addr) { } #endif #endif void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, - s32 
*start_cfi, s32 *end_cfi) + s32 *start_cfi, s32 *end_cfi, struct module *mod) { return __apply_fineibt(start_retpoline, end_retpoline, - start_cfi, end_cfi, - /* .builtin = */ false); + start_cfi, end_cfi, mod); } #ifdef CONFIG_SMP @@ -1421,46 +1580,6 @@ int alternatives_text_reserved(void *start, void *end) } #endif /* CONFIG_SMP */ -#ifdef CONFIG_PARAVIRT - -/* Use this to add nops to a buffer, then text_poke the whole buffer. */ -static void __init_or_module add_nops(void *insns, unsigned int len) -{ - while (len > 0) { - unsigned int noplen = len; - if (noplen > ASM_NOP_MAX) - noplen = ASM_NOP_MAX; - memcpy(insns, x86_nops[noplen], noplen); - insns += noplen; - len -= noplen; - } -} - -void __init_or_module apply_paravirt(struct paravirt_patch_site *start, - struct paravirt_patch_site *end) -{ - struct paravirt_patch_site *p; - char insn_buff[MAX_PATCH_LEN]; - - for (p = start; p < end; p++) { - unsigned int used; - - BUG_ON(p->len > MAX_PATCH_LEN); - /* prep the buffer with the original instructions */ - memcpy(insn_buff, p->instr, p->len); - used = paravirt_patch(p->type, insn_buff, (unsigned long)p->instr, p->len); - - BUG_ON(used > p->len); - - /* Pad the rest with nops */ - add_nops(insn_buff + used, p->len - used); - text_poke_early(p->instr, insn_buff, p->len); - } -} -extern struct paravirt_patch_site __start_parainstructions[], - __stop_parainstructions[]; -#endif /* CONFIG_PARAVIRT */ - /* * Self-test for the INT3 based CALL emulation code. * @@ -1567,7 +1686,7 @@ static noinline void __init alt_reloc_selftest(void) */ asm_inline volatile ( ALTERNATIVE("", "lea %[mem], %%" _ASM_ARG1 "; call __alt_reloc_selftest;", X86_FEATURE_ALWAYS) - : /* output */ + : ASM_CALL_CONSTRAINT : [mem] "m" (__alt_reloc_selftest_addr) : _ASM_ARG1 ); @@ -1596,43 +1715,22 @@ void __init alternative_instructions(void) */ /* - * Paravirt patching and alternative patching can be combined to - * replace a function call with a short direct code sequence (e.g. - * by setting a constant return value instead of doing that in an - * external function). - * In order to make this work the following sequence is required: - * 1. set (artificial) features depending on used paravirt - * functions which can later influence alternative patching - * 2. apply paravirt patching (generally replacing an indirect - * function call with a direct one) - * 3. apply alternative patching (e.g. replacing a direct function - * call with a custom code sequence) - * Doing paravirt patching after alternative patching would clobber - * the optimization of the custom code with a function call again. + * Make sure to set (artificial) features depending on used paravirt + * functions which can later influence alternative patching. */ paravirt_set_cap(); - /* - * First patch paravirt functions, such that we overwrite the indirect - * call with the direct call. - */ - apply_paravirt(__parainstructions, __parainstructions_end); - __apply_fineibt(__retpoline_sites, __retpoline_sites_end, - __cfi_sites, __cfi_sites_end, true); + __cfi_sites, __cfi_sites_end, NULL); /* * Rewrite the retpolines, must be done before alternatives since * those can rewrite the retpoline thunks. 
*/ - apply_retpolines(__retpoline_sites, __retpoline_sites_end); - apply_returns(__return_sites, __return_sites_end); + apply_retpolines(__retpoline_sites, __retpoline_sites_end, NULL); + apply_returns(__return_sites, __return_sites_end, NULL); - /* - * Then patch alternatives, such that those paravirt calls that are in - * alternatives can be overwritten by their immediate fragments. - */ - apply_alternatives(__alt_instructions, __alt_instructions_end); + apply_alternatives(__alt_instructions, __alt_instructions_end, NULL); /* * Now all calls are established. Apply the call thunks if @@ -1643,7 +1741,7 @@ void __init alternative_instructions(void) /* * Seal all functions that do not have their address taken. */ - apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end); + apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end, NULL); #ifdef CONFIG_SMP /* Patch to UP if other cpus not imminent. */ @@ -1734,7 +1832,7 @@ static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm) * restoring the previous mm. */ if (this_cpu_read(cpu_tlbstate_shared.is_lazy)) - leave_mm(smp_processor_id()); + leave_mm(); temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm); switch_mm_irqs_off(NULL, mm, current); @@ -1756,11 +1854,18 @@ static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm) return temp_state; } +__ro_after_init struct mm_struct *poking_mm; +__ro_after_init unsigned long poking_addr; + static inline void unuse_temporary_mm(temp_mm_state_t prev_state) { lockdep_assert_irqs_disabled(); + switch_mm_irqs_off(NULL, prev_state.mm, current); + /* Clear the cpumask, to indicate no TLB flushing is needed anywhere */ + cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(poking_mm)); + /* * Restore the breakpoints if they were disabled before the temporary mm * was loaded. @@ -1769,9 +1874,6 @@ static inline void unuse_temporary_mm(temp_mm_state_t prev_state) hw_breakpoint_restore(); } -__ro_after_init struct mm_struct *poking_mm; -__ro_after_init unsigned long poking_addr; - static void text_poke_memcpy(void *dst, const void *src, size_t len) { memcpy(dst, src, len); @@ -1906,7 +2008,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l * Note that the caller must ensure that if the modified code is part of a * module, the module would not be removed during poking. This can be achieved * by registering a module notifier, and ordering module removal and patching - * trough a mutex. + * through a mutex. */ void *text_poke(void *addr, const void *opcode, size_t len) { diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index 56a917df410d..c884deca839b 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -676,7 +676,7 @@ static const struct dma_map_ops gart_dma_ops = { .get_sgtable = dma_common_get_sgtable, .dma_supported = dma_direct_supported, .get_required_mask = dma_direct_get_required_mask, - .alloc_pages = dma_direct_alloc_pages, + .alloc_pages_op = dma_direct_alloc_pages, .free_pages = dma_direct_free_pages, }; @@ -776,7 +776,7 @@ int __init gart_iommu_init(void) iommu_size >> PAGE_SHIFT); /* * Tricky. The GART table remaps the physical memory range, - * so the CPU wont notice potential aliases and if the memory + * so the CPU won't notice potential aliases and if the memory * is remapped to UC later on, we might surprise the PCI devices * with a stray writeout of a cacheline. 
So play it sure and * do an explicit, full-scale wbinvd() _after_ having marked all diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 053f6dcc6b2c..67e773744edb 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -15,61 +15,8 @@ #include <linux/pci_ids.h> #include <asm/amd_nb.h> -#define PCI_DEVICE_ID_AMD_17H_ROOT 0x1450 -#define PCI_DEVICE_ID_AMD_17H_M10H_ROOT 0x15d0 -#define PCI_DEVICE_ID_AMD_17H_M30H_ROOT 0x1480 -#define PCI_DEVICE_ID_AMD_17H_M60H_ROOT 0x1630 -#define PCI_DEVICE_ID_AMD_17H_MA0H_ROOT 0x14b5 -#define PCI_DEVICE_ID_AMD_19H_M10H_ROOT 0x14a4 -#define PCI_DEVICE_ID_AMD_19H_M40H_ROOT 0x14b5 -#define PCI_DEVICE_ID_AMD_19H_M60H_ROOT 0x14d8 -#define PCI_DEVICE_ID_AMD_19H_M70H_ROOT 0x14e8 -#define PCI_DEVICE_ID_AMD_1AH_M00H_ROOT 0x153a -#define PCI_DEVICE_ID_AMD_1AH_M20H_ROOT 0x1507 -#define PCI_DEVICE_ID_AMD_MI200_ROOT 0x14bb -#define PCI_DEVICE_ID_AMD_MI300_ROOT 0x14f8 - -#define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464 -#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec -#define PCI_DEVICE_ID_AMD_17H_M30H_DF_F4 0x1494 -#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F4 0x144c -#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F4 0x1444 -#define PCI_DEVICE_ID_AMD_17H_MA0H_DF_F4 0x1728 -#define PCI_DEVICE_ID_AMD_19H_DF_F4 0x1654 -#define PCI_DEVICE_ID_AMD_19H_M10H_DF_F4 0x14b1 -#define PCI_DEVICE_ID_AMD_19H_M40H_DF_F4 0x167d -#define PCI_DEVICE_ID_AMD_19H_M50H_DF_F4 0x166e -#define PCI_DEVICE_ID_AMD_19H_M60H_DF_F4 0x14e4 -#define PCI_DEVICE_ID_AMD_19H_M70H_DF_F4 0x14f4 -#define PCI_DEVICE_ID_AMD_19H_M78H_DF_F4 0x12fc -#define PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4 0x12c4 -#define PCI_DEVICE_ID_AMD_MI200_DF_F4 0x14d4 -#define PCI_DEVICE_ID_AMD_MI300_DF_F4 0x152c - -/* Protect the PCI config register pairs used for SMN. */ -static DEFINE_MUTEX(smn_mutex); - static u32 *flush_words; -static const struct pci_device_id amd_root_ids[] = { - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_ROOT) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_ROOT) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_ROOT) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_ROOT) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_MA0H_ROOT) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_ROOT) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_ROOT) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_ROOT) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_ROOT) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_ROOT) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_ROOT) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_ROOT) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_ROOT) }, - {} -}; - -#define PCI_DEVICE_ID_AMD_CNB17H_F4 0x1704 - static const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, @@ -79,65 +26,6 @@ static const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, 
PCI_DEVICE_ID_AMD_17H_M60H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_MA0H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M78H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_DF_F3) }, - {} -}; - -static const struct pci_device_id amd_nb_link_ids[] = { - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_MA0H_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M78H_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_DF_F4) }, - {} -}; - -static const struct pci_device_id hygon_root_ids[] = { - { PCI_DEVICE(PCI_VENDOR_ID_HYGON, PCI_DEVICE_ID_AMD_17H_ROOT) }, - {} -}; - -static const struct pci_device_id hygon_nb_misc_ids[] = { - { PCI_DEVICE(PCI_VENDOR_ID_HYGON, PCI_DEVICE_ID_AMD_17H_DF_F3) }, - {} -}; - -static const struct pci_device_id hygon_nb_link_ids[] = { - { PCI_DEVICE(PCI_VENDOR_ID_HYGON, PCI_DEVICE_ID_AMD_17H_DF_F4) }, {} }; @@ -168,135 +56,37 @@ struct amd_northbridge *node_to_amd_nb(int node) } EXPORT_SYMBOL_GPL(node_to_amd_nb); -static struct pci_dev *next_northbridge(struct pci_dev *dev, - const struct pci_device_id *ids) -{ - do { - dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); - if (!dev) - break; - } while (!pci_match_id(ids, dev)); - return dev; -} - -static int __amd_smn_rw(u16 node, u32 address, u32 *value, bool write) -{ - struct pci_dev *root; - int err = -ENODEV; - - if (node >= amd_northbridges.num) - goto out; - - root 
= node_to_amd_nb(node)->root; - if (!root) - goto out; - - mutex_lock(&smn_mutex); - - err = pci_write_config_dword(root, 0x60, address); - if (err) { - pr_warn("Error programming SMN address 0x%x.\n", address); - goto out_unlock; - } - - err = (write ? pci_write_config_dword(root, 0x64, *value) - : pci_read_config_dword(root, 0x64, value)); - if (err) - pr_warn("Error %s SMN address 0x%x.\n", - (write ? "writing to" : "reading from"), address); - -out_unlock: - mutex_unlock(&smn_mutex); - -out: - return err; -} - -int amd_smn_read(u16 node, u32 address, u32 *value) -{ - return __amd_smn_rw(node, address, value, false); -} -EXPORT_SYMBOL_GPL(amd_smn_read); - -int amd_smn_write(u16 node, u32 address, u32 value) -{ - return __amd_smn_rw(node, address, &value, true); -} -EXPORT_SYMBOL_GPL(amd_smn_write); - - static int amd_cache_northbridges(void) { - const struct pci_device_id *misc_ids = amd_nb_misc_ids; - const struct pci_device_id *link_ids = amd_nb_link_ids; - const struct pci_device_id *root_ids = amd_root_ids; - struct pci_dev *root, *misc, *link; struct amd_northbridge *nb; - u16 roots_per_misc = 0; - u16 misc_count = 0; - u16 root_count = 0; - u16 i, j; + u16 i; if (amd_northbridges.num) return 0; - if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { - root_ids = hygon_root_ids; - misc_ids = hygon_nb_misc_ids; - link_ids = hygon_nb_link_ids; - } - - misc = NULL; - while ((misc = next_northbridge(misc, misc_ids))) - misc_count++; - - if (!misc_count) - return -ENODEV; + amd_northbridges.num = amd_num_nodes(); - root = NULL; - while ((root = next_northbridge(root, root_ids))) - root_count++; - - if (root_count) { - roots_per_misc = root_count / misc_count; - - /* - * There should be _exactly_ N roots for each DF/SMN - * interface. - */ - if (!roots_per_misc || (root_count % roots_per_misc)) { - pr_info("Unsupported AMD DF/PCI configuration found\n"); - return -ENODEV; - } - } - - nb = kcalloc(misc_count, sizeof(struct amd_northbridge), GFP_KERNEL); + nb = kcalloc(amd_northbridges.num, sizeof(struct amd_northbridge), GFP_KERNEL); if (!nb) return -ENOMEM; amd_northbridges.nb = nb; - amd_northbridges.num = misc_count; - link = misc = root = NULL; for (i = 0; i < amd_northbridges.num; i++) { - node_to_amd_nb(i)->root = root = - next_northbridge(root, root_ids); - node_to_amd_nb(i)->misc = misc = - next_northbridge(misc, misc_ids); - node_to_amd_nb(i)->link = link = - next_northbridge(link, link_ids); + node_to_amd_nb(i)->root = amd_node_get_root(i); + node_to_amd_nb(i)->misc = amd_node_get_func(i, 3); /* - * If there are more PCI root devices than data fabric/ - * system management network interfaces, then the (N) - * PCI roots per DF/SMN interface are functionally the - * same (for DF/SMN access) and N-1 are redundant. N-1 - * PCI roots should be skipped per DF/SMN interface so - * the following DF/SMN interfaces get mapped to - * correct PCI roots. + * Each Northbridge must have a 'misc' device. + * If not, then uninitialize everything. 
*/ - for (j = 1; j < roots_per_misc; j++) - root = next_northbridge(root, root_ids); + if (!node_to_amd_nb(i)->misc) { + amd_northbridges.num = 0; + kfree(nb); + return -ENODEV; + } + + node_to_amd_nb(i)->link = amd_node_get_func(i, 4); } if (amd_gart_present()) @@ -334,7 +124,6 @@ static int amd_cache_northbridges(void) */ bool __init early_is_amd_nb(u32 device) { - const struct pci_device_id *misc_ids = amd_nb_misc_ids; const struct pci_device_id *id; u32 vendor = device & 0xffff; @@ -342,11 +131,11 @@ bool __init early_is_amd_nb(u32 device) boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) return false; - if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) - misc_ids = hygon_nb_misc_ids; + if (cpu_feature_enabled(X86_FEATURE_ZEN)) + return false; device >>= 16; - for (id = misc_ids; id->vendor; id++) + for (id = amd_nb_misc_ids; id->vendor; id++) if (vendor == id->vendor && device == id->device) return true; return false; @@ -354,7 +143,6 @@ bool __init early_is_amd_nb(u32 device) struct resource *amd_get_mmconfig_range(struct resource *res) { - u32 address; u64 base, msr; unsigned int segn_busn_bits; @@ -362,13 +150,11 @@ struct resource *amd_get_mmconfig_range(struct resource *res) boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) return NULL; - /* assume all cpus from fam10h have mmconfig */ - if (boot_cpu_data.x86 < 0x10) + /* Assume CPUs from Fam10h have mmconfig, although not all VMs do */ + if (boot_cpu_data.x86 < 0x10 || + rdmsrl_safe(MSR_FAM10H_MMIO_CONF_BASE, &msr)) return NULL; - address = MSR_FAM10H_MMIO_CONF_BASE; - rdmsrl(address, msr); - /* mmconfig is not enabled */ if (!(msr & FAM10H_MMIO_CONF_ENABLE)) return NULL; @@ -386,7 +172,7 @@ struct resource *amd_get_mmconfig_range(struct resource *res) int amd_get_subcaches(int cpu) { - struct pci_dev *link = node_to_amd_nb(topology_die_id(cpu))->link; + struct pci_dev *link = node_to_amd_nb(topology_amd_node_id(cpu))->link; unsigned int mask; if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) @@ -400,7 +186,7 @@ int amd_get_subcaches(int cpu) int amd_set_subcaches(int cpu, unsigned long mask) { static unsigned int reset, ban; - struct amd_northbridge *nb = node_to_amd_nb(topology_die_id(cpu)); + struct amd_northbridge *nb = node_to_amd_nb(topology_amd_node_id(cpu)); unsigned int reg; int cuid; @@ -531,6 +317,10 @@ static __init void fix_erratum_688(void) static __init int init_amd_nbs(void) { + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && + boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) + return 0; + amd_cache_northbridges(); amd_cache_gart(); diff --git a/arch/x86/kernel/amd_node.c b/arch/x86/kernel/amd_node.c new file mode 100644 index 000000000000..d2ec7fd555c5 --- /dev/null +++ b/arch/x86/kernel/amd_node.c @@ -0,0 +1,215 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * AMD Node helper functions and common defines + * + * Copyright (c) 2024, Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Author: Yazen Ghannam <Yazen.Ghannam@amd.com> + */ + +#include <asm/amd_node.h> + +/* + * AMD Nodes are a physical collection of I/O devices within an SoC. There can be one + * or more nodes per package. + * + * The nodes are software-visible through PCI config space. All nodes are enumerated + * on segment 0 bus 0. The device (slot) numbers range from 0x18 to 0x1F (maximum 8 + * nodes) with 0x18 corresponding to node 0, 0x19 to node 1, etc. Each node can be a + * multi-function device. + * + * On legacy systems, these node devices represent integrated Northbridge functionality. 
+ * On Zen-based systems, these node devices represent Data Fabric functionality. + * + * See "Configuration Space Accesses" section in BKDGs or + * "Processor x86 Core" -> "Configuration Space" section in PPRs. + */ +struct pci_dev *amd_node_get_func(u16 node, u8 func) +{ + if (node >= MAX_AMD_NUM_NODES) + return NULL; + + return pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(AMD_NODE0_PCI_SLOT + node, func)); +} + +#define DF_BLK_INST_CNT 0x040 +#define DF_CFG_ADDR_CNTL_LEGACY 0x084 +#define DF_CFG_ADDR_CNTL_DF4 0xC04 + +#define DF_MAJOR_REVISION GENMASK(27, 24) + +static u16 get_cfg_addr_cntl_offset(struct pci_dev *df_f0) +{ + u32 reg; + + /* + * Revision fields added for DF4 and later. + * + * Major revision of '0' is found pre-DF4. Field is Read-as-Zero. + */ + if (pci_read_config_dword(df_f0, DF_BLK_INST_CNT, &reg)) + return 0; + + if (reg & DF_MAJOR_REVISION) + return DF_CFG_ADDR_CNTL_DF4; + + return DF_CFG_ADDR_CNTL_LEGACY; +} + +struct pci_dev *amd_node_get_root(u16 node) +{ + struct pci_dev *root; + u16 cntl_off; + u8 bus; + + if (!cpu_feature_enabled(X86_FEATURE_ZEN)) + return NULL; + + /* + * D18F0xXXX [Config Address Control] (DF::CfgAddressCntl) + * Bits [7:0] (SecBusNum) holds the bus number of the root device for + * this Data Fabric instance. The segment, device, and function will be 0. + */ + struct pci_dev *df_f0 __free(pci_dev_put) = amd_node_get_func(node, 0); + if (!df_f0) + return NULL; + + cntl_off = get_cfg_addr_cntl_offset(df_f0); + if (!cntl_off) + return NULL; + + if (pci_read_config_byte(df_f0, cntl_off, &bus)) + return NULL; + + /* Grab the pointer for the actual root device instance. */ + root = pci_get_domain_bus_and_slot(0, bus, 0); + + pci_dbg(root, "is root for AMD node %u\n", node); + return root; +} + +static struct pci_dev **amd_roots; + +/* Protect the PCI config register pairs used for SMN. */ +static DEFINE_MUTEX(smn_mutex); + +#define SMN_INDEX_OFFSET 0x60 +#define SMN_DATA_OFFSET 0x64 + +/* + * SMN accesses may fail in ways that are difficult to detect here in the called + * functions amd_smn_read() and amd_smn_write(). Therefore, callers must do + * their own checking based on what behavior they expect. + * + * For SMN reads, the returned value may be zero if the register is Read-as-Zero. + * Or it may be a "PCI Error Response", e.g. all 0xFFs. The "PCI Error Response" + * can be checked here, and a proper error code can be returned. + * + * But the Read-as-Zero response cannot be verified here. A value of 0 may be + * correct in some cases, so callers must check that this is correct for the + * register/fields they need. + * + * For SMN writes, success can be determined through a "write and read back". + * However, this is not robust when done here. + * + * Possible issues: + * + * 1) Bits that are "Write-1-to-Clear". In this case, the read value should + * *not* match the write value. + * + * 2) Bits that are "Read-as-Zero"/"Writes-Ignored". This information cannot be + * known here. + * + * 3) Bits that are "Reserved / Set to 1". Ditto above. + * + * Callers of amd_smn_write() should do the "write and read back" check + * themselves, if needed. + * + * For #1, they can see if their target bits got cleared. + * + * For #2 and #3, they can check if their target bits got set as intended. + * + * This matches what is done for RDMSR/WRMSR. As long as there's no #GP, then + * the operation is considered a success, and the caller does their own + * checking.
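[Editor's aside, not part of the patch above: a minimal sketch of the caller-side "write and read back" check that this comment asks for, assuming only the amd_smn_read()/amd_smn_write() declarations shown in this hunk. MY_SMN_REG and MY_CTL_BIT are hypothetical placeholders, and <asm/amd_node.h> is assumed to export the two helpers.]

#include <linux/types.h>
#include <linux/bits.h>
#include <linux/errno.h>
#include <asm/amd_node.h>

#define MY_SMN_REG	0x12345		/* hypothetical SMN register address */
#define MY_CTL_BIT	BIT(0)		/* hypothetical read/write control bit */

static int my_smn_set_ctl_bit(u16 node)
{
	u32 val;
	int err;

	/* Read-modify-write the hypothetical control register over SMN. */
	err = amd_smn_read(node, MY_SMN_REG, &val);
	if (err)
		return err;

	err = amd_smn_write(node, MY_SMN_REG, val | MY_CTL_BIT);
	if (err)
		return err;

	/* Caller-side "write and read back" verification, as suggested above. */
	err = amd_smn_read(node, MY_SMN_REG, &val);
	if (err)
		return err;

	return (val & MY_CTL_BIT) ? 0 : -EIO;
}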
+ */ +static int __amd_smn_rw(u8 i_off, u8 d_off, u16 node, u32 address, u32 *value, bool write) +{ + struct pci_dev *root; + int err = -ENODEV; + + if (node >= amd_num_nodes()) + return err; + + root = amd_roots[node]; + if (!root) + return err; + + guard(mutex)(&smn_mutex); + + err = pci_write_config_dword(root, i_off, address); + if (err) { + pr_warn("Error programming SMN address 0x%x.\n", address); + return pcibios_err_to_errno(err); + } + + err = (write ? pci_write_config_dword(root, d_off, *value) + : pci_read_config_dword(root, d_off, value)); + + return pcibios_err_to_errno(err); +} + +int __must_check amd_smn_read(u16 node, u32 address, u32 *value) +{ + int err = __amd_smn_rw(SMN_INDEX_OFFSET, SMN_DATA_OFFSET, node, address, value, false); + + if (PCI_POSSIBLE_ERROR(*value)) { + err = -ENODEV; + *value = 0; + } + + return err; +} +EXPORT_SYMBOL_GPL(amd_smn_read); + +int __must_check amd_smn_write(u16 node, u32 address, u32 value) +{ + return __amd_smn_rw(SMN_INDEX_OFFSET, SMN_DATA_OFFSET, node, address, &value, true); +} +EXPORT_SYMBOL_GPL(amd_smn_write); + +static int amd_cache_roots(void) +{ + u16 node, num_nodes = amd_num_nodes(); + + amd_roots = kcalloc(num_nodes, sizeof(*amd_roots), GFP_KERNEL); + if (!amd_roots) + return -ENOMEM; + + for (node = 0; node < num_nodes; node++) + amd_roots[node] = amd_node_get_root(node); + + return 0; +} + +static int __init amd_smn_init(void) +{ + int err; + + if (!cpu_feature_enabled(X86_FEATURE_ZEN)) + return 0; + + guard(mutex)(&smn_mutex); + + if (amd_roots) + return 0; + + err = amd_cache_roots(); + if (err) + return err; + + return 0; +} + +fs_initcall(amd_smn_init); diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 4feaa670d578..89c0c8a3fc7e 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -259,10 +259,9 @@ static u32 __init search_agp_bridge(u32 *order, int *valid_agp) order); } - /* No multi-function device? */ type = read_pci_config_byte(bus, slot, func, PCI_HEADER_TYPE); - if (!(type & 0x80)) + if (!(type & PCI_HEADER_TYPE_MFD)) break; } } diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 2ee867d796d9..3bf0487cf3b7 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -4,7 +4,7 @@ # # Leads to non-deterministic coverage that is not a function of syscall inputs. -# In particualr, smp_apic_timer_interrupt() is called in random places. +# In particular, smp_apic_timer_interrupt() is called in random places. 
KCOV_INSTRUMENT := n obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_common.o apic_noop.o ipi.o vector.o init.o diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 41093cf20acd..e893dc6f11c1 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -19,6 +19,7 @@ #include <linux/kernel_stat.h> #include <linux/mc146818rtc.h> #include <linux/acpi_pmtmr.h> +#include <linux/bitmap.h> #include <linux/clockchips.h> #include <linux/interrupt.h> #include <linux/memblock.h> @@ -67,10 +68,6 @@ #include "local.h" -unsigned int num_processors; - -unsigned disabled_cpus; - /* Processor that is doing the boot up */ u32 boot_cpu_physical_apicid __ro_after_init = BAD_APICID; EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid); @@ -78,18 +75,6 @@ EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid); u8 boot_cpu_apic_version __ro_after_init; /* - * Bitmask of physically existing CPUs: - */ -physid_mask_t phys_cpu_present_map; - -/* - * Processor to be disabled specified by kernel parameter - * disable_cpu_apicid=<int>, mostly used for the kdump 2nd kernel to - * avoid undefined behaviour caused by sending INIT from AP to BSP. - */ -static u32 disabled_cpu_apicid __ro_after_init = BAD_APICID; - -/* * This variable controls which CPUs receive external NMIs. By default, * external NMIs are delivered only to the BSP. */ @@ -108,14 +93,6 @@ static inline bool apic_accessible(void) return x2apic_mode || apic_mmio_base; } -/* - * Map cpu index to physical APIC ID - */ -DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_apicid, BAD_APICID); -DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid, U32_MAX); -EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); -EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_acpiid); - #ifdef CONFIG_X86_32 /* Local APIC was disabled by the BIOS and enabled by the kernel */ static int enabled_via_apicbase __ro_after_init; @@ -261,16 +238,6 @@ u64 native_apic_icr_read(void) return icr1 | ((u64)icr2 << 32); } -#ifdef CONFIG_X86_32 -/** - * get_physical_broadcast - Get number of physical broadcast IDs - */ -int get_physical_broadcast(void) -{ - return modern_apic() ? 0xff : 0xf; -} -#endif - /** * lapic_get_maxlvt - get the maximum number of local vector table entries */ @@ -473,7 +440,19 @@ static int lapic_timer_shutdown(struct clock_event_device *evt) v = apic_read(APIC_LVTT); v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); apic_write(APIC_LVTT, v); - apic_write(APIC_TMICT, 0); + + /* + * Setting APIC_LVT_MASKED (above) should be enough to tell + * the hardware that this timer will never fire. But AMD + * erratum 411 and some Intel CPU behavior circa 2024 say + * otherwise. 
Time for belt and suspenders programming: mask + * the timer _and_ zero the counter registers: + */ + if (v & APIC_LVT_TIMER_TSCDEADLINE) + wrmsrl(MSR_IA32_TSC_DEADLINE, 0); + else + apic_write(APIC_TMICT, 0); + return 0; } @@ -530,32 +509,32 @@ static struct clock_event_device lapic_clockevent = { static DEFINE_PER_CPU(struct clock_event_device, lapic_events); static const struct x86_cpu_id deadline_match[] __initconst = { - X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(HASWELL_X, X86_STEPPINGS(0x2, 0x2), 0x3a), /* EP */ - X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(HASWELL_X, X86_STEPPINGS(0x4, 0x4), 0x0f), /* EX */ + X86_MATCH_VFM_STEPS(INTEL_HASWELL_X, 0x2, 0x2, 0x3a), /* EP */ + X86_MATCH_VFM_STEPS(INTEL_HASWELL_X, 0x4, 0x4, 0x0f), /* EX */ - X86_MATCH_INTEL_FAM6_MODEL( BROADWELL_X, 0x0b000020), + X86_MATCH_VFM(INTEL_BROADWELL_X, 0x0b000020), - X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x2, 0x2), 0x00000011), - X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x3, 0x3), 0x0700000e), - X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x4, 0x4), 0x0f00000c), - X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x5, 0x5), 0x0e000003), + X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 0x2, 0x2, 0x00000011), + X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 0x3, 0x3, 0x0700000e), + X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 0x4, 0x4, 0x0f00000c), + X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 0x5, 0x5, 0x0e000003), - X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SKYLAKE_X, X86_STEPPINGS(0x3, 0x3), 0x01000136), - X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SKYLAKE_X, X86_STEPPINGS(0x4, 0x4), 0x02000014), - X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SKYLAKE_X, X86_STEPPINGS(0x5, 0xf), 0), + X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 0x3, 0x3, 0x01000136), + X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 0x4, 0x4, 0x02000014), + X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 0x5, 0xf, 0), - X86_MATCH_INTEL_FAM6_MODEL( HASWELL, 0x22), - X86_MATCH_INTEL_FAM6_MODEL( HASWELL_L, 0x20), - X86_MATCH_INTEL_FAM6_MODEL( HASWELL_G, 0x17), + X86_MATCH_VFM(INTEL_HASWELL, 0x22), + X86_MATCH_VFM(INTEL_HASWELL_L, 0x20), + X86_MATCH_VFM(INTEL_HASWELL_G, 0x17), - X86_MATCH_INTEL_FAM6_MODEL( BROADWELL, 0x25), - X86_MATCH_INTEL_FAM6_MODEL( BROADWELL_G, 0x17), + X86_MATCH_VFM(INTEL_BROADWELL, 0x25), + X86_MATCH_VFM(INTEL_BROADWELL_G, 0x17), - X86_MATCH_INTEL_FAM6_MODEL( SKYLAKE_L, 0xb2), - X86_MATCH_INTEL_FAM6_MODEL( SKYLAKE, 0xb2), + X86_MATCH_VFM(INTEL_SKYLAKE_L, 0xb2), + X86_MATCH_VFM(INTEL_SKYLAKE, 0xb2), - X86_MATCH_INTEL_FAM6_MODEL( KABYLAKE_L, 0x52), - X86_MATCH_INTEL_FAM6_MODEL( KABYLAKE, 0x52), + X86_MATCH_VFM(INTEL_KABYLAKE_L, 0x52), + X86_MATCH_VFM(INTEL_KABYLAKE, 0x52), {}, }; @@ -664,7 +643,7 @@ void lapic_update_tsc_freq(void) static __initdata int lapic_cal_loops = -1; static __initdata long lapic_cal_t1, lapic_cal_t2; static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2; -static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2; +static __initdata u32 lapic_cal_pm1, lapic_cal_pm2; static __initdata unsigned long lapic_cal_j1, lapic_cal_j2; /* @@ -674,7 +653,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev) { unsigned long long tsc = 0; long tapic = apic_read(APIC_TMCCT); - unsigned long pm = acpi_pm_read_early(); + u32 pm = acpi_pm_read_early(); if (boot_cpu_has(X86_FEATURE_TSC)) tsc = rdtsc(); @@ -699,7 +678,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev) } static int __init -calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc) 
+calibrate_by_pmtimer(u32 deltapm, long *delta, long *deltatsc) { const long pm_100ms = PMTMR_TICKS_PER_SEC / 10; const long pm_thresh = pm_100ms / 100; @@ -710,7 +689,7 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc) return -1; #endif - apic_printk(APIC_VERBOSE, "... PM-Timer delta = %ld\n", deltapm); + apic_pr_verbose("... PM-Timer delta = %u\n", deltapm); /* Check, if the PM timer is available */ if (!deltapm) @@ -720,14 +699,14 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc) if (deltapm > (pm_100ms - pm_thresh) && deltapm < (pm_100ms + pm_thresh)) { - apic_printk(APIC_VERBOSE, "... PM-Timer result ok\n"); + apic_pr_verbose("... PM-Timer result ok\n"); return 0; } res = (((u64)deltapm) * mult) >> 22; do_div(res, 1000000); - pr_warn("APIC calibration not consistent " - "with PM-Timer: %ldms instead of 100ms\n", (long)res); + pr_warn("APIC calibration not consistent with PM-Timer: %ldms instead of 100ms\n", + (long)res); /* Correct the lapic counter value */ res = (((u64)(*delta)) * pm_100ms); @@ -740,9 +719,8 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc) if (boot_cpu_has(X86_FEATURE_TSC)) { res = (((u64)(*deltatsc)) * pm_100ms); do_div(res, deltapm); - apic_printk(APIC_VERBOSE, "TSC delta adjusted to " - "PM-Timer: %lu (%ld)\n", - (unsigned long)res, *deltatsc); + apic_pr_verbose("TSC delta adjusted to PM-Timer: %lu (%ld)\n", + (unsigned long)res, *deltatsc); *deltatsc = (long)res; } @@ -782,7 +760,7 @@ bool __init apic_needs_pit(void) /* * If interrupt delivery mode is legacy PIC or virtual wire without - * configuration, the local APIC timer wont be set up. Make sure + * configuration, the local APIC timer won't be set up. Make sure * that the PIT is initialized. */ if (apic_intr_mode == APIC_PIC || @@ -825,8 +803,7 @@ static int __init calibrate_APIC_clock(void) * in the clockevent structure and return. */ if (!lapic_init_clockevent()) { - apic_printk(APIC_VERBOSE, "lapic timer already calibrated %d\n", - lapic_timer_period); + apic_pr_verbose("lapic timer already calibrated %d\n", lapic_timer_period); /* * Direct calibration methods must have an always running * local APIC timer, no need for broadcast timer. @@ -835,8 +812,7 @@ static int __init calibrate_APIC_clock(void) return 0; } - apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" - "calibrating APIC timer ...\n"); + apic_pr_verbose("Using local APIC timer interrupts. Calibrating APIC timer ...\n"); /* * There are platforms w/o global clockevent devices. Instead of @@ -899,7 +875,7 @@ static int __init calibrate_APIC_clock(void) /* Build delta t1-t2 as apic timer counts down */ delta = lapic_cal_t1 - lapic_cal_t2; - apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta); + apic_pr_verbose("... lapic delta = %ld\n", delta); deltatsc = (long)(lapic_cal_tsc2 - lapic_cal_tsc1); @@ -910,22 +886,19 @@ static int __init calibrate_APIC_clock(void) lapic_timer_period = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; lapic_init_clockevent(); - apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); - apic_printk(APIC_VERBOSE, "..... mult: %u\n", lapic_clockevent.mult); - apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", - lapic_timer_period); + apic_pr_verbose("..... delta %ld\n", delta); + apic_pr_verbose("..... mult: %u\n", lapic_clockevent.mult); + apic_pr_verbose("..... calibration result: %u\n", lapic_timer_period); if (boot_cpu_has(X86_FEATURE_TSC)) { - apic_printk(APIC_VERBOSE, "..... 
CPU clock speed is " - "%ld.%04ld MHz.\n", - (deltatsc / LAPIC_CAL_LOOPS) / (1000000 / HZ), - (deltatsc / LAPIC_CAL_LOOPS) % (1000000 / HZ)); + apic_pr_verbose("..... CPU clock speed is %ld.%04ld MHz.\n", + (deltatsc / LAPIC_CAL_LOOPS) / (1000000 / HZ), + (deltatsc / LAPIC_CAL_LOOPS) % (1000000 / HZ)); } - apic_printk(APIC_VERBOSE, "..... host bus clock speed is " - "%u.%04u MHz.\n", - lapic_timer_period / (1000000 / HZ), - lapic_timer_period % (1000000 / HZ)); + apic_pr_verbose("..... host bus clock speed is %u.%04u MHz.\n", + lapic_timer_period / (1000000 / HZ), + lapic_timer_period % (1000000 / HZ)); /* * Do a sanity check on the APIC calibration result @@ -944,7 +917,7 @@ static int __init calibrate_APIC_clock(void) * available. */ if (!pm_referenced && global_clock_event) { - apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); + apic_pr_verbose("... verify APIC timer\n"); /* * Setup the apic timer manually @@ -965,11 +938,11 @@ static int __init calibrate_APIC_clock(void) /* Jiffies delta */ deltaj = lapic_cal_j2 - lapic_cal_j1; - apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj); + apic_pr_verbose("... jiffies delta = %lu\n", deltaj); /* Check, if the jiffies result is consistent */ if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2) - apic_printk(APIC_VERBOSE, "... jiffies result ok\n"); + apic_pr_verbose("... jiffies result ok\n"); else levt->features |= CLOCK_EVT_FEAT_DUMMY; } @@ -1254,9 +1227,8 @@ void __init sync_Arb_IDs(void) */ apic_wait_icr_idle(); - apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); - apic_write(APIC_ICR, APIC_DEST_ALLINC | - APIC_INT_LEVELTRIG | APIC_DM_INIT); + apic_pr_debug("Synchronizing Arb IDs.\n"); + apic_write(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | APIC_DM_INIT); } enum apic_intr_mode_id apic_intr_mode __ro_after_init; @@ -1442,10 +1414,10 @@ static void lapic_setup_esr(void) if (maxlvt > 3) apic_write(APIC_ESR, 0); value = apic_read(APIC_ESR); - if (value != oldvalue) - apic_printk(APIC_VERBOSE, "ESR value before enabling " - "vector: 0x%08x after: 0x%08x\n", - oldvalue, value); + if (value != oldvalue) { + apic_pr_verbose("ESR value before enabling vector: 0x%08x after: 0x%08x\n", + oldvalue, value); + } } #define APIC_IR_REGS APIC_ISR_NR @@ -1549,9 +1521,6 @@ static void setup_local_APIC(void) apic_write(APIC_ESR, 0); } #endif - /* Validate that the APIC is registered if required */ - BUG_ON(apic->apic_id_registered && !apic->apic_id_registered()); - /* * Intel recommends to set DFR, LDR and TPR before enabling * an APIC. See e.g. 
"AP-388 82489DX User's Manual" (Intel @@ -1635,10 +1604,10 @@ static void setup_local_APIC(void) value = apic_read(APIC_LVT0) & APIC_LVT_MASKED; if (!cpu && (pic_mode || !value || ioapic_is_disabled)) { value = APIC_DM_EXTINT; - apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", cpu); + apic_pr_verbose("Enabled ExtINT on CPU#%d\n", cpu); } else { value = APIC_DM_EXTINT | APIC_LVT_MASKED; - apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", cpu); + apic_pr_verbose("Masked ExtINT on CPU#%d\n", cpu); } apic_write(APIC_LVT0, value); @@ -1690,8 +1659,6 @@ void apic_ap_setup(void) end_local_APIC_setup(); } -static __init void cpu_set_boot_apic(void); - static __init void apic_read_boot_cpu_id(bool x2apic) { /* @@ -1706,7 +1673,8 @@ static __init void apic_read_boot_cpu_id(bool x2apic) boot_cpu_physical_apicid = read_apic_id(); boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR)); } - cpu_set_boot_apic(); + topology_register_boot_apic(boot_cpu_physical_apicid); + x86_32_probe_bigsmp_early(); } #ifdef CONFIG_X86_X2APIC @@ -1724,11 +1692,11 @@ static int x2apic_state; static bool x2apic_hw_locked(void) { - u64 ia32_cap; + u64 x86_arch_cap_msr; u64 msr; - ia32_cap = x86_read_arch_cap_msr(); - if (ia32_cap & ARCH_CAP_XAPIC_DISABLE) { + x86_arch_cap_msr = x86_read_arch_cap_msr(); + if (x86_arch_cap_msr & ARCH_CAP_XAPIC_DISABLE) { rdmsrl(MSR_IA32_XAPIC_DISABLE_STATUS, msr); return (msr & LEGACY_XAPIC_DISABLED); } @@ -1808,16 +1776,13 @@ void x2apic_setup(void) __x2apic_enable(); } -static __init void apic_set_fixmap(void); +static __init void apic_set_fixmap(bool read_apic); static __init void x2apic_disable(void) { - u32 x2apic_id, state = x2apic_state; - - x2apic_mode = 0; - x2apic_state = X2APIC_DISABLED; + u32 x2apic_id; - if (state != X2APIC_ON) + if (x2apic_state < X2APIC_ON) return; x2apic_id = read_apic_id(); @@ -1830,7 +1795,16 @@ static __init void x2apic_disable(void) } __x2apic_disable(); - apic_set_fixmap(); + + x2apic_mode = 0; + x2apic_state = X2APIC_DISABLED; + + /* + * Don't reread the APIC ID as it was already done from + * check_x2apic() and the APIC driver still is a x2APIC variant, + * which fails to do the read after x2APIC was disabled. 
+ */ + apic_set_fixmap(false); } static __init void x2apic_enable(void) @@ -2091,17 +2065,16 @@ void __init init_apic_mappings(void) pr_info("APIC: disable apic facility\n"); apic_disable(); } - num_processors = 1; } } -static __init void apic_set_fixmap(void) +static __init void apic_set_fixmap(bool read_apic) { set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); apic_mmio_base = APIC_BASE; - apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", - apic_mmio_base, mp_lapic_addr); - apic_read_boot_cpu_id(false); + apic_pr_verbose("Mapped APIC to %16lx (%16lx)\n", apic_mmio_base, mp_lapic_addr); + if (read_apic) + apic_read_boot_cpu_id(false); } void __init register_lapic_address(unsigned long address) @@ -2111,7 +2084,7 @@ void __init register_lapic_address(unsigned long address) mp_lapic_addr = address; if (!x2apic_mode) - apic_set_fixmap(); + apic_set_fixmap(true); } /* @@ -2202,18 +2175,17 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_error_interrupt) apic_eoi(); atomic_inc(&irq_err_count); - apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x", - smp_processor_id(), v); + apic_pr_debug("APIC error on CPU%d: %02x", smp_processor_id(), v); v &= 0xff; while (v) { if (v & 0x1) - apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]); + apic_pr_debug_cont(" : %s", error_interrupt_reason[i]); i++; v >>= 1; } - apic_printk(APIC_DEBUG, KERN_CONT "\n"); + apic_pr_debug_cont("\n"); trace_error_apic_exit(ERROR_APIC_VECTOR); } @@ -2233,8 +2205,7 @@ static void __init connect_bsp_APIC(void) * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's * local APIC to INT and NMI lines. */ - apic_printk(APIC_VERBOSE, "leaving PIC mode, " - "enabling APIC mode.\n"); + apic_pr_verbose("Leaving PIC mode, enabling APIC mode.\n"); imcr_pic_to_apic(); } #endif @@ -2259,8 +2230,7 @@ void disconnect_bsp_APIC(int virt_wire_setup) * IPIs, won't work beyond this point! The only exception are * INIT IPIs. */ - apic_printk(APIC_VERBOSE, "disabling APIC mode, " - "entering PIC mode.\n"); + apic_pr_verbose("Disabling APIC mode, entering PIC mode.\n"); imcr_apic_to_pic(); return; } @@ -2305,155 +2275,6 @@ void disconnect_bsp_APIC(int virt_wire_setup) apic_write(APIC_LVT1, value); } -/* - * The number of allocated logical CPU IDs. Since logical CPU IDs are allocated - * contiguously, it equals to current allocated max logical CPU ID plus 1. - * All allocated CPU IDs should be in the [0, nr_logical_cpuids) range, - * so the maximum of nr_logical_cpuids is nr_cpu_ids. - * - * NOTE: Reserve 0 for BSP. - */ -static int nr_logical_cpuids = 1; - -/* - * Used to store mapping between logical CPU IDs and APIC IDs. - */ -u32 cpuid_to_apicid[] = { [0 ... NR_CPUS - 1] = BAD_APICID, }; - -bool arch_match_cpu_phys_id(int cpu, u64 phys_id) -{ - return phys_id == (u64)cpuid_to_apicid[cpu]; -} - -#ifdef CONFIG_SMP -static void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid) -{ - /* Isolate the SMT bit(s) in the APICID and check for 0 */ - u32 mask = (1U << (fls(smp_num_siblings) - 1)) - 1; - - if (smp_num_siblings == 1 || !(apicid & mask)) - cpumask_set_cpu(cpu, &__cpu_primary_thread_mask); -} - -/* - * Due to the utter mess of CPUID evaluation smp_num_siblings is not valid - * during early boot. Initialize the primary thread mask before SMP - * bringup. - */ -static int __init smp_init_primary_thread_mask(void) -{ - unsigned int cpu; - - /* - * XEN/PV provides either none or useless topology information. - * Pretend that all vCPUs are primary threads. 
- */ - if (xen_pv_domain()) { - cpumask_copy(&__cpu_primary_thread_mask, cpu_possible_mask); - return 0; - } - - for (cpu = 0; cpu < nr_logical_cpuids; cpu++) - cpu_mark_primary_thread(cpu, cpuid_to_apicid[cpu]); - return 0; -} -early_initcall(smp_init_primary_thread_mask); -#else -static inline void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid) { } -#endif - -/* - * Should use this API to allocate logical CPU IDs to keep nr_logical_cpuids - * and cpuid_to_apicid[] synchronized. - */ -static int allocate_logical_cpuid(int apicid) -{ - int i; - - /* - * cpuid <-> apicid mapping is persistent, so when a cpu is up, - * check if the kernel has allocated a cpuid for it. - */ - for (i = 0; i < nr_logical_cpuids; i++) { - if (cpuid_to_apicid[i] == apicid) - return i; - } - - /* Allocate a new cpuid. */ - if (nr_logical_cpuids >= nr_cpu_ids) { - WARN_ONCE(1, "APIC: NR_CPUS/possible_cpus limit of %u reached. " - "Processor %d/0x%x and the rest are ignored.\n", - nr_cpu_ids, nr_logical_cpuids, apicid); - return -EINVAL; - } - - cpuid_to_apicid[nr_logical_cpuids] = apicid; - return nr_logical_cpuids++; -} - -static void cpu_update_apic(int cpu, u32 apicid) -{ -#if defined(CONFIG_SMP) || defined(CONFIG_X86_64) - early_per_cpu(x86_cpu_to_apicid, cpu) = apicid; -#endif - set_cpu_possible(cpu, true); - physid_set(apicid, phys_cpu_present_map); - set_cpu_present(cpu, true); - num_processors++; - - if (system_state != SYSTEM_BOOTING) - cpu_mark_primary_thread(cpu, apicid); -} - -static __init void cpu_set_boot_apic(void) -{ - cpuid_to_apicid[0] = boot_cpu_physical_apicid; - cpu_update_apic(0, boot_cpu_physical_apicid); - x86_32_probe_bigsmp_early(); -} - -int generic_processor_info(int apicid) -{ - int cpu, max = nr_cpu_ids; - - /* The boot CPU must be set before MADT/MPTABLE parsing happens */ - if (cpuid_to_apicid[0] == BAD_APICID) - panic("Boot CPU APIC not registered yet\n"); - - if (apicid == boot_cpu_physical_apicid) - return 0; - - if (disabled_cpu_apicid == apicid) { - int thiscpu = num_processors + disabled_cpus; - - pr_warn("APIC: Disabling requested cpu. Processor %d/0x%x ignored.\n", - thiscpu, apicid); - - disabled_cpus++; - return -ENODEV; - } - - if (num_processors >= nr_cpu_ids) { - int thiscpu = max + disabled_cpus; - - pr_warn("APIC: NR_CPUS/possible_cpus limit of %i reached. 
" - "Processor %d/0x%x ignored.\n", max, thiscpu, apicid); - - disabled_cpus++; - return -EINVAL; - } - - cpu = allocate_logical_cpuid(apicid); - if (cpu < 0) { - disabled_cpus++; - return -EINVAL; - } - - cpu_update_apic(cpu, apicid); - return cpu; -} - - void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg, bool dmar) { @@ -2496,10 +2317,7 @@ EXPORT_SYMBOL_GPL(x86_msi_msg_get_destid); static void __init apic_bsp_up_setup(void) { -#ifdef CONFIG_X86_64 - apic_write(APIC_ID, apic->set_apic_id(boot_cpu_physical_apicid)); -#endif - physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); + reset_phys_cpu_present_map(boot_cpu_physical_apicid); } /** @@ -2764,19 +2582,12 @@ int apic_is_clustered_box(void) /* * APIC command line parameters */ -static int __init setup_disableapic(char *arg) +static int __init setup_nolapic(char *arg) { apic_is_disabled = true; setup_clear_cpu_cap(X86_FEATURE_APIC); return 0; } -early_param("disableapic", setup_disableapic); - -/* same as disableapic, for compatibility */ -static int __init setup_nolapic(char *arg) -{ - return setup_disableapic(arg); -} early_param("nolapic", setup_nolapic); static int __init parse_lapic_timer_c2_ok(char *arg) @@ -2845,15 +2656,6 @@ static int __init lapic_insert_resource(void) */ late_initcall(lapic_insert_resource); -static int __init apic_set_disabled_cpu_apicid(char *arg) -{ - if (!arg || !get_option(&arg, &disabled_cpu_apicid)) - return -EINVAL; - - return 0; -} -early_param("disable_cpu_apicid", apic_set_disabled_cpu_apicid); - static int __init apic_set_extnmi(char *arg) { if (!arg) diff --git a/arch/x86/kernel/apic/apic_common.c b/arch/x86/kernel/apic/apic_common.c index 8a00141073ea..9ef3be866832 100644 --- a/arch/x86/kernel/apic/apic_common.c +++ b/arch/x86/kernel/apic/apic_common.c @@ -18,16 +18,6 @@ u32 apic_flat_calc_apicid(unsigned int cpu) return 1U << cpu; } -bool default_check_apicid_used(physid_mask_t *map, u32 apicid) -{ - return physid_isset(apicid, *map); -} - -void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) -{ - *retmap = *phys_map; -} - u32 default_cpu_present_to_apicid(int mps_cpu) { if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu)) @@ -37,11 +27,6 @@ u32 default_cpu_present_to_apicid(int mps_cpu) } EXPORT_SYMBOL_GPL(default_cpu_present_to_apicid); -bool default_apic_id_registered(void) -{ - return physid_isset(read_apic_id(), phys_cpu_present_map); -} - /* * Set up the logical destination ID when the APIC operates in logical * destination mode. diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 7139867d69cd..e0308d8c4e6c 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -8,143 +8,25 @@ * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and * James Cleverdon. 
*/ -#include <linux/cpumask.h> #include <linux/export.h> -#include <linux/acpi.h> -#include <asm/jailhouse_para.h> #include <asm/apic.h> #include "local.h" -static struct apic apic_physflat; -static struct apic apic_flat; - -struct apic *apic __ro_after_init = &apic_flat; -EXPORT_SYMBOL_GPL(apic); - -static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) -{ - return 1; -} - -static void _flat_send_IPI_mask(unsigned long mask, int vector) -{ - unsigned long flags; - - local_irq_save(flags); - __default_send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL); - local_irq_restore(flags); -} - -static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector) -{ - unsigned long mask = cpumask_bits(cpumask)[0]; - - _flat_send_IPI_mask(mask, vector); -} - -static void -flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) -{ - unsigned long mask = cpumask_bits(cpumask)[0]; - int cpu = smp_processor_id(); - - if (cpu < BITS_PER_LONG) - __clear_bit(cpu, &mask); - - _flat_send_IPI_mask(mask, vector); -} - -static u32 flat_get_apic_id(u32 x) +static u32 physflat_get_apic_id(u32 x) { return (x >> 24) & 0xFF; } -static u32 set_apic_id(u32 id) -{ - return (id & 0xFF) << 24; -} - -static u32 flat_phys_pkg_id(u32 initial_apic_id, int index_msb) -{ - return initial_apic_id >> index_msb; -} - -static int flat_probe(void) +static int physflat_probe(void) { return 1; } -static struct apic apic_flat __ro_after_init = { - .name = "flat", - .probe = flat_probe, - .acpi_madt_oem_check = flat_acpi_madt_oem_check, - .apic_id_registered = default_apic_id_registered, - - .delivery_mode = APIC_DELIVERY_MODE_FIXED, - .dest_mode_logical = true, - - .disable_esr = 0, - - .init_apic_ldr = default_init_apic_ldr, - .cpu_present_to_apicid = default_cpu_present_to_apicid, - .phys_pkg_id = flat_phys_pkg_id, - - .max_apic_id = 0xFE, - .get_apic_id = flat_get_apic_id, - .set_apic_id = set_apic_id, - - .calc_dest_apicid = apic_flat_calc_apicid, - - .send_IPI = default_send_IPI_single, - .send_IPI_mask = flat_send_IPI_mask, - .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself, - .send_IPI_allbutself = default_send_IPI_allbutself, - .send_IPI_all = default_send_IPI_all, - .send_IPI_self = default_send_IPI_self, - .nmi_to_offline_cpu = true, - - .read = native_apic_mem_read, - .write = native_apic_mem_write, - .eoi = native_apic_mem_eoi, - .icr_read = native_apic_icr_read, - .icr_write = native_apic_icr_write, - .wait_icr_idle = apic_mem_wait_icr_idle, - .safe_wait_icr_idle = apic_mem_wait_icr_idle_timeout, -}; - -/* - * Physflat mode is used when there are more than 8 CPUs on a system. - * We cannot use logical delivery in this case because the mask - * overflows, so use physical mode. - */ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { -#ifdef CONFIG_ACPI - /* - * Quirk: some x86_64 machines can only use physical APIC mode - * regardless of how many processors are present (x86_64 ES7000 - * is an example). 
- */ - if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID && - (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) { - printk(KERN_DEBUG "system APIC only can use physical flat"); - return 1; - } - - if (!strncmp(oem_id, "IBM", 3) && !strncmp(oem_table_id, "EXA", 3)) { - printk(KERN_DEBUG "IBM Summit detected, will use apic physical"); - return 1; - } -#endif - - return 0; -} - -static int physflat_probe(void) -{ - return apic == &apic_physflat || num_possible_cpus() > 8 || jailhouse_paravirt(); + return 1; } static struct apic apic_physflat __ro_after_init = { @@ -152,19 +34,15 @@ static struct apic apic_physflat __ro_after_init = { .name = "physical flat", .probe = physflat_probe, .acpi_madt_oem_check = physflat_acpi_madt_oem_check, - .apic_id_registered = default_apic_id_registered, - .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = false, .disable_esr = 0, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .phys_pkg_id = flat_phys_pkg_id, .max_apic_id = 0xFE, - .get_apic_id = flat_get_apic_id, - .set_apic_id = set_apic_id, + .get_apic_id = physflat_get_apic_id, .calc_dest_apicid = apic_default_calc_apicid, @@ -184,8 +62,7 @@ static struct apic apic_physflat __ro_after_init = { .wait_icr_idle = apic_mem_wait_icr_idle, .safe_wait_icr_idle = apic_mem_wait_icr_idle_timeout, }; +apic_driver(apic_physflat); -/* - * We need to check for physflat first, so this order is important. - */ -apic_drivers(apic_physflat, apic_flat); +struct apic *apic __ro_after_init = &apic_physflat; +EXPORT_SYMBOL_GPL(apic); diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index b00d52ae84fa..b5bb7a2e8340 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -29,7 +29,6 @@ static void noop_send_IPI_self(int vector) { } static void noop_apic_icr_write(u32 low, u32 id) { } static int noop_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip) { return -1; } static u64 noop_apic_icr_read(void) { return 0; } -static u32 noop_phys_pkg_id(u32 cpuid_apic, int index_msb) { return 0; } static u32 noop_get_apic_id(u32 apicid) { return 0; } static void noop_apic_eoi(void) { } @@ -47,17 +46,12 @@ static void noop_apic_write(u32 reg, u32 val) struct apic apic_noop __ro_after_init = { .name = "noop", - .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = true, .disable_esr = 0, - .check_apicid_used = default_check_apicid_used, - .ioapic_phys_id_map = default_ioapic_phys_id_map, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .phys_pkg_id = noop_phys_pkg_id, - .max_apic_id = 0xFE, .get_apic_id = noop_get_apic_id, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 456a14c44f67..16410f087b7a 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -38,11 +38,6 @@ static u32 numachip1_get_apic_id(u32 x) return id; } -static u32 numachip1_set_apic_id(u32 id) -{ - return (id & 0xff) << 24; -} - static u32 numachip2_get_apic_id(u32 x) { u64 mcfg; @@ -51,16 +46,6 @@ static u32 numachip2_get_apic_id(u32 x) return ((mcfg >> (28 - 8)) & 0xfff00) | (x >> 24); } -static u32 numachip2_set_apic_id(u32 id) -{ - return id << 24; -} - -static u32 numachip_phys_pkg_id(u32 initial_apic_id, int index_msb) -{ - return initial_apic_id >> index_msb; -} - static void numachip1_apic_icr_write(int apicid, unsigned int val) { write_lcsr(CSR_G3_EXT_IRQ_GEN, (apicid << 16) | val); @@ -222,17 +207,14 @@ static const struct apic apic_numachip1 __refconst = { .probe = 
numachip1_probe, .acpi_madt_oem_check = numachip1_acpi_madt_oem_check, - .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = false, .disable_esr = 0, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .phys_pkg_id = numachip_phys_pkg_id, .max_apic_id = UINT_MAX, .get_apic_id = numachip1_get_apic_id, - .set_apic_id = numachip1_set_apic_id, .calc_dest_apicid = apic_default_calc_apicid, @@ -259,17 +241,14 @@ static const struct apic apic_numachip2 __refconst = { .probe = numachip2_probe, .acpi_madt_oem_check = numachip2_acpi_madt_oem_check, - .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = false, .disable_esr = 0, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .phys_pkg_id = numachip_phys_pkg_id, .max_apic_id = UINT_MAX, .get_apic_id = numachip2_get_apic_id, - .set_apic_id = numachip2_set_apic_id, .calc_dest_apicid = apic_default_calc_apicid, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 7ee3c486cb33..9285d500d5b4 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -18,22 +18,6 @@ static u32 bigsmp_get_apic_id(u32 x) return (x >> 24) & 0xFF; } -static bool bigsmp_check_apicid_used(physid_mask_t *map, u32 apicid) -{ - return false; -} - -static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) -{ - /* For clustered we don't have a good way to do this yet - hack */ - physids_promote(0xFFL, retmap); -} - -static u32 bigsmp_phys_pkg_id(u32 cpuid_apic, int index_msb) -{ - return cpuid_apic >> index_msb; -} - static void bigsmp_send_IPI_allbutself(int vector) { default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector); @@ -80,19 +64,14 @@ static struct apic apic_bigsmp __ro_after_init = { .name = "bigsmp", .probe = probe_bigsmp, - .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = false, .disable_esr = 1, - .check_apicid_used = bigsmp_check_apicid_used, - .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .phys_pkg_id = bigsmp_phys_pkg_id, .max_apic_id = 0xFE, .get_apic_id = bigsmp_get_apic_id, - .set_apic_id = NULL, .calc_dest_apicid = apic_default_calc_apicid, diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 00da6cf6b07d..eebc360ed1bb 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -86,8 +86,8 @@ static unsigned int ioapic_dynirq_base; static int ioapic_initialized; struct irq_pin_list { - struct list_head list; - int apic, pin; + struct list_head list; + int apic, pin; }; struct mp_chip_data { @@ -96,7 +96,7 @@ struct mp_chip_data { bool is_level; bool active_low; bool isa_irq; - u32 count; + u32 count; }; struct mp_ioapic_gsi { @@ -105,21 +105,17 @@ struct mp_ioapic_gsi { }; static struct ioapic { - /* - * # of IRQ routing registers - */ - int nr_registers; - /* - * Saved state during suspend/resume, or while enabling intr-remap. - */ - struct IO_APIC_route_entry *saved_registers; + /* # of IRQ routing registers */ + int nr_registers; + /* Saved state during suspend/resume, or while enabling intr-remap. 
*/ + struct IO_APIC_route_entry *saved_registers; /* I/O APIC config */ - struct mpc_ioapic mp_config; + struct mpc_ioapic mp_config; /* IO APIC gsi routing info */ - struct mp_ioapic_gsi gsi_config; - struct ioapic_domain_cfg irqdomain_cfg; - struct irq_domain *irqdomain; - struct resource *iomem_res; + struct mp_ioapic_gsi gsi_config; + struct ioapic_domain_cfg irqdomain_cfg; + struct irq_domain *irqdomain; + struct resource *iomem_res; } ioapics[MAX_IO_APICS]; #define mpc_ioapic_ver(ioapic_idx) ioapics[ioapic_idx].mp_config.apicver @@ -205,10 +201,9 @@ void mp_save_irq(struct mpc_intsrc *m) { int i; - apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x," - " IRQ %02x, APIC ID %x, APIC INT %02x\n", - m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus, - m->srcbusirq, m->dstapic, m->dstirq); + apic_pr_verbose("Int: type %d, pol %d, trig %d, bus %02x, IRQ %02x, APIC ID %x, APIC INT %02x\n", + m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus, + m->srcbusirq, m->dstapic, m->dstirq); for (i = 0; i < mp_irq_entries; i++) { if (!memcmp(&mp_irqs[i], m, sizeof(*m))) @@ -269,12 +264,14 @@ static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) static inline void io_apic_eoi(unsigned int apic, unsigned int vector) { struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(vector, &io_apic->eoi); } unsigned int native_io_apic_read(unsigned int apic, unsigned int reg) { struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(reg, &io_apic->index); return readl(&io_apic->data); } @@ -300,14 +297,8 @@ static struct IO_APIC_route_entry __ioapic_read_entry(int apic, int pin) static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) { - struct IO_APIC_route_entry entry; - unsigned long flags; - - raw_spin_lock_irqsave(&ioapic_lock, flags); - entry = __ioapic_read_entry(apic, pin); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); - - return entry; + guard(raw_spinlock_irqsave)(&ioapic_lock); + return __ioapic_read_entry(apic, pin); } /* @@ -324,11 +315,8 @@ static void __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) { - unsigned long flags; - - raw_spin_lock_irqsave(&ioapic_lock, flags); + guard(raw_spinlock_irqsave)(&ioapic_lock); __ioapic_write_entry(apic, pin, e); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); } /* @@ -339,12 +327,10 @@ static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) static void ioapic_mask_entry(int apic, int pin) { struct IO_APIC_route_entry e = { .masked = true }; - unsigned long flags; - raw_spin_lock_irqsave(&ioapic_lock, flags); + guard(raw_spinlock_irqsave)(&ioapic_lock); io_apic_write(apic, 0x10 + 2*pin, e.w1); io_apic_write(apic, 0x11 + 2*pin, e.w2); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); } /* @@ -352,68 +338,39 @@ static void ioapic_mask_entry(int apic, int pin) * shared ISA-space IRQs, so we have to support them. We are super * fast in the common case, and fast for shared ISA-space IRQs. 
*/ -static int __add_pin_to_irq_node(struct mp_chip_data *data, - int node, int apic, int pin) +static bool add_pin_to_irq_node(struct mp_chip_data *data, int node, int apic, int pin) { struct irq_pin_list *entry; - /* don't allow duplicates */ - for_each_irq_pin(entry, data->irq_2_pin) + /* Don't allow duplicates */ + for_each_irq_pin(entry, data->irq_2_pin) { if (entry->apic == apic && entry->pin == pin) - return 0; + return true; + } entry = kzalloc_node(sizeof(struct irq_pin_list), GFP_ATOMIC, node); if (!entry) { - pr_err("can not alloc irq_pin_list (%d,%d,%d)\n", - node, apic, pin); - return -ENOMEM; + pr_err("Cannot allocate irq_pin_list (%d,%d,%d)\n", node, apic, pin); + return false; } + entry->apic = apic; entry->pin = pin; list_add_tail(&entry->list, &data->irq_2_pin); - - return 0; + return true; } static void __remove_pin_from_irq(struct mp_chip_data *data, int apic, int pin) { struct irq_pin_list *tmp, *entry; - list_for_each_entry_safe(entry, tmp, &data->irq_2_pin, list) + list_for_each_entry_safe(entry, tmp, &data->irq_2_pin, list) { if (entry->apic == apic && entry->pin == pin) { list_del(&entry->list); kfree(entry); return; } -} - -static void add_pin_to_irq_node(struct mp_chip_data *data, - int node, int apic, int pin) -{ - if (__add_pin_to_irq_node(data, node, apic, pin)) - panic("IO-APIC: failed to add irq-pin. Can not proceed\n"); -} - -/* - * Reroute an IRQ to a different pin. - */ -static void __init replace_pin_at_irq_node(struct mp_chip_data *data, int node, - int oldapic, int oldpin, - int newapic, int newpin) -{ - struct irq_pin_list *entry; - - for_each_irq_pin(entry, data->irq_2_pin) { - if (entry->apic == oldapic && entry->pin == oldpin) { - entry->apic = newapic; - entry->pin = newpin; - /* every one is different, right? */ - return; - } } - - /* old apic/pin didn't exist, so just add new ones */ - add_pin_to_irq_node(data, node, newapic, newpin); } static void io_apic_modify_irq(struct mp_chip_data *data, bool masked, @@ -430,12 +387,12 @@ static void io_apic_modify_irq(struct mp_chip_data *data, bool masked, } } +/* + * Synchronize the IO-APIC and the CPU by doing a dummy read from the + * IO-APIC + */ static void io_apic_sync(struct irq_pin_list *entry) { - /* - * Synchronize the IO-APIC and the CPU by doing - * a dummy read from the IO-APIC - */ struct io_apic __iomem *io_apic; io_apic = io_apic_base(entry->apic); @@ -445,11 +402,9 @@ static void io_apic_sync(struct irq_pin_list *entry) static void mask_ioapic_irq(struct irq_data *irq_data) { struct mp_chip_data *data = irq_data->chip_data; - unsigned long flags; - raw_spin_lock_irqsave(&ioapic_lock, flags); + guard(raw_spinlock_irqsave)(&ioapic_lock); io_apic_modify_irq(data, true, &io_apic_sync); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); } static void __unmask_ioapic(struct mp_chip_data *data) @@ -460,11 +415,9 @@ static void __unmask_ioapic(struct mp_chip_data *data) static void unmask_ioapic_irq(struct irq_data *irq_data) { struct mp_chip_data *data = irq_data->chip_data; - unsigned long flags; - raw_spin_lock_irqsave(&ioapic_lock, flags); + guard(raw_spinlock_irqsave)(&ioapic_lock); __unmask_ioapic(data); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); } /* @@ -492,30 +445,24 @@ static void __eoi_ioapic_pin(int apic, int pin, int vector) entry = entry1 = __ioapic_read_entry(apic, pin); - /* - * Mask the entry and change the trigger mode to edge. - */ + /* Mask the entry and change the trigger mode to edge. 
*/ entry1.masked = true; entry1.is_level = false; __ioapic_write_entry(apic, pin, entry1); - /* - * Restore the previous level triggered entry. - */ + /* Restore the previous level triggered entry. */ __ioapic_write_entry(apic, pin, entry); } } static void eoi_ioapic_pin(int vector, struct mp_chip_data *data) { - unsigned long flags; struct irq_pin_list *entry; - raw_spin_lock_irqsave(&ioapic_lock, flags); + guard(raw_spinlock_irqsave)(&ioapic_lock); for_each_irq_pin(entry, data->irq_2_pin) __eoi_ioapic_pin(entry->apic, entry->pin, vector); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); } static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) @@ -538,8 +485,6 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) } if (entry.irr) { - unsigned long flags; - /* * Make sure the trigger mode is set to level. Explicit EOI * doesn't clear the remote-IRR if the trigger mode is not @@ -549,9 +494,8 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) entry.is_level = true; ioapic_write_entry(apic, pin, entry); } - raw_spin_lock_irqsave(&ioapic_lock, flags); + guard(raw_spinlock_irqsave)(&ioapic_lock); __eoi_ioapic_pin(apic, pin, entry.vector); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); } /* @@ -586,28 +530,23 @@ static int pirq_entries[MAX_PIRQS] = { static int __init ioapic_pirq_setup(char *str) { - int i, max; - int ints[MAX_PIRQS+1]; + int i, max, ints[MAX_PIRQS+1]; get_options(str, ARRAY_SIZE(ints), ints); - apic_printk(APIC_VERBOSE, KERN_INFO - "PIRQ redirection, working around broken MP-BIOS.\n"); + apic_pr_verbose("PIRQ redirection, working around broken MP-BIOS.\n"); + max = MAX_PIRQS; if (ints[0] < MAX_PIRQS) max = ints[0]; for (i = 0; i < max; i++) { - apic_printk(APIC_VERBOSE, KERN_DEBUG - "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); - /* - * PIRQs are mapped upside down, usually. - */ + apic_pr_verbose("... 
PIRQ%d -> IRQ %d\n", i, ints[i + 1]); + /* PIRQs are mapped upside down, usually */ pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; } return 1; } - __setup("pirq=", ioapic_pirq_setup); #endif /* CONFIG_X86_32 */ @@ -626,8 +565,7 @@ int save_ioapic_entries(void) } for_each_pin(apic, pin) - ioapics[apic].saved_registers[pin] = - ioapic_read_entry(apic, pin); + ioapics[apic].saved_registers[pin] = ioapic_read_entry(apic, pin); } return err; @@ -668,8 +606,7 @@ int restore_ioapic_entries(void) continue; for_each_pin(apic, pin) - ioapic_write_entry(apic, pin, - ioapics[apic].saved_registers[pin]); + ioapic_write_entry(apic, pin, ioapics[apic].saved_registers[pin]); } return 0; } @@ -681,12 +618,13 @@ static int find_irq_entry(int ioapic_idx, int pin, int type) { int i; - for (i = 0; i < mp_irq_entries; i++) + for (i = 0; i < mp_irq_entries; i++) { if (mp_irqs[i].irqtype == type && (mp_irqs[i].dstapic == mpc_ioapic_id(ioapic_idx) || mp_irqs[i].dstapic == MP_APIC_ALL) && mp_irqs[i].dstirq == pin) return i; + } return -1; } @@ -701,10 +639,8 @@ static int __init find_isa_irq_pin(int irq, int type) for (i = 0; i < mp_irq_entries; i++) { int lbus = mp_irqs[i].srcbus; - if (test_bit(lbus, mp_bus_not_pci) && - (mp_irqs[i].irqtype == type) && + if (test_bit(lbus, mp_bus_not_pci) && (mp_irqs[i].irqtype == type) && (mp_irqs[i].srcbusirq == irq)) - return mp_irqs[i].dstirq; } return -1; @@ -717,8 +653,7 @@ static int __init find_isa_irq_apic(int irq, int type) for (i = 0; i < mp_irq_entries; i++) { int lbus = mp_irqs[i].srcbus; - if (test_bit(lbus, mp_bus_not_pci) && - (mp_irqs[i].irqtype == type) && + if (test_bit(lbus, mp_bus_not_pci) && (mp_irqs[i].irqtype == type) && (mp_irqs[i].srcbusirq == irq)) break; } @@ -726,9 +661,10 @@ static int __init find_isa_irq_apic(int irq, int type) if (i < mp_irq_entries) { int ioapic_idx; - for_each_ioapic(ioapic_idx) + for_each_ioapic(ioapic_idx) { if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic) return ioapic_idx; + } } return -1; @@ -769,8 +705,7 @@ static bool EISA_ELCR(unsigned int irq) unsigned int port = PIC_ELCR1 + (irq >> 3); return (inb(port) >> (irq & 7)) & 1; } - apic_printk(APIC_VERBOSE, KERN_INFO - "Broken MPtable reports ISA irq %d\n", irq); + apic_pr_verbose("Broken MPtable reports ISA irq %d\n", irq); return false; } @@ -947,9 +882,9 @@ static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info) static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi, struct irq_alloc_info *info) { + int type = ioapics[ioapic].irqdomain_cfg.type; bool legacy = false; int irq = -1; - int type = ioapics[ioapic].irqdomain_cfg.type; switch (type) { case IOAPIC_DOMAIN_LEGACY: @@ -971,8 +906,7 @@ static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi, return -1; } - return __irq_domain_alloc_irqs(domain, irq, 1, - ioapic_alloc_attr_node(info), + return __irq_domain_alloc_irqs(domain, irq, 1, ioapic_alloc_attr_node(info), info, legacy, NULL); } @@ -986,29 +920,26 @@ static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi, * PIRQs instead of reprogramming the interrupt routing logic. Thus there may be * multiple pins sharing the same legacy IRQ number when ACPI is disabled. 
*/ -static int alloc_isa_irq_from_domain(struct irq_domain *domain, - int irq, int ioapic, int pin, +static int alloc_isa_irq_from_domain(struct irq_domain *domain, int irq, int ioapic, int pin, struct irq_alloc_info *info) { - struct mp_chip_data *data; struct irq_data *irq_data = irq_get_irq_data(irq); int node = ioapic_alloc_attr_node(info); + struct mp_chip_data *data; /* * Legacy ISA IRQ has already been allocated, just add pin to * the pin list associated with this IRQ and program the IOAPIC - * entry. The IOAPIC entry + * entry. */ if (irq_data && irq_data->parent_data) { if (!mp_check_pin_attr(irq, info)) return -EBUSY; - if (__add_pin_to_irq_node(irq_data->chip_data, node, ioapic, - info->ioapic.pin)) + if (!add_pin_to_irq_node(irq_data->chip_data, node, ioapic, info->ioapic.pin)) return -ENOMEM; } else { info->flags |= X86_IRQ_ALLOC_LEGACY; - irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true, - NULL); + irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true, NULL); if (irq >= 0) { irq_data = irq_domain_get_irq_data(domain, irq); data = irq_data->chip_data; @@ -1022,11 +953,11 @@ static int alloc_isa_irq_from_domain(struct irq_domain *domain, static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin, unsigned int flags, struct irq_alloc_info *info) { - int irq; - bool legacy = false; + struct irq_domain *domain = mp_ioapic_irqdomain(ioapic); struct irq_alloc_info tmp; struct mp_chip_data *data; - struct irq_domain *domain = mp_ioapic_irqdomain(ioapic); + bool legacy = false; + int irq; if (!domain) return -ENOSYS; @@ -1046,7 +977,7 @@ static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin, return -EINVAL; } - mutex_lock(&ioapic_mutex); + guard(mutex)(&ioapic_mutex); if (!(flags & IOAPIC_MAP_ALLOC)) { if (!legacy) { irq = irq_find_mapping(domain, pin); @@ -1067,8 +998,6 @@ static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin, data->count++; } } - mutex_unlock(&ioapic_mutex); - return irq; } @@ -1076,26 +1005,20 @@ static int pin_2_irq(int idx, int ioapic, int pin, unsigned int flags) { u32 gsi = mp_pin_to_gsi(ioapic, pin); - /* - * Debugging check, we are in big trouble if this message pops up! - */ + /* Debugging check, we are in big trouble if this message pops up! */ if (mp_irqs[idx].dstirq != pin) pr_err("broken BIOS or MPTABLE parser, ayiee!!\n"); #ifdef CONFIG_X86_32 - /* - * PCI IRQ command line redirection. Yes, limits are hardcoded. - */ + /* PCI IRQ command line redirection. Yes, limits are hardcoded. 
*/ if ((pin >= 16) && (pin <= 23)) { - if (pirq_entries[pin-16] != -1) { - if (!pirq_entries[pin-16]) { - apic_printk(APIC_VERBOSE, KERN_DEBUG - "disabling PIRQ%d\n", pin-16); + if (pirq_entries[pin - 16] != -1) { + if (!pirq_entries[pin - 16]) { + apic_pr_verbose("Disabling PIRQ%d\n", pin - 16); } else { int irq = pirq_entries[pin-16]; - apic_printk(APIC_VERBOSE, KERN_DEBUG - "using PIRQ%d -> IRQ %d\n", - pin-16, irq); + + apic_pr_verbose("Using PIRQ%d -> IRQ %d\n", pin - 16, irq); return irq; } } @@ -1133,10 +1056,9 @@ void mp_unmap_irq(int irq) if (!data || data->isa_irq) return; - mutex_lock(&ioapic_mutex); + guard(mutex)(&ioapic_mutex); if (--data->count == 0) irq_domain_free_irqs(irq, 1); - mutex_unlock(&ioapic_mutex); } /* @@ -1147,12 +1069,10 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) { int irq, i, best_ioapic = -1, best_idx = -1; - apic_printk(APIC_DEBUG, - "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", - bus, slot, pin); + apic_pr_debug("Querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", + bus, slot, pin); if (test_bit(bus, mp_bus_not_pci)) { - apic_printk(APIC_VERBOSE, - "PCI BIOS passed nonexistent PCI bus %d!\n", bus); + apic_pr_verbose("PCI BIOS passed nonexistent PCI bus %d!\n", bus); return -1; } @@ -1197,8 +1117,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) return -1; out: - return pin_2_irq(best_idx, best_ioapic, mp_irqs[best_idx].dstirq, - IOAPIC_MAP_ALLOC); + return pin_2_irq(best_idx, best_ioapic, mp_irqs[best_idx].dstirq, IOAPIC_MAP_ALLOC); } EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); @@ -1209,17 +1128,16 @@ static void __init setup_IO_APIC_irqs(void) unsigned int ioapic, pin; int idx; - apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); + apic_pr_verbose("Init IO_APIC IRQs\n"); for_each_ioapic_pin(ioapic, pin) { idx = find_irq_entry(ioapic, pin, mp_INT); - if (idx < 0) - apic_printk(APIC_VERBOSE, - KERN_DEBUG " apic %d pin %d not connected\n", - mpc_ioapic_id(ioapic), pin); - else - pin_2_irq(idx, ioapic, pin, - ioapic ? 0 : IOAPIC_MAP_ALLOC); + if (idx < 0) { + apic_pr_verbose("apic %d pin %d not connected\n", + mpc_ioapic_id(ioapic), pin); + } else { + pin_2_irq(idx, ioapic, pin, ioapic ? 0 : IOAPIC_MAP_ALLOC); + } } } @@ -1234,26 +1152,21 @@ static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries) char buf[256]; int i; - printk(KERN_DEBUG "IOAPIC %d:\n", apic); + apic_dbg("IOAPIC %d:\n", apic); for (i = 0; i <= nr_entries; i++) { entry = ioapic_read_entry(apic, i); - snprintf(buf, sizeof(buf), - " pin%02x, %s, %s, %s, V(%02X), IRR(%1d), S(%1d)", - i, - entry.masked ? "disabled" : "enabled ", + snprintf(buf, sizeof(buf), " pin%02x, %s, %s, %s, V(%02X), IRR(%1d), S(%1d)", + i, entry.masked ? "disabled" : "enabled ", entry.is_level ? "level" : "edge ", entry.active_low ? "low " : "high", entry.vector, entry.irr, entry.delivery_status); if (entry.ir_format) { - printk(KERN_DEBUG "%s, remapped, I(%04X), Z(%X)\n", - buf, - (entry.ir_index_15 << 15) | entry.ir_index_0_14, - entry.ir_zero); + apic_dbg("%s, remapped, I(%04X), Z(%X)\n", buf, + (entry.ir_index_15 << 15) | entry.ir_index_0_14, entry.ir_zero); } else { - printk(KERN_DEBUG "%s, %s, D(%02X%02X), M(%1d)\n", buf, - entry.dest_mode_logical ? "logical " : "physical", - entry.virt_destid_8_14, entry.destid_0_7, - entry.delivery_mode); + apic_dbg("%s, %s, D(%02X%02X), M(%1d)\n", buf, + entry.dest_mode_logical ? 
"logical " : "physical", + entry.virt_destid_8_14, entry.destid_0_7, entry.delivery_mode); } } } @@ -1264,30 +1177,25 @@ static void __init print_IO_APIC(int ioapic_idx) union IO_APIC_reg_01 reg_01; union IO_APIC_reg_02 reg_02; union IO_APIC_reg_03 reg_03; - unsigned long flags; - raw_spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(ioapic_idx, 0); - reg_01.raw = io_apic_read(ioapic_idx, 1); - if (reg_01.bits.version >= 0x10) - reg_02.raw = io_apic_read(ioapic_idx, 2); - if (reg_01.bits.version >= 0x20) - reg_03.raw = io_apic_read(ioapic_idx, 3); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); - - printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(ioapic_idx)); - printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); - printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); - printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); - printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); - - printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)®_01); - printk(KERN_DEBUG "....... : max redirection entries: %02X\n", - reg_01.bits.entries); - - printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); - printk(KERN_DEBUG "....... : IO APIC version: %02X\n", - reg_01.bits.version); + scoped_guard (raw_spinlock_irqsave, &ioapic_lock) { + reg_00.raw = io_apic_read(ioapic_idx, 0); + reg_01.raw = io_apic_read(ioapic_idx, 1); + if (reg_01.bits.version >= 0x10) + reg_02.raw = io_apic_read(ioapic_idx, 2); + if (reg_01.bits.version >= 0x20) + reg_03.raw = io_apic_read(ioapic_idx, 3); + } + + apic_dbg("IO APIC #%d......\n", mpc_ioapic_id(ioapic_idx)); + apic_dbg(".... register #00: %08X\n", reg_00.raw); + apic_dbg("....... : physical APIC id: %02X\n", reg_00.bits.ID); + apic_dbg("....... : Delivery Type: %X\n", reg_00.bits.delivery_type); + apic_dbg("....... : LTS : %X\n", reg_00.bits.LTS); + apic_dbg(".... register #01: %08X\n", *(int *)®_01); + apic_dbg("....... : max redirection entries: %02X\n", reg_01.bits.entries); + apic_dbg("....... : PRQ implemented: %X\n", reg_01.bits.PRQ); + apic_dbg("....... : IO APIC version: %02X\n", reg_01.bits.version); /* * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, @@ -1295,8 +1203,8 @@ static void __init print_IO_APIC(int ioapic_idx) * value, so ignore it if reg_02 == reg_01. */ if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { - printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); - printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); + apic_dbg(".... register #02: %08X\n", reg_02.raw); + apic_dbg("....... : arbitration: %02X\n", reg_02.bits.arbitration); } /* @@ -1306,11 +1214,11 @@ static void __init print_IO_APIC(int ioapic_idx) */ if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw && reg_03.raw != reg_01.raw) { - printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); - printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); + apic_dbg(".... register #03: %08X\n", reg_03.raw); + apic_dbg("....... : Boot DT : %X\n", reg_03.bits.boot_DT); } - printk(KERN_DEBUG ".... IRQ redirection table:\n"); + apic_dbg(".... 
IRQ redirection table:\n"); io_apic_print_entries(ioapic_idx, reg_01.bits.entries); } @@ -1319,11 +1227,11 @@ void __init print_IO_APICs(void) int ioapic_idx; unsigned int irq; - printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); - for_each_ioapic(ioapic_idx) - printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", - mpc_ioapic_id(ioapic_idx), - ioapics[ioapic_idx].nr_registers); + apic_dbg("number of MP IRQ sources: %d.\n", mp_irq_entries); + for_each_ioapic(ioapic_idx) { + apic_dbg("number of IO-APIC #%d registers: %d.\n", + mpc_ioapic_id(ioapic_idx), ioapics[ioapic_idx].nr_registers); + } /* * We are a bit conservative about what we expect. We have to @@ -1334,7 +1242,7 @@ void __init print_IO_APICs(void) for_each_ioapic(ioapic_idx) print_IO_APIC(ioapic_idx); - printk(KERN_DEBUG "IRQ to pin mappings:\n"); + apic_dbg("IRQ to pin mappings:\n"); for_each_active_irq(irq) { struct irq_pin_list *entry; struct irq_chip *chip; @@ -1349,7 +1257,7 @@ void __init print_IO_APICs(void) if (list_empty(&data->irq_2_pin)) continue; - printk(KERN_DEBUG "IRQ%d ", irq); + apic_dbg("IRQ%d ", irq); for_each_irq_pin(entry, data->irq_2_pin) pr_cont("-> %d:%d", entry->apic, entry->pin); pr_cont("\n"); @@ -1363,8 +1271,7 @@ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; void __init enable_IO_APIC(void) { - int i8259_apic, i8259_pin; - int apic, pin; + int i8259_apic, i8259_pin, apic, pin; if (ioapic_is_disabled) nr_ioapics = 0; @@ -1376,19 +1283,21 @@ void __init enable_IO_APIC(void) /* See if any of the pins is in ExtINT mode */ struct IO_APIC_route_entry entry = ioapic_read_entry(apic, pin); - /* If the interrupt line is enabled and in ExtInt mode - * I have found the pin where the i8259 is connected. + /* + * If the interrupt line is enabled and in ExtInt mode I + * have found the pin where the i8259 is connected. */ - if (!entry.masked && - entry.delivery_mode == APIC_DELIVERY_MODE_EXTINT) { + if (!entry.masked && entry.delivery_mode == APIC_DELIVERY_MODE_EXTINT) { ioapic_i8259.apic = apic; ioapic_i8259.pin = pin; - goto found_i8259; + break; } } - found_i8259: - /* Look to see what if the MP table has reported the ExtINT */ - /* If we could not find the appropriate pin by looking at the ioapic + + /* + * Look to see what if the MP table has reported the ExtINT + * + * If we could not find the appropriate pin by looking at the ioapic * the i8259 probably is not connected the ioapic but give the * mptable a chance anyway. 
*/ @@ -1396,29 +1305,24 @@ void __init enable_IO_APIC(void) i8259_apic = find_isa_irq_apic(0, mp_ExtINT); /* Trust the MP table if nothing is setup in the hardware */ if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) { - printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n"); + pr_warn("ExtINT not setup in hardware but reported by MP table\n"); ioapic_i8259.pin = i8259_pin; ioapic_i8259.apic = i8259_apic; } /* Complain if the MP table and the hardware disagree */ if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) && - (i8259_pin >= 0) && (ioapic_i8259.pin >= 0)) - { - printk(KERN_WARNING "ExtINT in hardware and MP table differ\n"); - } + (i8259_pin >= 0) && (ioapic_i8259.pin >= 0)) + pr_warn("ExtINT in hardware and MP table differ\n"); - /* - * Do not trust the IO-APIC being empty at bootup - */ + /* Do not trust the IO-APIC being empty at bootup */ clear_IO_APIC(); } void native_restore_boot_irq_mode(void) { /* - * If the i8259 is routed through an IOAPIC - * Put that IOAPIC in virtual wire mode - * so legacy interrupts can be delivered. + * If the i8259 is routed through an IOAPIC Put that IOAPIC in + * virtual wire mode so legacy interrupts can be delivered. */ if (ioapic_i8259.pin != -1) { struct IO_APIC_route_entry entry; @@ -1433,9 +1337,7 @@ void native_restore_boot_irq_mode(void) entry.destid_0_7 = apic_id & 0xFF; entry.virt_destid_8_14 = apic_id >> 8; - /* - * Add it to the IO-APIC irq-routing table: - */ + /* Add it to the IO-APIC irq-routing table */ ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); } @@ -1458,37 +1360,34 @@ void restore_boot_irq_mode(void) * * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 */ -void __init setup_ioapic_ids_from_mpc_nocheck(void) +static void __init setup_ioapic_ids_from_mpc_nocheck(void) { + DECLARE_BITMAP(phys_id_present_map, MAX_LOCAL_APIC); + const u32 broadcast_id = 0xF; union IO_APIC_reg_00 reg_00; - physid_mask_t phys_id_present_map; - int ioapic_idx; - int i; unsigned char old_id; - unsigned long flags; + int ioapic_idx, i; /* * This is broken; anything with a real cpu count has to * circumvent this idiocy regardless. */ - apic->ioapic_phys_id_map(&phys_cpu_present_map, &phys_id_present_map); + copy_phys_cpu_present_map(phys_id_present_map); /* * Set the IOAPIC ID to the value stored in the MPC table. */ for_each_ioapic(ioapic_idx) { /* Read the register 0 value */ - raw_spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(ioapic_idx, 0); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); + scoped_guard (raw_spinlock_irqsave, &ioapic_lock) + reg_00.raw = io_apic_read(ioapic_idx, 0); old_id = mpc_ioapic_id(ioapic_idx); - if (mpc_ioapic_id(ioapic_idx) >= get_physical_broadcast()) { - printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", - ioapic_idx, mpc_ioapic_id(ioapic_idx)); - printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", - reg_00.bits.ID); + if (mpc_ioapic_id(ioapic_idx) >= broadcast_id) { + pr_err(FW_BUG "IO-APIC#%d ID is %d in the MPC table!...\n", + ioapic_idx, mpc_ioapic_id(ioapic_idx)); + pr_err("... fixing up to %d. (tell your hw vendor)\n", reg_00.bits.ID); ioapics[ioapic_idx].mp_config.apicid = reg_00.bits.ID; } @@ -1497,61 +1396,54 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void) * system must have a unique ID or we get lots of nice * 'stuck on smp_invalidate_needed IPI wait' messages. 
*/ - if (apic->check_apicid_used(&phys_id_present_map, - mpc_ioapic_id(ioapic_idx))) { - printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", - ioapic_idx, mpc_ioapic_id(ioapic_idx)); - for (i = 0; i < get_physical_broadcast(); i++) - if (!physid_isset(i, phys_id_present_map)) + if (test_bit(mpc_ioapic_id(ioapic_idx), phys_id_present_map)) { + pr_err(FW_BUG "IO-APIC#%d ID %d is already used!...\n", + ioapic_idx, mpc_ioapic_id(ioapic_idx)); + for (i = 0; i < broadcast_id; i++) + if (!test_bit(i, phys_id_present_map)) break; - if (i >= get_physical_broadcast()) + if (i >= broadcast_id) panic("Max APIC ID exceeded!\n"); - printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", - i); - physid_set(i, phys_id_present_map); + pr_err("... fixing up to %d. (tell your hw vendor)\n", i); + set_bit(i, phys_id_present_map); ioapics[ioapic_idx].mp_config.apicid = i; } else { - apic_printk(APIC_VERBOSE, "Setting %d in the phys_id_present_map\n", - mpc_ioapic_id(ioapic_idx)); - physid_set(mpc_ioapic_id(ioapic_idx), phys_id_present_map); + apic_pr_verbose("Setting %d in the phys_id_present_map\n", + mpc_ioapic_id(ioapic_idx)); + set_bit(mpc_ioapic_id(ioapic_idx), phys_id_present_map); } /* - * We need to adjust the IRQ routing table - * if the ID changed. + * We need to adjust the IRQ routing table if the ID + * changed. */ - if (old_id != mpc_ioapic_id(ioapic_idx)) - for (i = 0; i < mp_irq_entries; i++) + if (old_id != mpc_ioapic_id(ioapic_idx)) { + for (i = 0; i < mp_irq_entries; i++) { if (mp_irqs[i].dstapic == old_id) - mp_irqs[i].dstapic - = mpc_ioapic_id(ioapic_idx); + mp_irqs[i].dstapic = mpc_ioapic_id(ioapic_idx); + } + } /* - * Update the ID register according to the right value - * from the MPC table if they are different. + * Update the ID register according to the right value from + * the MPC table if they are different. */ if (mpc_ioapic_id(ioapic_idx) == reg_00.bits.ID) continue; - apic_printk(APIC_VERBOSE, KERN_INFO - "...changing IO-APIC physical APIC ID to %d ...", - mpc_ioapic_id(ioapic_idx)); + apic_pr_verbose("...changing IO-APIC physical APIC ID to %d ...", + mpc_ioapic_id(ioapic_idx)); reg_00.bits.ID = mpc_ioapic_id(ioapic_idx); - raw_spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(ioapic_idx, 0, reg_00.raw); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); - - /* - * Sanity check - */ - raw_spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(ioapic_idx, 0); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); + scoped_guard (raw_spinlock_irqsave, &ioapic_lock) { + io_apic_write(ioapic_idx, 0, reg_00.raw); + reg_00.raw = io_apic_read(ioapic_idx, 0); + } + /* Sanity check */ if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx)) pr_cont("could not set ID!\n"); else - apic_printk(APIC_VERBOSE, " ok.\n"); + apic_pr_verbose(" ok.\n"); } } @@ -1596,8 +1488,7 @@ static void __init delay_with_tsc(void) do { rep_nop(); now = rdtsc(); - } while ((now - start) < 40000000000ULL / HZ && - time_before_eq(jiffies, end)); + } while ((now - start) < 40000000000ULL / HZ && time_before_eq(jiffies, end)); } static void __init delay_without_tsc(void) @@ -1658,36 +1549,29 @@ static int __init timer_irq_works(void) * so we 'resend' these IRQs via IPIs, to the same CPU. It's much * better to do it this way as thus we do not have to be aware of * 'pending' interrupts in the IRQ path, except at this point. - */ -/* - * Edge triggered needs to resend any interrupt - * that was delayed but this is now handled in the device - * independent code. 
- */ - -/* - * Starting up a edge-triggered IO-APIC interrupt is - * nasty - we need to make sure that we get the edge. - * If it is already asserted for some reason, we need - * return 1 to indicate that is was pending. * - * This is not complete - we should be able to fake - * an edge even if it isn't on the 8259A... + * + * Edge triggered needs to resend any interrupt that was delayed but this + * is now handled in the device independent code. + * + * Starting up a edge-triggered IO-APIC interrupt is nasty - we need to + * make sure that we get the edge. If it is already asserted for some + * reason, we need return 1 to indicate that is was pending. + * + * This is not complete - we should be able to fake an edge even if it + * isn't on the 8259A... */ static unsigned int startup_ioapic_irq(struct irq_data *data) { int was_pending = 0, irq = data->irq; - unsigned long flags; - raw_spin_lock_irqsave(&ioapic_lock, flags); + guard(raw_spinlock_irqsave)(&ioapic_lock); if (irq < nr_legacy_irqs()) { legacy_pic->mask(irq); if (legacy_pic->irq_pending(irq)) was_pending = 1; } __unmask_ioapic(data->chip_data); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); - return was_pending; } @@ -1697,9 +1581,8 @@ atomic_t irq_mis_count; static bool io_apic_level_ack_pending(struct mp_chip_data *data) { struct irq_pin_list *entry; - unsigned long flags; - raw_spin_lock_irqsave(&ioapic_lock, flags); + guard(raw_spinlock_irqsave)(&ioapic_lock); for_each_irq_pin(entry, data->irq_2_pin) { struct IO_APIC_route_entry e; int pin; @@ -1707,13 +1590,9 @@ static bool io_apic_level_ack_pending(struct mp_chip_data *data) pin = entry->pin; e.w1 = io_apic_read(entry->apic, 0x10 + pin*2); /* Is the remote IRR bit set? */ - if (e.irr) { - raw_spin_unlock_irqrestore(&ioapic_lock, flags); + if (e.irr) return true; - } } - raw_spin_unlock_irqrestore(&ioapic_lock, flags); - return false; } @@ -1731,7 +1610,8 @@ static inline bool ioapic_prepare_move(struct irq_data *data) static inline void ioapic_finish_move(struct irq_data *data, bool moveit) { if (unlikely(moveit)) { - /* Only migrate the irq if the ack has been received. + /* + * Only migrate the irq if the ack has been received. * * On rare occasions the broadcast level triggered ack gets * delayed going to ioapics, and if we reprogram the @@ -1914,18 +1794,16 @@ static void ioapic_configure_entry(struct irq_data *irqd) __ioapic_write_entry(entry->apic, entry->pin, mpd->entry); } -static int ioapic_set_affinity(struct irq_data *irq_data, - const struct cpumask *mask, bool force) +static int ioapic_set_affinity(struct irq_data *irq_data, const struct cpumask *mask, bool force) { struct irq_data *parent = irq_data->parent_data; - unsigned long flags; int ret; ret = parent->chip->irq_set_affinity(parent, mask, force); - raw_spin_lock_irqsave(&ioapic_lock, flags); + + guard(raw_spinlock_irqsave)(&ioapic_lock); if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) ioapic_configure_entry(irq_data); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); return ret; } @@ -1944,9 +1822,8 @@ static int ioapic_set_affinity(struct irq_data *irq_data, * * Verify that the corresponding Remote-IRR bits are clear. 
*/ -static int ioapic_irq_get_chip_state(struct irq_data *irqd, - enum irqchip_irq_state which, - bool *state) +static int ioapic_irq_get_chip_state(struct irq_data *irqd, enum irqchip_irq_state which, + bool *state) { struct mp_chip_data *mcd = irqd->chip_data; struct IO_APIC_route_entry rentry; @@ -1956,7 +1833,8 @@ static int ioapic_irq_get_chip_state(struct irq_data *irqd, return -EINVAL; *state = false; - raw_spin_lock(&ioapic_lock); + + guard(raw_spinlock)(&ioapic_lock); for_each_irq_pin(p, mcd->irq_2_pin) { rentry = __ioapic_read_entry(p->apic, p->pin); /* @@ -1970,7 +1848,6 @@ static int ioapic_irq_get_chip_state(struct irq_data *irqd, break; } } - raw_spin_unlock(&ioapic_lock); return 0; } @@ -1984,7 +1861,7 @@ static struct irq_chip ioapic_chip __read_mostly = { .irq_set_affinity = ioapic_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_get_irqchip_state = ioapic_irq_get_chip_state, - .flags = IRQCHIP_SKIP_SET_WAKE | + .flags = IRQCHIP_SKIP_SET_WAKE | IRQCHIP_MOVE_DEFERRED | IRQCHIP_AFFINITY_PRE_STARTUP, }; @@ -2011,14 +1888,13 @@ static inline void init_IO_APIC_traps(void) cfg = irq_cfg(irq); if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) { /* - * Hmm.. We don't have an entry for this, - * so default to an old-fashioned 8259 - * interrupt if we can.. + * Hmm.. We don't have an entry for this, so + * default to an old-fashioned 8259 interrupt if we + * can. Otherwise set the dummy interrupt chip. */ if (irq < nr_legacy_irqs()) legacy_pic->make_irq(irq); else - /* Strange. Oh, well.. */ irq_set_chip(irq, &no_irq_chip); } } @@ -2027,20 +1903,17 @@ static inline void init_IO_APIC_traps(void) /* * The local APIC irq-chip implementation: */ - static void mask_lapic_irq(struct irq_data *data) { - unsigned long v; + unsigned long v = apic_read(APIC_LVT0); - v = apic_read(APIC_LVT0); apic_write(APIC_LVT0, v | APIC_LVT_MASKED); } static void unmask_lapic_irq(struct irq_data *data) { - unsigned long v; + unsigned long v = apic_read(APIC_LVT0); - v = apic_read(APIC_LVT0); apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); } @@ -2059,8 +1932,7 @@ static struct irq_chip lapic_chip __read_mostly = { static void lapic_register_intr(int irq) { irq_clear_status_flags(irq, IRQ_LEVEL); - irq_set_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, - "edge"); + irq_set_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, "edge"); } /* @@ -2072,9 +1944,9 @@ static void lapic_register_intr(int irq) */ static inline void __init unlock_ExtINT_logic(void) { - int apic, pin, i; - struct IO_APIC_route_entry entry0, entry1; unsigned char save_control, save_freq_select; + struct IO_APIC_route_entry entry0, entry1; + int apic, pin, i; u32 apic_id; pin = find_isa_irq_pin(8, mp_INT); @@ -2134,10 +2006,10 @@ static int __init disable_timer_pin_setup(char *arg) } early_param("disable_timer_pin_1", disable_timer_pin_setup); -static int mp_alloc_timer_irq(int ioapic, int pin) +static int __init mp_alloc_timer_irq(int ioapic, int pin) { - int irq = -1; struct irq_domain *domain = mp_ioapic_irqdomain(ioapic); + int irq = -1; if (domain) { struct irq_alloc_info info; @@ -2145,21 +2017,36 @@ static int mp_alloc_timer_irq(int ioapic, int pin) ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 0, 0); info.devid = mpc_ioapic_id(ioapic); info.ioapic.pin = pin; - mutex_lock(&ioapic_mutex); + guard(mutex)(&ioapic_mutex); irq = alloc_isa_irq_from_domain(domain, 0, ioapic, pin, &info); - mutex_unlock(&ioapic_mutex); } return irq; } +static void __init replace_pin_at_irq_node(struct mp_chip_data *data, int node, + int 
oldapic, int oldpin, + int newapic, int newpin) +{ + struct irq_pin_list *entry; + + for_each_irq_pin(entry, data->irq_2_pin) { + if (entry->apic == oldapic && entry->pin == oldpin) { + entry->apic = newapic; + entry->pin = newpin; + return; + } + } + + /* Old apic/pin didn't exist, so just add a new one */ + add_pin_to_irq_node(data, node, newapic, newpin); +} + /* * This code may look a bit paranoid, but it's supposed to cooperate with * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ * is so screwy. Thanks to Brian Perkins for testing/hacking this beast * fanatically on his truly buggy board. - * - * FIXME: really need to revamp this for all platforms. */ static inline void __init check_timer(void) { @@ -2197,9 +2084,8 @@ static inline void __init check_timer(void) pin2 = ioapic_i8259.pin; apic2 = ioapic_i8259.apic; - apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X " - "apic1=%d pin1=%d apic2=%d pin2=%d\n", - cfg->vector, apic1, pin1, apic2, pin2); + pr_info("..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", + cfg->vector, apic1, pin1, apic2, pin2); /* * Some BIOS writers are clueless and report the ExtINTA @@ -2209,7 +2095,7 @@ static inline void __init check_timer(void) * 8259A. */ if (pin1 == -1) { - panic_if_irq_remap("BIOS bug: timer not connected to IO-APIC"); + panic_if_irq_remap(FW_BUG "Timer not connected to IO-APIC"); pin1 = pin2; apic1 = apic2; no_pin1 = 1; @@ -2243,13 +2129,10 @@ static inline void __init check_timer(void) panic_if_irq_remap("timer doesn't work through Interrupt-remapped IO-APIC"); clear_IO_APIC_pin(apic1, pin1); if (!no_pin1) - apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " - "8254 timer not connected to IO-APIC\n"); + pr_err("..MP-BIOS bug: 8254 timer not connected to IO-APIC\n"); - apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer " - "(IRQ0) through the 8259A ...\n"); - apic_printk(APIC_QUIET, KERN_INFO - "..... (found apic %d pin %d) ...\n", apic2, pin2); + pr_info("...trying to set up timer (IRQ0) through the 8259A ...\n"); + pr_info("..... (found apic %d pin %d) ...\n", apic2, pin2); /* * legacy devices should be connected to IO APIC #0 */ @@ -2258,7 +2141,7 @@ static inline void __init check_timer(void) irq_domain_activate_irq(irq_data, false); legacy_pic->unmask(0); if (timer_irq_works()) { - apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); + pr_info("....... works.\n"); goto out; } /* @@ -2266,26 +2149,24 @@ static inline void __init check_timer(void) */ legacy_pic->mask(0); clear_IO_APIC_pin(apic2, pin2); - apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); + pr_info("....... failed.\n"); } - apic_printk(APIC_QUIET, KERN_INFO - "...trying to set up timer as Virtual Wire IRQ...\n"); + pr_info("...trying to set up timer as Virtual Wire IRQ...\n"); lapic_register_intr(0); apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ legacy_pic->unmask(0); if (timer_irq_works()) { - apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); + pr_info("..... works.\n"); goto out; } legacy_pic->mask(0); apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); - apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); + pr_info("..... 
failed.\n"); - apic_printk(APIC_QUIET, KERN_INFO - "...trying to set up timer as ExtINT IRQ...\n"); + pr_info("...trying to set up timer as ExtINT IRQ...\n"); legacy_pic->init(0); legacy_pic->make_irq(0); @@ -2295,14 +2176,15 @@ static inline void __init check_timer(void) unlock_ExtINT_logic(); if (timer_irq_works()) { - apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); + pr_info("..... works.\n"); goto out; } - apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); - if (apic_is_x2apic_enabled()) - apic_printk(APIC_QUIET, KERN_INFO - "Perhaps problem with the pre-enabled x2apic mode\n" - "Try booting with x2apic and interrupt-remapping disabled in the bios.\n"); + + pr_info("..... failed :\n"); + if (apic_is_x2apic_enabled()) { + pr_info("Perhaps problem with the pre-enabled x2apic mode\n" + "Try booting with x2apic and interrupt-remapping disabled in the bios.\n"); + } panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " "report. Then try booting with the 'noapic' option.\n"); out: @@ -2330,11 +2212,11 @@ out: static int mp_irqdomain_create(int ioapic) { - struct irq_domain *parent; + struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic); int hwirqs = mp_ioapic_pin_count(ioapic); struct ioapic *ip = &ioapics[ioapic]; struct ioapic_domain_cfg *cfg = &ip->irqdomain_cfg; - struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic); + struct irq_domain *parent; struct fwnode_handle *fn; struct irq_fwspec fwspec; @@ -2354,7 +2236,7 @@ static int mp_irqdomain_create(int ioapic) fwspec.param_count = 1; fwspec.param[0] = mpc_ioapic_id(ioapic); - parent = irq_find_matching_fwspec(&fwspec, DOMAIN_BUS_ANY); + parent = irq_find_matching_fwspec(&fwspec, DOMAIN_BUS_GENERIC_MSI); if (!parent) { if (!cfg->dev) irq_domain_free_fwnode(fn); @@ -2370,10 +2252,8 @@ static int mp_irqdomain_create(int ioapic) return -ENOMEM; } - if (cfg->type == IOAPIC_DOMAIN_LEGACY || - cfg->type == IOAPIC_DOMAIN_STRICT) - ioapic_dynirq_base = max(ioapic_dynirq_base, - gsi_cfg->gsi_end + 1); + if (cfg->type == IOAPIC_DOMAIN_LEGACY || cfg->type == IOAPIC_DOMAIN_STRICT) + ioapic_dynirq_base = max(ioapic_dynirq_base, gsi_cfg->gsi_end + 1); return 0; } @@ -2400,13 +2280,11 @@ void __init setup_IO_APIC(void) io_apic_irqs = nr_legacy_irqs() ? ~PIC_IRQS : ~0UL; - apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); + apic_pr_verbose("ENABLING IO-APIC IRQs\n"); for_each_ioapic(ioapic) BUG_ON(mp_irqdomain_create(ioapic)); - /* - * Set up IO-APIC IRQ routing. - */ + /* Set up IO-APIC IRQ routing. 
*/ x86_init.mpparse.setup_ioapic_ids(); sync_Arb_IDs(); @@ -2420,16 +2298,14 @@ void __init setup_IO_APIC(void) static void resume_ioapic_id(int ioapic_idx) { - unsigned long flags; union IO_APIC_reg_00 reg_00; - raw_spin_lock_irqsave(&ioapic_lock, flags); + guard(raw_spinlock_irqsave)(&ioapic_lock); reg_00.raw = io_apic_read(ioapic_idx, 0); if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx)) { reg_00.bits.ID = mpc_ioapic_id(ioapic_idx); io_apic_write(ioapic_idx, 0, reg_00.raw); } - raw_spin_unlock_irqrestore(&ioapic_lock, flags); } static void ioapic_resume(void) @@ -2443,8 +2319,8 @@ static void ioapic_resume(void) } static struct syscore_ops ioapic_syscore_ops = { - .suspend = save_ioapic_entries, - .resume = ioapic_resume, + .suspend = save_ioapic_entries, + .resume = ioapic_resume, }; static int __init ioapic_init_ops(void) @@ -2459,15 +2335,13 @@ device_initcall(ioapic_init_ops); static int io_apic_get_redir_entries(int ioapic) { union IO_APIC_reg_01 reg_01; - unsigned long flags; - raw_spin_lock_irqsave(&ioapic_lock, flags); + guard(raw_spinlock_irqsave)(&ioapic_lock); reg_01.raw = io_apic_read(ioapic, 1); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); - /* The register returns the maximum index redir index - * supported, which is one less than the total number of redir - * entries. + /* + * The register returns the maximum index redir index supported, + * which is one less than the total number of redir entries. */ return reg_01.bits.entries + 1; } @@ -2494,93 +2368,71 @@ unsigned int arch_dynirq_lower_bound(unsigned int from) #ifdef CONFIG_X86_32 static int io_apic_get_unique_id(int ioapic, int apic_id) { + static DECLARE_BITMAP(apic_id_map, MAX_LOCAL_APIC); + const u32 broadcast_id = 0xF; union IO_APIC_reg_00 reg_00; - static physid_mask_t apic_id_map = PHYSID_MASK_NONE; - physid_mask_t tmp; - unsigned long flags; int i = 0; - /* - * The P4 platform supports up to 256 APIC IDs on two separate APIC - * buses (one for LAPICs, one for IOAPICs), where predecessors only - * supports up to 16 on one shared APIC bus. - * - * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full - * advantage of new APIC bus architecture. - */ + /* Initialize the ID map */ + if (bitmap_empty(apic_id_map, MAX_LOCAL_APIC)) + copy_phys_cpu_present_map(apic_id_map); - if (physids_empty(apic_id_map)) - apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map); - - raw_spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(ioapic, 0); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); + scoped_guard (raw_spinlock_irqsave, &ioapic_lock) + reg_00.raw = io_apic_read(ioapic, 0); - if (apic_id >= get_physical_broadcast()) { - printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " - "%d\n", ioapic, apic_id, reg_00.bits.ID); + if (apic_id >= broadcast_id) { + pr_warn("IOAPIC[%d]: Invalid apic_id %d, trying %d\n", + ioapic, apic_id, reg_00.bits.ID); apic_id = reg_00.bits.ID; } - /* - * Every APIC in a system must have a unique ID or we get lots of nice - * 'stuck on smp_invalidate_needed IPI wait' messages. 
- */ - if (apic->check_apicid_used(&apic_id_map, apic_id)) { - - for (i = 0; i < get_physical_broadcast(); i++) { - if (!apic->check_apicid_used(&apic_id_map, i)) + /* Every APIC in a system must have a unique ID */ + if (test_bit(apic_id, apic_id_map)) { + for (i = 0; i < broadcast_id; i++) { + if (!test_bit(i, apic_id_map)) break; } - if (i == get_physical_broadcast()) + if (i == broadcast_id) panic("Max apic_id exceeded!\n"); - printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " - "trying %d\n", ioapic, apic_id, i); - + pr_warn("IOAPIC[%d]: apic_id %d already used, trying %d\n", ioapic, apic_id, i); apic_id = i; } - physid_set_mask_of_physid(apic_id, &tmp); - physids_or(apic_id_map, apic_id_map, tmp); + set_bit(apic_id, apic_id_map); if (reg_00.bits.ID != apic_id) { reg_00.bits.ID = apic_id; - raw_spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(ioapic, 0, reg_00.raw); - reg_00.raw = io_apic_read(ioapic, 0); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); + scoped_guard (raw_spinlock_irqsave, &ioapic_lock) { + io_apic_write(ioapic, 0, reg_00.raw); + reg_00.raw = io_apic_read(ioapic, 0); + } /* Sanity check */ if (reg_00.bits.ID != apic_id) { - pr_err("IOAPIC[%d]: Unable to change apic_id!\n", - ioapic); + pr_err("IOAPIC[%d]: Unable to change apic_id!\n", ioapic); return -1; } } - apic_printk(APIC_VERBOSE, KERN_INFO - "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); + apic_pr_verbose("IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); return apic_id; } static u8 io_apic_unique_id(int idx, u8 id) { - if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && - !APIC_XAPIC(boot_cpu_apic_version)) + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && !APIC_XAPIC(boot_cpu_apic_version)) return io_apic_get_unique_id(idx, id); - else - return id; + return id; } #else static u8 io_apic_unique_id(int idx, u8 id) { union IO_APIC_reg_00 reg_00; DECLARE_BITMAP(used, 256); - unsigned long flags; u8 new_id; int i; @@ -2596,26 +2448,23 @@ static u8 io_apic_unique_id(int idx, u8 id) * Read the current id from the ioapic and keep it if * available. */ - raw_spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(idx, 0); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); + scoped_guard (raw_spinlock_irqsave, &ioapic_lock) + reg_00.raw = io_apic_read(idx, 0); + new_id = reg_00.bits.ID; if (!test_bit(new_id, used)) { - apic_printk(APIC_VERBOSE, KERN_INFO - "IOAPIC[%d]: Using reg apic_id %d instead of %d\n", - idx, new_id, id); + apic_pr_verbose("IOAPIC[%d]: Using reg apic_id %d instead of %d\n", + idx, new_id, id); return new_id; } - /* - * Get the next free id and write it to the ioapic. - */ + /* Get the next free id and write it to the ioapic. 
*/ new_id = find_first_zero_bit(used, 256); reg_00.bits.ID = new_id; - raw_spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(idx, 0, reg_00.raw); - reg_00.raw = io_apic_read(idx, 0); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); + scoped_guard (raw_spinlock_irqsave, &ioapic_lock) { + io_apic_write(idx, 0, reg_00.raw); + reg_00.raw = io_apic_read(idx, 0); + } /* Sanity check */ BUG_ON(reg_00.bits.ID != new_id); @@ -2625,12 +2474,10 @@ static u8 io_apic_unique_id(int idx, u8 id) static int io_apic_get_version(int ioapic) { - union IO_APIC_reg_01 reg_01; - unsigned long flags; + union IO_APIC_reg_01 reg_01; - raw_spin_lock_irqsave(&ioapic_lock, flags); + guard(raw_spinlock_irqsave)(&ioapic_lock); reg_01.raw = io_apic_read(ioapic, 1); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); return reg_01.bits.version; } @@ -2645,8 +2492,8 @@ static struct resource *ioapic_resources; static struct resource * __init ioapic_setup_resources(void) { - unsigned long n; struct resource *res; + unsigned long n; char *mem; int i; @@ -2656,9 +2503,7 @@ static struct resource * __init ioapic_setup_resources(void) n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); n *= nr_ioapics; - mem = memblock_alloc(n, SMP_CACHE_BYTES); - if (!mem) - panic("%s: Failed to allocate %lu bytes\n", __func__, n); + mem = memblock_alloc_or_panic(n, SMP_CACHE_BYTES); res = (void *)mem; mem += sizeof(struct resource) * nr_ioapics; @@ -2706,9 +2551,7 @@ void __init io_apic_init_mappings(void) ioapic_phys = mpc_ioapic_addr(i); #ifdef CONFIG_X86_32 if (!ioapic_phys) { - printk(KERN_ERR - "WARNING: bogus zero IO-APIC " - "address found in MPTABLE, " + pr_err("WARNING: bogus zero IO-APIC address found in MPTABLE, " "disabling IO/APIC support!\n"); smp_found_config = 0; ioapic_is_disabled = true; @@ -2719,17 +2562,13 @@ void __init io_apic_init_mappings(void) #ifdef CONFIG_X86_32 fake_ioapic_page: #endif - ioapic_phys = (unsigned long)memblock_alloc(PAGE_SIZE, + ioapic_phys = (unsigned long)memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); - if (!ioapic_phys) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); ioapic_phys = __pa(ioapic_phys); } io_apic_set_fixmap(idx, ioapic_phys); - apic_printk(APIC_VERBOSE, "mapped IOAPIC to %08lx (%08lx)\n", - __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK), - ioapic_phys); + apic_pr_verbose("mapped IOAPIC to %08lx (%08lx)\n", + __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK), ioapic_phys); idx++; ioapic_res->start = ioapic_phys; @@ -2740,13 +2579,12 @@ fake_ioapic_page: void __init ioapic_insert_resources(void) { - int i; struct resource *r = ioapic_resources; + int i; if (!r) { if (nr_ioapics > 0) - printk(KERN_ERR - "IO APIC resources couldn't be allocated.\n"); + pr_err("IO APIC resources couldn't be allocated.\n"); return; } @@ -2766,11 +2604,12 @@ int mp_find_ioapic(u32 gsi) /* Find the IOAPIC that manages this GSI. 
*/ for_each_ioapic(i) { struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(i); + if (gsi >= gsi_cfg->gsi_base && gsi <= gsi_cfg->gsi_end) return i; } - printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); + pr_err("ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); return -1; } @@ -2809,12 +2648,10 @@ static int bad_ioapic_register(int idx) static int find_free_ioapic_entry(void) { - int idx; - - for (idx = 0; idx < MAX_IO_APICS; idx++) + for (int idx = 0; idx < MAX_IO_APICS; idx++) { if (ioapics[idx].nr_registers == 0) return idx; - + } return MAX_IO_APICS; } @@ -2825,8 +2662,7 @@ static int find_free_ioapic_entry(void) * @gsi_base: base of GSI associated with the IOAPIC * @cfg: configuration information for the IOAPIC */ -int mp_register_ioapic(int id, u32 address, u32 gsi_base, - struct ioapic_domain_cfg *cfg) +int mp_register_ioapic(int id, u32 address, u32 gsi_base, struct ioapic_domain_cfg *cfg) { bool hotplug = !!ioapic_initialized; struct mp_ioapic_gsi *gsi_cfg; @@ -2837,12 +2673,13 @@ int mp_register_ioapic(int id, u32 address, u32 gsi_base, pr_warn("Bogus (zero) I/O APIC address found, skipping!\n"); return -EINVAL; } - for_each_ioapic(ioapic) + + for_each_ioapic(ioapic) { if (ioapics[ioapic].mp_config.apicaddr == address) { - pr_warn("address 0x%x conflicts with IOAPIC%d\n", - address, ioapic); + pr_warn("address 0x%x conflicts with IOAPIC%d\n", address, ioapic); return -EEXIST; } + } idx = find_free_ioapic_entry(); if (idx >= MAX_IO_APICS) { @@ -2877,8 +2714,7 @@ int mp_register_ioapic(int id, u32 address, u32 gsi_base, (gsi_end >= gsi_cfg->gsi_base && gsi_end <= gsi_cfg->gsi_end)) { pr_warn("GSI range [%u-%u] for new IOAPIC conflicts with GSI[%u-%u]\n", - gsi_base, gsi_end, - gsi_cfg->gsi_base, gsi_cfg->gsi_end); + gsi_base, gsi_end, gsi_cfg->gsi_base, gsi_cfg->gsi_end); clear_fixmap(FIX_IO_APIC_BASE_0 + idx); return -ENOSPC; } @@ -2912,8 +2748,7 @@ int mp_register_ioapic(int id, u32 address, u32 gsi_base, ioapics[idx].nr_registers = entries; pr_info("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, GSI %d-%d\n", - idx, mpc_ioapic_id(idx), - mpc_ioapic_ver(idx), mpc_ioapic_addr(idx), + idx, mpc_ioapic_id(idx), mpc_ioapic_ver(idx), mpc_ioapic_addr(idx), gsi_cfg->gsi_base, gsi_cfg->gsi_end); return 0; @@ -2924,11 +2759,13 @@ int mp_unregister_ioapic(u32 gsi_base) int ioapic, pin; int found = 0; - for_each_ioapic(ioapic) + for_each_ioapic(ioapic) { if (ioapics[ioapic].gsi_config.gsi_base == gsi_base) { found = 1; break; } + } + if (!found) { pr_warn("can't find IOAPIC for GSI %d\n", gsi_base); return -ENODEV; @@ -2942,8 +2779,7 @@ int mp_unregister_ioapic(u32 gsi_base) if (irq >= 0) { data = irq_get_chip_data(irq); if (data && data->count) { - pr_warn("pin%d on IOAPIC%d is still in use.\n", - pin, ioapic); + pr_warn("pin%d on IOAPIC%d is still in use.\n", pin, ioapic); return -EBUSY; } } @@ -2978,8 +2814,7 @@ static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data, if (info && info->ioapic.valid) { data->is_level = info->ioapic.is_level; data->active_low = info->ioapic.active_low; - } else if (__acpi_get_override_irq(gsi, &data->is_level, - &data->active_low) < 0) { + } else if (__acpi_get_override_irq(gsi, &data->is_level, &data->active_low) < 0) { /* PCI interrupts are always active low level triggered. 
*/ data->is_level = true; data->active_low = true; @@ -3037,10 +2872,8 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, return -ENOMEM; ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info); - if (ret < 0) { - kfree(data); - return ret; - } + if (ret < 0) + goto free_data; INIT_LIST_HEAD(&data->irq_2_pin); irq_data->hwirq = info->ioapic.pin; @@ -3049,7 +2882,10 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, irq_data->chip_data = data; mp_irqdomain_get_attr(mp_pin_to_gsi(ioapic, pin), data, info); - add_pin_to_irq_node(data, ioapic_alloc_attr_node(info), ioapic, pin); + if (!add_pin_to_irq_node(data, ioapic_alloc_attr_node(info), ioapic, pin)) { + ret = -ENOMEM; + goto free_irqs; + } mp_preconfigure_entry(data); mp_register_handler(virq, data->is_level); @@ -3059,11 +2895,15 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, legacy_pic->mask(virq); local_irq_restore(flags); - apic_printk(APIC_VERBOSE, KERN_DEBUG - "IOAPIC[%d]: Preconfigured routing entry (%d-%d -> IRQ %d Level:%i ActiveLow:%i)\n", - ioapic, mpc_ioapic_id(ioapic), pin, virq, - data->is_level, data->active_low); + apic_pr_verbose("IOAPIC[%d]: Preconfigured routing entry (%d-%d -> IRQ %d Level:%i ActiveLow:%i)\n", + ioapic, mpc_ioapic_id(ioapic), pin, virq, data->is_level, data->active_low); return 0; + +free_irqs: + irq_domain_free_irqs_parent(domain, virq, nr_irqs); +free_data: + kfree(data); + return ret; } void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq, @@ -3076,22 +2916,17 @@ void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq, irq_data = irq_domain_get_irq_data(domain, virq); if (irq_data && irq_data->chip_data) { data = irq_data->chip_data; - __remove_pin_from_irq(data, mp_irqdomain_ioapic_idx(domain), - (int)irq_data->hwirq); + __remove_pin_from_irq(data, mp_irqdomain_ioapic_idx(domain), (int)irq_data->hwirq); WARN_ON(!list_empty(&data->irq_2_pin)); kfree(irq_data->chip_data); } irq_domain_free_irqs_top(domain, virq, nr_irqs); } -int mp_irqdomain_activate(struct irq_domain *domain, - struct irq_data *irq_data, bool reserve) +int mp_irqdomain_activate(struct irq_domain *domain, struct irq_data *irq_data, bool reserve) { - unsigned long flags; - - raw_spin_lock_irqsave(&ioapic_lock, flags); + guard(raw_spinlock_irqsave)(&ioapic_lock); ioapic_configure_entry(irq_data); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); return 0; } @@ -3099,8 +2934,7 @@ void mp_irqdomain_deactivate(struct irq_domain *domain, struct irq_data *irq_data) { /* It won't be called for IRQ with multiple IOAPIC pins associated */ - ioapic_mask_entry(mp_irqdomain_ioapic_idx(domain), - (int)irq_data->hwirq); + ioapic_mask_entry(mp_irqdomain_ioapic_idx(domain), (int)irq_data->hwirq); } int mp_irqdomain_ioapic_idx(struct irq_domain *domain) diff --git a/arch/x86/kernel/apic/local.h b/arch/x86/kernel/apic/local.h index 9ea6186ea88c..842fe28496be 100644 --- a/arch/x86/kernel/apic/local.h +++ b/arch/x86/kernel/apic/local.h @@ -16,8 +16,6 @@ /* X2APIC */ void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest); u32 x2apic_get_apic_id(u32 id); -u32 x2apic_set_apic_id(u32 id); -u32 x2apic_phys_pkg_id(u32 initial_apicid, int index_msb); void x2apic_send_IPI_all(int vector); void x2apic_send_IPI_allbutself(int vector); @@ -63,9 +61,6 @@ void default_send_IPI_allbutself(int vector); void default_send_IPI_all(int vector); void default_send_IPI_self(int vector); -bool default_apic_id_registered(void); -bool 
default_check_apicid_used(physid_mask_t *map, u32 apicid); - #ifdef CONFIG_X86_32 void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, int vector); void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, int vector); diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index d9651f15ae4f..66bc5d3e79db 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -184,7 +184,6 @@ static int x86_msi_prepare(struct irq_domain *domain, struct device *dev, alloc->type = X86_IRQ_ALLOC_TYPE_PCI_MSI; return 0; case DOMAIN_BUS_PCI_DEVICE_MSIX: - case DOMAIN_BUS_PCI_DEVICE_IMS: alloc->type = X86_IRQ_ALLOC_TYPE_PCI_MSIX; return 0; default: @@ -215,6 +214,7 @@ static bool x86_init_dev_msi_info(struct device *dev, struct irq_domain *domain, if (WARN_ON_ONCE(domain != real_parent)) return false; info->chip->irq_set_affinity = msi_set_affinity; + info->chip->flags |= IRQCHIP_MOVE_DEFERRED; break; case DOMAIN_BUS_DMAR: case DOMAIN_BUS_AMDVI: @@ -229,10 +229,6 @@ static bool x86_init_dev_msi_info(struct device *dev, struct irq_domain *domain, case DOMAIN_BUS_PCI_DEVICE_MSI: case DOMAIN_BUS_PCI_DEVICE_MSIX: break; - case DOMAIN_BUS_PCI_DEVICE_IMS: - if (!(pops->supported_flags & MSI_FLAG_PCI_IMS)) - return false; - break; default: WARN_ON_ONCE(1); return false; @@ -320,7 +316,7 @@ static struct irq_chip dmar_msi_controller = { .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_compose_msi_msg = dmar_msi_compose_msg, .irq_write_msi_msg = dmar_msi_write_msg, - .flags = IRQCHIP_SKIP_SET_WAKE | + .flags = IRQCHIP_SKIP_SET_WAKE | IRQCHIP_MOVE_DEFERRED | IRQCHIP_AFFINITY_PRE_STARTUP, }; diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 5eb3fbe472da..f75ee345c02d 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -18,11 +18,6 @@ #include "local.h" -static u32 default_phys_pkg_id(u32 cpuid_apic, int index_msb) -{ - return cpuid_apic >> index_msb; -} - static u32 default_get_apic_id(u32 x) { unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR)); @@ -43,18 +38,13 @@ static struct apic apic_default __ro_after_init = { .name = "default", .probe = probe_default, - .apic_id_registered = default_apic_id_registered, - .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = true, .disable_esr = 0, - .check_apicid_used = default_check_apicid_used, .init_apic_ldr = default_init_apic_ldr, - .ioapic_phys_id_map = default_ioapic_phys_id_map, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .phys_pkg_id = default_phys_pkg_id, .max_apic_id = 0xFE, .get_apic_id = default_get_apic_id, diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 319448d87b99..736f62812f5c 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -712,8 +712,8 @@ int __init arch_probe_nr_irqs(void) { int nr; - if (nr_irqs > (NR_VECTORS * nr_cpu_ids)) - nr_irqs = NR_VECTORS * nr_cpu_ids; + if (irq_get_nr_irqs() > NR_VECTORS * nr_cpu_ids) + irq_set_nr_irqs(NR_VECTORS * nr_cpu_ids); nr = (gsi_top + nr_legacy_irqs()) + 8 * nr_cpu_ids; #if defined(CONFIG_PCI_MSI) @@ -725,8 +725,8 @@ int __init arch_probe_nr_irqs(void) else nr += gsi_top * 16; #endif - if (nr < nr_irqs) - nr_irqs = nr; + if (nr < irq_get_nr_irqs()) + irq_set_nr_irqs(nr); /* * We don't know if PIC is present at this point so we need to do @@ -738,8 +738,8 @@ int __init arch_probe_nr_irqs(void) void lapic_assign_legacy_vector(unsigned int irq, bool replace) { /* - * Use assign system here so it 
wont get accounted as allocated - * and moveable in the cpu hotplug check and it prevents managed + * Use assign system here so it won't get accounted as allocated + * and movable in the cpu hotplug check and it prevents managed * irq reservation from touching it. */ irq_matrix_assign_system(vector_matrix, ISA_IRQ_VECTOR(irq), replace); @@ -965,7 +965,7 @@ static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr) lockdep_assert_held(&vector_lock); hlist_for_each_entry_safe(apicd, tmp, &cl->head, clist) { - unsigned int irr, vector = apicd->prev_vector; + unsigned int vector = apicd->prev_vector; /* * Paranoia: Check if the vector that needs to be cleaned @@ -979,8 +979,7 @@ static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr) * fixup_irqs() was just called to scan IRR for set bits and * forward them to new destination CPUs via IPIs. */ - irr = check_irr ? apic_read(APIC_IRR + (vector / 32 * 0x10)) : 0; - if (irr & (1U << (vector % 32))) { + if (check_irr && is_vector_pending(vector)) { pr_warn_once("Moved interrupt pending in old target APIC %u\n", apicd->irq); rearm = true; continue; @@ -1036,7 +1035,8 @@ static void __vector_schedule_cleanup(struct apic_chip_data *apicd) add_timer_on(&cl->timer, cpu); } } else { - apicd->prev_vector = 0; + pr_warn("IRQ %u schedule cleanup for offline CPU %u\n", apicd->irq, cpu); + free_moved_vector(apicd); } raw_spin_unlock(&vector_lock); } @@ -1073,6 +1073,7 @@ void irq_complete_move(struct irq_cfg *cfg) */ void irq_force_complete_move(struct irq_desc *desc) { + unsigned int cpu = smp_processor_id(); struct apic_chip_data *apicd; struct irq_data *irqd; unsigned int vector; @@ -1097,10 +1098,11 @@ void irq_force_complete_move(struct irq_desc *desc) goto unlock; /* - * If prev_vector is empty, no action required. + * If prev_vector is empty or the descriptor is neither currently + * nor previously on the outgoing CPU no action required. 
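The io_apic.c and vector.c hunks above replace open-coded raw_spin_lock_irqsave()/raw_spin_unlock_irqrestore() pairs with guard(raw_spinlock_irqsave)(&ioapic_lock) and scoped_guard(...), which drop the lock automatically when the scope ends, including on early returns. As a rough standalone sketch of the idea only, not the kernel's linux/cleanup.h machinery, the pattern builds on the compiler's cleanup attribute; the pthread mutex and DEMO_GUARD macro below are stand-ins:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER;

/* Cleanup callback: runs when the guarded variable goes out of scope,
 * whichever return path is taken. */
static void demo_unlock(pthread_mutex_t **lock)
{
        pthread_mutex_unlock(*lock);
}

/* Loose analogue of guard(): take the lock now, release it when the
 * hidden local is destroyed at end of scope (GNU C cleanup attribute,
 * which the kernel's guard()/scoped_guard() also build on).
 * One use per scope in this toy because of the fixed variable name. */
#define DEMO_GUARD(lock) \
        pthread_mutex_t *demo_guard __attribute__((cleanup(demo_unlock))) = \
                (pthread_mutex_lock(lock), (lock))

static int read_reg(int raw)
{
        DEMO_GUARD(&demo_lock);

        if (raw < 0)
                return -1;      /* early return still unlocks */
        return raw * 2;
}

int main(void)
{
        printf("%d %d\n", read_reg(21), read_reg(-5));
        return 0;
}

In the real conversions, guard(raw_spinlock_irqsave) additionally saves and restores the interrupt flags, which is why the explicit 'flags' locals disappear from functions such as io_apic_get_version() and mp_irqdomain_activate().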
*/ vector = apicd->prev_vector; - if (!vector) + if (!vector || (apicd->cpu != cpu && apicd->prev_cpu != cpu)) goto unlock; /* diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index a8306089c91b..7db83212effb 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -178,13 +178,16 @@ static int x2apic_prepare_cpu(unsigned int cpu) u32 phys_apicid = apic->cpu_present_to_apicid(cpu); u32 cluster = apic_cluster(phys_apicid); u32 logical_apicid = (cluster << 16) | (1 << (phys_apicid & 0xf)); + int node = cpu_to_node(cpu); x86_cpu_to_logical_apicid[cpu] = logical_apicid; - if (alloc_clustermask(cpu, cluster, cpu_to_node(cpu)) < 0) + if (alloc_clustermask(cpu, cluster, node) < 0) return -ENOMEM; - if (!zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL)) + + if (!zalloc_cpumask_var_node(&per_cpu(ipi_mask, cpu), GFP_KERNEL, node)) return -ENOMEM; + return 0; } @@ -227,21 +230,16 @@ static struct apic apic_x2apic_cluster __ro_after_init = { .probe = x2apic_cluster_probe, .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, - .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = true, .disable_esr = 0, - .check_apicid_used = NULL, .init_apic_ldr = init_x2apic_ldr, - .ioapic_phys_id_map = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .phys_pkg_id = x2apic_phys_pkg_id, .max_apic_id = UINT_MAX, .x2apic_set_max_apicid = true, .get_apic_id = x2apic_get_apic_id, - .set_apic_id = x2apic_set_apic_id, .calc_dest_apicid = x2apic_calc_apicid, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 558a4a8824f4..12d4c35547a6 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -129,34 +129,21 @@ u32 x2apic_get_apic_id(u32 id) return id; } -u32 x2apic_set_apic_id(u32 id) -{ - return id; -} - -u32 x2apic_phys_pkg_id(u32 initial_apicid, int index_msb) -{ - return initial_apicid >> index_msb; -} - static struct apic apic_x2apic_phys __ro_after_init = { .name = "physical x2apic", .probe = x2apic_phys_probe, .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, - .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = false, .disable_esr = 0, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .phys_pkg_id = x2apic_phys_pkg_id, .max_apic_id = UINT_MAX, .x2apic_set_max_apicid = true, .get_apic_id = x2apic_get_apic_id, - .set_apic_id = x2apic_set_apic_id, .calc_dest_apicid = apic_default_calc_apicid, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 1b0d7336a28f..7fef504ca508 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -241,54 +241,20 @@ static void __init uv_tsc_check_sync(void) is_uv(UV3) ? 
sname.s3.field : \ undef) -/* [Copied from arch/x86/kernel/cpu/topology.c:detect_extended_topology()] */ - -#define SMT_LEVEL 0 /* Leaf 0xb SMT level */ -#define INVALID_TYPE 0 /* Leaf 0xb sub-leaf types */ -#define SMT_TYPE 1 -#define CORE_TYPE 2 -#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff) -#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f) - -static void set_x2apic_bits(void) -{ - unsigned int eax, ebx, ecx, edx, sub_index; - unsigned int sid_shift; - - cpuid(0, &eax, &ebx, &ecx, &edx); - if (eax < 0xb) { - pr_info("UV: CPU does not have CPUID.11\n"); - return; - } - - cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx); - if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE)) { - pr_info("UV: CPUID.11 not implemented\n"); - return; - } - - sid_shift = BITS_SHIFT_NEXT_LEVEL(eax); - sub_index = 1; - do { - cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx); - if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) { - sid_shift = BITS_SHIFT_NEXT_LEVEL(eax); - break; - } - sub_index++; - } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE); - - uv_cpuid.apicid_shift = 0; - uv_cpuid.apicid_mask = (~(-1 << sid_shift)); - uv_cpuid.socketid_shift = sid_shift; -} - static void __init early_get_apic_socketid_shift(void) { + unsigned int sid_shift = topology_get_domain_shift(TOPO_PKG_DOMAIN); + if (is_uv2_hub() || is_uv3_hub()) uvh_apicid.v = uv_early_read_mmr(UVH_APICID); - set_x2apic_bits(); + if (sid_shift) { + uv_cpuid.apicid_shift = 0; + uv_cpuid.apicid_mask = (~(-1 << sid_shift)); + uv_cpuid.socketid_shift = sid_shift; + } else { + pr_info("UV: CPU does not have valid CPUID.11\n"); + } pr_info("UV: apicid_shift:%d apicid_mask:0x%x\n", uv_cpuid.apicid_shift, uv_cpuid.apicid_mask); pr_info("UV: socketid_shift:%d pnode_mask:0x%x\n", uv_cpuid.socketid_shift, uv_cpuid.pnode_mask); @@ -779,21 +745,6 @@ static void uv_send_IPI_all(int vector) uv_send_IPI_mask(cpu_online_mask, vector); } -static u32 set_apic_id(u32 id) -{ - return id; -} - -static unsigned int uv_read_apic_id(void) -{ - return x2apic_get_apic_id(apic_read(APIC_ID)); -} - -static u32 uv_phys_pkg_id(u32 initial_apicid, int index_msb) -{ - return uv_read_apic_id() >> index_msb; -} - static int uv_probe(void) { return apic == &apic_x2apic_uv_x; @@ -805,17 +756,14 @@ static struct apic apic_x2apic_uv_x __ro_after_init = { .probe = uv_probe, .acpi_madt_oem_check = uv_acpi_madt_oem_check, - .delivery_mode = APIC_DELIVERY_MODE_FIXED, .dest_mode_logical = false, .disable_esr = 0, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .phys_pkg_id = uv_phys_pkg_id, .max_apic_id = UINT_MAX, .get_apic_id = x2apic_get_apic_id, - .set_apic_id = set_apic_id, .calc_dest_apicid = apic_default_calc_apicid, diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 5934ee5bc087..b37ab1095707 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -420,7 +420,7 @@ static DEFINE_MUTEX(apm_mutex); * This is for buggy BIOS's that refer to (real mode) segment 0x40 * even though they are called in protected mode. 
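The x2apic_uv_x.c hunk above stops re-parsing CPUID leaf 0xb by hand and instead takes the socket shift from topology_get_domain_shift(TOPO_PKG_DOMAIN), then builds apicid_mask as ~(-1 << sid_shift). A tiny standalone model of that shift/mask decomposition of an APIC ID (the shift and APIC ID values are made up for illustration):

#include <stdio.h>

int main(void)
{
        /* Assumed package-domain shift; the kernel gets this from the
         * already-parsed topology instead of re-reading CPUID leaf 0xb. */
        unsigned int sid_shift = 6;
        unsigned int apicid_mask = ~(~0u << sid_shift);
        unsigned int apicid = 0x9b;

        /* Low sid_shift bits index the CPU within the socket, the
         * remaining high bits are the socket ID. */
        printf("socket %u, cpu-in-socket %u\n",
               apicid >> sid_shift, apicid & apicid_mask);
        return 0;
}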
*/ -static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(0x4092, +static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(DESC_DATA32_BIOS, (unsigned long)__va(0x400UL), PAGE_SIZE - 0x400 - 1); static const char driver_version[] = "1.16ac"; /* no spaces */ @@ -1055,35 +1055,6 @@ static int apm_get_power_status(u_short *status, u_short *bat, u_short *life) return APM_SUCCESS; } -#if 0 -static int apm_get_battery_status(u_short which, u_short *status, - u_short *bat, u_short *life, u_short *nbat) -{ - u32 eax; - u32 ebx; - u32 ecx; - u32 edx; - u32 esi; - - if (apm_info.connection_version < 0x0102) { - /* pretend we only have one battery. */ - if (which != 1) - return APM_BAD_DEVICE; - *nbat = 1; - return apm_get_power_status(status, bat, life); - } - - if (apm_bios_call(APM_FUNC_GET_STATUS, (0x8000 | (which)), 0, &eax, - &ebx, &ecx, &edx, &esi)) - return (eax >> 8) & 0xff; - *status = ebx; - *bat = ecx; - *life = edx; - *nbat = esi; - return APM_SUCCESS; -} -#endif - /** * apm_engage_power_management - enable PM on a device * @device: identity of device diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 6913b372ccf7..a98020bf31bb 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -109,7 +109,7 @@ static void __used common(void) OFFSET(TSS_sp2, tss_struct, x86_tss.sp2); OFFSET(X86_top_of_stack, pcpu_hot, top_of_stack); OFFSET(X86_current_task, pcpu_hot, current_task); -#ifdef CONFIG_CALL_DEPTH_TRACKING +#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING OFFSET(X86_call_depth, pcpu_hot, call_depth); #endif #if IS_ENABLED(CONFIG_CRYPTO_ARIA_AESNI_AVX_X86_64) diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index e9ad518a5003..8418a892d195 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -24,6 +24,8 @@ static int __initdata_or_module debug_callthunks; +#define MAX_PATCH_LEN (255-1) + #define prdbg(fmt, args...) \ do { \ if (debug_callthunks) \ @@ -42,8 +44,8 @@ DEFINE_PER_CPU(u64, __x86_call_count); DEFINE_PER_CPU(u64, __x86_ret_count); DEFINE_PER_CPU(u64, __x86_stuffs_count); DEFINE_PER_CPU(u64, __x86_ctxsw_count); -EXPORT_SYMBOL_GPL(__x86_ctxsw_count); -EXPORT_SYMBOL_GPL(__x86_call_count); +EXPORT_PER_CPU_SYMBOL_GPL(__x86_ctxsw_count); +EXPORT_PER_CPU_SYMBOL_GPL(__x86_call_count); #endif extern s32 __call_sites[], __call_sites_end[]; @@ -137,14 +139,15 @@ static bool skip_addr(void *dest) return true; #endif #ifdef CONFIG_KEXEC_CORE +# ifdef CONFIG_X86_64 + if (dest >= (void *)__relocate_kernel_start && + dest < (void *)__relocate_kernel_end) + return true; +# else if (dest >= (void *)relocate_kernel && dest < (void*)relocate_kernel + KEXEC_CONTROL_CODE_MAX_SIZE) return true; -#endif -#ifdef CONFIG_XEN - if (dest >= (void *)hypercall_page && - dest < (void*)hypercall_page + PAGE_SIZE) - return true; +# endif #endif return false; } @@ -179,10 +182,14 @@ static const u8 nops[] = { static void *patch_dest(void *dest, bool direct) { unsigned int tsize = SKL_TMPL_SIZE; + u8 insn_buff[MAX_PATCH_LEN]; u8 *pad = dest - tsize; + memcpy(insn_buff, skl_call_thunk_template, tsize); + apply_relocation(insn_buff, pad, tsize, skl_call_thunk_template, tsize); + /* Already patched? 
*/ - if (!bcmp(pad, skl_call_thunk_template, tsize)) + if (!bcmp(pad, insn_buff, tsize)) return pad; /* Ensure there are nops */ @@ -192,9 +199,9 @@ static void *patch_dest(void *dest, bool direct) } if (direct) - memcpy(pad, skl_call_thunk_template, tsize); + memcpy(pad, insn_buff, tsize); else - text_poke_copy_locked(pad, skl_call_thunk_template, tsize, true); + text_poke_copy_locked(pad, insn_buff, tsize, true); return pad; } @@ -233,14 +240,13 @@ patch_call_sites(s32 *start, s32 *end, const struct core_text *ct) } static __init_or_module void -patch_paravirt_call_sites(struct paravirt_patch_site *start, - struct paravirt_patch_site *end, - const struct core_text *ct) +patch_alt_call_sites(struct alt_instr *start, struct alt_instr *end, + const struct core_text *ct) { - struct paravirt_patch_site *p; + struct alt_instr *a; - for (p = start; p < end; p++) - patch_call(p->instr, ct); + for (a = start; a < end; a++) + patch_call((void *)&a->instr_offset + a->instr_offset, ct); } static __init_or_module void @@ -248,7 +254,7 @@ callthunks_setup(struct callthunk_sites *cs, const struct core_text *ct) { prdbg("Patching call sites %s\n", ct->name); patch_call_sites(cs->call_start, cs->call_end, ct); - patch_paravirt_call_sites(cs->pv_start, cs->pv_end, ct); + patch_alt_call_sites(cs->alt_start, cs->alt_end, ct); prdbg("Patching call sites done%s\n", ct->name); } @@ -257,8 +263,8 @@ void __init callthunks_patch_builtin_calls(void) struct callthunk_sites cs = { .call_start = __call_sites, .call_end = __call_sites_end, - .pv_start = __parainstructions, - .pv_end = __parainstructions_end + .alt_start = __alt_instructions, + .alt_end = __alt_instructions_end }; if (!cpu_feature_enabled(X86_FEATURE_CALL_DEPTH)) @@ -291,20 +297,26 @@ void *callthunks_translate_call_dest(void *dest) static bool is_callthunk(void *addr) { unsigned int tmpl_size = SKL_TMPL_SIZE; - void *tmpl = skl_call_thunk_template; + u8 insn_buff[MAX_PATCH_LEN]; unsigned long dest; + u8 *pad; dest = roundup((unsigned long)addr, CONFIG_FUNCTION_ALIGNMENT); if (!thunks_initialized || skip_addr((void *)dest)) return false; - return !bcmp((void *)(dest - tmpl_size), tmpl, tmpl_size); + pad = (void *)(dest - tmpl_size); + + memcpy(insn_buff, skl_call_thunk_template, tmpl_size); + apply_relocation(insn_buff, pad, tmpl_size, skl_call_thunk_template, tmpl_size); + + return !bcmp(pad, insn_buff, tmpl_size); } -int x86_call_depth_emit_accounting(u8 **pprog, void *func) +int x86_call_depth_emit_accounting(u8 **pprog, void *func, void *ip) { unsigned int tmpl_size = SKL_TMPL_SIZE; - void *tmpl = skl_call_thunk_template; + u8 insn_buff[MAX_PATCH_LEN]; if (!thunks_initialized) return 0; @@ -313,7 +325,10 @@ int x86_call_depth_emit_accounting(u8 **pprog, void *func) if (func && is_callthunk(func)) return 0; - memcpy(*pprog, tmpl, tmpl_size); + memcpy(insn_buff, skl_call_thunk_template, tmpl_size); + apply_relocation(insn_buff, ip, tmpl_size, skl_call_thunk_template, tmpl_size); + + memcpy(*pprog, insn_buff, tmpl_size); *pprog += tmpl_size; return tmpl_size; } diff --git a/arch/x86/kernel/cet.c b/arch/x86/kernel/cet.c index d2c732a34e5d..303bf74d175b 100644 --- a/arch/x86/kernel/cet.c +++ b/arch/x86/kernel/cet.c @@ -81,6 +81,34 @@ static void do_user_cp_fault(struct pt_regs *regs, unsigned long error_code) static __ro_after_init bool ibt_fatal = true; +/* + * By definition, all missing-ENDBRANCH #CPs are a result of WFE && !ENDBR. 
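In the callthunks.c hunks above, patch_dest(), is_callthunk() and x86_call_depth_emit_accounting() now copy skl_call_thunk_template into a local insn_buff and run apply_relocation() on it for the destination address before comparing or writing bytes: a template that contains relative displacements only matches what is in memory once it has been relocated for the address it will occupy. A toy standalone illustration of that point (plain C with a fake "call rel32"; not the kernel's template or its apply_relocation()):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Toy "call rel32" emitter: the encoded displacement depends on where
 * the five bytes are placed. */
static void emit_call(uint8_t *buf, uint64_t buf_addr, uint64_t target)
{
        int32_t rel = (int32_t)(target - (buf_addr + 5));

        buf[0] = 0xe8;                  /* call rel32 */
        memcpy(buf + 1, &rel, sizeof(rel));
}

int main(void)
{
        uint8_t in_memory[5], template[5], relocated[5];
        uint64_t pad_addr = 0x1000, tmpl_addr = 0x9000, target = 0x4000;

        emit_call(in_memory, pad_addr, target);   /* already-patched padding */
        emit_call(template, tmpl_addr, target);   /* template at its own address */
        emit_call(relocated, pad_addr, target);   /* template relocated for pad_addr */

        printf("raw template matches:       %s\n", memcmp(in_memory, template, 5)  ? "no" : "yes");
        printf("relocated template matches: %s\n", memcmp(in_memory, relocated, 5) ? "no" : "yes");
        return 0;
}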
+ * + * For the kernel IBT no ENDBR selftest where #CPs are deliberately triggered, + * the WFE state of the interrupted context needs to be cleared to let execution + * continue. Otherwise when the CPU resumes from the instruction that just + * caused the previous #CP, another missing-ENDBRANCH #CP is raised and the CPU + * enters a dead loop. + * + * This is not a problem with IDT because it doesn't preserve WFE and IRET doesn't + * set WFE. But FRED provides space on the entry stack (in an expanded CS area) + * to save and restore the WFE state, thus the WFE state is no longer clobbered, + * so software must clear it. + */ +static void ibt_clear_fred_wfe(struct pt_regs *regs) +{ + /* + * No need to do any FRED checks. + * + * For IDT event delivery, the high-order 48 bits of CS are pushed + * as 0s into the stack, and later IRET ignores these bits. + * + * For FRED, a test to check if fred_cs.wfe is set would be dropped + * by compilers. + */ + regs->fred_cs.wfe = 0; +} + static void do_kernel_cp_fault(struct pt_regs *regs, unsigned long error_code) { if ((error_code & CP_EC) != CP_ENDBR) { @@ -90,6 +118,7 @@ static void do_kernel_cp_fault(struct pt_regs *regs, unsigned long error_code) if (unlikely(regs->ip == (unsigned long)&ibt_selftest_noendbr)) { regs->ax = 0; + ibt_clear_fred_wfe(regs); return; } @@ -97,6 +126,7 @@ static void do_kernel_cp_fault(struct pt_regs *regs, unsigned long error_code) if (!ibt_fatal) { printk(KERN_DEFAULT CUT_HERE); __warn(__FILE__, __LINE__, (void *)regs->ip, TAINT_WARN, regs, NULL); + ibt_clear_fred_wfe(regs); return; } BUG(); diff --git a/arch/x86/kernel/cfi.c b/arch/x86/kernel/cfi.c index 8674a5c0c031..e6bf78fac146 100644 --- a/arch/x86/kernel/cfi.c +++ b/arch/x86/kernel/cfi.c @@ -4,10 +4,10 @@ * * Copyright (C) 2022 Google LLC */ -#include <asm/cfi.h> +#include <linux/string.h> +#include <linux/cfi.h> #include <asm/insn.h> #include <asm/insn-eval.h> -#include <linux/string.h> /* * Returns the target address and the expected type when regs->ip points diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 93eabf544031..4efdf5c2efc8 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -17,7 +17,8 @@ KMSAN_SANITIZE_common.o := n # As above, instrumenting secondary CPU boot code causes boot hangs. 
KCSAN_SANITIZE_common.o := n -obj-y := cacheinfo.o scattered.o topology.o +obj-y := cacheinfo.o scattered.o +obj-y += topology_common.o topology_ext.o topology_amd.o obj-y += common.o obj-y += rdrand.o obj-y += match.o @@ -25,14 +26,16 @@ obj-y += bugs.o obj-y += aperfmperf.o obj-y += cpuid-deps.o obj-y += umwait.o +obj-y += capflags.o powerflags.o -obj-$(CONFIG_PROC_FS) += proc.o -obj-y += capflags.o powerflags.o +obj-$(CONFIG_X86_LOCAL_APIC) += topology.o -obj-$(CONFIG_IA32_FEAT_CTL) += feat_ctl.o +obj-$(CONFIG_PROC_FS) += proc.o + +obj-$(CONFIG_IA32_FEAT_CTL) += feat_ctl.o ifdef CONFIG_CPU_SUP_INTEL -obj-y += intel.o intel_pconfig.o tsx.o -obj-$(CONFIG_PM) += intel_epb.o +obj-y += intel.o tsx.o +obj-$(CONFIG_PM) += intel_epb.o endif obj-$(CONFIG_CPU_SUP_AMD) += amd.o obj-$(CONFIG_CPU_SUP_HYGON) += hygon.o @@ -56,8 +59,10 @@ obj-$(CONFIG_ACRN_GUEST) += acrn.o obj-$(CONFIG_DEBUG_FS) += debugfs.o +obj-$(CONFIG_X86_BUS_LOCK_DETECT) += bus_lock.o + quiet_cmd_mkcapflags = MKCAP $@ - cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $@ $^ + cmd_mkcapflags = $(CONFIG_SHELL) $(src)/mkcapflags.sh $@ $^ cpufeature = $(src)/../../include/asm/cpufeatures.h vmxfeature = $(src)/../../include/asm/vmxfeatures.h diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c index bfeb18fad63f..2c5b51aad91a 100644 --- a/arch/x86/kernel/cpu/acrn.c +++ b/arch/x86/kernel/cpu/acrn.c @@ -26,8 +26,8 @@ static u32 __init acrn_detect(void) static void __init acrn_init_platform(void) { - /* Setup the IDT for ACRN hypervisor callback */ - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_acrn_hv_callback); + /* Install system interrupt handler for ACRN hypervisor callback */ + sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_acrn_hv_callback); x86_platform.calibrate_tsc = acrn_get_tsc_khz; x86_platform.calibrate_cpu = acrn_get_tsc_khz; diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index f322ebd053a9..54194f5995de 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -13,6 +13,7 @@ #include <asm/apic.h> #include <asm/cacheinfo.h> #include <asm/cpu.h> +#include <asm/cpu_device_id.h> #include <asm/spec-ctrl.h> #include <asm/smp.h> #include <asm/numa.h> @@ -20,6 +21,7 @@ #include <asm/delay.h> #include <asm/debugreg.h> #include <asm/resctrl.h> +#include <asm/sev.h> #ifdef CONFIG_X86_64 # include <asm/mmconfig.h> @@ -27,94 +29,6 @@ #include "cpu.h" -/* - * nodes_per_socket: Stores the number of nodes per socket. - * Refer to Fam15h Models 00-0fh BKDG - CPUID Fn8000_001E_ECX - * Node Identifiers[10:8] - */ -static u32 nodes_per_socket = 1; - -/* - * AMD errata checking - * - * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or - * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that - * have an OSVW id assigned, which it takes as first argument. Both take a - * variable number of family-specific model-stepping ranges created by - * AMD_MODEL_RANGE(). - * - * Example: - * - * const int amd_erratum_319[] = - * AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2), - * AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0), - * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0)); - */ - -#define AMD_LEGACY_ERRATUM(...) { -1, __VA_ARGS__, 0 } -#define AMD_OSVW_ERRATUM(osvw_id, ...) 
{ osvw_id, __VA_ARGS__, 0 } -#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \ - ((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end)) -#define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff) -#define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff) -#define AMD_MODEL_RANGE_END(range) ((range) & 0xfff) - -static const int amd_erratum_400[] = - AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf), - AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf)); - -static const int amd_erratum_383[] = - AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf)); - -/* #1054: Instructions Retired Performance Counter May Be Inaccurate */ -static const int amd_erratum_1054[] = - AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0, 0, 0x2f, 0xf)); - -static const int amd_zenbleed[] = - AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0x30, 0x0, 0x4f, 0xf), - AMD_MODEL_RANGE(0x17, 0x60, 0x0, 0x7f, 0xf), - AMD_MODEL_RANGE(0x17, 0x90, 0x0, 0x91, 0xf), - AMD_MODEL_RANGE(0x17, 0xa0, 0x0, 0xaf, 0xf)); - -static const int amd_div0[] = - AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0x00, 0x0, 0x2f, 0xf), - AMD_MODEL_RANGE(0x17, 0x50, 0x0, 0x5f, 0xf)); - -static const int amd_erratum_1485[] = - AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x19, 0x10, 0x0, 0x1f, 0xf), - AMD_MODEL_RANGE(0x19, 0x60, 0x0, 0xaf, 0xf)); - -static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum) -{ - int osvw_id = *erratum++; - u32 range; - u32 ms; - - if (osvw_id >= 0 && osvw_id < 65536 && - cpu_has(cpu, X86_FEATURE_OSVW)) { - u64 osvw_len; - - rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len); - if (osvw_id < osvw_len) { - u64 osvw_bits; - - rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6), - osvw_bits); - return osvw_bits & (1ULL << (osvw_id & 0x3f)); - } - } - - /* OSVW unavailable or ID unknown, match family-model-stepping range */ - ms = (cpu->x86_model << 4) | cpu->x86_stepping; - while ((range = *erratum++)) - if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) && - (ms >= AMD_MODEL_RANGE_START(range)) && - (ms <= AMD_MODEL_RANGE_END(range))) - return true; - - return false; -} - static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) { u32 gprs[8] = { 0 }; @@ -381,97 +295,6 @@ static int nearby_node(int apicid) } #endif -/* - * Fix up topo::core_id for pre-F17h systems to be in the - * [0 .. cores_per_node - 1] range. Not really needed but - * kept so as not to break existing setups. - */ -static void legacy_fixup_core_id(struct cpuinfo_x86 *c) -{ - u32 cus_per_node; - - if (c->x86 >= 0x17) - return; - - cus_per_node = c->x86_max_cores / nodes_per_socket; - c->topo.core_id %= cus_per_node; -} - -/* - * Fixup core topology information for - * (1) AMD multi-node processors - * Assumption: Number of cores in each internal node is the same. - * (2) AMD processors supporting compute units - */ -static void amd_get_topology(struct cpuinfo_x86 *c) -{ - /* get information required for multi-node processors */ - if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { - int err; - u32 eax, ebx, ecx, edx; - - cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); - - c->topo.die_id = ecx & 0xff; - - if (c->x86 == 0x15) - c->topo.cu_id = ebx & 0xff; - - if (c->x86 >= 0x17) { - c->topo.core_id = ebx & 0xff; - - if (smp_num_siblings > 1) - c->x86_max_cores /= smp_num_siblings; - } - - /* - * In case leaf B is available, use it to derive - * topology information. 
- */ - err = detect_extended_topology(c); - if (!err) - c->x86_coreid_bits = get_count_order(c->x86_max_cores); - - cacheinfo_amd_init_llc_id(c); - - } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { - u64 value; - - rdmsrl(MSR_FAM10H_NODE_ID, value); - c->topo.die_id = value & 7; - c->topo.llc_id = c->topo.die_id; - } else - return; - - if (nodes_per_socket > 1) { - set_cpu_cap(c, X86_FEATURE_AMD_DCM); - legacy_fixup_core_id(c); - } -} - -/* - * On a AMD dual core setup the lower bits of the APIC id distinguish the cores. - * Assumes number of cores is a power of two. - */ -static void amd_detect_cmp(struct cpuinfo_x86 *c) -{ - unsigned bits; - - bits = c->x86_coreid_bits; - /* Low order bits define the core id (index of core in socket) */ - c->topo.core_id = c->topo.initial_apicid & ((1 << bits)-1); - /* Convert the initial APIC ID into the socket ID */ - c->topo.pkg_id = c->topo.initial_apicid >> bits; - /* use socket ID also for last level cache */ - c->topo.llc_id = c->topo.die_id = c->topo.pkg_id; -} - -u32 amd_get_nodes_per_socket(void) -{ - return nodes_per_socket; -} -EXPORT_SYMBOL_GPL(amd_get_nodes_per_socket); - static void srat_detect_node(struct cpuinfo_x86 *c) { #ifdef CONFIG_NUMA @@ -523,29 +346,30 @@ static void srat_detect_node(struct cpuinfo_x86 *c) #endif } -static void early_init_amd_mc(struct cpuinfo_x86 *c) +static void bsp_determine_snp(struct cpuinfo_x86 *c) { -#ifdef CONFIG_SMP - unsigned bits, ecx; - - /* Multi core CPU? */ - if (c->extended_cpuid_level < 0x80000008) - return; - - ecx = cpuid_ecx(0x80000008); - - c->x86_max_cores = (ecx & 0xff) + 1; +#ifdef CONFIG_ARCH_HAS_CC_PLATFORM + cc_vendor = CC_VENDOR_AMD; - /* CPU telling us the core id bits shift? */ - bits = (ecx >> 12) & 0xF; - - /* Otherwise recompute */ - if (bits == 0) { - while ((1 << bits) < c->x86_max_cores) - bits++; + if (cpu_has(c, X86_FEATURE_SEV_SNP)) { + /* + * RMP table entry format is not architectural and is defined by the + * per-processor PPR. Restrict SNP support on the known CPU models + * for which the RMP table entry format is currently defined or for + * processors which support the architecturally defined RMPREAD + * instruction. + */ + if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && + (cpu_feature_enabled(X86_FEATURE_ZEN3) || + cpu_feature_enabled(X86_FEATURE_ZEN4) || + cpu_feature_enabled(X86_FEATURE_RMPREAD)) && + snp_probe_rmptable_info()) { + cc_platform_set(CC_ATTR_HOST_SEV_SNP); + } else { + setup_clear_cpu_cap(X86_FEATURE_SEV_SNP); + cc_platform_clear(CC_ATTR_HOST_SEV_SNP); + } } - - c->x86_coreid_bits = bits; #endif } @@ -581,18 +405,6 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_MWAITX)) use_mwaitx_delay(); - if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { - u32 ecx; - - ecx = cpuid_ecx(0x8000001e); - __max_die_per_package = nodes_per_socket = ((ecx >> 8) & 7) + 1; - } else if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) { - u64 value; - - rdmsrl(MSR_FAM10H_NODE_ID, value); - __max_die_per_package = nodes_per_socket = ((value >> 3) & 7) + 1; - } - if (!boot_cpu_has(X86_FEATURE_AMD_SSBD) && !boot_cpu_has(X86_FEATURE_VIRT_SSBD) && c->x86 >= 0x15 && c->x86 <= 0x17) { @@ -616,6 +428,62 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) } resctrl_cpu_detect(c); + + /* Figure out Zen generations: */ + switch (c->x86) { + case 0x17: + switch (c->x86_model) { + case 0x00 ... 0x2f: + case 0x50 ... 0x5f: + setup_force_cpu_cap(X86_FEATURE_ZEN1); + break; + case 0x30 ... 0x4f: + case 0x60 ... 0x7f: + case 0x90 ... 0x91: + case 0xa0 ... 
0xaf: + setup_force_cpu_cap(X86_FEATURE_ZEN2); + break; + default: + goto warn; + } + break; + + case 0x19: + switch (c->x86_model) { + case 0x00 ... 0x0f: + case 0x20 ... 0x5f: + setup_force_cpu_cap(X86_FEATURE_ZEN3); + break; + case 0x10 ... 0x1f: + case 0x60 ... 0xaf: + setup_force_cpu_cap(X86_FEATURE_ZEN4); + break; + default: + goto warn; + } + break; + + case 0x1a: + switch (c->x86_model) { + case 0x00 ... 0x2f: + case 0x40 ... 0x4f: + case 0x60 ... 0x7f: + setup_force_cpu_cap(X86_FEATURE_ZEN5); + break; + default: + goto warn; + } + break; + + default: + break; + } + + bsp_determine_snp(c); + return; + +warn: + WARN_ONCE(1, "Family 0x%x, model: 0x%x??\n", c->x86, c->x86_model); } static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) @@ -630,8 +498,8 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) * SME feature (set in scattered.c). * If the kernel has not enabled SME via any means then * don't advertise the SME feature. - * For SEV: If BIOS has not enabled SEV then don't advertise the - * SEV and SEV_ES feature (set in scattered.c). + * For SEV: If BIOS has not enabled SEV then don't advertise SEV and + * any additional functionality based on it. * * In all cases, since support for SME and SEV requires long mode, * don't advertise the feature under CONFIG_X86_32. @@ -666,16 +534,14 @@ clear_all: clear_sev: setup_clear_cpu_cap(X86_FEATURE_SEV); setup_clear_cpu_cap(X86_FEATURE_SEV_ES); + setup_clear_cpu_cap(X86_FEATURE_SEV_SNP); } } static void early_init_amd(struct cpuinfo_x86 *c) { - u64 value; u32 dummy; - early_init_amd_mc(c); - if (c->x86 >= 0xf) set_cpu_cap(c, X86_FEATURE_K8); @@ -739,34 +605,8 @@ static void early_init_amd(struct cpuinfo_x86 *c) if (c->x86 == 0x16 && c->x86_model <= 0xf) msr_set_bit(MSR_AMD64_LS_CFG, 15); - /* - * Check whether the machine is affected by erratum 400. This is - * used to select the proper idle routine and to enable the check - * whether the machine is affected in arch_post_acpi_init(), which - * sets the X86_BUG_AMD_APIC_C1E bug depending on the MSR check. - */ - if (cpu_has_amd_erratum(c, amd_erratum_400)) - set_cpu_bug(c, X86_BUG_AMD_E400); - early_detect_mem_encrypt(c); - /* Re-enable TopologyExtensions if switched off by BIOS */ - if (c->x86 == 0x15 && - (c->x86_model >= 0x10 && c->x86_model <= 0x6f) && - !cpu_has(c, X86_FEATURE_TOPOEXT)) { - - if (msr_set_bit(0xc0011005, 54) > 0) { - rdmsrl(0xc0011005, value); - if (value & BIT_64(54)) { - set_cpu_cap(c, X86_FEATURE_TOPOEXT); - pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n"); - } - } - } - - if (cpu_has(c, X86_FEATURE_TOPOEXT)) - smp_num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1; - if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && !cpu_has(c, X86_FEATURE_IBPB_BRTYPE)) { if (c->x86 == 0x17 && boot_cpu_has(X86_FEATURE_AMD_IBPB)) setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE); @@ -814,6 +654,16 @@ static void init_amd_k8(struct cpuinfo_x86 *c) msr_set_bit(MSR_K7_HWCR, 6); #endif set_cpu_bug(c, X86_BUG_SWAPGS_FENCE); + + /* + * Check models and steppings affected by erratum 400. This is + * used to select the proper idle routine and to enable the + * check whether the machine is affected in arch_post_acpi_subsys_init() + * which sets the X86_BUG_AMD_APIC_C1E bug depending on the MSR check. 
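The new switch above classifies parts into Zen generations by family/model and forces the matching synthetic X86_FEATURE_ZEN1..ZEN5 bit, replacing the per-erratum model-range tables that the earlier hunks delete. A standalone model of that classification using the same ranges as the hunk (illustrative only; the kernel sets feature bits and warns on unknown models rather than returning a number):

#include <stdio.h>

/* Map an AMD family/model pair to a Zen generation (1-5), 0 if unknown,
 * mirroring the ranges in the switch above. */
static int zen_generation(unsigned int family, unsigned int model)
{
        switch (family) {
        case 0x17:
                if (model <= 0x2f || (model >= 0x50 && model <= 0x5f))
                        return 1;
                if ((model >= 0x30 && model <= 0x4f) ||
                    (model >= 0x60 && model <= 0x7f) ||
                    (model >= 0x90 && model <= 0x91) ||
                    (model >= 0xa0 && model <= 0xaf))
                        return 2;
                return 0;
        case 0x19:
                if (model <= 0x0f || (model >= 0x20 && model <= 0x5f))
                        return 3;
                if ((model >= 0x10 && model <= 0x1f) ||
                    (model >= 0x60 && model <= 0xaf))
                        return 4;
                return 0;
        case 0x1a:
                if (model <= 0x2f || (model >= 0x40 && model <= 0x4f) ||
                    (model >= 0x60 && model <= 0x7f))
                        return 5;
                return 0;
        default:
                return 0;
        }
}

int main(void)
{
        printf("0x17/0x71 -> Zen%d, 0x19/0x61 -> Zen%d, 0x1a/0x44 -> Zen%d\n",
               zen_generation(0x17, 0x71), zen_generation(0x19, 0x61),
               zen_generation(0x1a, 0x44));
        return 0;
}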
+ */ + if (c->x86_model > 0x41 || + (c->x86_model == 0x41 && c->x86_stepping >= 0x2)) + setup_force_cpu_bug(X86_BUG_AMD_E400); } static void init_amd_gh(struct cpuinfo_x86 *c) @@ -847,8 +697,17 @@ static void init_amd_gh(struct cpuinfo_x86 *c) */ msr_clear_bit(MSR_AMD64_BU_CFG2, 24); - if (cpu_has_amd_erratum(c, amd_erratum_383)) - set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH); + set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH); + + /* + * Check models and steppings affected by erratum 400. This is + * used to select the proper idle routine and to enable the + * check whether the machine is affected in arch_post_acpi_subsys_init() + * which sets the X86_BUG_AMD_APIC_C1E bug depending on the MSR check. + */ + if (c->x86_model > 0x2 || + (c->x86_model == 0x2 && c->x86_stepping >= 0x1)) + setup_force_cpu_bug(X86_BUG_AMD_E400); } static void init_amd_ln(struct cpuinfo_x86 *c) @@ -941,9 +800,33 @@ static void init_amd_bd(struct cpuinfo_x86 *c) clear_rdrand_cpuid_bit(c); } +static const struct x86_cpu_id erratum_1386_microcode[] = { + X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, 0x17, 0x01), 0x2, 0x2, 0x0800126e), + X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, 0x17, 0x31), 0x0, 0x0, 0x08301052), +}; + +static void fix_erratum_1386(struct cpuinfo_x86 *c) +{ + /* + * Work around Erratum 1386. The XSAVES instruction malfunctions in + * certain circumstances on Zen1/2 uarch, and not all parts have had + * updated microcode at the time of writing (March 2023). + * + * Affected parts all have no supervisor XSAVE states, meaning that + * the XSAVEC instruction (which works fine) is equivalent. + * + * Clear the feature flag only on microcode revisions which + * don't have the fix. + */ + if (x86_match_min_microcode_rev(erratum_1386_microcode)) + return; + + clear_cpu_cap(c, X86_FEATURE_XSAVES); +} + void init_spectral_chicken(struct cpuinfo_x86 *c) { -#ifdef CONFIG_CPU_UNRET_ENTRY +#ifdef CONFIG_MITIGATION_UNRET_ENTRY u64 value; /* @@ -951,34 +834,27 @@ void init_spectral_chicken(struct cpuinfo_x86 *c) * * This suppresses speculation from the middle of a basic block, i.e. it * suppresses non-branch predictions. - * - * We use STIBP as a heuristic to filter out Zen2 from the rest of F17H */ - if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && cpu_has(c, X86_FEATURE_AMD_STIBP)) { + if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) { if (!rdmsrl_safe(MSR_ZEN2_SPECTRAL_CHICKEN, &value)) { value |= MSR_ZEN2_SPECTRAL_CHICKEN_BIT; wrmsrl_safe(MSR_ZEN2_SPECTRAL_CHICKEN, value); } } #endif - /* - * Work around Erratum 1386. The XSAVES instruction malfunctions in - * certain circumstances on Zen1/2 uarch, and not all parts have had - * updated microcode at the time of writing (March 2023). - * - * Affected parts all have no supervisor XSAVE states, meaning that - * the XSAVEC instruction (which works fine) is equivalent. - */ - clear_cpu_cap(c, X86_FEATURE_XSAVES); } -static void init_amd_zn(struct cpuinfo_x86 *c) +static void init_amd_zen_common(void) { - set_cpu_cap(c, X86_FEATURE_ZEN); - + setup_force_cpu_cap(X86_FEATURE_ZEN); #ifdef CONFIG_NUMA node_reclaim_distance = 32; #endif +} + +static void init_amd_zen1(struct cpuinfo_x86 *c) +{ + fix_erratum_1386(c); /* Fix up CPUID bits, but only if not virtualised. */ if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) { @@ -986,15 +862,10 @@ static void init_amd_zn(struct cpuinfo_x86 *c) /* Erratum 1076: CPB feature bit not being set in CPUID. 
*/ if (!cpu_has(c, X86_FEATURE_CPB)) set_cpu_cap(c, X86_FEATURE_CPB); - - /* - * Zen3 (Fam19 model < 0x10) parts are not susceptible to - * Branch Type Confusion, but predate the allocation of the - * BTC_NO bit. - */ - if (c->x86 == 0x19 && !cpu_has(c, X86_FEATURE_BTC_NO)) - set_cpu_cap(c, X86_FEATURE_BTC_NO); } + + pr_notice_once("AMD Zen1 DIV0 bug detected. Disable SMT for full protection.\n"); + setup_force_cpu_bug(X86_BUG_DIV0); } static bool cpu_has_zenbleed_microcode(void) @@ -1002,11 +873,11 @@ static bool cpu_has_zenbleed_microcode(void) u32 good_rev = 0; switch (boot_cpu_data.x86_model) { - case 0x30 ... 0x3f: good_rev = 0x0830107a; break; - case 0x60 ... 0x67: good_rev = 0x0860010b; break; - case 0x68 ... 0x6f: good_rev = 0x08608105; break; - case 0x70 ... 0x7f: good_rev = 0x08701032; break; - case 0xa0 ... 0xaf: good_rev = 0x08a00008; break; + case 0x30 ... 0x3f: good_rev = 0x0830107b; break; + case 0x60 ... 0x67: good_rev = 0x0860010c; break; + case 0x68 ... 0x6f: good_rev = 0x08608107; break; + case 0x70 ... 0x7f: good_rev = 0x08701033; break; + case 0xa0 ... 0xaf: good_rev = 0x08a00009; break; default: return false; @@ -1018,11 +889,8 @@ static bool cpu_has_zenbleed_microcode(void) return true; } -static void zenbleed_check(struct cpuinfo_x86 *c) +static void zen2_zenbleed_check(struct cpuinfo_x86 *c) { - if (!cpu_has_amd_erratum(c, amd_zenbleed)) - return; - if (cpu_has(c, X86_FEATURE_HYPERVISOR)) return; @@ -1037,6 +905,47 @@ static void zenbleed_check(struct cpuinfo_x86 *c) } } +static void init_amd_zen2(struct cpuinfo_x86 *c) +{ + init_spectral_chicken(c); + fix_erratum_1386(c); + zen2_zenbleed_check(c); +} + +static void init_amd_zen3(struct cpuinfo_x86 *c) +{ + if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) { + /* + * Zen3 (Fam19 model < 0x10) parts are not susceptible to + * Branch Type Confusion, but predate the allocation of the + * BTC_NO bit. + */ + if (!cpu_has(c, X86_FEATURE_BTC_NO)) + set_cpu_cap(c, X86_FEATURE_BTC_NO); + } +} + +static void init_amd_zen4(struct cpuinfo_x86 *c) +{ + if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) + msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT); + + /* + * These Zen4 SoCs advertise support for virtualized VMLOAD/VMSAVE + * in some BIOS versions but they can lead to random host reboots. + */ + switch (c->x86_model) { + case 0x18 ... 0x1f: + case 0x60 ... 0x7f: + clear_cpu_cap(c, X86_FEATURE_V_VMSAVE_VMLOAD); + break; + } +} + +static void init_amd_zen5(struct cpuinfo_x86 *c) +{ +} + static void init_amd(struct cpuinfo_x86 *c) { u64 vm_cr; @@ -1056,9 +965,6 @@ static void init_amd(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_FSRM)) set_cpu_cap(c, X86_FEATURE_FSRS); - /* get apicid instead of initial apic id from cpuid */ - c->topo.apicid = read_apic_id(); - /* K6s reports MCEs but don't actually have all the MSRs */ if (c->x86 < 6) clear_cpu_cap(c, X86_FEATURE_MCE); @@ -1072,12 +978,27 @@ static void init_amd(struct cpuinfo_x86 *c) case 0x12: init_amd_ln(c); break; case 0x15: init_amd_bd(c); break; case 0x16: init_amd_jg(c); break; - case 0x17: init_spectral_chicken(c); - fallthrough; - case 0x19: init_amd_zn(c); break; } /* + * Save up on some future enablement work and do common Zen + * settings. 
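Several of the reworked erratum checks above boil down to "is the running microcode at least the known-good revision for this model range": fix_erratum_1386() does it through the erratum_1386_microcode table and x86_match_min_microcode_rev(), and zen2_zenbleed_check() through cpu_has_zenbleed_microcode(). A standalone model of that gate, reusing the updated Zen2 good-revision table from the hunk (the struct and function names here are illustrative, not kernel APIs):

#include <stdio.h>

struct rev_range {
        unsigned int min_model, max_model;
        unsigned int good_rev;
};

/* Minimum fixed microcode revision per Zen2 model range, as updated above. */
static const struct rev_range zen2_good_revs[] = {
        { 0x30, 0x3f, 0x0830107b },
        { 0x60, 0x67, 0x0860010c },
        { 0x68, 0x6f, 0x08608107 },
        { 0x70, 0x7f, 0x08701033 },
        { 0xa0, 0xaf, 0x08a00009 },
};

static int has_fixed_microcode(unsigned int model, unsigned int rev)
{
        for (unsigned int i = 0; i < sizeof(zen2_good_revs) / sizeof(zen2_good_revs[0]); i++) {
                if (model >= zen2_good_revs[i].min_model &&
                    model <= zen2_good_revs[i].max_model)
                        return rev >= zen2_good_revs[i].good_rev;
        }
        return 0;       /* unknown model: treat as not fixed, like the kernel code */
}

int main(void)
{
        printf("model 0x71, rev 0x08701030: %d\n", has_fixed_microcode(0x71, 0x08701030));
        printf("model 0x71, rev 0x08701033: %d\n", has_fixed_microcode(0x71, 0x08701033));
        return 0;
}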
+ */ + if (c->x86 >= 0x17) + init_amd_zen_common(); + + if (boot_cpu_has(X86_FEATURE_ZEN1)) + init_amd_zen1(c); + else if (boot_cpu_has(X86_FEATURE_ZEN2)) + init_amd_zen2(c); + else if (boot_cpu_has(X86_FEATURE_ZEN3)) + init_amd_zen3(c); + else if (boot_cpu_has(X86_FEATURE_ZEN4)) + init_amd_zen4(c); + else if (boot_cpu_has(X86_FEATURE_ZEN5)) + init_amd_zen5(c); + + /* * Enable workaround for FXSAVE leak on CPUs * without a XSaveErPtr feature */ @@ -1086,8 +1007,6 @@ static void init_amd(struct cpuinfo_x86 *c) cpu_detect_cache_sizes(c); - amd_detect_cmp(c); - amd_get_topology(c); srat_detect_node(c); init_amd_cacheinfo(c); @@ -1136,7 +1055,7 @@ static void init_amd(struct cpuinfo_x86 *c) * Counter May Be Inaccurate". */ if (cpu_has(c, X86_FEATURE_IRPERF) && - !cpu_has_amd_erratum(c, amd_erratum_1054)) + (boot_cpu_has(X86_FEATURE_ZEN1) && c->x86_model > 0x2f)) msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT); check_null_seg_clears_base(c); @@ -1150,18 +1069,10 @@ static void init_amd(struct cpuinfo_x86 *c) */ if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && cpu_has(c, X86_FEATURE_AUTOIBRS)) - WARN_ON_ONCE(msr_set_bit(MSR_EFER, _EFER_AUTOIBRS)); - - zenbleed_check(c); + WARN_ON_ONCE(msr_set_bit(MSR_EFER, _EFER_AUTOIBRS) < 0); - if (cpu_has_amd_erratum(c, amd_div0)) { - pr_notice_once("AMD Zen1 DIV0 bug detected. Disable SMT for full protection.\n"); - setup_force_cpu_bug(X86_BUG_DIV0); - } - - if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && - cpu_has_amd_erratum(c, amd_erratum_1485)) - msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT); + /* AMD CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */ + clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE); } #ifdef CONFIG_X86_32 @@ -1295,27 +1206,11 @@ unsigned long amd_get_dr_addr_mask(unsigned int dr) } EXPORT_SYMBOL_GPL(amd_get_dr_addr_mask); -u32 amd_get_highest_perf(void) -{ - struct cpuinfo_x86 *c = &boot_cpu_data; - - if (c->x86 == 0x17 && ((c->x86_model >= 0x30 && c->x86_model < 0x40) || - (c->x86_model >= 0x70 && c->x86_model < 0x80))) - return 166; - - if (c->x86 == 0x19 && ((c->x86_model >= 0x20 && c->x86_model < 0x30) || - (c->x86_model >= 0x40 && c->x86_model < 0x70))) - return 166; - - return 255; -} -EXPORT_SYMBOL_GPL(amd_get_highest_perf); - static void zenbleed_check_cpu(void *unused) { struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); - zenbleed_check(c); + zen2_zenbleed_check(c); } void amd_check_microcode(void) @@ -1323,16 +1218,6 @@ void amd_check_microcode(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) return; - on_each_cpu(zenbleed_check_cpu, NULL, 1); -} - -/* - * Issue a DIV 0/1 insn to clear any division data from previous DIV - * operations. 
- */ -void noinstr amd_clear_divider(void) -{ - asm volatile(ALTERNATIVE("", "div %2\n\t", X86_BUG_DIV0) - :: "a" (0), "d" (0), "r" (1)); + if (cpu_feature_enabled(X86_FEATURE_ZEN2)) + on_each_cpu(zenbleed_check_cpu, NULL, 1); } -EXPORT_SYMBOL_GPL(amd_clear_divider); diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index fdbb5f07448f..f642de2ebdac 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -124,25 +124,24 @@ static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) return true; } -#define X86_MATCH(model) \ - X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, \ - INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL) +#define X86_MATCH(vfm) \ + X86_MATCH_VFM_FEATURE(vfm, X86_FEATURE_APERFMPERF, NULL) static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = { - X86_MATCH(XEON_PHI_KNL), - X86_MATCH(XEON_PHI_KNM), + X86_MATCH(INTEL_XEON_PHI_KNL), + X86_MATCH(INTEL_XEON_PHI_KNM), {} }; static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = { - X86_MATCH(SKYLAKE_X), + X86_MATCH(INTEL_SKYLAKE_X), {} }; static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = { - X86_MATCH(ATOM_GOLDMONT), - X86_MATCH(ATOM_GOLDMONT_D), - X86_MATCH(ATOM_GOLDMONT_PLUS), + X86_MATCH(INTEL_ATOM_GOLDMONT), + X86_MATCH(INTEL_ATOM_GOLDMONT_D), + X86_MATCH(INTEL_ATOM_GOLDMONT_PLUS), {} }; @@ -307,7 +306,7 @@ static void freq_invariance_enable(void) WARN_ON_ONCE(1); return; } - static_branch_enable(&arch_scale_freq_key); + static_branch_enable_cpuslocked(&arch_scale_freq_key); register_freq_invariance_syscore_ops(); pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio); } @@ -324,8 +323,10 @@ static void __init bp_init_freq_invariance(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return; - if (intel_set_max_freq_ratio()) + if (intel_set_max_freq_ratio()) { + guard(cpus_read_lock)(); freq_invariance_enable(); + } } static void disable_freq_invariance_workfn(struct work_struct *work) @@ -346,10 +347,91 @@ static DECLARE_WORK(disable_freq_invariance_work, disable_freq_invariance_workfn); DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; +EXPORT_PER_CPU_SYMBOL_GPL(arch_freq_scale); + +static DEFINE_STATIC_KEY_FALSE(arch_hybrid_cap_scale_key); + +struct arch_hybrid_cpu_scale { + unsigned long capacity; + unsigned long freq_ratio; +}; + +static struct arch_hybrid_cpu_scale __percpu *arch_cpu_scale; + +/** + * arch_enable_hybrid_capacity_scale() - Enable hybrid CPU capacity scaling + * + * Allocate memory for per-CPU data used by hybrid CPU capacity scaling, + * initialize it and set the static key controlling its code paths. + * + * Must be called before arch_set_cpu_capacity(). + */ +bool arch_enable_hybrid_capacity_scale(void) +{ + int cpu; + + if (static_branch_unlikely(&arch_hybrid_cap_scale_key)) { + WARN_ONCE(1, "Hybrid CPU capacity scaling already enabled"); + return true; + } + + arch_cpu_scale = alloc_percpu(struct arch_hybrid_cpu_scale); + if (!arch_cpu_scale) + return false; + + for_each_possible_cpu(cpu) { + per_cpu_ptr(arch_cpu_scale, cpu)->capacity = SCHED_CAPACITY_SCALE; + per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio = arch_max_freq_ratio; + } + + static_branch_enable(&arch_hybrid_cap_scale_key); + + pr_info("Hybrid CPU capacity scaling enabled\n"); + + return true; +} + +/** + * arch_set_cpu_capacity() - Set scale-invariance parameters for a CPU + * @cpu: Target CPU. 
+ * @cap: Capacity of @cpu at its maximum frequency, relative to @max_cap. + * @max_cap: System-wide maximum CPU capacity. + * @cap_freq: Frequency of @cpu corresponding to @cap. + * @base_freq: Frequency of @cpu at which MPERF counts. + * + * The units in which @cap and @max_cap are expressed do not matter, so long + * as they are consistent, because the former is effectively divided by the + * latter. Analogously for @cap_freq and @base_freq. + * + * After calling this function for all CPUs, call arch_rebuild_sched_domains() + * to let the scheduler know that capacity-aware scheduling can be used going + * forward. + */ +void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap, + unsigned long cap_freq, unsigned long base_freq) +{ + if (static_branch_likely(&arch_hybrid_cap_scale_key)) { + WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity, + div_u64(cap << SCHED_CAPACITY_SHIFT, max_cap)); + WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio, + div_u64(cap_freq << SCHED_CAPACITY_SHIFT, base_freq)); + } else { + WARN_ONCE(1, "Hybrid CPU capacity scaling not enabled"); + } +} + +unsigned long arch_scale_cpu_capacity(int cpu) +{ + if (static_branch_unlikely(&arch_hybrid_cap_scale_key)) + return READ_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity); + + return SCHED_CAPACITY_SCALE; +} +EXPORT_SYMBOL_GPL(arch_scale_cpu_capacity); static void scale_freq_tick(u64 acnt, u64 mcnt) { - u64 freq_scale; + u64 freq_scale, freq_ratio; if (!arch_scale_freq_invariant()) return; @@ -357,7 +439,12 @@ static void scale_freq_tick(u64 acnt, u64 mcnt) if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt)) goto error; - if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt) + if (static_branch_unlikely(&arch_hybrid_cap_scale_key)) + freq_ratio = READ_ONCE(this_cpu_ptr(arch_cpu_scale)->freq_ratio); + else + freq_ratio = arch_max_freq_ratio; + + if (check_mul_overflow(mcnt, freq_ratio, &mcnt) || !mcnt) goto error; freq_scale = div64_u64(acnt, mcnt); diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index bb0ab8466b91..a5d0998d7604 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -26,7 +26,7 @@ #include <asm/msr.h> #include <asm/vmx.h> #include <asm/paravirt.h> -#include <asm/intel-family.h> +#include <asm/cpu_device_id.h> #include <asm/e820/api.h> #include <asm/hypervisor.h> #include <asm/tlbflush.h> @@ -56,11 +56,13 @@ EXPORT_SYMBOL_GPL(x86_spec_ctrl_base); /* The current value of the SPEC_CTRL MSR with task-specific bits set */ DEFINE_PER_CPU(u64, x86_spec_ctrl_current); -EXPORT_SYMBOL_GPL(x86_spec_ctrl_current); +EXPORT_PER_CPU_SYMBOL_GPL(x86_spec_ctrl_current); u64 x86_pred_cmd __ro_after_init = PRED_CMD_IBPB; EXPORT_SYMBOL_GPL(x86_pred_cmd); +static u64 __ro_after_init x86_arch_cap_msr; + static DEFINE_MUTEX(spec_ctrl_mutex); void (*x86_return_thunk)(void) __ro_after_init = __x86_return_thunk; @@ -111,9 +113,6 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); /* Control unconditional IBPB in switch_mm() */ DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb); -/* Control MDS CPU buffer clear before returning to user space */ -DEFINE_STATIC_KEY_FALSE(mds_user_clear); -EXPORT_SYMBOL_GPL(mds_user_clear); /* Control MDS CPU buffer clear before idling (halt, mwait) */ DEFINE_STATIC_KEY_FALSE(mds_idle_clear); EXPORT_SYMBOL_GPL(mds_idle_clear); @@ -147,6 +146,8 @@ void __init cpu_select_mitigations(void) x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK; } + x86_arch_cap_msr = x86_read_arch_cap_msr(); + /* Select the proper CPU 
mitigations before patching alternatives: */ spectre_v1_select_mitigation(); spectre_v2_select_mitigation(); @@ -232,7 +233,8 @@ static void x86_amd_ssb_disable(void) #define pr_fmt(fmt) "MDS: " fmt /* Default mitigation for MDS-affected CPUs */ -static enum mds_mitigations mds_mitigation __ro_after_init = MDS_MITIGATION_FULL; +static enum mds_mitigations mds_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_MDS) ? MDS_MITIGATION_FULL : MDS_MITIGATION_OFF; static bool mds_nosmt __ro_after_init = false; static const char * const mds_strings[] = { @@ -252,7 +254,7 @@ static void __init mds_select_mitigation(void) if (!boot_cpu_has(X86_FEATURE_MD_CLEAR)) mds_mitigation = MDS_MITIGATION_VMWERV; - static_branch_enable(&mds_user_clear); + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) && (mds_nosmt || cpu_mitigations_auto_nosmt())) @@ -292,7 +294,8 @@ enum taa_mitigations { }; /* Default mitigation for TAA-affected CPUs */ -static enum taa_mitigations taa_mitigation __ro_after_init = TAA_MITIGATION_VERW; +static enum taa_mitigations taa_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_TAA) ? TAA_MITIGATION_VERW : TAA_MITIGATION_OFF; static bool taa_nosmt __ro_after_init; static const char * const taa_strings[] = { @@ -304,8 +307,6 @@ static const char * const taa_strings[] = { static void __init taa_select_mitigation(void) { - u64 ia32_cap; - if (!boot_cpu_has_bug(X86_BUG_TAA)) { taa_mitigation = TAA_MITIGATION_OFF; return; @@ -344,9 +345,8 @@ static void __init taa_select_mitigation(void) * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode * update is required. */ - ia32_cap = x86_read_arch_cap_msr(); - if ( (ia32_cap & ARCH_CAP_MDS_NO) && - !(ia32_cap & ARCH_CAP_TSX_CTRL_MSR)) + if ( (x86_arch_cap_msr & ARCH_CAP_MDS_NO) && + !(x86_arch_cap_msr & ARCH_CAP_TSX_CTRL_MSR)) taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; /* @@ -356,7 +356,7 @@ static void __init taa_select_mitigation(void) * For guests that can't determine whether the correct microcode is * present on host, enable the mitigation for UCODE_NEEDED as well. */ - static_branch_enable(&mds_user_clear); + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); if (taa_nosmt || cpu_mitigations_auto_nosmt()) cpu_smt_disable(false); @@ -393,7 +393,8 @@ enum mmio_mitigations { }; /* Default mitigation for Processor MMIO Stale Data vulnerabilities */ -static enum mmio_mitigations mmio_mitigation __ro_after_init = MMIO_MITIGATION_VERW; +static enum mmio_mitigations mmio_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_MMIO_STALE_DATA) ? MMIO_MITIGATION_VERW : MMIO_MITIGATION_OFF; static bool mmio_nosmt __ro_after_init = false; static const char * const mmio_strings[] = { @@ -404,8 +405,6 @@ static const char * const mmio_strings[] = { static void __init mmio_select_mitigation(void) { - u64 ia32_cap; - if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) || boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN) || cpu_mitigations_off()) { @@ -416,15 +415,20 @@ static void __init mmio_select_mitigation(void) if (mmio_mitigation == MMIO_MITIGATION_OFF) return; - ia32_cap = x86_read_arch_cap_msr(); - /* * Enable CPU buffer clear mitigation for host and VMM, if also affected * by MDS or TAA. Otherwise, enable mitigation for VMM only. 
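Throughout these bugs.c hunks, the default for each mitigation (MDS, TAA, MMIO Stale Data, and the new RFDS below) becomes IS_ENABLED(CONFIG_MITIGATION_*) ? FULL/VERW : OFF, so the Kconfig choice decides the boot-time default without an #ifdef around each initializer. A simplified standalone model of why IS_ENABLED() can sit inside an ordinary initializer (the real macro in linux/kconfig.h also handles =m; the CONFIG names below are made up):

#include <stdio.h>

#define CONFIG_MITIGATION_DEMO 1        /* pretend Kconfig sets this to 1 */

/* Expands to 1 when the option macro is defined as 1, to 0 when it is
 * not defined at all, via the second-argument trick. */
#define __ARG_PLACEHOLDER_1 0,
#define __take_second_arg(__ignored, val, ...) val
#define ____is_defined(arg1_or_junk) __take_second_arg(arg1_or_junk 1, 0)
#define ___is_defined(val) ____is_defined(__ARG_PLACEHOLDER_##val)
#define __is_defined(x) ___is_defined(x)
#define IS_ENABLED_DEMO(option) __is_defined(option)

enum demo_mitigations { DEMO_MITIGATION_OFF, DEMO_MITIGATION_FULL };

/* Compile-time constants, so they are valid static initializers. */
static int demo_default =
        IS_ENABLED_DEMO(CONFIG_MITIGATION_DEMO) ? DEMO_MITIGATION_FULL : DEMO_MITIGATION_OFF;
static int other_default =
        IS_ENABLED_DEMO(CONFIG_MITIGATION_OTHER) ? DEMO_MITIGATION_FULL : DEMO_MITIGATION_OFF;

int main(void)
{
        printf("demo default=%d, other default=%d\n", demo_default, other_default);
        return 0;
}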
*/ if (boot_cpu_has_bug(X86_BUG_MDS) || (boot_cpu_has_bug(X86_BUG_TAA) && boot_cpu_has(X86_FEATURE_RTM))) - static_branch_enable(&mds_user_clear); + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + + /* + * X86_FEATURE_CLEAR_CPU_BUF could be enabled by other VERW based + * mitigations, disable KVM-only mitigation in that case. + */ + if (boot_cpu_has(X86_FEATURE_CLEAR_CPU_BUF)) + static_branch_disable(&mmio_stale_data_clear); else static_branch_enable(&mmio_stale_data_clear); @@ -433,7 +437,7 @@ static void __init mmio_select_mitigation(void) * be propagated to uncore buffers, clearing the Fill buffers on idle * is required irrespective of SMT state. */ - if (!(ia32_cap & ARCH_CAP_FBSDP_NO)) + if (!(x86_arch_cap_msr & ARCH_CAP_FBSDP_NO)) static_branch_enable(&mds_idle_clear); /* @@ -443,10 +447,10 @@ static void __init mmio_select_mitigation(void) * FB_CLEAR or by the presence of both MD_CLEAR and L1D_FLUSH on MDS * affected systems. */ - if ((ia32_cap & ARCH_CAP_FB_CLEAR) || + if ((x86_arch_cap_msr & ARCH_CAP_FB_CLEAR) || (boot_cpu_has(X86_FEATURE_MD_CLEAR) && boot_cpu_has(X86_FEATURE_FLUSH_L1D) && - !(ia32_cap & ARCH_CAP_MDS_NO))) + !(x86_arch_cap_msr & ARCH_CAP_MDS_NO))) mmio_mitigation = MMIO_MITIGATION_VERW; else mmio_mitigation = MMIO_MITIGATION_UCODE_NEEDED; @@ -477,6 +481,57 @@ static int __init mmio_stale_data_parse_cmdline(char *str) early_param("mmio_stale_data", mmio_stale_data_parse_cmdline); #undef pr_fmt +#define pr_fmt(fmt) "Register File Data Sampling: " fmt + +enum rfds_mitigations { + RFDS_MITIGATION_OFF, + RFDS_MITIGATION_VERW, + RFDS_MITIGATION_UCODE_NEEDED, +}; + +/* Default mitigation for Register File Data Sampling */ +static enum rfds_mitigations rfds_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_RFDS) ? RFDS_MITIGATION_VERW : RFDS_MITIGATION_OFF; + +static const char * const rfds_strings[] = { + [RFDS_MITIGATION_OFF] = "Vulnerable", + [RFDS_MITIGATION_VERW] = "Mitigation: Clear Register File", + [RFDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode", +}; + +static void __init rfds_select_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_RFDS) || cpu_mitigations_off()) { + rfds_mitigation = RFDS_MITIGATION_OFF; + return; + } + if (rfds_mitigation == RFDS_MITIGATION_OFF) + return; + + if (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR) + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + else + rfds_mitigation = RFDS_MITIGATION_UCODE_NEEDED; +} + +static __init int rfds_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!boot_cpu_has_bug(X86_BUG_RFDS)) + return 0; + + if (!strcmp(str, "off")) + rfds_mitigation = RFDS_MITIGATION_OFF; + else if (!strcmp(str, "on")) + rfds_mitigation = RFDS_MITIGATION_VERW; + + return 0; +} +early_param("reg_file_data_sampling", rfds_parse_cmdline); + +#undef pr_fmt #define pr_fmt(fmt) "" fmt static void __init md_clear_update_mitigation(void) @@ -484,12 +539,12 @@ static void __init md_clear_update_mitigation(void) if (cpu_mitigations_off()) return; - if (!static_key_enabled(&mds_user_clear)) + if (!boot_cpu_has(X86_FEATURE_CLEAR_CPU_BUF)) goto out; /* - * mds_user_clear is now enabled. Update MDS, TAA and MMIO Stale Data - * mitigation, if necessary. + * X86_FEATURE_CLEAR_CPU_BUF is now enabled. Update MDS, TAA and MMIO + * Stale Data mitigation, if necessary. 
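/*
 * Illustrative sketch, not part of the patch: the rfds_mitigation state
 * selected above is reported through the standard vulnerabilities
 * directory (see cpu_show_reg_file_data_sampling() later in this
 * file's hunks).  A minimal user-space reader, assuming the usual
 * sysfs mount point:
 */
#include <stdio.h>

int main(void)
{
	char line[128];
	FILE *f = fopen("/sys/devices/system/cpu/vulnerabilities/reg_file_data_sampling", "r");

	if (!f) {
		perror("reg_file_data_sampling");
		return 1;
	}
	if (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* e.g. "Mitigation: Clear Register File" */
	fclose(f);
	return 0;
}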
*/ if (mds_mitigation == MDS_MITIGATION_OFF && boot_cpu_has_bug(X86_BUG_MDS)) { @@ -501,11 +556,19 @@ static void __init md_clear_update_mitigation(void) taa_mitigation = TAA_MITIGATION_VERW; taa_select_mitigation(); } - if (mmio_mitigation == MMIO_MITIGATION_OFF && - boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) { + /* + * MMIO_MITIGATION_OFF is not checked here so that mmio_stale_data_clear + * gets updated correctly as per X86_FEATURE_CLEAR_CPU_BUF state. + */ + if (boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) { mmio_mitigation = MMIO_MITIGATION_VERW; mmio_select_mitigation(); } + if (rfds_mitigation == RFDS_MITIGATION_OFF && + boot_cpu_has_bug(X86_BUG_RFDS)) { + rfds_mitigation = RFDS_MITIGATION_VERW; + rfds_select_mitigation(); + } out: if (boot_cpu_has_bug(X86_BUG_MDS)) pr_info("MDS: %s\n", mds_strings[mds_mitigation]); @@ -515,6 +578,8 @@ out: pr_info("MMIO Stale Data: %s\n", mmio_strings[mmio_mitigation]); else if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) pr_info("MMIO Stale Data: Unknown: No mitigations\n"); + if (boot_cpu_has_bug(X86_BUG_RFDS)) + pr_info("Register File Data Sampling: %s\n", rfds_strings[rfds_mitigation]); } static void __init md_clear_select_mitigation(void) @@ -522,11 +587,12 @@ static void __init md_clear_select_mitigation(void) mds_select_mitigation(); taa_select_mitigation(); mmio_select_mitigation(); + rfds_select_mitigation(); /* - * As MDS, TAA and MMIO Stale Data mitigations are inter-related, update - * and print their mitigation after MDS, TAA and MMIO Stale Data - * mitigation selection is done. + * As these mitigations are inter-related and rely on VERW instruction + * to clear the microarchitural buffers, update and print their status + * after mitigation selection is done for each of these vulnerabilities. */ md_clear_update_mitigation(); } @@ -542,7 +608,8 @@ enum srbds_mitigations { SRBDS_MITIGATION_HYPERVISOR, }; -static enum srbds_mitigations srbds_mitigation __ro_after_init = SRBDS_MITIGATION_FULL; +static enum srbds_mitigations srbds_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_SRBDS) ? SRBDS_MITIGATION_FULL : SRBDS_MITIGATION_OFF; static const char * const srbds_strings[] = { [SRBDS_MITIGATION_OFF] = "Vulnerable", @@ -593,8 +660,6 @@ void update_srbds_msr(void) static void __init srbds_select_mitigation(void) { - u64 ia32_cap; - if (!boot_cpu_has_bug(X86_BUG_SRBDS)) return; @@ -603,8 +668,7 @@ static void __init srbds_select_mitigation(void) * are only exposed to SRBDS when TSX is enabled or when CPU is affected * by Processor MMIO Stale Data vulnerability. */ - ia32_cap = x86_read_arch_cap_msr(); - if ((ia32_cap & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM) && + if ((x86_arch_cap_msr & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM) && !boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) srbds_mitigation = SRBDS_MITIGATION_TSX_OFF; else if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) @@ -671,11 +735,8 @@ enum gds_mitigations { GDS_MITIGATION_HYPERVISOR, }; -#if IS_ENABLED(CONFIG_GDS_FORCE_MITIGATION) -static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FORCE; -#else -static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FULL; -#endif +static enum gds_mitigations gds_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_GDS) ? 
GDS_MITIGATION_FULL : GDS_MITIGATION_OFF; static const char * const gds_strings[] = { [GDS_MITIGATION_OFF] = "Vulnerable", @@ -747,7 +808,7 @@ static void __init gds_select_mitigation(void) /* Will verify below that mitigation _can_ be disabled */ /* No microcode */ - if (!(x86_read_arch_cap_msr() & ARCH_CAP_GDS_CTRL)) { + if (!(x86_arch_cap_msr & ARCH_CAP_GDS_CTRL)) { if (gds_mitigation == GDS_MITIGATION_FORCE) { /* * This only needs to be done on the boot CPU so do it @@ -811,7 +872,8 @@ enum spectre_v1_mitigation { }; static enum spectre_v1_mitigation spectre_v1_mitigation __ro_after_init = - SPECTRE_V1_MITIGATION_AUTO; + IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V1) ? + SPECTRE_V1_MITIGATION_AUTO : SPECTRE_V1_MITIGATION_NONE; static const char * const spectre_v1_strings[] = { [SPECTRE_V1_MITIGATION_NONE] = "Vulnerable: __user pointer sanitization and usercopy barriers only; no swapgs barriers", @@ -926,7 +988,7 @@ static const char * const retbleed_strings[] = { static enum retbleed_mitigation retbleed_mitigation __ro_after_init = RETBLEED_MITIGATION_NONE; static enum retbleed_mitigation_cmd retbleed_cmd __ro_after_init = - RETBLEED_CMD_AUTO; + IS_ENABLED(CONFIG_MITIGATION_RETBLEED) ? RETBLEED_CMD_AUTO : RETBLEED_CMD_OFF; static int __ro_after_init retbleed_nosmt = false; @@ -982,10 +1044,10 @@ static void __init retbleed_select_mitigation(void) return; case RETBLEED_CMD_UNRET: - if (IS_ENABLED(CONFIG_CPU_UNRET_ENTRY)) { + if (IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY)) { retbleed_mitigation = RETBLEED_MITIGATION_UNRET; } else { - pr_err("WARNING: kernel not compiled with CPU_UNRET_ENTRY.\n"); + pr_err("WARNING: kernel not compiled with MITIGATION_UNRET_ENTRY.\n"); goto do_cmd_auto; } break; @@ -994,24 +1056,24 @@ static void __init retbleed_select_mitigation(void) if (!boot_cpu_has(X86_FEATURE_IBPB)) { pr_err("WARNING: CPU does not support IBPB.\n"); goto do_cmd_auto; - } else if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY)) { + } else if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { retbleed_mitigation = RETBLEED_MITIGATION_IBPB; } else { - pr_err("WARNING: kernel not compiled with CPU_IBPB_ENTRY.\n"); + pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n"); goto do_cmd_auto; } break; case RETBLEED_CMD_STUFF: - if (IS_ENABLED(CONFIG_CALL_DEPTH_TRACKING) && + if (IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING) && spectre_v2_enabled == SPECTRE_V2_RETPOLINE) { retbleed_mitigation = RETBLEED_MITIGATION_STUFF; } else { - if (IS_ENABLED(CONFIG_CALL_DEPTH_TRACKING)) + if (IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING)) pr_err("WARNING: retbleed=stuff depends on spectre_v2=retpoline\n"); else - pr_err("WARNING: kernel not compiled with CALL_DEPTH_TRACKING.\n"); + pr_err("WARNING: kernel not compiled with MITIGATION_CALL_DEPTH_TRACKING.\n"); goto do_cmd_auto; } @@ -1021,9 +1083,10 @@ do_cmd_auto: case RETBLEED_CMD_AUTO: if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { - if (IS_ENABLED(CONFIG_CPU_UNRET_ENTRY)) + if (IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY)) retbleed_mitigation = RETBLEED_MITIGATION_UNRET; - else if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY) && boot_cpu_has(X86_FEATURE_IBPB)) + else if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY) && + boot_cpu_has(X86_FEATURE_IBPB)) retbleed_mitigation = RETBLEED_MITIGATION_IBPB; } @@ -1054,6 +1117,22 @@ do_cmd_auto: setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); mitigate_smt = true; + + /* + * IBPB on entry already obviates the need for + * software-based 
untraining so clear those in case some + * other mitigation like SRSO has selected them. + */ + setup_clear_cpu_cap(X86_FEATURE_UNRET); + setup_clear_cpu_cap(X86_FEATURE_RETHUNK); + + /* + * There is no need for RSB filling: entry_ibpb() ensures + * all predictions, including the RSB, are invalidated, + * regardless of IBPB implementation. + */ + setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT); + break; case RETBLEED_MITIGATION_STUFF: @@ -1102,7 +1181,7 @@ static enum spectre_v2_user_mitigation spectre_v2_user_stibp __ro_after_init = static enum spectre_v2_user_mitigation spectre_v2_user_ibpb __ro_after_init = SPECTRE_V2_USER_NONE; -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE static bool spectre_v2_bad_module; bool retpoline_module_ok(bool has_retpoline) @@ -1386,17 +1465,18 @@ static void __init spec_v2_print_cond(const char *reason, bool secure) static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) { - enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO; + enum spectre_v2_mitigation_cmd cmd; char arg[20]; int ret, i; + cmd = IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? SPECTRE_V2_CMD_AUTO : SPECTRE_V2_CMD_NONE; if (cmdline_find_option_bool(boot_command_line, "nospectre_v2") || cpu_mitigations_off()) return SPECTRE_V2_CMD_NONE; ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg)); if (ret < 0) - return SPECTRE_V2_CMD_AUTO; + return cmd; for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) { if (!match_option(arg, ret, mitigation_options[i].option)) @@ -1406,8 +1486,8 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) } if (i >= ARRAY_SIZE(mitigation_options)) { - pr_err("unknown option (%s). Switching to AUTO select\n", arg); - return SPECTRE_V2_CMD_AUTO; + pr_err("unknown option (%s). Switching to default mode\n", arg); + return cmd; } if ((cmd == SPECTRE_V2_CMD_RETPOLINE || @@ -1415,7 +1495,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC || cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && - !IS_ENABLED(CONFIG_RETPOLINE)) { + !IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)) { pr_err("%s selected but not compiled in. Switching to AUTO select\n", mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; @@ -1438,7 +1518,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) return SPECTRE_V2_CMD_AUTO; } - if (cmd == SPECTRE_V2_CMD_IBRS && !IS_ENABLED(CONFIG_CPU_IBRS_ENTRY)) { + if (cmd == SPECTRE_V2_CMD_IBRS && !IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY)) { pr_err("%s selected but not compiled in. 
Switching to AUTO select\n", mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; @@ -1469,7 +1549,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void) { - if (!IS_ENABLED(CONFIG_RETPOLINE)) { + if (!IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)) { pr_err("Kernel not compiled with retpoline; no mitigation available!"); return SPECTRE_V2_NONE; } @@ -1477,20 +1557,25 @@ static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void) return SPECTRE_V2_RETPOLINE; } +static bool __ro_after_init rrsba_disabled; + /* Disable in-kernel use of non-RSB RET predictors */ static void __init spec_ctrl_disable_kernel_rrsba(void) { - u64 ia32_cap; + if (rrsba_disabled) + return; - if (!boot_cpu_has(X86_FEATURE_RRSBA_CTRL)) + if (!(x86_arch_cap_msr & ARCH_CAP_RRSBA)) { + rrsba_disabled = true; return; + } - ia32_cap = x86_read_arch_cap_msr(); + if (!boot_cpu_has(X86_FEATURE_RRSBA_CTRL)) + return; - if (ia32_cap & ARCH_CAP_RRSBA) { - x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S; - update_spec_ctrl(x86_spec_ctrl_base); - } + x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S; + update_spec_ctrl(x86_spec_ctrl_base); + rrsba_disabled = true; } static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_mitigation mode) @@ -1540,6 +1625,80 @@ static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_ dump_stack(); } +/* + * Set BHI_DIS_S to prevent indirect branches in kernel to be influenced by + * branch history in userspace. Not needed if BHI_NO is set. + */ +static bool __init spec_ctrl_bhi_dis(void) +{ + if (!boot_cpu_has(X86_FEATURE_BHI_CTRL)) + return false; + + x86_spec_ctrl_base |= SPEC_CTRL_BHI_DIS_S; + update_spec_ctrl(x86_spec_ctrl_base); + setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_HW); + + return true; +} + +enum bhi_mitigations { + BHI_MITIGATION_OFF, + BHI_MITIGATION_ON, + BHI_MITIGATION_VMEXIT_ONLY, +}; + +static enum bhi_mitigations bhi_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_SPECTRE_BHI) ? 
BHI_MITIGATION_ON : BHI_MITIGATION_OFF; + +static int __init spectre_bhi_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) + bhi_mitigation = BHI_MITIGATION_OFF; + else if (!strcmp(str, "on")) + bhi_mitigation = BHI_MITIGATION_ON; + else if (!strcmp(str, "vmexit")) + bhi_mitigation = BHI_MITIGATION_VMEXIT_ONLY; + else + pr_err("Ignoring unknown spectre_bhi option (%s)", str); + + return 0; +} +early_param("spectre_bhi", spectre_bhi_parse_cmdline); + +static void __init bhi_select_mitigation(void) +{ + if (bhi_mitigation == BHI_MITIGATION_OFF) + return; + + /* Retpoline mitigates against BHI unless the CPU has RRSBA behavior */ + if (boot_cpu_has(X86_FEATURE_RETPOLINE) && + !boot_cpu_has(X86_FEATURE_RETPOLINE_LFENCE)) { + spec_ctrl_disable_kernel_rrsba(); + if (rrsba_disabled) + return; + } + + /* Mitigate in hardware if supported */ + if (spec_ctrl_bhi_dis()) + return; + + if (!IS_ENABLED(CONFIG_X86_64)) + return; + + if (bhi_mitigation == BHI_MITIGATION_VMEXIT_ONLY) { + pr_info("Spectre BHI mitigation: SW BHB clearing on VM exit only\n"); + setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT); + return; + } + + pr_info("Spectre BHI mitigation: SW BHB clearing on syscall and VM exit\n"); + setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP); + setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT); +} + static void __init spectre_v2_select_mitigation(void) { enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); @@ -1564,7 +1723,7 @@ static void __init spectre_v2_select_mitigation(void) break; } - if (IS_ENABLED(CONFIG_CPU_IBRS_ENTRY) && + if (IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY) && boot_cpu_has_bug(X86_BUG_RETBLEED) && retbleed_cmd != RETBLEED_CMD_OFF && retbleed_cmd != RETBLEED_CMD_STUFF && @@ -1651,6 +1810,9 @@ static void __init spectre_v2_select_mitigation(void) mode == SPECTRE_V2_RETPOLINE) spec_ctrl_disable_kernel_rrsba(); + if (boot_cpu_has(X86_BUG_BHI)) + bhi_select_mitigation(); + spectre_v2_enabled = mode; pr_info("%s\n", spectre_v2_strings[mode]); @@ -1765,8 +1927,6 @@ static void update_indir_branch_cond(void) /* Update the static key controlling the MDS CPU buffer clear in idle */ static void update_mds_branch_idle(void) { - u64 ia32_cap = x86_read_arch_cap_msr(); - /* * Enable the idle clearing if SMT is active on CPUs which are * affected only by MSBDS and not any other MDS variant. @@ -1781,7 +1941,7 @@ static void update_mds_branch_idle(void) if (sched_smt_active()) { static_branch_enable(&mds_idle_clear); } else if (mmio_mitigation == MMIO_MITIGATION_OFF || - (ia32_cap & ARCH_CAP_FBSDP_NO)) { + (x86_arch_cap_msr & ARCH_CAP_FBSDP_NO)) { static_branch_disable(&mds_idle_clear); } } @@ -1880,10 +2040,12 @@ static const struct { static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void) { - enum ssb_mitigation_cmd cmd = SPEC_STORE_BYPASS_CMD_AUTO; + enum ssb_mitigation_cmd cmd; char arg[20]; int ret, i; + cmd = IS_ENABLED(CONFIG_MITIGATION_SSB) ? 
+ SPEC_STORE_BYPASS_CMD_AUTO : SPEC_STORE_BYPASS_CMD_NONE; if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable") || cpu_mitigations_off()) { return SPEC_STORE_BYPASS_CMD_NONE; @@ -1891,7 +2053,7 @@ static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void) ret = cmdline_find_option(boot_command_line, "spec_store_bypass_disable", arg, sizeof(arg)); if (ret < 0) - return SPEC_STORE_BYPASS_CMD_AUTO; + return cmd; for (i = 0; i < ARRAY_SIZE(ssb_mitigation_options); i++) { if (!match_option(arg, ret, ssb_mitigation_options[i].option)) @@ -1902,8 +2064,8 @@ static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void) } if (i >= ARRAY_SIZE(ssb_mitigation_options)) { - pr_err("unknown option (%s). Switching to AUTO select\n", arg); - return SPEC_STORE_BYPASS_CMD_AUTO; + pr_err("unknown option (%s). Switching to default mode\n", arg); + return cmd; } } @@ -2230,7 +2392,8 @@ EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation); #define pr_fmt(fmt) "L1TF: " fmt /* Default mitigation for L1TF-affected CPUs */ -enum l1tf_mitigations l1tf_mitigation __ro_after_init = L1TF_MITIGATION_FLUSH; +enum l1tf_mitigations l1tf_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_L1TF) ? L1TF_MITIGATION_FLUSH : L1TF_MITIGATION_OFF; #if IS_ENABLED(CONFIG_KVM_INTEL) EXPORT_SYMBOL_GPL(l1tf_mitigation); #endif @@ -2256,20 +2419,20 @@ static void override_cache_bits(struct cpuinfo_x86 *c) if (c->x86 != 6) return; - switch (c->x86_model) { - case INTEL_FAM6_NEHALEM: - case INTEL_FAM6_WESTMERE: - case INTEL_FAM6_SANDYBRIDGE: - case INTEL_FAM6_IVYBRIDGE: - case INTEL_FAM6_HASWELL: - case INTEL_FAM6_HASWELL_L: - case INTEL_FAM6_HASWELL_G: - case INTEL_FAM6_BROADWELL: - case INTEL_FAM6_BROADWELL_G: - case INTEL_FAM6_SKYLAKE_L: - case INTEL_FAM6_SKYLAKE: - case INTEL_FAM6_KABYLAKE_L: - case INTEL_FAM6_KABYLAKE: + switch (c->x86_vfm) { + case INTEL_NEHALEM: + case INTEL_WESTMERE: + case INTEL_SANDYBRIDGE: + case INTEL_IVYBRIDGE: + case INTEL_HASWELL: + case INTEL_HASWELL_L: + case INTEL_HASWELL_G: + case INTEL_BROADWELL: + case INTEL_BROADWELL_G: + case INTEL_SKYLAKE_L: + case INTEL_SKYLAKE: + case INTEL_KABYLAKE_L: + case INTEL_KABYLAKE: if (c->x86_cache_bits < 44) c->x86_cache_bits = 44; break; @@ -2410,10 +2573,9 @@ static void __init srso_select_mitigation(void) { bool has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE); - if (cpu_mitigations_off()) - return; - - if (!boot_cpu_has_bug(X86_BUG_SRSO)) { + if (!boot_cpu_has_bug(X86_BUG_SRSO) || + cpu_mitigations_off() || + srso_cmd == SRSO_CMD_OFF) { if (boot_cpu_has(X86_FEATURE_SBPB)) x86_pred_cmd = PRED_CMD_SBPB; return; @@ -2444,11 +2606,6 @@ static void __init srso_select_mitigation(void) } switch (srso_cmd) { - case SRSO_CMD_OFF: - if (boot_cpu_has(X86_FEATURE_SBPB)) - x86_pred_cmd = PRED_CMD_SBPB; - return; - case SRSO_CMD_MICROCODE: if (has_microcode) { srso_mitigation = SRSO_MITIGATION_MICROCODE; @@ -2457,7 +2614,10 @@ static void __init srso_select_mitigation(void) break; case SRSO_CMD_SAFE_RET: - if (IS_ENABLED(CONFIG_CPU_SRSO)) { + if (boot_cpu_has(X86_FEATURE_SRSO_USER_KERNEL_NO)) + goto ibpb_on_vmexit; + + if (IS_ENABLED(CONFIG_MITIGATION_SRSO)) { /* * Enable the return thunk for generated code * like ftrace, static_call, etc. 
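/*
 * Illustrative sketch, not part of the patch: override_cache_bits() and
 * the vulnerability tables now match on c->x86_vfm rather than raw
 * INTEL_FAM6_* model numbers.  Assuming the mainline packing (model in
 * bits 0-7, family in bits 8-15, vendor in bits 16-23), the value can
 * be demonstrated stand-alone; 0x4e is SKYLAKE_L's model number.
 */
#include <stdio.h>

#define DEMO_VFM_MAKE(vendor, family, model) \
	(((vendor) << 16) | ((family) << 8) | (model))

int main(void)
{
	unsigned int vendor_intel = 0;	/* X86_VENDOR_INTEL */
	unsigned int skylake_l = DEMO_VFM_MAKE(vendor_intel, 6, 0x4e);

	printf("vfm=0x%06x family=%u model=0x%02x\n",
	       skylake_l, (skylake_l >> 8) & 0xff, skylake_l & 0xff);
	return 0;
}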
@@ -2477,30 +2637,56 @@ static void __init srso_select_mitigation(void) else srso_mitigation = SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED; } else { - pr_err("WARNING: kernel not compiled with CPU_SRSO.\n"); + pr_err("WARNING: kernel not compiled with MITIGATION_SRSO.\n"); } break; case SRSO_CMD_IBPB: - if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY)) { + if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { if (has_microcode) { setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); + setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); srso_mitigation = SRSO_MITIGATION_IBPB; + + /* + * IBPB on entry already obviates the need for + * software-based untraining so clear those in case some + * other mitigation like Retbleed has selected them. + */ + setup_clear_cpu_cap(X86_FEATURE_UNRET); + setup_clear_cpu_cap(X86_FEATURE_RETHUNK); + + /* + * There is no need for RSB filling: entry_ibpb() ensures + * all predictions, including the RSB, are invalidated, + * regardless of IBPB implementation. + */ + setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT); } } else { - pr_err("WARNING: kernel not compiled with CPU_IBPB_ENTRY.\n"); + pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n"); } break; +ibpb_on_vmexit: case SRSO_CMD_IBPB_ON_VMEXIT: - if (IS_ENABLED(CONFIG_CPU_SRSO)) { - if (!boot_cpu_has(X86_FEATURE_ENTRY_IBPB) && has_microcode) { + if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { + if (has_microcode) { setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT; + + /* + * There is no need for RSB filling: entry_ibpb() ensures + * all predictions, including the RSB, are invalidated, + * regardless of IBPB implementation. + */ + setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT); } } else { - pr_err("WARNING: kernel not compiled with CPU_SRSO.\n"); - } + pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n"); + } + break; + default: break; } @@ -2615,6 +2801,11 @@ static ssize_t mmio_stale_data_show_state(char *buf) sched_smt_active() ? 
"vulnerable" : "disabled"); } +static ssize_t rfds_show_state(char *buf) +{ + return sysfs_emit(buf, "%s\n", rfds_strings[rfds_mitigation]); +} + static char *stibp_state(void) { if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && @@ -2623,15 +2814,15 @@ static char *stibp_state(void) switch (spectre_v2_user_stibp) { case SPECTRE_V2_USER_NONE: - return ", STIBP: disabled"; + return "; STIBP: disabled"; case SPECTRE_V2_USER_STRICT: - return ", STIBP: forced"; + return "; STIBP: forced"; case SPECTRE_V2_USER_STRICT_PREFERRED: - return ", STIBP: always-on"; + return "; STIBP: always-on"; case SPECTRE_V2_USER_PRCTL: case SPECTRE_V2_USER_SECCOMP: if (static_key_enabled(&switch_to_cond_stibp)) - return ", STIBP: conditional"; + return "; STIBP: conditional"; } return ""; } @@ -2640,10 +2831,10 @@ static char *ibpb_state(void) { if (boot_cpu_has(X86_FEATURE_IBPB)) { if (static_key_enabled(&switch_mm_always_ibpb)) - return ", IBPB: always-on"; + return "; IBPB: always-on"; if (static_key_enabled(&switch_mm_cond_ibpb)) - return ", IBPB: conditional"; - return ", IBPB: disabled"; + return "; IBPB: conditional"; + return "; IBPB: disabled"; } return ""; } @@ -2653,14 +2844,32 @@ static char *pbrsb_eibrs_state(void) if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) { if (boot_cpu_has(X86_FEATURE_RSB_VMEXIT_LITE) || boot_cpu_has(X86_FEATURE_RSB_VMEXIT)) - return ", PBRSB-eIBRS: SW sequence"; + return "; PBRSB-eIBRS: SW sequence"; else - return ", PBRSB-eIBRS: Vulnerable"; + return "; PBRSB-eIBRS: Vulnerable"; } else { - return ", PBRSB-eIBRS: Not affected"; + return "; PBRSB-eIBRS: Not affected"; } } +static const char *spectre_bhi_state(void) +{ + if (!boot_cpu_has_bug(X86_BUG_BHI)) + return "; BHI: Not affected"; + else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_HW)) + return "; BHI: BHI_DIS_S"; + else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP)) + return "; BHI: SW loop, KVM: SW loop"; + else if (boot_cpu_has(X86_FEATURE_RETPOLINE) && + !boot_cpu_has(X86_FEATURE_RETPOLINE_LFENCE) && + rrsba_disabled) + return "; BHI: Retpoline"; + else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT)) + return "; BHI: Vulnerable, KVM: SW loop"; + + return "; BHI: Vulnerable"; +} + static ssize_t spectre_v2_show_state(char *buf) { if (spectre_v2_enabled == SPECTRE_V2_LFENCE) @@ -2673,13 +2882,15 @@ static ssize_t spectre_v2_show_state(char *buf) spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE) return sysfs_emit(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n"); - return sysfs_emit(buf, "%s%s%s%s%s%s%s\n", + return sysfs_emit(buf, "%s%s%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], ibpb_state(), - boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", + boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? "; IBRS_FW" : "", stibp_state(), - boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "", + boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? 
"; RSB filling" : "", pbrsb_eibrs_state(), + spectre_bhi_state(), + /* this should always be at the end */ spectre_v2_module_string()); } @@ -2774,6 +2985,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_GDS: return gds_show_state(buf); + case X86_BUG_RFDS: + return rfds_show_state(buf); + default: break; } @@ -2848,4 +3062,14 @@ ssize_t cpu_show_gds(struct device *dev, struct device_attribute *attr, char *bu { return cpu_show_common(dev, attr, buf, X86_BUG_GDS); } + +ssize_t cpu_show_reg_file_data_sampling(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_RFDS); +} #endif + +void __warn_thunk(void) +{ + WARN_ONCE(1, "Unpatched return thunk in use. This should not happen!\n"); +} diff --git a/arch/x86/kernel/cpu/bus_lock.c b/arch/x86/kernel/cpu/bus_lock.c new file mode 100644 index 000000000000..6cba85c79d42 --- /dev/null +++ b/arch/x86/kernel/cpu/bus_lock.c @@ -0,0 +1,406 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define pr_fmt(fmt) "x86/split lock detection: " fmt + +#include <linux/semaphore.h> +#include <linux/workqueue.h> +#include <linux/delay.h> +#include <linux/cpuhotplug.h> +#include <asm/cpu_device_id.h> +#include <asm/cmdline.h> +#include <asm/traps.h> +#include <asm/cpu.h> + +enum split_lock_detect_state { + sld_off = 0, + sld_warn, + sld_fatal, + sld_ratelimit, +}; + +/* + * Default to sld_off because most systems do not support split lock detection. + * sld_state_setup() will switch this to sld_warn on systems that support + * split lock/bus lock detect, unless there is a command line override. + */ +static enum split_lock_detect_state sld_state __ro_after_init = sld_off; +static u64 msr_test_ctrl_cache __ro_after_init; + +/* + * With a name like MSR_TEST_CTL it should go without saying, but don't touch + * MSR_TEST_CTL unless the CPU is one of the whitelisted models. Writing it + * on CPUs that do not support SLD can cause fireworks, even when writing '0'. + */ +static bool cpu_model_supports_sld __ro_after_init; + +static const struct { + const char *option; + enum split_lock_detect_state state; +} sld_options[] __initconst = { + { "off", sld_off }, + { "warn", sld_warn }, + { "fatal", sld_fatal }, + { "ratelimit:", sld_ratelimit }, +}; + +static struct ratelimit_state bld_ratelimit; + +static unsigned int sysctl_sld_mitigate = 1; +static DEFINE_SEMAPHORE(buslock_sem, 1); + +#ifdef CONFIG_PROC_SYSCTL +static const struct ctl_table sld_sysctls[] = { + { + .procname = "split_lock_mitigate", + .data = &sysctl_sld_mitigate, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +}; + +static int __init sld_mitigate_sysctl_init(void) +{ + register_sysctl_init("kernel", sld_sysctls); + return 0; +} + +late_initcall(sld_mitigate_sysctl_init); +#endif + +static inline bool match_option(const char *arg, int arglen, const char *opt) +{ + int len = strlen(opt), ratelimit; + + if (strncmp(arg, opt, len)) + return false; + + /* + * Min ratelimit is 1 bus lock/sec. + * Max ratelimit is 1000 bus locks/sec. 
+ */ + if (sscanf(arg, "ratelimit:%d", &ratelimit) == 1 && + ratelimit > 0 && ratelimit <= 1000) { + ratelimit_state_init(&bld_ratelimit, HZ, ratelimit); + ratelimit_set_flags(&bld_ratelimit, RATELIMIT_MSG_ON_RELEASE); + return true; + } + + return len == arglen; +} + +static bool split_lock_verify_msr(bool on) +{ + u64 ctrl, tmp; + + if (rdmsrl_safe(MSR_TEST_CTRL, &ctrl)) + return false; + if (on) + ctrl |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT; + else + ctrl &= ~MSR_TEST_CTRL_SPLIT_LOCK_DETECT; + if (wrmsrl_safe(MSR_TEST_CTRL, ctrl)) + return false; + rdmsrl(MSR_TEST_CTRL, tmp); + return ctrl == tmp; +} + +static void __init sld_state_setup(void) +{ + enum split_lock_detect_state state = sld_warn; + char arg[20]; + int i, ret; + + if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && + !boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) + return; + + ret = cmdline_find_option(boot_command_line, "split_lock_detect", + arg, sizeof(arg)); + if (ret >= 0) { + for (i = 0; i < ARRAY_SIZE(sld_options); i++) { + if (match_option(arg, ret, sld_options[i].option)) { + state = sld_options[i].state; + break; + } + } + } + sld_state = state; +} + +static void __init __split_lock_setup(void) +{ + if (!split_lock_verify_msr(false)) { + pr_info("MSR access failed: Disabled\n"); + return; + } + + rdmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache); + + if (!split_lock_verify_msr(true)) { + pr_info("MSR access failed: Disabled\n"); + return; + } + + /* Restore the MSR to its cached value. */ + wrmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache); + + setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT); +} + +/* + * MSR_TEST_CTRL is per core, but we treat it like a per CPU MSR. Locking + * is not implemented as one thread could undo the setting of the other + * thread immediately after dropping the lock anyway. + */ +static void sld_update_msr(bool on) +{ + u64 test_ctrl_val = msr_test_ctrl_cache; + + if (on) + test_ctrl_val |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT; + + wrmsrl(MSR_TEST_CTRL, test_ctrl_val); +} + +void split_lock_init(void) +{ + /* + * #DB for bus lock handles ratelimit and #AC for split lock is + * disabled. + */ + if (sld_state == sld_ratelimit) { + split_lock_verify_msr(false); + return; + } + + if (cpu_model_supports_sld) + split_lock_verify_msr(sld_state != sld_off); +} + +static void __split_lock_reenable_unlock(struct work_struct *work) +{ + sld_update_msr(true); + up(&buslock_sem); +} + +static DECLARE_DELAYED_WORK(sl_reenable_unlock, __split_lock_reenable_unlock); + +static void __split_lock_reenable(struct work_struct *work) +{ + sld_update_msr(true); +} +static DECLARE_DELAYED_WORK(sl_reenable, __split_lock_reenable); + +/* + * If a CPU goes offline with pending delayed work to re-enable split lock + * detection then the delayed work will be executed on some other CPU. That + * handles releasing the buslock_sem, but because it executes on a + * different CPU probably won't re-enable split lock detection. This is a + * problem on HT systems since the sibling CPU on the same core may then be + * left running with split lock detection disabled. + * + * Unconditionally re-enable detection here. 
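/*
 * Illustrative sketch, not part of the patch: the split_lock_detect=
 * "ratelimit:N" parsing at the top of this hunk only accepts 1..1000
 * bus locks/sec; out-of-range values fall back to the exact string
 * comparison and therefore select nothing.  Same logic in user space,
 * with hypothetical inputs:
 */
#include <stdio.h>
#include <string.h>

static int ratelimit_matches(const char *arg)
{
	const char *opt = "ratelimit:";
	size_t len = strlen(opt);
	int ratelimit;

	if (strncmp(arg, opt, len))
		return 0;
	if (sscanf(arg, "ratelimit:%d", &ratelimit) == 1 &&
	    ratelimit > 0 && ratelimit <= 1000)
		return 1;	/* the kernel also initializes bld_ratelimit here */
	return strlen(arg) == len;
}

int main(void)
{
	const char *args[] = { "ratelimit:100", "ratelimit:5000", "ratelimit:0", "warn" };
	size_t i;

	for (i = 0; i < sizeof(args) / sizeof(args[0]); i++)
		printf("%-14s -> %d\n", args[i], ratelimit_matches(args[i]));
	return 0;
}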
+ */ +static int splitlock_cpu_offline(unsigned int cpu) +{ + sld_update_msr(true); + + return 0; +} + +static void split_lock_warn(unsigned long ip) +{ + struct delayed_work *work; + int cpu; + + if (!current->reported_split_lock) + pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n", + current->comm, current->pid, ip); + current->reported_split_lock = 1; + + if (sysctl_sld_mitigate) { + /* + * misery factor #1: + * sleep 10ms before trying to execute split lock. + */ + if (msleep_interruptible(10) > 0) + return; + /* + * Misery factor #2: + * only allow one buslocked disabled core at a time. + */ + if (down_interruptible(&buslock_sem) == -EINTR) + return; + work = &sl_reenable_unlock; + } else { + work = &sl_reenable; + } + + cpu = get_cpu(); + schedule_delayed_work_on(cpu, work, 2); + + /* Disable split lock detection on this CPU to make progress */ + sld_update_msr(false); + put_cpu(); +} + +bool handle_guest_split_lock(unsigned long ip) +{ + if (sld_state == sld_warn) { + split_lock_warn(ip); + return true; + } + + pr_warn_once("#AC: %s/%d %s split_lock trap at address: 0x%lx\n", + current->comm, current->pid, + sld_state == sld_fatal ? "fatal" : "bogus", ip); + + current->thread.error_code = 0; + current->thread.trap_nr = X86_TRAP_AC; + force_sig_fault(SIGBUS, BUS_ADRALN, NULL); + return false; +} +EXPORT_SYMBOL_GPL(handle_guest_split_lock); + +void bus_lock_init(void) +{ + u64 val; + + if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) + return; + + rdmsrl(MSR_IA32_DEBUGCTLMSR, val); + + if ((boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && + (sld_state == sld_warn || sld_state == sld_fatal)) || + sld_state == sld_off) { + /* + * Warn and fatal are handled by #AC for split lock if #AC for + * split lock is supported. + */ + val &= ~DEBUGCTLMSR_BUS_LOCK_DETECT; + } else { + val |= DEBUGCTLMSR_BUS_LOCK_DETECT; + } + + wrmsrl(MSR_IA32_DEBUGCTLMSR, val); +} + +bool handle_user_split_lock(struct pt_regs *regs, long error_code) +{ + if ((regs->flags & X86_EFLAGS_AC) || sld_state == sld_fatal) + return false; + split_lock_warn(regs->ip); + return true; +} + +void handle_bus_lock(struct pt_regs *regs) +{ + switch (sld_state) { + case sld_off: + break; + case sld_ratelimit: + /* Enforce no more than bld_ratelimit bus locks/sec. */ + while (!__ratelimit(&bld_ratelimit)) + msleep(20); + /* Warn on the bus lock. */ + fallthrough; + case sld_warn: + pr_warn_ratelimited("#DB: %s/%d took a bus_lock trap at address: 0x%lx\n", + current->comm, current->pid, regs->ip); + break; + case sld_fatal: + force_sig_fault(SIGBUS, BUS_ADRALN, NULL); + break; + } +} + +/* + * CPU models that are known to have the per-core split-lock detection + * feature even though they do not enumerate IA32_CORE_CAPABILITIES. + */ +static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = { + X86_MATCH_VFM(INTEL_ICELAKE_X, 0), + X86_MATCH_VFM(INTEL_ICELAKE_L, 0), + X86_MATCH_VFM(INTEL_ICELAKE_D, 0), + {} +}; + +static void __init split_lock_setup(struct cpuinfo_x86 *c) +{ + const struct x86_cpu_id *m; + u64 ia32_core_caps; + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return; + + /* Check for CPUs that have support but do not enumerate it: */ + m = x86_match_cpu(split_lock_cpu_ids); + if (m) + goto supported; + + if (!cpu_has(c, X86_FEATURE_CORE_CAPABILITIES)) + return; + + /* + * Not all bits in MSR_IA32_CORE_CAPS are architectural, but + * MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT is. All CPUs that set + * it have split lock detection. 
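/*
 * Illustrative sketch, not part of the patch: a split lock is a locked
 * read-modify-write whose operand straddles a cache line.  With
 * split_lock_detect=warn the snippet below trips split_lock_warn()
 * ("#AC: ... took a split_lock trap"); on CPUs with only bus lock
 * detect it is reported by handle_bus_lock() instead.  Uses the
 * GCC/Clang __atomic builtin; do not run on shared machines.
 */
#include <stdint.h>
#include <stdlib.h>

int main(void)
{
	/* 64-byte aligned buffer; a 4-byte word at offset 62 crosses a line. */
	char *buf = aligned_alloc(64, 128);
	uint32_t *p;

	if (!buf)
		return 1;
	p = (uint32_t *)(buf + 62);
	*p = 0;
	__atomic_fetch_add(p, 1, __ATOMIC_SEQ_CST);	/* split-locked add */
	free(buf);
	return 0;
}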
+ */ + rdmsrl(MSR_IA32_CORE_CAPS, ia32_core_caps); + if (ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT) + goto supported; + + /* CPU is not in the model list and does not have the MSR bit: */ + return; + +supported: + cpu_model_supports_sld = true; + __split_lock_setup(); +} + +static void sld_state_show(void) +{ + if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) && + !boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) + return; + + switch (sld_state) { + case sld_off: + pr_info("disabled\n"); + break; + case sld_warn: + if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) { + pr_info("#AC: crashing the kernel on kernel split_locks and warning on user-space split_locks\n"); + if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, + "x86/splitlock", NULL, splitlock_cpu_offline) < 0) + pr_warn("No splitlock CPU offline handler\n"); + } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) { + pr_info("#DB: warning on user-space bus_locks\n"); + } + break; + case sld_fatal: + if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) { + pr_info("#AC: crashing the kernel on kernel split_locks and sending SIGBUS on user-space split_locks\n"); + } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) { + pr_info("#DB: sending SIGBUS on user-space bus_locks%s\n", + boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) ? + " from non-WB" : ""); + } + break; + case sld_ratelimit: + if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) + pr_info("#DB: setting system wide bus lock rate limit to %u/sec\n", bld_ratelimit.burst); + break; + } +} + +void __init sld_setup(struct cpuinfo_x86 *c) +{ + split_lock_setup(c); + sld_state_setup(); + sld_state_show(); +} diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index c131c412db89..a6c6bccfa8b8 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -178,8 +178,6 @@ struct _cpuid4_info_regs { struct amd_northbridge *nb; }; -static unsigned short num_cache_leaves; - /* AMD doesn't have CPUID4. Emulate it here to report the same information to the user. This makes some assumptions about the machine: L2 not shared, no SMT etc. that is currently true on AMD CPUs. @@ -301,7 +299,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, eax->split.type = types[leaf]; eax->split.level = levels[leaf]; eax->split.num_threads_sharing = 0; - eax->split.num_cores_on_die = __this_cpu_read(cpu_info.x86_max_cores) - 1; + eax->split.num_cores_on_die = topology_num_cores_per_package(); if (assoc == 0xffff) @@ -595,7 +593,7 @@ static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index) if (index < 3) return; - node = topology_die_id(smp_processor_id()); + node = topology_amd_node_id(smp_processor_id()); this_leaf->nb = node_to_amd_nb(node); if (this_leaf->nb && !this_leaf->nb->l3_cache.indices) amd_calc_l3_indices(this_leaf->nb); @@ -661,7 +659,7 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c) return i; } -void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c) +void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, u16 die_id) { /* * We may have multiple LLCs if L3 caches exist, so check if we @@ -672,7 +670,7 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c) if (c->x86 < 0x17) { /* LLC is at the node level. */ - c->topo.llc_id = c->topo.die_id; + c->topo.llc_id = die_id; } else if (c->x86 == 0x17 && c->x86_model <= 0x1F) { /* * LLC is at the core complex level. 
@@ -717,20 +715,23 @@ void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c) void init_amd_cacheinfo(struct cpuinfo_x86 *c) { + struct cpu_cacheinfo *ci = get_cpu_cacheinfo(c->cpu_index); if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { - num_cache_leaves = find_num_cache_leaves(c); + ci->num_leaves = find_num_cache_leaves(c); } else if (c->extended_cpuid_level >= 0x80000006) { if (cpuid_edx(0x80000006) & 0xf000) - num_cache_leaves = 4; + ci->num_leaves = 4; else - num_cache_leaves = 3; + ci->num_leaves = 3; } } void init_hygon_cacheinfo(struct cpuinfo_x86 *c) { - num_cache_leaves = find_num_cache_leaves(c); + struct cpu_cacheinfo *ci = get_cpu_cacheinfo(c->cpu_index); + + ci->num_leaves = find_num_cache_leaves(c); } void init_intel_cacheinfo(struct cpuinfo_x86 *c) @@ -740,21 +741,21 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c) unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */ unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */ unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb; + struct cpu_cacheinfo *ci = get_cpu_cacheinfo(c->cpu_index); if (c->cpuid_level > 3) { - static int is_initialized; - - if (is_initialized == 0) { - /* Init num_cache_leaves from boot CPU */ - num_cache_leaves = find_num_cache_leaves(c); - is_initialized++; - } + /* + * There should be at least one leaf. A non-zero value means + * that the number of leaves has been initialized. + */ + if (!ci->num_leaves) + ci->num_leaves = find_num_cache_leaves(c); /* * Whenever possible use cpuid(4), deterministic cache * parameters cpuid leaf to find the cache details */ - for (i = 0; i < num_cache_leaves; i++) { + for (i = 0; i < ci->num_leaves; i++) { struct _cpuid4_info_regs this_leaf = {}; int retval; @@ -790,14 +791,14 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c) * Don't use cpuid2 if cpuid4 is supported. For P4, we use cpuid2 for * trace cache */ - if ((num_cache_leaves == 0 || c->x86 == 15) && c->cpuid_level > 1) { + if ((!ci->num_leaves || c->x86 == 15) && c->cpuid_level > 1) { /* supports eax=2 call */ int j, n; unsigned int regs[4]; unsigned char *dp = (unsigned char *)regs; int only_trace = 0; - if (num_cache_leaves != 0 && c->x86 == 15) + if (ci->num_leaves && c->x86 == 15) only_trace = 1; /* Number of times to iterate */ @@ -807,7 +808,7 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c) cpuid(2, ®s[0], ®s[1], ®s[2], ®s[3]); /* If bit 31 is set, this is an unknown format */ - for (j = 0 ; j < 3 ; j++) + for (j = 0 ; j < 4 ; j++) if (regs[j] & (1 << 31)) regs[j] = 0; @@ -991,14 +992,12 @@ static void ci_leaf_init(struct cacheinfo *this_leaf, int init_cache_level(unsigned int cpu) { - struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); + struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu); - if (!num_cache_leaves) + /* There should be at least one leaf. 
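/*
 * Illustrative sketch, not part of the patch: the per-CPU ci->num_leaves
 * that replaces the old global num_cache_leaves ultimately feeds the
 * sysfs cache topology.  A user-space analogue simply counts the index*
 * directories exported for a CPU, assuming the usual sysfs layout:
 */
#include <dirent.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	DIR *d = opendir("/sys/devices/system/cpu/cpu0/cache");
	struct dirent *de;
	int leaves = 0;

	if (!d) {
		perror("cpu0/cache");
		return 1;
	}
	while ((de = readdir(d)))
		if (!strncmp(de->d_name, "index", 5))
			leaves++;
	closedir(d);
	printf("cpu0 cache leaves: %d\n", leaves);
	return 0;
}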
*/ + if (!ci->num_leaves) return -ENOENT; - if (!this_cpu_ci) - return -EINVAL; - this_cpu_ci->num_levels = 3; - this_cpu_ci->num_leaves = num_cache_leaves; + return 0; } @@ -1118,15 +1117,16 @@ static void cache_cpu_init(void) unsigned long flags; local_irq_save(flags); - cache_disable(); - if (memory_caching_control & CACHE_MTRR) + if (memory_caching_control & CACHE_MTRR) { + cache_disable(); mtrr_generic_set_state(); + cache_enable(); + } if (memory_caching_control & CACHE_PAT) pat_cpu_init(); - cache_enable(); local_irq_restore(flags); } diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index 345f7d905db6..a3b55db35c96 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -128,10 +128,6 @@ static void init_centaur(struct cpuinfo_x86 *c) #endif early_init_centaur(c); init_intel_cacheinfo(c); - detect_num_cpu_cores(c); -#ifdef CONFIG_X86_32 - detect_ht(c); -#endif if (c->cpuid_level > 9) { unsigned int eax = cpuid_eax(10); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b14fc8c1c953..7cce91b19fb2 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -29,6 +29,7 @@ #include <asm/alternative.h> #include <asm/cmdline.h> +#include <asm/cpuid.h> #include <asm/perf_event.h> #include <asm/mmu_context.h> #include <asm/doublefault.h> @@ -61,19 +62,38 @@ #include <asm/microcode.h> #include <asm/intel-family.h> #include <asm/cpu_device_id.h> +#include <asm/fred.h> #include <asm/uv/uv.h> #include <asm/ia32.h> #include <asm/set_memory.h> #include <asm/traps.h> #include <asm/sev.h> +#include <asm/tdx.h> +#include <asm/posted_intr.h> +#include <asm/runtime-const.h> #include "cpu.h" +DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); +EXPORT_PER_CPU_SYMBOL(cpu_info); + u32 elf_hwcap2 __read_mostly; /* Number of siblings per CPU package */ -int smp_num_siblings = 1; -EXPORT_SYMBOL(smp_num_siblings); +unsigned int __max_threads_per_core __ro_after_init = 1; +EXPORT_SYMBOL(__max_threads_per_core); + +unsigned int __max_dies_per_package __ro_after_init = 1; +EXPORT_SYMBOL(__max_dies_per_package); + +unsigned int __max_logical_packages __ro_after_init = 1; +EXPORT_SYMBOL(__max_logical_packages); + +unsigned int __num_cores_per_package __ro_after_init = 1; +EXPORT_SYMBOL(__num_cores_per_package); + +unsigned int __num_threads_per_package __ro_after_init = 1; +EXPORT_SYMBOL(__num_threads_per_package); static struct ppin_info { int feature; @@ -97,17 +117,17 @@ static const struct x86_cpu_id ppin_cpuids[] = { X86_MATCH_FEATURE(X86_FEATURE_INTEL_PPIN, &ppin_info[X86_VENDOR_INTEL]), /* Legacy models without CPUID enumeration */ - X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X, &ppin_info[X86_VENDOR_INTEL]), - X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, &ppin_info[X86_VENDOR_INTEL]), - X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D, &ppin_info[X86_VENDOR_INTEL]), - X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, &ppin_info[X86_VENDOR_INTEL]), - X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &ppin_info[X86_VENDOR_INTEL]), - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &ppin_info[X86_VENDOR_INTEL]), - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &ppin_info[X86_VENDOR_INTEL]), - X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &ppin_info[X86_VENDOR_INTEL]), - X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &ppin_info[X86_VENDOR_INTEL]), - X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &ppin_info[X86_VENDOR_INTEL]), - X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_VFM(INTEL_IVYBRIDGE_X, &ppin_info[X86_VENDOR_INTEL]), + 
X86_MATCH_VFM(INTEL_HASWELL_X, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_VFM(INTEL_BROADWELL_D, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_VFM(INTEL_BROADWELL_X, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_VFM(INTEL_SKYLAKE_X, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_VFM(INTEL_ICELAKE_X, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_VFM(INTEL_ICELAKE_D, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_VFM(INTEL_XEON_PHI_KNL, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_VFM(INTEL_XEON_PHI_KNM, &ppin_info[X86_VENDOR_INTEL]), {} }; @@ -150,7 +170,7 @@ static void ppin_init(struct cpuinfo_x86 *c) } clear_ppin: - clear_cpu_cap(c, info->feature); + setup_clear_cpu_cap(info->feature); } static void default_init(struct cpuinfo_x86 *c) @@ -188,45 +208,37 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { * TLS descriptors are currently at a different place compared to i386. * Hopefully nobody expects them at a fixed place (Wine?) */ - [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff), - [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff), - [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff), - [GDT_ENTRY_DEFAULT_USER32_CS] = GDT_ENTRY_INIT(0xc0fb, 0, 0xfffff), - [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f3, 0, 0xfffff), - [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xa0fb, 0, 0xfffff), + [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(DESC_CODE32, 0, 0xfffff), + [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(DESC_CODE64, 0, 0xfffff), + [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(DESC_DATA64, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER32_CS] = GDT_ENTRY_INIT(DESC_CODE32 | DESC_USER, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(DESC_DATA64 | DESC_USER, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(DESC_CODE64 | DESC_USER, 0, 0xfffff), #else - [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xc09a, 0, 0xfffff), - [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), - [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xc0fa, 0, 0xfffff), - [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f2, 0, 0xfffff), + [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(DESC_CODE32, 0, 0xfffff), + [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(DESC_DATA32, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(DESC_CODE32 | DESC_USER, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(DESC_DATA32 | DESC_USER, 0, 0xfffff), /* * Segments used for calling PnP BIOS have byte granularity. * They code segments and data segments have fixed 64k limits, * the transfer segment sizes are set at run time. */ - /* 32-bit code */ - [GDT_ENTRY_PNPBIOS_CS32] = GDT_ENTRY_INIT(0x409a, 0, 0xffff), - /* 16-bit code */ - [GDT_ENTRY_PNPBIOS_CS16] = GDT_ENTRY_INIT(0x009a, 0, 0xffff), - /* 16-bit data */ - [GDT_ENTRY_PNPBIOS_DS] = GDT_ENTRY_INIT(0x0092, 0, 0xffff), - /* 16-bit data */ - [GDT_ENTRY_PNPBIOS_TS1] = GDT_ENTRY_INIT(0x0092, 0, 0), - /* 16-bit data */ - [GDT_ENTRY_PNPBIOS_TS2] = GDT_ENTRY_INIT(0x0092, 0, 0), + [GDT_ENTRY_PNPBIOS_CS32] = GDT_ENTRY_INIT(DESC_CODE32_BIOS, 0, 0xffff), + [GDT_ENTRY_PNPBIOS_CS16] = GDT_ENTRY_INIT(DESC_CODE16, 0, 0xffff), + [GDT_ENTRY_PNPBIOS_DS] = GDT_ENTRY_INIT(DESC_DATA16, 0, 0xffff), + [GDT_ENTRY_PNPBIOS_TS1] = GDT_ENTRY_INIT(DESC_DATA16, 0, 0), + [GDT_ENTRY_PNPBIOS_TS2] = GDT_ENTRY_INIT(DESC_DATA16, 0, 0), /* * The APM segments have byte granularity and their bases * are set at run time. All have 64k limits. 
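/*
 * Illustrative sketch, not part of the patch: the GDT_ENTRY_INIT()
 * conversions above replace raw access/flag words such as 0xc09b with
 * named DESC_* constants.  Assuming the usual x86 descriptor layout
 * (access byte in bits 0-7, flags nibble in bits 12-15), the old
 * kernel 32-bit code segment value decodes as below.
 */
#include <stdio.h>

int main(void)
{
	unsigned int v = 0xc09b;	/* old GDT_ENTRY_KERNEL32_CS flags */

	/* type=0xb (exec/read, accessed), S=1, DPL=0, P=1, L=0, D/B=1, G=1 */
	printf("type=%#x s=%u dpl=%u p=%u avl=%u l=%u db=%u g=%u\n",
	       v & 0xf, (v >> 4) & 1, (v >> 5) & 3, (v >> 7) & 1,
	       (v >> 12) & 1, (v >> 13) & 1, (v >> 14) & 1, (v >> 15) & 1);
	return 0;
}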
*/ - /* 32-bit code */ - [GDT_ENTRY_APMBIOS_BASE] = GDT_ENTRY_INIT(0x409a, 0, 0xffff), - /* 16-bit code */ - [GDT_ENTRY_APMBIOS_BASE+1] = GDT_ENTRY_INIT(0x009a, 0, 0xffff), - /* data */ - [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(0x4092, 0, 0xffff), - - [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), - [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), + [GDT_ENTRY_APMBIOS_BASE] = GDT_ENTRY_INIT(DESC_CODE32_BIOS, 0, 0xffff), + [GDT_ENTRY_APMBIOS_BASE+1] = GDT_ENTRY_INIT(DESC_CODE16, 0, 0xffff), + [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(DESC_DATA32_BIOS, 0, 0xffff), + + [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(DESC_DATA32, 0, 0xfffff), + [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(DESC_DATA32, 0, 0xfffff), #endif } }; EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); @@ -265,21 +277,13 @@ static int __init x86_noinvpcid_setup(char *s) } early_param("noinvpcid", x86_noinvpcid_setup); -#ifdef CONFIG_X86_32 -static int cachesize_override = -1; -static int disable_x86_serial_nr = 1; - -static int __init cachesize_setup(char *str) -{ - get_option(&str, &cachesize_override); - return 1; -} -__setup("cachesize=", cachesize_setup); - /* Standard macro to see if a specific flag is changeable */ -static inline int flag_is_changeable_p(u32 flag) +static inline bool flag_is_changeable_p(unsigned long flag) { - u32 f1, f2; + unsigned long f1, f2; + + if (!IS_ENABLED(CONFIG_X86_32)) + return true; /* * Cyrix and IDT cpus allow disabling of CPUID @@ -302,11 +306,22 @@ static inline int flag_is_changeable_p(u32 flag) : "=&r" (f1), "=&r" (f2) : "ir" (flag)); - return ((f1^f2) & flag) != 0; + return (f1 ^ f2) & flag; } +#ifdef CONFIG_X86_32 +static int cachesize_override = -1; +static int disable_x86_serial_nr = 1; + +static int __init cachesize_setup(char *str) +{ + get_option(&str, &cachesize_override); + return 1; +} +__setup("cachesize=", cachesize_setup); + /* Probe for the CPUID instruction */ -int have_cpuid_p(void) +bool have_cpuid_p(void) { return flag_is_changeable_p(X86_EFLAGS_ID); } @@ -338,10 +353,6 @@ static int __init x86_serial_nr_setup(char *s) } __setup("serialnumber", x86_serial_nr_setup); #else -static inline int flag_is_changeable_p(u32 flag) -{ - return 1; -} static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) { } @@ -389,9 +400,8 @@ out: } /* These bits should not change their value after CPU init is finished. 
*/ -static const unsigned long cr4_pinned_mask = - X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP | - X86_CR4_FSGSBASE | X86_CR4_CET; +static const unsigned long cr4_pinned_mask = X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP | + X86_CR4_FSGSBASE | X86_CR4_CET | X86_CR4_FRED; static DEFINE_STATIC_KEY_FALSE_RO(cr_pinning); static unsigned long cr4_pinned_bits __ro_after_init; @@ -627,9 +637,9 @@ struct cpuid_dependent_feature { static const struct cpuid_dependent_feature cpuid_dependent_features[] = { - { X86_FEATURE_MWAIT, 0x00000005 }, - { X86_FEATURE_DCA, 0x00000009 }, - { X86_FEATURE_XSAVE, 0x0000000d }, + { X86_FEATURE_MWAIT, CPUID_LEAF_MWAIT }, + { X86_FEATURE_DCA, CPUID_LEAF_DCA }, + { X86_FEATURE_XSAVE, CPUID_LEAF_XSTATE }, { 0, 0 } }; @@ -797,19 +807,6 @@ static void get_model_name(struct cpuinfo_x86 *c) *(s + 1) = '\0'; } -void detect_num_cpu_cores(struct cpuinfo_x86 *c) -{ - unsigned int eax, ebx, ecx, edx; - - c->x86_max_cores = 1; - if (!IS_ENABLED(CONFIG_SMP) || c->cpuid_level < 4) - return; - - cpuid_count(4, 0, &eax, &ebx, &ecx, &edx); - if (eax & 0x1f) - c->x86_max_cores = (eax >> 26) + 1; -} - void cpu_detect_cache_sizes(struct cpuinfo_x86 *c) { unsigned int n, dummy, ebx, ecx, edx, l2size; @@ -871,52 +868,7 @@ static void cpu_detect_tlb(struct cpuinfo_x86 *c) tlb_lld_4m[ENTRIES], tlb_lld_1g[ENTRIES]); } -int detect_ht_early(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_SMP - u32 eax, ebx, ecx, edx; - - if (!cpu_has(c, X86_FEATURE_HT)) - return -1; - - if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) - return -1; - - if (cpu_has(c, X86_FEATURE_XTOPOLOGY)) - return -1; - - cpuid(1, &eax, &ebx, &ecx, &edx); - - smp_num_siblings = (ebx & 0xff0000) >> 16; - if (smp_num_siblings == 1) - pr_info_once("CPU0: Hyper-Threading is disabled\n"); -#endif - return 0; -} - -void detect_ht(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_SMP - int index_msb, core_bits; - - if (detect_ht_early(c) < 0) - return; - - index_msb = get_count_order(smp_num_siblings); - c->topo.pkg_id = apic->phys_pkg_id(c->topo.initial_apicid, index_msb); - - smp_num_siblings = smp_num_siblings / c->x86_max_cores; - - index_msb = get_count_order(smp_num_siblings); - - core_bits = get_count_order(c->x86_max_cores); - - c->topo.core_id = apic->phys_pkg_id(c->topo.initial_apicid, index_msb) & - ((1 << core_bits) - 1); -#endif -} - -static void get_cpu_vendor(struct cpuinfo_x86 *c) +void get_cpu_vendor(struct cpuinfo_x86 *c) { char *v = c->x86_vendor_id; int i; @@ -1103,18 +1055,9 @@ void get_cpu_cap(struct cpuinfo_x86 *c) void get_cpu_address_sizes(struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; - bool vp_bits_from_cpuid = true; if (!cpu_has(c, X86_FEATURE_CPUID) || - (c->extended_cpuid_level < 0x80000008)) - vp_bits_from_cpuid = false; - - if (vp_bits_from_cpuid) { - cpuid(0x80000008, &eax, &ebx, &ecx, &edx); - - c->x86_virt_bits = (eax >> 8) & 0xff; - c->x86_phys_bits = eax & 0xff; - } else { + (c->extended_cpuid_level < 0x80000008)) { if (IS_ENABLED(CONFIG_X86_64)) { c->x86_clflush_size = 64; c->x86_phys_bits = 36; @@ -1128,14 +1071,23 @@ void get_cpu_address_sizes(struct cpuinfo_x86 *c) cpu_has(c, X86_FEATURE_PSE36)) c->x86_phys_bits = 36; } + } else { + cpuid(0x80000008, &eax, &ebx, &ecx, &edx); + + c->x86_virt_bits = (eax >> 8) & 0xff; + c->x86_phys_bits = eax & 0xff; + + /* Provide a sane default if not enumerated: */ + if (!c->x86_clflush_size) + c->x86_clflush_size = 32; } + c->x86_cache_bits = c->x86_phys_bits; c->x86_cache_alignment = c->x86_clflush_size; } static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) { -#ifdef 
CONFIG_X86_32 int i; /* @@ -1156,7 +1108,6 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) break; } } -#endif } #define NO_SPECULATION BIT(0) @@ -1170,12 +1121,13 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) #define NO_SPECTRE_V2 BIT(8) #define NO_MMIO BIT(9) #define NO_EIBRS_PBRSB BIT(10) +#define NO_BHI BIT(11) #define VULNWL(vendor, family, model, whitelist) \ X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, whitelist) -#define VULNWL_INTEL(model, whitelist) \ - VULNWL(INTEL, 6, INTEL_FAM6_##model, whitelist) +#define VULNWL_INTEL(vfm, whitelist) \ + X86_MATCH_VFM(vfm, whitelist) #define VULNWL_AMD(family, whitelist) \ VULNWL(AMD, family, X86_MODEL_ANY, whitelist) @@ -1192,32 +1144,32 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL(VORTEX, 6, X86_MODEL_ANY, NO_SPECULATION), /* Intel Family 6 */ - VULNWL_INTEL(TIGERLAKE, NO_MMIO), - VULNWL_INTEL(TIGERLAKE_L, NO_MMIO), - VULNWL_INTEL(ALDERLAKE, NO_MMIO), - VULNWL_INTEL(ALDERLAKE_L, NO_MMIO), + VULNWL_INTEL(INTEL_TIGERLAKE, NO_MMIO), + VULNWL_INTEL(INTEL_TIGERLAKE_L, NO_MMIO), + VULNWL_INTEL(INTEL_ALDERLAKE, NO_MMIO), + VULNWL_INTEL(INTEL_ALDERLAKE_L, NO_MMIO), - VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(INTEL_ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(INTEL_ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(INTEL_ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(INTEL_ATOM_BONNELL, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(INTEL_ATOM_BONNELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_SILVERMONT_D, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(INTEL_ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(INTEL_ATOM_SILVERMONT_D, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(INTEL_ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(INTEL_ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(INTEL_XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(INTEL_XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(CORE_YONAH, NO_SSB), + VULNWL_INTEL(INTEL_CORE_YONAH, NO_SSB), - VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_AIRMONT_NP, NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(INTEL_ATOM_AIRMONT_MID, NO_SSB | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | MSBDS_ONLY), + VULNWL_INTEL(INTEL_ATOM_AIRMONT_NP, NO_SSB | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | 
NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), - VULNWL_INTEL(ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), - VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB), + VULNWL_INTEL(INTEL_ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_INTEL(INTEL_ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_INTEL(INTEL_ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB), /* * Technically, swapgs isn't serializing on AMD (despite it previously @@ -1227,33 +1179,31 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { * good enough for our purposes. */ - VULNWL_INTEL(ATOM_TREMONT, NO_EIBRS_PBRSB), - VULNWL_INTEL(ATOM_TREMONT_L, NO_EIBRS_PBRSB), - VULNWL_INTEL(ATOM_TREMONT_D, NO_ITLB_MULTIHIT | NO_EIBRS_PBRSB), + VULNWL_INTEL(INTEL_ATOM_TREMONT, NO_EIBRS_PBRSB), + VULNWL_INTEL(INTEL_ATOM_TREMONT_L, NO_EIBRS_PBRSB), + VULNWL_INTEL(INTEL_ATOM_TREMONT_D, NO_ITLB_MULTIHIT | NO_EIBRS_PBRSB), /* AMD Family 0xf - 0x12 */ - VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), - VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), - VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), - VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI), + VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI), + VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI), + VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI), /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */ - VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB), - VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB), + VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB | NO_BHI), + VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB | NO_BHI), /* Zhaoxin Family 7 */ - VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO), - VULNWL(ZHAOXIN, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO), + VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO | NO_BHI), + VULNWL(ZHAOXIN, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO | NO_BHI), {} }; #define VULNBL(vendor, family, model, blacklist) \ X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, blacklist) -#define VULNBL_INTEL_STEPPINGS(model, steppings, issues) \ - X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, \ - INTEL_FAM6_##model, steppings, \ - X86_FEATURE_ANY, issues) +#define VULNBL_INTEL_STEPS(vfm, max_stepping, issues) \ + X86_MATCH_VFM_STEPS(vfm, X86_STEP_MIN, max_stepping, issues) #define VULNBL_AMD(family, blacklist) \ VULNBL(AMD, family, X86_MODEL_ANY, blacklist) @@ -1274,42 +1224,54 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { #define SRSO BIT(5) /* CPU is affected by GDS */ #define 
GDS BIT(6) +/* CPU is affected by Register File Data Sampling */ +#define RFDS BIT(7) static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { - VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(HASWELL, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(HASWELL_L, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(HASWELL_G, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(HASWELL_X, X86_STEPPING_ANY, MMIO), - VULNBL_INTEL_STEPPINGS(BROADWELL_D, X86_STEPPING_ANY, MMIO), - VULNBL_INTEL_STEPPINGS(BROADWELL_G, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(BROADWELL_X, X86_STEPPING_ANY, MMIO), - VULNBL_INTEL_STEPPINGS(BROADWELL, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED | GDS), - VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPPINGS(CANNONLAKE_L, X86_STEPPING_ANY, RETBLEED), - VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS), - VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPING_ANY, MMIO | GDS), - VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPING_ANY, MMIO | GDS), - VULNBL_INTEL_STEPPINGS(COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS), - VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO | RETBLEED), - VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS), - VULNBL_INTEL_STEPPINGS(TIGERLAKE_L, X86_STEPPING_ANY, GDS), - VULNBL_INTEL_STEPPINGS(TIGERLAKE, X86_STEPPING_ANY, GDS), - VULNBL_INTEL_STEPPINGS(LAKEFIELD, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), - VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS), - VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS), - VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO), - VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS), + VULNBL_INTEL_STEPS(INTEL_IVYBRIDGE, X86_STEP_MAX, SRBDS), + VULNBL_INTEL_STEPS(INTEL_HASWELL, X86_STEP_MAX, SRBDS), + VULNBL_INTEL_STEPS(INTEL_HASWELL_L, X86_STEP_MAX, SRBDS), + VULNBL_INTEL_STEPS(INTEL_HASWELL_G, X86_STEP_MAX, SRBDS), + VULNBL_INTEL_STEPS(INTEL_HASWELL_X, X86_STEP_MAX, MMIO), + VULNBL_INTEL_STEPS(INTEL_BROADWELL_D, X86_STEP_MAX, MMIO), + VULNBL_INTEL_STEPS(INTEL_BROADWELL_G, X86_STEP_MAX, SRBDS), + VULNBL_INTEL_STEPS(INTEL_BROADWELL_X, X86_STEP_MAX, MMIO), + VULNBL_INTEL_STEPS(INTEL_BROADWELL, X86_STEP_MAX, SRBDS), + VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, X86_STEP_MAX, MMIO | RETBLEED | GDS), + VULNBL_INTEL_STEPS(INTEL_SKYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPS(INTEL_SKYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPS(INTEL_KABYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPS(INTEL_CANNONLAKE_L, X86_STEP_MAX, RETBLEED), + VULNBL_INTEL_STEPS(INTEL_ICELAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS), + VULNBL_INTEL_STEPS(INTEL_ICELAKE_D, X86_STEP_MAX, MMIO | GDS), + VULNBL_INTEL_STEPS(INTEL_ICELAKE_X, X86_STEP_MAX, MMIO | GDS), + VULNBL_INTEL_STEPS(INTEL_COMETLAKE, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS), + 
VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, 0x0, MMIO | RETBLEED), + VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS), + VULNBL_INTEL_STEPS(INTEL_TIGERLAKE_L, X86_STEP_MAX, GDS), + VULNBL_INTEL_STEPS(INTEL_TIGERLAKE, X86_STEP_MAX, GDS), + VULNBL_INTEL_STEPS(INTEL_LAKEFIELD, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED), + VULNBL_INTEL_STEPS(INTEL_ROCKETLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS), + VULNBL_INTEL_STEPS(INTEL_ALDERLAKE, X86_STEP_MAX, RFDS), + VULNBL_INTEL_STEPS(INTEL_ALDERLAKE_L, X86_STEP_MAX, RFDS), + VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE, X86_STEP_MAX, RFDS), + VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_P, X86_STEP_MAX, RFDS), + VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_S, X86_STEP_MAX, RFDS), + VULNBL_INTEL_STEPS(INTEL_ATOM_GRACEMONT, X86_STEP_MAX, RFDS), + VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT, X86_STEP_MAX, MMIO | MMIO_SBDS | RFDS), + VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT_D, X86_STEP_MAX, MMIO | RFDS), + VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RFDS), + VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT, X86_STEP_MAX, RFDS), + VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT_D, X86_STEP_MAX, RFDS), + VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT_PLUS, X86_STEP_MAX, RFDS), VULNBL_AMD(0x15, RETBLEED), VULNBL_AMD(0x16, RETBLEED), VULNBL_AMD(0x17, RETBLEED | SMT_RSB | SRSO), VULNBL_HYGON(0x18, RETBLEED | SMT_RSB | SRSO), VULNBL_AMD(0x19, SRSO), + VULNBL_AMD(0x1a, SRSO), {} }; @@ -1322,28 +1284,46 @@ static bool __init cpu_matches(const struct x86_cpu_id *table, unsigned long whi u64 x86_read_arch_cap_msr(void) { - u64 ia32_cap = 0; + u64 x86_arch_cap_msr = 0; if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, x86_arch_cap_msr); - return ia32_cap; + return x86_arch_cap_msr; } -static bool arch_cap_mmio_immune(u64 ia32_cap) +static bool arch_cap_mmio_immune(u64 x86_arch_cap_msr) { - return (ia32_cap & ARCH_CAP_FBSDP_NO && - ia32_cap & ARCH_CAP_PSDP_NO && - ia32_cap & ARCH_CAP_SBDR_SSDP_NO); + return (x86_arch_cap_msr & ARCH_CAP_FBSDP_NO && + x86_arch_cap_msr & ARCH_CAP_PSDP_NO && + x86_arch_cap_msr & ARCH_CAP_SBDR_SSDP_NO); +} + +static bool __init vulnerable_to_rfds(u64 x86_arch_cap_msr) +{ + /* The "immunity" bit trumps everything else: */ + if (x86_arch_cap_msr & ARCH_CAP_RFDS_NO) + return false; + + /* + * VMMs set ARCH_CAP_RFDS_CLEAR for processors not in the blacklist to + * indicate that mitigation is needed because guest is running on a + * vulnerable hardware or may migrate to such hardware: + */ + if (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR) + return true; + + /* Only consult the blacklist when there is no enumeration: */ + return cpu_matches(cpu_vuln_blacklist, RFDS); } static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) { - u64 ia32_cap = x86_read_arch_cap_msr(); + u64 x86_arch_cap_msr = x86_read_arch_cap_msr(); /* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */ if (!cpu_matches(cpu_vuln_whitelist, NO_ITLB_MULTIHIT) && - !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO)) + !(x86_arch_cap_msr & ARCH_CAP_PSCHANGE_MC_NO)) setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT); if (cpu_matches(cpu_vuln_whitelist, NO_SPECULATION)) @@ -1355,23 +1335,28 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) setup_force_cpu_bug(X86_BUG_SPECTRE_V2); if (!cpu_matches(cpu_vuln_whitelist, NO_SSB) && - !(ia32_cap & ARCH_CAP_SSB_NO) && + !(x86_arch_cap_msr & ARCH_CAP_SSB_NO) && !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); /* * 
AMD's AutoIBRS is equivalent to Intel's eIBRS - use the Intel feature * flag and protect from vendor-specific bugs via the whitelist. + * + * Don't use AutoIBRS when SNP is enabled because it degrades host + * userspace indirect branch performance. */ - if ((ia32_cap & ARCH_CAP_IBRS_ALL) || cpu_has(c, X86_FEATURE_AUTOIBRS)) { + if ((x86_arch_cap_msr & ARCH_CAP_IBRS_ALL) || + (cpu_has(c, X86_FEATURE_AUTOIBRS) && + !cpu_feature_enabled(X86_FEATURE_SEV_SNP))) { setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED); if (!cpu_matches(cpu_vuln_whitelist, NO_EIBRS_PBRSB) && - !(ia32_cap & ARCH_CAP_PBRSB_NO)) + !(x86_arch_cap_msr & ARCH_CAP_PBRSB_NO)) setup_force_cpu_bug(X86_BUG_EIBRS_PBRSB); } if (!cpu_matches(cpu_vuln_whitelist, NO_MDS) && - !(ia32_cap & ARCH_CAP_MDS_NO)) { + !(x86_arch_cap_msr & ARCH_CAP_MDS_NO)) { setup_force_cpu_bug(X86_BUG_MDS); if (cpu_matches(cpu_vuln_whitelist, MSBDS_ONLY)) setup_force_cpu_bug(X86_BUG_MSBDS_ONLY); @@ -1390,9 +1375,9 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) * TSX_CTRL check alone is not sufficient for cases when the microcode * update is not present or running as guest that don't get TSX_CTRL. */ - if (!(ia32_cap & ARCH_CAP_TAA_NO) && + if (!(x86_arch_cap_msr & ARCH_CAP_TAA_NO) && (cpu_has(c, X86_FEATURE_RTM) || - (ia32_cap & ARCH_CAP_TSX_CTRL_MSR))) + (x86_arch_cap_msr & ARCH_CAP_TSX_CTRL_MSR))) setup_force_cpu_bug(X86_BUG_TAA); /* @@ -1418,7 +1403,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) * Set X86_BUG_MMIO_UNKNOWN for CPUs that are neither in the blacklist, * nor in the whitelist and also don't enumerate MSR ARCH_CAP MMIO bits. */ - if (!arch_cap_mmio_immune(ia32_cap)) { + if (!arch_cap_mmio_immune(x86_arch_cap_msr)) { if (cpu_matches(cpu_vuln_blacklist, MMIO)) setup_force_cpu_bug(X86_BUG_MMIO_STALE_DATA); else if (!cpu_matches(cpu_vuln_whitelist, NO_MMIO)) @@ -1426,7 +1411,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) } if (!cpu_has(c, X86_FEATURE_BTC_NO)) { - if (cpu_matches(cpu_vuln_blacklist, RETBLEED) || (ia32_cap & ARCH_CAP_RSBA)) + if (cpu_matches(cpu_vuln_blacklist, RETBLEED) || (x86_arch_cap_msr & ARCH_CAP_RSBA)) setup_force_cpu_bug(X86_BUG_RETBLEED); } @@ -1444,15 +1429,28 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) * disabling AVX2. The only way to do this in HW is to clear XCR0[2], * which means that AVX will be disabled. */ - if (cpu_matches(cpu_vuln_blacklist, GDS) && !(ia32_cap & ARCH_CAP_GDS_NO) && + if (cpu_matches(cpu_vuln_blacklist, GDS) && !(x86_arch_cap_msr & ARCH_CAP_GDS_NO) && boot_cpu_has(X86_FEATURE_AVX)) setup_force_cpu_bug(X86_BUG_GDS); + if (vulnerable_to_rfds(x86_arch_cap_msr)) + setup_force_cpu_bug(X86_BUG_RFDS); + + /* When virtualized, eIBRS could be hidden, assume vulnerable */ + if (!(x86_arch_cap_msr & ARCH_CAP_BHI_NO) && + !cpu_matches(cpu_vuln_whitelist, NO_BHI) && + (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED) || + boot_cpu_has(X86_FEATURE_HYPERVISOR))) + setup_force_cpu_bug(X86_BUG_BHI); + + if (cpu_has(c, X86_FEATURE_AMD_IBPB) && !cpu_has(c, X86_FEATURE_AMD_IBPB_RET)) + setup_force_cpu_bug(X86_BUG_IBPB_NO_RET); + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; /* Rogue Data Cache Load? No! 
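As an illustrative aside (not part of the patch): the RFDS handling above applies a strict precedence. An ARCH_CAP_RFDS_NO enumeration wins outright, ARCH_CAP_RFDS_CLEAR (typically set by a VMM) forces the bug bit, and the model blacklist is consulted only when neither bit is enumerated. A minimal userspace sketch of that precedence follows; CAP_RFDS_NO/CAP_RFDS_CLEAR are stand-in names with placeholder bit positions, the real masks live in asm/msr-index.h.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Placeholder bit positions for illustration; real masks are in asm/msr-index.h. */
#define CAP_RFDS_NO    (1ULL << 27)
#define CAP_RFDS_CLEAR (1ULL << 28)

/* Same precedence as vulnerable_to_rfds() in the hunk above. */
static bool rfds_vulnerable(uint64_t arch_cap, bool model_blacklisted)
{
	if (arch_cap & CAP_RFDS_NO)	/* hardware says "not affected" */
		return false;
	if (arch_cap & CAP_RFDS_CLEAR)	/* VMM asks for the mitigation */
		return true;
	return model_blacklisted;	/* otherwise consult the model table */
}

int main(void)
{
	printf("%d %d %d\n",
	       rfds_vulnerable(CAP_RFDS_NO | CAP_RFDS_CLEAR, true),	/* 0 */
	       rfds_vulnerable(CAP_RFDS_CLEAR, false),			/* 1 */
	       rfds_vulnerable(0, true));				/* 1 */
	return 0;
}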
*/ - if (ia32_cap & ARCH_CAP_RDCL_NO) + if (x86_arch_cap_msr & ARCH_CAP_RDCL_NO) return; setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); @@ -1515,6 +1513,11 @@ static void __init cpu_parse_early_param(void) if (cmdline_find_option_bool(boot_command_line, "nousershstk")) setup_clear_cpu_cap(X86_FEATURE_USER_SHSTK); + /* Minimize the gap between FRED is available and available but disabled. */ + arglen = cmdline_find_option(boot_command_line, "fred", arg, sizeof(arg)); + if (arglen != 2 || strncmp(arg, "on", 2)) + setup_clear_cpu_cap(X86_FEATURE_FRED); + arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg)); if (arglen <= 0) return; @@ -1594,10 +1597,14 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) if (have_cpuid_p()) { cpu_detect(c); get_cpu_vendor(c); + intel_unlock_cpuid_leafs(c); get_cpu_cap(c); setup_force_cpu_cap(X86_FEATURE_CPUID); + get_cpu_address_sizes(c); cpu_parse_early_param(); + cpu_init_topology(c); + if (this_cpu->c_early_init) this_cpu->c_early_init(c); @@ -1608,10 +1615,10 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) this_cpu->c_bsp_init(c); } else { setup_clear_cpu_cap(X86_FEATURE_CPUID); + get_cpu_address_sizes(c); + cpu_init_topology(c); } - get_cpu_address_sizes(c); - setup_force_cpu_cap(X86_FEATURE_ALWAYS); cpu_set_bug_bits(c); @@ -1644,15 +1651,11 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) detect_nopl(); } -void __init early_cpu_init(void) +void __init init_cpu_devs(void) { const struct cpu_dev *const *cdev; int count = 0; -#ifdef CONFIG_PROCESSOR_SELECT - pr_info("KERNEL supported cpus:\n"); -#endif - for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) { const struct cpu_dev *cpudev = *cdev; @@ -1660,20 +1663,30 @@ void __init early_cpu_init(void) break; cpu_devs[count] = cpudev; count++; + } +} +void __init early_cpu_init(void) +{ #ifdef CONFIG_PROCESSOR_SELECT - { - unsigned int j; - - for (j = 0; j < 2; j++) { - if (!cpudev->c_ident[j]) - continue; - pr_info(" %s %s\n", cpudev->c_vendor, - cpudev->c_ident[j]); - } - } + unsigned int i, j; + + pr_info("KERNEL supported cpus:\n"); #endif + + init_cpu_devs(); + +#ifdef CONFIG_PROCESSOR_SELECT + for (i = 0; i < X86_VENDOR_NUM && cpu_devs[i]; i++) { + for (j = 0; j < 2; j++) { + if (!cpu_devs[i]->c_ident[j]) + continue; + pr_info(" %s %s\n", cpu_devs[i]->c_vendor, + cpu_devs[i]->c_ident[j]); + } } +#endif + early_identify_cpu(&boot_cpu_data); } @@ -1750,23 +1763,11 @@ static void generic_identify(struct cpuinfo_x86 *c) cpu_detect(c); get_cpu_vendor(c); - + intel_unlock_cpuid_leafs(c); get_cpu_cap(c); get_cpu_address_sizes(c); - if (c->cpuid_level >= 0x00000001) { - c->topo.initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF; -#ifdef CONFIG_X86_32 -# ifdef CONFIG_SMP - c->topo.apicid = apic->phys_pkg_id(c->topo.initial_apicid, 0); -# else - c->topo.apicid = c->topo.initial_apicid; -# endif -#endif - c->topo.pkg_id = c->topo.initial_apicid; - } - get_model_name(c); /* Default name */ /* @@ -1788,29 +1789,6 @@ static void generic_identify(struct cpuinfo_x86 *c) } /* - * Validate that ACPI/mptables have the same information about the - * effective APIC id and update the package map. - */ -static void validate_apic_and_package_id(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_SMP - unsigned int cpu = smp_processor_id(); - u32 apicid; - - apicid = apic->cpu_present_to_apicid(cpu); - - if (apicid != c->topo.apicid) { - pr_err(FW_BUG "CPU%u: APIC id mismatch. 
Firmware: %x APIC: %x\n", - cpu, apicid, c->topo.initial_apicid); - } - BUG_ON(topology_update_package_map(c->topo.pkg_id, cpu)); - BUG_ON(topology_update_die_map(c->topo.die_id, cpu)); -#else - c->topo.logical_pkg_id = 0; -#endif -} - -/* * This does the hard work of actually picking apart the CPU stuff... */ static void identify_cpu(struct cpuinfo_x86 *c) @@ -1823,11 +1801,6 @@ static void identify_cpu(struct cpuinfo_x86 *c) c->x86_model = c->x86_stepping = 0; /* So far unknown... */ c->x86_vendor_id[0] = '\0'; /* Unset */ c->x86_model_id[0] = '\0'; /* Unset */ - c->x86_max_cores = 1; - c->x86_coreid_bits = 0; - c->topo.cu_id = 0xff; - c->topo.llc_id = BAD_APICID; - c->topo.l2c_id = BAD_APICID; #ifdef CONFIG_X86_64 c->x86_clflush_size = 64; c->x86_phys_bits = 36; @@ -1846,15 +1819,19 @@ static void identify_cpu(struct cpuinfo_x86 *c) generic_identify(c); + cpu_parse_topology(c); + if (this_cpu->c_identify) this_cpu->c_identify(c); /* Clear/Set all flags overridden by options, after probe */ apply_forced_caps(c); -#ifdef CONFIG_X86_64 - c->topo.apicid = apic->phys_pkg_id(c->topo.initial_apicid, 0); -#endif + /* + * Set default APIC and TSC_DEADLINE MSR fencing flag. AMD and + * Hygon will clear it in ->c_init() below. + */ + set_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE); /* * Vendor-specific initialization. In this section we @@ -1869,6 +1846,8 @@ static void identify_cpu(struct cpuinfo_x86 *c) if (this_cpu->c_init) this_cpu->c_init(c); + bus_lock_init(); + /* Disable the PN if appropriate */ squash_the_stupid_serial_number(c); @@ -1903,10 +1882,6 @@ static void identify_cpu(struct cpuinfo_x86 *c) c->x86, c->x86_model); } -#ifdef CONFIG_X86_64 - detect_ht(c); -#endif - x86_init_rdrand(c); setup_pku(c); setup_cet(c); @@ -1938,11 +1913,7 @@ static void identify_cpu(struct cpuinfo_x86 *c) /* Init Machine Check Exception if available. */ mcheck_cpu_init(c); - select_idle_routine(c); - -#ifdef CONFIG_NUMA numa_add_cpu(smp_processor_id()); -#endif } /* @@ -1987,6 +1958,7 @@ static __init void identify_boot_cpu(void) setup_cr_pinning(); tsx_init(); + tdx_init(); lkgs_init(); } @@ -1997,7 +1969,6 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_32 enable_sep_cpu(); #endif - validate_apic_and_package_id(c); x86_spec_ctrl_setup_ap(); update_srbds_msr(); if (boot_cpu_has_bug(X86_BUG_GDS)) @@ -2049,6 +2020,7 @@ DEFINE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot) = { .top_of_stack = TOP_OF_INIT_STACK, }; EXPORT_PER_CPU_SYMBOL(pcpu_hot); +EXPORT_PER_CPU_SYMBOL(const_pcpu_hot); #ifdef CONFIG_X86_64 DEFINE_PER_CPU_FIRST(struct fixed_percpu_data, @@ -2066,10 +2038,8 @@ static void wrmsrl_cstar(unsigned long val) wrmsrl(MSR_CSTAR, val); } -/* May not be marked __init: used by software suspend */ -void syscall_init(void) +static inline void idt_syscall_init(void) { - wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); if (ia32_enabled()) { @@ -2103,12 +2073,31 @@ void syscall_init(void) X86_EFLAGS_AC|X86_EFLAGS_ID); } +/* May not be marked __init: used by software suspend */ +void syscall_init(void) +{ + /* The default user and kernel segments */ + wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); + + /* + * Except the IA32_STAR MSR, there is NO need to setup SYSCALL and + * SYSENTER MSRs for FRED, because FRED uses the ring 3 FRED + * entrypoint for SYSCALL and SYSENTER, and ERETU is the only legit + * instruction to return to ring 3 (both sysexit and sysret cause + * #UD when FRED is enabled). 
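For reference, the wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS) call in syscall_init() above packs the kernel and 32-bit user code-segment selectors into the upper half of IA32_STAR, the one SYSCALL-era MSR still programmed when FRED is enabled. A small sketch of that packing, not taken from the patch; the KERNEL_CS/USER32_CS values are illustrative of Linux's GDT layout.

#include <stdint.h>
#include <stdio.h>

/* Illustrative selectors; the kernel's __KERNEL_CS/__USER32_CS live in asm/segment.h. */
#define KERNEL_CS 0x10u
#define USER32_CS 0x23u

int main(void)
{
	/* wrmsr(msr, lo, hi) writes hi:lo, so the call above programs: */
	uint32_t lo = 0;
	uint32_t hi = (USER32_CS << 16) | KERNEL_CS;
	uint64_t star = ((uint64_t)hi << 32) | lo;

	printf("IA32_STAR = %#018llx (SYSCALL CS base %#x, SYSRET CS base %#x)\n",
	       (unsigned long long)star,
	       (unsigned)((star >> 32) & 0xffff),
	       (unsigned)((star >> 48) & 0xffff));
	return 0;
}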
+ */ + if (!cpu_feature_enabled(X86_FEATURE_FRED)) + idt_syscall_init(); +} + #else /* CONFIG_X86_64 */ #ifdef CONFIG_STACKPROTECTOR DEFINE_PER_CPU(unsigned long, __stack_chk_guard); +#ifndef CONFIG_SMP EXPORT_PER_CPU_SYMBOL(__stack_chk_guard); #endif +#endif #endif /* CONFIG_X86_64 */ @@ -2198,7 +2187,7 @@ static inline void tss_setup_io_bitmap(struct tss_struct *tss) * Setup everything needed to handle exceptions from the IDT, including the IST * exceptions which use paranoid_entry(). */ -void cpu_init_exception_handling(void) +void cpu_init_exception_handling(bool boot_cpu) { struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw); int cpu = raw_smp_processor_id(); @@ -2206,8 +2195,9 @@ void cpu_init_exception_handling(void) /* paranoid_entry() gets the CPU number from the GDT */ setup_getcpu(cpu); - /* IST vectors need TSS to be set up. */ - tss_setup_ist(tss); + /* For IDT mode, IST vectors need to be set in TSS. */ + if (!cpu_feature_enabled(X86_FEATURE_FRED)) + tss_setup_ist(tss); tss_setup_io_bitmap(tss); set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); @@ -2216,8 +2206,23 @@ void cpu_init_exception_handling(void) /* GHCB needs to be setup to handle #VC. */ setup_ghcb(); - /* Finally load the IDT */ - load_current_idt(); + if (cpu_feature_enabled(X86_FEATURE_FRED)) { + /* The boot CPU has enabled FRED during early boot */ + if (!boot_cpu) + cpu_init_fred_exceptions(); + + cpu_init_fred_rsps(); + } else { + load_current_idt(); + } +} + +void __init cpu_init_replace_early_idt(void) +{ + if (cpu_feature_enabled(X86_FEATURE_FRED)) + cpu_init_fred_exceptions(); + else + idt_setup_early_pf(); } /* @@ -2252,6 +2257,8 @@ void cpu_init(void) barrier(); x2apic_setup(); + + intel_posted_msi_init(); } mmgrab(&init_mm); @@ -2340,13 +2347,17 @@ void arch_smt_update(void) void __init arch_cpu_finalize_init(void) { + struct cpuinfo_x86 *c = this_cpu_ptr(&cpu_info); + identify_boot_cpu(); + select_idle_routine(); + /* * identify_boot_cpu() initialized SMT support information, let the * core code know. */ - cpu_smt_set_num_threads(smp_num_siblings, smp_num_siblings); + cpu_smt_set_num_threads(__max_threads_per_core, __max_threads_per_core); if (!IS_ENABLED(CONFIG_SMP)) { pr_info("CPU: "); @@ -2376,9 +2387,25 @@ void __init arch_cpu_finalize_init(void) fpu__init_system(); fpu__init_cpu(); + /* + * Ensure that access to the per CPU representation has the initial + * boot CPU configuration. 
+ */ + *c = boot_cpu_data; + c->initialized = true; + alternative_instructions(); if (IS_ENABLED(CONFIG_X86_64)) { + unsigned long USER_PTR_MAX = TASK_SIZE_MAX; + + /* + * Enable this when LAM is gated on LASS support + if (cpu_feature_enabled(X86_FEATURE_LAM)) + USER_PTR_MAX = (1ul << 63) - PAGE_SIZE; + */ + runtime_const_init(ptr, USER_PTR_MAX); + /* * Make sure the first 2MB area is not mapped by huge pages * There are typically fixed size MTRRs in there and overlapping diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 885281ae79a5..1beccefbaff9 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -2,6 +2,11 @@ #ifndef ARCH_X86_CPU_H #define ARCH_X86_CPU_H +#include <asm/cpu.h> +#include <asm/topology.h> + +#include "topology.h" + /* attempt to consolidate cpu attributes */ struct cpu_dev { const char *c_vendor; @@ -56,9 +61,11 @@ extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state; extern void __init tsx_init(void); void tsx_ap_init(void); +void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c); #else static inline void tsx_init(void) { } static inline void tsx_ap_init(void) { } +static inline void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c) { } #endif /* CONFIG_CPU_SUP_INTEL */ extern void init_spectral_chicken(struct cpuinfo_x86 *c); @@ -71,14 +78,9 @@ extern void init_intel_cacheinfo(struct cpuinfo_x86 *c); extern void init_amd_cacheinfo(struct cpuinfo_x86 *c); extern void init_hygon_cacheinfo(struct cpuinfo_x86 *c); -extern void detect_num_cpu_cores(struct cpuinfo_x86 *c); -extern int detect_extended_topology_early(struct cpuinfo_x86 *c); -extern int detect_extended_topology(struct cpuinfo_x86 *c); -extern int detect_ht_early(struct cpuinfo_x86 *c); -extern void detect_ht(struct cpuinfo_x86 *c); extern void check_null_seg_clears_base(struct cpuinfo_x86 *c); -void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c); +void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, u16 die_id); void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c); unsigned int aperfmperf_get_khz(int cpu); @@ -96,4 +98,5 @@ static inline bool spectre_v2_in_eibrs_mode(enum spectre_v2_mitigation mode) mode == SPECTRE_V2_EIBRS_RETPOLINE || mode == SPECTRE_V2_EIBRS_LFENCE; } + #endif /* ARCH_X86_CPU_H */ diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index e462c1d3800a..df838e3bdbe0 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -44,7 +44,11 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_F16C, X86_FEATURE_XMM2, }, { X86_FEATURE_AES, X86_FEATURE_XMM2 }, { X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 }, + { X86_FEATURE_GFNI, X86_FEATURE_XMM2 }, + { X86_FEATURE_AVX_VNNI, X86_FEATURE_AVX }, { X86_FEATURE_FMA, X86_FEATURE_AVX }, + { X86_FEATURE_VAES, X86_FEATURE_AVX }, + { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX }, { X86_FEATURE_AVX2, X86_FEATURE_AVX, }, { X86_FEATURE_AVX512F, X86_FEATURE_AVX, }, { X86_FEATURE_AVX512IFMA, X86_FEATURE_AVX512F }, @@ -56,9 +60,6 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F }, { X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F }, { X86_FEATURE_AVX512_VBMI2, X86_FEATURE_AVX512VL }, - { X86_FEATURE_GFNI, X86_FEATURE_AVX512VL }, - { X86_FEATURE_VAES, X86_FEATURE_AVX512VL }, - { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX512VL }, { X86_FEATURE_AVX512_VNNI, X86_FEATURE_AVX512VL }, { X86_FEATURE_AVX512_BITALG, X86_FEATURE_AVX512VL }, { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F }, @@ -82,6 +83,7 @@ static const struct 
cpuid_dep cpuid_deps[] = { { X86_FEATURE_XFD, X86_FEATURE_XGETBV1 }, { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD }, { X86_FEATURE_SHSTK, X86_FEATURE_XSAVES }, + { X86_FEATURE_FRED, X86_FEATURE_LKGS }, {} }; @@ -112,6 +114,9 @@ static void do_clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature) if (WARN_ON(feature >= MAX_FEATURE_BITS)) return; + if (boot_cpu_has(feature)) + WARN_ON(alternatives_patched); + clear_feature(c, feature); /* Collect all features to disable, handling dependencies */ diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 9651275aecd1..dfec2c61e354 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -153,8 +153,8 @@ static void geode_configure(void) u8 ccr3; local_irq_save(flags); - /* Suspend on halt power saving and enable #SUSP pin */ - setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88); + /* Suspend on halt power saving */ + setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x08); ccr3 = getCx86(CX86_CCR3); setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ diff --git a/arch/x86/kernel/cpu/debugfs.c b/arch/x86/kernel/cpu/debugfs.c index 0c179d684b3b..cacfd3f6abef 100644 --- a/arch/x86/kernel/cpu/debugfs.c +++ b/arch/x86/kernel/cpu/debugfs.c @@ -5,6 +5,8 @@ #include <asm/apic.h> #include <asm/processor.h> +#include "cpu.h" + static int cpu_debug_show(struct seq_file *m, void *p) { unsigned long cpu = (unsigned long)m->private; @@ -20,13 +22,18 @@ static int cpu_debug_show(struct seq_file *m, void *p) seq_printf(m, "die_id: %u\n", c->topo.die_id); seq_printf(m, "cu_id: %u\n", c->topo.cu_id); seq_printf(m, "core_id: %u\n", c->topo.core_id); + seq_printf(m, "cpu_type: %s\n", get_topology_cpu_type_name(c)); seq_printf(m, "logical_pkg_id: %u\n", c->topo.logical_pkg_id); seq_printf(m, "logical_die_id: %u\n", c->topo.logical_die_id); + seq_printf(m, "logical_core_id: %u\n", c->topo.logical_core_id); seq_printf(m, "llc_id: %u\n", c->topo.llc_id); seq_printf(m, "l2c_id: %u\n", c->topo.l2c_id); - seq_printf(m, "max_cores: %u\n", c->x86_max_cores); - seq_printf(m, "max_die_per_pkg: %u\n", __max_die_per_package); - seq_printf(m, "smp_num_siblings: %u\n", smp_num_siblings); + seq_printf(m, "amd_node_id: %u\n", c->topo.amd_node_id); + seq_printf(m, "amd_nodes_per_pkg: %u\n", topology_amd_nodes_per_pkg()); + seq_printf(m, "num_threads: %u\n", __num_threads_per_package); + seq_printf(m, "num_cores: %u\n", __num_cores_per_package); + seq_printf(m, "max_dies_per_pkg: %u\n", __max_dies_per_package); + seq_printf(m, "max_threads_per_core:%u\n", __max_threads_per_core); return 0; } @@ -42,12 +49,48 @@ static const struct file_operations dfs_cpu_ops = { .release = single_release, }; +static int dom_debug_show(struct seq_file *m, void *p) +{ + static const char *domain_names[TOPO_MAX_DOMAIN] = { + [TOPO_SMT_DOMAIN] = "Thread", + [TOPO_CORE_DOMAIN] = "Core", + [TOPO_MODULE_DOMAIN] = "Module", + [TOPO_TILE_DOMAIN] = "Tile", + [TOPO_DIE_DOMAIN] = "Die", + [TOPO_DIEGRP_DOMAIN] = "DieGrp", + [TOPO_PKG_DOMAIN] = "Package", + }; + unsigned int dom, nthreads = 1; + + for (dom = 0; dom < TOPO_MAX_DOMAIN; dom++) { + nthreads *= x86_topo_system.dom_size[dom]; + seq_printf(m, "domain: %-10s shift: %u dom_size: %5u max_threads: %5u\n", + domain_names[dom], x86_topo_system.dom_shifts[dom], + x86_topo_system.dom_size[dom], nthreads); + } + return 0; +} + +static int dom_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, dom_debug_show, inode->i_private); +} + +static const struct file_operations dfs_dom_ops = { + .open = 
dom_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static __init int cpu_init_debugfs(void) { struct dentry *dir, *base = debugfs_create_dir("topo", arch_debugfs_dir); unsigned long id; char name[24]; + debugfs_create_file("domains", 0444, base, NULL, &dfs_dom_ops); + dir = debugfs_create_dir("cpus", base); for_each_possible_cpu(id) { sprintf(name, "%lu", id); diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c index 03851240c3e3..4a4118784c13 100644 --- a/arch/x86/kernel/cpu/feat_ctl.c +++ b/arch/x86/kernel/cpu/feat_ctl.c @@ -72,6 +72,8 @@ static void init_vmx_capabilities(struct cpuinfo_x86 *c) c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_AD); if (ept & VMX_EPT_1GB_PAGE_BIT) c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_1GB); + if (ept & VMX_EPT_PAGE_WALK_5_BIT) + c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_5LEVEL); /* Synthetic APIC features that are aggregates of multiple features. */ if ((c->vmx_capability[PRIMARY_CTLS] & VMX_F(VIRTUAL_TPR)) && @@ -186,7 +188,7 @@ update_caps: update_sgx: if (!(msr & FEAT_CTL_SGX_ENABLED)) { if (enable_sgx_kvm || enable_sgx_driver) - pr_err_once("SGX disabled by BIOS.\n"); + pr_err_once("SGX disabled or unsupported by BIOS.\n"); clear_cpu_cap(c, X86_FEATURE_SGX); return; } diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index 6f247d66758d..c5191b06f9f2 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -18,14 +18,6 @@ #include "cpu.h" -#define APICID_SOCKET_ID_BIT 6 - -/* - * nodes_per_socket: Stores the number of nodes per socket. - * Refer to CPUID Fn8000_001E_ECX Node Identifiers[10:8] - */ -static u32 nodes_per_socket = 1; - #ifdef CONFIG_NUMA /* * To workaround broken NUMA config. Read the comment in @@ -49,80 +41,6 @@ static int nearby_node(int apicid) } #endif -static void hygon_get_topology_early(struct cpuinfo_x86 *c) -{ - if (cpu_has(c, X86_FEATURE_TOPOEXT)) - smp_num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1; -} - -/* - * Fixup core topology information for - * (1) Hygon multi-node processors - * Assumption: Number of cores in each internal node is the same. - * (2) Hygon processors supporting compute units - */ -static void hygon_get_topology(struct cpuinfo_x86 *c) -{ - /* get information required for multi-node processors */ - if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { - int err; - u32 eax, ebx, ecx, edx; - - cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); - - c->topo.die_id = ecx & 0xff; - - c->topo.core_id = ebx & 0xff; - - if (smp_num_siblings > 1) - c->x86_max_cores /= smp_num_siblings; - - /* - * In case leaf B is available, use it to derive - * topology information. - */ - err = detect_extended_topology(c); - if (!err) - c->x86_coreid_bits = get_count_order(c->x86_max_cores); - - /* - * Socket ID is ApicId[6] for the processors with model <= 0x3 - * when running on host. - */ - if (!boot_cpu_has(X86_FEATURE_HYPERVISOR) && c->x86_model <= 0x3) - c->topo.pkg_id = c->topo.apicid >> APICID_SOCKET_ID_BIT; - - cacheinfo_hygon_init_llc_id(c); - } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { - u64 value; - - rdmsrl(MSR_FAM10H_NODE_ID, value); - c->topo.die_id = value & 7; - c->topo.llc_id = c->topo.die_id; - } else - return; - - if (nodes_per_socket > 1) - set_cpu_cap(c, X86_FEATURE_AMD_DCM); -} - -/* - * On Hygon setup the lower bits of the APIC id distinguish the cores. - * Assumes number of cores is a power of two. 
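The removed hygon_detect_cmp() just below splits the initial APIC ID into a core ID (its low x86_coreid_bits bits) and a package ID (the remaining high bits), the legacy scheme that the new topology code replaces. A standalone sketch of that decomposition, with made-up example values:

#include <stdint.h>
#include <stdio.h>

/* Split an APIC ID the way the removed hygon_detect_cmp() did. */
static void split_apicid(uint32_t apicid, unsigned int coreid_bits,
			 uint32_t *core_id, uint32_t *pkg_id)
{
	*core_id = apicid & ((1u << coreid_bits) - 1);	/* low bits: core within socket */
	*pkg_id  = apicid >> coreid_bits;		/* remaining bits: socket */
}

int main(void)
{
	uint32_t core, pkg;

	/* Example: 3 core-id bits, APIC ID 0x1d -> core 5 in package 3. */
	split_apicid(0x1d, 3, &core, &pkg);
	printf("core %u, package %u\n", core, pkg);
	return 0;
}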
- */ -static void hygon_detect_cmp(struct cpuinfo_x86 *c) -{ - unsigned int bits; - - bits = c->x86_coreid_bits; - /* Low order bits define the core id (index of core in socket) */ - c->topo.core_id = c->topo.initial_apicid & ((1 << bits)-1); - /* Convert the initial APIC ID into the socket ID */ - c->topo.pkg_id = c->topo.initial_apicid >> bits; - /* Use package ID also for last level cache */ - c->topo.llc_id = c->topo.die_id = c->topo.pkg_id; -} - static void srat_detect_node(struct cpuinfo_x86 *c) { #ifdef CONFIG_NUMA @@ -173,32 +91,6 @@ static void srat_detect_node(struct cpuinfo_x86 *c) #endif } -static void early_init_hygon_mc(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_SMP - unsigned int bits, ecx; - - /* Multi core CPU? */ - if (c->extended_cpuid_level < 0x80000008) - return; - - ecx = cpuid_ecx(0x80000008); - - c->x86_max_cores = (ecx & 0xff) + 1; - - /* CPU telling us the core id bits shift? */ - bits = (ecx >> 12) & 0xF; - - /* Otherwise recompute */ - if (bits == 0) { - while ((1 << bits) < c->x86_max_cores) - bits++; - } - - c->x86_coreid_bits = bits; -#endif -} - static void bsp_init_hygon(struct cpuinfo_x86 *c) { if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { @@ -212,18 +104,6 @@ static void bsp_init_hygon(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_MWAITX)) use_mwaitx_delay(); - if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { - u32 ecx; - - ecx = cpuid_ecx(0x8000001e); - __max_die_per_package = nodes_per_socket = ((ecx >> 8) & 7) + 1; - } else if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) { - u64 value; - - rdmsrl(MSR_FAM10H_NODE_ID, value); - __max_die_per_package = nodes_per_socket = ((value >> 3) & 7) + 1; - } - if (!boot_cpu_has(X86_FEATURE_AMD_SSBD) && !boot_cpu_has(X86_FEATURE_VIRT_SSBD)) { /* @@ -242,8 +122,6 @@ static void early_init_hygon(struct cpuinfo_x86 *c) { u32 dummy; - early_init_hygon_mc(c); - set_cpu_cap(c, X86_FEATURE_K8); rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); @@ -284,8 +162,6 @@ static void early_init_hygon(struct cpuinfo_x86 *c) * we can set it unconditionally. */ set_cpu_cap(c, X86_FEATURE_VMMCALL); - - hygon_get_topology_early(c); } static void init_hygon(struct cpuinfo_x86 *c) @@ -302,9 +178,6 @@ static void init_hygon(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_REP_GOOD); - /* get apicid instead of initial apic id from cpuid */ - c->topo.apicid = read_apic_id(); - /* * XXX someone from Hygon needs to confirm this DTRT * @@ -316,8 +189,6 @@ static void init_hygon(struct cpuinfo_x86 *c) cpu_detect_cache_sizes(c); - hygon_detect_cmp(c); - hygon_get_topology(c); srat_detect_node(c); init_hygon_cacheinfo(c); @@ -354,6 +225,9 @@ static void init_hygon(struct cpuinfo_x86 *c) set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); check_null_seg_clears_base(c); + + /* Hygon CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. 
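The removed early_init_hygon_mc() above decoded CPUID 0x80000008 ECX: bits 7:0 give the core count minus one, bits 15:12 give the APIC-ID shift for the core ID, and a zero field falls back to the order of the core count. A sketch of the same decode with a synthetic ECX value:

#include <stdint.h>
#include <stdio.h>

/* Decode CPUID 0x80000008 ECX as the removed early_init_hygon_mc() did. */
static unsigned int coreid_bits_from_ecx(uint32_t ecx, unsigned int *ncores)
{
	unsigned int cores = (ecx & 0xff) + 1;	/* bits 7:0: core count - 1 */
	unsigned int bits  = (ecx >> 12) & 0xf;	/* bits 15:12: APIC-ID core-id size */

	if (bits == 0) {			/* not reported: recompute from the count */
		while ((1u << bits) < cores)
			bits++;
	}
	*ncores = cores;
	return bits;
}

int main(void)
{
	unsigned int cores;
	/* Synthetic ECX: 12 cores, core-id size field left at zero. */
	unsigned int bits = coreid_bits_from_ecx(0x0000000b, &cores);

	printf("%u cores, %u core-id bits\n", cores, bits);	/* 12 cores, 4 bits */
	return 0;
}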
*/ + clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE); } static void cpu_detect_tlb_hygon(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index a927a8fc9624..134368a3f4b1 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -7,13 +7,9 @@ #include <linux/smp.h> #include <linux/sched.h> #include <linux/sched/clock.h> -#include <linux/semaphore.h> #include <linux/thread_info.h> #include <linux/init.h> #include <linux/uaccess.h> -#include <linux/workqueue.h> -#include <linux/delay.h> -#include <linux/cpuhotplug.h> #include <asm/cpufeature.h> #include <asm/msr.h> @@ -24,8 +20,6 @@ #include <asm/hwcap2.h> #include <asm/elf.h> #include <asm/cpu_device_id.h> -#include <asm/cmdline.h> -#include <asm/traps.h> #include <asm/resctrl.h> #include <asm/numa.h> #include <asm/thermal.h> @@ -41,28 +35,6 @@ #include <asm/apic.h> #endif -enum split_lock_detect_state { - sld_off = 0, - sld_warn, - sld_fatal, - sld_ratelimit, -}; - -/* - * Default to sld_off because most systems do not support split lock detection. - * sld_state_setup() will switch this to sld_warn on systems that support - * split lock/bus lock detect, unless there is a command line override. - */ -static enum split_lock_detect_state sld_state __ro_after_init = sld_off; -static u64 msr_test_ctrl_cache __ro_after_init; - -/* - * With a name like MSR_TEST_CTL it should go without saying, but don't touch - * MSR_TEST_CTL unless the CPU is one of the whitelisted models. Writing it - * on CPUs that do not support SLD can cause fireworks, even when writing '0'. - */ -static bool cpu_model_supports_sld __ro_after_init; - /* * Processors which have self-snooping capability can handle conflicting * memory type across CPUs by snooping its own cache. 
However, there exists @@ -72,19 +44,19 @@ static bool cpu_model_supports_sld __ro_after_init; */ static void check_memory_type_self_snoop_errata(struct cpuinfo_x86 *c) { - switch (c->x86_model) { - case INTEL_FAM6_CORE_YONAH: - case INTEL_FAM6_CORE2_MEROM: - case INTEL_FAM6_CORE2_MEROM_L: - case INTEL_FAM6_CORE2_PENRYN: - case INTEL_FAM6_CORE2_DUNNINGTON: - case INTEL_FAM6_NEHALEM: - case INTEL_FAM6_NEHALEM_G: - case INTEL_FAM6_NEHALEM_EP: - case INTEL_FAM6_NEHALEM_EX: - case INTEL_FAM6_WESTMERE: - case INTEL_FAM6_WESTMERE_EP: - case INTEL_FAM6_SANDYBRIDGE: + switch (c->x86_vfm) { + case INTEL_CORE_YONAH: + case INTEL_CORE2_MEROM: + case INTEL_CORE2_MEROM_L: + case INTEL_CORE2_PENRYN: + case INTEL_CORE2_DUNNINGTON: + case INTEL_NEHALEM: + case INTEL_NEHALEM_G: + case INTEL_NEHALEM_EP: + case INTEL_NEHALEM_EX: + case INTEL_WESTMERE: + case INTEL_WESTMERE_EP: + case INTEL_SANDYBRIDGE: setup_clear_cpu_cap(X86_FEATURE_SELFSNOOP); } } @@ -106,9 +78,9 @@ static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c) */ if (c->x86 != 6) return; - switch (c->x86_model) { - case INTEL_FAM6_XEON_PHI_KNL: - case INTEL_FAM6_XEON_PHI_KNM: + switch (c->x86_vfm) { + case INTEL_XEON_PHI_KNL: + case INTEL_XEON_PHI_KNM: break; default: return; @@ -134,32 +106,32 @@ static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c) * - Release note from 20180108 microcode release */ struct sku_microcode { - u8 model; + u32 vfm; u8 stepping; u32 microcode; }; static const struct sku_microcode spectre_bad_microcodes[] = { - { INTEL_FAM6_KABYLAKE, 0x0B, 0x80 }, - { INTEL_FAM6_KABYLAKE, 0x0A, 0x80 }, - { INTEL_FAM6_KABYLAKE, 0x09, 0x80 }, - { INTEL_FAM6_KABYLAKE_L, 0x0A, 0x80 }, - { INTEL_FAM6_KABYLAKE_L, 0x09, 0x80 }, - { INTEL_FAM6_SKYLAKE_X, 0x03, 0x0100013e }, - { INTEL_FAM6_SKYLAKE_X, 0x04, 0x0200003c }, - { INTEL_FAM6_BROADWELL, 0x04, 0x28 }, - { INTEL_FAM6_BROADWELL_G, 0x01, 0x1b }, - { INTEL_FAM6_BROADWELL_D, 0x02, 0x14 }, - { INTEL_FAM6_BROADWELL_D, 0x03, 0x07000011 }, - { INTEL_FAM6_BROADWELL_X, 0x01, 0x0b000025 }, - { INTEL_FAM6_HASWELL_L, 0x01, 0x21 }, - { INTEL_FAM6_HASWELL_G, 0x01, 0x18 }, - { INTEL_FAM6_HASWELL, 0x03, 0x23 }, - { INTEL_FAM6_HASWELL_X, 0x02, 0x3b }, - { INTEL_FAM6_HASWELL_X, 0x04, 0x10 }, - { INTEL_FAM6_IVYBRIDGE_X, 0x04, 0x42a }, + { INTEL_KABYLAKE, 0x0B, 0x80 }, + { INTEL_KABYLAKE, 0x0A, 0x80 }, + { INTEL_KABYLAKE, 0x09, 0x80 }, + { INTEL_KABYLAKE_L, 0x0A, 0x80 }, + { INTEL_KABYLAKE_L, 0x09, 0x80 }, + { INTEL_SKYLAKE_X, 0x03, 0x0100013e }, + { INTEL_SKYLAKE_X, 0x04, 0x0200003c }, + { INTEL_BROADWELL, 0x04, 0x28 }, + { INTEL_BROADWELL_G, 0x01, 0x1b }, + { INTEL_BROADWELL_D, 0x02, 0x14 }, + { INTEL_BROADWELL_D, 0x03, 0x07000011 }, + { INTEL_BROADWELL_X, 0x01, 0x0b000025 }, + { INTEL_HASWELL_L, 0x01, 0x21 }, + { INTEL_HASWELL_G, 0x01, 0x18 }, + { INTEL_HASWELL, 0x03, 0x23 }, + { INTEL_HASWELL_X, 0x02, 0x3b }, + { INTEL_HASWELL_X, 0x04, 0x10 }, + { INTEL_IVYBRIDGE_X, 0x04, 0x42a }, /* Observed in the wild */ - { INTEL_FAM6_SANDYBRIDGE_X, 0x06, 0x61b }, - { INTEL_FAM6_SANDYBRIDGE_X, 0x07, 0x712 }, + { INTEL_SANDYBRIDGE_X, 0x06, 0x61b }, + { INTEL_SANDYBRIDGE_X, 0x07, 0x712 }, }; static bool bad_spectre_microcode(struct cpuinfo_x86 *c) @@ -173,29 +145,70 @@ static bool bad_spectre_microcode(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_HYPERVISOR)) return false; - if (c->x86 != 6) - return false; - for (i = 0; i < ARRAY_SIZE(spectre_bad_microcodes); i++) { - if (c->x86_model == spectre_bad_microcodes[i].model && + if (c->x86_vfm == spectre_bad_microcodes[i].vfm && c->x86_stepping == 
spectre_bad_microcodes[i].stepping) return (c->microcode <= spectre_bad_microcodes[i].microcode); } return false; } -static void early_init_intel(struct cpuinfo_x86 *c) +#define MSR_IA32_TME_ACTIVATE 0x982 + +/* Helpers to access TME_ACTIVATE MSR */ +#define TME_ACTIVATE_LOCKED(x) (x & 0x1) +#define TME_ACTIVATE_ENABLED(x) (x & 0x2) + +#define TME_ACTIVATE_KEYID_BITS(x) ((x >> 32) & 0xf) /* Bits 35:32 */ + +static void detect_tme_early(struct cpuinfo_x86 *c) { - u64 misc_enable; + u64 tme_activate; + int keyid_bits; - /* Unmask CPUID levels if masked: */ - if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) { - if (msr_clear_bit(MSR_IA32_MISC_ENABLE, - MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) > 0) { - c->cpuid_level = cpuid_eax(0); - get_cpu_cap(c); - } + rdmsrl(MSR_IA32_TME_ACTIVATE, tme_activate); + + if (!TME_ACTIVATE_LOCKED(tme_activate) || !TME_ACTIVATE_ENABLED(tme_activate)) { + pr_info_once("x86/tme: not enabled by BIOS\n"); + clear_cpu_cap(c, X86_FEATURE_TME); + return; } + pr_info_once("x86/tme: enabled by BIOS\n"); + keyid_bits = TME_ACTIVATE_KEYID_BITS(tme_activate); + if (!keyid_bits) + return; + + /* + * KeyID bits are set by BIOS and can be present regardless + * of whether the kernel is using them. They effectively lower + * the number of physical address bits. + * + * Update cpuinfo_x86::x86_phys_bits accordingly. + */ + c->x86_phys_bits -= keyid_bits; + pr_info_once("x86/mktme: BIOS enabled: x86_phys_bits reduced by %d\n", + keyid_bits); +} + +void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return; + + if (c->x86 < 6 || (c->x86 == 6 && c->x86_model < 0xd)) + return; + + /* + * The BIOS can have limited CPUID to leaf 2, which breaks feature + * enumeration. Unlock it and update the maximum leaf info. + */ + if (msr_clear_bit(MSR_IA32_MISC_ENABLE, MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) > 0) + c->cpuid_level = cpuid_eax(0); +} + +static void early_init_intel(struct cpuinfo_x86 *c) +{ + u64 misc_enable; if ((c->x86 == 0xf && c->x86_model >= 0x03) || (c->x86 == 0x6 && c->x86_model >= 0x0e)) @@ -228,7 +241,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) * need the microcode to have already been loaded... so if it is * not, recommend a BIOS update and disable large pages. */ - if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_stepping <= 2 && + if (c->x86_vfm == INTEL_ATOM_BONNELL && c->x86_stepping <= 2 && c->microcode < 0x20e) { pr_warn("Atom PSE erratum detected, BIOS microcode update recommended\n"); clear_cpu_cap(c, X86_FEATURE_PSE); @@ -260,30 +273,28 @@ static void early_init_intel(struct cpuinfo_x86 *c) } /* Penwell and Cloverview have the TSC which doesn't sleep on S3 */ - if (c->x86 == 6) { - switch (c->x86_model) { - case INTEL_FAM6_ATOM_SALTWELL_MID: - case INTEL_FAM6_ATOM_SALTWELL_TABLET: - case INTEL_FAM6_ATOM_SILVERMONT_MID: - case INTEL_FAM6_ATOM_AIRMONT_NP: - set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3); - break; - default: - break; - } + switch (c->x86_vfm) { + case INTEL_ATOM_SALTWELL_MID: + case INTEL_ATOM_SALTWELL_TABLET: + case INTEL_ATOM_SILVERMONT_MID: + case INTEL_ATOM_AIRMONT_NP: + set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3); + break; } /* - * There is a known erratum on Pentium III and Core Solo - * and Core Duo CPUs. - * " Page with PAT set to WC while associated MTRR is UC - * may consolidate to UC " - * Because of this erratum, it is better to stick with - * setting WC in MTRR rather than using PAT on these CPUs. 
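bad_spectre_microcode() earlier in this hunk now keys the table on the packed vendor/family/model value plus stepping and treats a loaded revision at or below the listed one as bad. A standalone sketch of that scan; the table entry here is made up, not copied from the patch.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Same shape as the bad_spectre_microcode() scan above. */
struct sku_microcode {
	uint32_t vfm;		/* packed vendor/family/model */
	uint8_t  stepping;
	uint32_t microcode;	/* newest revision known to be bad */
};

static const struct sku_microcode bad[] = {
	{ 0x00060655, 0x03, 0x80 },	/* illustrative entry only */
};

static bool microcode_is_bad(uint32_t vfm, uint8_t stepping, uint32_t rev)
{
	for (size_t i = 0; i < sizeof(bad) / sizeof(bad[0]); i++) {
		if (vfm == bad[i].vfm && stepping == bad[i].stepping)
			return rev <= bad[i].microcode;
	}
	return false;
}

int main(void)
{
	printf("%d\n", microcode_is_bad(0x00060655, 0x03, 0x7f));	/* 1: still on bad ucode */
	printf("%d\n", microcode_is_bad(0x00060655, 0x03, 0x84));	/* 0: updated */
	return 0;
}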
+ * PAT is broken on early family 6 CPUs, the last of which + * is "Yonah" where the erratum is named "AN7": * - * Enable PAT WC only on P4, Core 2 or later CPUs. + * Page with PAT (Page Attribute Table) Set to USWC + * (Uncacheable Speculative Write Combine) While + * Associated MTRR (Memory Type Range Register) Is UC + * (Uncacheable) May Consolidate to UC + * + * Disable PAT and fall back to MTRR on these CPUs. */ - if (c->x86 == 6 && c->x86_model < 15) + if (c->x86_vfm >= INTEL_PENTIUM_PRO && + c->x86_vfm <= INTEL_CORE_YONAH) clear_cpu_cap(c, X86_FEATURE_PAT); /* @@ -309,7 +320,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) * should be false so that __flush_tlb_all() causes CR3 instead of CR4.PGE * to be modified. */ - if (c->x86 == 5 && c->x86_model == 9) { + if (c->x86_vfm == INTEL_QUARK_X1000) { pr_info("Disabling PGE capability bit\n"); setup_clear_cpu_cap(X86_FEATURE_PGE); } @@ -317,11 +328,11 @@ static void early_init_intel(struct cpuinfo_x86 *c) check_memory_type_self_snoop_errata(c); /* - * Get the number of SMT siblings early from the extended topology - * leaf, if available. Otherwise try the legacy SMT detection. + * Adjust the number of physical bits early because it affects the + * valid bits of the MTRR mask registers. */ - if (detect_extended_topology_early(c) < 0) - detect_ht_early(c); + if (cpu_has(c, X86_FEATURE_TME)) + detect_tme_early(c); } static void bsp_init_intel(struct cpuinfo_x86 *c) @@ -482,90 +493,6 @@ static void srat_detect_node(struct cpuinfo_x86 *c) #endif } -#define MSR_IA32_TME_ACTIVATE 0x982 - -/* Helpers to access TME_ACTIVATE MSR */ -#define TME_ACTIVATE_LOCKED(x) (x & 0x1) -#define TME_ACTIVATE_ENABLED(x) (x & 0x2) - -#define TME_ACTIVATE_POLICY(x) ((x >> 4) & 0xf) /* Bits 7:4 */ -#define TME_ACTIVATE_POLICY_AES_XTS_128 0 - -#define TME_ACTIVATE_KEYID_BITS(x) ((x >> 32) & 0xf) /* Bits 35:32 */ - -#define TME_ACTIVATE_CRYPTO_ALGS(x) ((x >> 48) & 0xffff) /* Bits 63:48 */ -#define TME_ACTIVATE_CRYPTO_AES_XTS_128 1 - -/* Values for mktme_status (SW only construct) */ -#define MKTME_ENABLED 0 -#define MKTME_DISABLED 1 -#define MKTME_UNINITIALIZED 2 -static int mktme_status = MKTME_UNINITIALIZED; - -static void detect_tme(struct cpuinfo_x86 *c) -{ - u64 tme_activate, tme_policy, tme_crypto_algs; - int keyid_bits = 0, nr_keyids = 0; - static u64 tme_activate_cpu0 = 0; - - rdmsrl(MSR_IA32_TME_ACTIVATE, tme_activate); - - if (mktme_status != MKTME_UNINITIALIZED) { - if (tme_activate != tme_activate_cpu0) { - /* Broken BIOS? */ - pr_err_once("x86/tme: configuration is inconsistent between CPUs\n"); - pr_err_once("x86/tme: MKTME is not usable\n"); - mktme_status = MKTME_DISABLED; - - /* Proceed. We may need to exclude bits from x86_phys_bits. 
*/ - } - } else { - tme_activate_cpu0 = tme_activate; - } - - if (!TME_ACTIVATE_LOCKED(tme_activate) || !TME_ACTIVATE_ENABLED(tme_activate)) { - pr_info_once("x86/tme: not enabled by BIOS\n"); - mktme_status = MKTME_DISABLED; - return; - } - - if (mktme_status != MKTME_UNINITIALIZED) - goto detect_keyid_bits; - - pr_info("x86/tme: enabled by BIOS\n"); - - tme_policy = TME_ACTIVATE_POLICY(tme_activate); - if (tme_policy != TME_ACTIVATE_POLICY_AES_XTS_128) - pr_warn("x86/tme: Unknown policy is active: %#llx\n", tme_policy); - - tme_crypto_algs = TME_ACTIVATE_CRYPTO_ALGS(tme_activate); - if (!(tme_crypto_algs & TME_ACTIVATE_CRYPTO_AES_XTS_128)) { - pr_err("x86/mktme: No known encryption algorithm is supported: %#llx\n", - tme_crypto_algs); - mktme_status = MKTME_DISABLED; - } -detect_keyid_bits: - keyid_bits = TME_ACTIVATE_KEYID_BITS(tme_activate); - nr_keyids = (1UL << keyid_bits) - 1; - if (nr_keyids) { - pr_info_once("x86/mktme: enabled by BIOS\n"); - pr_info_once("x86/mktme: %d KeyIDs available\n", nr_keyids); - } else { - pr_info_once("x86/mktme: disabled by BIOS\n"); - } - - if (mktme_status == MKTME_UNINITIALIZED) { - /* MKTME is usable */ - mktme_status = MKTME_ENABLED; - } - - /* - * KeyID bits effectively lower the number of physical address - * bits. Update cpuinfo_x86::x86_phys_bits accordingly. - */ - c->x86_phys_bits -= keyid_bits; -} - static void init_cpuid_fault(struct cpuinfo_x86 *c) { u64 msr; @@ -594,33 +521,12 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c) wrmsrl(MSR_MISC_FEATURES_ENABLES, msr); } -static void split_lock_init(void); -static void bus_lock_init(void); - static void init_intel(struct cpuinfo_x86 *c) { early_init_intel(c); intel_workarounds(c); - /* - * Detect the extended topology information if available. This - * will reinitialise the initial_apicid which will be used - * in init_intel_cacheinfo() - */ - detect_extended_topology(c); - - if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) { - /* - * let's use the legacy cpuid vector 0x1 and 0x4 for topology - * detection. 
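The removed detect_tme() above, and its detect_tme_early() replacement earlier in this file, decode fields of MSR_IA32_TME_ACTIVATE: lock and enable bits, the policy in bits 7:4, the KeyID width in bits 35:32, and the crypto-algorithm mask in bits 63:48, with the KeyID bits subtracted from x86_phys_bits. A sketch of that decode on a synthetic MSR value (the ENABLED helper is normalized to 0/1 for printing):

#include <stdint.h>
#include <stdio.h>

/* Field helpers mirroring the TME_ACTIVATE_* macros in this diff. */
#define TME_LOCKED(x)		((unsigned int)((x) & 0x1))
#define TME_ENABLED(x)		((unsigned int)(((x) >> 1) & 0x1))
#define TME_POLICY(x)		((unsigned int)(((x) >> 4) & 0xf))	/* bits 7:4 */
#define TME_KEYID_BITS(x)	((unsigned int)(((x) >> 32) & 0xf))	/* bits 35:32 */
#define TME_CRYPTO_ALGS(x)	((unsigned int)(((x) >> 48) & 0xffff))	/* bits 63:48 */

int main(void)
{
	/* Synthetic MSR_IA32_TME_ACTIVATE: locked, enabled, 6 KeyID bits, alg bit 0 set. */
	uint64_t act = 0x0001000600000003ULL;
	unsigned int phys_bits = 46;		/* pretend CPUID reported 46 */

	printf("locked=%u enabled=%u policy=%u keyid_bits=%u algs=%#x\n",
	       TME_LOCKED(act), TME_ENABLED(act), TME_POLICY(act),
	       TME_KEYID_BITS(act), TME_CRYPTO_ALGS(act));

	/* As in detect_tme_early(): KeyID bits shrink the usable physical address width. */
	phys_bits -= TME_KEYID_BITS(act);
	printf("effective x86_phys_bits: %u\n", phys_bits);
	return 0;
}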
- */ - detect_num_cpu_cores(c); -#ifdef CONFIG_X86_32 - detect_ht(c); -#endif - } - init_intel_cacheinfo(c); if (c->cpuid_level > 9) { @@ -643,12 +549,15 @@ static void init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_PEBS); } - if (c->x86 == 6 && boot_cpu_has(X86_FEATURE_CLFLUSH) && - (c->x86_model == 29 || c->x86_model == 46 || c->x86_model == 47)) + if (boot_cpu_has(X86_FEATURE_CLFLUSH) && + (c->x86_vfm == INTEL_CORE2_DUNNINGTON || + c->x86_vfm == INTEL_NEHALEM_EX || + c->x86_vfm == INTEL_WESTMERE_EX)) set_cpu_bug(c, X86_BUG_CLFLUSH_MONITOR); - if (c->x86 == 6 && boot_cpu_has(X86_FEATURE_MWAIT) && - ((c->x86_model == INTEL_FAM6_ATOM_GOLDMONT))) + if (boot_cpu_has(X86_FEATURE_MWAIT) && + (c->x86_vfm == INTEL_ATOM_GOLDMONT || + c->x86_vfm == INTEL_LUNARLAKE_M)) set_cpu_bug(c, X86_BUG_MONITOR); #ifdef CONFIG_X86_64 @@ -690,11 +599,6 @@ static void init_intel(struct cpuinfo_x86 *c) if (p) strcpy(c->x86_model_id, p); } - - if (c->x86 == 15) - set_cpu_cap(c, X86_FEATURE_P4); - if (c->x86 == 6) - set_cpu_cap(c, X86_FEATURE_P3); #endif /* Work around errata */ @@ -702,13 +606,9 @@ static void init_intel(struct cpuinfo_x86 *c) init_ia32_feat_ctl(c); - if (cpu_has(c, X86_FEATURE_TME)) - detect_tme(c); - init_intel_misc_features(c); split_lock_init(); - bus_lock_init(); intel_init_thermal(c); } @@ -735,26 +635,37 @@ static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) } #endif -#define TLB_INST_4K 0x01 -#define TLB_INST_4M 0x02 -#define TLB_INST_2M_4M 0x03 +#define TLB_INST_4K 0x01 +#define TLB_INST_4M 0x02 +#define TLB_INST_2M_4M 0x03 -#define TLB_INST_ALL 0x05 -#define TLB_INST_1G 0x06 +#define TLB_INST_ALL 0x05 +#define TLB_INST_1G 0x06 -#define TLB_DATA_4K 0x11 -#define TLB_DATA_4M 0x12 -#define TLB_DATA_2M_4M 0x13 -#define TLB_DATA_4K_4M 0x14 +#define TLB_DATA_4K 0x11 +#define TLB_DATA_4M 0x12 +#define TLB_DATA_2M_4M 0x13 +#define TLB_DATA_4K_4M 0x14 -#define TLB_DATA_1G 0x16 +#define TLB_DATA_1G 0x16 +#define TLB_DATA_1G_2M_4M 0x17 -#define TLB_DATA0_4K 0x21 -#define TLB_DATA0_4M 0x22 -#define TLB_DATA0_2M_4M 0x23 +#define TLB_DATA0_4K 0x21 +#define TLB_DATA0_4M 0x22 +#define TLB_DATA0_2M_4M 0x23 -#define STLB_4K 0x41 -#define STLB_4K_2M 0x42 +#define STLB_4K 0x41 +#define STLB_4K_2M 0x42 + +/* + * All of leaf 0x2's one-byte TLB descriptors implies the same number of + * entries for their respective TLB types. The 0x63 descriptor is an + * exception: it implies 4 dTLB entries for 1GB pages 32 dTLB entries + * for 2MB or 4MB pages. Encode descriptor 0x63 dTLB entry count for + * 2MB/4MB pages here, as its count for dTLB 1GB pages is already at the + * intel_tlb_table[] mapping. 
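The comment above describes the 0x63 special case that the new TLB_DATA_1G_2M_4M handling in intel_tlb_lookup() below implements: one leaf-0x2 descriptor credits two dTLB sizes. A toy version of just that step, using the entry counts from the comment (4 x 1G, 32 x 2M/4M):

#include <stdio.h>

/* Toy version of the 0x63 handling: one descriptor credits two dTLB sizes. */
struct dtlb_counts { unsigned int e1g, e2m, e4m; };

static void credit_descriptor_0x63(struct dtlb_counts *t)
{
	if (t->e2m < 32)	/* implied 2M/4M entries (TLB_0x63_2M_4M_ENTRIES) */
		t->e2m = 32;
	if (t->e4m < 32)
		t->e4m = 32;
	if (t->e1g < 4)		/* 1G entry count carried by the table itself */
		t->e1g = 4;
}

int main(void)
{
	struct dtlb_counts t = { 0, 0, 0 };

	credit_descriptor_0x63(&t);
	printf("dTLB: 1G=%u 2M=%u 4M=%u\n", t.e1g, t.e2m, t.e4m);
	return 0;
}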
+ */ +#define TLB_0x63_2M_4M_ENTRIES 32 static const struct _tlb_table intel_tlb_table[] = { { 0x01, TLB_INST_4K, 32, " TLB_INST 4 KByte pages, 4-way set associative" }, @@ -776,7 +687,8 @@ static const struct _tlb_table intel_tlb_table[] = { { 0x5c, TLB_DATA_4K_4M, 128, " TLB_DATA 4 KByte and 4 MByte pages" }, { 0x5d, TLB_DATA_4K_4M, 256, " TLB_DATA 4 KByte and 4 MByte pages" }, { 0x61, TLB_INST_4K, 48, " TLB_INST 4 KByte pages, full associative" }, - { 0x63, TLB_DATA_1G, 4, " TLB_DATA 1 GByte pages, 4-way set associative" }, + { 0x63, TLB_DATA_1G_2M_4M, 4, " TLB_DATA 1 GByte pages, 4-way set associative" + " (plus 32 entries TLB_DATA 2 MByte or 4 MByte pages, not encoded here)" }, { 0x6b, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 8-way associative" }, { 0x6c, TLB_DATA_2M_4M, 128, " TLB_DATA 2 MByte or 4 MByte pages, 8-way associative" }, { 0x6d, TLB_DATA_1G, 16, " TLB_DATA 1 GByte pages, fully associative" }, @@ -876,6 +788,12 @@ static void intel_tlb_lookup(const unsigned char desc) if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; break; + case TLB_DATA_1G_2M_4M: + if (tlb_lld_2m[ENTRIES] < TLB_0x63_2M_4M_ENTRIES) + tlb_lld_2m[ENTRIES] = TLB_0x63_2M_4M_ENTRIES; + if (tlb_lld_4m[ENTRIES] < TLB_0x63_2M_4M_ENTRIES) + tlb_lld_4m[ENTRIES] = TLB_0x63_2M_4M_ENTRIES; + fallthrough; case TLB_DATA_1G: if (tlb_lld_1g[ENTRIES] < intel_tlb_table[k].entries) tlb_lld_1g[ENTRIES] = intel_tlb_table[k].entries; @@ -899,7 +817,7 @@ static void intel_detect_tlb(struct cpuinfo_x86 *c) cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]); /* If bit 31 is set, this is an unknown format */ - for (j = 0 ; j < 3 ; j++) + for (j = 0 ; j < 4 ; j++) if (regs[j] & (1 << 31)) regs[j] = 0; @@ -974,381 +892,6 @@ static const struct cpu_dev intel_cpu_dev = { cpu_dev_register(intel_cpu_dev); -#undef pr_fmt -#define pr_fmt(fmt) "x86/split lock detection: " fmt - -static const struct { - const char *option; - enum split_lock_detect_state state; -} sld_options[] __initconst = { - { "off", sld_off }, - { "warn", sld_warn }, - { "fatal", sld_fatal }, - { "ratelimit:", sld_ratelimit }, -}; - -static struct ratelimit_state bld_ratelimit; - -static unsigned int sysctl_sld_mitigate = 1; -static DEFINE_SEMAPHORE(buslock_sem, 1); - -#ifdef CONFIG_PROC_SYSCTL -static struct ctl_table sld_sysctls[] = { - { - .procname = "split_lock_mitigate", - .data = &sysctl_sld_mitigate, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_douintvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -}; - -static int __init sld_mitigate_sysctl_init(void) -{ - register_sysctl_init("kernel", sld_sysctls); - return 0; -} - -late_initcall(sld_mitigate_sysctl_init); -#endif - -static inline bool match_option(const char *arg, int arglen, const char *opt) -{ - int len = strlen(opt), ratelimit; - - if (strncmp(arg, opt, len)) - return false; - - /* - * Min ratelimit is 1 bus lock/sec. - * Max ratelimit is 1000 bus locks/sec. 
- */ - if (sscanf(arg, "ratelimit:%d", &ratelimit) == 1 && - ratelimit > 0 && ratelimit <= 1000) { - ratelimit_state_init(&bld_ratelimit, HZ, ratelimit); - ratelimit_set_flags(&bld_ratelimit, RATELIMIT_MSG_ON_RELEASE); - return true; - } - - return len == arglen; -} - -static bool split_lock_verify_msr(bool on) -{ - u64 ctrl, tmp; - - if (rdmsrl_safe(MSR_TEST_CTRL, &ctrl)) - return false; - if (on) - ctrl |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT; - else - ctrl &= ~MSR_TEST_CTRL_SPLIT_LOCK_DETECT; - if (wrmsrl_safe(MSR_TEST_CTRL, ctrl)) - return false; - rdmsrl(MSR_TEST_CTRL, tmp); - return ctrl == tmp; -} - -static void __init sld_state_setup(void) -{ - enum split_lock_detect_state state = sld_warn; - char arg[20]; - int i, ret; - - if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && - !boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) - return; - - ret = cmdline_find_option(boot_command_line, "split_lock_detect", - arg, sizeof(arg)); - if (ret >= 0) { - for (i = 0; i < ARRAY_SIZE(sld_options); i++) { - if (match_option(arg, ret, sld_options[i].option)) { - state = sld_options[i].state; - break; - } - } - } - sld_state = state; -} - -static void __init __split_lock_setup(void) -{ - if (!split_lock_verify_msr(false)) { - pr_info("MSR access failed: Disabled\n"); - return; - } - - rdmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache); - - if (!split_lock_verify_msr(true)) { - pr_info("MSR access failed: Disabled\n"); - return; - } - - /* Restore the MSR to its cached value. */ - wrmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache); - - setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT); -} - -/* - * MSR_TEST_CTRL is per core, but we treat it like a per CPU MSR. Locking - * is not implemented as one thread could undo the setting of the other - * thread immediately after dropping the lock anyway. - */ -static void sld_update_msr(bool on) -{ - u64 test_ctrl_val = msr_test_ctrl_cache; - - if (on) - test_ctrl_val |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT; - - wrmsrl(MSR_TEST_CTRL, test_ctrl_val); -} - -static void split_lock_init(void) -{ - /* - * #DB for bus lock handles ratelimit and #AC for split lock is - * disabled. - */ - if (sld_state == sld_ratelimit) { - split_lock_verify_msr(false); - return; - } - - if (cpu_model_supports_sld) - split_lock_verify_msr(sld_state != sld_off); -} - -static void __split_lock_reenable_unlock(struct work_struct *work) -{ - sld_update_msr(true); - up(&buslock_sem); -} - -static DECLARE_DELAYED_WORK(sl_reenable_unlock, __split_lock_reenable_unlock); - -static void __split_lock_reenable(struct work_struct *work) -{ - sld_update_msr(true); -} -static DECLARE_DELAYED_WORK(sl_reenable, __split_lock_reenable); - -/* - * If a CPU goes offline with pending delayed work to re-enable split lock - * detection then the delayed work will be executed on some other CPU. That - * handles releasing the buslock_sem, but because it executes on a - * different CPU probably won't re-enable split lock detection. This is a - * problem on HT systems since the sibling CPU on the same core may then be - * left running with split lock detection disabled. - * - * Unconditionally re-enable detection here. 
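The removed split_lock_verify_msr() above follows a write-and-verify pattern: toggle the split-lock bit in MSR_TEST_CTRL, write it back, re-read, and only trust the feature if the value stuck. A sketch of the same shape against a fake in-memory "MSR"; the bit position is illustrative only.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SPLIT_LOCK_DETECT_BIT (1ULL << 29)	/* illustrative bit position only */

static uint64_t fake_msr_test_ctrl;		/* stand-in for the real MSR */

/* Same shape as the removed split_lock_verify_msr(): write, read back, compare. */
static bool verify_toggle(bool on)
{
	uint64_t ctrl = fake_msr_test_ctrl;

	if (on)
		ctrl |= SPLIT_LOCK_DETECT_BIT;
	else
		ctrl &= ~SPLIT_LOCK_DETECT_BIT;

	fake_msr_test_ctrl = ctrl;		/* wrmsrl_safe() in the original */
	return fake_msr_test_ctrl == ctrl;	/* rdmsrl() and compare */
}

int main(void)
{
	printf("enable ok: %d, disable ok: %d\n", verify_toggle(true), verify_toggle(false));
	return 0;
}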
- */ -static int splitlock_cpu_offline(unsigned int cpu) -{ - sld_update_msr(true); - - return 0; -} - -static void split_lock_warn(unsigned long ip) -{ - struct delayed_work *work; - int cpu; - - if (!current->reported_split_lock) - pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n", - current->comm, current->pid, ip); - current->reported_split_lock = 1; - - if (sysctl_sld_mitigate) { - /* - * misery factor #1: - * sleep 10ms before trying to execute split lock. - */ - if (msleep_interruptible(10) > 0) - return; - /* - * Misery factor #2: - * only allow one buslocked disabled core at a time. - */ - if (down_interruptible(&buslock_sem) == -EINTR) - return; - work = &sl_reenable_unlock; - } else { - work = &sl_reenable; - } - - cpu = get_cpu(); - schedule_delayed_work_on(cpu, work, 2); - - /* Disable split lock detection on this CPU to make progress */ - sld_update_msr(false); - put_cpu(); -} - -bool handle_guest_split_lock(unsigned long ip) -{ - if (sld_state == sld_warn) { - split_lock_warn(ip); - return true; - } - - pr_warn_once("#AC: %s/%d %s split_lock trap at address: 0x%lx\n", - current->comm, current->pid, - sld_state == sld_fatal ? "fatal" : "bogus", ip); - - current->thread.error_code = 0; - current->thread.trap_nr = X86_TRAP_AC; - force_sig_fault(SIGBUS, BUS_ADRALN, NULL); - return false; -} -EXPORT_SYMBOL_GPL(handle_guest_split_lock); - -static void bus_lock_init(void) -{ - u64 val; - - if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) - return; - - rdmsrl(MSR_IA32_DEBUGCTLMSR, val); - - if ((boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && - (sld_state == sld_warn || sld_state == sld_fatal)) || - sld_state == sld_off) { - /* - * Warn and fatal are handled by #AC for split lock if #AC for - * split lock is supported. - */ - val &= ~DEBUGCTLMSR_BUS_LOCK_DETECT; - } else { - val |= DEBUGCTLMSR_BUS_LOCK_DETECT; - } - - wrmsrl(MSR_IA32_DEBUGCTLMSR, val); -} - -bool handle_user_split_lock(struct pt_regs *regs, long error_code) -{ - if ((regs->flags & X86_EFLAGS_AC) || sld_state == sld_fatal) - return false; - split_lock_warn(regs->ip); - return true; -} - -void handle_bus_lock(struct pt_regs *regs) -{ - switch (sld_state) { - case sld_off: - break; - case sld_ratelimit: - /* Enforce no more than bld_ratelimit bus locks/sec. */ - while (!__ratelimit(&bld_ratelimit)) - msleep(20); - /* Warn on the bus lock. */ - fallthrough; - case sld_warn: - pr_warn_ratelimited("#DB: %s/%d took a bus_lock trap at address: 0x%lx\n", - current->comm, current->pid, regs->ip); - break; - case sld_fatal: - force_sig_fault(SIGBUS, BUS_ADRALN, NULL); - break; - } -} - -/* - * CPU models that are known to have the per-core split-lock detection - * feature even though they do not enumerate IA32_CORE_CAPABILITIES. - */ -static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = { - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, 0), - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, 0), - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, 0), - {} -}; - -static void __init split_lock_setup(struct cpuinfo_x86 *c) -{ - const struct x86_cpu_id *m; - u64 ia32_core_caps; - - if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) - return; - - /* Check for CPUs that have support but do not enumerate it: */ - m = x86_match_cpu(split_lock_cpu_ids); - if (m) - goto supported; - - if (!cpu_has(c, X86_FEATURE_CORE_CAPABILITIES)) - return; - - /* - * Not all bits in MSR_IA32_CORE_CAPS are architectural, but - * MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT is. All CPUs that set - * it have split lock detection. 
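The enumeration path being removed here reduces to a small decision: never under a hypervisor, then either an explicit model-list match or the split-lock bit in MSR_IA32_CORE_CAPS when X86_FEATURE_CORE_CAPABILITIES is present. A hedged, self-contained sketch of that predicate (the bit position is an assumption, taken from MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Distilled from split_lock_setup(): detection is usable when the CPU is
 * either in the explicit model list or advertises the split-lock bit in
 * MSR_IA32_CORE_CAPS. Running under a hypervisor disables it outright. */
static bool sld_supported(bool hypervisor, bool in_model_list,
			  bool has_core_caps_cpuid, uint64_t core_caps_msr)
{
	const uint64_t SPLIT_LOCK_DETECT = 1ULL << 5;	/* assumed bit position */

	if (hypervisor)
		return false;
	if (in_model_list)
		return true;
	if (!has_core_caps_cpuid)
		return false;
	return core_caps_msr & SPLIT_LOCK_DETECT;
}

int main(void)
{
	printf("%d\n", sld_supported(false, false, true, 1ULL << 5));	/* 1 */
	printf("%d\n", sld_supported(true,  true,  true, ~0ULL));	/* 0 */
	return 0;
}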
- */ - rdmsrl(MSR_IA32_CORE_CAPS, ia32_core_caps); - if (ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT) - goto supported; - - /* CPU is not in the model list and does not have the MSR bit: */ - return; - -supported: - cpu_model_supports_sld = true; - __split_lock_setup(); -} - -static void sld_state_show(void) -{ - if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) && - !boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) - return; - - switch (sld_state) { - case sld_off: - pr_info("disabled\n"); - break; - case sld_warn: - if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) { - pr_info("#AC: crashing the kernel on kernel split_locks and warning on user-space split_locks\n"); - if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, - "x86/splitlock", NULL, splitlock_cpu_offline) < 0) - pr_warn("No splitlock CPU offline handler\n"); - } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) { - pr_info("#DB: warning on user-space bus_locks\n"); - } - break; - case sld_fatal: - if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) { - pr_info("#AC: crashing the kernel on kernel split_locks and sending SIGBUS on user-space split_locks\n"); - } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) { - pr_info("#DB: sending SIGBUS on user-space bus_locks%s\n", - boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) ? - " from non-WB" : ""); - } - break; - case sld_ratelimit: - if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) - pr_info("#DB: setting system wide bus lock rate limit to %u/sec\n", bld_ratelimit.burst); - break; - } -} - -void __init sld_setup(struct cpuinfo_x86 *c) -{ - split_lock_setup(c); - sld_state_setup(); - sld_state_show(); -} - #define X86_HYBRID_CPU_TYPE_ID_SHIFT 24 /** @@ -1364,3 +907,18 @@ u8 get_this_hybrid_cpu_type(void) return cpuid_eax(0x0000001a) >> X86_HYBRID_CPU_TYPE_ID_SHIFT; } + +/** + * get_this_hybrid_cpu_native_id() - Get the native id of this hybrid CPU + * + * Returns the uarch native ID [23:0] of a CPU in a hybrid processor. + * If the processor is not hybrid, returns 0. + */ +u32 get_this_hybrid_cpu_native_id(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) + return 0; + + return cpuid_eax(0x0000001a) & + (BIT_ULL(X86_HYBRID_CPU_TYPE_ID_SHIFT) - 1); +} diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c index e4c3ba91321c..30b1d63b97f3 100644 --- a/arch/x86/kernel/cpu/intel_epb.c +++ b/arch/x86/kernel/cpu/intel_epb.c @@ -204,12 +204,12 @@ static int intel_epb_offline(unsigned int cpu) } static const struct x86_cpu_id intel_epb_normal[] = { - X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, - ENERGY_PERF_BIAS_NORMAL_POWERSAVE), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, - ENERGY_PERF_BIAS_NORMAL_POWERSAVE), - X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, - ENERGY_PERF_BIAS_NORMAL_POWERSAVE), + X86_MATCH_VFM(INTEL_ALDERLAKE_L, + ENERGY_PERF_BIAS_NORMAL_POWERSAVE), + X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, + ENERGY_PERF_BIAS_NORMAL_POWERSAVE), + X86_MATCH_VFM(INTEL_RAPTORLAKE_P, + ENERGY_PERF_BIAS_NORMAL_POWERSAVE), {} }; @@ -237,4 +237,4 @@ err_out_online: cpuhp_remove_state(CPUHP_AP_X86_INTEL_EPB_ONLINE); return ret; } -subsys_initcall(intel_epb_init); +late_initcall(intel_epb_init); diff --git a/arch/x86/kernel/cpu/intel_pconfig.c b/arch/x86/kernel/cpu/intel_pconfig.c deleted file mode 100644 index 0771a905b286..000000000000 --- a/arch/x86/kernel/cpu/intel_pconfig.c +++ /dev/null @@ -1,82 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Intel PCONFIG instruction support. - * - * Copyright (C) 2017 Intel Corporation - * - * Author: - * Kirill A. 
Shutemov <kirill.shutemov@linux.intel.com> - */ - -#include <asm/cpufeature.h> -#include <asm/intel_pconfig.h> - -#define PCONFIG_CPUID 0x1b - -#define PCONFIG_CPUID_SUBLEAF_MASK ((1 << 12) - 1) - -/* Subleaf type (EAX) for PCONFIG CPUID leaf (0x1B) */ -enum { - PCONFIG_CPUID_SUBLEAF_INVALID = 0, - PCONFIG_CPUID_SUBLEAF_TARGETID = 1, -}; - -/* Bitmask of supported targets */ -static u64 targets_supported __read_mostly; - -int pconfig_target_supported(enum pconfig_target target) -{ - /* - * We would need to re-think the implementation once we get > 64 - * PCONFIG targets. Spec allows up to 2^32 targets. - */ - BUILD_BUG_ON(PCONFIG_TARGET_NR >= 64); - - if (WARN_ON_ONCE(target >= 64)) - return 0; - return targets_supported & (1ULL << target); -} - -static int __init intel_pconfig_init(void) -{ - int subleaf; - - if (!boot_cpu_has(X86_FEATURE_PCONFIG)) - return 0; - - /* - * Scan subleafs of PCONFIG CPUID leaf. - * - * Subleafs of the same type need not to be consecutive. - * - * Stop on the first invalid subleaf type. All subleafs after the first - * invalid are invalid too. - */ - for (subleaf = 0; subleaf < INT_MAX; subleaf++) { - struct cpuid_regs regs; - - cpuid_count(PCONFIG_CPUID, subleaf, - ®s.eax, ®s.ebx, ®s.ecx, ®s.edx); - - switch (regs.eax & PCONFIG_CPUID_SUBLEAF_MASK) { - case PCONFIG_CPUID_SUBLEAF_INVALID: - /* Stop on the first invalid subleaf */ - goto out; - case PCONFIG_CPUID_SUBLEAF_TARGETID: - /* Mark supported PCONFIG targets */ - if (regs.ebx < 64) - targets_supported |= (1ULL << regs.ebx); - if (regs.ecx < 64) - targets_supported |= (1ULL << regs.ecx); - if (regs.edx < 64) - targets_supported |= (1ULL << regs.edx); - break; - default: - /* Unknown CPUID.PCONFIG subleaf: ignore */ - break; - } - } -out: - return 0; -} -arch_initcall(intel_pconfig_init); diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index ad6776081e60..4f3c65429f82 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -6,7 +6,7 @@ #include <linux/slab.h> /** - * x86_match_cpu - match current CPU again an array of x86_cpu_ids + * x86_match_cpu - match current CPU against an array of x86_cpu_ids * @match: Pointer to array of x86_cpu_ids. Last entry terminated with * {}. * @@ -17,8 +17,7 @@ * * A typical table entry would be to match a specific CPU * - * X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, INTEL_FAM6_BROADWELL, - * X86_FEATURE_ANY, NULL); + * X86_MATCH_VFM_FEATURE(INTEL_BROADWELL, X86_FEATURE_ANY, NULL); * * Fields can be wildcarded with %X86_VENDOR_ANY, %X86_FAMILY_ANY, * %X86_MODEL_ANY, %X86_FEATURE_ANY (except for vendor) @@ -26,7 +25,7 @@ * asm/cpu_device_id.h contains a set of useful macros which are shortcuts * for various common selections. The above can be shortened to: * - * X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, NULL); + * X86_MATCH_VFM(INTEL_BROADWELL, NULL); * * Arrays used to match for this should also be declared using * MODULE_DEVICE_TABLE(x86cpu, ...) 
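The updated kernel-doc above switches the examples over to the VFM helpers. A sketch of what a driver-side table might look like with the new macros; the mydrv_* names are purely illustrative and the entries shown are the ones the kernel-doc itself uses:

#include <linux/module.h>
#include <asm/cpu_device_id.h>

static const struct x86_cpu_id mydrv_cpu_ids[] = {
	X86_MATCH_VFM(INTEL_BROADWELL, NULL),
	X86_MATCH_VFM_FEATURE(INTEL_SKYLAKE_X, X86_FEATURE_ANY, NULL),
	{}	/* terminator, still required */
};
MODULE_DEVICE_TABLE(x86cpu, mydrv_cpu_ids);

static int __init mydrv_init(void)
{
	/* x86_match_cpu() returns NULL when the running CPU matches no entry. */
	if (!x86_match_cpu(mydrv_cpu_ids))
		return -ENODEV;
	return 0;
}
module_init(mydrv_init);

MODULE_LICENSE("GPL");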
@@ -39,9 +38,7 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match) const struct x86_cpu_id *m; struct cpuinfo_x86 *c = &boot_cpu_data; - for (m = match; - m->vendor | m->family | m->model | m->steppings | m->feature; - m++) { + for (m = match; m->flags & X86_CPU_ID_FLAG_ENTRY_VALID; m++) { if (m->vendor != X86_VENDOR_ANY && c->x86_vendor != m->vendor) continue; if (m->family != X86_FAMILY_ANY && c->x86 != m->family) @@ -59,33 +56,13 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match) } EXPORT_SYMBOL(x86_match_cpu); -static const struct x86_cpu_desc * -x86_match_cpu_with_stepping(const struct x86_cpu_desc *match) +bool x86_match_min_microcode_rev(const struct x86_cpu_id *table) { - struct cpuinfo_x86 *c = &boot_cpu_data; - const struct x86_cpu_desc *m; - - for (m = match; m->x86_family | m->x86_model; m++) { - if (c->x86_vendor != m->x86_vendor) - continue; - if (c->x86 != m->x86_family) - continue; - if (c->x86_model != m->x86_model) - continue; - if (c->x86_stepping != m->x86_stepping) - continue; - return m; - } - return NULL; -} - -bool x86_cpu_has_min_microcode_rev(const struct x86_cpu_desc *table) -{ - const struct x86_cpu_desc *res = x86_match_cpu_with_stepping(table); + const struct x86_cpu_id *res = x86_match_cpu(table); - if (!res || res->x86_microcode_rev > boot_cpu_data.microcode) + if (!res || res->driver_data > boot_cpu_data.microcode) return false; return true; } -EXPORT_SYMBOL_GPL(x86_cpu_has_min_microcode_rev); +EXPORT_SYMBOL_GPL(x86_match_min_microcode_rev); diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index f3517b8a8e91..1075a90141da 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -4,8 +4,6 @@ * * Written by Jacob Shin - AMD, Inc. * Maintained by: Borislav Petkov <bp@alien8.de> - * - * All MC4_MISCi registers are shared between cores on a node. */ #include <linux/interrupt.h> #include <linux/notifier.h> @@ -20,7 +18,6 @@ #include <linux/smp.h> #include <linux/string.h> -#include <asm/amd_nb.h> #include <asm/traps.h> #include <asm/apic.h> #include <asm/mce.h> @@ -87,42 +84,40 @@ struct smca_bank { static DEFINE_PER_CPU_READ_MOSTLY(struct smca_bank[MAX_NR_BANKS], smca_banks); static DEFINE_PER_CPU_READ_MOSTLY(u8[N_SMCA_BANK_TYPES], smca_bank_counts); -struct smca_bank_name { - const char *name; /* Short name for sysfs */ - const char *long_name; /* Long name for pretty-printing */ -}; - -static struct smca_bank_name smca_names[] = { - [SMCA_LS ... SMCA_LS_V2] = { "load_store", "Load Store Unit" }, - [SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" }, - [SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" }, - [SMCA_DE] = { "decode_unit", "Decode Unit" }, - [SMCA_RESERVED] = { "reserved", "Reserved" }, - [SMCA_EX] = { "execution_unit", "Execution Unit" }, - [SMCA_FP] = { "floating_point", "Floating Point Unit" }, - [SMCA_L3_CACHE] = { "l3_cache", "L3 Cache" }, - [SMCA_CS ... SMCA_CS_V2] = { "coherent_slave", "Coherent Slave" }, - [SMCA_PIE] = { "pie", "Power, Interrupts, etc." }, +static const char * const smca_names[] = { + [SMCA_LS ... SMCA_LS_V2] = "load_store", + [SMCA_IF] = "insn_fetch", + [SMCA_L2_CACHE] = "l2_cache", + [SMCA_DE] = "decode_unit", + [SMCA_RESERVED] = "reserved", + [SMCA_EX] = "execution_unit", + [SMCA_FP] = "floating_point", + [SMCA_L3_CACHE] = "l3_cache", + [SMCA_CS ... SMCA_CS_V2] = "coherent_slave", + [SMCA_PIE] = "pie", /* UMC v2 is separate because both of them can exist in a single system. 
*/ - [SMCA_UMC] = { "umc", "Unified Memory Controller" }, - [SMCA_UMC_V2] = { "umc_v2", "Unified Memory Controller v2" }, - [SMCA_PB] = { "param_block", "Parameter Block" }, - [SMCA_PSP ... SMCA_PSP_V2] = { "psp", "Platform Security Processor" }, - [SMCA_SMU ... SMCA_SMU_V2] = { "smu", "System Management Unit" }, - [SMCA_MP5] = { "mp5", "Microprocessor 5 Unit" }, - [SMCA_MPDMA] = { "mpdma", "MPDMA Unit" }, - [SMCA_NBIO] = { "nbio", "Northbridge IO Unit" }, - [SMCA_PCIE ... SMCA_PCIE_V2] = { "pcie", "PCI Express Unit" }, - [SMCA_XGMI_PCS] = { "xgmi_pcs", "Ext Global Memory Interconnect PCS Unit" }, - [SMCA_NBIF] = { "nbif", "NBIF Unit" }, - [SMCA_SHUB] = { "shub", "System Hub Unit" }, - [SMCA_SATA] = { "sata", "SATA Unit" }, - [SMCA_USB] = { "usb", "USB Unit" }, - [SMCA_GMI_PCS] = { "gmi_pcs", "Global Memory Interconnect PCS Unit" }, - [SMCA_XGMI_PHY] = { "xgmi_phy", "Ext Global Memory Interconnect PHY Unit" }, - [SMCA_WAFL_PHY] = { "wafl_phy", "WAFL PHY Unit" }, - [SMCA_GMI_PHY] = { "gmi_phy", "Global Memory Interconnect PHY Unit" }, + [SMCA_UMC] = "umc", + [SMCA_UMC_V2] = "umc_v2", + [SMCA_MA_LLC] = "ma_llc", + [SMCA_PB] = "param_block", + [SMCA_PSP ... SMCA_PSP_V2] = "psp", + [SMCA_SMU ... SMCA_SMU_V2] = "smu", + [SMCA_MP5] = "mp5", + [SMCA_MPDMA] = "mpdma", + [SMCA_NBIO] = "nbio", + [SMCA_PCIE ... SMCA_PCIE_V2] = "pcie", + [SMCA_XGMI_PCS] = "xgmi_pcs", + [SMCA_NBIF] = "nbif", + [SMCA_SHUB] = "shub", + [SMCA_SATA] = "sata", + [SMCA_USB] = "usb", + [SMCA_USR_DP] = "usr_dp", + [SMCA_USR_CP] = "usr_cp", + [SMCA_GMI_PCS] = "gmi_pcs", + [SMCA_XGMI_PHY] = "xgmi_phy", + [SMCA_WAFL_PHY] = "wafl_phy", + [SMCA_GMI_PHY] = "gmi_phy", }; static const char *smca_get_name(enum smca_bank_types t) @@ -130,17 +125,8 @@ static const char *smca_get_name(enum smca_bank_types t) if (t >= N_SMCA_BANK_TYPES) return NULL; - return smca_names[t].name; -} - -const char *smca_get_long_name(enum smca_bank_types t) -{ - if (t >= N_SMCA_BANK_TYPES) - return NULL; - - return smca_names[t].long_name; + return smca_names[t]; } -EXPORT_SYMBOL_GPL(smca_get_long_name); enum smca_bank_types smca_get_bank_type(unsigned int cpu, unsigned int bank) { @@ -178,6 +164,7 @@ static const struct smca_hwid smca_hwid_mcatypes[] = { { SMCA_CS, HWID_MCATYPE(0x2E, 0x0) }, { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1) }, { SMCA_CS_V2, HWID_MCATYPE(0x2E, 0x2) }, + { SMCA_MA_LLC, HWID_MCATYPE(0x2E, 0x4) }, /* Unified Memory Controller MCA type */ { SMCA_UMC, HWID_MCATYPE(0x96, 0x0) }, @@ -212,6 +199,8 @@ static const struct smca_hwid smca_hwid_mcatypes[] = { { SMCA_SHUB, HWID_MCATYPE(0x80, 0x0) }, { SMCA_SATA, HWID_MCATYPE(0xA8, 0x0) }, { SMCA_USB, HWID_MCATYPE(0xAA, 0x0) }, + { SMCA_USR_DP, HWID_MCATYPE(0x170, 0x0) }, + { SMCA_USR_CP, HWID_MCATYPE(0x180, 0x0) }, { SMCA_GMI_PCS, HWID_MCATYPE(0x241, 0x0) }, { SMCA_XGMI_PHY, HWID_MCATYPE(0x259, 0x0) }, { SMCA_WAFL_PHY, HWID_MCATYPE(0x267, 0x0) }, @@ -229,6 +218,32 @@ static const struct smca_hwid smca_hwid_mcatypes[] = { #define MAX_MCATYPE_NAME_LEN 30 static char buf_mcatype[MAX_MCATYPE_NAME_LEN]; +struct threshold_block { + /* This block's number within its bank. */ + unsigned int block; + /* MCA bank number that contains this block. */ + unsigned int bank; + /* CPU which controls this block's MCA bank. */ + unsigned int cpu; + /* MCA_MISC MSR address for this block. */ + u32 address; + /* Enable/Disable APIC interrupt. */ + bool interrupt_enable; + /* Bank can generate an interrupt. */ + bool interrupt_capable; + /* Value upon which threshold interrupt is generated. 
*/ + u16 threshold_limit; + /* sysfs object */ + struct kobject kobj; + /* List of threshold blocks within this block's MCA bank. */ + struct list_head miscj; +}; + +struct threshold_bank { + struct kobject *kobj; + struct threshold_block *blocks; +}; + static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); /* @@ -341,19 +356,6 @@ struct thresh_restart { u16 old_limit; }; -static inline bool is_shared_bank(int bank) -{ - /* - * Scalable MCA provides for only one core to have access to the MSRs of - * a shared bank. - */ - if (mce_flags.smca) - return false; - - /* Bank 4 is for northbridge reporting and is thus shared */ - return (bank == 4); -} - static const char *bank4_names(const struct threshold_block *b) { switch (b->address) { @@ -389,7 +391,7 @@ static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits) return msr_high_bits & BIT(28); } -static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) +static bool lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) { int msr = (hi & MASK_LVTOFF_HI) >> 20; @@ -397,7 +399,7 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt " "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu, b->bank, b->block, b->address, hi, lo); - return 0; + return false; } if (apic != msr) { @@ -407,15 +409,15 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) * was set is reserved. Return early here: */ if (mce_flags.smca) - return 0; + return false; pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d " "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu, apic, b->bank, b->block, b->address, hi, lo); - return 0; + return false; } - return 1; + return true; }; /* Reprogram MCx_MISC MSR behind this threshold bank. 
*/ @@ -786,29 +788,33 @@ bool amd_mce_usable_address(struct mce *m) static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) { - struct mce m; + struct mce_hw_err err; + struct mce *m = &err.m; - mce_setup(&m); + mce_prep_record(&err); - m.status = status; - m.misc = misc; - m.bank = bank; - m.tsc = rdtsc(); + m->status = status; + m->misc = misc; + m->bank = bank; + m->tsc = rdtsc(); - if (m.status & MCI_STATUS_ADDRV) { - m.addr = addr; + if (m->status & MCI_STATUS_ADDRV) { + m->addr = addr; - smca_extract_err_addr(&m); + smca_extract_err_addr(m); } if (mce_flags.smca) { - rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m.ipid); + rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m->ipid); - if (m.status & MCI_STATUS_SYNDV) - rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m.synd); + if (m->status & MCI_STATUS_SYNDV) { + rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m->synd); + rdmsrl(MSR_AMD64_SMCA_MCx_SYND1(bank), err.vendor.amd.synd1); + rdmsrl(MSR_AMD64_SMCA_MCx_SYND2(bank), err.vendor.amd.synd2); + } } - mce_log(&m); + mce_log(&err); } DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error) @@ -1202,35 +1208,10 @@ out_free: return err; } -static int __threshold_add_blocks(struct threshold_bank *b) -{ - struct list_head *head = &b->blocks->miscj; - struct threshold_block *pos = NULL; - struct threshold_block *tmp = NULL; - int err = 0; - - err = kobject_add(&b->blocks->kobj, b->kobj, b->blocks->kobj.name); - if (err) - return err; - - list_for_each_entry_safe(pos, tmp, head, miscj) { - - err = kobject_add(&pos->kobj, b->kobj, pos->kobj.name); - if (err) { - list_for_each_entry_safe_reverse(pos, tmp, head, miscj) - kobject_del(&pos->kobj); - - return err; - } - } - return err; -} - static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu, unsigned int bank) { struct device *dev = this_cpu_read(mce_device); - struct amd_northbridge *nb = NULL; struct threshold_bank *b = NULL; const char *name = get_name(cpu, bank, NULL); int err = 0; @@ -1238,26 +1219,6 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu, if (!dev) return -ENODEV; - if (is_shared_bank(bank)) { - nb = node_to_amd_nb(topology_die_id(cpu)); - - /* threshold descriptor already initialized on this node? 
*/ - if (nb && nb->bank4) { - /* yes, use it */ - b = nb->bank4; - err = kobject_add(b->kobj, &dev->kobj, name); - if (err) - goto out; - - bp[bank] = b; - refcount_inc(&b->cpus); - - err = __threshold_add_blocks(b); - - goto out; - } - } - b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL); if (!b) { err = -ENOMEM; @@ -1271,17 +1232,6 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu, goto out_free; } - if (is_shared_bank(bank)) { - b->shared = 1; - refcount_set(&b->cpus, 1); - - /* nb is already initialized, see above */ - if (nb) { - WARN_ON(nb->bank4); - nb->bank4 = b; - } - } - err = allocate_threshold_blocks(cpu, b, bank, 0, mca_msr_reg(bank, MCA_MISC)); if (err) goto out_kobj; @@ -1314,40 +1264,11 @@ static void deallocate_threshold_blocks(struct threshold_bank *bank) kobject_put(&bank->blocks->kobj); } -static void __threshold_remove_blocks(struct threshold_bank *b) -{ - struct threshold_block *pos = NULL; - struct threshold_block *tmp = NULL; - - kobject_put(b->kobj); - - list_for_each_entry_safe(pos, tmp, &b->blocks->miscj, miscj) - kobject_put(b->kobj); -} - static void threshold_remove_bank(struct threshold_bank *bank) { - struct amd_northbridge *nb; - if (!bank->blocks) goto out_free; - if (!bank->shared) - goto out_dealloc; - - if (!refcount_dec_and_test(&bank->cpus)) { - __threshold_remove_blocks(bank); - return; - } else { - /* - * The last CPU on this node using the shared bank is going - * away, remove that bank now. - */ - nb = node_to_amd_nb(topology_die_id(smp_processor_id())); - nb->bank4 = NULL; - } - -out_dealloc: deallocate_threshold_blocks(bank); out_free: diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c index 7f7309ff67d0..0a89947e47bc 100644 --- a/arch/x86/kernel/cpu/mce/apei.c +++ b/arch/x86/kernel/cpu/mce/apei.c @@ -28,7 +28,8 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) { - struct mce m; + struct mce_hw_err err; + struct mce *m; int lsb; if (!(mem_err->validation_bits & CPER_MEM_VALID_PA)) @@ -44,30 +45,33 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) else lsb = PAGE_SHIFT; - mce_setup(&m); - m.bank = -1; + mce_prep_record(&err); + m = &err.m; + m->bank = -1; /* Fake a memory read error with unknown channel */ - m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | MCI_STATUS_MISCV | 0x9f; - m.misc = (MCI_MISC_ADDR_PHYS << 6) | lsb; + m->status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | MCI_STATUS_MISCV | 0x9f; + m->misc = (MCI_MISC_ADDR_PHYS << 6) | lsb; if (severity >= GHES_SEV_RECOVERABLE) - m.status |= MCI_STATUS_UC; + m->status |= MCI_STATUS_UC; if (severity >= GHES_SEV_PANIC) { - m.status |= MCI_STATUS_PCC; - m.tsc = rdtsc(); + m->status |= MCI_STATUS_PCC; + m->tsc = rdtsc(); } - m.addr = mem_err->physical_addr; - mce_log(&m); + m->addr = mem_err->physical_addr; + mce_log(&err); } EXPORT_SYMBOL_GPL(apei_mce_report_mem_error); int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) { const u64 *i_mce = ((const u64 *) (ctx_info + 1)); - unsigned int cpu; - struct mce m; + unsigned int cpu, num_regs; + bool apicid_found = false; + struct mce_hw_err err; + struct mce *m; if (!boot_cpu_has(X86_FEATURE_SMCA)) return -EINVAL; @@ -85,41 +89,86 @@ int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) return -EINVAL; /* - * The register array size must be large enough to include all the - * SMCA registers which need to be extracted. 
- * * The number of registers in the register array is determined by * Register Array Size/8 as defined in UEFI spec v2.8, sec N.2.4.2.2. - * The register layout is fixed and currently the raw data in the - * register array includes 6 SMCA registers which the kernel can - * extract. + * Sanity-check registers array size. */ - if (ctx_info->reg_arr_size < 48) + num_regs = ctx_info->reg_arr_size >> 3; + if (!num_regs) return -EINVAL; - mce_setup(&m); - - m.extcpu = -1; - m.socketid = -1; - for_each_possible_cpu(cpu) { if (cpu_data(cpu).topo.initial_apicid == lapic_id) { - m.extcpu = cpu; - m.socketid = cpu_data(m.extcpu).topo.pkg_id; + apicid_found = true; break; } } - m.apicid = lapic_id; - m.bank = (ctx_info->msr_addr >> 4) & 0xFF; - m.status = *i_mce; - m.addr = *(i_mce + 1); - m.misc = *(i_mce + 2); - /* Skipping MCA_CONFIG */ - m.ipid = *(i_mce + 4); - m.synd = *(i_mce + 5); + if (!apicid_found) + return -EINVAL; + + m = &err.m; + memset(&err, 0, sizeof(struct mce_hw_err)); + mce_prep_record_common(m); + mce_prep_record_per_cpu(cpu, m); + + m->bank = (ctx_info->msr_addr >> 4) & 0xFF; + + /* + * The SMCA register layout is fixed and includes 16 registers. + * The end of the array may be variable, but the beginning is known. + * Cap the number of registers to expected max (15). + */ + if (num_regs > 15) + num_regs = 15; + + switch (num_regs) { + /* MCA_SYND2 */ + case 15: + err.vendor.amd.synd2 = *(i_mce + 14); + fallthrough; + /* MCA_SYND1 */ + case 14: + err.vendor.amd.synd1 = *(i_mce + 13); + fallthrough; + /* MCA_MISC4 */ + case 13: + /* MCA_MISC3 */ + case 12: + /* MCA_MISC2 */ + case 11: + /* MCA_MISC1 */ + case 10: + /* MCA_DEADDR */ + case 9: + /* MCA_DESTAT */ + case 8: + /* reserved */ + case 7: + /* MCA_SYND */ + case 6: + m->synd = *(i_mce + 5); + fallthrough; + /* MCA_IPID */ + case 5: + m->ipid = *(i_mce + 4); + fallthrough; + /* MCA_CONFIG */ + case 4: + /* MCA_MISC0 */ + case 3: + m->misc = *(i_mce + 2); + fallthrough; + /* MCA_ADDR */ + case 2: + m->addr = *(i_mce + 1); + fallthrough; + /* MCA_STATUS */ + case 1: + m->status = *i_mce; + } - mce_log(&m); + mce_log(&err); return 0; } diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 7b397370b4d6..0dc00c9894c7 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -44,14 +44,17 @@ #include <linux/sync_core.h> #include <linux/task_work.h> #include <linux/hardirq.h> +#include <linux/kexec.h> -#include <asm/intel-family.h> +#include <asm/fred.h> +#include <asm/cpu_device_id.h> #include <asm/processor.h> #include <asm/traps.h> #include <asm/tlbflush.h> #include <asm/mce.h> #include <asm/msr.h> #include <asm/reboot.h> +#include <asm/tdx.h> #include "internal.h" @@ -85,7 +88,7 @@ struct mca_config mca_cfg __read_mostly = { .monarch_timeout = -1 }; -static DEFINE_PER_CPU(struct mce, mces_seen); +static DEFINE_PER_CPU(struct mce_hw_err, hw_errs_seen); static unsigned long mce_need_notify; /* @@ -114,28 +117,41 @@ static struct irq_work mce_irq_work; */ BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain); -/* Do initial initialization of a struct mce */ -void mce_setup(struct mce *m) +void mce_prep_record_common(struct mce *m) { - memset(m, 0, sizeof(struct mce)); - m->cpu = m->extcpu = smp_processor_id(); + m->cpuid = cpuid_eax(1); + m->cpuvendor = boot_cpu_data.x86_vendor; + m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP); /* need the internal __ version to avoid deadlocks */ - m->time = __ktime_get_real_seconds(); - m->cpuvendor = boot_cpu_data.x86_vendor; - m->cpuid = cpuid_eax(1); - 
m->socketid = cpu_data(m->extcpu).topo.pkg_id; - m->apicid = cpu_data(m->extcpu).topo.initial_apicid; - m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP); - m->ppin = cpu_data(m->extcpu).ppin; - m->microcode = boot_cpu_data.microcode; + m->time = __ktime_get_real_seconds(); +} + +void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m) +{ + m->cpu = cpu; + m->extcpu = cpu; + m->apicid = cpu_data(cpu).topo.initial_apicid; + m->microcode = cpu_data(cpu).microcode; + m->ppin = topology_ppin(cpu); + m->socketid = topology_physical_package_id(cpu); +} + +/* Do initial initialization of struct mce_hw_err */ +void mce_prep_record(struct mce_hw_err *err) +{ + struct mce *m = &err->m; + + memset(err, 0, sizeof(struct mce_hw_err)); + mce_prep_record_common(m); + mce_prep_record_per_cpu(smp_processor_id(), m); } DEFINE_PER_CPU(struct mce, injectm); EXPORT_PER_CPU_SYMBOL_GPL(injectm); -void mce_log(struct mce *m) +void mce_log(struct mce_hw_err *err) { - if (!mce_gen_pool_add(m)) + if (mce_gen_pool_add(err)) irq_work_queue(&mce_irq_work); } EXPORT_SYMBOL_GPL(mce_log); @@ -156,8 +172,10 @@ void mce_unregister_decode_chain(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); -static void __print_mce(struct mce *m) +static void __print_mce(struct mce_hw_err *err) { + struct mce *m = &err->m; + pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n", m->extcpu, (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""), @@ -184,6 +202,10 @@ static void __print_mce(struct mce *m) if (mce_flags.smca) { if (m->synd) pr_cont("SYND %llx ", m->synd); + if (err->vendor.amd.synd1) + pr_cont("SYND1 %llx ", err->vendor.amd.synd1); + if (err->vendor.amd.synd2) + pr_cont("SYND2 %llx ", err->vendor.amd.synd2); if (m->ipid) pr_cont("IPID %llx ", m->ipid); } @@ -199,9 +221,11 @@ static void __print_mce(struct mce *m) m->microcode); } -static void print_mce(struct mce *m) +static void print_mce(struct mce_hw_err *err) { - __print_mce(m); + struct mce *m = &err->m; + + __print_mce(err); if (m->cpuvendor != X86_VENDOR_AMD && m->cpuvendor != X86_VENDOR_HYGON) pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n"); @@ -228,11 +252,20 @@ static void wait_for_panic(void) panic("Panicing machine check CPU died"); } -static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) +static const char *mce_dump_aux_info(struct mce *m) +{ + if (boot_cpu_has_bug(X86_BUG_TDX_PW_MCE)) + return tdx_dump_mce_info(m); + + return NULL; +} + +static noinstr void mce_panic(const char *msg, struct mce_hw_err *final, char *exp) { struct llist_node *pending; struct mce_evt_llist *l; int apei_err = 0; + const char *memmsg; /* * Allow instrumentation around external facilities usage. 
Not that it @@ -258,20 +291,22 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) pending = mce_gen_pool_prepare_records(); /* First print corrected ones that are still unlogged */ llist_for_each_entry(l, pending, llnode) { - struct mce *m = &l->mce; + struct mce_hw_err *err = &l->err; + struct mce *m = &err->m; if (!(m->status & MCI_STATUS_UC)) { - print_mce(m); + print_mce(err); if (!apei_err) apei_err = apei_write_mce(m); } } /* Now print uncorrected but with the final one last */ llist_for_each_entry(l, pending, llnode) { - struct mce *m = &l->mce; + struct mce_hw_err *err = &l->err; + struct mce *m = &err->m; if (!(m->status & MCI_STATUS_UC)) continue; - if (!final || mce_cmp(m, final)) { - print_mce(m); + if (!final || mce_cmp(m, &final->m)) { + print_mce(err); if (!apei_err) apei_err = apei_write_mce(m); } @@ -279,13 +314,33 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) if (final) { print_mce(final); if (!apei_err) - apei_err = apei_write_mce(final); + apei_err = apei_write_mce(&final->m); } if (exp) pr_emerg(HW_ERR "Machine check: %s\n", exp); + + memmsg = mce_dump_aux_info(&final->m); + if (memmsg) + pr_emerg(HW_ERR "Machine check: %s\n", memmsg); + if (!fake_panic) { if (panic_timeout == 0) panic_timeout = mca_cfg.panic_timeout; + + /* + * Kdump skips the poisoned page in order to avoid + * touching the error bits again. Poison the page even + * if the error is fatal and the machine is about to + * panic. + */ + if (kexec_crash_loaded()) { + if (final && (final->m.status & MCI_STATUS_ADDRV)) { + struct page *p; + p = pfn_to_online_page(final->m.addr >> PAGE_SHIFT); + if (p) + SetPageHWPoison(p); + } + } panic(msg); } else pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg); @@ -401,16 +456,18 @@ static noinstr void mce_wrmsrl(u32 msr, u64 v) * check into our "mce" struct so that we can use it later to assess * the severity of the problem as we read per-bank specific details. */ -static noinstr void mce_gather_info(struct mce *m, struct pt_regs *regs) +static noinstr void mce_gather_info(struct mce_hw_err *err, struct pt_regs *regs) { + struct mce *m; /* - * Enable instrumentation around mce_setup() which calls external + * Enable instrumentation around mce_prep_record() which calls external * facilities. 
*/ instrumentation_begin(); - mce_setup(m); + mce_prep_record(err); instrumentation_end(); + m = &err->m; m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); if (regs) { /* @@ -435,10 +492,10 @@ static noinstr void mce_gather_info(struct mce *m, struct pt_regs *regs) } } -int mce_available(struct cpuinfo_x86 *c) +bool mce_available(struct cpuinfo_x86 *c) { if (mca_cfg.disabled) - return 0; + return false; return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); } @@ -530,13 +587,13 @@ EXPORT_SYMBOL_GPL(mce_is_correctable); static int mce_early_notifier(struct notifier_block *nb, unsigned long val, void *data) { - struct mce *m = (struct mce *)data; + struct mce_hw_err *err = to_mce_hw_err(data); - if (!m) + if (!err) return NOTIFY_DONE; /* Emit the trace record: */ - trace_mce_record(m); + trace_mce_record(err); set_bit(0, &mce_need_notify); @@ -580,13 +637,13 @@ static struct notifier_block mce_uc_nb = { static int mce_default_notifier(struct notifier_block *nb, unsigned long val, void *data) { - struct mce *m = (struct mce *)data; + struct mce_hw_err *err = to_mce_hw_err(data); - if (!m) + if (!err) return NOTIFY_DONE; - if (mca_cfg.print_all || !m->kflags) - __print_mce(m); + if (mca_cfg.print_all || !(err->m.kflags)) + __print_mce(err); return NOTIFY_DONE; } @@ -600,8 +657,10 @@ static struct notifier_block mce_default_nb = { /* * Read ADDR and MISC registers. */ -static noinstr void mce_read_aux(struct mce *m, int i) +static noinstr void mce_read_aux(struct mce_hw_err *err, int i) { + struct mce *m = &err->m; + if (m->status & MCI_STATUS_MISCV) m->misc = mce_rdmsrl(mca_msr_reg(i, MCA_MISC)); @@ -623,8 +682,11 @@ static noinstr void mce_read_aux(struct mce *m, int i) if (mce_flags.smca) { m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i)); - if (m->status & MCI_STATUS_SYNDV) + if (m->status & MCI_STATUS_SYNDV) { m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i)); + err->vendor.amd.synd1 = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND1(i)); + err->vendor.amd.synd2 = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND2(i)); + } } } @@ -645,40 +707,51 @@ DEFINE_PER_CPU(unsigned, mce_poll_count); * is already totally * confused. In this case it's likely it will * not fully execute the machine check handler either. */ -bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) +void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) { struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); - bool error_seen = false; - struct mce m; + struct mce_hw_err err; + struct mce *m; int i; this_cpu_inc(mce_poll_count); - mce_gather_info(&m, NULL); + mce_gather_info(&err, NULL); + m = &err.m; if (flags & MCP_TIMESTAMP) - m.tsc = rdtsc(); + m->tsc = rdtsc(); for (i = 0; i < this_cpu_read(mce_num_banks); i++) { if (!mce_banks[i].ctl || !test_bit(i, *b)) continue; - m.misc = 0; - m.addr = 0; - m.bank = i; + m->misc = 0; + m->addr = 0; + m->bank = i; barrier(); - m.status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); + m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); + + /* + * Update storm tracking here, before checking for the + * MCI_STATUS_VAL bit. Valid corrected errors count + * towards declaring, or maintaining, storm status. No + * error in a bank counts towards avoiding, or ending, + * storm status. + */ + if (!mca_cfg.cmci_disabled) + mce_track_storm(m); /* If this entry is not valid, ignore it */ - if (!(m.status & MCI_STATUS_VAL)) + if (!(m->status & MCI_STATUS_VAL)) continue; /* * If we are logging everything (at CPU online) or this * is a corrected error, then we must log it. 
*/ - if ((flags & MCP_UC) || !(m.status & MCI_STATUS_UC)) + if ((flags & MCP_UC) || !(m->status & MCI_STATUS_UC)) goto log_it; /* @@ -688,20 +761,20 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) * everything else. */ if (!mca_cfg.ser) { - if (m.status & MCI_STATUS_UC) + if (m->status & MCI_STATUS_UC) continue; goto log_it; } /* Log "not enabled" (speculative) errors */ - if (!(m.status & MCI_STATUS_EN)) + if (!(m->status & MCI_STATUS_EN)) goto log_it; /* * Log UCNA (SDM: 15.6.3 "UCR Error Classification") * UC == 1 && PCC == 0 && S == 0 */ - if (!(m.status & MCI_STATUS_PCC) && !(m.status & MCI_STATUS_S)) + if (!(m->status & MCI_STATUS_PCC) && !(m->status & MCI_STATUS_S)) goto log_it; /* @@ -712,25 +785,23 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) continue; log_it: - error_seen = true; - if (flags & MCP_DONTLOG) goto clear_it; - mce_read_aux(&m, i); - m.severity = mce_severity(&m, NULL, NULL, false); + mce_read_aux(&err, i); + m->severity = mce_severity(m, NULL, NULL, false); /* * Don't get the IP here because it's unlikely to * have anything to do with the actual error location. */ - if (mca_cfg.dont_log_ce && !mce_usable_address(&m)) + if (mca_cfg.dont_log_ce && !mce_usable_address(m)) goto clear_it; if (flags & MCP_QUEUE_LOG) - mce_gen_pool_add(&m); + mce_gen_pool_add(&err); else - mce_log(&m); + mce_log(&err); clear_it: /* @@ -745,8 +816,6 @@ clear_it: */ sync_core(); - - return error_seen; } EXPORT_SYMBOL_GPL(machine_check_poll); @@ -856,9 +925,10 @@ static __always_inline void quirk_zen_ifu(int bank, struct mce *m, struct pt_reg * Do a quick check if any of the events requires a panic. * This decides if we keep the events around or clear them. */ -static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, +static __always_inline int mce_no_way_out(struct mce_hw_err *err, char **msg, unsigned long *validp, struct pt_regs *regs) { + struct mce *m = &err->m; char *tmp = *msg; int i; @@ -876,7 +946,7 @@ static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned lo m->bank = i; if (mce_severity(m, regs, &tmp, true) >= MCE_PANIC_SEVERITY) { - mce_read_aux(m, i); + mce_read_aux(err, i); *msg = tmp; return 1; } @@ -967,10 +1037,11 @@ out: */ static void mce_reign(void) { - int cpu; + struct mce_hw_err *err = NULL; struct mce *m = NULL; int global_worst = 0; char *msg = NULL; + int cpu; /* * This CPU is the Monarch and the other CPUs have run @@ -978,11 +1049,13 @@ static void mce_reign(void) * Grade the severity of the errors of all the CPUs. */ for_each_possible_cpu(cpu) { - struct mce *mtmp = &per_cpu(mces_seen, cpu); + struct mce_hw_err *etmp = &per_cpu(hw_errs_seen, cpu); + struct mce *mtmp = &etmp->m; if (mtmp->severity > global_worst) { global_worst = mtmp->severity; - m = &per_cpu(mces_seen, cpu); + err = &per_cpu(hw_errs_seen, cpu); + m = &err->m; } } @@ -994,7 +1067,7 @@ static void mce_reign(void) if (m && global_worst >= MCE_PANIC_SEVERITY) { /* call mce_severity() to get "msg" for panic */ mce_severity(m, NULL, &msg, true); - mce_panic("Fatal machine check", m, msg); + mce_panic("Fatal machine check", err, msg); } /* @@ -1011,11 +1084,11 @@ static void mce_reign(void) mce_panic("Fatal machine check from unknown source", NULL, NULL); /* - * Now clear all the mces_seen so that they don't reappear on + * Now clear all the hw_errs_seen so that they don't reappear on * the next mce. 
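The chain of checks in machine_check_poll() above, from the MCP_UC/corrected-error shortcut down to the UCNA case, is easier to follow as a single predicate. A standalone sketch, with booleans standing in for the MCI_STATUS_* bits and mca_cfg.ser:

#include <stdbool.h>
#include <stdio.h>

/* true when the polled bank should be logged, false when the error is
 * left for the #MC handler. Parameters mirror MCP_UC, mca_cfg.ser and
 * the MCI_STATUS_{UC,EN,PCC,S} bits tested in the poll loop. */
static bool poll_should_log(bool mcp_uc, bool ser, bool uc, bool en,
			    bool pcc, bool s)
{
	if (mcp_uc || !uc)	/* CPU online pass, or a corrected error */
		return true;
	if (!ser)		/* no recovery support: skip UC errors */
		return false;
	if (!en)		/* "not enabled" (speculative) errors */
		return true;
	if (!pcc && !s)		/* UCNA: UC=1, PCC=0, S=0 */
		return true;
	return false;		/* signalled UC error: handled elsewhere */
}

int main(void)
{
	printf("%d\n", poll_should_log(false, true, true, true, false, false)); /* UCNA: 1 */
	printf("%d\n", poll_should_log(false, true, true, true, false, true));  /* signalled UC: 0 */
	return 0;
}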
*/ for_each_possible_cpu(cpu) - memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce)); + memset(&per_cpu(hw_errs_seen, cpu), 0, sizeof(struct mce_hw_err)); } static atomic_t global_nwo; @@ -1219,13 +1292,14 @@ static noinstr bool mce_check_crashing_cpu(void) } static __always_inline int -__mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final, - unsigned long *toclear, unsigned long *valid_banks, int no_way_out, - int *worst) +__mc_scan_banks(struct mce_hw_err *err, struct pt_regs *regs, + struct mce_hw_err *final, unsigned long *toclear, + unsigned long *valid_banks, int no_way_out, int *worst) { struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); struct mca_config *cfg = &mca_cfg; int severity, i, taint = 0; + struct mce *m = &err->m; for (i = 0; i < this_cpu_read(mce_num_banks); i++) { arch___clear_bit(i, toclear); @@ -1270,7 +1344,7 @@ __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final, if (severity == MCE_NO_SEVERITY) continue; - mce_read_aux(m, i); + mce_read_aux(err, i); /* assuming valid severity level != 0 */ m->severity = severity; @@ -1280,17 +1354,17 @@ __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final, * done in #MC context, where instrumentation is disabled. */ instrumentation_begin(); - mce_log(m); + mce_log(err); instrumentation_end(); if (severity > *worst) { - *final = *m; + *final = *err; *worst = severity; } } /* mce_clear_state will clear *final, save locally for use later */ - *m = *final; + *err = *final; return taint; } @@ -1350,9 +1424,10 @@ static void kill_me_never(struct callback_head *cb) set_mce_nospec(pfn); } -static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *)) +static void queue_task_work(struct mce_hw_err *err, char *msg, void (*func)(struct callback_head *)) { int count = ++current->mce_count; + struct mce *m = &err->m; /* First call, save all the details */ if (count == 1) { @@ -1365,11 +1440,12 @@ static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callba /* Ten is likely overkill. 
Don't expect more than two faults before task_work() */ if (count > 10) - mce_panic("Too many consecutive machine checks while accessing user data", m, msg); + mce_panic("Too many consecutive machine checks while accessing user data", + err, msg); /* Second or later call, make sure page address matches the one from first call */ if (count > 1 && (current->mce_addr >> PAGE_SHIFT) != (m->addr >> PAGE_SHIFT)) - mce_panic("Consecutive machine checks to different user pages", m, msg); + mce_panic("Consecutive machine checks to different user pages", err, msg); /* Do not call task_work_add() more than once */ if (count > 1) @@ -1418,8 +1494,10 @@ noinstr void do_machine_check(struct pt_regs *regs) int worst = 0, order, no_way_out, kill_current_task, lmce, taint = 0; DECLARE_BITMAP(valid_banks, MAX_NR_BANKS) = { 0 }; DECLARE_BITMAP(toclear, MAX_NR_BANKS) = { 0 }; - struct mce m, *final; + struct mce_hw_err *final; + struct mce_hw_err err; char *msg = NULL; + struct mce *m; if (unlikely(mce_flags.p5)) return pentium_machine_check(regs); @@ -1457,13 +1535,14 @@ noinstr void do_machine_check(struct pt_regs *regs) this_cpu_inc(mce_exception_count); - mce_gather_info(&m, regs); - m.tsc = rdtsc(); + mce_gather_info(&err, regs); + m = &err.m; + m->tsc = rdtsc(); - final = this_cpu_ptr(&mces_seen); - *final = m; + final = this_cpu_ptr(&hw_errs_seen); + *final = err; - no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs); + no_way_out = mce_no_way_out(&err, &msg, valid_banks, regs); barrier(); @@ -1472,15 +1551,15 @@ noinstr void do_machine_check(struct pt_regs *regs) * Assume the worst for now, but if we find the * severity is MCE_AR_SEVERITY we have other options. */ - if (!(m.mcgstatus & MCG_STATUS_RIPV)) + if (!(m->mcgstatus & MCG_STATUS_RIPV)) kill_current_task = 1; /* * Check if this MCE is signaled to only this logical processor, * on Intel, Zhaoxin only. */ - if (m.cpuvendor == X86_VENDOR_INTEL || - m.cpuvendor == X86_VENDOR_ZHAOXIN) - lmce = m.mcgstatus & MCG_STATUS_LMCES; + if (m->cpuvendor == X86_VENDOR_INTEL || + m->cpuvendor == X86_VENDOR_ZHAOXIN) + lmce = m->mcgstatus & MCG_STATUS_LMCES; /* * Local machine check may already know that we have to panic. @@ -1491,12 +1570,12 @@ noinstr void do_machine_check(struct pt_regs *regs) */ if (lmce) { if (no_way_out) - mce_panic("Fatal local machine check", &m, msg); + mce_panic("Fatal local machine check", &err, msg); } else { order = mce_start(&no_way_out); } - taint = __mc_scan_banks(&m, regs, final, toclear, valid_banks, no_way_out, &worst); + taint = __mc_scan_banks(&err, regs, final, toclear, valid_banks, no_way_out, &worst); if (!no_way_out) mce_clear_state(toclear); @@ -1511,7 +1590,7 @@ noinstr void do_machine_check(struct pt_regs *regs) no_way_out = worst >= MCE_PANIC_SEVERITY; if (no_way_out) - mce_panic("Fatal machine check on current CPU", &m, msg); + mce_panic("Fatal machine check on current CPU", &err, msg); } } else { /* @@ -1523,8 +1602,8 @@ noinstr void do_machine_check(struct pt_regs *regs) * make sure we have the right "msg". */ if (worst >= MCE_PANIC_SEVERITY) { - mce_severity(&m, regs, &msg, true); - mce_panic("Local fatal machine check!", &m, msg); + mce_severity(m, regs, &msg, true); + mce_panic("Local fatal machine check!", &err, msg); } } @@ -1542,15 +1621,33 @@ noinstr void do_machine_check(struct pt_regs *regs) goto out; /* Fault was in user mode and we need to take some action */ - if ((m.cs & 3) == 3) { + if ((m->cs & 3) == 3) { /* If this triggers there is no way to recover. Die hard. 
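queue_task_work() above encodes two guardrails on repeated faults in the same task: no more than ten consecutive machine checks, and every later fault must hit the same page as the first. A distilled sketch, assuming the x86 PAGE_SHIFT of 12:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12	/* x86 */

/* Returns the panic message queue_task_work() would use, or NULL when
 * the fault is still within the tolerated pattern. */
static const char *check_repeat_mce(int count, uint64_t first_addr,
				    uint64_t this_addr)
{
	if (count > 10)
		return "Too many consecutive machine checks while accessing user data";
	if (count > 1 && (first_addr >> PAGE_SHIFT) != (this_addr >> PAGE_SHIFT))
		return "Consecutive machine checks to different user pages";
	return NULL;
}

int main(void)
{
	const char *msg;

	msg = check_repeat_mce(2, 0x1000, 0x1fff);	/* same page */
	printf("%s\n", msg ? msg : "ok");
	msg = check_repeat_mce(2, 0x1000, 0x2000);	/* different page */
	printf("%s\n", msg ? msg : "ok");
	return 0;
}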
*/ BUG_ON(!on_thread_stack() || !user_mode(regs)); - if (!mce_usable_address(&m)) - queue_task_work(&m, msg, kill_me_now); + if (!mce_usable_address(m)) + queue_task_work(&err, msg, kill_me_now); else - queue_task_work(&m, msg, kill_me_maybe); + queue_task_work(&err, msg, kill_me_maybe); + } else if (m->mcgstatus & MCG_STATUS_SEAM_NR) { + /* + * Saved RIP on stack makes it look like the machine check + * was taken in the kernel on the instruction following + * the entry to SEAM mode. But MCG_STATUS_SEAM_NR indicates + * that the machine check was taken inside SEAM non-root + * mode. CPU core has already marked that guest as dead. + * It is OK for the kernel to resume execution at the + * apparent point of the machine check as the fault did + * not occur there. Mark the page as poisoned so it won't + * be added to free list when the guest is terminated. + */ + if (mce_usable_address(m)) { + struct page *p = pfn_to_online_page(m->addr >> PAGE_SHIFT); + + if (p) + SetPageHWPoison(p); + } } else { /* * Handle an MCE which has happened in kernel space but from @@ -1561,13 +1658,13 @@ noinstr void do_machine_check(struct pt_regs *regs) * corresponding exception handler which would do that is the * proper one. */ - if (m.kflags & MCE_IN_KERNEL_RECOV) { + if (m->kflags & MCE_IN_KERNEL_RECOV) { if (!fixup_exception(regs, X86_TRAP_MC, 0, 0)) - mce_panic("Failed kernel mode recovery", &m, msg); + mce_panic("Failed kernel mode recovery", &err, msg); } - if (m.kflags & MCE_IN_KERNEL_COPYIN) - queue_task_work(&m, msg, kill_me_never); + if (m->kflags & MCE_IN_KERNEL_COPYIN) + queue_task_work(&err, msg, kill_me_never); } out: @@ -1601,13 +1698,6 @@ static unsigned long check_interval = INITIAL_CHECK_INTERVAL; static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */ static DEFINE_PER_CPU(struct timer_list, mce_timer); -static unsigned long mce_adjust_timer_default(unsigned long interval) -{ - return interval; -} - -static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default; - static void __start_timer(struct timer_list *t, unsigned long interval) { unsigned long when = jiffies + interval; @@ -1637,15 +1727,9 @@ static void mce_timer_fn(struct timer_list *t) iv = __this_cpu_read(mce_next_interval); - if (mce_available(this_cpu_ptr(&cpu_info))) { + if (mce_available(this_cpu_ptr(&cpu_info))) mc_poll_banks(); - if (mce_intel_cmci_poll()) { - iv = mce_adjust_timer(iv); - goto done; - } - } - /* * Alert userspace if needed. If we logged an MCE, reduce the polling * interval, otherwise increase the polling interval. @@ -1655,23 +1739,29 @@ static void mce_timer_fn(struct timer_list *t) else iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); -done: - __this_cpu_write(mce_next_interval, iv); - __start_timer(t, iv); + if (mce_get_storm_mode()) { + __start_timer(t, HZ); + } else { + __this_cpu_write(mce_next_interval, iv); + __start_timer(t, iv); + } } /* - * Ensure that the timer is firing in @interval from now. + * When a storm starts on any bank on this CPU, switch to polling + * once per second. When the storm ends, revert to the default + * polling interval. 
*/ -void mce_timer_kick(unsigned long interval) +void mce_timer_kick(bool storm) { struct timer_list *t = this_cpu_ptr(&mce_timer); - unsigned long iv = __this_cpu_read(mce_next_interval); - __start_timer(t, interval); + mce_set_storm_mode(storm); - if (interval < iv) - __this_cpu_write(mce_next_interval, interval); + if (storm) + __start_timer(t, HZ); + else + __this_cpu_write(mce_next_interval, check_interval * HZ); } /* Must not be called in IRQ context where del_timer_sync() can deadlock */ @@ -1688,7 +1778,7 @@ static void mce_timer_delete_all(void) * Can be called from interrupt context, but not from machine check/NMI * context. */ -int mce_notify_irq(void) +bool mce_notify_irq(void) { /* Not more than two messages every minute */ static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); @@ -1699,9 +1789,9 @@ int mce_notify_irq(void) if (__ratelimit(&ratelimit)) pr_info(HW_ERR "Machine check events logged\n"); - return 1; + return true; } - return 0; + return false; } EXPORT_SYMBOL_GPL(mce_notify_irq); @@ -1820,101 +1910,120 @@ static void __mcheck_cpu_check_banks(void) } } -/* Add per CPU specific workarounds here */ -static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) +static void apply_quirks_amd(struct cpuinfo_x86 *c) { struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); - struct mca_config *cfg = &mca_cfg; - - if (c->x86_vendor == X86_VENDOR_UNKNOWN) { - pr_info("unknown CPU type - not enabling MCE support\n"); - return -EOPNOTSUPP; - } /* This should be disabled by the BIOS, but isn't always */ - if (c->x86_vendor == X86_VENDOR_AMD) { - if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) { - /* - * disable GART TBL walk error reporting, which - * trips off incorrectly with the IOMMU & 3ware - * & Cerberus: - */ - clear_bit(10, (unsigned long *)&mce_banks[4].ctl); - } - if (c->x86 < 0x11 && cfg->bootlog < 0) { - /* - * Lots of broken BIOS around that don't clear them - * by default and leave crap in there. Don't log: - */ - cfg->bootlog = 0; - } + if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) { /* - * Various K7s with broken bank 0 around. Always disable - * by default. + * disable GART TBL walk error reporting, which + * trips off incorrectly with the IOMMU & 3ware + * & Cerberus: */ - if (c->x86 == 6 && this_cpu_read(mce_num_banks) > 0) - mce_banks[0].ctl = 0; + clear_bit(10, (unsigned long *)&mce_banks[4].ctl); + } + if (c->x86 < 0x11 && mca_cfg.bootlog < 0) { /* - * overflow_recov is supported for F15h Models 00h-0fh - * even though we don't have a CPUID bit for it. + * Lots of broken BIOS around that don't clear them + * by default and leave crap in there. Don't log: */ - if (c->x86 == 0x15 && c->x86_model <= 0xf) - mce_flags.overflow_recov = 1; + mca_cfg.bootlog = 0; + } - if (c->x86 >= 0x17 && c->x86 <= 0x1A) - mce_flags.zen_ifu_quirk = 1; + /* + * Various K7s with broken bank 0 around. Always disable + * by default. + */ + if (c->x86 == 6 && this_cpu_read(mce_num_banks)) + mce_banks[0].ctl = 0; - } + /* + * overflow_recov is supported for F15h Models 00h-0fh + * even though we don't have a CPUID bit for it. + */ + if (c->x86 == 0x15 && c->x86_model <= 0xf) + mce_flags.overflow_recov = 1; - if (c->x86_vendor == X86_VENDOR_INTEL) { - /* - * SDM documents that on family 6 bank 0 should not be written - * because it aliases to another special BIOS controlled - * register. - * But it's not aliased anymore on model 0x1a+ - * Don't ignore bank 0 completely because there could be a - * valid event later, merely don't write CTL0. 
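Going back to the timer rework earlier in this hunk: mce_timer_fn() still halves or doubles the polling interval depending on whether anything was logged, but storm mode (mce_get_storm_mode()/mce_timer_kick()) now pins it to one second. A standalone sketch of the cadence, ignoring round_jiffies_relative() and assuming HZ=1000 and the usual 5-minute check_interval:

#include <stdio.h>

#define HZ 1000UL	/* assumption for the example */

/* Next polling delay in jiffies: storms poll once per second; otherwise
 * the interval halves (floored at HZ/100) when events were logged and
 * doubles (capped at check_interval) when the banks stayed quiet. */
static unsigned long next_poll(unsigned long iv, unsigned long check_jiffies,
			       int storm, int logged)
{
	if (storm)
		return HZ;
	if (logged) {
		iv /= 2;
		if (iv < HZ / 100)
			iv = HZ / 100;
	} else {
		iv *= 2;
		if (iv > check_jiffies)
			iv = check_jiffies;
	}
	return iv;
}

int main(void)
{
	unsigned long check = 5 * 60 * HZ;	/* default check_interval */

	printf("%lu\n", next_poll(check, check, 0, 1));	/* halves after an event */
	printf("%lu\n", next_poll(HZ, check, 1, 0));	/* storm: stay at 1 second */
	return 0;
}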
- */ + if (c->x86 >= 0x17 && c->x86 <= 0x1A) + mce_flags.zen_ifu_quirk = 1; +} - if (c->x86 == 6 && c->x86_model < 0x1A && this_cpu_read(mce_num_banks) > 0) - mce_banks[0].init = false; +static void apply_quirks_intel(struct cpuinfo_x86 *c) +{ + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); - /* - * All newer Intel systems support MCE broadcasting. Enable - * synchronization with a one second timeout. - */ - if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && - cfg->monarch_timeout < 0) - cfg->monarch_timeout = USEC_PER_SEC; + /* Older CPUs (prior to family 6) don't need quirks. */ + if (c->x86_vfm < INTEL_PENTIUM_PRO) + return; - /* - * There are also broken BIOSes on some Pentium M and - * earlier systems: - */ - if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0) - cfg->bootlog = 0; + /* + * SDM documents that on family 6 bank 0 should not be written + * because it aliases to another special BIOS controlled + * register. + * But it's not aliased anymore on model 0x1a+ + * Don't ignore bank 0 completely because there could be a + * valid event later, merely don't write CTL0. + */ + if (c->x86_vfm < INTEL_NEHALEM_EP && this_cpu_read(mce_num_banks)) + mce_banks[0].init = false; - if (c->x86 == 6 && c->x86_model == 45) - mce_flags.snb_ifu_quirk = 1; + /* + * All newer Intel systems support MCE broadcasting. Enable + * synchronization with a one second timeout. + */ + if (c->x86_vfm >= INTEL_CORE_YONAH && mca_cfg.monarch_timeout < 0) + mca_cfg.monarch_timeout = USEC_PER_SEC; - /* - * Skylake, Cascacde Lake and Cooper Lake require a quirk on - * rep movs. - */ - if (c->x86 == 6 && c->x86_model == INTEL_FAM6_SKYLAKE_X) - mce_flags.skx_repmov_quirk = 1; + /* + * There are also broken BIOSes on some Pentium M and + * earlier systems: + */ + if (c->x86_vfm < INTEL_CORE_YONAH && mca_cfg.bootlog < 0) + mca_cfg.bootlog = 0; + + if (c->x86_vfm == INTEL_SANDYBRIDGE_X) + mce_flags.snb_ifu_quirk = 1; + + /* + * Skylake, Cascacde Lake and Cooper Lake require a quirk on + * rep movs. + */ + if (c->x86_vfm == INTEL_SKYLAKE_X) + mce_flags.skx_repmov_quirk = 1; +} + +static void apply_quirks_zhaoxin(struct cpuinfo_x86 *c) +{ + /* + * All newer Zhaoxin CPUs support MCE broadcasting. Enable + * synchronization with a one second timeout. + */ + if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) { + if (mca_cfg.monarch_timeout < 0) + mca_cfg.monarch_timeout = USEC_PER_SEC; } +} - if (c->x86_vendor == X86_VENDOR_ZHAOXIN) { - /* - * All newer Zhaoxin CPUs support MCE broadcasting. Enable - * synchronization with a one second timeout. 
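apply_quirks_intel() above leans on the x86_vfm field, which packs vendor, family and model into one integer so that named parts such as INTEL_NEHALEM_EP can be compared with ordinary <, ==, >= operators. A sketch of the assumed packing (vendor in bits 23:16, family in 15:8, model in 7:0, per my reading of VFM_MAKE() in <asm/cpu_device_id.h>); the vendor value and the Core 2 example are illustrative:

#include <stdint.h>
#include <stdio.h>

#define VFM_MAKE(vendor, family, model) \
	(((uint32_t)(vendor) << 16) | ((uint32_t)(family) << 8) | (model))

#define VENDOR_INTEL	0	/* illustrative */

int main(void)
{
	uint32_t pentium_pro = VFM_MAKE(VENDOR_INTEL, 6, 0x01);
	uint32_t nehalem_ep  = VFM_MAKE(VENDOR_INTEL, 6, 0x1a);
	uint32_t this_cpu    = VFM_MAKE(VENDOR_INTEL, 6, 0x0f);	/* e.g. a Core 2 part */

	/* The bank 0 quirk covers family 6 models before 0x1a: */
	printf("%d\n", this_cpu >= pentium_pro && this_cpu < nehalem_ep);	/* 1 */
	return 0;
}

This is what lets the old "c->x86 == 6 && c->x86_model < 0x1A" style checks collapse into single range comparisons.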
- */ - if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) { - if (cfg->monarch_timeout < 0) - cfg->monarch_timeout = USEC_PER_SEC; - } +/* Add per CPU specific workarounds here */ +static bool __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) +{ + struct mca_config *cfg = &mca_cfg; + + switch (c->x86_vendor) { + case X86_VENDOR_UNKNOWN: + pr_info("unknown CPU type - not enabling MCE support\n"); + return false; + case X86_VENDOR_AMD: + apply_quirks_amd(c); + break; + case X86_VENDOR_INTEL: + apply_quirks_intel(c); + break; + case X86_VENDOR_ZHAOXIN: + apply_quirks_zhaoxin(c); + break; } if (cfg->monarch_timeout < 0) @@ -1922,28 +2031,28 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) if (cfg->bootlog != 0) cfg->panic_timeout = 30; - return 0; + return true; } -static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) +static bool __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) { if (c->x86 != 5) - return 0; + return false; switch (c->x86_vendor) { case X86_VENDOR_INTEL: intel_p5_mcheck_init(c); mce_flags.p5 = 1; - return 1; + return true; case X86_VENDOR_CENTAUR: winchip_mcheck_init(c); mce_flags.winchip = 1; - return 1; + return true; default: - return 0; + return false; } - return 0; + return false; } /* @@ -1995,7 +2104,6 @@ static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c) intel_init_cmci(); intel_init_lmce(); - mce_adjust_timer = cmci_intel_adjust_timer; } static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c) @@ -2008,16 +2116,11 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) switch (c->x86_vendor) { case X86_VENDOR_INTEL: mce_intel_feature_init(c); - mce_adjust_timer = cmci_intel_adjust_timer; - break; - - case X86_VENDOR_AMD: { - mce_amd_feature_init(c); break; - } + case X86_VENDOR_AMD: case X86_VENDOR_HYGON: - mce_hygon_feature_init(c); + mce_amd_feature_init(c); break; case X86_VENDOR_CENTAUR: @@ -2134,6 +2237,31 @@ DEFINE_IDTENTRY_MCE_USER(exc_machine_check) exc_machine_check_user(regs); local_db_restore(dr7); } + +#ifdef CONFIG_X86_FRED +/* + * When occurred on different ring level, i.e., from user or kernel + * context, #MCE needs to be handled on different stack: User #MCE + * on current task stack, while kernel #MCE on a dedicated stack. + * + * This is exactly how FRED event delivery invokes an exception + * handler: ring 3 event on level 0 stack, i.e., current task stack; + * ring 0 event on the #MCE dedicated stack specified in the + * IA32_FRED_STKLVLS MSR. So unlike IDT, the FRED machine check entry + * stub doesn't do stack switch. 
+ */ +DEFINE_FREDENTRY_MCE(exc_machine_check) +{ + unsigned long dr7; + + dr7 = local_db_save(); + if (user_mode(regs)) + exc_machine_check_user(regs); + else + exc_machine_check_kernel(regs); + local_db_restore(dr7); +} +#endif #else /* 32bit unified entry point */ DEFINE_IDTENTRY_RAW(exc_machine_check) @@ -2166,12 +2294,12 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) __mcheck_cpu_cap_init(); - if (__mcheck_cpu_apply_quirks(c) < 0) { + if (!__mcheck_cpu_apply_quirks(c)) { mca_cfg.disabled = 1; return; } - if (mce_gen_pool_init()) { + if (!mce_gen_pool_init()) { mca_cfg.disabled = 1; pr_emerg("Couldn't allocate MCE records pool!\n"); return; @@ -2399,7 +2527,7 @@ static void mce_enable_ce(void *all) __mcheck_cpu_init_timer(); } -static struct bus_type mce_subsys = { +static const struct bus_type mce_subsys = { .name = "machinecheck", .dev_name = "machinecheck", }; @@ -2442,12 +2570,14 @@ static ssize_t set_bank(struct device *s, struct device_attribute *attr, return -EINVAL; b = &per_cpu(mce_banks_array, s->id)[bank]; - if (!b->init) return -ENODEV; b->ctl = new; + + mutex_lock(&mce_sysfs_mutex); mce_restart(); + mutex_unlock(&mce_sysfs_mutex); return size; } @@ -2568,9 +2698,6 @@ static int mce_device_create(unsigned int cpu) int err; int i, j; - if (!mce_available(&boot_cpu_data)) - return -EIO; - dev = per_cpu(mce_device, cpu); if (dev) return 0; @@ -2665,8 +2792,6 @@ static void mce_reenable_cpu(void) static int mce_cpu_dead(unsigned int cpu) { - mce_intel_hcpu_update(cpu); - /* intentionally ignoring frozen here */ if (!cpuhp_tasks_frozen) cmci_rediscover(); diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c b/arch/x86/kernel/cpu/mce/dev-mcelog.c index a05ac0716ecf..8d023239ce18 100644 --- a/arch/x86/kernel/cpu/mce/dev-mcelog.c +++ b/arch/x86/kernel/cpu/mce/dev-mcelog.c @@ -264,15 +264,8 @@ static long mce_chrdev_ioctl(struct file *f, unsigned int cmd, return put_user(sizeof(struct mce), p); case MCE_GET_LOG_LEN: return put_user(mcelog->len, p); - case MCE_GETCLEAR_FLAGS: { - unsigned flags; - - do { - flags = mcelog->flags; - } while (cmpxchg(&mcelog->flags, flags, 0) != flags); - - return put_user(flags, p); - } + case MCE_GETCLEAR_FLAGS: + return put_user(xchg(&mcelog->flags, 0), p); default: return -ENOTTY; } @@ -314,7 +307,7 @@ static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf, /* * Need to give user space some time to set everything up, - * so do it a jiffie or two later everywhere. + * so do it a jiffy or two later everywhere. */ schedule_timeout(2); @@ -331,7 +324,6 @@ static const struct file_operations mce_chrdev_ops = { .poll = mce_chrdev_poll, .unlocked_ioctl = mce_chrdev_ioctl, .compat_ioctl = compat_ptr_ioctl, - .llseek = no_llseek, }; static struct miscdevice mce_chrdev_device = { diff --git a/arch/x86/kernel/cpu/mce/genpool.c b/arch/x86/kernel/cpu/mce/genpool.c index fbe8b61c3413..3ca9c007a666 100644 --- a/arch/x86/kernel/cpu/mce/genpool.c +++ b/arch/x86/kernel/cpu/mce/genpool.c @@ -16,14 +16,14 @@ * used to save error information organized in a lock-less list. * * This memory pool is only to be used to save MCE records in MCE context. - * MCE events are rare, so a fixed size memory pool should be enough. Use - * 2 pages to save MCE events for now (~80 MCE records at most). + * MCE events are rare, so a fixed size memory pool should be enough. + * Allocate on a sliding scale based on number of CPUs. 
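/*
 * Illustrative aside (not part of the patch above): the MCE_GETCLEAR_FLAGS
 * hunk collapses a cmpxchg() retry loop into a single xchg(). A minimal
 * userspace sketch of the same read-and-clear pattern using C11 atomics;
 * "flags" here is a stand-in variable, not the kernel mcelog structure.
 */
#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned int flags = 0x5;

/* Return the current flags and clear them in one atomic step. */
static unsigned int getclear_flags(void)
{
	return atomic_exchange(&flags, 0);
}

int main(void)
{
	printf("got 0x%x, now 0x%x\n", getclear_flags(), atomic_load(&flags));
	return 0;
}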
*/ -#define MCE_POOLSZ (2 * PAGE_SIZE) +#define MCE_MIN_ENTRIES 80 +#define MCE_PER_CPU 2 static struct gen_pool *mce_evt_pool; static LLIST_HEAD(mce_event_llist); -static char gen_pool_buf[MCE_POOLSZ]; /* * Compare the record "t" with each of the records on list "l" to see if @@ -31,15 +31,15 @@ static char gen_pool_buf[MCE_POOLSZ]; */ static bool is_duplicate_mce_record(struct mce_evt_llist *t, struct mce_evt_llist *l) { + struct mce_hw_err *err1, *err2; struct mce_evt_llist *node; - struct mce *m1, *m2; - m1 = &t->mce; + err1 = &t->err; llist_for_each_entry(node, &l->llnode, llnode) { - m2 = &node->mce; + err2 = &node->err; - if (!mce_cmp(m1, m2)) + if (!mce_cmp(&err1->m, &err2->m)) return true; } return false; @@ -73,8 +73,8 @@ struct llist_node *mce_gen_pool_prepare_records(void) void mce_gen_pool_process(struct work_struct *__unused) { - struct llist_node *head; struct mce_evt_llist *node, *tmp; + struct llist_node *head; struct mce *mce; head = llist_del_all(&mce_event_llist); @@ -83,7 +83,7 @@ void mce_gen_pool_process(struct work_struct *__unused) head = llist_reverse_order(head); llist_for_each_entry_safe(node, tmp, head, llnode) { - mce = &node->mce; + mce = &node->err.m; blocking_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node)); } @@ -94,54 +94,63 @@ bool mce_gen_pool_empty(void) return llist_empty(&mce_event_llist); } -int mce_gen_pool_add(struct mce *mce) +bool mce_gen_pool_add(struct mce_hw_err *err) { struct mce_evt_llist *node; - if (filter_mce(mce)) - return -EINVAL; + if (filter_mce(&err->m)) + return false; if (!mce_evt_pool) - return -EINVAL; + return false; node = (void *)gen_pool_alloc(mce_evt_pool, sizeof(*node)); if (!node) { pr_warn_ratelimited("MCE records pool full!\n"); - return -ENOMEM; + return false; } - memcpy(&node->mce, mce, sizeof(*mce)); + memcpy(&node->err, err, sizeof(*err)); llist_add(&node->llnode, &mce_event_llist); - return 0; + return true; } -static int mce_gen_pool_create(void) +static bool mce_gen_pool_create(void) { - struct gen_pool *tmpp; - int ret = -ENOMEM; - - tmpp = gen_pool_create(ilog2(sizeof(struct mce_evt_llist)), -1); - if (!tmpp) - goto out; + int mce_numrecords, mce_poolsz, order; + struct gen_pool *gpool; + void *mce_pool; + + order = order_base_2(sizeof(struct mce_evt_llist)); + gpool = gen_pool_create(order, -1); + if (!gpool) + return false; + + mce_numrecords = max(MCE_MIN_ENTRIES, num_possible_cpus() * MCE_PER_CPU); + mce_poolsz = mce_numrecords * (1 << order); + mce_pool = kmalloc(mce_poolsz, GFP_KERNEL); + if (!mce_pool) { + gen_pool_destroy(gpool); + return false; + } - ret = gen_pool_add(tmpp, (unsigned long)gen_pool_buf, MCE_POOLSZ, -1); - if (ret) { - gen_pool_destroy(tmpp); - goto out; + if (gen_pool_add(gpool, (unsigned long)mce_pool, mce_poolsz, -1)) { + gen_pool_destroy(gpool); + kfree(mce_pool); + return false; } - mce_evt_pool = tmpp; + mce_evt_pool = gpool; -out: - return ret; + return true; } -int mce_gen_pool_init(void) +bool mce_gen_pool_init(void) { /* Just init mce_gen_pool once. 
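/*
 * Illustrative aside (not part of the patch above): sketch of the new
 * pool-sizing arithmetic in mce_gen_pool_create() -- at least
 * MCE_MIN_ENTRIES records, scaled by CPU count, with each record rounded
 * up to a power-of-two allocation order. Userspace illustration only;
 * RECORD_SIZE is a made-up stand-in for sizeof(struct mce_evt_llist).
 */
#include <stdio.h>

#define MCE_MIN_ENTRIES	80
#define MCE_PER_CPU	2
#define RECORD_SIZE	152			/* assumed record size */

static unsigned int order_base_2(unsigned int n)
{
	unsigned int order = 0;

	while ((1u << order) < n)
		order++;
	return order;
}

int main(void)
{
	unsigned int ncpus = 64;
	unsigned int order = order_base_2(RECORD_SIZE);
	unsigned int nrec = MCE_PER_CPU * ncpus;

	if (nrec < MCE_MIN_ENTRIES)
		nrec = MCE_MIN_ENTRIES;

	printf("order %u, %u records, pool %u bytes\n",
	       order, nrec, nrec * (1u << order));
	return 0;
}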
*/ if (mce_evt_pool) - return 0; + return true; return mce_gen_pool_create(); } diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 4d8d4bcf915d..313fe682db33 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -430,11 +430,9 @@ static void trigger_thr_int(void *info) static u32 get_nbc_for_node(int node_id) { - struct cpuinfo_x86 *c = &boot_cpu_data; u32 cores_per_node; - cores_per_node = (c->x86_max_cores * smp_num_siblings) / amd_get_nodes_per_socket(); - + cores_per_node = topology_num_threads_per_package() / topology_amd_nodes_per_pkg(); return cores_per_node * node_id; } @@ -489,19 +487,24 @@ static void prepare_msrs(void *info) wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr); } - wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), m.misc); wrmsrl(MSR_AMD64_SMCA_MCx_SYND(b), m.synd); + + if (m.misc) + wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), m.misc); } else { wrmsrl(MSR_IA32_MCx_STATUS(b), m.status); wrmsrl(MSR_IA32_MCx_ADDR(b), m.addr); - wrmsrl(MSR_IA32_MCx_MISC(b), m.misc); + + if (m.misc) + wrmsrl(MSR_IA32_MCx_MISC(b), m.misc); } } static void do_inject(void) { - u64 mcg_status = 0; unsigned int cpu = i_mce.extcpu; + struct mce_hw_err err; + u64 mcg_status = 0; u8 b = i_mce.bank; i_mce.tsc = rdtsc_ordered(); @@ -515,7 +518,8 @@ static void do_inject(void) i_mce.status |= MCI_STATUS_SYNDV; if (inj_type == SW_INJ) { - mce_log(&i_mce); + err.m = i_mce; + mce_log(&err); return; } @@ -543,8 +547,8 @@ static void do_inject(void) if (boot_cpu_has(X86_FEATURE_AMD_DCM) && b == 4 && boot_cpu_data.x86 < 0x17) { - toggle_nb_mca_mst_cpu(topology_die_id(cpu)); - cpu = get_nbc_for_node(topology_die_id(cpu)); + toggle_nb_mca_mst_cpu(topology_amd_node_id(cpu)); + cpu = get_nbc_for_node(topology_amd_node_id(cpu)); } cpus_read_lock(); @@ -746,6 +750,7 @@ static void check_hw_inj_possible(void) wrmsrl_safe(mca_msr_reg(bank, MCA_STATUS), status); rdmsrl_safe(mca_msr_reg(bank, MCA_STATUS), &status); + wrmsrl_safe(mca_msr_reg(bank, MCA_STATUS), 0); if (!status) { hw_injection_possible = false; @@ -796,4 +801,5 @@ static void __exit inject_exit(void) module_init(inject_init); module_exit(inject_exit); +MODULE_DESCRIPTION("Machine check injection support"); MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index 52bce533ddcc..f863df0ff42c 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -13,7 +13,7 @@ #include <linux/cpumask.h> #include <asm/apic.h> #include <asm/cpufeature.h> -#include <asm/intel-family.h> +#include <asm/cpu_device_id.h> #include <asm/processor.h> #include <asm/msr.h> #include <asm/mce.h> @@ -42,15 +42,6 @@ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); /* - * CMCI storm detection backoff counter - * - * During storm, we reset this counter to INITIAL_CHECK_INTERVAL in case we've - * encountered an error. If not, we decrement it by one. We signal the end of - * the CMCI storm when it reaches 0. - */ -static DEFINE_PER_CPU(int, cmci_backoff_cnt); - -/* * cmci_discover_lock protects against parallel discovery attempts * which could race against each other. 
*/ @@ -63,29 +54,33 @@ static DEFINE_RAW_SPINLOCK(cmci_discover_lock); */ static DEFINE_SPINLOCK(cmci_poll_lock); +/* Linux non-storm CMCI threshold (may be overridden by BIOS) */ #define CMCI_THRESHOLD 1 -#define CMCI_POLL_INTERVAL (30 * HZ) -#define CMCI_STORM_INTERVAL (HZ) -#define CMCI_STORM_THRESHOLD 15 -static DEFINE_PER_CPU(unsigned long, cmci_time_stamp); -static DEFINE_PER_CPU(unsigned int, cmci_storm_cnt); -static DEFINE_PER_CPU(unsigned int, cmci_storm_state); - -enum { - CMCI_STORM_NONE, - CMCI_STORM_ACTIVE, - CMCI_STORM_SUBSIDED, -}; +/* + * MCi_CTL2 threshold for each bank when there is no storm. + * Default value for each bank may have been set by BIOS. + */ +static u16 cmci_threshold[MAX_NR_BANKS]; -static atomic_t cmci_storm_on_cpus; +/* + * High threshold to limit CMCI rate during storms. Max supported is + * 0x7FFF. Use this slightly smaller value so it has a distinctive + * signature when some asks "Why am I not seeing all corrected errors?" + * A high threshold is used instead of just disabling CMCI for a + * bank because both corrected and uncorrected errors may be logged + * in the same bank and signalled with CMCI. The threshold only applies + * to corrected errors, so keeping CMCI enabled means that uncorrected + * errors will still be processed in a timely fashion. + */ +#define CMCI_STORM_THRESHOLD 32749 -static int cmci_supported(int *banks) +static bool cmci_supported(int *banks) { u64 cap; if (mca_cfg.cmci_disabled || mca_cfg.ignore_ce) - return 0; + return false; /* * Vendor check is not strictly needed, but the initial @@ -94,12 +89,13 @@ static int cmci_supported(int *banks) */ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL && boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN) - return 0; + return false; if (!boot_cpu_has(X86_FEATURE_APIC) || lapic_get_maxlvt() < 6) - return 0; + return false; + rdmsrl(MSR_IA32_MCG_CAP, cap); - *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff); + *banks = min_t(unsigned, MAX_NR_BANKS, cap & MCG_BANKCNT_MASK); return !!(cap & MCG_CMCI_P); } @@ -134,204 +130,166 @@ static bool lmce_supported(void) return tmp & FEAT_CTL_LMCE_ENABLED; } -bool mce_intel_cmci_poll(void) +/* + * Set a new CMCI threshold value. Preserve the state of the + * MCI_CTL2_CMCI_EN bit in case this happens during a + * cmci_rediscover() operation. + */ +static void cmci_set_threshold(int bank, int thresh) { - if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE) - return false; - - /* - * Reset the counter if we've logged an error in the last poll - * during the storm. - */ - if (machine_check_poll(0, this_cpu_ptr(&mce_banks_owned))) - this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL); - else - this_cpu_dec(cmci_backoff_cnt); + unsigned long flags; + u64 val; - return true; + raw_spin_lock_irqsave(&cmci_discover_lock, flags); + rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; + wrmsrl(MSR_IA32_MCx_CTL2(bank), val | thresh); + raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); } -void mce_intel_hcpu_update(unsigned long cpu) +void mce_intel_handle_storm(int bank, bool on) { - if (per_cpu(cmci_storm_state, cpu) == CMCI_STORM_ACTIVE) - atomic_dec(&cmci_storm_on_cpus); + if (on) + cmci_set_threshold(bank, CMCI_STORM_THRESHOLD); + else + cmci_set_threshold(bank, cmci_threshold[bank]); +} - per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE; +/* + * The interrupt handler. This is called on every event. + * Just call the poller directly to log any events. 
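/*
 * Illustrative aside (not part of the patch above): sketch of the
 * IA32_MCi_CTL2 read-modify-write done by cmci_set_threshold() -- replace
 * only the threshold field, keep the CMCI enable bit intact, and switch
 * between the normal and storm thresholds as storms start and stop.
 * Bit positions follow the SDM (enable = bit 30, threshold = bits 14:0),
 * but the MSR access is modelled with a plain variable here.
 */
#include <stdint.h>
#include <stdio.h>

#define MCI_CTL2_CMCI_EN		(1ull << 30)
#define MCI_CTL2_CMCI_THRESHOLD_MASK	0x7fffull
#define CMCI_THRESHOLD			1	/* normal operation */
#define CMCI_STORM_THRESHOLD		32749	/* rate-limit during a storm */

static uint64_t fake_ctl2 = MCI_CTL2_CMCI_EN | 5;	/* BIOS left threshold 5 */

static void cmci_set_threshold(uint64_t thresh)
{
	uint64_t val = fake_ctl2;			/* "rdmsr" */

	val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
	fake_ctl2 = val | thresh;			/* "wrmsr" */
}

int main(void)
{
	cmci_set_threshold(CMCI_STORM_THRESHOLD);	/* storm begins */
	printf("ctl2 = 0x%llx\n", (unsigned long long)fake_ctl2);
	cmci_set_threshold(CMCI_THRESHOLD);		/* storm over */
	printf("ctl2 = 0x%llx\n", (unsigned long long)fake_ctl2);
	return 0;
}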
+ * This could in theory increase the threshold under high load, + * but doesn't for now. + */ +static void intel_threshold_interrupt(void) +{ + machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)); } -static void cmci_toggle_interrupt_mode(bool on) +/* + * Check all the reasons why current CPU cannot claim + * ownership of a bank. + * 1: CPU already owns this bank + * 2: BIOS owns this bank + * 3: Some other CPU owns this bank + */ +static bool cmci_skip_bank(int bank, u64 *val) { - unsigned long flags, *owned; - int bank; - u64 val; + unsigned long *owned = (void *)this_cpu_ptr(&mce_banks_owned); - raw_spin_lock_irqsave(&cmci_discover_lock, flags); - owned = this_cpu_ptr(mce_banks_owned); - for_each_set_bit(bank, owned, MAX_NR_BANKS) { - rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + if (test_bit(bank, owned)) + return true; - if (on) - val |= MCI_CTL2_CMCI_EN; - else - val &= ~MCI_CTL2_CMCI_EN; + /* Skip banks in firmware first mode */ + if (test_bit(bank, mce_banks_ce_disabled)) + return true; - wrmsrl(MSR_IA32_MCx_CTL2(bank), val); - } - raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); -} + rdmsrl(MSR_IA32_MCx_CTL2(bank), *val); -unsigned long cmci_intel_adjust_timer(unsigned long interval) -{ - if ((this_cpu_read(cmci_backoff_cnt) > 0) && - (__this_cpu_read(cmci_storm_state) == CMCI_STORM_ACTIVE)) { - mce_notify_irq(); - return CMCI_STORM_INTERVAL; + /* Already owned by someone else? */ + if (*val & MCI_CTL2_CMCI_EN) { + clear_bit(bank, owned); + __clear_bit(bank, this_cpu_ptr(mce_poll_banks)); + return true; } - switch (__this_cpu_read(cmci_storm_state)) { - case CMCI_STORM_ACTIVE: - - /* - * We switch back to interrupt mode once the poll timer has - * silenced itself. That means no events recorded and the timer - * interval is back to our poll interval. - */ - __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED); - if (!atomic_sub_return(1, &cmci_storm_on_cpus)) - pr_notice("CMCI storm subsided: switching to interrupt mode\n"); + return false; +} - fallthrough; +/* + * Decide which CMCI interrupt threshold to use: + * 1: If this bank is in storm mode from whichever CPU was + * the previous owner, stay in storm mode. + * 2: If ignoring any threshold set by BIOS, set Linux default + * 3: Try to honor BIOS threshold (unless buggy BIOS set it at zero). + */ +static u64 cmci_pick_threshold(u64 val, int *bios_zero_thresh) +{ + if ((val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD) + return val; - case CMCI_STORM_SUBSIDED: + if (!mca_cfg.bios_cmci_threshold) { + val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; + val |= CMCI_THRESHOLD; + } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) { /* - * We wait for all CPUs to go back to SUBSIDED state. When that - * happens we switch back to interrupt mode. + * If bios_cmci_threshold boot option was specified + * but the threshold is zero, we'll try to initialize + * it to 1. */ - if (!atomic_read(&cmci_storm_on_cpus)) { - __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE); - cmci_toggle_interrupt_mode(true); - cmci_recheck(); - } - return CMCI_POLL_INTERVAL; - default: - - /* We have shiny weather. Let the poll do whatever it thinks. */ - return interval; + *bios_zero_thresh = 1; + val |= CMCI_THRESHOLD; } + + return val; } -static bool cmci_storm_detect(void) +/* + * Try to claim ownership of a bank. 
+ */ +static void cmci_claim_bank(int bank, u64 val, int bios_zero_thresh, int *bios_wrong_thresh) { - unsigned int cnt = __this_cpu_read(cmci_storm_cnt); - unsigned long ts = __this_cpu_read(cmci_time_stamp); - unsigned long now = jiffies; - int r; + struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc); - if (__this_cpu_read(cmci_storm_state) != CMCI_STORM_NONE) - return true; + val |= MCI_CTL2_CMCI_EN; + wrmsrl(MSR_IA32_MCx_CTL2(bank), val); + rdmsrl(MSR_IA32_MCx_CTL2(bank), val); - if (time_before_eq(now, ts + CMCI_STORM_INTERVAL)) { - cnt++; - } else { - cnt = 1; - __this_cpu_write(cmci_time_stamp, now); + /* If the enable bit did not stick, this bank should be polled. */ + if (!(val & MCI_CTL2_CMCI_EN)) { + WARN_ON(!test_bit(bank, this_cpu_ptr(mce_poll_banks))); + storm->banks[bank].poll_only = true; + return; } - __this_cpu_write(cmci_storm_cnt, cnt); - - if (cnt <= CMCI_STORM_THRESHOLD) - return false; - cmci_toggle_interrupt_mode(false); - __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE); - r = atomic_add_return(1, &cmci_storm_on_cpus); - mce_timer_kick(CMCI_STORM_INTERVAL); - this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL); + /* This CPU successfully set the enable bit. */ + set_bit(bank, (void *)this_cpu_ptr(&mce_banks_owned)); - if (r == 1) - pr_notice("CMCI storm detected: switching to poll mode\n"); - return true; -} + if ((val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD) { + pr_notice("CPU%d BANK%d CMCI inherited storm\n", smp_processor_id(), bank); + mce_inherit_storm(bank); + cmci_storm_begin(bank); + } else { + __clear_bit(bank, this_cpu_ptr(mce_poll_banks)); + } -/* - * The interrupt handler. This is called on every event. - * Just call the poller directly to log any events. - * This could in theory increase the threshold under high load, - * but doesn't for now. - */ -static void intel_threshold_interrupt(void) -{ - if (cmci_storm_detect()) - return; + /* + * We are able to set thresholds for some banks that + * had a threshold of 0. This means the BIOS has not + * set the thresholds properly or does not work with + * this boot option. Note down now and report later. + */ + if (mca_cfg.bios_cmci_threshold && bios_zero_thresh && + (val & MCI_CTL2_CMCI_THRESHOLD_MASK)) + *bios_wrong_thresh = 1; - machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)); + /* Save default threshold for each bank */ + if (cmci_threshold[bank] == 0) + cmci_threshold[bank] = val & MCI_CTL2_CMCI_THRESHOLD_MASK; } /* * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks * on this CPU. Use the algorithm recommended in the SDM to discover shared - * banks. + * banks. Called during initial bootstrap, and also for hotplug CPU operations + * to rediscover/reassign machine check banks. */ static void cmci_discover(int banks) { - unsigned long *owned = (void *)this_cpu_ptr(&mce_banks_owned); + int bios_wrong_thresh = 0; unsigned long flags; int i; - int bios_wrong_thresh = 0; raw_spin_lock_irqsave(&cmci_discover_lock, flags); for (i = 0; i < banks; i++) { u64 val; int bios_zero_thresh = 0; - if (test_bit(i, owned)) + if (cmci_skip_bank(i, &val)) continue; - /* Skip banks in firmware first mode */ - if (test_bit(i, mce_banks_ce_disabled)) - continue; - - rdmsrl(MSR_IA32_MCx_CTL2(i), val); - - /* Already owned by someone else? 
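/*
 * Illustrative aside (not part of the patch above): sketch of the ownership
 * test in cmci_claim_bank() -- set the enable bit, read the register back,
 * and treat a bit that did not stick as "this bank has no CMCI, poll it
 * instead". An inherited storm is recognised by finding the storm threshold
 * already programmed by the previous owner. MSRs are again modelled as
 * plain variables for illustration.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MCI_CTL2_CMCI_EN		(1ull << 30)
#define MCI_CTL2_CMCI_THRESHOLD_MASK	0x7fffull
#define CMCI_STORM_THRESHOLD		32749

static bool claim_bank(uint64_t *ctl2, bool cmci_capable)
{
	uint64_t val = *ctl2 | MCI_CTL2_CMCI_EN;

	if (cmci_capable)		/* the "wrmsr" only sticks on capable banks */
		*ctl2 = val;

	if (!(*ctl2 & MCI_CTL2_CMCI_EN))
		return false;		/* poll-only bank */

	if ((*ctl2 & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD)
		printf("inherited storm from previous owner\n");
	return true;
}

int main(void)
{
	uint64_t bank0 = CMCI_STORM_THRESHOLD, bank1 = 0;

	printf("bank0 claimed: %d\n", claim_bank(&bank0, true));
	printf("bank1 claimed: %d\n", claim_bank(&bank1, false));
	return 0;
}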
*/ - if (val & MCI_CTL2_CMCI_EN) { - clear_bit(i, owned); - __clear_bit(i, this_cpu_ptr(mce_poll_banks)); - continue; - } - - if (!mca_cfg.bios_cmci_threshold) { - val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; - val |= CMCI_THRESHOLD; - } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) { - /* - * If bios_cmci_threshold boot option was specified - * but the threshold is zero, we'll try to initialize - * it to 1. - */ - bios_zero_thresh = 1; - val |= CMCI_THRESHOLD; - } - - val |= MCI_CTL2_CMCI_EN; - wrmsrl(MSR_IA32_MCx_CTL2(i), val); - rdmsrl(MSR_IA32_MCx_CTL2(i), val); - - /* Did the enable bit stick? -- the bank supports CMCI */ - if (val & MCI_CTL2_CMCI_EN) { - set_bit(i, owned); - __clear_bit(i, this_cpu_ptr(mce_poll_banks)); - /* - * We are able to set thresholds for some banks that - * had a threshold of 0. This means the BIOS has not - * set the thresholds properly or does not work with - * this boot option. Note down now and report later. - */ - if (mca_cfg.bios_cmci_threshold && bios_zero_thresh && - (val & MCI_CTL2_CMCI_THRESHOLD_MASK)) - bios_wrong_thresh = 1; - } else { - WARN_ON(!test_bit(i, this_cpu_ptr(mce_poll_banks))); - } + val = cmci_pick_threshold(val, &bios_zero_thresh); + cmci_claim_bank(i, val, bios_zero_thresh, &bios_wrong_thresh); } raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) { @@ -370,6 +328,9 @@ static void __cmci_disable_bank(int bank) val &= ~MCI_CTL2_CMCI_EN; wrmsrl(MSR_IA32_MCx_CTL2(bank), val); __clear_bit(bank, this_cpu_ptr(mce_banks_owned)); + + if ((val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD) + cmci_storm_end(bank); } /* @@ -495,10 +456,10 @@ static void intel_imc_init(struct cpuinfo_x86 *c) { u64 error_control; - switch (c->x86_model) { - case INTEL_FAM6_SANDYBRIDGE_X: - case INTEL_FAM6_IVYBRIDGE_X: - case INTEL_FAM6_HASWELL_X: + switch (c->x86_vfm) { + case INTEL_SANDYBRIDGE_X: + case INTEL_IVYBRIDGE_X: + case INTEL_HASWELL_X: if (rdmsrl_safe(MSR_ERROR_CONTROL, &error_control)) return; error_control |= 2; @@ -524,12 +485,11 @@ bool intel_filter_mce(struct mce *m) struct cpuinfo_x86 *c = &boot_cpu_data; /* MCE errata HSD131, HSM142, HSW131, BDM48, HSM142 and SKX37 */ - if ((c->x86 == 6) && - ((c->x86_model == INTEL_FAM6_HASWELL) || - (c->x86_model == INTEL_FAM6_HASWELL_L) || - (c->x86_model == INTEL_FAM6_BROADWELL) || - (c->x86_model == INTEL_FAM6_HASWELL_G) || - (c->x86_model == INTEL_FAM6_SKYLAKE_X)) && + if ((c->x86_vfm == INTEL_HASWELL || + c->x86_vfm == INTEL_HASWELL_L || + c->x86_vfm == INTEL_BROADWELL || + c->x86_vfm == INTEL_HASWELL_G || + c->x86_vfm == INTEL_SKYLAKE_X) && (m->bank == 0) && ((m->status & 0xa0000000ffffffff) == 0x80000000000f0005)) return true; diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index e13a26c9c0ac..95a504ece43e 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -26,13 +26,13 @@ extern struct blocking_notifier_head x86_mce_decoder_chain; struct mce_evt_llist { struct llist_node llnode; - struct mce mce; + struct mce_hw_err err; }; void mce_gen_pool_process(struct work_struct *__unused); bool mce_gen_pool_empty(void); -int mce_gen_pool_add(struct mce *mce); -int mce_gen_pool_init(void); +bool mce_gen_pool_add(struct mce_hw_err *err); +bool mce_gen_pool_init(void); struct llist_node *mce_gen_pool_prepare_records(void); int mce_severity(struct mce *a, struct pt_regs *regs, char **msg, bool is_excp); @@ -41,9 +41,7 @@ struct dentry *mce_get_debugfs_dir(void); extern 
mce_banks_t mce_banks_ce_disabled; #ifdef CONFIG_X86_MCE_INTEL -unsigned long cmci_intel_adjust_timer(unsigned long interval); -bool mce_intel_cmci_poll(void); -void mce_intel_hcpu_update(unsigned long cpu); +void mce_intel_handle_storm(int bank, bool on); void cmci_disable_bank(int bank); void intel_init_cmci(void); void intel_init_lmce(void); @@ -51,9 +49,7 @@ void intel_clear_lmce(void); bool intel_filter_mce(struct mce *m); bool intel_mce_usable_address(struct mce *m); #else -# define cmci_intel_adjust_timer mce_adjust_timer_default -static inline bool mce_intel_cmci_poll(void) { return false; } -static inline void mce_intel_hcpu_update(unsigned long cpu) { } +static inline void mce_intel_handle_storm(int bank, bool on) { } static inline void cmci_disable_bank(int bank) { } static inline void intel_init_cmci(void) { } static inline void intel_init_lmce(void) { } @@ -62,7 +58,63 @@ static inline bool intel_filter_mce(struct mce *m) { return false; } static inline bool intel_mce_usable_address(struct mce *m) { return false; } #endif -void mce_timer_kick(unsigned long interval); +void mce_timer_kick(bool storm); + +#ifdef CONFIG_X86_MCE_THRESHOLD +void cmci_storm_begin(unsigned int bank); +void cmci_storm_end(unsigned int bank); +void mce_track_storm(struct mce *mce); +void mce_inherit_storm(unsigned int bank); +bool mce_get_storm_mode(void); +void mce_set_storm_mode(bool storm); +#else +static inline void cmci_storm_begin(unsigned int bank) {} +static inline void cmci_storm_end(unsigned int bank) {} +static inline void mce_track_storm(struct mce *mce) {} +static inline void mce_inherit_storm(unsigned int bank) {} +static inline bool mce_get_storm_mode(void) { return false; } +static inline void mce_set_storm_mode(bool storm) {} +#endif + +/* + * history: Bitmask tracking errors occurrence. Each set bit + * represents an error seen. + * + * timestamp: Last time (in jiffies) that the bank was polled. + * in_storm_mode: Is this bank in storm mode? + * poll_only: Bank does not support CMCI, skip storm tracking. + */ +struct storm_bank { + u64 history; + u64 timestamp; + bool in_storm_mode; + bool poll_only; +}; + +#define NUM_HISTORY_BITS (sizeof(u64) * BITS_PER_BYTE) + +/* How many errors within the history buffer mark the start of a storm. */ +#define STORM_BEGIN_THRESHOLD 5 + +/* + * How many polls of machine check bank without an error before declaring + * the storm is over. Since it is tracked by the bitmasks in the history + * field of struct storm_bank the mask is 30 bits [0 ... 29]. + */ +#define STORM_END_POLL_THRESHOLD 29 + +/* + * banks: per-cpu, per-bank details + * stormy_bank_count: count of MC banks in storm state + * poll_mode: CPU is in poll mode + */ +struct mca_storm_desc { + struct storm_bank banks[MAX_NR_BANKS]; + u8 stormy_bank_count; + bool poll_mode; +}; + +DECLARE_PER_CPU(struct mca_storm_desc, storm_desc); #ifdef CONFIG_ACPI_APEI int apei_write_mce(struct mce *m); @@ -209,6 +261,8 @@ enum mca_msr { /* Decide whether to add MCE record to MCE event pool or filter it out. 
*/ extern bool filter_mce(struct mce *m); +void mce_prep_record_common(struct mce *m); +void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m); #ifdef CONFIG_X86_MCE_AMD extern bool amd_filter_mce(struct mce *m); diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index c4477162c07d..dac4d64dfb2a 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -12,7 +12,7 @@ #include <linux/uaccess.h> #include <asm/mce.h> -#include <asm/intel-family.h> +#include <asm/cpu_device_id.h> #include <asm/traps.h> #include <asm/insn.h> #include <asm/insn-eval.h> @@ -39,20 +39,20 @@ static struct severity { u64 mask; u64 result; unsigned char sev; - unsigned char mcgmask; - unsigned char mcgres; + unsigned short mcgmask; + unsigned short mcgres; unsigned char ser; unsigned char context; unsigned char excp; unsigned char covered; - unsigned char cpu_model; + unsigned int cpu_vfm; unsigned char cpu_minstepping; unsigned char bank_lo, bank_hi; char *msg; } severities[] = { #define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c } #define BANK_RANGE(l, h) .bank_lo = l, .bank_hi = h -#define MODEL_STEPPING(m, s) .cpu_model = m, .cpu_minstepping = s +#define VFM_STEPPING(m, s) .cpu_vfm = m, .cpu_minstepping = s #define KERNEL .context = IN_KERNEL #define USER .context = IN_USER #define KERNEL_RECOV .context = IN_KERNEL_RECOV @@ -128,7 +128,7 @@ static struct severity { MCESEV( AO, "Uncorrected Patrol Scrub Error", SER, MASK(MCI_STATUS_UC|MCI_ADDR|0xffffeff0, MCI_ADDR|0x001000c0), - MODEL_STEPPING(INTEL_FAM6_SKYLAKE_X, 4), BANK_RANGE(13, 18) + VFM_STEPPING(INTEL_SKYLAKE_X, 4), BANK_RANGE(13, 18) ), /* ignore OVER for UCNA */ @@ -174,6 +174,18 @@ static struct severity { USER ), MCESEV( + AR, "Data load error in SEAM non-root mode", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), + MCGMASK(MCG_STATUS_SEAM_NR, MCG_STATUS_SEAM_NR), + KERNEL + ), + MCESEV( + AR, "Instruction fetch error in SEAM non-root mode", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR), + MCGMASK(MCG_STATUS_SEAM_NR, MCG_STATUS_SEAM_NR), + KERNEL + ), + MCESEV( PANIC, "Data load in unrecoverable area of kernel", SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), KERNEL @@ -290,7 +302,6 @@ static noinstr int error_context(struct mce *m, struct pt_regs *regs) switch (fixup_type) { case EX_TYPE_UACCESS: - case EX_TYPE_COPY: if (!copy_user) return IN_KERNEL; m->kflags |= MCE_IN_KERNEL_COPYIN; @@ -386,7 +397,7 @@ static noinstr int mce_severity_intel(struct mce *m, struct pt_regs *regs, char continue; if (s->excp && excp != s->excp) continue; - if (s->cpu_model && boot_cpu_data.x86_model != s->cpu_model) + if (s->cpu_vfm && boot_cpu_data.x86_vfm != s->cpu_vfm) continue; if (s->cpu_minstepping && boot_cpu_data.x86_stepping < s->cpu_minstepping) continue; diff --git a/arch/x86/kernel/cpu/mce/threshold.c b/arch/x86/kernel/cpu/mce/threshold.c index ef4e7bb5fd88..f4a007616468 100644 --- a/arch/x86/kernel/cpu/mce/threshold.c +++ b/arch/x86/kernel/cpu/mce/threshold.c @@ -29,3 +29,118 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_threshold) trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR); apic_eoi(); } + +DEFINE_PER_CPU(struct mca_storm_desc, storm_desc); + +void mce_inherit_storm(unsigned int bank) +{ + struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc); + + /* + * Previous CPU owning this bank had put it into storm mode, + * but the precise history of 
that storm is unknown. Assume + * the worst (all recent polls of the bank found a valid error + * logged). This will avoid the new owner prematurely declaring + * the storm has ended. + */ + storm->banks[bank].history = ~0ull; + storm->banks[bank].timestamp = jiffies; +} + +bool mce_get_storm_mode(void) +{ + return __this_cpu_read(storm_desc.poll_mode); +} + +void mce_set_storm_mode(bool storm) +{ + __this_cpu_write(storm_desc.poll_mode, storm); +} + +static void mce_handle_storm(unsigned int bank, bool on) +{ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + mce_intel_handle_storm(bank, on); + break; + } +} + +void cmci_storm_begin(unsigned int bank) +{ + struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc); + + __set_bit(bank, this_cpu_ptr(mce_poll_banks)); + storm->banks[bank].in_storm_mode = true; + + /* + * If this is the first bank on this CPU to enter storm mode + * start polling. + */ + if (++storm->stormy_bank_count == 1) + mce_timer_kick(true); +} + +void cmci_storm_end(unsigned int bank) +{ + struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc); + + __clear_bit(bank, this_cpu_ptr(mce_poll_banks)); + storm->banks[bank].history = 0; + storm->banks[bank].in_storm_mode = false; + + /* If no banks left in storm mode, stop polling. */ + if (!--storm->stormy_bank_count) + mce_timer_kick(false); +} + +void mce_track_storm(struct mce *mce) +{ + struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc); + unsigned long now = jiffies, delta; + unsigned int shift = 1; + u64 history = 0; + + /* No tracking needed for banks that do not support CMCI */ + if (storm->banks[mce->bank].poll_only) + return; + + /* + * When a bank is in storm mode it is polled once per second and + * the history mask will record about the last minute of poll results. + * If it is not in storm mode, then the bank is only checked when + * there is a CMCI interrupt. Check how long it has been since + * this bank was last checked, and adjust the amount of "shift" + * to apply to history. + */ + if (!storm->banks[mce->bank].in_storm_mode) { + delta = now - storm->banks[mce->bank].timestamp; + shift = (delta + HZ) / HZ; + } + + /* If it has been a long time since the last poll, clear history. */ + if (shift < NUM_HISTORY_BITS) + history = storm->banks[mce->bank].history << shift; + + storm->banks[mce->bank].timestamp = now; + + /* History keeps track of corrected errors. 
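/*
 * Illustrative aside (not part of the patch above): userspace sketch of the
 * history bookkeeping in mce_track_storm() -- one bit per poll, shifted left
 * by the number of poll intervals that have elapsed, OR-ed with 1 when this
 * poll saw a corrected error. A storm begins once at least
 * STORM_BEGIN_THRESHOLD of the recorded polls saw an error, and ends when
 * the low STORM_END_POLL_THRESHOLD+1 bits are all clear. Simplified: the
 * elapsed-poll count is passed in directly instead of being derived from
 * jiffies.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define STORM_BEGIN_THRESHOLD		5
#define STORM_END_POLL_THRESHOLD	29

static uint64_t history;
static bool in_storm;

static void track_poll(unsigned int polls_since_last, bool saw_error)
{
	uint64_t h = 0;

	if (polls_since_last < 64)
		h = history << polls_since_last;
	if (saw_error)
		h |= 1;
	history = h;

	if (in_storm) {
		if (!(history & ((1ull << (STORM_END_POLL_THRESHOLD + 1)) - 1))) {
			in_storm = false;
			printf("storm subsided\n");
		}
	} else if (__builtin_popcountll(history) >= STORM_BEGIN_THRESHOLD) {
		in_storm = true;
		printf("storm detected\n");
	}
}

int main(void)
{
	for (int i = 0; i < 6; i++)
		track_poll(1, true);		/* errors on consecutive polls */
	for (int i = 0; i < 31; i++)
		track_poll(1, false);		/* quiet again */
	return 0;
}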
VAL=1 && UC=0 */ + if ((mce->status & MCI_STATUS_VAL) && mce_is_correctable(mce)) + history |= 1; + + storm->banks[mce->bank].history = history; + + if (storm->banks[mce->bank].in_storm_mode) { + if (history & GENMASK_ULL(STORM_END_POLL_THRESHOLD, 0)) + return; + printk_deferred(KERN_NOTICE "CPU%d BANK%d CMCI storm subsided\n", smp_processor_id(), mce->bank); + mce_handle_storm(mce->bank, false); + cmci_storm_end(mce->bank); + } else { + if (hweight64(history) < STORM_BEGIN_THRESHOLD) + return; + printk_deferred(KERN_NOTICE "CPU%d BANK%d CMCI storm detected\n", smp_processor_id(), mce->bank); + mce_handle_storm(mce->bank, true); + cmci_storm_begin(mce->bank); + } +} diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 13b45b9c806d..138689b8e1d8 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -23,17 +23,22 @@ #include <linux/earlycpio.h> #include <linux/firmware.h> +#include <linux/bsearch.h> #include <linux/uaccess.h> #include <linux/vmalloc.h> #include <linux/initrd.h> #include <linux/kernel.h> #include <linux/pci.h> +#include <crypto/sha2.h> + #include <asm/microcode.h> #include <asm/processor.h> +#include <asm/cmdline.h> #include <asm/setup.h> #include <asm/cpu.h> #include <asm/msr.h> +#include <asm/tlb.h> #include "internal.h" @@ -84,13 +89,36 @@ struct microcode_amd { unsigned int mpb[]; }; -#define PATCH_MAX_SIZE (3 * PAGE_SIZE) - static struct equiv_cpu_table { unsigned int num_entries; struct equiv_cpu_entry *entry; } equiv_table; +union zen_patch_rev { + struct { + __u32 rev : 8, + stepping : 4, + model : 4, + __reserved : 4, + ext_model : 4, + ext_fam : 8; + }; + __u32 ucode_rev; +}; + +union cpuid_1_eax { + struct { + __u32 stepping : 4, + model : 4, + family : 4, + __reserved0 : 4, + ext_model : 4, + ext_fam : 8, + __reserved1 : 4; + }; + __u32 full; +}; + /* * This points to the current valid container of microcode patches which we will * save from the initrd/builtin before jettisoning its contents. @mc is the @@ -98,7 +126,6 @@ static struct equiv_cpu_table { */ struct cont_desc { struct microcode_amd *mc; - u32 cpuid_1_eax; u32 psize; u8 *data; size_t size; @@ -111,10 +138,149 @@ struct cont_desc { static const char ucode_path[] __maybe_unused = "kernel/x86/microcode/AuthenticAMD.bin"; +/* + * This is CPUID(1).EAX on the BSP. It is used in two ways: + * + * 1. To ignore the equivalence table on Zen1 and newer. + * + * 2. To match which patches to load because the patch revision ID + * already contains the f/m/s for which the microcode is destined + * for. 
+ */ +static u32 bsp_cpuid_1_eax __ro_after_init; + +static bool sha_check = true; + +struct patch_digest { + u32 patch_id; + u8 sha256[SHA256_DIGEST_SIZE]; +}; + +#include "amd_shas.c" + +static int cmp_id(const void *key, const void *elem) +{ + struct patch_digest *pd = (struct patch_digest *)elem; + u32 patch_id = *(u32 *)key; + + if (patch_id == pd->patch_id) + return 0; + else if (patch_id < pd->patch_id) + return -1; + else + return 1; +} + +static bool need_sha_check(u32 cur_rev) +{ + switch (cur_rev >> 8) { + case 0x80012: return cur_rev <= 0x800126f; break; + case 0x80082: return cur_rev <= 0x800820f; break; + case 0x83010: return cur_rev <= 0x830107c; break; + case 0x86001: return cur_rev <= 0x860010e; break; + case 0x86081: return cur_rev <= 0x8608108; break; + case 0x87010: return cur_rev <= 0x8701034; break; + case 0x8a000: return cur_rev <= 0x8a0000a; break; + case 0xa0010: return cur_rev <= 0xa00107a; break; + case 0xa0011: return cur_rev <= 0xa0011da; break; + case 0xa0012: return cur_rev <= 0xa001243; break; + case 0xa0082: return cur_rev <= 0xa00820e; break; + case 0xa1011: return cur_rev <= 0xa101153; break; + case 0xa1012: return cur_rev <= 0xa10124e; break; + case 0xa1081: return cur_rev <= 0xa108109; break; + case 0xa2010: return cur_rev <= 0xa20102f; break; + case 0xa2012: return cur_rev <= 0xa201212; break; + case 0xa4041: return cur_rev <= 0xa404109; break; + case 0xa5000: return cur_rev <= 0xa500013; break; + case 0xa6012: return cur_rev <= 0xa60120a; break; + case 0xa7041: return cur_rev <= 0xa704109; break; + case 0xa7052: return cur_rev <= 0xa705208; break; + case 0xa7080: return cur_rev <= 0xa708009; break; + case 0xa70c0: return cur_rev <= 0xa70C009; break; + case 0xaa001: return cur_rev <= 0xaa00116; break; + case 0xaa002: return cur_rev <= 0xaa00218; break; + default: break; + } + + pr_info("You should not be seeing this. Please send the following couple of lines to x86-<at>-kernel.org\n"); + pr_info("CPUID(1).EAX: 0x%x, current revision: 0x%x\n", bsp_cpuid_1_eax, cur_rev); + return true; +} + +static bool verify_sha256_digest(u32 patch_id, u32 cur_rev, const u8 *data, unsigned int len) +{ + struct patch_digest *pd = NULL; + u8 digest[SHA256_DIGEST_SIZE]; + struct sha256_state s; + int i; + + if (x86_family(bsp_cpuid_1_eax) < 0x17 || + x86_family(bsp_cpuid_1_eax) > 0x19) + return true; + + if (!need_sha_check(cur_rev)) + return true; + + if (!sha_check) + return true; + + pd = bsearch(&patch_id, phashes, ARRAY_SIZE(phashes), sizeof(struct patch_digest), cmp_id); + if (!pd) { + pr_err("No sha256 digest for patch ID: 0x%x found\n", patch_id); + return false; + } + + sha256_init(&s); + sha256_update(&s, data, len); + sha256_final(&s, digest); + + if (memcmp(digest, pd->sha256, sizeof(digest))) { + pr_err("Patch 0x%x SHA256 digest mismatch!\n", patch_id); + + for (i = 0; i < SHA256_DIGEST_SIZE; i++) + pr_cont("0x%x ", digest[i]); + pr_info("\n"); + + return false; + } + + return true; +} + +static u32 get_patch_level(void) +{ + u32 rev, dummy __always_unused; + + native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + + return rev; +} + +static union cpuid_1_eax ucode_rev_to_cpuid(unsigned int val) +{ + union zen_patch_rev p; + union cpuid_1_eax c; + + p.ucode_rev = val; + c.full = 0; + + c.stepping = p.stepping; + c.model = p.model; + c.ext_model = p.ext_model; + c.family = 0xf; + c.ext_fam = p.ext_fam; + + return c; +} + static u16 find_equiv_id(struct equiv_cpu_table *et, u32 sig) { unsigned int i; + /* Zen and newer do not need an equivalence table. 
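/*
 * Illustrative aside (not part of the patch above): sketch of
 * ucode_rev_to_cpuid() from this hunk -- on Zen (family 0x17) and newer the
 * family/model/stepping a patch is meant for is encoded in the patch
 * revision itself, so no equivalence table is needed. The bitfield layouts
 * are copied from the union definitions added by the patch; the sample
 * revision in main() is just an example value.
 */
#include <stdint.h>
#include <stdio.h>

union zen_patch_rev {
	struct {
		uint32_t rev		: 8,
			 stepping	: 4,
			 model		: 4,
			 __reserved	: 4,
			 ext_model	: 4,
			 ext_fam	: 8;
	};
	uint32_t ucode_rev;
};

union cpuid_1_eax {
	struct {
		uint32_t stepping	: 4,
			 model		: 4,
			 family		: 4,
			 __reserved0	: 4,
			 ext_model	: 4,
			 ext_fam	: 8,
			 __reserved1	: 4;
	};
	uint32_t full;
};

static union cpuid_1_eax ucode_rev_to_cpuid(uint32_t val)
{
	union zen_patch_rev p = { .ucode_rev = val };
	union cpuid_1_eax c = { .full = 0 };

	c.stepping  = p.stepping;
	c.model     = p.model;
	c.ext_model = p.ext_model;
	c.family    = 0xf;		/* extended families always report 0xf here */
	c.ext_fam   = p.ext_fam;
	return c;
}

int main(void)
{
	union cpuid_1_eax c = ucode_rev_to_cpuid(0x0a20102d);

	printf("cpuid(1).eax equivalent: 0x%08x\n", c.full);
	return 0;
}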
*/ + if (x86_family(bsp_cpuid_1_eax) >= 0x17) + return 0; + if (!et || !et->num_entries) return 0; @@ -161,6 +327,10 @@ static bool verify_equivalence_table(const u8 *buf, size_t buf_size) if (!verify_container(buf, buf_size)) return false; + /* Zen and newer do not need an equivalence table. */ + if (x86_family(bsp_cpuid_1_eax) >= 0x17) + return true; + cont_type = hdr[1]; if (cont_type != UCODE_EQUIV_CPU_TABLE_TYPE) { pr_debug("Wrong microcode container equivalence table type: %u.\n", @@ -187,8 +357,7 @@ static bool verify_equivalence_table(const u8 *buf, size_t buf_size) * On success, @sh_psize returns the patch size according to the section header, * to the caller. */ -static bool -__verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize) +static bool __verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize) { u32 p_type, p_size; const u32 *hdr; @@ -224,12 +393,13 @@ __verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize) * exceed the per-family maximum). @sh_psize is the size read from the section * header. */ -static unsigned int __verify_patch_size(u8 family, u32 sh_psize, size_t buf_size) +static bool __verify_patch_size(u32 sh_psize, size_t buf_size) { + u8 family = x86_family(bsp_cpuid_1_eax); u32 max_size; if (family >= 0x15) - return min_t(u32, sh_psize, buf_size); + goto ret; #define F1XH_MPB_MAX_SIZE 2048 #define F14H_MPB_MAX_SIZE 1824 @@ -243,13 +413,15 @@ static unsigned int __verify_patch_size(u8 family, u32 sh_psize, size_t buf_size break; default: WARN(1, "%s: WTF family: 0x%x\n", __func__, family); - return 0; + return false; } - if (sh_psize > min_t(u32, buf_size, max_size)) - return 0; + if (sh_psize > max_size) + return false; - return sh_psize; +ret: + /* Working with the whole buffer so < is ok. */ + return sh_psize <= buf_size; } /* @@ -260,11 +432,10 @@ static unsigned int __verify_patch_size(u8 family, u32 sh_psize, size_t buf_size * positive: patch is not for this family, skip it * 0: success */ -static int -verify_patch(u8 family, const u8 *buf, size_t buf_size, u32 *patch_size) +static int verify_patch(const u8 *buf, size_t buf_size, u32 *patch_size) { + u8 family = x86_family(bsp_cpuid_1_eax); struct microcode_header_amd *mc_hdr; - unsigned int ret; u32 sh_psize; u16 proc_id; u8 patch_fam; @@ -288,8 +459,7 @@ verify_patch(u8 family, const u8 *buf, size_t buf_size, u32 *patch_size) return -1; } - ret = __verify_patch_size(family, sh_psize, buf_size); - if (!ret) { + if (!__verify_patch_size(sh_psize, buf_size)) { pr_debug("Per-family patch size mismatch.\n"); return -1; } @@ -310,10 +480,19 @@ verify_patch(u8 family, const u8 *buf, size_t buf_size, u32 *patch_size) return 0; } +static bool mc_patch_matches(struct microcode_amd *mc, u16 eq_id) +{ + /* Zen and newer do not need an equivalence table. */ + if (x86_family(bsp_cpuid_1_eax) >= 0x17) + return ucode_rev_to_cpuid(mc->hdr.patch_id).full == bsp_cpuid_1_eax; + else + return eq_id == mc->hdr.processor_rev_id; +} + /* * This scans the ucode blob for the proper container as we can have multiple - * containers glued together. Returns the equivalence ID from the equivalence - * table or 0 if none found. + * containers glued together. + * * Returns the amount of bytes consumed while scanning. @desc contains all the * data we're going to use in later stages of the application. 
*/ @@ -338,7 +517,7 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc) * doesn't contain a patch for the CPU, scan through the whole container * so that it can be skipped in case there are other containers appended. */ - eq_id = find_equiv_id(&table, desc->cpuid_1_eax); + eq_id = find_equiv_id(&table, bsp_cpuid_1_eax); buf += hdr[2] + CONTAINER_HDR_SZ; size -= hdr[2] + CONTAINER_HDR_SZ; @@ -352,7 +531,7 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc) u32 patch_size; int ret; - ret = verify_patch(x86_family(desc->cpuid_1_eax), buf, size, &patch_size); + ret = verify_patch(buf, size, &patch_size); if (ret < 0) { /* * Patch verification failed, skip to the next container, if @@ -365,7 +544,7 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc) } mc = (struct microcode_amd *)(buf + SECTION_HDR_SIZE); - if (eq_id == mc->hdr.processor_rev_id) { + if (mc_patch_matches(mc, eq_id)) { desc->psize = patch_size; desc->mc = mc; } @@ -415,59 +594,41 @@ static void scan_containers(u8 *ucode, size_t size, struct cont_desc *desc) } } -static int __apply_microcode_amd(struct microcode_amd *mc) +static bool __apply_microcode_amd(struct microcode_amd *mc, u32 *cur_rev, + unsigned int psize) { - u32 rev, dummy; - - native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc->hdr.data_code); + unsigned long p_addr = (unsigned long)&mc->hdr.data_code; - /* verify patch application was successful */ - native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - if (rev != mc->hdr.patch_id) + if (!verify_sha256_digest(mc->hdr.patch_id, *cur_rev, (const u8 *)p_addr, psize)) return -1; - return 0; -} - -/* - * Early load occurs before we can vmalloc(). So we look for the microcode - * patch container file in initrd, traverse equivalent cpu table, look for a - * matching microcode patch, and update, all in initrd memory in place. - * When vmalloc() is available for use later -- on 64-bit during first AP load, - * and on 32-bit during save_microcode_in_initrd_amd() -- we can call - * load_microcode_amd() to save equivalent cpu table and microcode patches in - * kernel heap memory. - * - * Returns true if container found (sets @desc), false otherwise. - */ -static bool early_apply_microcode(u32 cpuid_1_eax, u32 old_rev, void *ucode, size_t size) -{ - struct cont_desc desc = { 0 }; - struct microcode_amd *mc; - bool ret = false; + native_wrmsrl(MSR_AMD64_PATCH_LOADER, p_addr); - desc.cpuid_1_eax = cpuid_1_eax; + if (x86_family(bsp_cpuid_1_eax) == 0x17) { + unsigned long p_addr_end = p_addr + psize - 1; - scan_containers(ucode, size, &desc); + invlpg(p_addr); - mc = desc.mc; - if (!mc) - return ret; + /* + * Flush next page too if patch image is crossing a page + * boundary. + */ + if (p_addr >> PAGE_SHIFT != p_addr_end >> PAGE_SHIFT) + invlpg(p_addr_end); + } - /* - * Allow application of the same revision to pick up SMT-specific - * changes even if the revision of the other SMT thread is already - * up-to-date. 
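/*
 * Illustrative aside (not part of the patch above): sketch of the
 * page-crossing test added to __apply_microcode_amd() -- on family 0x17 the
 * patch image is flushed from the TLB, and if it straddles a page boundary
 * the page holding its last byte is flushed as well. PAGE_SHIFT and the
 * addresses below are illustrative; invlpg is replaced by a printf.
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12

static void flush_patch_pages(uintptr_t p_addr, unsigned int psize)
{
	uintptr_t p_addr_end = p_addr + psize - 1;

	printf("invlpg page 0x%lx\n", (unsigned long)(p_addr >> PAGE_SHIFT));

	/* Patch image crosses a page boundary: flush the next page too. */
	if ((p_addr >> PAGE_SHIFT) != (p_addr_end >> PAGE_SHIFT))
		printf("invlpg page 0x%lx\n",
		       (unsigned long)(p_addr_end >> PAGE_SHIFT));
}

int main(void)
{
	flush_patch_pages(0x100f80, 0x400);	/* crosses into the next page */
	flush_patch_pages(0x101000, 0x200);	/* fits in one page */
	return 0;
}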
- */ - if (old_rev > mc->hdr.patch_id) - return ret; + /* verify patch application was successful */ + *cur_rev = get_patch_level(); + if (*cur_rev != mc->hdr.patch_id) + return false; - return !__apply_microcode_amd(mc); + return true; } -static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family) +static bool get_builtin_microcode(struct cpio_data *cp) { char fw_name[36] = "amd-ucode/microcode_amd.bin"; + u8 family = x86_family(bsp_cpuid_1_eax); struct firmware fw; if (IS_ENABLED(CONFIG_X86_32)) @@ -486,85 +647,144 @@ static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family) return false; } -static void __init find_blobs_in_containers(unsigned int cpuid_1_eax, struct cpio_data *ret) +static bool __init find_blobs_in_containers(struct cpio_data *ret) { struct cpio_data cp; + bool found; - if (!get_builtin_microcode(&cp, x86_family(cpuid_1_eax))) + if (!get_builtin_microcode(&cp)) cp = find_microcode_in_initrd(ucode_path); - *ret = cp; + found = cp.data && cp.size; + if (found) + *ret = cp; + + return found; } +/* + * Early load occurs before we can vmalloc(). So we look for the microcode + * patch container file in initrd, traverse equivalent cpu table, look for a + * matching microcode patch, and update, all in initrd memory in place. + * When vmalloc() is available for use later -- on 64-bit during first AP load, + * and on 32-bit during save_microcode_in_initrd() -- we can call + * load_microcode_amd() to save equivalent cpu table and microcode patches in + * kernel heap memory. + */ void __init load_ucode_amd_bsp(struct early_load_data *ed, unsigned int cpuid_1_eax) { + struct cont_desc desc = { }; + struct microcode_amd *mc; struct cpio_data cp = { }; - u32 dummy; + char buf[4]; + u32 rev; + + if (cmdline_find_option(boot_command_line, "microcode.amd_sha_check", buf, 4)) { + if (!strncmp(buf, "off", 3)) { + sha_check = false; + pr_warn_once("It is a very very bad idea to disable the blobs SHA check!\n"); + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); + } + } - native_rdmsr(MSR_AMD64_PATCH_LEVEL, ed->old_rev, dummy); + bsp_cpuid_1_eax = cpuid_1_eax; + + rev = get_patch_level(); + ed->old_rev = rev; /* Needed in load_microcode_amd() */ ucode_cpu_info[0].cpu_sig.sig = cpuid_1_eax; - find_blobs_in_containers(cpuid_1_eax, &cp); - if (!(cp.data && cp.size)) + if (!find_blobs_in_containers(&cp)) return; - if (early_apply_microcode(cpuid_1_eax, ed->old_rev, cp.data, cp.size)) - native_rdmsr(MSR_AMD64_PATCH_LEVEL, ed->new_rev, dummy); -} - -static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size); - -static int __init save_microcode_in_initrd(void) -{ - unsigned int cpuid_1_eax = native_cpuid_eax(1); - struct cpuinfo_x86 *c = &boot_cpu_data; - struct cont_desc desc = { 0 }; - enum ucode_state ret; - struct cpio_data cp; - - if (dis_ucode_ldr || c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) - return 0; + scan_containers(cp.data, cp.size, &desc); - find_blobs_in_containers(cpuid_1_eax, &cp); - if (!(cp.data && cp.size)) - return -EINVAL; + mc = desc.mc; + if (!mc) + return; - desc.cpuid_1_eax = cpuid_1_eax; + /* + * Allow application of the same revision to pick up SMT-specific + * changes even if the revision of the other SMT thread is already + * up-to-date. 
+ */ + if (ed->old_rev > mc->hdr.patch_id) + return; - scan_containers(cp.data, cp.size, &desc); - if (!desc.mc) - return -EINVAL; + if (__apply_microcode_amd(mc, &rev, desc.psize)) + ed->new_rev = rev; +} - ret = load_microcode_amd(x86_family(cpuid_1_eax), desc.data, desc.size); - if (ret > UCODE_UPDATED) - return -EINVAL; +static inline bool patch_cpus_equivalent(struct ucode_patch *p, + struct ucode_patch *n, + bool ignore_stepping) +{ + /* Zen and newer hardcode the f/m/s in the patch ID */ + if (x86_family(bsp_cpuid_1_eax) >= 0x17) { + union cpuid_1_eax p_cid = ucode_rev_to_cpuid(p->patch_id); + union cpuid_1_eax n_cid = ucode_rev_to_cpuid(n->patch_id); + + if (ignore_stepping) { + p_cid.stepping = 0; + n_cid.stepping = 0; + } - return 0; + return p_cid.full == n_cid.full; + } else { + return p->equiv_cpu == n->equiv_cpu; + } } -early_initcall(save_microcode_in_initrd); /* * a small, trivial cache of per-family ucode patches */ -static struct ucode_patch *cache_find_patch(u16 equiv_cpu) +static struct ucode_patch *cache_find_patch(struct ucode_cpu_info *uci, u16 equiv_cpu) { struct ucode_patch *p; + struct ucode_patch n; + + n.equiv_cpu = equiv_cpu; + n.patch_id = uci->cpu_sig.rev; + + WARN_ON_ONCE(!n.patch_id); list_for_each_entry(p, µcode_cache, plist) - if (p->equiv_cpu == equiv_cpu) + if (patch_cpus_equivalent(p, &n, false)) return p; + return NULL; } +static inline int patch_newer(struct ucode_patch *p, struct ucode_patch *n) +{ + /* Zen and newer hardcode the f/m/s in the patch ID */ + if (x86_family(bsp_cpuid_1_eax) >= 0x17) { + union zen_patch_rev zp, zn; + + zp.ucode_rev = p->patch_id; + zn.ucode_rev = n->patch_id; + + if (zn.stepping != zp.stepping) + return -1; + + return zn.rev > zp.rev; + } else { + return n->patch_id > p->patch_id; + } +} + static void update_cache(struct ucode_patch *new_patch) { struct ucode_patch *p; + int ret; list_for_each_entry(p, µcode_cache, plist) { - if (p->equiv_cpu == new_patch->equiv_cpu) { - if (p->patch_id >= new_patch->patch_id) { + if (patch_cpus_equivalent(p, new_patch, true)) { + ret = patch_newer(p, new_patch); + if (ret < 0) + continue; + else if (!ret) { /* we already have the latest patch */ kfree(new_patch->data); kfree(new_patch); @@ -595,13 +815,17 @@ static void free_cache(void) static struct ucode_patch *find_patch(unsigned int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - u16 equiv_id; + u16 equiv_id = 0; - equiv_id = find_equiv_id(&equiv_table, uci->cpu_sig.sig); - if (!equiv_id) - return NULL; + uci->cpu_sig.rev = get_patch_level(); + + if (x86_family(bsp_cpuid_1_eax) < 0x17) { + equiv_id = find_equiv_id(&equiv_table, uci->cpu_sig.sig); + if (!equiv_id) + return NULL; + } - return cache_find_patch(equiv_id); + return cache_find_patch(uci, equiv_id); } void reload_ucode_amd(unsigned int cpu) @@ -616,22 +840,20 @@ void reload_ucode_amd(unsigned int cpu) mc = p->data; - rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - + rev = get_patch_level(); if (rev < mc->hdr.patch_id) { - if (!__apply_microcode_amd(mc)) - pr_info_once("reload revision: 0x%08x\n", mc->hdr.patch_id); + if (__apply_microcode_amd(mc, &rev, p->size)) + pr_info_once("reload revision: 0x%08x\n", rev); } } static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) { - struct cpuinfo_x86 *c = &cpu_data(cpu); struct ucode_cpu_info *uci = ucode_cpu_info + cpu; struct ucode_patch *p; csig->sig = cpuid_eax(0x00000001); - csig->rev = c->microcode; + csig->rev = get_patch_level(); /* * a patch could have been loaded early, set uci->mc so that @@ -651,7 
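/*
 * Illustrative aside (not part of the patch above): sketch of the
 * Zen-and-newer patch comparison added here -- "newer" means a higher rev
 * field, and only when the stepping encoded in the two patch IDs is
 * identical; a stepping mismatch makes the patches incomparable. Reuses the
 * zen_patch_rev layout from the hunk above; the sample IDs are examples.
 */
#include <stdint.h>
#include <stdio.h>

union zen_patch_rev {
	struct {
		uint32_t rev		: 8,
			 stepping	: 4,
			 model		: 4,
			 __reserved	: 4,
			 ext_model	: 4,
			 ext_fam	: 8;
	};
	uint32_t ucode_rev;
};

/* -1: different stepping, 0: not newer, 1: newer */
static int patch_newer(uint32_t old_id, uint32_t new_id)
{
	union zen_patch_rev zp = { .ucode_rev = old_id };
	union zen_patch_rev zn = { .ucode_rev = new_id };

	if (zn.stepping != zp.stepping)
		return -1;
	return zn.rev > zp.rev;
}

int main(void)
{
	printf("%d\n", patch_newer(0x0a20102d, 0x0a201030));	/* newer */
	printf("%d\n", patch_newer(0x0a20102d, 0x0a20102c));	/* not newer */
	return 0;
}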
+873,7 @@ static enum ucode_state apply_microcode_amd(int cpu) struct ucode_cpu_info *uci; struct ucode_patch *p; enum ucode_state ret; - u32 rev, dummy __always_unused; + u32 rev; BUG_ON(raw_smp_processor_id() != cpu); @@ -661,18 +883,18 @@ static enum ucode_state apply_microcode_amd(int cpu) if (!p) return UCODE_NFOUND; + rev = uci->cpu_sig.rev; + mc_amd = p->data; uci->mc = p->data; - rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - /* need to apply patch? */ if (rev > mc_amd->hdr.patch_id) { ret = UCODE_OK; goto out; } - if (__apply_microcode_amd(mc_amd)) { + if (!__apply_microcode_amd(mc_amd, &rev, p->size)) { pr_err("CPU%d: update failed for patch_level=0x%08x\n", cpu, mc_amd->hdr.patch_id); return UCODE_ERROR; @@ -711,6 +933,10 @@ static size_t install_equiv_cpu_table(const u8 *buf, size_t buf_size) hdr = (const u32 *)buf; equiv_tbl_len = hdr[2]; + /* Zen and newer do not need an equivalence table. */ + if (x86_family(bsp_cpuid_1_eax) >= 0x17) + goto out; + equiv_table.entry = vmalloc(equiv_tbl_len); if (!equiv_table.entry) { pr_err("failed to allocate equivalent CPU table\n"); @@ -720,12 +946,16 @@ static size_t install_equiv_cpu_table(const u8 *buf, size_t buf_size) memcpy(equiv_table.entry, buf + CONTAINER_HDR_SZ, equiv_tbl_len); equiv_table.num_entries = equiv_tbl_len / sizeof(struct equiv_cpu_entry); +out: /* add header length */ return equiv_tbl_len + CONTAINER_HDR_SZ; } static void free_equiv_cpu_table(void) { + if (x86_family(bsp_cpuid_1_eax) >= 0x17) + return; + vfree(equiv_table.entry); memset(&equiv_table, 0, sizeof(equiv_table)); } @@ -751,7 +981,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover, u16 proc_id; int ret; - ret = verify_patch(family, fw, leftover, patch_size); + ret = verify_patch(fw, leftover, patch_size); if (ret) return ret; @@ -776,7 +1006,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover, patch->patch_id = mc_hdr->patch_id; patch->equiv_cpu = proc_id; - pr_debug("%s: Added patch_id: 0x%08x, proc_id: 0x%04x\n", + pr_debug("%s: Adding patch_id: 0x%08x, proc_id: 0x%04x\n", __func__, patch->patch_id, proc_id); /* ... and add to cache. */ @@ -786,8 +1016,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover, } /* Scan the blob in @data and add microcode patches to the cache. 
*/ -static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, - size_t size) +static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, size_t size) { u8 *fw = (u8 *)data; size_t offset; @@ -820,23 +1049,32 @@ static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, return UCODE_OK; } -static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size) +static enum ucode_state _load_microcode_amd(u8 family, const u8 *data, size_t size) { - struct cpuinfo_x86 *c; - unsigned int nid, cpu; - struct ucode_patch *p; enum ucode_state ret; /* free old equiv table */ free_equiv_cpu_table(); ret = __load_microcode_amd(family, data, size); - if (ret != UCODE_OK) { + if (ret != UCODE_OK) cleanup(); + + return ret; +} + +static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size) +{ + struct cpuinfo_x86 *c; + unsigned int nid, cpu; + struct ucode_patch *p; + enum ucode_state ret; + + ret = _load_microcode_amd(family, data, size); + if (ret != UCODE_OK) return ret; - } - for_each_node(nid) { + for_each_node_with_cpus(nid) { cpu = cpumask_first(cpumask_of_node(nid)); c = &cpu_data(cpu); @@ -853,6 +1091,32 @@ static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t siz return ret; } +static int __init save_microcode_in_initrd(void) +{ + unsigned int cpuid_1_eax = native_cpuid_eax(1); + struct cpuinfo_x86 *c = &boot_cpu_data; + struct cont_desc desc = { 0 }; + enum ucode_state ret; + struct cpio_data cp; + + if (dis_ucode_ldr || c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) + return 0; + + if (!find_blobs_in_containers(&cp)) + return -EINVAL; + + scan_containers(cp.data, cp.size, &desc); + if (!desc.mc) + return -EINVAL; + + ret = _load_microcode_amd(x86_family(cpuid_1_eax), desc.data, desc.size); + if (ret > UCODE_UPDATED) + return -EINVAL; + + return 0; +} +early_initcall(save_microcode_in_initrd); + /* * AMD microcode firmware naming convention, up to family 15h they are in * the legacy file: diff --git a/arch/x86/kernel/cpu/microcode/amd_shas.c b/arch/x86/kernel/cpu/microcode/amd_shas.c new file mode 100644 index 000000000000..2a1655b1fdd8 --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/amd_shas.c @@ -0,0 +1,444 @@ +/* Keep 'em sorted. 
*/ +static const struct patch_digest phashes[] = { + { 0x8001227, { + 0x99,0xc0,0x9b,0x2b,0xcc,0x9f,0x52,0x1b, + 0x1a,0x5f,0x1d,0x83,0xa1,0x6c,0xc4,0x46, + 0xe2,0x6c,0xda,0x73,0xfb,0x2d,0x23,0xa8, + 0x77,0xdc,0x15,0x31,0x33,0x4a,0x46,0x18, + } + }, + { 0x8001250, { + 0xc0,0x0b,0x6b,0x19,0xfd,0x5c,0x39,0x60, + 0xd5,0xc3,0x57,0x46,0x54,0xe4,0xd1,0xaa, + 0xa8,0xf7,0x1f,0xa8,0x6a,0x60,0x3e,0xe3, + 0x27,0x39,0x8e,0x53,0x30,0xf8,0x49,0x19, + } + }, + { 0x800126e, { + 0xf3,0x8b,0x2b,0xb6,0x34,0xe3,0xc8,0x2c, + 0xef,0xec,0x63,0x6d,0xc8,0x76,0x77,0xb3, + 0x25,0x5a,0xb7,0x52,0x8c,0x83,0x26,0xe6, + 0x4c,0xbe,0xbf,0xe9,0x7d,0x22,0x6a,0x43, + } + }, + { 0x800126f, { + 0x2b,0x5a,0xf2,0x9c,0xdd,0xd2,0x7f,0xec, + 0xec,0x96,0x09,0x57,0xb0,0x96,0x29,0x8b, + 0x2e,0x26,0x91,0xf0,0x49,0x33,0x42,0x18, + 0xdd,0x4b,0x65,0x5a,0xd4,0x15,0x3d,0x33, + } + }, + { 0x800820d, { + 0x68,0x98,0x83,0xcd,0x22,0x0d,0xdd,0x59, + 0x73,0x2c,0x5b,0x37,0x1f,0x84,0x0e,0x67, + 0x96,0x43,0x83,0x0c,0x46,0x44,0xab,0x7c, + 0x7b,0x65,0x9e,0x57,0xb5,0x90,0x4b,0x0e, + } + }, + { 0x8301025, { + 0xe4,0x7d,0xdb,0x1e,0x14,0xb4,0x5e,0x36, + 0x8f,0x3e,0x48,0x88,0x3c,0x6d,0x76,0xa1, + 0x59,0xc6,0xc0,0x72,0x42,0xdf,0x6c,0x30, + 0x6f,0x0b,0x28,0x16,0x61,0xfc,0x79,0x77, + } + }, + { 0x8301055, { + 0x81,0x7b,0x99,0x1b,0xae,0x2d,0x4f,0x9a, + 0xef,0x13,0xce,0xb5,0x10,0xaf,0x6a,0xea, + 0xe5,0xb0,0x64,0x98,0x10,0x68,0x34,0x3b, + 0x9d,0x7a,0xd6,0x22,0x77,0x5f,0xb3,0x5b, + } + }, + { 0x8301072, { + 0xcf,0x76,0xa7,0x1a,0x49,0xdf,0x2a,0x5e, + 0x9e,0x40,0x70,0xe5,0xdd,0x8a,0xa8,0x28, + 0x20,0xdc,0x91,0xd8,0x2c,0xa6,0xa0,0xb1, + 0x2d,0x22,0x26,0x94,0x4b,0x40,0x85,0x30, + } + }, + { 0x830107a, { + 0x2a,0x65,0x8c,0x1a,0x5e,0x07,0x21,0x72, + 0xdf,0x90,0xa6,0x51,0x37,0xd3,0x4b,0x34, + 0xc4,0xda,0x03,0xe1,0x8a,0x6c,0xfb,0x20, + 0x04,0xb2,0x81,0x05,0xd4,0x87,0xf4,0x0a, + } + }, + { 0x830107b, { + 0xb3,0x43,0x13,0x63,0x56,0xc1,0x39,0xad, + 0x10,0xa6,0x2b,0xcc,0x02,0xe6,0x76,0x2a, + 0x1e,0x39,0x58,0x3e,0x23,0x6e,0xa4,0x04, + 0x95,0xea,0xf9,0x6d,0xc2,0x8a,0x13,0x19, + } + }, + { 0x830107c, { + 0x21,0x64,0xde,0xfb,0x9f,0x68,0x96,0x47, + 0x70,0x5c,0xe2,0x8f,0x18,0x52,0x6a,0xac, + 0xa4,0xd2,0x2e,0xe0,0xde,0x68,0x66,0xc3, + 0xeb,0x1e,0xd3,0x3f,0xbc,0x51,0x1d,0x38, + } + }, + { 0x860010d, { + 0x86,0xb6,0x15,0x83,0xbc,0x3b,0x9c,0xe0, + 0xb3,0xef,0x1d,0x99,0x84,0x35,0x15,0xf7, + 0x7c,0x2a,0xc6,0x42,0xdb,0x73,0x07,0x5c, + 0x7d,0xc3,0x02,0xb5,0x43,0x06,0x5e,0xf8, + } + }, + { 0x8608108, { + 0x14,0xfe,0x57,0x86,0x49,0xc8,0x68,0xe2, + 0x11,0xa3,0xcb,0x6e,0xff,0x6e,0xd5,0x38, + 0xfe,0x89,0x1a,0xe0,0x67,0xbf,0xc4,0xcc, + 0x1b,0x9f,0x84,0x77,0x2b,0x9f,0xaa,0xbd, + } + }, + { 0x8701034, { + 0xc3,0x14,0x09,0xa8,0x9c,0x3f,0x8d,0x83, + 0x9b,0x4c,0xa5,0xb7,0x64,0x8b,0x91,0x5d, + 0x85,0x6a,0x39,0x26,0x1e,0x14,0x41,0xa8, + 0x75,0xea,0xa6,0xf9,0xc9,0xd1,0xea,0x2b, + } + }, + { 0x8a00008, { + 0xd7,0x2a,0x93,0xdc,0x05,0x2f,0xa5,0x6e, + 0x0c,0x61,0x2c,0x07,0x9f,0x38,0xe9,0x8e, + 0xef,0x7d,0x2a,0x05,0x4d,0x56,0xaf,0x72, + 0xe7,0x56,0x47,0x6e,0x60,0x27,0xd5,0x8c, + } + }, + { 0x8a0000a, { + 0x73,0x31,0x26,0x22,0xd4,0xf9,0xee,0x3c, + 0x07,0x06,0xe7,0xb9,0xad,0xd8,0x72,0x44, + 0x33,0x31,0xaa,0x7d,0xc3,0x67,0x0e,0xdb, + 0x47,0xb5,0xaa,0xbc,0xf5,0xbb,0xd9,0x20, + } + }, + { 0xa00104c, { + 0x3c,0x8a,0xfe,0x04,0x62,0xd8,0x6d,0xbe, + 0xa7,0x14,0x28,0x64,0x75,0xc0,0xa3,0x76, + 0xb7,0x92,0x0b,0x97,0x0a,0x8e,0x9c,0x5b, + 0x1b,0xc8,0x9d,0x3a,0x1e,0x81,0x3d,0x3b, + } + }, + { 0xa00104e, { + 0xc4,0x35,0x82,0x67,0xd2,0x86,0xe5,0xb2, + 0xfd,0x69,0x12,0x38,0xc8,0x77,0xba,0xe0, + 
0x70,0xf9,0x77,0x89,0x10,0xa6,0x74,0x4e, + 0x56,0x58,0x13,0xf5,0x84,0x70,0x28,0x0b, + } + }, + { 0xa001053, { + 0x92,0x0e,0xf4,0x69,0x10,0x3b,0xf9,0x9d, + 0x31,0x1b,0xa6,0x99,0x08,0x7d,0xd7,0x25, + 0x7e,0x1e,0x89,0xba,0x35,0x8d,0xac,0xcb, + 0x3a,0xb4,0xdf,0x58,0x12,0xcf,0xc0,0xc3, + } + }, + { 0xa001058, { + 0x33,0x7d,0xa9,0xb5,0x4e,0x62,0x13,0x36, + 0xef,0x66,0xc9,0xbd,0x0a,0xa6,0x3b,0x19, + 0xcb,0xf5,0xc2,0xc3,0x55,0x47,0x20,0xec, + 0x1f,0x7b,0xa1,0x44,0x0e,0x8e,0xa4,0xb2, + } + }, + { 0xa001075, { + 0x39,0x02,0x82,0xd0,0x7c,0x26,0x43,0xe9, + 0x26,0xa3,0xd9,0x96,0xf7,0x30,0x13,0x0a, + 0x8a,0x0e,0xac,0xe7,0x1d,0xdc,0xe2,0x0f, + 0xcb,0x9e,0x8d,0xbc,0xd2,0xa2,0x44,0xe0, + } + }, + { 0xa001078, { + 0x2d,0x67,0xc7,0x35,0xca,0xef,0x2f,0x25, + 0x4c,0x45,0x93,0x3f,0x36,0x01,0x8c,0xce, + 0xa8,0x5b,0x07,0xd3,0xc1,0x35,0x3c,0x04, + 0x20,0xa2,0xfc,0xdc,0xe6,0xce,0x26,0x3e, + } + }, + { 0xa001079, { + 0x43,0xe2,0x05,0x9c,0xfd,0xb7,0x5b,0xeb, + 0x5b,0xe9,0xeb,0x3b,0x96,0xf4,0xe4,0x93, + 0x73,0x45,0x3e,0xac,0x8d,0x3b,0xe4,0xdb, + 0x10,0x31,0xc1,0xe4,0xa2,0xd0,0x5a,0x8a, + } + }, + { 0xa00107a, { + 0x5f,0x92,0xca,0xff,0xc3,0x59,0x22,0x5f, + 0x02,0xa0,0x91,0x3b,0x4a,0x45,0x10,0xfd, + 0x19,0xe1,0x8a,0x6d,0x9a,0x92,0xc1,0x3f, + 0x75,0x78,0xac,0x78,0x03,0x1d,0xdb,0x18, + } + }, + { 0xa001143, { + 0x56,0xca,0xf7,0x43,0x8a,0x4c,0x46,0x80, + 0xec,0xde,0xe5,0x9c,0x50,0x84,0x9a,0x42, + 0x27,0xe5,0x51,0x84,0x8f,0x19,0xc0,0x8d, + 0x0c,0x25,0xb4,0xb0,0x8f,0x10,0xf3,0xf8, + } + }, + { 0xa001144, { + 0x42,0xd5,0x9b,0xa7,0xd6,0x15,0x29,0x41, + 0x61,0xc4,0x72,0x3f,0xf3,0x06,0x78,0x4b, + 0x65,0xf3,0x0e,0xfa,0x9c,0x87,0xde,0x25, + 0xbd,0xb3,0x9a,0xf4,0x75,0x13,0x53,0xdc, + } + }, + { 0xa00115d, { + 0xd4,0xc4,0x49,0x36,0x89,0x0b,0x47,0xdd, + 0xfb,0x2f,0x88,0x3b,0x5f,0xf2,0x8e,0x75, + 0xc6,0x6c,0x37,0x5a,0x90,0x25,0x94,0x3e, + 0x36,0x9c,0xae,0x02,0x38,0x6c,0xf5,0x05, + } + }, + { 0xa001173, { + 0x28,0xbb,0x9b,0xd1,0xa0,0xa0,0x7e,0x3a, + 0x59,0x20,0xc0,0xa9,0xb2,0x5c,0xc3,0x35, + 0x53,0x89,0xe1,0x4c,0x93,0x2f,0x1d,0xc3, + 0xe5,0xf7,0xf3,0xc8,0x9b,0x61,0xaa,0x9e, + } + }, + { 0xa0011a8, { + 0x97,0xc6,0x16,0x65,0x99,0xa4,0x85,0x3b, + 0xf6,0xce,0xaa,0x49,0x4a,0x3a,0xc5,0xb6, + 0x78,0x25,0xbc,0x53,0xaf,0x5d,0xcf,0xf4, + 0x23,0x12,0xbb,0xb1,0xbc,0x8a,0x02,0x2e, + } + }, + { 0xa0011ce, { + 0xcf,0x1c,0x90,0xa3,0x85,0x0a,0xbf,0x71, + 0x94,0x0e,0x80,0x86,0x85,0x4f,0xd7,0x86, + 0xae,0x38,0x23,0x28,0x2b,0x35,0x9b,0x4e, + 0xfe,0xb8,0xcd,0x3d,0x3d,0x39,0xc9,0x6a, + } + }, + { 0xa0011d1, { + 0xdf,0x0e,0xca,0xde,0xf6,0xce,0x5c,0x1e, + 0x4c,0xec,0xd7,0x71,0x83,0xcc,0xa8,0x09, + 0xc7,0xc5,0xfe,0xb2,0xf7,0x05,0xd2,0xc5, + 0x12,0xdd,0xe4,0xf3,0x92,0x1c,0x3d,0xb8, + } + }, + { 0xa0011d3, { + 0x91,0xe6,0x10,0xd7,0x57,0xb0,0x95,0x0b, + 0x9a,0x24,0xee,0xf7,0xcf,0x56,0xc1,0xa6, + 0x4a,0x52,0x7d,0x5f,0x9f,0xdf,0xf6,0x00, + 0x65,0xf7,0xea,0xe8,0x2a,0x88,0xe2,0x26, + } + }, + { 0xa0011d5, { + 0xed,0x69,0x89,0xf4,0xeb,0x64,0xc2,0x13, + 0xe0,0x51,0x1f,0x03,0x26,0x52,0x7d,0xb7, + 0x93,0x5d,0x65,0xca,0xb8,0x12,0x1d,0x62, + 0x0d,0x5b,0x65,0x34,0x69,0xb2,0x62,0x21, + } + }, + { 0xa001223, { + 0xfb,0x32,0x5f,0xc6,0x83,0x4f,0x8c,0xb8, + 0xa4,0x05,0xf9,0x71,0x53,0x01,0x16,0xc4, + 0x83,0x75,0x94,0xdd,0xeb,0x7e,0xb7,0x15, + 0x8e,0x3b,0x50,0x29,0x8a,0x9c,0xcc,0x45, + } + }, + { 0xa001224, { + 0x0e,0x0c,0xdf,0xb4,0x89,0xee,0x35,0x25, + 0xdd,0x9e,0xdb,0xc0,0x69,0x83,0x0a,0xad, + 0x26,0xa9,0xaa,0x9d,0xfc,0x3c,0xea,0xf9, + 0x6c,0xdc,0xd5,0x6d,0x8b,0x6e,0x85,0x4a, + } + }, + { 0xa001227, { + 0xab,0xc6,0x00,0x69,0x4b,0x50,0x87,0xad, + 
0x5f,0x0e,0x8b,0xea,0x57,0x38,0xce,0x1d, + 0x0f,0x75,0x26,0x02,0xf6,0xd6,0x96,0xe9, + 0x87,0xb9,0xd6,0x20,0x27,0x7c,0xd2,0xe0, + } + }, + { 0xa001229, { + 0x7f,0x49,0x49,0x48,0x46,0xa5,0x50,0xa6, + 0x28,0x89,0x98,0xe2,0x9e,0xb4,0x7f,0x75, + 0x33,0xa7,0x04,0x02,0xe4,0x82,0xbf,0xb4, + 0xa5,0x3a,0xba,0x24,0x8d,0x31,0x10,0x1d, + } + }, + { 0xa00122e, { + 0x56,0x94,0xa9,0x5d,0x06,0x68,0xfe,0xaf, + 0xdf,0x7a,0xff,0x2d,0xdf,0x74,0x0f,0x15, + 0x66,0xfb,0x00,0xb5,0x51,0x97,0x9b,0xfa, + 0xcb,0x79,0x85,0x46,0x25,0xb4,0xd2,0x10, + } + }, + { 0xa001231, { + 0x0b,0x46,0xa5,0xfc,0x18,0x15,0xa0,0x9e, + 0xa6,0xdc,0xb7,0xff,0x17,0xf7,0x30,0x64, + 0xd4,0xda,0x9e,0x1b,0xc3,0xfc,0x02,0x3b, + 0xe2,0xc6,0x0e,0x41,0x54,0xb5,0x18,0xdd, + } + }, + { 0xa001234, { + 0x88,0x8d,0xed,0xab,0xb5,0xbd,0x4e,0xf7, + 0x7f,0xd4,0x0e,0x95,0x34,0x91,0xff,0xcc, + 0xfb,0x2a,0xcd,0xf7,0xd5,0xdb,0x4c,0x9b, + 0xd6,0x2e,0x73,0x50,0x8f,0x83,0x79,0x1a, + } + }, + { 0xa001236, { + 0x3d,0x30,0x00,0xb9,0x71,0xba,0x87,0x78, + 0xa8,0x43,0x55,0xc4,0x26,0x59,0xcf,0x9d, + 0x93,0xce,0x64,0x0e,0x8b,0x72,0x11,0x8b, + 0xa3,0x8f,0x51,0xe9,0xca,0x98,0xaa,0x25, + } + }, + { 0xa001238, { + 0x72,0xf7,0x4b,0x0c,0x7d,0x58,0x65,0xcc, + 0x00,0xcc,0x57,0x16,0x68,0x16,0xf8,0x2a, + 0x1b,0xb3,0x8b,0xe1,0xb6,0x83,0x8c,0x7e, + 0xc0,0xcd,0x33,0xf2,0x8d,0xf9,0xef,0x59, + } + }, + { 0xa00820c, { + 0xa8,0x0c,0x81,0xc0,0xa6,0x00,0xe7,0xf3, + 0x5f,0x65,0xd3,0xb9,0x6f,0xea,0x93,0x63, + 0xf1,0x8c,0x88,0x45,0xd7,0x82,0x80,0xd1, + 0xe1,0x3b,0x8d,0xb2,0xf8,0x22,0x03,0xe2, + } + }, + { 0xa10113e, { + 0x05,0x3c,0x66,0xd7,0xa9,0x5a,0x33,0x10, + 0x1b,0xf8,0x9c,0x8f,0xed,0xfc,0xa7,0xa0, + 0x15,0xe3,0x3f,0x4b,0x1d,0x0d,0x0a,0xd5, + 0xfa,0x90,0xc4,0xed,0x9d,0x90,0xaf,0x53, + } + }, + { 0xa101144, { + 0xb3,0x0b,0x26,0x9a,0xf8,0x7c,0x02,0x26, + 0x35,0x84,0x53,0xa4,0xd3,0x2c,0x7c,0x09, + 0x68,0x7b,0x96,0xb6,0x93,0xef,0xde,0xbc, + 0xfd,0x4b,0x15,0xd2,0x81,0xd3,0x51,0x47, + } + }, + { 0xa101148, { + 0x20,0xd5,0x6f,0x40,0x4a,0xf6,0x48,0x90, + 0xc2,0x93,0x9a,0xc2,0xfd,0xac,0xef,0x4f, + 0xfa,0xc0,0x3d,0x92,0x3c,0x6d,0x01,0x08, + 0xf1,0x5e,0xb0,0xde,0xb4,0x98,0xae,0xc4, + } + }, + { 0xa10123e, { + 0x03,0xb9,0x2c,0x76,0x48,0x93,0xc9,0x18, + 0xfb,0x56,0xfd,0xf7,0xe2,0x1d,0xca,0x4d, + 0x1d,0x13,0x53,0x63,0xfe,0x42,0x6f,0xfc, + 0x19,0x0f,0xf1,0xfc,0xa7,0xdd,0x89,0x1b, + } + }, + { 0xa101244, { + 0x71,0x56,0xb5,0x9f,0x21,0xbf,0xb3,0x3c, + 0x8c,0xd7,0x36,0xd0,0x34,0x52,0x1b,0xb1, + 0x46,0x2f,0x04,0xf0,0x37,0xd8,0x1e,0x72, + 0x24,0xa2,0x80,0x84,0x83,0x65,0x84,0xc0, + } + }, + { 0xa101248, { + 0xed,0x3b,0x95,0xa6,0x68,0xa7,0x77,0x3e, + 0xfc,0x17,0x26,0xe2,0x7b,0xd5,0x56,0x22, + 0x2c,0x1d,0xef,0xeb,0x56,0xdd,0xba,0x6e, + 0x1b,0x7d,0x64,0x9d,0x4b,0x53,0x13,0x75, + } + }, + { 0xa108108, { + 0xed,0xc2,0xec,0xa1,0x15,0xc6,0x65,0xe9, + 0xd0,0xef,0x39,0xaa,0x7f,0x55,0x06,0xc6, + 0xf5,0xd4,0x3f,0x7b,0x14,0xd5,0x60,0x2c, + 0x28,0x1e,0x9c,0x59,0x69,0x99,0x4d,0x16, + } + }, + { 0xa20102d, { + 0xf9,0x6e,0xf2,0x32,0xd3,0x0f,0x5f,0x11, + 0x59,0xa1,0xfe,0xcc,0xcd,0x9b,0x42,0x89, + 0x8b,0x89,0x2f,0xb5,0xbb,0x82,0xef,0x23, + 0x8c,0xe9,0x19,0x3e,0xcc,0x3f,0x7b,0xb4, + } + }, + { 0xa201210, { + 0xe8,0x6d,0x51,0x6a,0x8e,0x72,0xf3,0xfe, + 0x6e,0x16,0xbc,0x62,0x59,0x40,0x17,0xe9, + 0x6d,0x3d,0x0e,0x6b,0xa7,0xac,0xe3,0x68, + 0xf7,0x55,0xf0,0x13,0xbb,0x22,0xf6,0x41, + } + }, + { 0xa404107, { + 0xbb,0x04,0x4e,0x47,0xdd,0x5e,0x26,0x45, + 0x1a,0xc9,0x56,0x24,0xa4,0x4c,0x82,0xb0, + 0x8b,0x0d,0x9f,0xf9,0x3a,0xdf,0xc6,0x81, + 0x13,0xbc,0xc5,0x25,0xe4,0xc5,0xc3,0x99, + } + }, + { 0xa500011, { + 
0x23,0x3d,0x70,0x7d,0x03,0xc3,0xc4,0xf4, + 0x2b,0x82,0xc6,0x05,0xda,0x80,0x0a,0xf1, + 0xd7,0x5b,0x65,0x3a,0x7d,0xab,0xdf,0xa2, + 0x11,0x5e,0x96,0x7e,0x71,0xe9,0xfc,0x74, + } + }, + { 0xa601209, { + 0x66,0x48,0xd4,0x09,0x05,0xcb,0x29,0x32, + 0x66,0xb7,0x9a,0x76,0xcd,0x11,0xf3,0x30, + 0x15,0x86,0xcc,0x5d,0x97,0x0f,0xc0,0x46, + 0xe8,0x73,0xe2,0xd6,0xdb,0xd2,0x77,0x1d, + } + }, + { 0xa704107, { + 0xf3,0xc6,0x58,0x26,0xee,0xac,0x3f,0xd6, + 0xce,0xa1,0x72,0x47,0x3b,0xba,0x2b,0x93, + 0x2a,0xad,0x8e,0x6b,0xea,0x9b,0xb7,0xc2, + 0x64,0x39,0x71,0x8c,0xce,0xe7,0x41,0x39, + } + }, + { 0xa705206, { + 0x8d,0xc0,0x76,0xbd,0x58,0x9f,0x8f,0xa4, + 0x12,0x9d,0x21,0xfb,0x48,0x21,0xbc,0xe7, + 0x67,0x6f,0x04,0x18,0xae,0x20,0x87,0x4b, + 0x03,0x35,0xe9,0xbe,0xfb,0x06,0xdf,0xfc, + } + }, + { 0xa708007, { + 0x6b,0x76,0xcc,0x78,0xc5,0x8a,0xa3,0xe3, + 0x32,0x2d,0x79,0xe4,0xc3,0x80,0xdb,0xb2, + 0x07,0xaa,0x3a,0xe0,0x57,0x13,0x72,0x80, + 0xdf,0x92,0x73,0x84,0x87,0x3c,0x73,0x93, + } + }, + { 0xa70c005, { + 0x88,0x5d,0xfb,0x79,0x64,0xd8,0x46,0x3b, + 0x4a,0x83,0x8e,0x77,0x7e,0xcf,0xb3,0x0f, + 0x1f,0x1f,0xf1,0x97,0xeb,0xfe,0x56,0x55, + 0xee,0x49,0xac,0xe1,0x8b,0x13,0xc5,0x13, + } + }, + { 0xaa00116, { + 0xe8,0x4c,0x2c,0x88,0xa1,0xac,0x24,0x63, + 0x65,0xe5,0xaa,0x2d,0x16,0xa9,0xc3,0xf5, + 0xfe,0x1d,0x5e,0x65,0xc7,0xaa,0x92,0x4d, + 0x91,0xee,0x76,0xbb,0x4c,0x66,0x78,0xc9, + } + }, + { 0xaa00212, { + 0xbd,0x57,0x5d,0x0a,0x0a,0x30,0xc1,0x75, + 0x95,0x58,0x5e,0x93,0x02,0x28,0x43,0x71, + 0xed,0x42,0x29,0xc8,0xec,0x34,0x2b,0xb2, + 0x1a,0x65,0x4b,0xfe,0x07,0x0f,0x34,0xa1, + } + }, + { 0xaa00213, { + 0xed,0x58,0xb7,0x76,0x81,0x7f,0xd9,0x3a, + 0x1a,0xff,0x8b,0x34,0xb8,0x4a,0x99,0x0f, + 0x28,0x49,0x6c,0x56,0x2b,0xdc,0xb7,0xed, + 0x96,0xd5,0x9d,0xc1,0x7a,0xd4,0x51,0x9b, + } + }, + { 0xaa00215, { + 0x55,0xd3,0x28,0xcb,0x87,0xa9,0x32,0xe9, + 0x4e,0x85,0x4b,0x7c,0x6b,0xd5,0x7c,0xd4, + 0x1b,0x51,0x71,0x3a,0x0e,0x0b,0xdc,0x9b, + 0x68,0x2f,0x46,0xee,0xfe,0xc6,0x6d,0xef, + } + }, +}; diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 232026a239a6..b3658d11e7b6 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -60,11 +60,6 @@ module_param(force_minrev, bool, S_IRUSR | S_IWUSR); */ struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; -struct cpu_info_ctx { - struct cpu_signature *cpu_sig; - int err; -}; - /* * Those patch levels cannot be updated to newer ones and thus should be final. */ diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 070426b9895f..f3d534807d91 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -21,7 +21,7 @@ #include <linux/uio.h> #include <linux/mm.h> -#include <asm/intel-family.h> +#include <asm/cpu_device_id.h> #include <asm/processor.h> #include <asm/tlbflush.h> #include <asm/setup.h> @@ -319,12 +319,6 @@ static enum ucode_state __apply_microcode(struct ucode_cpu_info *uci, return UCODE_OK; } - /* - * Writeback and invalidate caches before updating microcode to avoid - * internal issues depending on what the microcode is updating. 
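The phashes[] table above pairs an AMD patch level with a SHA-256 digest and is kept sorted by patch id. A hedged sketch of how such a table can be consulted; the struct layout, the helper names and the bsearch()-plus-sha256() combination are inferred from the initializers and the "Keep 'em sorted" comment, not taken from the patch itself:

#include <linux/bsearch.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/types.h>
#include <crypto/sha2.h>

/* Assumed layout matching the { id, { bytes } } initializers above. */
struct patch_digest {
	u32 patch_id;
	u8 sha256[SHA256_DIGEST_SIZE];
};

static int cmp_id(const void *key, const void *elem)
{
	const struct patch_digest *pd = elem;
	u32 patch_id = *(const u32 *)key;

	if (patch_id == pd->patch_id)
		return 0;
	return patch_id < pd->patch_id ? -1 : 1;
}

static bool digest_matches(u32 patch_id, const void *data, unsigned int len)
{
	u8 digest[SHA256_DIGEST_SIZE];
	const struct patch_digest *pd;

	/* "Keep 'em sorted" keeps this bsearch() valid. */
	pd = bsearch(&patch_id, phashes, ARRAY_SIZE(phashes), sizeof(*pd), cmp_id);
	if (!pd)
		return false;

	sha256(data, len, digest);
	return !memcmp(digest, pd->sha256, sizeof(digest));
}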
- */ - native_wbinvd(); - /* write microcode via MSR 0x79 */ native_wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); @@ -370,14 +364,14 @@ static __init struct microcode_intel *get_microcode_blob(struct ucode_cpu_info * { struct cpio_data cp; + intel_collect_cpu_info(&uci->cpu_sig); + if (!load_builtin_intel_microcode(&cp)) cp = find_microcode_in_initrd(ucode_path); if (!(cp.data && cp.size)) return NULL; - intel_collect_cpu_info(&uci->cpu_sig); - return scan_microcode(cp.data, cp.size, uci, save); } @@ -410,13 +404,13 @@ void __init load_ucode_intel_bsp(struct early_load_data *ed) { struct ucode_cpu_info uci; - ed->old_rev = intel_get_microcode_revision(); - uci.mc = get_microcode_blob(&uci, false); - if (uci.mc && apply_microcode_early(&uci) == UCODE_UPDATED) - ucode_patch_va = UCODE_BSP_LOADED; + ed->old_rev = uci.cpu_sig.rev; - ed->new_rev = uci.cpu_sig.rev; + if (uci.mc && apply_microcode_early(&uci) == UCODE_UPDATED) { + ucode_patch_va = UCODE_BSP_LOADED; + ed->new_rev = uci.cpu_sig.rev; + } } void load_ucode_intel_ap(void) @@ -457,12 +451,6 @@ static enum ucode_state apply_microcode_late(int cpu) if (ret != UCODE_UPDATED && ret != UCODE_OK) return ret; - if (!cpu && uci->cpu_sig.rev != cur_rev) { - pr_info("Updated to revision 0x%x, date = %04x-%02x-%02x\n", - uci->cpu_sig.rev, mc->hdr.date & 0xffff, mc->hdr.date >> 24, - (mc->hdr.date >> 16) & 0xff); - } - cpu_data(cpu).microcode = uci->cpu_sig.rev; if (!cpu) boot_cpu_data.microcode = uci->cpu_sig.rev; @@ -580,15 +568,14 @@ static bool is_blacklisted(unsigned int cpu) /* * Late loading on model 79 with microcode revision less than 0x0b000021 * and LLC size per core bigger than 2.5MB may result in a system hang. - * This behavior is documented in item BDF90, #334165 (Intel Xeon + * This behavior is documented in item BDX90, #334165 (Intel Xeon * Processor E7-8800/4800 v4 Product Family). 
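The blacklist check that follows switches from a two-field family/model test to the packed c->x86_vfm value. A minimal sketch of the pattern (the helper name is illustrative; x86_vfm and INTEL_BROADWELL_X come from <asm/processor.h> and <asm/cpu_device_id.h> on kernels that carry this change):

#include <linux/types.h>
#include <asm/cpu_device_id.h>
#include <asm/processor.h>

static bool is_broadwell_x(struct cpuinfo_x86 *c)
{
	/* Old form: c->x86 == 6 && c->x86_model == INTEL_FAM6_BROADWELL_X.
	 * x86_vfm packs vendor, family and model, so one compare suffices. */
	return c->x86_vfm == INTEL_BROADWELL_X;
}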
*/ - if (c->x86 == 6 && - c->x86_model == INTEL_FAM6_BROADWELL_X && + if (c->x86_vfm == INTEL_BROADWELL_X && c->x86_stepping == 0x01 && llc_size_per_core > 2621440 && c->microcode < 0x0b000021) { - pr_err_once("Erratum BDF90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode); + pr_err_once("Erratum BDX90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode); pr_err_once("Please consider either early loading through initrd/built-in or a potential BIOS update.\n"); return true; } @@ -647,7 +634,7 @@ static __init void calc_llc_size_per_core(struct cpuinfo_x86 *c) { u64 llc_size = c->x86_cache_size * 1024ULL; - do_div(llc_size, c->x86_max_cores); + do_div(llc_size, topology_num_cores_per_package()); llc_size_per_core = (unsigned int)llc_size; } diff --git a/arch/x86/kernel/cpu/microcode/internal.h b/arch/x86/kernel/cpu/microcode/internal.h index 21776c529fa9..5df621752fef 100644 --- a/arch/x86/kernel/cpu/microcode/internal.h +++ b/arch/x86/kernel/cpu/microcode/internal.h @@ -100,14 +100,12 @@ extern bool force_minrev; #ifdef CONFIG_CPU_SUP_AMD void load_ucode_amd_bsp(struct early_load_data *ed, unsigned int family); void load_ucode_amd_ap(unsigned int family); -int save_microcode_in_initrd_amd(unsigned int family); void reload_ucode_amd(unsigned int cpu); struct microcode_ops *init_amd_microcode(void); void exit_amd_microcode(void); #else /* CONFIG_CPU_SUP_AMD */ static inline void load_ucode_amd_bsp(struct early_load_data *ed, unsigned int family) { } static inline void load_ucode_amd_ap(unsigned int family) { } -static inline int save_microcode_in_initrd_amd(unsigned int family) { return -EINVAL; } static inline void reload_ucode_amd(unsigned int cpu) { } static inline struct microcode_ops *init_amd_microcode(void) { return NULL; } static inline void exit_amd_microcode(void) { } diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh index 1db560ed2ca3..68f537347466 100644 --- a/arch/x86/kernel/cpu/mkcapflags.sh +++ b/arch/x86/kernel/cpu/mkcapflags.sh @@ -30,8 +30,7 @@ dump_array() # If the /* comment */ starts with a quote string, grab that. VALUE="$(echo "$i" | sed -n 's@.*/\* *\("[^"]*"\).*\*/@\1@p')" - [ -z "$VALUE" ] && VALUE="\"$NAME\"" - [ "$VALUE" = '""' ] && continue + [ ! 
"$VALUE" ] && continue # Name is uppercase, VALUE is all lowercase VALUE="$(echo "$VALUE" | tr A-Z a-z)" diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 01fa06dd06b6..f285757618fc 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -16,11 +16,10 @@ #include <linux/interrupt.h> #include <linux/irq.h> #include <linux/kexec.h> -#include <linux/i8253.h> #include <linux/random.h> #include <asm/processor.h> #include <asm/hypervisor.h> -#include <asm/hyperv-tlfs.h> +#include <hyperv/hvhdk.h> #include <asm/mshyperv.h> #include <asm/desc.h> #include <asm/idtentry.h> @@ -45,70 +44,70 @@ bool hyperv_paravisor_present __ro_after_init; EXPORT_SYMBOL_GPL(hyperv_paravisor_present); #if IS_ENABLED(CONFIG_HYPERV) -static inline unsigned int hv_get_nested_reg(unsigned int reg) +static inline unsigned int hv_get_nested_msr(unsigned int reg) { - if (hv_is_sint_reg(reg)) - return reg - HV_REGISTER_SINT0 + HV_REGISTER_NESTED_SINT0; + if (hv_is_sint_msr(reg)) + return reg - HV_X64_MSR_SINT0 + HV_X64_MSR_NESTED_SINT0; switch (reg) { - case HV_REGISTER_SIMP: - return HV_REGISTER_NESTED_SIMP; - case HV_REGISTER_SIEFP: - return HV_REGISTER_NESTED_SIEFP; - case HV_REGISTER_SVERSION: - return HV_REGISTER_NESTED_SVERSION; - case HV_REGISTER_SCONTROL: - return HV_REGISTER_NESTED_SCONTROL; - case HV_REGISTER_EOM: - return HV_REGISTER_NESTED_EOM; + case HV_X64_MSR_SIMP: + return HV_X64_MSR_NESTED_SIMP; + case HV_X64_MSR_SIEFP: + return HV_X64_MSR_NESTED_SIEFP; + case HV_X64_MSR_SVERSION: + return HV_X64_MSR_NESTED_SVERSION; + case HV_X64_MSR_SCONTROL: + return HV_X64_MSR_NESTED_SCONTROL; + case HV_X64_MSR_EOM: + return HV_X64_MSR_NESTED_EOM; default: return reg; } } -u64 hv_get_non_nested_register(unsigned int reg) +u64 hv_get_non_nested_msr(unsigned int reg) { u64 value; - if (hv_is_synic_reg(reg) && ms_hyperv.paravisor_present) + if (hv_is_synic_msr(reg) && ms_hyperv.paravisor_present) hv_ivm_msr_read(reg, &value); else rdmsrl(reg, value); return value; } -EXPORT_SYMBOL_GPL(hv_get_non_nested_register); +EXPORT_SYMBOL_GPL(hv_get_non_nested_msr); -void hv_set_non_nested_register(unsigned int reg, u64 value) +void hv_set_non_nested_msr(unsigned int reg, u64 value) { - if (hv_is_synic_reg(reg) && ms_hyperv.paravisor_present) { + if (hv_is_synic_msr(reg) && ms_hyperv.paravisor_present) { hv_ivm_msr_write(reg, value); /* Write proxy bit via wrmsl instruction */ - if (hv_is_sint_reg(reg)) + if (hv_is_sint_msr(reg)) wrmsrl(reg, value | 1 << 20); } else { wrmsrl(reg, value); } } -EXPORT_SYMBOL_GPL(hv_set_non_nested_register); +EXPORT_SYMBOL_GPL(hv_set_non_nested_msr); -u64 hv_get_register(unsigned int reg) +u64 hv_get_msr(unsigned int reg) { if (hv_nested) - reg = hv_get_nested_reg(reg); + reg = hv_get_nested_msr(reg); - return hv_get_non_nested_register(reg); + return hv_get_non_nested_msr(reg); } -EXPORT_SYMBOL_GPL(hv_get_register); +EXPORT_SYMBOL_GPL(hv_get_msr); -void hv_set_register(unsigned int reg, u64 value) +void hv_set_msr(unsigned int reg, u64 value) { if (hv_nested) - reg = hv_get_nested_reg(reg); + reg = hv_get_nested_msr(reg); - hv_set_non_nested_register(reg, value); + hv_set_non_nested_msr(reg, value); } -EXPORT_SYMBOL_GPL(hv_set_register); +EXPORT_SYMBOL_GPL(hv_set_msr); static void (*vmbus_handler)(void); static void (*hv_stimer0_handler)(void); @@ -199,8 +198,8 @@ static void hv_machine_shutdown(void) * Call hv_cpu_die() on all the CPUs, otherwise later the hypervisor * corrupts the old VP Assist Pages and can crash the kexec kernel. 
*/ - if (kexec_in_progress && hyperv_init_cpuhp > 0) - cpuhp_remove_state(hyperv_init_cpuhp); + if (kexec_in_progress) + cpuhp_remove_state(CPUHP_AP_HYPERV_ONLINE); /* The function calls stop_other_cpus(). */ native_machine_shutdown(); @@ -209,7 +208,9 @@ static void hv_machine_shutdown(void) if (kexec_in_progress) hyperv_cleanup(); } +#endif /* CONFIG_KEXEC_CORE */ +#ifdef CONFIG_CRASH_DUMP static void hv_machine_crash_shutdown(struct pt_regs *regs) { if (hv_crash_handler) @@ -221,7 +222,64 @@ static void hv_machine_crash_shutdown(struct pt_regs *regs) /* Disable the hypercall page when there is only 1 active CPU. */ hyperv_cleanup(); } -#endif /* CONFIG_KEXEC_CORE */ +#endif /* CONFIG_CRASH_DUMP */ + +static u64 hv_ref_counter_at_suspend; +static void (*old_save_sched_clock_state)(void); +static void (*old_restore_sched_clock_state)(void); + +/* + * Hyper-V clock counter resets during hibernation. Save and restore clock + * offset during suspend/resume, while also considering the time passed + * before suspend. This is to make sure that sched_clock using hv tsc page + * based clocksource, proceeds from where it left off during suspend and + * it shows correct time for the timestamps of kernel messages after resume. + */ +static void save_hv_clock_tsc_state(void) +{ + hv_ref_counter_at_suspend = hv_read_reference_counter(); +} + +static void restore_hv_clock_tsc_state(void) +{ + /* + * Adjust the offsets used by hv tsc clocksource to + * account for the time spent before hibernation. + * adjusted value = reference counter (time) at suspend + * - reference counter (time) now. + */ + hv_adj_sched_clock_offset(hv_ref_counter_at_suspend - hv_read_reference_counter()); +} + +/* + * Functions to override save_sched_clock_state and restore_sched_clock_state + * functions of x86_platform. The Hyper-V clock counter is reset during + * suspend-resume and the offset used to measure time needs to be + * corrected, post resume. 
+ */ +static void hv_save_sched_clock_state(void) +{ + old_save_sched_clock_state(); + save_hv_clock_tsc_state(); +} + +static void hv_restore_sched_clock_state(void) +{ + restore_hv_clock_tsc_state(); + old_restore_sched_clock_state(); +} + +static void __init x86_setup_ops_for_tsc_pg_clock(void) +{ + if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE)) + return; + + old_save_sched_clock_state = x86_platform.save_sched_clock_state; + x86_platform.save_sched_clock_state = hv_save_sched_clock_state; + + old_restore_sched_clock_state = x86_platform.restore_sched_clock_state; + x86_platform.restore_sched_clock_state = hv_restore_sched_clock_state; +} #endif /* CONFIG_HYPERV */ static uint32_t __init ms_hyperv_platform(void) @@ -350,13 +408,24 @@ static void __init reduced_hw_init(void) x86_init.irqs.pre_vector_init = x86_init_noop; } +int hv_get_hypervisor_version(union hv_hypervisor_version_info *info) +{ + unsigned int hv_max_functions; + + hv_max_functions = cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS); + if (hv_max_functions < HYPERV_CPUID_VERSION) { + pr_err("%s: Could not detect Hyper-V version\n", __func__); + return -ENODEV; + } + + cpuid(HYPERV_CPUID_VERSION, &info->eax, &info->ebx, &info->ecx, &info->edx); + + return 0; +} + static void __init ms_hyperv_init_platform(void) { int hv_max_functions_eax; - int hv_host_info_eax; - int hv_host_info_ebx; - int hv_host_info_ecx; - int hv_host_info_edx; #ifdef CONFIG_PARAVIRT pv_info.name = "Hyper-V"; @@ -407,25 +476,11 @@ static void __init ms_hyperv_init_platform(void) pr_info("Hyper-V: running on a nested hypervisor\n"); } - /* - * Extract host information. - */ - if (hv_max_functions_eax >= HYPERV_CPUID_VERSION) { - hv_host_info_eax = cpuid_eax(HYPERV_CPUID_VERSION); - hv_host_info_ebx = cpuid_ebx(HYPERV_CPUID_VERSION); - hv_host_info_ecx = cpuid_ecx(HYPERV_CPUID_VERSION); - hv_host_info_edx = cpuid_edx(HYPERV_CPUID_VERSION); - - pr_info("Hyper-V: Host Build %d.%d.%d.%d-%d-%d\n", - hv_host_info_ebx >> 16, hv_host_info_ebx & 0xFFFF, - hv_host_info_eax, hv_host_info_edx & 0xFFFFFF, - hv_host_info_ecx, hv_host_info_edx >> 24); - } - if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS && ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) { x86_platform.calibrate_tsc = hv_get_tsc_khz; x86_platform.calibrate_cpu = hv_get_tsc_khz; + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); } if (ms_hyperv.priv_high & HV_ISOLATION) { @@ -451,10 +506,24 @@ static void __init ms_hyperv_init_platform(void) ms_hyperv.hints &= ~HV_X64_APIC_ACCESS_RECOMMENDED; if (!ms_hyperv.paravisor_present) { - /* To be supported: more work is required. */ + /* + * Mark the Hyper-V TSC page feature as disabled + * in a TDX VM without paravisor so that the + * Invariant TSC, which is a better clocksource + * anyway, is used instead. + */ ms_hyperv.features &= ~HV_MSR_REFERENCE_TSC_AVAILABLE; - /* HV_REGISTER_CRASH_CTL is unsupported. */ + /* + * The Invariant TSC is expected to be available + * in a TDX VM without paravisor, but if not, + * print a warning message. The slower Hyper-V MSR-based + * Ref Counter should end up being the clocksource. + */ + if (!(ms_hyperv.features & HV_ACCESS_TSC_INVARIANT)) + pr_warn("Hyper-V: Invariant TSC is unavailable\n"); + + /* HV_MSR_CRASH_CTL is unsupported. */ ms_hyperv.misc_features &= ~HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; /* Don't trust Hyper-V's TLB-flushing hypercalls. 
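The two wrappers above preserve the original x86_platform hooks and bracket them with the Hyper-V reference-counter bookkeeping. A worked numeric example of the resulting order across hibernation (the counter values are made up; the reference counter ticks in 100 ns units):

/*
 * suspend:  old_save_sched_clock_state()        -> generic TSC state saved
 *           hv_read_reference_counter() == 10,000,000,000 (1000 s), stashed
 *
 * resume:   hv_read_reference_counter() == 30,000,000 (3 s, counter reset)
 *           hv_adj_sched_clock_offset(10,000,000,000 - 30,000,000)
 *           old_restore_sched_clock_state()
 *
 * Net effect: sched_clock() continues from roughly 1000 s instead of
 * jumping back to 3 s, so post-resume printk timestamps stay monotonic.
 */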
*/ @@ -495,10 +564,14 @@ static void __init ms_hyperv_init_platform(void) no_timer_check = 1; #endif -#if IS_ENABLED(CONFIG_HYPERV) && defined(CONFIG_KEXEC_CORE) +#if IS_ENABLED(CONFIG_HYPERV) +#if defined(CONFIG_KEXEC_CORE) machine_ops.shutdown = hv_machine_shutdown; +#endif +#if defined(CONFIG_CRASH_DUMP) machine_ops.crash_shutdown = hv_machine_crash_shutdown; #endif +#endif if (ms_hyperv.features & HV_ACCESS_TSC_INVARIANT) { /* * Writing to synthetic MSR 0x40000118 updates/changes the @@ -520,16 +593,6 @@ static void __init ms_hyperv_init_platform(void) if (efi_enabled(EFI_BOOT)) x86_platform.get_nmi_reason = hv_get_nmi_reason; - /* - * Hyper-V VMs have a PIT emulation quirk such that zeroing the - * counter register during PIT shutdown restarts the PIT. So it - * continues to interrupt @18.2 HZ. Setting i8253_clear_counter - * to false tells pit_shutdown() not to zero the counter so that - * the PIT really is shutdown. Generation 2 VMs don't have a PIT, - * and setting this value has no effect. - */ - i8253_clear_counter_on_shutdown = false; - #if IS_ENABLED(CONFIG_HYPERV) if ((hv_get_isolation_type() == HV_ISOLATION_TYPE_VBS) || ms_hyperv.paravisor_present) @@ -539,19 +602,18 @@ static void __init ms_hyperv_init_platform(void) */ x86_platform.apic_post_init = hyperv_init; hyperv_setup_mmu_ops(); - /* Setup the IDT for hypervisor callback */ - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_hyperv_callback); - /* Setup the IDT for reenlightenment notifications */ + /* Install system interrupt handler for hypervisor callback */ + sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_hyperv_callback); + + /* Install system interrupt handler for reenlightenment notifications */ if (ms_hyperv.features & HV_ACCESS_REENLIGHTENMENT) { - alloc_intr_gate(HYPERV_REENLIGHTENMENT_VECTOR, - asm_sysvec_hyperv_reenlightenment); + sysvec_install(HYPERV_REENLIGHTENMENT_VECTOR, sysvec_hyperv_reenlightenment); } - /* Setup the IDT for stimer0 */ + /* Install system interrupt handler for stimer0 */ if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE) { - alloc_intr_gate(HYPERV_STIMER0_VECTOR, - asm_sysvec_hyperv_stimer0); + sysvec_install(HYPERV_STIMER0_VECTOR, sysvec_hyperv_stimer0); } # ifdef CONFIG_SMP @@ -574,6 +636,7 @@ static void __init ms_hyperv_init_platform(void) /* Register Hyper-V specific clocksource */ hv_init_clocksource(); + x86_setup_ops_for_tsc_pg_clock(); hv_vtl_init_platform(); #endif /* @@ -643,6 +706,7 @@ const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = { .init.x2apic_available = ms_hyperv_x2apic_available, .init.msi_ext_dest_id = ms_hyperv_msi_ext_dest_id, .init.init_platform = ms_hyperv_init_platform, + .init.guest_late_init = ms_hyperv_late_init, #ifdef CONFIG_AMD_MEM_ENCRYPT .runtime.sev_es_hcall_prepare = hv_sev_es_hcall_prepare, .runtime.sev_es_hcall_finish = hv_sev_es_hcall_finish, diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 2d6aa5d2e3d7..2fdfda2b60e4 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -108,6 +108,9 @@ static inline void k8_check_syscfg_dram_mod_en(void) (boot_cpu_data.x86 >= 0x0f))) return; + if (cc_platform_has(CC_ATTR_HOST_SEV_SNP)) + return; + rdmsr(MSR_AMD64_SYSCFG, lo, hi); if (lo & K8_MTRRFIXRANGE_DRAM_MODIFY) { pr_err(FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]" @@ -420,7 +423,7 @@ void __init mtrr_copy_map(void) } /** - * mtrr_overwrite_state - set static MTRR state + * guest_force_mtrr_state - set static MTRR state for a guest * * Used 
to set MTRR state via different means (e.g. with data obtained from * a hypervisor). @@ -428,9 +431,13 @@ void __init mtrr_copy_map(void) * from the x86_init.hyper.init_platform() hook. It can be called only once. * The MTRR state can't be changed afterwards. To ensure that, X86_FEATURE_MTRR * is cleared. + * + * @var: MTRR variable range array to use + * @num_var: length of the @var array + * @def_type: default caching type */ -void mtrr_overwrite_state(struct mtrr_var_range *var, unsigned int num_var, - mtrr_type def_type) +void guest_force_mtrr_state(struct mtrr_var_range *var, unsigned int num_var, + mtrr_type def_type) { unsigned int i; @@ -492,13 +499,15 @@ static u8 type_merge(u8 type, u8 new_type, u8 *uniform) /** * mtrr_type_lookup - look up memory type in MTRR * + * @start: Begin of the physical address range + * @end: End of the physical address range + * @uniform: output argument: + * - 1: the returned MTRR type is valid for the whole region + * - 0: otherwise + * * Return Values: * MTRR_TYPE_(type) - The effective MTRR type for the region * MTRR_TYPE_INVALID - MTRR is disabled - * - * Output Argument: - * uniform - Set to 1 when the returned MTRR type is valid for the whole - * region, set to 0 else. */ u8 mtrr_type_lookup(u64 start, u64 end, u8 *uniform) { diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c index 767bf1c71aad..ecbda0341a8a 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.c +++ b/arch/x86/kernel/cpu/mtrr/mtrr.c @@ -55,6 +55,12 @@ #include "mtrr.h" +static_assert(X86_MEMTYPE_UC == MTRR_TYPE_UNCACHABLE); +static_assert(X86_MEMTYPE_WC == MTRR_TYPE_WRCOMB); +static_assert(X86_MEMTYPE_WT == MTRR_TYPE_WRTHROUGH); +static_assert(X86_MEMTYPE_WP == MTRR_TYPE_WRPROT); +static_assert(X86_MEMTYPE_WB == MTRR_TYPE_WRBACK); + /* arch_phys_wc_add returns an MTRR register index plus this offset. */ #define MTRR_TO_PHYS_WC_OFFSET 1000 @@ -609,7 +615,7 @@ void mtrr_save_state(void) { int first_cpu; - if (!mtrr_enabled()) + if (!mtrr_enabled() || !mtrr_state.have_fixed) return; first_cpu = cpumask_first(cpu_online_mask); @@ -619,7 +625,7 @@ void mtrr_save_state(void) static int __init mtrr_init_finalize(void) { /* - * Map might exist if mtrr_overwrite_state() has been called or if + * Map might exist if guest_force_mtrr_state() has been called or if * mtrr_enabled() returns true. */ mtrr_copy_map(); diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index e65fae63660e..41ed01f46bd9 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -41,11 +41,11 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) "fpu_exception\t: %s\n" "cpuid level\t: %d\n" "wp\t\t: yes\n", - boot_cpu_has_bug(X86_BUG_FDIV) ? "yes" : "no", - boot_cpu_has_bug(X86_BUG_F00F) ? "yes" : "no", - boot_cpu_has_bug(X86_BUG_COMA) ? "yes" : "no", - boot_cpu_has(X86_FEATURE_FPU) ? "yes" : "no", - boot_cpu_has(X86_FEATURE_FPU) ? "yes" : "no", + str_yes_no(boot_cpu_has_bug(X86_BUG_FDIV)), + str_yes_no(boot_cpu_has_bug(X86_BUG_F00F)), + str_yes_no(boot_cpu_has_bug(X86_BUG_COMA)), + str_yes_no(boot_cpu_has(X86_FEATURE_FPU)), + str_yes_no(boot_cpu_has(X86_FEATURE_FPU)), c->cpuid_level); } #else diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c index 26a427fa84ea..eeac00d20926 100644 --- a/arch/x86/kernel/cpu/rdrand.c +++ b/arch/x86/kernel/cpu/rdrand.c @@ -6,6 +6,7 @@ * Authors: Fenghua Yu <fenghua.yu@intel.com>, * H. 
Peter Anvin <hpa@linux.intel.com> */ +#include <linux/printk.h> #include <asm/processor.h> #include <asm/archrandom.h> diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 19e0681f0435..3d1735ed8d1f 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -16,17 +16,24 @@ #define pr_fmt(fmt) "resctrl: " fmt +#include <linux/cpu.h> #include <linux/slab.h> #include <linux/err.h> -#include <linux/cacheinfo.h> #include <linux/cpuhotplug.h> -#include <asm/intel-family.h> +#include <asm/cpu_device_id.h> #include <asm/resctrl.h> #include "internal.h" -/* Mutex to protect rdtgroup access. */ -DEFINE_MUTEX(rdtgroup_mutex); +/* + * rdt_domain structures are kfree()d when their last CPU goes offline, + * and allocated when the first CPU in a new domain comes online. + * The rdt_resource's domain list is updated when this happens. Readers of + * the domain list must either take cpus_read_lock(), or rely on an RCU + * read-side critical section, to avoid observing concurrent modification. + * All writers take this mutex: + */ +static DEFINE_MUTEX(domain_list_lock); /* * The cached resctrl_pqr_state is strictly per CPU and can never be @@ -48,16 +55,12 @@ int max_name_width, max_data_width; */ bool rdt_alloc_capable; -static void -mba_wrmsr_intel(struct rdt_domain *d, struct msr_param *m, - struct rdt_resource *r); -static void -cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); -static void -mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m, - struct rdt_resource *r); +static void mba_wrmsr_intel(struct msr_param *m); +static void cat_wrmsr(struct msr_param *m); +static void mba_wrmsr_amd(struct msr_param *m); -#define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.domains) +#define ctrl_domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.ctrl_domains) +#define mon_domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.mon_domains) struct rdt_hw_resource rdt_resources_all[] = { [RDT_RESOURCE_L3] = @@ -65,8 +68,10 @@ struct rdt_hw_resource rdt_resources_all[] = { .r_resctrl = { .rid = RDT_RESOURCE_L3, .name = "L3", - .cache_level = 3, - .domains = domain_init(RDT_RESOURCE_L3), + .ctrl_scope = RESCTRL_L3_CACHE, + .mon_scope = RESCTRL_L3_CACHE, + .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_L3), + .mon_domains = mon_domain_init(RDT_RESOURCE_L3), .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", .fflags = RFTYPE_RES_CACHE, @@ -79,8 +84,8 @@ struct rdt_hw_resource rdt_resources_all[] = { .r_resctrl = { .rid = RDT_RESOURCE_L2, .name = "L2", - .cache_level = 2, - .domains = domain_init(RDT_RESOURCE_L2), + .ctrl_scope = RESCTRL_L2_CACHE, + .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_L2), .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", .fflags = RFTYPE_RES_CACHE, @@ -93,8 +98,8 @@ struct rdt_hw_resource rdt_resources_all[] = { .r_resctrl = { .rid = RDT_RESOURCE_MBA, .name = "MB", - .cache_level = 3, - .domains = domain_init(RDT_RESOURCE_MBA), + .ctrl_scope = RESCTRL_L3_CACHE, + .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_MBA), .parse_ctrlval = parse_bw, .format_str = "%d=%*u", .fflags = RFTYPE_RES_MB, @@ -105,8 +110,8 @@ struct rdt_hw_resource rdt_resources_all[] = { .r_resctrl = { .rid = RDT_RESOURCE_SMBA, .name = "SMBA", - .cache_level = 3, - .domains = domain_init(RDT_RESOURCE_SMBA), + .ctrl_scope = RESCTRL_L3_CACHE, + .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_SMBA), .parse_ctrlval = parse_bw, .format_str = "%d=%*u", .fflags = RFTYPE_RES_MB, @@ -114,6 
+119,14 @@ struct rdt_hw_resource rdt_resources_all[] = { }, }; +u32 resctrl_arch_system_num_rmid_idx(void) +{ + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + + /* RMID are independent numbers for x86. num_rmid_idx == num_rmid */ + return r->num_rmid; +} + /* * cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs * as they do not have CPUID enumeration support for Cache allocation. @@ -136,15 +149,15 @@ static inline void cache_alloc_hsw_probe(void) { struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_L3]; struct rdt_resource *r = &hw_res->r_resctrl; - u32 l, h, max_cbm = BIT_MASK(20) - 1; + u64 max_cbm = BIT_ULL_MASK(20) - 1, l3_cbm_0; - if (wrmsr_safe(MSR_IA32_L3_CBM_BASE, max_cbm, 0)) + if (wrmsrl_safe(MSR_IA32_L3_CBM_BASE, max_cbm)) return; - rdmsr(MSR_IA32_L3_CBM_BASE, l, h); + rdmsrl(MSR_IA32_L3_CBM_BASE, l3_cbm_0); /* If all the bits were set in MSR, return success */ - if (l != max_cbm) + if (l3_cbm_0 != max_cbm) return; hw_res->num_closid = 4; @@ -194,7 +207,7 @@ static inline bool rdt_get_mb_table(struct rdt_resource *r) return false; } -static bool __get_mem_config_intel(struct rdt_resource *r) +static __init bool __get_mem_config_intel(struct rdt_resource *r) { struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); union cpuid_0x10_3_eax eax; @@ -221,19 +234,19 @@ static bool __get_mem_config_intel(struct rdt_resource *r) r->membw.throttle_mode = THREAD_THROTTLE_PER_THREAD; else r->membw.throttle_mode = THREAD_THROTTLE_MAX; - thread_throttle_mode_init(); + + resctrl_file_fflags_init("thread_throttle_mode", + RFTYPE_CTRL_INFO | RFTYPE_RES_MB); r->alloc_capable = true; return true; } -static bool __rdt_get_mem_config_amd(struct rdt_resource *r) +static __init bool __rdt_get_mem_config_amd(struct rdt_resource *r) { struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); - union cpuid_0x10_3_eax eax; - union cpuid_0x10_x_edx edx; - u32 ebx, ecx, subleaf; + u32 eax, ebx, ecx, edx, subleaf; /* * Query CPUID_Fn80000020_EDX_x01 for MBA and @@ -241,9 +254,9 @@ static bool __rdt_get_mem_config_amd(struct rdt_resource *r) */ subleaf = (r->rid == RDT_RESOURCE_SMBA) ? 
2 : 1; - cpuid_count(0x80000020, subleaf, &eax.full, &ebx, &ecx, &edx.full); - hw_res->num_closid = edx.split.cos_max + 1; - r->default_ctrl = MAX_MBA_BW_AMD; + cpuid_count(0x80000020, subleaf, &eax, &ebx, &ecx, &edx); + hw_res->num_closid = edx + 1; + r->default_ctrl = 1 << eax; /* AMD does not use delay */ r->membw.delay_linear = false; @@ -303,12 +316,11 @@ static void rdt_get_cdp_l2_config(void) rdt_get_cdp_config(RDT_RESOURCE_L2); } -static void -mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) +static void mba_wrmsr_amd(struct msr_param *m) { + struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(m->dom); + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res); unsigned int i; - struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); - struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); for (i = m->low; i < m->high; i++) wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]); @@ -328,37 +340,51 @@ static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r) return r->default_ctrl; } -static void -mba_wrmsr_intel(struct rdt_domain *d, struct msr_param *m, - struct rdt_resource *r) +static void mba_wrmsr_intel(struct msr_param *m) { + struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(m->dom); + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res); unsigned int i; - struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); - struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); /* Write the delay values for mba. */ for (i = m->low; i < m->high; i++) - wrmsrl(hw_res->msr_base + i, delay_bw_map(hw_dom->ctrl_val[i], r)); + wrmsrl(hw_res->msr_base + i, delay_bw_map(hw_dom->ctrl_val[i], m->res)); } -static void -cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) +static void cat_wrmsr(struct msr_param *m) { + struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(m->dom); + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res); unsigned int i; - struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); - struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); for (i = m->low; i < m->high; i++) wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]); } -struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r) +struct rdt_ctrl_domain *get_ctrl_domain_from_cpu(int cpu, struct rdt_resource *r) { - struct rdt_domain *d; + struct rdt_ctrl_domain *d; - list_for_each_entry(d, &r->domains, list) { + lockdep_assert_cpus_held(); + + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { + /* Find the domain that contains this CPU */ + if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask)) + return d; + } + + return NULL; +} + +struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu, struct rdt_resource *r) +{ + struct rdt_mon_domain *d; + + lockdep_assert_cpus_held(); + + list_for_each_entry(d, &r->mon_domains, hdr.list) { /* Find the domain that contains this CPU */ - if (cpumask_test_cpu(cpu, &d->cpu_mask)) + if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask)) return d; } @@ -372,40 +398,29 @@ u32 resctrl_arch_get_num_closid(struct rdt_resource *r) void rdt_ctrl_update(void *arg) { + struct rdt_hw_resource *hw_res; struct msr_param *m = arg; - struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res); - struct rdt_resource *r = m->res; - int cpu = smp_processor_id(); - struct rdt_domain *d; - d = get_domain_from_cpu(cpu, r); - if (d) { - hw_res->msr_update(d, m, r); - return; - } - pr_warn_once("cpu %d not found in any domain for resource %s\n", - cpu, r->name); + hw_res = resctrl_to_arch_res(m->res); + 
hw_res->msr_update(m); } /* - * rdt_find_domain - Find a domain in a resource that matches input resource id + * rdt_find_domain - Search for a domain id in a resource domain list. * - * Search resource r's domain list to find the resource id. If the resource - * id is found in a domain, return the domain. Otherwise, if requested by - * caller, return the first domain whose id is bigger than the input id. - * The domain list is sorted by id in ascending order. + * Search the domain list to find the domain id. If the domain id is + * found, return the domain. NULL otherwise. If the domain id is not + * found (and NULL returned) then the first domain with id bigger than + * the input id can be returned to the caller via @pos. */ -struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, - struct list_head **pos) +struct rdt_domain_hdr *rdt_find_domain(struct list_head *h, int id, + struct list_head **pos) { - struct rdt_domain *d; + struct rdt_domain_hdr *d; struct list_head *l; - if (id < 0) - return ERR_PTR(-ENODEV); - - list_for_each(l, &r->domains) { - d = list_entry(l, struct rdt_domain, list); + list_for_each(l, h) { + d = list_entry(l, struct rdt_domain_hdr, list); /* When id is found, return its domain. */ if (id == d->id) return d; @@ -434,18 +449,23 @@ static void setup_default_ctrlval(struct rdt_resource *r, u32 *dc) *dc = r->default_ctrl; } -static void domain_free(struct rdt_hw_domain *hw_dom) +static void ctrl_domain_free(struct rdt_hw_ctrl_domain *hw_dom) +{ + kfree(hw_dom->ctrl_val); + kfree(hw_dom); +} + +static void mon_domain_free(struct rdt_hw_mon_domain *hw_dom) { kfree(hw_dom->arch_mbm_total); kfree(hw_dom->arch_mbm_local); - kfree(hw_dom->ctrl_val); kfree(hw_dom); } -static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) +static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_ctrl_domain *d) { + struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d); struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); - struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); struct msr_param m; u32 *dc; @@ -457,9 +477,11 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) hw_dom->ctrl_val = dc; setup_default_ctrlval(r, dc); + m.res = r; + m.dom = d; m.low = 0; m.high = hw_res->num_closid; - hw_res->msr_update(d, &m, r); + hw_res->msr_update(&m); return 0; } @@ -468,7 +490,7 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) * @num_rmid: The size of the MBM counter array * @hw_dom: The domain that owns the allocated arrays */ -static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_domain *hw_dom) +static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_mon_domain *hw_dom) { size_t tsize; @@ -491,35 +513,45 @@ static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_domain *hw_dom) return 0; } -/* - * domain_add_cpu - Add a cpu to a resource's domain list. - * - * If an existing domain in the resource r's domain list matches the cpu's - * resource id, add the cpu in the domain. - * - * Otherwise, a new domain is allocated and inserted into the right position - * in the domain list sorted by id in ascending order. - * - * The order in the domain list is visible to users when we print entries - * in the schemata file and schemata input is validated to have the same order - * as this list. 
- */ -static void domain_add_cpu(int cpu, struct rdt_resource *r) +static int get_domain_id_from_scope(int cpu, enum resctrl_scope scope) +{ + switch (scope) { + case RESCTRL_L2_CACHE: + case RESCTRL_L3_CACHE: + return get_cpu_cacheinfo_id(cpu, scope); + case RESCTRL_L3_NODE: + return cpu_to_node(cpu); + default: + break; + } + + return -EINVAL; +} + +static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r) { - int id = get_cpu_cacheinfo_id(cpu, r->cache_level); + int id = get_domain_id_from_scope(cpu, r->ctrl_scope); + struct rdt_hw_ctrl_domain *hw_dom; struct list_head *add_pos = NULL; - struct rdt_hw_domain *hw_dom; - struct rdt_domain *d; + struct rdt_domain_hdr *hdr; + struct rdt_ctrl_domain *d; int err; - d = rdt_find_domain(r, id, &add_pos); - if (IS_ERR(d)) { - pr_warn("Couldn't find cache id for CPU %d\n", cpu); + lockdep_assert_held(&domain_list_lock); + + if (id < 0) { + pr_warn_once("Can't find control domain id for CPU:%d scope:%d for resource %s\n", + cpu, r->ctrl_scope, r->name); return; } - if (d) { - cpumask_set_cpu(cpu, &d->cpu_mask); + hdr = rdt_find_domain(&r->ctrl_domains, id, &add_pos); + if (hdr) { + if (WARN_ON_ONCE(hdr->type != RESCTRL_CTRL_DOMAIN)) + return; + d = container_of(hdr, struct rdt_ctrl_domain, hdr); + + cpumask_set_cpu(cpu, &d->hdr.cpu_mask); if (r->cache.arch_has_per_cpu_cfg) rdt_domain_reconfigure_cdp(r); return; @@ -530,125 +562,226 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) return; d = &hw_dom->d_resctrl; - d->id = id; - cpumask_set_cpu(cpu, &d->cpu_mask); + d->hdr.id = id; + d->hdr.type = RESCTRL_CTRL_DOMAIN; + cpumask_set_cpu(cpu, &d->hdr.cpu_mask); rdt_domain_reconfigure_cdp(r); - if (r->alloc_capable && domain_setup_ctrlval(r, d)) { - domain_free(hw_dom); + if (domain_setup_ctrlval(r, d)) { + ctrl_domain_free(hw_dom); + return; + } + + list_add_tail_rcu(&d->hdr.list, add_pos); + + err = resctrl_online_ctrl_domain(r, d); + if (err) { + list_del_rcu(&d->hdr.list); + synchronize_rcu(); + ctrl_domain_free(hw_dom); + } +} + +static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) +{ + int id = get_domain_id_from_scope(cpu, r->mon_scope); + struct list_head *add_pos = NULL; + struct rdt_hw_mon_domain *hw_dom; + struct rdt_domain_hdr *hdr; + struct rdt_mon_domain *d; + int err; + + lockdep_assert_held(&domain_list_lock); + + if (id < 0) { + pr_warn_once("Can't find monitor domain id for CPU:%d scope:%d for resource %s\n", + cpu, r->mon_scope, r->name); return; } - if (r->mon_capable && arch_domain_mbm_alloc(r->num_rmid, hw_dom)) { - domain_free(hw_dom); + hdr = rdt_find_domain(&r->mon_domains, id, &add_pos); + if (hdr) { + if (WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) + return; + d = container_of(hdr, struct rdt_mon_domain, hdr); + + cpumask_set_cpu(cpu, &d->hdr.cpu_mask); return; } - list_add_tail(&d->list, add_pos); + hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu)); + if (!hw_dom) + return; + + d = &hw_dom->d_resctrl; + d->hdr.id = id; + d->hdr.type = RESCTRL_MON_DOMAIN; + d->ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE); + if (!d->ci) { + pr_warn_once("Can't find L3 cache for CPU:%d resource %s\n", cpu, r->name); + mon_domain_free(hw_dom); + return; + } + cpumask_set_cpu(cpu, &d->hdr.cpu_mask); - err = resctrl_online_domain(r, d); + arch_mon_domain_online(r, d); + + if (arch_domain_mbm_alloc(r->num_rmid, hw_dom)) { + mon_domain_free(hw_dom); + return; + } + + list_add_tail_rcu(&d->hdr.list, add_pos); + + err = resctrl_online_mon_domain(r, d); if (err) { - list_del(&d->list); - 
domain_free(hw_dom); + list_del_rcu(&d->hdr.list); + synchronize_rcu(); + mon_domain_free(hw_dom); } } -static void domain_remove_cpu(int cpu, struct rdt_resource *r) +static void domain_add_cpu(int cpu, struct rdt_resource *r) { - int id = get_cpu_cacheinfo_id(cpu, r->cache_level); - struct rdt_hw_domain *hw_dom; - struct rdt_domain *d; + if (r->alloc_capable) + domain_add_cpu_ctrl(cpu, r); + if (r->mon_capable) + domain_add_cpu_mon(cpu, r); +} - d = rdt_find_domain(r, id, NULL); - if (IS_ERR_OR_NULL(d)) { - pr_warn("Couldn't find cache id for CPU %d\n", cpu); +static void domain_remove_cpu_ctrl(int cpu, struct rdt_resource *r) +{ + int id = get_domain_id_from_scope(cpu, r->ctrl_scope); + struct rdt_hw_ctrl_domain *hw_dom; + struct rdt_domain_hdr *hdr; + struct rdt_ctrl_domain *d; + + lockdep_assert_held(&domain_list_lock); + + if (id < 0) { + pr_warn_once("Can't find control domain id for CPU:%d scope:%d for resource %s\n", + cpu, r->ctrl_scope, r->name); + return; + } + + hdr = rdt_find_domain(&r->ctrl_domains, id, NULL); + if (!hdr) { + pr_warn("Can't find control domain for id=%d for CPU %d for resource %s\n", + id, cpu, r->name); return; } - hw_dom = resctrl_to_arch_dom(d); - cpumask_clear_cpu(cpu, &d->cpu_mask); - if (cpumask_empty(&d->cpu_mask)) { - resctrl_offline_domain(r, d); - list_del(&d->list); + if (WARN_ON_ONCE(hdr->type != RESCTRL_CTRL_DOMAIN)) + return; + + d = container_of(hdr, struct rdt_ctrl_domain, hdr); + hw_dom = resctrl_to_arch_ctrl_dom(d); + + cpumask_clear_cpu(cpu, &d->hdr.cpu_mask); + if (cpumask_empty(&d->hdr.cpu_mask)) { + resctrl_offline_ctrl_domain(r, d); + list_del_rcu(&d->hdr.list); + synchronize_rcu(); /* - * rdt_domain "d" is going to be freed below, so clear + * rdt_ctrl_domain "d" is going to be freed below, so clear * its pointer from pseudo_lock_region struct. 
*/ if (d->plr) d->plr->d = NULL; - domain_free(hw_dom); + ctrl_domain_free(hw_dom); return; } +} - if (r == &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl) { - if (is_mbm_enabled() && cpu == d->mbm_work_cpu) { - cancel_delayed_work(&d->mbm_over); - mbm_setup_overflow_handler(d, 0); - } - if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu && - has_busy_rmid(r, d)) { - cancel_delayed_work(&d->cqm_limbo); - cqm_setup_limbo_handler(d, 0); - } +static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r) +{ + int id = get_domain_id_from_scope(cpu, r->mon_scope); + struct rdt_hw_mon_domain *hw_dom; + struct rdt_domain_hdr *hdr; + struct rdt_mon_domain *d; + + lockdep_assert_held(&domain_list_lock); + + if (id < 0) { + pr_warn_once("Can't find monitor domain id for CPU:%d scope:%d for resource %s\n", + cpu, r->mon_scope, r->name); + return; } + + hdr = rdt_find_domain(&r->mon_domains, id, NULL); + if (!hdr) { + pr_warn("Can't find monitor domain for id=%d for CPU %d for resource %s\n", + id, cpu, r->name); + return; + } + + if (WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) + return; + + d = container_of(hdr, struct rdt_mon_domain, hdr); + hw_dom = resctrl_to_arch_mon_dom(d); + + cpumask_clear_cpu(cpu, &d->hdr.cpu_mask); + if (cpumask_empty(&d->hdr.cpu_mask)) { + resctrl_offline_mon_domain(r, d); + list_del_rcu(&d->hdr.list); + synchronize_rcu(); + mon_domain_free(hw_dom); + + return; + } +} + +static void domain_remove_cpu(int cpu, struct rdt_resource *r) +{ + if (r->alloc_capable) + domain_remove_cpu_ctrl(cpu, r); + if (r->mon_capable) + domain_remove_cpu_mon(cpu, r); } static void clear_closid_rmid(int cpu) { struct resctrl_pqr_state *state = this_cpu_ptr(&pqr_state); - state->default_closid = 0; - state->default_rmid = 0; - state->cur_closid = 0; - state->cur_rmid = 0; - wrmsr(MSR_IA32_PQR_ASSOC, 0, 0); + state->default_closid = RESCTRL_RESERVED_CLOSID; + state->default_rmid = RESCTRL_RESERVED_RMID; + state->cur_closid = RESCTRL_RESERVED_CLOSID; + state->cur_rmid = RESCTRL_RESERVED_RMID; + wrmsr(MSR_IA32_PQR_ASSOC, RESCTRL_RESERVED_RMID, + RESCTRL_RESERVED_CLOSID); } -static int resctrl_online_cpu(unsigned int cpu) +static int resctrl_arch_online_cpu(unsigned int cpu) { struct rdt_resource *r; - mutex_lock(&rdtgroup_mutex); + mutex_lock(&domain_list_lock); for_each_capable_rdt_resource(r) domain_add_cpu(cpu, r); - /* The cpu is set in default rdtgroup after online. 
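The comment introducing domain_list_lock earlier in this file says readers of the domain lists must hold cpus_read_lock() or be inside an RCU read-side section, which is why the hunks above switch to list_add_tail_rcu()/list_del_rcu() plus synchronize_rcu(). A sketch of the reader side (the helper name and the per-domain work are illustrative):

#include <linux/printk.h>
#include <linux/rcupdate.h>
#include <linux/rculist.h>

static void visit_ctrl_domains(struct rdt_resource *r)
{
	struct rdt_ctrl_domain *d;

	rcu_read_lock();
	list_for_each_entry_rcu(d, &r->ctrl_domains, hdr.list)
		pr_debug("ctrl domain %d\n", d->hdr.id);
	rcu_read_unlock();
}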
*/ - cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); + mutex_unlock(&domain_list_lock); + clear_closid_rmid(cpu); - mutex_unlock(&rdtgroup_mutex); + resctrl_online_cpu(cpu); return 0; } -static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) -{ - struct rdtgroup *cr; - - list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) { - if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) { - break; - } - } -} - -static int resctrl_offline_cpu(unsigned int cpu) +static int resctrl_arch_offline_cpu(unsigned int cpu) { - struct rdtgroup *rdtgrp; struct rdt_resource *r; - mutex_lock(&rdtgroup_mutex); + resctrl_offline_cpu(cpu); + + mutex_lock(&domain_list_lock); for_each_capable_rdt_resource(r) domain_remove_cpu(cpu, r); - list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { - if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) { - clear_childcpus(rdtgrp, cpu); - break; - } - } + mutex_unlock(&domain_list_lock); + clear_closid_rmid(cpu); - mutex_unlock(&rdtgroup_mutex); return 0; } @@ -830,23 +963,28 @@ static __init bool get_rdt_mon_resources(void) if (!rdt_mon_features) return false; + if (is_mbm_local_enabled()) + mba_mbps_default_event = QOS_L3_MBM_LOCAL_EVENT_ID; + else if (is_mbm_total_enabled()) + mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID; + return !rdt_get_mon_l3_config(r); } static __init void __check_quirks_intel(void) { - switch (boot_cpu_data.x86_model) { - case INTEL_FAM6_HASWELL_X: + switch (boot_cpu_data.x86_vfm) { + case INTEL_HASWELL_X: if (!rdt_options[RDT_FLAG_L3_CAT].force_off) cache_alloc_hsw_probe(); break; - case INTEL_FAM6_SKYLAKE_X: + case INTEL_SKYLAKE_X: if (boot_cpu_data.x86_stepping <= 4) set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat"); else set_rdt_options("!l3cat"); fallthrough; - case INTEL_FAM6_BROADWELL_X: + case INTEL_BROADWELL_X: intel_rdt_mbm_apply_quirk(); break; } @@ -968,7 +1106,8 @@ static int __init resctrl_late_init(void) state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/resctrl/cat:online:", - resctrl_online_cpu, resctrl_offline_cpu); + resctrl_arch_online_cpu, + resctrl_arch_offline_cpu); if (state < 0) return state; @@ -992,8 +1131,14 @@ late_initcall(resctrl_late_init); static void __exit resctrl_exit(void) { + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + cpuhp_remove_state(rdt_online); + rdtgroup_exit(); + + if (r->mon_capable) + rdt_put_mon_l3_config(); } __exitcall(resctrl_exit); diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index beccb0e87ba7..536351159cc2 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -19,6 +19,8 @@ #include <linux/kernfs.h> #include <linux/seq_file.h> #include <linux/slab.h> +#include <linux/tick.h> + #include "internal.h" /* @@ -27,10 +29,10 @@ * hardware. The allocated bandwidth percentage is rounded to the next * control step available on the hardware. */ -static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r) +static bool bw_validate(char *buf, u32 *data, struct rdt_resource *r) { - unsigned long bw; int ret; + u32 bw; /* * Only linear delay values is supported for current Intel SKUs. 
@@ -40,16 +42,21 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r) return false; } - ret = kstrtoul(buf, 10, &bw); + ret = kstrtou32(buf, 10, &bw); if (ret) { - rdt_last_cmd_printf("Non-decimal digit in MB value %s\n", buf); + rdt_last_cmd_printf("Invalid MB value %s\n", buf); return false; } - if ((bw < r->membw.min_bw || bw > r->default_ctrl) && - !is_mba_sc(r)) { - rdt_last_cmd_printf("MB value %ld out of range [%d,%d]\n", bw, - r->membw.min_bw, r->default_ctrl); + /* Nothing else to do if software controller is enabled. */ + if (is_mba_sc(r)) { + *data = bw; + return true; + } + + if (bw < r->membw.min_bw || bw > r->default_ctrl) { + rdt_last_cmd_printf("MB value %u out of range [%d,%d]\n", + bw, r->membw.min_bw, r->default_ctrl); return false; } @@ -58,16 +65,16 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r) } int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, - struct rdt_domain *d) + struct rdt_ctrl_domain *d) { struct resctrl_staged_config *cfg; u32 closid = data->rdtgrp->closid; struct rdt_resource *r = s->res; - unsigned long bw_val; + u32 bw_val; cfg = &d->staged_config[s->conf_type]; if (cfg->have_new_ctrl) { - rdt_last_cmd_printf("Duplicate domain %d\n", d->id); + rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id); return -EINVAL; } @@ -137,7 +144,7 @@ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) * resource type. */ int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, - struct rdt_domain *d) + struct rdt_ctrl_domain *d) { struct rdtgroup *rdtgrp = data->rdtgrp; struct resctrl_staged_config *cfg; @@ -146,7 +153,7 @@ int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, cfg = &d->staged_config[s->conf_type]; if (cfg->have_new_ctrl) { - rdt_last_cmd_printf("Duplicate domain %d\n", d->id); + rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id); return -EINVAL; } @@ -206,10 +213,13 @@ static int parse_line(char *line, struct resctrl_schema *s, struct resctrl_staged_config *cfg; struct rdt_resource *r = s->res; struct rdt_parse_data data; + struct rdt_ctrl_domain *d; char *dom = NULL, *id; - struct rdt_domain *d; unsigned long dom_id; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)) { rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n"); @@ -226,8 +236,8 @@ next: return -EINVAL; } dom = strim(dom); - list_for_each_entry(d, &r->domains, list) { - if (d->id == dom_id) { + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { + if (d->hdr.id == dom_id) { data.buf = dom; data.rdtgrp = rdtgrp; if (r->parse_ctrlval(&data, s, d)) @@ -267,39 +277,24 @@ static u32 get_config_index(u32 closid, enum resctrl_conf_type type) } } -static bool apply_config(struct rdt_hw_domain *hw_dom, - struct resctrl_staged_config *cfg, u32 idx, - cpumask_var_t cpu_mask) -{ - struct rdt_domain *dom = &hw_dom->d_resctrl; - - if (cfg->new_ctrl != hw_dom->ctrl_val[idx]) { - cpumask_set_cpu(cpumask_any(&dom->cpu_mask), cpu_mask); - hw_dom->ctrl_val[idx] = cfg->new_ctrl; - - return true; - } - - return false; -} - -int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_domain *d, +int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, u32 closid, enum resctrl_conf_type t, u32 cfg_val) { + struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d); struct rdt_hw_resource *hw_res = 
resctrl_to_arch_res(r); - struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); u32 idx = get_config_index(closid, t); struct msr_param msr_param; - if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask)) + if (!cpumask_test_cpu(smp_processor_id(), &d->hdr.cpu_mask)) return -EINVAL; hw_dom->ctrl_val[idx] = cfg_val; msr_param.res = r; + msr_param.dom = d; msr_param.low = idx; msr_param.high = idx + 1; - hw_res->msr_update(d, &msr_param, r); + hw_res->msr_update(&msr_param); return 0; } @@ -307,48 +302,42 @@ int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_domain *d, int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) { struct resctrl_staged_config *cfg; - struct rdt_hw_domain *hw_dom; + struct rdt_hw_ctrl_domain *hw_dom; struct msr_param msr_param; + struct rdt_ctrl_domain *d; enum resctrl_conf_type t; - cpumask_var_t cpu_mask; - struct rdt_domain *d; u32 idx; - if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) - return -ENOMEM; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); - msr_param.res = NULL; - list_for_each_entry(d, &r->domains, list) { - hw_dom = resctrl_to_arch_dom(d); + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { + hw_dom = resctrl_to_arch_ctrl_dom(d); + msr_param.res = NULL; for (t = 0; t < CDP_NUM_TYPES; t++) { cfg = &hw_dom->d_resctrl.staged_config[t]; if (!cfg->have_new_ctrl) continue; idx = get_config_index(closid, t); - if (!apply_config(hw_dom, cfg, idx, cpu_mask)) + if (cfg->new_ctrl == hw_dom->ctrl_val[idx]) continue; + hw_dom->ctrl_val[idx] = cfg->new_ctrl; if (!msr_param.res) { msr_param.low = idx; msr_param.high = msr_param.low + 1; msr_param.res = r; + msr_param.dom = d; } else { msr_param.low = min(msr_param.low, idx); msr_param.high = max(msr_param.high, idx + 1); } } + if (msr_param.res) + smp_call_function_any(&d->hdr.cpu_mask, rdt_ctrl_update, &msr_param, 1); } - if (cpumask_empty(cpu_mask)) - goto done; - - /* Update resource control msr on all the CPUs. 
*/ - on_each_cpu_mask(cpu_mask, rdt_ctrl_update, &msr_param, 1); - -done: - free_cpumask_var(cpu_mask); - return 0; } @@ -379,11 +368,9 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, return -EINVAL; buf[nbytes - 1] = '\0'; - cpus_read_lock(); rdtgrp = rdtgroup_kn_lock_live(of->kn); if (!rdtgrp) { rdtgroup_kn_unlock(of->kn); - cpus_read_unlock(); return -ENOENT; } rdt_last_cmd_clear(); @@ -445,14 +432,13 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, out: rdt_staged_configs_clear(); rdtgroup_kn_unlock(of->kn); - cpus_read_unlock(); return ret ?: nbytes; } -u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d, +u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, u32 closid, enum resctrl_conf_type type) { - struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d); u32 idx = get_config_index(closid, type); return hw_dom->ctrl_val[idx]; @@ -461,12 +447,15 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d, static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int closid) { struct rdt_resource *r = schema->res; - struct rdt_domain *dom; + struct rdt_ctrl_domain *dom; bool sep = false; u32 ctrl_val; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + seq_printf(s, "%*s:", max_name_width, schema->name); - list_for_each_entry(dom, &r->domains, list) { + list_for_each_entry(dom, &r->ctrl_domains, hdr.list) { if (sep) seq_puts(s, ";"); @@ -476,7 +465,7 @@ static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int clo ctrl_val = resctrl_arch_get_config(r, dom, closid, schema->conf_type); - seq_printf(s, r->format_str, dom->id, max_data_width, + seq_printf(s, r->format_str, dom->hdr.id, max_data_width, ctrl_val); sep = true; } @@ -505,7 +494,7 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, } else { seq_printf(s, "%s:%d=%x\n", rdtgrp->plr->s->res->name, - rdtgrp->plr->d->id, + rdtgrp->plr->d->hdr.id, rdtgrp->plr->cbm); } } else { @@ -522,32 +511,132 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, return ret; } +static int smp_mon_event_count(void *arg) +{ + mon_event_count(arg); + + return 0; +} + +ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + buf[nbytes - 1] = '\0'; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } + rdt_last_cmd_clear(); + + if (!strcmp(buf, "mbm_local_bytes")) { + if (is_mbm_local_enabled()) + rdtgrp->mba_mbps_event = QOS_L3_MBM_LOCAL_EVENT_ID; + else + ret = -EINVAL; + } else if (!strcmp(buf, "mbm_total_bytes")) { + if (is_mbm_total_enabled()) + rdtgrp->mba_mbps_event = QOS_L3_MBM_TOTAL_EVENT_ID; + else + ret = -EINVAL; + } else { + ret = -EINVAL; + } + + if (ret) + rdt_last_cmd_printf("Unsupported event id '%s'\n", buf); + + rdtgroup_kn_unlock(of->kn); + + return ret ?: nbytes; +} + +int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + + if (rdtgrp) { + switch (rdtgrp->mba_mbps_event) { + case QOS_L3_MBM_LOCAL_EVENT_ID: + seq_puts(s, "mbm_local_bytes\n"); + break; + case QOS_L3_MBM_TOTAL_EVENT_ID: + 
seq_puts(s, "mbm_total_bytes\n"); + break; + default: + pr_warn_once("Bad event %d\n", rdtgrp->mba_mbps_event); + ret = -EINVAL; + break; + } + } else { + ret = -ENOENT; + } + + rdtgroup_kn_unlock(of->kn); + + return ret; +} + void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, - struct rdt_domain *d, struct rdtgroup *rdtgrp, - int evtid, int first) + struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, + cpumask_t *cpumask, int evtid, int first) { + int cpu; + + /* When picking a CPU from cpu_mask, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + /* - * setup the parameters to send to the IPI to read the data. + * Setup the parameters to pass to mon_event_count() to read the data. */ rr->rgrp = rdtgrp; rr->evtid = evtid; rr->r = r; rr->d = d; - rr->val = 0; rr->first = first; + rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid); + if (IS_ERR(rr->arch_mon_ctx)) { + rr->err = -EINVAL; + return; + } + + cpu = cpumask_any_housekeeping(cpumask, RESCTRL_PICK_ANY_CPU); + + /* + * cpumask_any_housekeeping() prefers housekeeping CPUs, but + * are all the CPUs nohz_full? If yes, pick a CPU to IPI. + * MPAM's resctrl_arch_rmid_read() is unable to read the + * counters on some platforms if its called in IRQ context. + */ + if (tick_nohz_full_cpu(cpu)) + smp_call_function_any(cpumask, mon_event_count, rr, 1); + else + smp_call_on_cpu(cpu, smp_mon_event_count, rr, false); - smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1); + resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx); } int rdtgroup_mondata_show(struct seq_file *m, void *arg) { struct kernfs_open_file *of = m->private; + struct rdt_domain_hdr *hdr; + struct rmid_read rr = {0}; + struct rdt_mon_domain *d; u32 resid, evtid, domid; struct rdtgroup *rdtgrp; struct rdt_resource *r; union mon_data_bits md; - struct rdt_domain *d; - struct rmid_read rr; int ret = 0; rdtgrp = rdtgroup_kn_lock_live(of->kn); @@ -560,15 +649,40 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) resid = md.u.rid; domid = md.u.domid; evtid = md.u.evtid; - r = &rdt_resources_all[resid].r_resctrl; - d = rdt_find_domain(r, domid, NULL); - if (IS_ERR_OR_NULL(d)) { + + if (md.u.sum) { + /* + * This file requires summing across all domains that share + * the L3 cache id that was provided in the "domid" field of the + * mon_data_bits union. Search all domains in the resource for + * one that matches this cache id. + */ + list_for_each_entry(d, &r->mon_domains, hdr.list) { + if (d->ci->id == domid) { + rr.ci = d->ci; + mon_event_read(&rr, r, NULL, rdtgrp, + &d->ci->shared_cpu_map, evtid, false); + goto checkresult; + } + } ret = -ENOENT; goto out; + } else { + /* + * This file provides data from a single domain. Search + * the resource to find the domain with "domid". 
+ */ + hdr = rdt_find_domain(&r->mon_domains, domid, NULL); + if (!hdr || WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) { + ret = -ENOENT; + goto out; + } + d = container_of(hdr, struct rdt_mon_domain, hdr); + mon_event_read(&rr, r, d, rdtgrp, &d->hdr.cpu_mask, evtid, false); } - mon_event_read(&rr, r, d, rdtgrp, evtid, false); +checkresult: if (rr.err == -EIO) seq_puts(m, "Error\n"); diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index a4f1aa15f0a2..20c898f09b7e 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -7,6 +7,9 @@ #include <linux/kernfs.h> #include <linux/fs_context.h> #include <linux/jump_label.h> +#include <linux/tick.h> + +#include <asm/resctrl.h> #define L3_QOS_CDP_ENABLE 0x01ULL @@ -18,7 +21,6 @@ #define MBM_OVERFLOW_INTERVAL 1000 #define MAX_MBA_BW 100u #define MBA_IS_LINEAR 0x4 -#define MAX_MBA_BW_AMD 0x800 #define MBM_CNTR_WIDTH_OFFSET_AMD 20 #define RMID_VAL_ERROR BIT_ULL(63) @@ -54,6 +56,47 @@ /* Max event bits supported */ #define MAX_EVT_CONFIG_BITS GENMASK(6, 0) +/** + * cpumask_any_housekeeping() - Choose any CPU in @mask, preferring those that + * aren't marked nohz_full + * @mask: The mask to pick a CPU from. + * @exclude_cpu:The CPU to avoid picking. + * + * Returns a CPU from @mask, but not @exclude_cpu. If there are housekeeping + * CPUs that don't use nohz_full, these are preferred. Pass + * RESCTRL_PICK_ANY_CPU to avoid excluding any CPUs. + * + * When a CPU is excluded, returns >= nr_cpu_ids if no CPUs are available. + */ +static inline unsigned int +cpumask_any_housekeeping(const struct cpumask *mask, int exclude_cpu) +{ + unsigned int cpu, hk_cpu; + + if (exclude_cpu == RESCTRL_PICK_ANY_CPU) + cpu = cpumask_any(mask); + else + cpu = cpumask_any_but(mask, exclude_cpu); + + /* Only continue if tick_nohz_full_mask has been initialized. */ + if (!tick_nohz_full_enabled()) + return cpu; + + /* If the CPU picked isn't marked nohz_full nothing more needs doing. */ + if (cpu < nr_cpu_ids && !tick_nohz_full_cpu(cpu)) + return cpu; + + /* Try to find a CPU that isn't nohz_full to use in preference */ + hk_cpu = cpumask_nth_andnot(0, mask, tick_nohz_full_mask); + if (hk_cpu == exclude_cpu) + hk_cpu = cpumask_nth_andnot(1, mask, tick_nohz_full_mask); + + if (hk_cpu < nr_cpu_ids) + cpu = hk_cpu; + + return cpu; +} + struct rdt_fs_context { struct kernfs_fs_context kfc; bool enable_cdpl2; @@ -69,9 +112,6 @@ static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) return container_of(kfc, struct rdt_fs_context, kfc); } -DECLARE_STATIC_KEY_FALSE(rdt_enable_key); -DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key); - /** * struct mon_evt - Entry in the event list of a resource * @evtid: event id @@ -87,37 +127,62 @@ struct mon_evt { }; /** - * union mon_data_bits - Monitoring details for each event file + * union mon_data_bits - Monitoring details for each event file. * @priv: Used to store monitoring event data in @u - * as kernfs private data - * @rid: Resource id associated with the event file - * @evtid: Event id associated with the event file - * @domid: The domain to which the event file belongs - * @u: Name of the bit fields struct + * as kernfs private data. + * @u.rid: Resource id associated with the event file. + * @u.evtid: Event id associated with the event file. + * @u.sum: Set when event must be summed across multiple + * domains. + * @u.domid: When @u.sum is zero this is the domain to which + * the event file belongs. 
When @sum is one this + * is the id of the L3 cache that all domains to be + * summed share. + * @u: Name of the bit fields struct. */ union mon_data_bits { void *priv; struct { unsigned int rid : 10; - enum resctrl_event_id evtid : 8; + enum resctrl_event_id evtid : 7; + unsigned int sum : 1; unsigned int domid : 14; } u; }; +/** + * struct rmid_read - Data passed across smp_call*() to read event count. + * @rgrp: Resource group for which the counter is being read. If it is a parent + * resource group then its event count is summed with the count from all + * its child resource groups. + * @r: Resource describing the properties of the event being read. + * @d: Domain that the counter should be read from. If NULL then sum all + * domains in @r sharing L3 @ci.id + * @evtid: Which monitor event to read. + * @first: Initialize MBM counter when true. + * @ci: Cacheinfo for L3. Only set when @d is NULL. Used when summing domains. + * @err: Error encountered when reading counter. + * @val: Returned value of event counter. If @rgrp is a parent resource group, + * @val includes the sum of event counts from its child resource groups. + * If @d is NULL, @val includes the sum of all domains in @r sharing @ci.id, + * (summed across child resource groups if @rgrp is a parent resource group). + * @arch_mon_ctx: Hardware monitor allocated for this read request (MPAM only). + */ struct rmid_read { struct rdtgroup *rgrp; struct rdt_resource *r; - struct rdt_domain *d; + struct rdt_mon_domain *d; enum resctrl_event_id evtid; bool first; + struct cacheinfo *ci; int err; u64 val; + void *arch_mon_ctx; }; -extern bool rdt_alloc_capable; -extern bool rdt_mon_capable; extern unsigned int rdt_mon_features; extern struct list_head resctrl_schema_all; +extern bool resctrl_mounted; enum rdt_group_type { RDTCTRL_GROUP = 0, @@ -192,7 +257,7 @@ struct mongroup { */ struct pseudo_lock_region { struct resctrl_schema *s; - struct rdt_domain *d; + struct rdt_ctrl_domain *d; u32 cbm; wait_queue_head_t lock_thread_wq; int thread_done; @@ -218,6 +283,7 @@ struct pseudo_lock_region { * monitor only or ctrl_mon group * @mon: mongroup related data * @mode: mode of resource group + * @mba_mbps_event: input monitoring event id when mba_sc is enabled * @plr: pseudo-locked region */ struct rdtgroup { @@ -230,6 +296,7 @@ struct rdtgroup { enum rdt_group_type type; struct mongroup mon; enum rdtgrp_mode mode; + enum resctrl_event_id mba_mbps_event; struct pseudo_lock_region *plr; }; @@ -296,14 +363,10 @@ struct rftype { * struct mbm_state - status for each MBM counter in each domain * @prev_bw_bytes: Previous bytes value read for bandwidth calculation * @prev_bw: The most recent bandwidth in MBps - * @delta_bw: Difference between the current and previous bandwidth - * @delta_comp: Indicates whether to compute the delta_bw */ struct mbm_state { u64 prev_bw_bytes; u32 prev_bw; - u32 delta_bw; - bool delta_comp; }; /** @@ -319,35 +382,53 @@ struct arch_mbm_state { }; /** - * struct rdt_hw_domain - Arch private attributes of a set of CPUs that share - * a resource + * struct rdt_hw_ctrl_domain - Arch private attributes of a set of CPUs that share + * a resource for a control function * @d_resctrl: Properties exposed to the resctrl file system * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID) + * + * Members of this structure are accessed via helpers that provide abstraction. 
+ */ +struct rdt_hw_ctrl_domain { + struct rdt_ctrl_domain d_resctrl; + u32 *ctrl_val; +}; + +/** + * struct rdt_hw_mon_domain - Arch private attributes of a set of CPUs that share + * a resource for a monitor function + * @d_resctrl: Properties exposed to the resctrl file system * @arch_mbm_total: arch private state for MBM total bandwidth * @arch_mbm_local: arch private state for MBM local bandwidth * * Members of this structure are accessed via helpers that provide abstraction. */ -struct rdt_hw_domain { - struct rdt_domain d_resctrl; - u32 *ctrl_val; +struct rdt_hw_mon_domain { + struct rdt_mon_domain d_resctrl; struct arch_mbm_state *arch_mbm_total; struct arch_mbm_state *arch_mbm_local; }; -static inline struct rdt_hw_domain *resctrl_to_arch_dom(struct rdt_domain *r) +static inline struct rdt_hw_ctrl_domain *resctrl_to_arch_ctrl_dom(struct rdt_ctrl_domain *r) +{ + return container_of(r, struct rdt_hw_ctrl_domain, d_resctrl); +} + +static inline struct rdt_hw_mon_domain *resctrl_to_arch_mon_dom(struct rdt_mon_domain *r) { - return container_of(r, struct rdt_hw_domain, d_resctrl); + return container_of(r, struct rdt_hw_mon_domain, d_resctrl); } /** * struct msr_param - set a range of MSRs from a domain * @res: The resource to use + * @dom: The domain to update * @low: Beginning index from base MSR * @high: End index */ struct msr_param { struct rdt_resource *res; + struct rdt_ctrl_domain *dom; u32 low; u32 high; }; @@ -395,6 +476,8 @@ struct rdt_parse_data { * @msr_update: Function pointer to update QOS MSRs * @mon_scale: cqm counter * mon_scale = occupancy in bytes * @mbm_width: Monitor width, to detect and correct for overflow. + * @mbm_cfg_mask: Bandwidth sources that can be tracked when Bandwidth + * Monitoring Event Configuration (BMEC) is supported. * @cdp_enabled: CDP state of this resource * * Members of this structure are either private to the architecture @@ -405,10 +488,10 @@ struct rdt_hw_resource { struct rdt_resource r_resctrl; u32 num_closid; unsigned int msr_base; - void (*msr_update) (struct rdt_domain *d, struct msr_param *m, - struct rdt_resource *r); + void (*msr_update)(struct msr_param *m); unsigned int mon_scale; unsigned int mbm_width; + unsigned int mbm_cfg_mask; bool cdp_enabled; }; @@ -418,17 +501,16 @@ static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r } int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, - struct rdt_domain *d); + struct rdt_ctrl_domain *d); int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, - struct rdt_domain *d); + struct rdt_ctrl_domain *d); extern struct mutex rdtgroup_mutex; extern struct rdt_hw_resource rdt_resources_all[]; extern struct rdtgroup rdtgroup_default; -DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); - extern struct dentry *debugfs_resctrl; +extern enum resctrl_event_id mba_mbps_default_event; enum resctrl_res_level { RDT_RESOURCE_L3, @@ -455,6 +537,8 @@ static inline bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l) int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable); +void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d); + /* * To return the common struct rdt_resource, which is contained in struct * rdt_hw_resource, walk the resctrl member of struct rdt_hw_resource. 
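With struct msr_param now carrying the domain as well as the index range, an msr_update() callback needs nothing beyond its single argument. The sketch below shows the obvious shape of such a callback (walk ctrl_val[] between low and high and write one MSR per index); the real per-resource helpers also handle details such as MBA value translation, so treat this as illustrative only.

static void example_msr_update(struct msr_param *m)
{
        struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(m->dom);
        struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
        unsigned int i;

        /* Write one control MSR per CLOSID index in the [low, high) range. */
        for (i = m->low; i < m->high; i++)
                wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]);
}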
@@ -520,50 +604,58 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn); int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name); int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, umode_t mask); -struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, - struct list_head **pos); +struct rdt_domain_hdr *rdt_find_domain(struct list_head *h, int id, + struct list_head **pos); ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); int rdtgroup_schemata_show(struct kernfs_open_file *of, struct seq_file *s, void *v); -bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_domain *d, +ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); +int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of, + struct seq_file *s, void *v); +bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d, unsigned long cbm, int closid, bool exclusive); -unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_domain *d, +unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_ctrl_domain *d, unsigned long cbm); enum rdtgrp_mode rdtgroup_mode_by_closid(int closid); int rdtgroup_tasks_assigned(struct rdtgroup *r); int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp); -bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, unsigned long cbm); -bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d); +bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm); +bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d); int rdt_pseudo_lock_init(void); void rdt_pseudo_lock_release(void); int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp); void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp); -struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r); +struct rdt_ctrl_domain *get_ctrl_domain_from_cpu(int cpu, struct rdt_resource *r); +struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu, struct rdt_resource *r); int closids_supported(void); void closid_free(int closid); -int alloc_rmid(void); -void free_rmid(u32 rmid); +int alloc_rmid(u32 closid); +void free_rmid(u32 closid, u32 rmid); int rdt_get_mon_l3_config(struct rdt_resource *r); +void __exit rdt_put_mon_l3_config(void); bool __init rdt_cpu_has(int flag); void mon_event_count(void *info); int rdtgroup_mondata_show(struct seq_file *m, void *arg); void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, - struct rdt_domain *d, struct rdtgroup *rdtgrp, - int evtid, int first); -void mbm_setup_overflow_handler(struct rdt_domain *dom, - unsigned long delay_ms); + struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, + cpumask_t *cpumask, int evtid, int first); +void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, + unsigned long delay_ms, + int exclude_cpu); void mbm_handle_overflow(struct work_struct *work); void __init intel_rdt_mbm_apply_quirk(void); bool is_mba_sc(struct rdt_resource *r); -void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms); +void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, + int exclude_cpu); void cqm_handle_limbo(struct work_struct *work); -bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d); -void __check_limbo(struct rdt_domain *d, bool force_free); +bool has_busy_rmid(struct rdt_mon_domain *d); +void __check_limbo(struct rdt_mon_domain *d, 
bool force_free); void rdt_domain_reconfigure_cdp(struct rdt_resource *r); -void __init thread_throttle_mode_init(void); -void __init mbm_config_rftype_init(const char *config); +void resctrl_file_fflags_init(const char *config, unsigned long fflags); void rdt_staged_configs_clear(void); - +bool closid_allocated(unsigned int closid); +int resctrl_find_cleanest_closid(void); #endif /* _ASM_X86_RESCTRL_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index f136ac046851..94a1d9780461 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -15,6 +15,9 @@ * Software Developer Manual June 2016, volume 3, section 17.17. */ +#define pr_fmt(fmt) "resctrl: " fmt + +#include <linux/cpu.h> #include <linux/module.h> #include <linux/sizes.h> #include <linux/slab.h> @@ -23,8 +26,22 @@ #include <asm/resctrl.h> #include "internal.h" - +#include "trace.h" + +/** + * struct rmid_entry - dirty tracking for all RMID. + * @closid: The CLOSID for this entry. + * @rmid: The RMID for this entry. + * @busy: The number of domains with cached data using this RMID. + * @list: Member of the rmid_free_lru list when busy == 0. + * + * Depending on the architecture the correct monitor is accessed using + * both @closid and @rmid, or @rmid only. + * + * Take the rdtgroup_mutex when accessing. + */ struct rmid_entry { + u32 closid; u32 rmid; int busy; struct list_head list; @@ -38,6 +55,13 @@ struct rmid_entry { static LIST_HEAD(rmid_free_lru); /* + * @closid_num_dirty_rmid The number of dirty RMID each CLOSID has. + * Only allocated when CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID is defined. + * Indexed by CLOSID. Protected by rdtgroup_mutex. + */ +static u32 *closid_num_dirty_rmid; + +/* * @rmid_limbo_count - count of currently unused but (potentially) * dirty RMIDs. * This counts RMIDs that no one is currently using but that @@ -75,6 +99,8 @@ unsigned int resctrl_rmid_realloc_limit; #define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5)) +static int snc_nodes_per_l3_cache = 1; + /* * The correction factor table is documented in Documentation/arch/x86/resctrl.rst. * If rmid > rmid threshold, MBM total and local values should be multiplied @@ -136,17 +162,70 @@ static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val) return val; } -static inline struct rmid_entry *__rmid_entry(u32 rmid) +/* + * x86 and arm64 differ in their handling of monitoring. + * x86's RMID are independent numbers, there is only one source of traffic + * with an RMID value of '1'. + * arm64's PMG extends the PARTID/CLOSID space, there are multiple sources of + * traffic with a PMG value of '1', one for each CLOSID, meaning the RMID + * value is no longer unique. + * To account for this, resctrl uses an index. On x86 this is just the RMID, + * on arm64 it encodes the CLOSID and RMID. This gives a unique number. + * + * The domain's rmid_busy_llc and rmid_ptrs[] are sized by index. The arch code + * must accept an attempt to read every index. 
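The index scheme described above relies on a pair of arch helpers, resctrl_arch_rmid_idx_encode()/resctrl_arch_rmid_idx_decode(). On x86, where CLOSID and RMID are independent numbers, they can collapse to something like the sketch below; the example_* names and EXAMPLE_EMPTY_CLOSID are placeholders, the real definitions live in <asm/resctrl.h> and use X86_RESCTRL_EMPTY_CLOSID.

#define EXAMPLE_EMPTY_CLOSID    ((u32)~0)

static inline u32 example_rmid_idx_encode(u32 closid, u32 rmid)
{
        /* The CLOSID carries no information on x86: the index is the RMID. */
        return rmid;
}

static inline void example_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid)
{
        *closid = EXAMPLE_EMPTY_CLOSID;
        *rmid = idx;
}

An MPAM implementation would instead fold the CLOSID into the index, which is why every caller in this file passes both values.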
+ */ +static inline struct rmid_entry *__rmid_entry(u32 idx) { struct rmid_entry *entry; + u32 closid, rmid; + + entry = &rmid_ptrs[idx]; + resctrl_arch_rmid_idx_decode(idx, &closid, &rmid); - entry = &rmid_ptrs[rmid]; - WARN_ON(entry->rmid != rmid); + WARN_ON_ONCE(entry->closid != closid); + WARN_ON_ONCE(entry->rmid != rmid); return entry; } -static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val) +/* + * When Sub-NUMA Cluster (SNC) mode is not enabled (as indicated by + * "snc_nodes_per_l3_cache == 1") no translation of the RMID value is + * needed. The physical RMID is the same as the logical RMID. + * + * On a platform with SNC mode enabled, Linux enables RMID sharing mode + * via MSR 0xCA0 (see the "RMID Sharing Mode" section in the "Intel + * Resource Director Technology Architecture Specification" for a full + * description of RMID sharing mode). + * + * In RMID sharing mode there are fewer "logical RMID" values available + * to accumulate data ("physical RMIDs" are divided evenly between SNC + * nodes that share an L3 cache). Linux creates an rdt_mon_domain for + * each SNC node. + * + * The value loaded into IA32_PQR_ASSOC is the "logical RMID". + * + * Data is collected independently on each SNC node and can be retrieved + * using the "physical RMID" value computed by this function and loaded + * into IA32_QM_EVTSEL. @cpu can be any CPU in the SNC node. + * + * The scope of the IA32_QM_EVTSEL and IA32_QM_CTR MSRs is at the L3 + * cache. So a "physical RMID" may be read from any CPU that shares + * the L3 cache with the desired SNC node, not just from a CPU in + * the specific SNC node. + */ +static int logical_rmid_to_physical_rmid(int cpu, int lrmid) +{ + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + + if (snc_nodes_per_l3_cache == 1) + return lrmid; + + return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->num_rmid; +} + +static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val) { u64 msr_val; @@ -158,7 +237,7 @@ static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val) * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62) * are error bits. */ - wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid); + wrmsr(MSR_IA32_QM_EVTSEL, eventid, prmid); rdmsrl(MSR_IA32_QM_CTR, msr_val); if (msr_val & RMID_VAL_ERROR) @@ -170,7 +249,7 @@ static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val) return 0; } -static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_domain *hw_dom, +static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_dom, u32 rmid, enum resctrl_event_id eventid) { @@ -189,18 +268,22 @@ static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_domain *hw_dom, return NULL; } -void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d, - u32 rmid, enum resctrl_event_id eventid) +void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 unused, u32 rmid, + enum resctrl_event_id eventid) { - struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); + int cpu = cpumask_any(&d->hdr.cpu_mask); struct arch_mbm_state *am; + u32 prmid; am = get_arch_mbm_state(hw_dom, rmid, eventid); if (am) { memset(am, 0, sizeof(*am)); + prmid = logical_rmid_to_physical_rmid(cpu, rmid); /* Record any initial, non-zero count value. 
*/ - __rmid_read(rmid, eventid, &am->prev_msr); + __rmid_read_phys(prmid, eventid, &am->prev_msr); } } @@ -208,9 +291,9 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d, * Assumes that hardware counters are also reset and thus that there is * no need to record initial non-zero counts. */ -void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_domain *d) +void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d) { - struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); if (is_mbm_total_enabled()) memset(hw_dom->arch_mbm_total, 0, @@ -229,19 +312,22 @@ static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) return chunks >> shift; } -int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, - u32 rmid, enum resctrl_event_id eventid, u64 *val) +int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 unused, u32 rmid, enum resctrl_event_id eventid, + u64 *val, void *ignored) { + struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); - struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + int cpu = cpumask_any(&d->hdr.cpu_mask); struct arch_mbm_state *am; u64 msr_val, chunks; + u32 prmid; int ret; - if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask)) - return -EINVAL; + resctrl_arch_rmid_read_context_check(); - ret = __rmid_read(rmid, eventid, &msr_val); + prmid = logical_rmid_to_physical_rmid(cpu, rmid); + ret = __rmid_read_phys(prmid, eventid, &msr_val); if (ret) return ret; @@ -260,20 +346,40 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, return 0; } +static void limbo_release_entry(struct rmid_entry *entry) +{ + lockdep_assert_held(&rdtgroup_mutex); + + rmid_limbo_count--; + list_add_tail(&entry->list, &rmid_free_lru); + + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) + closid_num_dirty_rmid[entry->closid]--; +} + /* * Check the RMIDs that are marked as busy for this domain. If the * reported LLC occupancy is below the threshold clear the busy bit and * decrement the count. If the busy count gets to zero on an RMID, we * free the RMID */ -void __check_limbo(struct rdt_domain *d, bool force_free) +void __check_limbo(struct rdt_mon_domain *d, bool force_free) { struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); struct rmid_entry *entry; - u32 crmid = 1, nrmid; + u32 idx, cur_idx = 1; + void *arch_mon_ctx; bool rmid_dirty; u64 val = 0; + arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, QOS_L3_OCCUP_EVENT_ID); + if (IS_ERR(arch_mon_ctx)) { + pr_warn_ratelimited("Failed to allocate monitor context: %ld", + PTR_ERR(arch_mon_ctx)); + return; + } + /* * Skip RMID 0 and start from RMID 1 and check all the RMIDs that * are marked as busy for occupancy < threshold. If the occupancy @@ -281,101 +387,187 @@ void __check_limbo(struct rdt_domain *d, bool force_free) * RMID and move it to the free list when the counter reaches 0. 
*/ for (;;) { - nrmid = find_next_bit(d->rmid_busy_llc, r->num_rmid, crmid); - if (nrmid >= r->num_rmid) + idx = find_next_bit(d->rmid_busy_llc, idx_limit, cur_idx); + if (idx >= idx_limit) break; - entry = __rmid_entry(nrmid); - - if (resctrl_arch_rmid_read(r, d, entry->rmid, - QOS_L3_OCCUP_EVENT_ID, &val)) { + entry = __rmid_entry(idx); + if (resctrl_arch_rmid_read(r, d, entry->closid, entry->rmid, + QOS_L3_OCCUP_EVENT_ID, &val, + arch_mon_ctx)) { rmid_dirty = true; } else { rmid_dirty = (val >= resctrl_rmid_realloc_threshold); + + /* + * x86's CLOSID and RMID are independent numbers, so the entry's + * CLOSID is an empty CLOSID (X86_RESCTRL_EMPTY_CLOSID). On Arm the + * RMID (PMG) extends the CLOSID (PARTID) space with bits that aren't + * used to select the configuration. It is thus necessary to track both + * CLOSID and RMID because there may be dependencies between them + * on some architectures. + */ + trace_mon_llc_occupancy_limbo(entry->closid, entry->rmid, d->hdr.id, val); } if (force_free || !rmid_dirty) { - clear_bit(entry->rmid, d->rmid_busy_llc); - if (!--entry->busy) { - rmid_limbo_count--; - list_add_tail(&entry->list, &rmid_free_lru); - } + clear_bit(idx, d->rmid_busy_llc); + if (!--entry->busy) + limbo_release_entry(entry); } - crmid = nrmid + 1; + cur_idx = idx + 1; } + + resctrl_arch_mon_ctx_free(r, QOS_L3_OCCUP_EVENT_ID, arch_mon_ctx); } -bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d) +bool has_busy_rmid(struct rdt_mon_domain *d) { - return find_first_bit(d->rmid_busy_llc, r->num_rmid) != r->num_rmid; + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); + + return find_first_bit(d->rmid_busy_llc, idx_limit) != idx_limit; +} + +static struct rmid_entry *resctrl_find_free_rmid(u32 closid) +{ + struct rmid_entry *itr; + u32 itr_idx, cmp_idx; + + if (list_empty(&rmid_free_lru)) + return rmid_limbo_count ? ERR_PTR(-EBUSY) : ERR_PTR(-ENOSPC); + + list_for_each_entry(itr, &rmid_free_lru, list) { + /* + * Get the index of this free RMID, and the index it would need + * to be if it were used with this CLOSID. + * If the CLOSID is irrelevant on this architecture, the two + * index values are always the same on every entry and thus the + * very first entry will be returned. + */ + itr_idx = resctrl_arch_rmid_idx_encode(itr->closid, itr->rmid); + cmp_idx = resctrl_arch_rmid_idx_encode(closid, itr->rmid); + + if (itr_idx == cmp_idx) + return itr; + } + + return ERR_PTR(-ENOSPC); +} + +/** + * resctrl_find_cleanest_closid() - Find a CLOSID where all the associated + * RMID are clean, or the CLOSID that has + * the most clean RMID. + * + * MPAM's equivalent of RMID are per-CLOSID, meaning a freshly allocated CLOSID + * may not be able to allocate clean RMID. To avoid this the allocator will + * choose the CLOSID with the most clean RMID. + * + * When the CLOSID and RMID are independent numbers, the first free CLOSID will + * be returned. 
+ */ +int resctrl_find_cleanest_closid(void) +{ + u32 cleanest_closid = ~0; + int i = 0; + + lockdep_assert_held(&rdtgroup_mutex); + + if (!IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) + return -EIO; + + for (i = 0; i < closids_supported(); i++) { + int num_dirty; + + if (closid_allocated(i)) + continue; + + num_dirty = closid_num_dirty_rmid[i]; + if (num_dirty == 0) + return i; + + if (cleanest_closid == ~0) + cleanest_closid = i; + + if (num_dirty < closid_num_dirty_rmid[cleanest_closid]) + cleanest_closid = i; + } + + if (cleanest_closid == ~0) + return -ENOSPC; + + return cleanest_closid; } /* - * As of now the RMIDs allocation is global. - * However we keep track of which packages the RMIDs - * are used to optimize the limbo list management. + * For MPAM the RMID value is not unique, and has to be considered with + * the CLOSID. The (CLOSID, RMID) pair is allocated on all domains, which + * allows all domains to be managed by a single free list. + * Each domain also has a rmid_busy_llc to reduce the work of the limbo handler. */ -int alloc_rmid(void) +int alloc_rmid(u32 closid) { struct rmid_entry *entry; lockdep_assert_held(&rdtgroup_mutex); - if (list_empty(&rmid_free_lru)) - return rmid_limbo_count ? -EBUSY : -ENOSPC; + entry = resctrl_find_free_rmid(closid); + if (IS_ERR(entry)) + return PTR_ERR(entry); - entry = list_first_entry(&rmid_free_lru, - struct rmid_entry, list); list_del(&entry->list); - return entry->rmid; } static void add_rmid_to_limbo(struct rmid_entry *entry) { struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; - struct rdt_domain *d; - int cpu, err; - u64 val = 0; + struct rdt_mon_domain *d; + u32 idx; - entry->busy = 0; - cpu = get_cpu(); - list_for_each_entry(d, &r->domains, list) { - if (cpumask_test_cpu(cpu, &d->cpu_mask)) { - err = resctrl_arch_rmid_read(r, d, entry->rmid, - QOS_L3_OCCUP_EVENT_ID, - &val); - if (err || val <= resctrl_rmid_realloc_threshold) - continue; - } + lockdep_assert_held(&rdtgroup_mutex); + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + + idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid); + + entry->busy = 0; + list_for_each_entry(d, &r->mon_domains, hdr.list) { /* * For the first limbo RMID in the domain, * setup up the limbo worker. */ - if (!has_busy_rmid(r, d)) - cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL); - set_bit(entry->rmid, d->rmid_busy_llc); + if (!has_busy_rmid(d)) + cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL, + RESCTRL_PICK_ANY_CPU); + set_bit(idx, d->rmid_busy_llc); entry->busy++; } - put_cpu(); - if (entry->busy) - rmid_limbo_count++; - else - list_add_tail(&entry->list, &rmid_free_lru); + rmid_limbo_count++; + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) + closid_num_dirty_rmid[entry->closid]++; } -void free_rmid(u32 rmid) +void free_rmid(u32 closid, u32 rmid) { + u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); struct rmid_entry *entry; - if (!rmid) - return; - lockdep_assert_held(&rdtgroup_mutex); - entry = __rmid_entry(rmid); + /* + * Do not allow the default rmid to be free'd. Comparing by index + * allows architectures that ignore the closid parameter to avoid an + * unnecessary check. 
+ */ + if (!resctrl_arch_mon_capable() || + idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, + RESCTRL_RESERVED_RMID)) + return; + + entry = __rmid_entry(idx); if (is_llc_occupancy_enabled()) add_rmid_to_limbo(entry); @@ -383,44 +575,84 @@ void free_rmid(u32 rmid) list_add_tail(&entry->list, &rmid_free_lru); } -static struct mbm_state *get_mbm_state(struct rdt_domain *d, u32 rmid, - enum resctrl_event_id evtid) +static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid, + u32 rmid, enum resctrl_event_id evtid) { + u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); + switch (evtid) { case QOS_L3_MBM_TOTAL_EVENT_ID: - return &d->mbm_total[rmid]; + return &d->mbm_total[idx]; case QOS_L3_MBM_LOCAL_EVENT_ID: - return &d->mbm_local[rmid]; + return &d->mbm_local[idx]; default: return NULL; } } -static int __mon_event_count(u32 rmid, struct rmid_read *rr) +static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) { + int cpu = smp_processor_id(); + struct rdt_mon_domain *d; struct mbm_state *m; + int err, ret; u64 tval = 0; if (rr->first) { - resctrl_arch_reset_rmid(rr->r, rr->d, rmid, rr->evtid); - m = get_mbm_state(rr->d, rmid, rr->evtid); + resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid); + m = get_mbm_state(rr->d, closid, rmid, rr->evtid); if (m) memset(m, 0, sizeof(struct mbm_state)); return 0; } - rr->err = resctrl_arch_rmid_read(rr->r, rr->d, rmid, rr->evtid, &tval); - if (rr->err) - return rr->err; + if (rr->d) { + /* Reading a single domain, must be on a CPU in that domain. */ + if (!cpumask_test_cpu(cpu, &rr->d->hdr.cpu_mask)) + return -EINVAL; + rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, + rr->evtid, &tval, rr->arch_mon_ctx); + if (rr->err) + return rr->err; - rr->val += tval; + rr->val += tval; - return 0; + return 0; + } + + /* Summing domains that share a cache, must be on a CPU for that cache. */ + if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map)) + return -EINVAL; + + /* + * Legacy files must report the sum of an event across all + * domains that share the same L3 cache instance. + * Report success if a read from any domain succeeds, -EINVAL + * (translated to "Unavailable" for user space) if reading from + * all domains fail for any reason. + */ + ret = -EINVAL; + list_for_each_entry(d, &rr->r->mon_domains, hdr.list) { + if (d->ci->id != rr->ci->id) + continue; + err = resctrl_arch_rmid_read(rr->r, d, closid, rmid, + rr->evtid, &tval, rr->arch_mon_ctx); + if (!err) { + rr->val += tval; + ret = 0; + } + } + + if (ret) + rr->err = ret; + + return ret; } /* * mbm_bw_count() - Update bw count from values previously read by * __mon_event_count(). + * @closid: The closid used to identify the cached mbm_state. * @rmid: The rmid used to identify the cached mbm_state. * @rr: The struct rmid_read populated by __mon_event_count(). * @@ -429,10 +661,14 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr) * __mon_event_count() is compared with the chunks value from the previous * invocation. This must be called once per second to maintain values in MBps. 
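As a worked example of the once-per-second contract documented above (numbers invented): if the byte counter grew by 300 MiB between two polls one second apart, the computed bandwidth is 300 MBps, with no explicit time division needed.

#include <linux/sizes.h>
#include <linux/types.h>

static u32 example_mbm_bw(void)
{
        u64 prev_bw_bytes = 7ULL * SZ_1G;
        u64 cur_bytes = prev_bw_bytes + 300 * SZ_1M;

        /* Polls are one second apart, so bytes / SZ_1M is already MBps. */
        return (cur_bytes - prev_bw_bytes) / SZ_1M;     /* 300 */
}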
*/ -static void mbm_bw_count(u32 rmid, struct rmid_read *rr) +static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr) { - struct mbm_state *m = &rr->d->mbm_local[rmid]; u64 cur_bw, bytes, cur_bytes; + struct mbm_state *m; + + m = get_mbm_state(rr->d, closid, rmid, rr->evtid); + if (WARN_ON_ONCE(!m)) + return; cur_bytes = rr->val; bytes = cur_bytes - m->prev_bw_bytes; @@ -440,14 +676,11 @@ static void mbm_bw_count(u32 rmid, struct rmid_read *rr) cur_bw = bytes / SZ_1M; - if (m->delta_comp) - m->delta_bw = abs(cur_bw - m->prev_bw); - m->delta_comp = false; m->prev_bw = cur_bw; } /* - * This is called via IPI to read the CQM/MBM counters + * This is scheduled by mon_event_read() to read the CQM/MBM counters * on a domain. */ void mon_event_count(void *info) @@ -459,7 +692,7 @@ void mon_event_count(void *info) rdtgrp = rr->rgrp; - ret = __mon_event_count(rdtgrp->mon.rmid, rr); + ret = __mon_event_count(rdtgrp->closid, rdtgrp->mon.rmid, rr); /* * For Ctrl groups read data from child monitor groups and @@ -470,7 +703,8 @@ void mon_event_count(void *info) if (rdtgrp->type == RDTCTRL_GROUP) { list_for_each_entry(entry, head, mon.crdtgrp_list) { - if (__mon_event_count(entry->mon.rmid, rr) == 0) + if (__mon_event_count(entry->closid, entry->mon.rmid, + rr) == 0) ret = 0; } } @@ -516,26 +750,27 @@ void mon_event_count(void *info) * throttle MSRs already have low percentage values. To avoid * unnecessarily restricting such rdtgroups, we also increase the bandwidth. */ -static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) +static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm) { u32 closid, rmid, cur_msr_val, new_msr_val; struct mbm_state *pmbm_data, *cmbm_data; - u32 cur_bw, delta_bw, user_bw; + struct rdt_ctrl_domain *dom_mba; + enum resctrl_event_id evt_id; struct rdt_resource *r_mba; - struct rdt_domain *dom_mba; struct list_head *head; struct rdtgroup *entry; - - if (!is_mbm_local_enabled()) - return; + u32 cur_bw, user_bw; r_mba = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl; + evt_id = rgrp->mba_mbps_event; closid = rgrp->closid; rmid = rgrp->mon.rmid; - pmbm_data = &dom_mbm->mbm_local[rmid]; + pmbm_data = get_mbm_state(dom_mbm, closid, rmid, evt_id); + if (WARN_ON_ONCE(!pmbm_data)) + return; - dom_mba = get_domain_from_cpu(smp_processor_id(), r_mba); + dom_mba = get_ctrl_domain_from_cpu(smp_processor_id(), r_mba); if (!dom_mba) { pr_warn_once("Failure to get domain for MBA update\n"); return; @@ -543,7 +778,6 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) cur_bw = pmbm_data->prev_bw; user_bw = dom_mba->mbps_val[closid]; - delta_bw = pmbm_data->delta_bw; /* MBA resource doesn't support CDP */ cur_msr_val = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE); @@ -553,83 +787,76 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) */ head = &rgrp->mon.crdtgrp_list; list_for_each_entry(entry, head, mon.crdtgrp_list) { - cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid]; + cmbm_data = get_mbm_state(dom_mbm, entry->closid, entry->mon.rmid, evt_id); + if (WARN_ON_ONCE(!cmbm_data)) + return; cur_bw += cmbm_data->prev_bw; - delta_bw += cmbm_data->delta_bw; } /* * Scale up/down the bandwidth linearly for the ctrl group. The * bandwidth step is the bandwidth granularity specified by the * hardware. - * - * The delta_bw is used when increasing the bandwidth so that we - * dont alternately increase and decrease the control values - * continuously. 
- * - * For ex: consider cur_bw = 90MBps, user_bw = 100MBps and if - * bandwidth step is 20MBps(> user_bw - cur_bw), we would keep - * switching between 90 and 110 continuously if we only check - * cur_bw < user_bw. + * Always increase throttling if current bandwidth is above the + * target set by user. + * But avoid thrashing up and down on every poll by checking + * whether a decrease in throttling is likely to push the group + * back over target. E.g. if currently throttling to 30% of bandwidth + * on a system with 10% granularity steps, check whether moving to + * 40% would go past the limit by multiplying current bandwidth by + * "(30 + 10) / 30". */ if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) { new_msr_val = cur_msr_val - r_mba->membw.bw_gran; } else if (cur_msr_val < MAX_MBA_BW && - (user_bw > (cur_bw + delta_bw))) { + (user_bw > (cur_bw * (cur_msr_val + r_mba->membw.min_bw) / cur_msr_val))) { new_msr_val = cur_msr_val + r_mba->membw.bw_gran; } else { return; } resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val); - - /* - * Delta values are updated dynamically package wise for each - * rdtgrp every time the throttle MSR changes value. - * - * This is because (1)the increase in bandwidth is not perfectly - * linear and only "approximately" linear even when the hardware - * says it is linear.(2)Also since MBA is a core specific - * mechanism, the delta values vary based on number of cores used - * by the rdtgrp. - */ - pmbm_data->delta_comp = true; - list_for_each_entry(entry, head, mon.crdtgrp_list) { - cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid]; - cmbm_data->delta_comp = true; - } } -static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid) +static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 closid, u32 rmid, enum resctrl_event_id evtid) { - struct rmid_read rr; + struct rmid_read rr = {0}; - rr.first = false; rr.r = r; rr.d = d; + rr.evtid = evtid; + rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); + if (IS_ERR(rr.arch_mon_ctx)) { + pr_warn_ratelimited("Failed to allocate monitor context: %ld", + PTR_ERR(rr.arch_mon_ctx)); + return; + } + + __mon_event_count(closid, rmid, &rr); /* - * This is protected from concurrent reads from user - * as both the user and we hold the global mutex. + * If the software controller is enabled, compute the + * bandwidth for this event id. */ - if (is_mbm_total_enabled()) { - rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID; - rr.val = 0; - __mon_event_count(rmid, &rr); - } - if (is_mbm_local_enabled()) { - rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID; - rr.val = 0; - __mon_event_count(rmid, &rr); + if (is_mba_sc(NULL)) + mbm_bw_count(closid, rmid, &rr); - /* - * Call the MBA software controller only for the - * control groups and when user has enabled - * the software controller explicitly. - */ - if (is_mba_sc(NULL)) - mbm_bw_count(rmid, &rr); - } + resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); +} + +static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 closid, u32 rmid) +{ + /* + * This is protected from concurrent reads from user as both + * the user and overflow handler hold the global mutex. 
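The new step-up check in update_mba_bw() above can be read with the numbers from its comment (all invented): throttled to 30% with 10% steps and measuring 900 MBps, relaxing one step to 40% is projected to give 900 * (30 + 10) / 30 = 1200 MBps, so a 1000 MBps target is left alone rather than thrashed around.

static bool example_mba_sc_step_up(void)
{
        u32 cur_msr_val = 30, min_bw = 10;      /* current throttle %, step size */
        u32 cur_bw = 900, user_bw = 1000;       /* measured vs requested MBps */
        u32 projected = cur_bw * (cur_msr_val + min_bw) / cur_msr_val;

        /* 1000 <= 1200: one step of relief would overshoot, so stay put. */
        return user_bw > projected;
}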
+ */ + if (is_mbm_total_enabled()) + mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID); + + if (is_mbm_local_enabled()) + mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID); } /* @@ -639,106 +866,193 @@ static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid) void cqm_handle_limbo(struct work_struct *work) { unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL); - int cpu = smp_processor_id(); - struct rdt_resource *r; - struct rdt_domain *d; + struct rdt_mon_domain *d; + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); - r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; - d = container_of(work, struct rdt_domain, cqm_limbo.work); + d = container_of(work, struct rdt_mon_domain, cqm_limbo.work); __check_limbo(d, false); - if (has_busy_rmid(r, d)) - schedule_delayed_work_on(cpu, &d->cqm_limbo, delay); + if (has_busy_rmid(d)) { + d->cqm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask, + RESCTRL_PICK_ANY_CPU); + schedule_delayed_work_on(d->cqm_work_cpu, &d->cqm_limbo, + delay); + } mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); } -void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms) +/** + * cqm_setup_limbo_handler() - Schedule the limbo handler to run for this + * domain. + * @dom: The domain the limbo handler should run for. + * @delay_ms: How far in the future the handler should run. + * @exclude_cpu: Which CPU the handler should not run on, + * RESCTRL_PICK_ANY_CPU to pick any CPU. + */ +void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, + int exclude_cpu) { unsigned long delay = msecs_to_jiffies(delay_ms); int cpu; - cpu = cpumask_any(&dom->cpu_mask); + cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu); dom->cqm_work_cpu = cpu; - schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay); + if (cpu < nr_cpu_ids) + schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay); } void mbm_handle_overflow(struct work_struct *work) { unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL); struct rdtgroup *prgrp, *crgrp; - int cpu = smp_processor_id(); + struct rdt_mon_domain *d; struct list_head *head; struct rdt_resource *r; - struct rdt_domain *d; + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); - if (!static_branch_likely(&rdt_mon_enable_key)) + /* + * If the filesystem has been unmounted this work no longer needs to + * run. + */ + if (!resctrl_mounted || !resctrl_arch_mon_capable()) goto out_unlock; r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; - d = container_of(work, struct rdt_domain, mbm_over.work); + d = container_of(work, struct rdt_mon_domain, mbm_over.work); list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { - mbm_update(r, d, prgrp->mon.rmid); + mbm_update(r, d, prgrp->closid, prgrp->mon.rmid); head = &prgrp->mon.crdtgrp_list; list_for_each_entry(crgrp, head, mon.crdtgrp_list) - mbm_update(r, d, crgrp->mon.rmid); + mbm_update(r, d, crgrp->closid, crgrp->mon.rmid); if (is_mba_sc(NULL)) update_mba_bw(prgrp, d); } - schedule_delayed_work_on(cpu, &d->mbm_over, delay); + /* + * Re-check for housekeeping CPUs. This allows the overflow handler to + * move off a nohz_full CPU quickly. 
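The overflow handler above re-picks its target CPU on every period so the work can drift off nohz_full CPUs. Stripped of the resctrl specifics, the self-rearming pattern looks like the sketch below; poll_fn()/poll_work are illustrative names and cpumask_any(cpu_online_mask) stands in for cpumask_any_housekeeping().

#include <linux/cpumask.h>
#include <linux/jiffies.h>
#include <linux/workqueue.h>

#define POLL_INTERVAL_MS        1000

static void poll_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(poll_work, poll_fn);

static void poll_fn(struct work_struct *work)
{
        int cpu;

        /* ... periodic work goes here ... */

        /*
         * Re-pick the CPU each time instead of reusing smp_processor_id(),
         * so the next run can land on a housekeeping CPU.
         */
        cpu = cpumask_any(cpu_online_mask);
        schedule_delayed_work_on(cpu, &poll_work,
                                 msecs_to_jiffies(POLL_INTERVAL_MS));
}

/*
 * Something must arm the first run, e.g. at init time:
 *      schedule_delayed_work_on(cpumask_any(cpu_online_mask), &poll_work,
 *                               msecs_to_jiffies(POLL_INTERVAL_MS));
 */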
+ */ + d->mbm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask, + RESCTRL_PICK_ANY_CPU); + schedule_delayed_work_on(d->mbm_work_cpu, &d->mbm_over, delay); out_unlock: mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); } -void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms) +/** + * mbm_setup_overflow_handler() - Schedule the overflow handler to run for this + * domain. + * @dom: The domain the overflow handler should run for. + * @delay_ms: How far in the future the handler should run. + * @exclude_cpu: Which CPU the handler should not run on, + * RESCTRL_PICK_ANY_CPU to pick any CPU. + */ +void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, + int exclude_cpu) { unsigned long delay = msecs_to_jiffies(delay_ms); int cpu; - if (!static_branch_likely(&rdt_mon_enable_key)) + /* + * When a domain comes online there is no guarantee the filesystem is + * mounted. If not, there is no need to catch counter overflow. + */ + if (!resctrl_mounted || !resctrl_arch_mon_capable()) return; - cpu = cpumask_any(&dom->cpu_mask); + cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu); dom->mbm_work_cpu = cpu; - schedule_delayed_work_on(cpu, &dom->mbm_over, delay); + + if (cpu < nr_cpu_ids) + schedule_delayed_work_on(cpu, &dom->mbm_over, delay); } static int dom_data_init(struct rdt_resource *r) { + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); + u32 num_closid = resctrl_arch_get_num_closid(r); struct rmid_entry *entry = NULL; - int i, nr_rmids; + int err = 0, i; + u32 idx; - nr_rmids = r->num_rmid; - rmid_ptrs = kcalloc(nr_rmids, sizeof(struct rmid_entry), GFP_KERNEL); - if (!rmid_ptrs) - return -ENOMEM; + mutex_lock(&rdtgroup_mutex); + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { + u32 *tmp; - for (i = 0; i < nr_rmids; i++) { + /* + * If the architecture hasn't provided a sanitised value here, + * this may result in larger arrays than necessary. Resctrl will + * use a smaller system wide value based on the resources in + * use. + */ + tmp = kcalloc(num_closid, sizeof(*tmp), GFP_KERNEL); + if (!tmp) { + err = -ENOMEM; + goto out_unlock; + } + + closid_num_dirty_rmid = tmp; + } + + rmid_ptrs = kcalloc(idx_limit, sizeof(struct rmid_entry), GFP_KERNEL); + if (!rmid_ptrs) { + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { + kfree(closid_num_dirty_rmid); + closid_num_dirty_rmid = NULL; + } + err = -ENOMEM; + goto out_unlock; + } + + for (i = 0; i < idx_limit; i++) { entry = &rmid_ptrs[i]; INIT_LIST_HEAD(&entry->list); - entry->rmid = i; + resctrl_arch_rmid_idx_decode(i, &entry->closid, &entry->rmid); list_add_tail(&entry->list, &rmid_free_lru); } /* - * RMID 0 is special and is always allocated. It's used for all - * tasks that are not monitored. + * RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID are special and + * are always allocated. These are used for the rdtgroup_default + * control group, which will be setup later in rdtgroup_init(). 
*/ - entry = __rmid_entry(0); + idx = resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, + RESCTRL_RESERVED_RMID); + entry = __rmid_entry(idx); list_del(&entry->list); - return 0; +out_unlock: + mutex_unlock(&rdtgroup_mutex); + + return err; +} + +static void __exit dom_data_exit(void) +{ + mutex_lock(&rdtgroup_mutex); + + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { + kfree(closid_num_dirty_rmid); + closid_num_dirty_rmid = NULL; + } + + kfree(rmid_ptrs); + rmid_ptrs = NULL; + + mutex_unlock(&rdtgroup_mutex); } static struct mon_evt llc_occupancy_event = { @@ -775,6 +1089,89 @@ static void l3_mon_evt_init(struct rdt_resource *r) list_add_tail(&mbm_local_event.list, &r->evt_list); } +/* + * The power-on reset value of MSR_RMID_SNC_CONFIG is 0x1 + * which indicates that RMIDs are configured in legacy mode. + * This mode is incompatible with Linux resctrl semantics + * as RMIDs are partitioned between SNC nodes, which requires + * a user to know which RMID is allocated to a task. + * Clearing bit 0 reconfigures the RMID counters for use + * in RMID sharing mode. This mode is better for Linux. + * The RMID space is divided between all SNC nodes with the + * RMIDs renumbered to start from zero in each node when + * counting operations from tasks. Code to read the counters + * must adjust RMID counter numbers based on SNC node. See + * logical_rmid_to_physical_rmid() for code that does this. + */ +void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d) +{ + if (snc_nodes_per_l3_cache > 1) + msr_clear_bit(MSR_RMID_SNC_CONFIG, 0); +} + +/* CPU models that support MSR_RMID_SNC_CONFIG */ +static const struct x86_cpu_id snc_cpu_ids[] __initconst = { + X86_MATCH_VFM(INTEL_ICELAKE_X, 0), + X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, 0), + X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, 0), + X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, 0), + X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, 0), + {} +}; + +/* + * There isn't a simple hardware bit that indicates whether a CPU is running + * in Sub-NUMA Cluster (SNC) mode. Infer the state by comparing the + * number of CPUs sharing the L3 cache with CPU0 to the number of CPUs in + * the same NUMA node as CPU0. + * It is not possible to accurately determine SNC state if the system is + * booted with a maxcpus=N parameter. That distorts the ratio of SNC nodes + * to L3 caches. It will be OK if system is booted with hyperthreading + * disabled (since this doesn't affect the ratio). + */ +static __init int snc_get_config(void) +{ + struct cacheinfo *ci = get_cpu_cacheinfo_level(0, RESCTRL_L3_CACHE); + const cpumask_t *node0_cpumask; + int cpus_per_node, cpus_per_l3; + int ret; + + if (!x86_match_cpu(snc_cpu_ids) || !ci) + return 1; + + cpus_read_lock(); + if (num_online_cpus() != num_present_cpus()) + pr_warn("Some CPUs offline, SNC detection may be incorrect\n"); + cpus_read_unlock(); + + node0_cpumask = cpumask_of_node(cpu_to_node(0)); + + cpus_per_node = cpumask_weight(node0_cpumask); + cpus_per_l3 = cpumask_weight(&ci->shared_cpu_map); + + if (!cpus_per_node || !cpus_per_l3) + return 1; + + ret = cpus_per_l3 / cpus_per_node; + + /* sanity check: Only valid results are 1, 2, 3, 4, 6 */ + switch (ret) { + case 1: + break; + case 2 ... 
4: + case 6: + pr_info("Sub-NUMA Cluster mode detected with %d nodes per L3 cache\n", ret); + rdt_resources_all[RDT_RESOURCE_L3].r_resctrl.mon_scope = RESCTRL_L3_NODE; + break; + default: + pr_warn("Ignore improbable SNC node count %d\n", ret); + ret = 1; + break; + } + + return ret; +} + int __init rdt_get_mon_l3_config(struct rdt_resource *r) { unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset; @@ -782,9 +1179,11 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) unsigned int threshold; int ret; + snc_nodes_per_l3_cache = snc_get_config(); + resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024; - hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale; - r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1; + hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale / snc_nodes_per_l3_cache; + r->num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache; hw_res->mbm_width = MBM_CNTR_WIDTH_BASE; if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX) @@ -813,13 +1212,21 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) return ret; if (rdt_cpu_has(X86_FEATURE_BMEC)) { + u32 eax, ebx, ecx, edx; + + /* Detect list of bandwidth sources that can be tracked */ + cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx); + hw_res->mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS; + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) { mbm_total_event.configurable = true; - mbm_config_rftype_init("mbm_total_bytes_config"); + resctrl_file_fflags_init("mbm_total_bytes_config", + RFTYPE_MON_INFO | RFTYPE_RES_CACHE); } if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) { mbm_local_event.configurable = true; - mbm_config_rftype_init("mbm_local_bytes_config"); + resctrl_file_fflags_init("mbm_local_bytes_config", + RFTYPE_MON_INFO | RFTYPE_RES_CACHE); } } @@ -830,6 +1237,11 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) return 0; } +void __exit rdt_put_mon_l3_config(void) +{ + dom_data_exit(); +} + void __init intel_rdt_mbm_apply_quirk(void) { int cf_index; diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 8f559eeae08e..42cc162f7fc9 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -11,7 +11,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/cacheinfo.h> #include <linux/cpu.h> #include <linux/cpumask.h> #include <linux/debugfs.h> @@ -23,7 +22,7 @@ #include <linux/uaccess.h> #include <asm/cacheflush.h> -#include <asm/intel-family.h> +#include <asm/cpu_device_id.h> #include <asm/resctrl.h> #include <asm/perf_event.h> @@ -31,7 +30,7 @@ #include "internal.h" #define CREATE_TRACE_POINTS -#include "pseudo_lock_event.h" +#include "trace.h" /* * The bits needed to disable hardware prefetching varies based on the @@ -88,8 +87,8 @@ static u64 get_prefetch_disable_bits(void) boot_cpu_data.x86 != 6) return 0; - switch (boot_cpu_data.x86_model) { - case INTEL_FAM6_BROADWELL_X: + switch (boot_cpu_data.x86_vfm) { + case INTEL_BROADWELL_X: /* * SDM defines bits of MSR_MISC_FEATURE_CONTROL register * as: @@ -100,8 +99,8 @@ static u64 get_prefetch_disable_bits(void) * 63:4 Reserved */ return 0xF; - case INTEL_FAM6_ATOM_GOLDMONT: - case INTEL_FAM6_ATOM_GOLDMONT_PLUS: + case INTEL_ATOM_GOLDMONT: + case INTEL_ATOM_GOLDMONT_PLUS: /* * SDM defines bits of MSR_MISC_FEATURE_CONTROL register * as: @@ -221,7 +220,7 @@ static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr) int cpu; int ret; - for_each_cpu(cpu, &plr->d->cpu_mask) { + 
for_each_cpu(cpu, &plr->d->hdr.cpu_mask) { pm_req = kzalloc(sizeof(*pm_req), GFP_KERNEL); if (!pm_req) { rdt_last_cmd_puts("Failure to allocate memory for PM QoS\n"); @@ -292,12 +291,15 @@ static void pseudo_lock_region_clear(struct pseudo_lock_region *plr) */ static int pseudo_lock_region_init(struct pseudo_lock_region *plr) { - struct cpu_cacheinfo *ci; + enum resctrl_scope scope = plr->s->res->ctrl_scope; + struct cacheinfo *ci; int ret; - int i; + + if (WARN_ON_ONCE(scope != RESCTRL_L2_CACHE && scope != RESCTRL_L3_CACHE)) + return -ENODEV; /* Pick the first cpu we find that is associated with the cache. */ - plr->cpu = cpumask_first(&plr->d->cpu_mask); + plr->cpu = cpumask_first(&plr->d->hdr.cpu_mask); if (!cpu_online(plr->cpu)) { rdt_last_cmd_printf("CPU %u associated with cache not online\n", @@ -306,15 +308,11 @@ static int pseudo_lock_region_init(struct pseudo_lock_region *plr) goto out_region; } - ci = get_cpu_cacheinfo(plr->cpu); - - plr->size = rdtgroup_cbm_to_size(plr->s->res, plr->d, plr->cbm); - - for (i = 0; i < ci->num_leaves; i++) { - if (ci->info_list[i].level == plr->s->res->cache_level) { - plr->line_size = ci->info_list[i].coherency_line_size; - return 0; - } + ci = get_cpu_cacheinfo_level(plr->cpu, scope); + if (ci) { + plr->line_size = ci->coherency_line_size; + plr->size = rdtgroup_cbm_to_size(plr->s->res, plr->d, plr->cbm); + return 0; } ret = -1; @@ -461,7 +459,7 @@ static int pseudo_lock_fn(void *_rdtgrp) * increase likelihood that allocated cache portion will be filled * with associated memory. */ - native_wbinvd(); + wbinvd(); /* * Always called with interrupts enabled. By disabling interrupts @@ -581,7 +579,7 @@ static int rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp) if (ret) goto err_cpus; - if (rdt_mon_capable) { + if (resctrl_arch_mon_capable()) { ret = rdtgroup_kn_mode_restrict(rdtgrp, "mon_groups"); if (ret) goto err_cpus_list; @@ -628,7 +626,7 @@ static int rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp) if (ret) goto err_cpus; - if (rdt_mon_capable) { + if (resctrl_arch_mon_capable()) { ret = rdtgroup_kn_mode_restore(rdtgrp, "mon_groups", 0777); if (ret) goto err_cpus_list; @@ -752,7 +750,7 @@ int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp) * anymore when this group would be used for pseudo-locking. This * is safe to call on platforms not capable of monitoring. */ - free_rmid(rdtgrp->mon.rmid); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); ret = 0; goto out; @@ -776,8 +774,8 @@ int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) { int ret; - if (rdt_mon_capable) { - ret = alloc_rmid(); + if (resctrl_arch_mon_capable()) { + ret = alloc_rmid(rdtgrp->closid); if (ret < 0) { rdt_last_cmd_puts("Out of RMIDs\n"); return ret; @@ -787,7 +785,7 @@ int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) ret = rdtgroup_locksetup_user_restore(rdtgrp); if (ret) { - free_rmid(rdtgrp->mon.rmid); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); return ret; } @@ -810,7 +808,7 @@ int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) * Return: true if @cbm overlaps with pseudo-locked region on @d, false * otherwise. */ -bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, unsigned long cbm) +bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm) { unsigned int cbm_len; unsigned long cbm_b; @@ -837,13 +835,16 @@ bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, unsigned long cbm * if it is not possible to test due to memory allocation issue, * false otherwise. 
*/ -bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d) +bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d) { + struct rdt_ctrl_domain *d_i; cpumask_var_t cpu_with_psl; struct rdt_resource *r; - struct rdt_domain *d_i; bool ret = false; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + if (!zalloc_cpumask_var(&cpu_with_psl, GFP_KERNEL)) return true; @@ -852,10 +853,10 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d) * associated with them. */ for_each_alloc_capable_rdt_resource(r) { - list_for_each_entry(d_i, &r->domains, list) { + list_for_each_entry(d_i, &r->ctrl_domains, hdr.list) { if (d_i->plr) cpumask_or(cpu_with_psl, cpu_with_psl, - &d_i->cpu_mask); + &d_i->hdr.cpu_mask); } } @@ -863,7 +864,7 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d) * Next test if new pseudo-locked region would intersect with * existing region. */ - if (cpumask_intersects(&d->cpu_mask, cpu_with_psl)) + if (cpumask_intersects(&d->hdr.cpu_mask, cpu_with_psl)) ret = true; free_cpumask_var(cpu_with_psl); @@ -1081,9 +1082,9 @@ static int measure_l2_residency(void *_plr) * L2_HIT 02H * L2_MISS 10H */ - switch (boot_cpu_data.x86_model) { - case INTEL_FAM6_ATOM_GOLDMONT: - case INTEL_FAM6_ATOM_GOLDMONT_PLUS: + switch (boot_cpu_data.x86_vfm) { + case INTEL_ATOM_GOLDMONT: + case INTEL_ATOM_GOLDMONT_PLUS: perf_miss_attr.config = X86_CONFIG(.event = 0xd1, .umask = 0x10); perf_hit_attr.config = X86_CONFIG(.event = 0xd1, @@ -1120,8 +1121,8 @@ static int measure_l3_residency(void *_plr) * MISS 41H */ - switch (boot_cpu_data.x86_model) { - case INTEL_FAM6_BROADWELL_X: + switch (boot_cpu_data.x86_vfm) { + case INTEL_BROADWELL_X: /* On BDW the hit event counts references, not hits */ perf_hit_attr.config = X86_CONFIG(.event = 0x2e, .umask = 0x4f); @@ -1139,7 +1140,7 @@ static int measure_l3_residency(void *_plr) */ counts.miss_after -= counts.miss_before; - if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X) { + if (boot_cpu_data.x86_vfm == INTEL_BROADWELL_X) { /* * On BDW references and misses are counted, need to adjust. 
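The switch() conversions in these pseudo_lock.c hunks (x86_model with INTEL_FAM6_* becoming x86_vfm with INTEL_*) compare one packed vendor-family-model key instead of a bare model number, which also removes the need for a separate family check. A minimal sketch of the idea; the bit layout below is an assumption for illustration only, the authoritative packing is VFM_MAKE() in <asm/cpu_device_id.h>:

/* Assumed packing for illustration: vendor | family | model, one byte each. */
#define DEMO_VFM(vendor, family, model) \
	(((unsigned int)(vendor) << 16) | ((unsigned int)(family) << 8) | (model))

static int demo_is_goldmont(unsigned int vfm)
{
	switch (vfm) {
	case DEMO_VFM(0 /* X86_VENDOR_INTEL */, 6, 0x5c):	/* Atom Goldmont */
		return 1;
	default:
		return 0;
	}
}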
* Sometimes the "hits" counter is a bit more than the @@ -1195,7 +1196,7 @@ static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel) } plr->thread_done = 0; - cpu = cpumask_first(&plr->d->cpu_mask); + cpu = cpumask_first(&plr->d->hdr.cpu_mask); if (!cpu_online(cpu)) { ret = -ENODEV; goto out; @@ -1204,20 +1205,14 @@ static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel) plr->cpu = cpu; if (sel == 1) - thread = kthread_create_on_node(measure_cycles_lat_fn, plr, - cpu_to_node(cpu), - "pseudo_lock_measure/%u", - cpu); + thread = kthread_run_on_cpu(measure_cycles_lat_fn, plr, + cpu, "pseudo_lock_measure/%u"); else if (sel == 2) - thread = kthread_create_on_node(measure_l2_residency, plr, - cpu_to_node(cpu), - "pseudo_lock_measure/%u", - cpu); + thread = kthread_run_on_cpu(measure_l2_residency, plr, + cpu, "pseudo_lock_measure/%u"); else if (sel == 3) - thread = kthread_create_on_node(measure_l3_residency, plr, - cpu_to_node(cpu), - "pseudo_lock_measure/%u", - cpu); + thread = kthread_run_on_cpu(measure_l3_residency, plr, + cpu, "pseudo_lock_measure/%u"); else goto out; @@ -1225,8 +1220,6 @@ static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel) ret = PTR_ERR(thread); goto out; } - kthread_bind(thread, cpu); - wake_up_process(thread); ret = wait_event_interruptible(plr->lock_thread_wq, plr->thread_done == 1); @@ -1314,18 +1307,14 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) plr->thread_done = 0; - thread = kthread_create_on_node(pseudo_lock_fn, rdtgrp, - cpu_to_node(plr->cpu), - "pseudo_lock/%u", plr->cpu); + thread = kthread_run_on_cpu(pseudo_lock_fn, rdtgrp, + plr->cpu, "pseudo_lock/%u"); if (IS_ERR(thread)) { ret = PTR_ERR(thread); rdt_last_cmd_printf("Locking thread returned error %d\n", ret); goto out_cstates; } - kthread_bind(thread, plr->cpu); - wake_up_process(thread); - ret = wait_event_interruptible(plr->lock_thread_wq, plr->thread_done == 1); if (ret < 0) { @@ -1525,7 +1514,7 @@ static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma) * may be scheduled elsewhere and invalidate entries in the * pseudo-locked region. */ - if (!cpumask_subset(current->cpus_ptr, &plr->d->cpu_mask)) { + if (!cpumask_subset(current->cpus_ptr, &plr->d->hdr.cpu_mask)) { mutex_unlock(&rdtgroup_mutex); return -EINVAL; } @@ -1566,7 +1555,6 @@ static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma) static const struct file_operations pseudo_lock_dev_fops = { .owner = THIS_MODULE, - .llseek = no_llseek, .read = NULL, .write = NULL, .open = pseudo_lock_dev_open, diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 69a1de92384a..6419e04d8a7b 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -12,7 +12,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/cacheinfo.h> #include <linux/cpu.h> #include <linux/debugfs.h> #include <linux/fs.h> @@ -35,6 +34,10 @@ DEFINE_STATIC_KEY_FALSE(rdt_enable_key); DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key); DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key); + +/* Mutex to protect rdtgroup access. */ +DEFINE_MUTEX(rdtgroup_mutex); + static struct kernfs_root *rdt_root; struct rdtgroup rdtgroup_default; LIST_HEAD(rdt_all_groups); @@ -42,6 +45,9 @@ LIST_HEAD(rdt_all_groups); /* list of entries for the schemata file */ LIST_HEAD(resctrl_schema_all); +/* The filesystem can only be mounted once. 
*/ +bool resctrl_mounted; + /* Kernel fs node for "info" directory under root */ static struct kernfs_node *kn_info; @@ -59,6 +65,15 @@ static void rdtgroup_destroy_root(void); struct dentry *debugfs_resctrl; +/* + * Memory bandwidth monitoring event to use for the default CTRL_MON group + * and each new CTRL_MON group created by the user. Only relevant when + * the filesystem is mounted with the "mba_MBps" option so it does not + * matter that it remains uninitialized on systems that do not support + * the "mba_MBps" option. + */ +enum resctrl_event_id mba_mbps_default_event; + static bool resctrl_debug; void rdt_last_cmd_clear(void) @@ -85,13 +100,13 @@ void rdt_last_cmd_printf(const char *fmt, ...) void rdt_staged_configs_clear(void) { + struct rdt_ctrl_domain *dom; struct rdt_resource *r; - struct rdt_domain *dom; lockdep_assert_held(&rdtgroup_mutex); for_each_alloc_capable_rdt_resource(r) { - list_for_each_entry(dom, &r->domains, list) + list_for_each_entry(dom, &r->ctrl_domains, hdr.list) memset(dom->staged_config, 0, sizeof(dom->staged_config)); } } @@ -102,7 +117,7 @@ void rdt_staged_configs_clear(void) * * Using a global CLOSID across all resources has some advantages and * some drawbacks: - * + We can simply set "current->closid" to assign a task to a resource + * + We can simply set current's closid to assign a task to a resource * group. * + Context switch code can avoid extra memory references deciding which * CLOSID to load into the PQR_ASSOC MSR @@ -111,7 +126,7 @@ void rdt_staged_configs_clear(void) * - Our choices on how to configure each resource become progressively more * limited as the number of resources grows. */ -static int closid_free_map; +static unsigned long closid_free_map; static int closid_free_map_len; int closids_supported(void) @@ -130,26 +145,39 @@ static void closid_init(void) closid_free_map = BIT_MASK(rdt_min_closid) - 1; - /* CLOSID 0 is always reserved for the default group */ - closid_free_map &= ~1; + /* RESCTRL_RESERVED_CLOSID is always reserved for the default group */ + __clear_bit(RESCTRL_RESERVED_CLOSID, &closid_free_map); closid_free_map_len = rdt_min_closid; } static int closid_alloc(void) { - u32 closid = ffs(closid_free_map); + int cleanest_closid; + u32 closid; - if (closid == 0) - return -ENOSPC; - closid--; - closid_free_map &= ~(1 << closid); + lockdep_assert_held(&rdtgroup_mutex); + + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { + cleanest_closid = resctrl_find_cleanest_closid(); + if (cleanest_closid < 0) + return cleanest_closid; + closid = cleanest_closid; + } else { + closid = ffs(closid_free_map); + if (closid == 0) + return -ENOSPC; + closid--; + } + __clear_bit(closid, &closid_free_map); return closid; } void closid_free(int closid) { - closid_free_map |= 1 << closid; + lockdep_assert_held(&rdtgroup_mutex); + + __set_bit(closid, &closid_free_map); } /** @@ -159,9 +187,11 @@ void closid_free(int closid) * Return: true if @closid is currently associated with a resource group, * false if @closid is free */ -static bool closid_allocated(unsigned int closid) +bool closid_allocated(unsigned int closid) { - return (closid_free_map & (1 << closid)) == 0; + lockdep_assert_held(&rdtgroup_mutex); + + return !test_bit(closid, &closid_free_map); } /** @@ -295,7 +325,7 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of, rdt_last_cmd_puts("Cache domain offline\n"); ret = -ENODEV; } else { - mask = &rdtgrp->plr->d->cpu_mask; + mask = &rdtgrp->plr->d->hdr.cpu_mask; seq_printf(s, is_cpu_list(of) ? 
"%*pbl\n" : "%*pb\n", cpumask_pr_args(mask)); @@ -559,14 +589,26 @@ static void update_task_closid_rmid(struct task_struct *t) _update_task_closid_rmid(t); } +static bool task_in_rdtgroup(struct task_struct *tsk, struct rdtgroup *rdtgrp) +{ + u32 closid, rmid = rdtgrp->mon.rmid; + + if (rdtgrp->type == RDTCTRL_GROUP) + closid = rdtgrp->closid; + else if (rdtgrp->type == RDTMON_GROUP) + closid = rdtgrp->mon.parent->closid; + else + return false; + + return resctrl_arch_match_closid(tsk, closid) && + resctrl_arch_match_rmid(tsk, closid, rmid); +} + static int __rdtgroup_move_task(struct task_struct *tsk, struct rdtgroup *rdtgrp) { /* If the task is already in rdtgrp, no need to move the task. */ - if ((rdtgrp->type == RDTCTRL_GROUP && tsk->closid == rdtgrp->closid && - tsk->rmid == rdtgrp->mon.rmid) || - (rdtgrp->type == RDTMON_GROUP && tsk->rmid == rdtgrp->mon.rmid && - tsk->closid == rdtgrp->mon.parent->closid)) + if (task_in_rdtgroup(tsk, rdtgrp)) return 0; /* @@ -577,19 +619,19 @@ static int __rdtgroup_move_task(struct task_struct *tsk, * For monitor groups, can move the tasks only from * their parent CTRL group. */ - - if (rdtgrp->type == RDTCTRL_GROUP) { - WRITE_ONCE(tsk->closid, rdtgrp->closid); - WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid); - } else if (rdtgrp->type == RDTMON_GROUP) { - if (rdtgrp->mon.parent->closid == tsk->closid) { - WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid); - } else { - rdt_last_cmd_puts("Can't move task to different control group\n"); - return -EINVAL; - } + if (rdtgrp->type == RDTMON_GROUP && + !resctrl_arch_match_closid(tsk, rdtgrp->mon.parent->closid)) { + rdt_last_cmd_puts("Can't move task to different control group\n"); + return -EINVAL; } + if (rdtgrp->type == RDTMON_GROUP) + resctrl_arch_set_closid_rmid(tsk, rdtgrp->mon.parent->closid, + rdtgrp->mon.rmid); + else + resctrl_arch_set_closid_rmid(tsk, rdtgrp->closid, + rdtgrp->mon.rmid); + /* * Ensure the task's closid and rmid are written before determining if * the task is current that will decide if it will be interrupted. @@ -611,14 +653,15 @@ static int __rdtgroup_move_task(struct task_struct *tsk, static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) { - return (rdt_alloc_capable && - (r->type == RDTCTRL_GROUP) && (t->closid == r->closid)); + return (resctrl_arch_alloc_capable() && (r->type == RDTCTRL_GROUP) && + resctrl_arch_match_closid(t, r->closid)); } static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r) { - return (rdt_mon_capable && - (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid)); + return (resctrl_arch_mon_capable() && (r->type == RDTMON_GROUP) && + resctrl_arch_match_rmid(t, r->mon.parent->closid, + r->mon.rmid)); } /** @@ -853,7 +896,7 @@ int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns, mutex_lock(&rdtgroup_mutex); /* Return empty if resctrl has not been mounted. */ - if (!static_branch_unlikely(&rdt_enable_key)) { + if (!resctrl_mounted) { seq_puts(s, "res:\nmon:\n"); goto unlock; } @@ -869,7 +912,7 @@ int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns, rdtg->mode != RDT_MODE_EXCLUSIVE) continue; - if (rdtg->closid != tsk->closid) + if (!resctrl_arch_match_closid(tsk, rdtg->closid)) continue; seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? 
"/" : "", @@ -877,7 +920,8 @@ int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns, seq_puts(s, "mon:"); list_for_each_entry(crg, &rdtg->mon.crdtgrp_list, mon.crdtgrp_list) { - if (tsk->rmid != crg->mon.rmid) + if (!resctrl_arch_match_rmid(tsk, crg->mon.parent->closid, + crg->mon.rmid)) continue; seq_printf(s, "%s", crg->kn->name); break; @@ -976,20 +1020,21 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, unsigned long sw_shareable = 0, hw_shareable = 0; unsigned long exclusive = 0, pseudo_locked = 0; struct rdt_resource *r = s->res; - struct rdt_domain *dom; + struct rdt_ctrl_domain *dom; int i, hwb, swb, excl, psl; enum rdtgrp_mode mode; bool sep = false; u32 ctrl_val; + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); hw_shareable = r->cache.shareable_bits; - list_for_each_entry(dom, &r->domains, list) { + list_for_each_entry(dom, &r->ctrl_domains, hdr.list) { if (sep) seq_putc(seq, ';'); sw_shareable = 0; exclusive = 0; - seq_printf(seq, "%d=", dom->id); + seq_printf(seq, "%d=", dom->hdr.id); for (i = 0; i < closids_supported(); i++) { if (!closid_allocated(i)) continue; @@ -1042,6 +1087,7 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, } seq_putc(seq, '\n'); mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); return 0; } @@ -1205,7 +1251,7 @@ static int rdt_has_sparse_bitmasks_show(struct kernfs_open_file *of, * * Return: false if CBM does not overlap, true if it does. */ -static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, +static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_ctrl_domain *d, unsigned long cbm, int closid, enum resctrl_conf_type type, bool exclusive) { @@ -1260,7 +1306,7 @@ static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d * * Return: true if CBM overlap detected, false if there is no overlap */ -bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_domain *d, +bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d, unsigned long cbm, int closid, bool exclusive) { enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type); @@ -1291,18 +1337,21 @@ bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_domain *d, static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) { int closid = rdtgrp->closid; + struct rdt_ctrl_domain *d; struct resctrl_schema *s; struct rdt_resource *r; bool has_cache = false; - struct rdt_domain *d; u32 ctrl; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + list_for_each_entry(s, &resctrl_schema_all, list) { r = s->res; if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA) continue; has_cache = true; - list_for_each_entry(d, &r->domains, list) { + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { ctrl = resctrl_arch_get_config(r, d, closid, s->conf_type); if (rdtgroup_cbm_overlaps(s, d, ctrl, closid, false)) { @@ -1407,20 +1456,19 @@ out: * bitmap functions work correctly. 
*/ unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, - struct rdt_domain *d, unsigned long cbm) + struct rdt_ctrl_domain *d, unsigned long cbm) { - struct cpu_cacheinfo *ci; unsigned int size = 0; - int num_b, i; + struct cacheinfo *ci; + int num_b; + + if (WARN_ON_ONCE(r->ctrl_scope != RESCTRL_L2_CACHE && r->ctrl_scope != RESCTRL_L3_CACHE)) + return size; num_b = bitmap_weight(&cbm, r->cache.cbm_len); - ci = get_cpu_cacheinfo(cpumask_any(&d->cpu_mask)); - for (i = 0; i < ci->num_leaves; i++) { - if (ci->info_list[i].level == r->cache_level) { - size = ci->info_list[i].size / r->cache.cbm_len * num_b; - break; - } - } + ci = get_cpu_cacheinfo_level(cpumask_any(&d->hdr.cpu_mask), r->ctrl_scope); + if (ci) + size = ci->size / r->cache.cbm_len * num_b; return size; } @@ -1436,9 +1484,9 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, { struct resctrl_schema *schema; enum resctrl_conf_type type; + struct rdt_ctrl_domain *d; struct rdtgroup *rdtgrp; struct rdt_resource *r; - struct rdt_domain *d; unsigned int size; int ret = 0; u32 closid; @@ -1462,7 +1510,7 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, size = rdtgroup_cbm_to_size(rdtgrp->plr->s->res, rdtgrp->plr->d, rdtgrp->plr->cbm); - seq_printf(s, "%d=%u\n", rdtgrp->plr->d->id, size); + seq_printf(s, "%d=%u\n", rdtgrp->plr->d->hdr.id, size); } goto out; } @@ -1474,7 +1522,7 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, type = schema->conf_type; sep = false; seq_printf(s, "%*s:", max_name_width, schema->name); - list_for_each_entry(d, &r->domains, list) { + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { if (sep) seq_putc(s, ';'); if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { @@ -1492,7 +1540,7 @@ static int rdtgroup_size_show(struct kernfs_open_file *of, else size = rdtgroup_cbm_to_size(r, d, ctrl); } - seq_printf(s, "%d=%u", d->id, size); + seq_printf(s, "%d=%u", d->hdr.id, size); sep = true; } seq_putc(s, '\n'); @@ -1550,20 +1598,21 @@ static void mon_event_config_read(void *info) mon_info->mon_config = msrval & MAX_EVT_CONFIG_BITS; } -static void mondata_config_read(struct rdt_domain *d, struct mon_config_info *mon_info) +static void mondata_config_read(struct rdt_mon_domain *d, struct mon_config_info *mon_info) { - smp_call_function_any(&d->cpu_mask, mon_event_config_read, mon_info, 1); + smp_call_function_any(&d->hdr.cpu_mask, mon_event_config_read, mon_info, 1); } static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid) { - struct mon_config_info mon_info = {0}; - struct rdt_domain *dom; + struct mon_config_info mon_info; + struct rdt_mon_domain *dom; bool sep = false; + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); - list_for_each_entry(dom, &r->domains, list) { + list_for_each_entry(dom, &r->mon_domains, hdr.list) { if (sep) seq_puts(s, ";"); @@ -1571,12 +1620,13 @@ static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid mon_info.evtid = evtid; mondata_config_read(dom, &mon_info); - seq_printf(s, "%d=0x%02x", dom->id, mon_info.mon_config); + seq_printf(s, "%d=0x%02x", dom->hdr.id, mon_info.mon_config); sep = true; } seq_puts(s, "\n"); mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); return 0; } @@ -1614,17 +1664,10 @@ static void mon_event_config_write(void *info) wrmsr(MSR_IA32_EVT_CFG_BASE + index, mon_info->mon_config, 0); } -static int mbm_config_write_domain(struct rdt_resource *r, - struct rdt_domain *d, u32 evtid, u32 val) +static void mbm_config_write_domain(struct rdt_resource *r, + struct rdt_mon_domain *d, 
u32 evtid, u32 val) { struct mon_config_info mon_info = {0}; - int ret = 0; - - /* mon_config cannot be more than the supported set of events */ - if (val > MAX_EVT_CONFIG_BITS) { - rdt_last_cmd_puts("Invalid event configuration\n"); - return -EINVAL; - } /* * Read the current config value first. If both are the same then @@ -1633,7 +1676,7 @@ static int mbm_config_write_domain(struct rdt_resource *r, mon_info.evtid = evtid; mondata_config_read(d, &mon_info); if (mon_info.mon_config == val) - goto out; + return; mon_info.mon_config = val; @@ -1643,7 +1686,7 @@ static int mbm_config_write_domain(struct rdt_resource *r, * are scoped at the domain level. Writing any of these MSRs * on one CPU is observed by all the CPUs in the domain. */ - smp_call_function_any(&d->cpu_mask, mon_event_config_write, + smp_call_function_any(&d->hdr.cpu_mask, mon_event_config_write, &mon_info, 1); /* @@ -1656,17 +1699,17 @@ static int mbm_config_write_domain(struct rdt_resource *r, * mbm_local and mbm_total counts for all the RMIDs. */ resctrl_arch_reset_rmid_all(r, d); - -out: - return ret; } static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid) { + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); char *dom_str = NULL, *id_str; unsigned long dom_id, val; - struct rdt_domain *d; - int ret = 0; + struct rdt_mon_domain *d; + + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); next: if (!tok || tok[0] == '\0') @@ -1686,11 +1729,16 @@ next: return -EINVAL; } - list_for_each_entry(d, &r->domains, list) { - if (d->id == dom_id) { - ret = mbm_config_write_domain(r, d, evtid, val); - if (ret) - return -EINVAL; + /* Value from user cannot be more than the supported set of events */ + if ((val & hw_res->mbm_cfg_mask) != val) { + rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n", + hw_res->mbm_cfg_mask); + return -EINVAL; + } + + list_for_each_entry(d, &r->mon_domains, hdr.list) { + if (d->hdr.id == dom_id) { + mbm_config_write_domain(r, d, evtid, val); goto next; } } @@ -1709,6 +1757,7 @@ static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of, if (nbytes == 0 || buf[nbytes - 1] != '\n') return -EINVAL; + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); rdt_last_cmd_clear(); @@ -1718,6 +1767,7 @@ static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of, ret = mon_config_write(r, buf, QOS_L3_MBM_TOTAL_EVENT_ID); mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); return ret ?: nbytes; } @@ -1733,6 +1783,7 @@ static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of, if (nbytes == 0 || buf[nbytes - 1] != '\n') return -EINVAL; + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); rdt_last_cmd_clear(); @@ -1742,6 +1793,7 @@ static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of, ret = mon_config_write(r, buf, QOS_L3_MBM_LOCAL_EVENT_ID); mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); return ret ?: nbytes; } @@ -1899,6 +1951,13 @@ static struct rftype res_common_files[] = { .fflags = RFTYPE_CTRL_BASE, }, { + .name = "mba_MBps_event", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_mba_mbps_event_write, + .seq_show = rdtgroup_mba_mbps_event_show, + }, + { .name = "mode", .mode = 0644, .kf_ops = &rdtgroup_kf_single_ops, @@ -1977,24 +2036,13 @@ static struct rftype *rdtgroup_get_rftype_by_name(const char *name) return NULL; } -void __init thread_throttle_mode_init(void) -{ - struct rftype *rft; - - rft = 
rdtgroup_get_rftype_by_name("thread_throttle_mode"); - if (!rft) - return; - - rft->fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB; -} - -void __init mbm_config_rftype_init(const char *config) +void resctrl_file_fflags_init(const char *config, unsigned long fflags) { struct rftype *rft; rft = rdtgroup_get_rftype_by_name(config); if (rft) - rft->fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE; + rft->fflags = fflags; } /** @@ -2213,11 +2261,14 @@ static inline bool is_mba_linear(void) static int set_cache_qos_cfg(int level, bool enable) { void (*update)(void *arg); + struct rdt_ctrl_domain *d; struct rdt_resource *r_l; cpumask_var_t cpu_mask; - struct rdt_domain *d; int cpu; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + if (level == RDT_RESOURCE_L3) update = l3_qos_cfg_update; else if (level == RDT_RESOURCE_L2) @@ -2229,14 +2280,14 @@ static int set_cache_qos_cfg(int level, bool enable) return -ENOMEM; r_l = &rdt_resources_all[level].r_resctrl; - list_for_each_entry(d, &r_l->domains, list) { + list_for_each_entry(d, &r_l->ctrl_domains, hdr.list) { if (r_l->cache.arch_has_per_cpu_cfg) /* Pick all the CPUs in the domain instance */ - for_each_cpu(cpu, &d->cpu_mask) + for_each_cpu(cpu, &d->hdr.cpu_mask) cpumask_set_cpu(cpu, cpu_mask); else /* Pick one CPU from each domain instance to update MSR */ - cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); + cpumask_set_cpu(cpumask_any(&d->hdr.cpu_mask), cpu_mask); } /* Update QOS_CFG MSR on all the CPUs in cpu_mask */ @@ -2262,10 +2313,10 @@ void rdt_domain_reconfigure_cdp(struct rdt_resource *r) l3_qos_cfg_update(&hw_res->cdp_enabled); } -static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_domain *d) +static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_ctrl_domain *d) { u32 num_closid = resctrl_arch_get_num_closid(r); - int cpu = cpumask_any(&d->cpu_mask); + int cpu = cpumask_any(&d->hdr.cpu_mask); int i; d->mbps_val = kcalloc_node(num_closid, sizeof(*d->mbps_val), @@ -2280,7 +2331,7 @@ static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_domain *d) } static void mba_sc_domain_destroy(struct rdt_resource *r, - struct rdt_domain *d) + struct rdt_ctrl_domain *d) { kfree(d->mbps_val); d->mbps_val = NULL; @@ -2288,14 +2339,18 @@ static void mba_sc_domain_destroy(struct rdt_resource *r, /* * MBA software controller is supported only if - * MBM is supported and MBA is in linear scale. + * MBM is supported and MBA is in linear scale, + * and the MBM monitor scope is the same as MBA + * control scope. 
*/ static bool supports_mba_mbps(void) { + struct rdt_resource *rmbm = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl; - return (is_mbm_local_enabled() && - r->alloc_capable && is_mba_linear()); + return (is_mbm_enabled() && + r->alloc_capable && is_mba_linear() && + r->ctrl_scope == rmbm->mon_scope); } /* @@ -2306,7 +2361,8 @@ static int set_mba_sc(bool mba_sc) { struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl; u32 num_closid = resctrl_arch_get_num_closid(r); - struct rdt_domain *d; + struct rdt_ctrl_domain *d; + unsigned long fflags; int i; if (!supports_mba_mbps() || mba_sc == is_mba_sc(r)) @@ -2314,11 +2370,16 @@ static int set_mba_sc(bool mba_sc) r->membw.mba_sc = mba_sc; - list_for_each_entry(d, &r->domains, list) { + rdtgroup_default.mba_mbps_event = mba_mbps_default_event; + + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { for (i = 0; i < num_closid; i++) d->mbps_val[i] = MBA_MAX_MBPS; } + fflags = mba_sc ? RFTYPE_CTRL_BASE | RFTYPE_MON_BASE : 0; + resctrl_file_fflags_init("mba_MBps_event", fflags); + return 0; } @@ -2417,6 +2478,7 @@ struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn) rdtgroup_kn_get(rdtgrp, kn); + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); /* Was this group deleted while we waited? */ @@ -2434,6 +2496,8 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn) return; mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + rdtgroup_kn_put(rdtgrp, kn); } @@ -2575,7 +2639,7 @@ static int rdt_get_tree(struct fs_context *fc) { struct rdt_fs_context *ctx = rdt_fc2context(fc); unsigned long flags = RFTYPE_CTRL_BASE; - struct rdt_domain *dom; + struct rdt_mon_domain *dom; struct rdt_resource *r; int ret; @@ -2584,7 +2648,7 @@ static int rdt_get_tree(struct fs_context *fc) /* * resctrl file system can only be mounted once. 
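A pattern that recurs across these rdtgroup.c hunks: rdtgroup_kn_lock_live() and the show/write paths above now take cpus_read_lock() before rdtgroup_mutex, and the domain-list walkers assert it with lockdep_assert_cpus_held(), so the lists cannot change under a concurrent CPU hotplug operation. A minimal sketch of that convention, with hypothetical demo_* names:

#include <linux/cpu.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(demo_mutex);

/* Walks hotplug-managed state: callers must already hold cpus_read_lock(). */
static void demo_walk_domains(void)
{
	lockdep_assert_cpus_held();
	/* ... safe to iterate the domain lists here ... */
}

static void demo_show(void)
{
	cpus_read_lock();		/* always before the subsystem mutex */
	mutex_lock(&demo_mutex);

	demo_walk_domains();

	mutex_unlock(&demo_mutex);
	cpus_read_unlock();		/* reverse order on the way out */
}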
*/ - if (static_branch_unlikely(&rdt_enable_key)) { + if (resctrl_mounted) { ret = -EBUSY; goto out; } @@ -2605,7 +2669,7 @@ static int rdt_get_tree(struct fs_context *fc) closid_init(); - if (rdt_mon_capable) + if (resctrl_arch_mon_capable()) flags |= RFTYPE_MON; ret = rdtgroup_add_files(rdtgroup_default.kn, flags); @@ -2618,7 +2682,7 @@ static int rdt_get_tree(struct fs_context *fc) if (ret < 0) goto out_schemata_free; - if (rdt_mon_capable) { + if (resctrl_arch_mon_capable()) { ret = mongroup_create_dir(rdtgroup_default.kn, &rdtgroup_default, "mon_groups", &kn_mongrp); @@ -2640,18 +2704,19 @@ static int rdt_get_tree(struct fs_context *fc) if (ret < 0) goto out_psl; - if (rdt_alloc_capable) - static_branch_enable_cpuslocked(&rdt_alloc_enable_key); - if (rdt_mon_capable) - static_branch_enable_cpuslocked(&rdt_mon_enable_key); + if (resctrl_arch_alloc_capable()) + resctrl_arch_enable_alloc(); + if (resctrl_arch_mon_capable()) + resctrl_arch_enable_mon(); - if (rdt_alloc_capable || rdt_mon_capable) - static_branch_enable_cpuslocked(&rdt_enable_key); + if (resctrl_arch_alloc_capable() || resctrl_arch_mon_capable()) + resctrl_mounted = true; if (is_mbm_enabled()) { r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; - list_for_each_entry(dom, &r->domains, list) - mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL); + list_for_each_entry(dom, &r->mon_domains, hdr.list) + mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL, + RESCTRL_PICK_ANY_CPU); } goto out; @@ -2659,10 +2724,10 @@ static int rdt_get_tree(struct fs_context *fc) out_psl: rdt_pseudo_lock_release(); out_mondata: - if (rdt_mon_capable) + if (resctrl_arch_mon_capable()) kernfs_remove(kn_mondata); out_mongrp: - if (rdt_mon_capable) + if (resctrl_arch_mon_capable()) kernfs_remove(kn_mongrp); out_info: kernfs_remove(kn_info); @@ -2699,6 +2764,7 @@ static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct rdt_fs_context *ctx = rdt_fc2context(fc); struct fs_parse_result result; + const char *msg; int opt; opt = fs_parse(fc, rdt_fs_parameters, param, &result); @@ -2713,8 +2779,9 @@ static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param) ctx->enable_cdpl2 = true; return 0; case Opt_mba_mbps: + msg = "mba_MBps requires MBM and linear scale MBA at L3 scope"; if (!supports_mba_mbps()) - return -EINVAL; + return invalfc(fc, msg); ctx->enable_mba_mbps = true; return 0; case Opt_debug: @@ -2759,14 +2826,13 @@ static int rdt_init_fs_context(struct fs_context *fc) static int reset_all_ctrls(struct rdt_resource *r) { struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); - struct rdt_hw_domain *hw_dom; + struct rdt_hw_ctrl_domain *hw_dom; struct msr_param msr_param; - cpumask_var_t cpu_mask; - struct rdt_domain *d; + struct rdt_ctrl_domain *d; int i; - if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) - return -ENOMEM; + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); msr_param.res = r; msr_param.low = 0; @@ -2774,22 +2840,18 @@ static int reset_all_ctrls(struct rdt_resource *r) /* * Disable resource control for this resource by setting all - * CBMs in all domains to the maximum mask value. Pick one CPU + * CBMs in all ctrl_domains to the maximum mask value. Pick one CPU * from each domain to update the MSRs below. 
*/ - list_for_each_entry(d, &r->domains, list) { - hw_dom = resctrl_to_arch_dom(d); - cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { + hw_dom = resctrl_to_arch_ctrl_dom(d); for (i = 0; i < hw_res->num_closid; i++) hw_dom->ctrl_val[i] = r->default_ctrl; + msr_param.dom = d; + smp_call_function_any(&d->hdr.cpu_mask, rdt_ctrl_update, &msr_param, 1); } - /* Update CBM on all the CPUs in cpu_mask */ - on_each_cpu_mask(cpu_mask, rdt_ctrl_update, &msr_param, 1); - - free_cpumask_var(cpu_mask); - return 0; } @@ -2810,8 +2872,8 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, for_each_process_thread(p, t) { if (!from || is_closid_match(t, from) || is_rmid_match(t, from)) { - WRITE_ONCE(t->closid, to->closid); - WRITE_ONCE(t->rmid, to->mon.rmid); + resctrl_arch_set_closid_rmid(t, to->closid, + to->mon.rmid); /* * Order the closid/rmid stores above before the loads @@ -2842,7 +2904,7 @@ static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp) head = &rdtgrp->mon.crdtgrp_list; list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) { - free_rmid(sentry->mon.rmid); + free_rmid(sentry->closid, sentry->mon.rmid); list_del(&sentry->mon.crdtgrp_list); if (atomic_read(&sentry->waitcount) != 0) @@ -2882,7 +2944,7 @@ static void rmdir_all_sub(void) cpumask_or(&rdtgroup_default.cpu_mask, &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); - free_rmid(rdtgrp->mon.rmid); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); kernfs_remove(rdtgrp->kn); list_del(&rdtgrp->rdtgroup_list); @@ -2917,9 +2979,11 @@ static void rdt_kill_sb(struct super_block *sb) rdtgroup_default.mode = RDT_MODE_SHAREABLE; schemata_list_destroy(); rdtgroup_destroy_root(); - static_branch_disable_cpuslocked(&rdt_alloc_enable_key); - static_branch_disable_cpuslocked(&rdt_mon_enable_key); - static_branch_disable_cpuslocked(&rdt_enable_key); + if (resctrl_arch_alloc_capable()) + resctrl_arch_disable_alloc(); + if (resctrl_arch_mon_capable()) + resctrl_arch_disable_mon(); + resctrl_mounted = false; kernfs_kill_sb(sb); mutex_unlock(&rdtgroup_mutex); cpus_read_unlock(); @@ -2953,62 +3017,126 @@ static int mon_addfile(struct kernfs_node *parent_kn, const char *name, return ret; } +static void mon_rmdir_one_subdir(struct kernfs_node *pkn, char *name, char *subname) +{ + struct kernfs_node *kn; + + kn = kernfs_find_and_get(pkn, name); + if (!kn) + return; + kernfs_put(kn); + + if (kn->dir.subdirs <= 1) + kernfs_remove(kn); + else + kernfs_remove_by_name(kn, subname); +} + /* * Remove all subdirectories of mon_data of ctrl_mon groups - * and monitor groups with given domain id. + * and monitor groups for the given domain. + * Remove files and directories containing "sum" of domain data + * when last domain being summed is removed. */ static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, - unsigned int dom_id) + struct rdt_mon_domain *d) { struct rdtgroup *prgrp, *crgrp; + char subname[32]; + bool snc_mode; char name[32]; + snc_mode = r->mon_scope == RESCTRL_L3_NODE; + sprintf(name, "mon_%s_%02d", r->name, snc_mode ? 
d->ci->id : d->hdr.id); + if (snc_mode) + sprintf(subname, "mon_sub_%s_%02d", r->name, d->hdr.id); + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { - sprintf(name, "mon_%s_%02d", r->name, dom_id); - kernfs_remove_by_name(prgrp->mon.mon_data_kn, name); + mon_rmdir_one_subdir(prgrp->mon.mon_data_kn, name, subname); list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list) - kernfs_remove_by_name(crgrp->mon.mon_data_kn, name); + mon_rmdir_one_subdir(crgrp->mon.mon_data_kn, name, subname); } } -static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, - struct rdt_domain *d, - struct rdt_resource *r, struct rdtgroup *prgrp) +static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d, + struct rdt_resource *r, struct rdtgroup *prgrp, + bool do_sum) { + struct rmid_read rr = {0}; union mon_data_bits priv; - struct kernfs_node *kn; struct mon_evt *mevt; - struct rmid_read rr; - char name[32]; int ret; - sprintf(name, "mon_%s_%02d", r->name, d->id); - /* create the directory */ - kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); - if (IS_ERR(kn)) - return PTR_ERR(kn); - - ret = rdtgroup_kn_set_ugid(kn); - if (ret) - goto out_destroy; - - if (WARN_ON(list_empty(&r->evt_list))) { - ret = -EPERM; - goto out_destroy; - } + if (WARN_ON(list_empty(&r->evt_list))) + return -EPERM; priv.u.rid = r->rid; - priv.u.domid = d->id; + priv.u.domid = do_sum ? d->ci->id : d->hdr.id; + priv.u.sum = do_sum; list_for_each_entry(mevt, &r->evt_list, list) { priv.u.evtid = mevt->evtid; ret = mon_addfile(kn, mevt->name, priv.priv); if (ret) + return ret; + + if (!do_sum && is_mbm_event(mevt->evtid)) + mon_event_read(&rr, r, d, prgrp, &d->hdr.cpu_mask, mevt->evtid, true); + } + + return 0; +} + +static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, + struct rdt_mon_domain *d, + struct rdt_resource *r, struct rdtgroup *prgrp) +{ + struct kernfs_node *kn, *ckn; + char name[32]; + bool snc_mode; + int ret = 0; + + lockdep_assert_held(&rdtgroup_mutex); + + snc_mode = r->mon_scope == RESCTRL_L3_NODE; + sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id); + kn = kernfs_find_and_get(parent_kn, name); + if (kn) { + /* + * rdtgroup_mutex will prevent this directory from being + * removed. No need to keep this hold. + */ + kernfs_put(kn); + } else { + kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); + if (IS_ERR(kn)) + return PTR_ERR(kn); + + ret = rdtgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; + ret = mon_add_all_files(kn, d, r, prgrp, snc_mode); + if (ret) goto out_destroy; + } - if (is_mbm_event(mevt->evtid)) - mon_event_read(&rr, r, d, prgrp, mevt->evtid, true); + if (snc_mode) { + sprintf(name, "mon_sub_%s_%02d", r->name, d->hdr.id); + ckn = kernfs_create_dir(kn, name, parent_kn->mode, prgrp); + if (IS_ERR(ckn)) { + ret = -EINVAL; + goto out_destroy; + } + + ret = rdtgroup_kn_set_ugid(ckn); + if (ret) + goto out_destroy; + + ret = mon_add_all_files(ckn, d, r, prgrp, false); + if (ret) + goto out_destroy; } + kernfs_activate(kn); return 0; @@ -3022,7 +3150,7 @@ out_destroy: * and "monitor" groups with given domain id. 
*/ static void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, - struct rdt_domain *d) + struct rdt_mon_domain *d) { struct kernfs_node *parent_kn; struct rdtgroup *prgrp, *crgrp; @@ -3044,10 +3172,13 @@ static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn, struct rdt_resource *r, struct rdtgroup *prgrp) { - struct rdt_domain *dom; + struct rdt_mon_domain *dom; int ret; - list_for_each_entry(dom, &r->domains, list) { + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + + list_for_each_entry(dom, &r->mon_domains, hdr.list) { ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp); if (ret) return ret; @@ -3146,7 +3277,7 @@ static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r) * Set the RDT domain up to start off with all usable allocations. That is, * all shareable and unused bits. All-zero CBM is invalid. */ -static int __init_one_rdt_domain(struct rdt_domain *d, struct resctrl_schema *s, +static int __init_one_rdt_domain(struct rdt_ctrl_domain *d, struct resctrl_schema *s, u32 closid) { enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type); @@ -3206,7 +3337,7 @@ static int __init_one_rdt_domain(struct rdt_domain *d, struct resctrl_schema *s, */ tmp_cbm = cfg->new_ctrl; if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) { - rdt_last_cmd_printf("No space on %s:%d\n", s->name, d->id); + rdt_last_cmd_printf("No space on %s:%d\n", s->name, d->hdr.id); return -ENOSPC; } cfg->have_new_ctrl = true; @@ -3226,10 +3357,10 @@ static int __init_one_rdt_domain(struct rdt_domain *d, struct resctrl_schema *s, */ static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid) { - struct rdt_domain *d; + struct rdt_ctrl_domain *d; int ret; - list_for_each_entry(d, &s->res->domains, list) { + list_for_each_entry(d, &s->res->ctrl_domains, hdr.list) { ret = __init_one_rdt_domain(d, s, closid); if (ret < 0) return ret; @@ -3242,9 +3373,9 @@ static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid) static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid) { struct resctrl_staged_config *cfg; - struct rdt_domain *d; + struct rdt_ctrl_domain *d; - list_for_each_entry(d, &r->domains, list) { + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { if (is_mba_sc(r)) { d->mbps_val[closid] = MBA_MAX_MBPS; continue; @@ -3293,6 +3424,36 @@ out: return ret; } +static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp) +{ + int ret; + + if (!resctrl_arch_mon_capable()) + return 0; + + ret = alloc_rmid(rdtgrp->closid); + if (ret < 0) { + rdt_last_cmd_puts("Out of RMIDs\n"); + return ret; + } + rdtgrp->mon.rmid = ret; + + ret = mkdir_mondata_all(rdtgrp->kn, rdtgrp, &rdtgrp->mon.mon_data_kn); + if (ret) { + rdt_last_cmd_puts("kernfs subdir error\n"); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); + return ret; + } + + return 0; +} + +static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp) +{ + if (resctrl_arch_mon_capable()) + free_rmid(rgrp->closid, rgrp->mon.rmid); +} + static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, const char *name, umode_t mode, enum rdt_group_type rtype, struct rdtgroup **r) @@ -3353,7 +3514,7 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, if (rtype == RDTCTRL_GROUP) { files = RFTYPE_BASE | RFTYPE_CTRL; - if (rdt_mon_capable) + if (resctrl_arch_mon_capable()) files |= RFTYPE_MON; } else { files = RFTYPE_BASE | RFTYPE_MON; @@ -3365,29 +3526,11 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, goto out_destroy; } - if 
(rdt_mon_capable) { - ret = alloc_rmid(); - if (ret < 0) { - rdt_last_cmd_puts("Out of RMIDs\n"); - goto out_destroy; - } - rdtgrp->mon.rmid = ret; - - ret = mkdir_mondata_all(kn, rdtgrp, &rdtgrp->mon.mon_data_kn); - if (ret) { - rdt_last_cmd_puts("kernfs subdir error\n"); - goto out_idfree; - } - } - kernfs_activate(kn); - /* * The caller unlocks the parent_kn upon success. */ return 0; -out_idfree: - free_rmid(rdtgrp->mon.rmid); out_destroy: kernfs_put(rdtgrp->kn); kernfs_remove(rdtgrp->kn); @@ -3401,7 +3544,6 @@ out_unlock: static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp) { kernfs_remove(rgrp->kn); - free_rmid(rgrp->mon.rmid); rdtgroup_remove(rgrp); } @@ -3423,12 +3565,21 @@ static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn, prgrp = rdtgrp->mon.parent; rdtgrp->closid = prgrp->closid; + ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp); + if (ret) { + mkdir_rdt_prepare_clean(rdtgrp); + goto out_unlock; + } + + kernfs_activate(rdtgrp->kn); + /* * Add the rdtgrp to the list of rdtgrps the parent * ctrl_mon group has to track. */ list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list); +out_unlock: rdtgroup_kn_unlock(parent_kn); return ret; } @@ -3459,13 +3610,20 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, ret = 0; rdtgrp->closid = closid; + + ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp); + if (ret) + goto out_closid_free; + + kernfs_activate(rdtgrp->kn); + ret = rdtgroup_init_alloc(rdtgrp); if (ret < 0) - goto out_id_free; + goto out_rmid_free; list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); - if (rdt_mon_capable) { + if (resctrl_arch_mon_capable()) { /* * Create an empty mon_groups directory to hold the subset * of tasks and cpus to monitor. @@ -3475,13 +3633,17 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, rdt_last_cmd_puts("kernfs subdir error\n"); goto out_del_list; } + if (is_mba_sc(NULL)) + rdtgrp->mba_mbps_event = mba_mbps_default_event; } goto out_unlock; out_del_list: list_del(&rdtgrp->rdtgroup_list); -out_id_free: +out_rmid_free: + mkdir_rdt_prepare_rmid_free(rdtgrp); +out_closid_free: closid_free(closid); out_common_fail: mkdir_rdt_prepare_clean(rdtgrp); @@ -3518,14 +3680,14 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, * allocation is supported, add a control and monitoring * subdirectory */ - if (rdt_alloc_capable && parent_kn == rdtgroup_default.kn) + if (resctrl_arch_alloc_capable() && parent_kn == rdtgroup_default.kn) return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode); /* * If RDT monitoring is supported and the parent directory is a valid * "mon_groups" directory, add a monitoring subdirectory. 
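rdtgroup_mkdir_ctrl_mon() above now allocates the CLOSID, then the RMID (via mkdir_rdt_prepare_rmid_alloc()), then the allocation state, and unwinds in strict reverse order on failure. A generic sketch of that error ladder; the demo_* helpers are hypothetical stand-ins, not the patch's functions:

static int demo_closid_alloc(void);
static int demo_rmid_alloc(int closid);
static int demo_init_alloc(void);
static void demo_rmid_free(int closid, int rmid);
static void demo_closid_free(int closid);

static int demo_mkdir(void)
{
	int closid, rmid, ret;

	closid = demo_closid_alloc();
	if (closid < 0)
		return closid;

	rmid = demo_rmid_alloc(closid);
	if (rmid < 0) {
		ret = rmid;
		goto out_closid_free;
	}

	ret = demo_init_alloc();
	if (ret)
		goto out_rmid_free;

	return 0;

out_rmid_free:
	demo_rmid_free(closid, rmid);
out_closid_free:
	demo_closid_free(closid);
	return ret;
}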
*/ - if (rdt_mon_capable && is_mon_groups(parent_kn, name)) + if (resctrl_arch_mon_capable() && is_mon_groups(parent_kn, name)) return rdtgroup_mkdir_mon(parent_kn, name, mode); return -EPERM; @@ -3550,7 +3712,7 @@ static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) update_closid_rmid(tmpmask, NULL); rdtgrp->flags = RDT_DELETED; - free_rmid(rdtgrp->mon.rmid); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); /* * Remove the rdtgrp from the parent ctrl_mon group's list @@ -3596,8 +3758,8 @@ static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); update_closid_rmid(tmpmask, NULL); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); closid_free(rdtgrp->closid); - free_rmid(rdtgrp->mon.rmid); rdtgroup_ctrl_remove(rdtgrp); @@ -3829,8 +3991,8 @@ static void __init rdtgroup_setup_default(void) { mutex_lock(&rdtgroup_mutex); - rdtgroup_default.closid = 0; - rdtgroup_default.mon.rmid = 0; + rdtgroup_default.closid = RESCTRL_RESERVED_CLOSID; + rdtgroup_default.mon.rmid = RESCTRL_RESERVED_RMID; rdtgroup_default.type = RDTCTRL_GROUP; INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list); @@ -3839,33 +4001,37 @@ static void __init rdtgroup_setup_default(void) mutex_unlock(&rdtgroup_mutex); } -static void domain_destroy_mon_state(struct rdt_domain *d) +static void domain_destroy_mon_state(struct rdt_mon_domain *d) { bitmap_free(d->rmid_busy_llc); kfree(d->mbm_total); kfree(d->mbm_local); } -void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d) +void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d) { - lockdep_assert_held(&rdtgroup_mutex); + mutex_lock(&rdtgroup_mutex); if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) mba_sc_domain_destroy(r, d); - if (!r->mon_capable) - return; + mutex_unlock(&rdtgroup_mutex); +} + +void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) +{ + mutex_lock(&rdtgroup_mutex); /* * If resctrl is mounted, remove all the * per domain monitor data directories. */ - if (static_branch_unlikely(&rdt_mon_enable_key)) - rmdir_mondata_subdir_allrdtgrp(r, d->id); + if (resctrl_mounted && resctrl_arch_mon_capable()) + rmdir_mondata_subdir_allrdtgrp(r, d); if (is_mbm_enabled()) cancel_delayed_work(&d->mbm_over); - if (is_llc_occupancy_enabled() && has_busy_rmid(r, d)) { + if (is_llc_occupancy_enabled() && has_busy_rmid(d)) { /* * When a package is going down, forcefully * decrement rmid->ebusy. 
There is no way to know @@ -3879,20 +4045,23 @@ void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d) } domain_destroy_mon_state(d); + + mutex_unlock(&rdtgroup_mutex); } -static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) +static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain *d) { + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); size_t tsize; if (is_llc_occupancy_enabled()) { - d->rmid_busy_llc = bitmap_zalloc(r->num_rmid, GFP_KERNEL); + d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL); if (!d->rmid_busy_llc) return -ENOMEM; } if (is_mbm_total_enabled()) { tsize = sizeof(*d->mbm_total); - d->mbm_total = kcalloc(r->num_rmid, tsize, GFP_KERNEL); + d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL); if (!d->mbm_total) { bitmap_free(d->rmid_busy_llc); return -ENOMEM; @@ -3900,7 +4069,7 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) } if (is_mbm_local_enabled()) { tsize = sizeof(*d->mbm_local); - d->mbm_local = kcalloc(r->num_rmid, tsize, GFP_KERNEL); + d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL); if (!d->mbm_local) { bitmap_free(d->rmid_busy_llc); kfree(d->mbm_total); @@ -3911,36 +4080,106 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) return 0; } -int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d) +int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d) { - int err; + int err = 0; - lockdep_assert_held(&rdtgroup_mutex); + mutex_lock(&rdtgroup_mutex); - if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) + if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) { /* RDT_RESOURCE_MBA is never mon_capable */ - return mba_sc_domain_allocate(r, d); + err = mba_sc_domain_allocate(r, d); + } - if (!r->mon_capable) - return 0; + mutex_unlock(&rdtgroup_mutex); + + return err; +} + +int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) +{ + int err; + + mutex_lock(&rdtgroup_mutex); err = domain_setup_mon_state(r, d); if (err) - return err; + goto out_unlock; if (is_mbm_enabled()) { INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow); - mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL); + mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL, + RESCTRL_PICK_ANY_CPU); } if (is_llc_occupancy_enabled()) INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); - /* If resctrl is mounted, add per domain monitor data directories. */ - if (static_branch_unlikely(&rdt_mon_enable_key)) + /* + * If the filesystem is not mounted then only the default resource group + * exists. Creation of its directories is deferred until mount time + * by rdt_get_tree() calling mkdir_mondata_all(). + * If resctrl is mounted, add per domain monitor data directories. + */ + if (resctrl_mounted && resctrl_arch_mon_capable()) mkdir_mondata_subdir_allrdtgrp(r, d); - return 0; +out_unlock: + mutex_unlock(&rdtgroup_mutex); + + return err; +} + +void resctrl_online_cpu(unsigned int cpu) +{ + mutex_lock(&rdtgroup_mutex); + /* The CPU is set in default rdtgroup after online. 
*/ + cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); + mutex_unlock(&rdtgroup_mutex); +} + +static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) +{ + struct rdtgroup *cr; + + list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) { + if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) + break; + } +} + +void resctrl_offline_cpu(unsigned int cpu) +{ + struct rdt_resource *l3 = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + struct rdt_mon_domain *d; + struct rdtgroup *rdtgrp; + + mutex_lock(&rdtgroup_mutex); + list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { + if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) { + clear_childcpus(rdtgrp, cpu); + break; + } + } + + if (!l3->mon_capable) + goto out_unlock; + + d = get_mon_domain_from_cpu(cpu, l3); + if (d) { + if (is_mbm_enabled() && cpu == d->mbm_work_cpu) { + cancel_delayed_work(&d->mbm_over); + mbm_setup_overflow_handler(d, 0, cpu); + } + if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu && + has_busy_rmid(d)) { + cancel_delayed_work(&d->cqm_limbo); + cqm_setup_limbo_handler(d, 0, cpu); + } + } + +out_unlock: + mutex_unlock(&rdtgroup_mutex); } /* diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock_event.h b/arch/x86/kernel/cpu/resctrl/trace.h index 428ebbd4270b..2a506316b303 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock_event.h +++ b/arch/x86/kernel/cpu/resctrl/trace.h @@ -2,8 +2,8 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM resctrl -#if !defined(_TRACE_PSEUDO_LOCK_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_PSEUDO_LOCK_H +#if !defined(_TRACE_RESCTRL_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_RESCTRL_H #include <linux/tracepoint.h> @@ -35,9 +35,25 @@ TRACE_EVENT(pseudo_lock_l3, TP_printk("hits=%llu miss=%llu", __entry->l3_hits, __entry->l3_miss)); -#endif /* _TRACE_PSEUDO_LOCK_H */ +TRACE_EVENT(mon_llc_occupancy_limbo, + TP_PROTO(u32 ctrl_hw_id, u32 mon_hw_id, int domain_id, u64 llc_occupancy_bytes), + TP_ARGS(ctrl_hw_id, mon_hw_id, domain_id, llc_occupancy_bytes), + TP_STRUCT__entry(__field(u32, ctrl_hw_id) + __field(u32, mon_hw_id) + __field(int, domain_id) + __field(u64, llc_occupancy_bytes)), + TP_fast_assign(__entry->ctrl_hw_id = ctrl_hw_id; + __entry->mon_hw_id = mon_hw_id; + __entry->domain_id = domain_id; + __entry->llc_occupancy_bytes = llc_occupancy_bytes;), + TP_printk("ctrl_hw_id=%u mon_hw_id=%u domain_id=%d llc_occupancy_bytes=%llu", + __entry->ctrl_hw_id, __entry->mon_hw_id, __entry->domain_id, + __entry->llc_occupancy_bytes) + ); + +#endif /* _TRACE_RESCTRL_H */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . -#define TRACE_INCLUDE_FILE pseudo_lock_event +#define TRACE_INCLUDE_FILE trace #include <trace/define_trace.h> diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 0dad49a09b7a..16f3ca30626a 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -24,31 +24,36 @@ struct cpuid_bit { * levels are different and there is a separate entry for each. 
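For context on the cpuid_bits[] table that follows: each entry names the CPUID leaf, sub-leaf, output register and bit position carrying one scattered feature. A simplified sketch of how such an entry is tested (the real loop lives in init_scattered_cpuid_features(); the helper name here is illustrative):

#include <linux/bits.h>
#include <linux/types.h>
#include <asm/processor.h>

static bool demo_has_scattered_bit(u32 leaf, u32 sub_leaf,
				   enum cpuid_regs_idx reg, u8 bit)
{
	u32 regs[4];

	cpuid_count(leaf, sub_leaf, &regs[CPUID_EAX], &regs[CPUID_EBX],
		    &regs[CPUID_ECX], &regs[CPUID_EDX]);

	return regs[reg] & BIT(bit);
}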
*/ static const struct cpuid_bit cpuid_bits[] = { - { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, - { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, - { X86_FEATURE_INTEL_PPIN, CPUID_EBX, 0, 0x00000007, 1 }, - { X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 }, - { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 }, - { X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 }, - { X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 }, - { X86_FEATURE_CQM_MBM_LOCAL, CPUID_EDX, 2, 0x0000000f, 1 }, - { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 }, - { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 }, - { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 }, - { X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 }, - { X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 }, - { X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 }, - { X86_FEATURE_SGX1, CPUID_EAX, 0, 0x00000012, 0 }, - { X86_FEATURE_SGX2, CPUID_EAX, 1, 0x00000012, 0 }, - { X86_FEATURE_SGX_EDECCSSA, CPUID_EAX, 11, 0x00000012, 0 }, - { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, - { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, - { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, - { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, - { X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 }, - { X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 }, - { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 }, - { X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 }, + { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, + { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, + { X86_FEATURE_INTEL_PPIN, CPUID_EBX, 0, 0x00000007, 1 }, + { X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 }, + { X86_FEATURE_BHI_CTRL, CPUID_EDX, 4, 0x00000007, 2 }, + { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 }, + { X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 }, + { X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 }, + { X86_FEATURE_CQM_MBM_LOCAL, CPUID_EDX, 2, 0x0000000f, 1 }, + { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 }, + { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 }, + { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 }, + { X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 }, + { X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 }, + { X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 }, + { X86_FEATURE_SGX1, CPUID_EAX, 0, 0x00000012, 0 }, + { X86_FEATURE_SGX2, CPUID_EAX, 1, 0x00000012, 0 }, + { X86_FEATURE_SGX_EDECCSSA, CPUID_EAX, 11, 0x00000012, 0 }, + { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, + { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, + { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, + { X86_FEATURE_AMD_FAST_CPPC, CPUID_EDX, 15, 0x80000007, 0 }, + { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, + { X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 }, + { X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 }, + { X86_FEATURE_AMD_WORKLOAD_CLASS, CPUID_EAX, 22, 0x80000021, 0 }, + { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 }, + { X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 }, + { X86_FEATURE_AMD_LBR_PMC_FREEZE, CPUID_EAX, 2, 0x80000022, 0 }, + { X86_FEATURE_AMD_HETEROGENEOUS_CORES, CPUID_EAX, 30, 0x80000026, 0 }, { 0, 0, 0, 0, 0 } }; diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c index 262f5fb18d74..7f8d1e11dbee 100644 --- a/arch/x86/kernel/cpu/sgx/driver.c +++ b/arch/x86/kernel/cpu/sgx/driver.c @@ -113,7 +113,7 @@ static unsigned long sgx_get_unmapped_area(struct file *file, if (flags & MAP_FIXED) 
return addr; - return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); + return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags); } #ifdef CONFIG_COMPAT @@ -150,13 +150,15 @@ int __init sgx_drv_init(void) u64 xfrm_mask; int ret; - if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) + if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) { + pr_info("SGX disabled: SGX launch control CPU feature is not available, /dev/sgx_enclave disabled.\n"); return -ENODEV; + } cpuid_count(SGX_CPUID, 0, &eax, &ebx, &ecx, &edx); if (!(eax & 1)) { - pr_err("SGX disabled: SGX1 instruction support not available.\n"); + pr_info("SGX disabled: SGX1 instruction support not available, /dev/sgx_enclave disabled.\n"); return -ENODEV; } @@ -173,8 +175,10 @@ int __init sgx_drv_init(void) } ret = misc_register(&sgx_dev_enclave); - if (ret) + if (ret) { + pr_info("SGX disabled: Unable to register the /dev/sgx_enclave driver (%d).\n", ret); return ret; + } return 0; } diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index 5d390df21440..776a20172867 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -64,6 +64,13 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs) struct file *backing; long ret; + /* + * ECREATE would detect this too, but checking here also ensures + * that the 'encl_size' calculations below can never overflow. + */ + if (!is_power_of_2(secs->size)) + return -EINVAL; + va_page = sgx_encl_grow(encl, true); if (IS_ERR(va_page)) return PTR_ERR(va_page); @@ -581,7 +588,7 @@ err_out: * * Flush any outstanding enqueued EADD operations and perform EINIT. The * Launch Enclave Public Key Hash MSRs are rewritten as necessary to match - * the enclave's MRSIGNER, which is caculated from the provided sigstruct. + * the enclave's MRSIGNER, which is calculated from the provided sigstruct. * * Return: * - 0: Success. diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 166692f2d501..8ce352fc72ac 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -13,6 +13,7 @@ #include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/sysfs.h> +#include <linux/vmalloc.h> #include <asm/sgx.h> #include "driver.h" #include "encl.h" @@ -474,24 +475,25 @@ struct sgx_epc_page *__sgx_alloc_epc_page(void) { struct sgx_epc_page *page; int nid_of_current = numa_node_id(); - int nid = nid_of_current; + int nid_start, nid; - if (node_isset(nid_of_current, sgx_numa_mask)) { - page = __sgx_alloc_epc_page_from_node(nid_of_current); - if (page) - return page; - } - - /* Fall back to the non-local NUMA nodes: */ - while (true) { - nid = next_node_in(nid, sgx_numa_mask); - if (nid == nid_of_current) - break; + /* + * Try local node first. If it doesn't have an EPC section, + * fall back to the non-local NUMA nodes. 
+ */ + if (node_isset(nid_of_current, sgx_numa_mask)) + nid_start = nid_of_current; + else + nid_start = next_node_in(nid_of_current, sgx_numa_mask); + nid = nid_start; + do { page = __sgx_alloc_epc_page_from_node(nid); if (page) return page; - } + + nid = next_node_in(nid, sgx_numa_mask); + } while (nid != nid_start); return ERR_PTR(-ENOMEM); } @@ -628,7 +630,7 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size, if (!section->virt_addr) return false; - section->pages = vmalloc(nr_pages * sizeof(struct sgx_epc_page)); + section->pages = vmalloc_array(nr_pages, sizeof(struct sgx_epc_page)); if (!section->pages) { memunmap(section->virt_addr); return false; @@ -731,7 +733,7 @@ out: return 0; } -/** +/* * A section metric is concatenated in a way that @low bits 12-31 define the * bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the * metric. @@ -846,6 +848,13 @@ static bool __init sgx_page_cache_init(void) return false; } + for_each_online_node(nid) { + if (!node_isset(nid, sgx_numa_mask) && + node_state(nid, N_MEMORY) && node_state(nid, N_CPU)) + pr_info("node%d has both CPUs and memory but doesn't have an EPC section\n", + nid); + } + return true; } @@ -892,19 +901,15 @@ static struct miscdevice sgx_dev_provision = { int sgx_set_attribute(unsigned long *allowed_attributes, unsigned int attribute_fd) { - struct fd f = fdget(attribute_fd); + CLASS(fd, f)(attribute_fd); - if (!f.file) + if (fd_empty(f)) return -EINVAL; - if (f.file->f_op != &sgx_provision_fops) { - fdput(f); + if (fd_file(f)->f_op != &sgx_provision_fops) return -EINVAL; - } *allowed_attributes |= SGX_ATTR_PROVISIONKEY; - - fdput(f); return 0; } EXPORT_SYMBOL_GPL(sgx_set_attribute); diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index dc136703566f..01456236a6dd 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -1,167 +1,571 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: GPL-2.0-only /* - * Check for extended topology enumeration cpuid leaf 0xb and if it - * exists, use it for populating initial_apicid and cpu topology - * detection. + * CPU/APIC topology + * + * The APIC IDs describe the system topology in multiple domain levels. + * The CPUID topology parser provides the information which part of the + * APIC ID is associated to the individual levels: + * + * [PACKAGE][DIEGRP][DIE][TILE][MODULE][CORE][THREAD] + * + * The root space contains the package (socket) IDs. + * + * Not enumerated levels consume 0 bits space, but conceptually they are + * always represented. If e.g. only CORE and THREAD levels are enumerated + * then the DIE, MODULE and TILE have the same physical ID as the PACKAGE. + * + * If SMT is not supported, then the THREAD domain is still used. It then + * has the same physical ID as the CORE domain and is the only child of + * the core domain. + * + * This allows a unified view on the system independent of the enumerated + * domain levels without requiring any conditionals in the code. 
*/ - +#define pr_fmt(fmt) "CPU topo: " fmt #include <linux/cpu.h> + +#include <xen/xen.h> + #include <asm/apic.h> -#include <asm/memtype.h> -#include <asm/processor.h> +#include <asm/hypervisor.h> +#include <asm/io_apic.h> +#include <asm/mpspec.h> +#include <asm/smp.h> #include "cpu.h" -/* leaf 0xb SMT level */ -#define SMT_LEVEL 0 +/* + * Map cpu index to physical APIC ID + */ +DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_apicid, BAD_APICID); +DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid, CPU_ACPIID_INVALID); +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_acpiid); -/* extended topology sub-leaf types */ -#define INVALID_TYPE 0 -#define SMT_TYPE 1 -#define CORE_TYPE 2 -#define DIE_TYPE 5 +/* Bitmap of physically present CPUs. */ +DECLARE_BITMAP(phys_cpu_present_map, MAX_LOCAL_APIC) __read_mostly; -#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff) -#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f) -#define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff) +/* Used for CPU number allocation and parallel CPU bringup */ +u32 cpuid_to_apicid[] __ro_after_init = { [0 ... NR_CPUS - 1] = BAD_APICID, }; -unsigned int __max_die_per_package __read_mostly = 1; -EXPORT_SYMBOL(__max_die_per_package); +/* Bitmaps to mark registered APICs at each topology domain */ +static struct { DECLARE_BITMAP(map, MAX_LOCAL_APIC); } apic_maps[TOPO_MAX_DOMAIN] __ro_after_init; -#ifdef CONFIG_SMP /* - * Check if given CPUID extended topology "leaf" is implemented + * Keep track of assigned, disabled and rejected CPUs. Present assigned + * with 1 as CPU #0 is reserved for the boot CPU. */ -static int check_extended_topology_leaf(int leaf) -{ - unsigned int eax, ebx, ecx, edx; +static struct { + unsigned int nr_assigned_cpus; + unsigned int nr_disabled_cpus; + unsigned int nr_rejected_cpus; + u32 boot_cpu_apic_id; + u32 real_bsp_apic_id; +} topo_info __ro_after_init = { + .nr_assigned_cpus = 1, + .boot_cpu_apic_id = BAD_APICID, + .real_bsp_apic_id = BAD_APICID, +}; - cpuid_count(leaf, SMT_LEVEL, &eax, &ebx, &ecx, &edx); +#define domain_weight(_dom) bitmap_weight(apic_maps[_dom].map, MAX_LOCAL_APIC) - if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE)) - return -1; +bool arch_match_cpu_phys_id(int cpu, u64 phys_id) +{ + return phys_id == (u64)cpuid_to_apicid[cpu]; +} - return 0; +#ifdef CONFIG_SMP +static void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid) +{ + if (!(apicid & (__max_threads_per_core - 1))) + cpumask_set_cpu(cpu, &__cpu_primary_thread_mask); } +#else +static inline void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid) { } +#endif + /* - * Return best CPUID Extended Topology Leaf supported + * Convert the APIC ID to a domain level ID by masking out the low bits + * below the domain level @dom. 
*/ -static int detect_extended_topology_leaf(struct cpuinfo_x86 *c) +static inline u32 topo_apicid(u32 apicid, enum x86_topology_domains dom) { - if (c->cpuid_level >= 0x1f) { - if (check_extended_topology_leaf(0x1f) == 0) - return 0x1f; - } + if (dom == TOPO_SMT_DOMAIN) + return apicid; + return apicid & (UINT_MAX << x86_topo_system.dom_shifts[dom - 1]); +} - if (c->cpuid_level >= 0xb) { - if (check_extended_topology_leaf(0xb) == 0) - return 0xb; - } +static int topo_lookup_cpuid(u32 apic_id) +{ + int i; - return -1; + /* CPU# to APICID mapping is persistent once it is established */ + for (i = 0; i < topo_info.nr_assigned_cpus; i++) { + if (cpuid_to_apicid[i] == apic_id) + return i; + } + return -ENODEV; } -#endif -int detect_extended_topology_early(struct cpuinfo_x86 *c) +static __init int topo_get_cpunr(u32 apic_id) { -#ifdef CONFIG_SMP - unsigned int eax, ebx, ecx, edx; - int leaf; + int cpu = topo_lookup_cpuid(apic_id); - leaf = detect_extended_topology_leaf(c); - if (leaf < 0) - return -1; + if (cpu >= 0) + return cpu; - set_cpu_cap(c, X86_FEATURE_XTOPOLOGY); + return topo_info.nr_assigned_cpus++; +} - cpuid_count(leaf, SMT_LEVEL, &eax, &ebx, &ecx, &edx); - /* - * initial apic id, which also represents 32-bit extended x2apic id. - */ - c->topo.initial_apicid = edx; - smp_num_siblings = max_t(int, smp_num_siblings, LEVEL_MAX_SIBLINGS(ebx)); +static void topo_set_cpuids(unsigned int cpu, u32 apic_id, u32 acpi_id) +{ +#if defined(CONFIG_SMP) || defined(CONFIG_X86_64) + early_per_cpu(x86_cpu_to_apicid, cpu) = apic_id; + early_per_cpu(x86_cpu_to_acpiid, cpu) = acpi_id; #endif - return 0; + set_cpu_present(cpu, true); } -/* - * Check for extended topology enumeration cpuid leaf, and if it - * exists, use it for populating initial_apicid and cpu topology - * detection. - */ -int detect_extended_topology(struct cpuinfo_x86 *c) +static __init bool check_for_real_bsp(u32 apic_id) { -#ifdef CONFIG_SMP - unsigned int eax, ebx, ecx, edx, sub_index; - unsigned int ht_mask_width, core_plus_mask_width, die_plus_mask_width; - unsigned int core_select_mask, core_level_siblings; - unsigned int die_select_mask, die_level_siblings; - unsigned int pkg_mask_width; - bool die_level_present = false; - int leaf; - - leaf = detect_extended_topology_leaf(c); - if (leaf < 0) - return -1; + bool is_bsp = false, has_apic_base = boot_cpu_data.x86 >= 6; + u64 msr; /* - * Populate HT related information from sub-leaf level 0. + * There is no real good way to detect whether this a kdump() + * kernel, but except on the Voyager SMP monstrosity which is not + * longer supported, the real BSP APIC ID is the first one which is + * enumerated by firmware. That allows to detect whether the boot + * CPU is the real BSP. If it is not, then do not register the APIC + * because sending INIT to the real BSP would reset the whole + * system. + * + * The first APIC ID which is enumerated by firmware is detectable + * because the boot CPU APIC ID is registered before that without + * invoking this code. 
*/ - cpuid_count(leaf, SMT_LEVEL, &eax, &ebx, &ecx, &edx); - c->topo.initial_apicid = edx; - core_level_siblings = LEVEL_MAX_SIBLINGS(ebx); - smp_num_siblings = max_t(int, smp_num_siblings, LEVEL_MAX_SIBLINGS(ebx)); - core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); - die_level_siblings = LEVEL_MAX_SIBLINGS(ebx); - pkg_mask_width = die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); - - sub_index = 1; - while (true) { - cpuid_count(leaf, sub_index, &eax, &ebx, &ecx, &edx); + if (topo_info.real_bsp_apic_id != BAD_APICID) + return false; + /* + * Check whether the enumeration order is broken by evaluating the + * BSP bit in the APICBASE MSR. If the CPU does not have the + * APICBASE MSR then the BSP detection is not possible and the + * kernel must rely on the firmware enumeration order. + */ + if (has_apic_base) { + rdmsrl(MSR_IA32_APICBASE, msr); + is_bsp = !!(msr & MSR_IA32_APICBASE_BSP); + } + + if (apic_id == topo_info.boot_cpu_apic_id) { /* - * Check for the Core type in the implemented sub leaves. + * If the boot CPU has the APIC BSP bit set then the + * firmware enumeration is agreeing. If the CPU does not + * have the APICBASE MSR then the only choice is to trust + * the enumeration order. */ - if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) { - core_level_siblings = LEVEL_MAX_SIBLINGS(ebx); - core_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); - die_level_siblings = core_level_siblings; - die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); - } - if (LEAFB_SUBTYPE(ecx) == DIE_TYPE) { - die_level_present = true; - die_level_siblings = LEVEL_MAX_SIBLINGS(ebx); - die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); + if (is_bsp || !has_apic_base) { + topo_info.real_bsp_apic_id = apic_id; + return false; } + /* + * If the boot APIC is enumerated first, but the APICBASE + * MSR does not have the BSP bit set, then there is no way + * to discover the real BSP here. Assume a crash kernel and + * limit the number of CPUs to 1 as an INIT to the real BSP + * would reset the machine. + */ + pr_warn("Enumerated BSP APIC %x is not marked in APICBASE MSR\n", apic_id); + pr_warn("Assuming crash kernel. Limiting to one CPU to prevent machine INIT\n"); + set_nr_cpu_ids(1); + goto fwbug; + } - if (LEAFB_SUBTYPE(ecx) != INVALID_TYPE) - pkg_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); - else - break; + pr_warn("Boot CPU APIC ID not the first enumerated APIC ID: %x != %x\n", + topo_info.boot_cpu_apic_id, apic_id); - sub_index++; + if (is_bsp) { + /* + * The boot CPU has the APIC BSP bit set. Use it and complain + * about the broken firmware enumeration. + */ + topo_info.real_bsp_apic_id = topo_info.boot_cpu_apic_id; + goto fwbug; } - core_select_mask = (~(-1 << pkg_mask_width)) >> ht_mask_width; - die_select_mask = (~(-1 << die_plus_mask_width)) >> - core_plus_mask_width; + pr_warn("Crash kernel detected. 
Disabling real BSP to prevent machine INIT\n"); + + topo_info.real_bsp_apic_id = apic_id; + return true; + +fwbug: + pr_warn(FW_BUG "APIC enumeration order not specification compliant\n"); + return false; +} + +static unsigned int topo_unit_count(u32 lvlid, enum x86_topology_domains at_level, + unsigned long *map) +{ + unsigned int id, end, cnt = 0; + + /* Calculate the exclusive end */ + end = lvlid + (1U << x86_topo_system.dom_shifts[at_level]); + + /* Unfortunately there is no bitmap_weight_range() */ + for (id = find_next_bit(map, end, lvlid); id < end; id = find_next_bit(map, end, ++id)) + cnt++; + return cnt; +} + +static __init void topo_register_apic(u32 apic_id, u32 acpi_id, bool present) +{ + int cpu, dom; + + if (present) { + set_bit(apic_id, phys_cpu_present_map); + + /* + * Double registration is valid in case of the boot CPU + * APIC because that is registered before the enumeration + * of the APICs via firmware parsers or VM guest + * mechanisms. + */ + if (apic_id == topo_info.boot_cpu_apic_id) + cpu = 0; + else + cpu = topo_get_cpunr(apic_id); + + cpuid_to_apicid[cpu] = apic_id; + topo_set_cpuids(cpu, apic_id, acpi_id); + } else { + u32 pkgid = topo_apicid(apic_id, TOPO_PKG_DOMAIN); - c->topo.core_id = apic->phys_pkg_id(c->topo.initial_apicid, - ht_mask_width) & core_select_mask; + /* + * Check for present APICs in the same package when running + * on bare metal. Allow the bogosity in a guest. + */ + if (hypervisor_is_type(X86_HYPER_NATIVE) && + topo_unit_count(pkgid, TOPO_PKG_DOMAIN, phys_cpu_present_map)) { + pr_info_once("Ignoring hot-pluggable APIC ID %x in present package.\n", + apic_id); + topo_info.nr_rejected_cpus++; + return; + } - if (die_level_present) { - c->topo.die_id = apic->phys_pkg_id(c->topo.initial_apicid, - core_plus_mask_width) & die_select_mask; + topo_info.nr_disabled_cpus++; } - c->topo.pkg_id = apic->phys_pkg_id(c->topo.initial_apicid, pkg_mask_width); /* - * Reinit the apicid, now that we have extended initial_apicid. + * Register present and possible CPUs in the domain + * maps. cpu_possible_map will be updated in + * topology_init_possible_cpus() after enumeration is done. */ - c->topo.apicid = apic->phys_pkg_id(c->topo.initial_apicid, 0); + for (dom = TOPO_SMT_DOMAIN; dom < TOPO_MAX_DOMAIN; dom++) + set_bit(topo_apicid(apic_id, dom), apic_maps[dom].map); +} + +/** + * topology_register_apic - Register an APIC in early topology maps + * @apic_id: The APIC ID to set up + * @acpi_id: The ACPI ID associated to the APIC + * @present: True if the corresponding CPU is present + */ +void __init topology_register_apic(u32 apic_id, u32 acpi_id, bool present) +{ + if (apic_id >= MAX_LOCAL_APIC) { + pr_err_once("APIC ID %x exceeds kernel limit of: %x\n", apic_id, MAX_LOCAL_APIC - 1); + topo_info.nr_rejected_cpus++; + return; + } + + if (check_for_real_bsp(apic_id)) { + topo_info.nr_rejected_cpus++; + return; + } + + /* CPU numbers exhausted? */ + if (apic_id != topo_info.boot_cpu_apic_id && topo_info.nr_assigned_cpus >= nr_cpu_ids) { + pr_warn_once("CPU limit of %d reached. 
Ignoring further CPUs\n", nr_cpu_ids); + topo_info.nr_rejected_cpus++; + return; + } + + topo_register_apic(apic_id, acpi_id, present); +} + +/** + * topology_register_boot_apic - Register the boot CPU APIC + * @apic_id: The APIC ID to set up + * + * Separate so CPU #0 can be assigned + */ +void __init topology_register_boot_apic(u32 apic_id) +{ + WARN_ON_ONCE(topo_info.boot_cpu_apic_id != BAD_APICID); + + topo_info.boot_cpu_apic_id = apic_id; + topo_register_apic(apic_id, CPU_ACPIID_INVALID, true); +} - c->x86_max_cores = (core_level_siblings / smp_num_siblings); - __max_die_per_package = (die_level_siblings / core_level_siblings); +/** + * topology_get_logical_id - Retrieve the logical ID at a given topology domain level + * @apicid: The APIC ID for which to lookup the logical ID + * @at_level: The topology domain level to use + * + * @apicid must be a full APIC ID, not the normalized variant. It's valid to have + * all bits below the domain level specified by @at_level to be clear. So both + * real APIC IDs and backshifted normalized APIC IDs work correctly. + * + * Returns: + * - >= 0: The requested logical ID + * - -ERANGE: @apicid is out of range + * - -ENODEV: @apicid is not registered + */ +int topology_get_logical_id(u32 apicid, enum x86_topology_domains at_level) +{ + /* Remove the bits below @at_level to get the proper level ID of @apicid */ + unsigned int lvlid = topo_apicid(apicid, at_level); + + if (lvlid >= MAX_LOCAL_APIC) + return -ERANGE; + if (!test_bit(lvlid, apic_maps[at_level].map)) + return -ENODEV; + /* Get the number of set bits before @lvlid. */ + return bitmap_weight(apic_maps[at_level].map, lvlid); +} +EXPORT_SYMBOL_GPL(topology_get_logical_id); + +/** + * topology_unit_count - Retrieve the count of specified units at a given topology domain level + * @apicid: The APIC ID which specifies the search range + * @which_units: The domain level specifying the units to count + * @at_level: The domain level at which @which_units have to be counted + * + * This returns the number of possible units according to the enumerated + * information. + * + * E.g. topology_count_units(apicid, TOPO_CORE_DOMAIN, TOPO_PKG_DOMAIN) + * counts the number of possible cores in the package to which @apicid + * belongs. + * + * @at_level must obviously be greater than @which_level to produce useful + * results. If @at_level is equal to @which_units the result is + * unsurprisingly 1. If @at_level is less than @which_units the results + * is by definition undefined and the function returns 0. + */ +unsigned int topology_unit_count(u32 apicid, enum x86_topology_domains which_units, + enum x86_topology_domains at_level) +{ + /* Remove the bits below @at_level to get the proper level ID of @apicid */ + unsigned int lvlid = topo_apicid(apicid, at_level); + + if (lvlid >= MAX_LOCAL_APIC) + return 0; + if (!test_bit(lvlid, apic_maps[at_level].map)) + return 0; + if (which_units > at_level) + return 0; + if (which_units == at_level) + return 1; + return topo_unit_count(lvlid, at_level, apic_maps[which_units].map); +} + +#ifdef CONFIG_ACPI_HOTPLUG_CPU +/** + * topology_hotplug_apic - Handle a physical hotplugged APIC after boot + * @apic_id: The APIC ID to set up + * @acpi_id: The ACPI ID associated to the APIC + */ +int topology_hotplug_apic(u32 apic_id, u32 acpi_id) +{ + int cpu; + + if (apic_id >= MAX_LOCAL_APIC) + return -EINVAL; + + /* Reject if the APIC ID was not registered during enumeration. 
*/ + if (!test_bit(apic_id, apic_maps[TOPO_SMT_DOMAIN].map)) + return -ENODEV; + + cpu = topo_lookup_cpuid(apic_id); + if (cpu < 0) + return -ENOSPC; + + set_bit(apic_id, phys_cpu_present_map); + topo_set_cpuids(cpu, apic_id, acpi_id); + cpu_mark_primary_thread(cpu, apic_id); + return cpu; +} + +/** + * topology_hotunplug_apic - Remove a physical hotplugged APIC after boot + * @cpu: The CPU number for which the APIC ID is removed + */ +void topology_hotunplug_apic(unsigned int cpu) +{ + u32 apic_id = cpuid_to_apicid[cpu]; + + if (apic_id == BAD_APICID) + return; + + per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID; + clear_bit(apic_id, phys_cpu_present_map); + set_cpu_present(cpu, false); +} #endif + +#ifdef CONFIG_X86_LOCAL_APIC +static unsigned int max_possible_cpus __initdata = NR_CPUS; + +/** + * topology_apply_cmdline_limits_early - Apply topology command line limits early + * + * Ensure that command line limits are in effect before firmware parsing + * takes place. + */ +void __init topology_apply_cmdline_limits_early(void) +{ + unsigned int possible = nr_cpu_ids; + + /* 'maxcpus=0' 'nosmp' 'nolapic' */ + if (!setup_max_cpus || apic_is_disabled) + possible = 1; + + /* 'possible_cpus=N' */ + possible = min_t(unsigned int, max_possible_cpus, possible); + + if (possible < nr_cpu_ids) { + pr_info("Limiting to %u possible CPUs\n", possible); + set_nr_cpu_ids(possible); + } +} + +static __init bool restrict_to_up(void) +{ + if (!smp_found_config) + return true; + /* + * XEN PV is special as it does not advertise the local APIC + * properly, but provides a fake topology for it so that the + * infrastructure works. So don't apply the restrictions vs. APIC + * here. + */ + if (xen_pv_domain()) + return false; + + return apic_is_disabled; +} + +void __init topology_init_possible_cpus(void) +{ + unsigned int assigned = topo_info.nr_assigned_cpus; + unsigned int disabled = topo_info.nr_disabled_cpus; + unsigned int cnta, cntb, cpu, allowed = 1; + unsigned int total = assigned + disabled; + u32 apicid, firstid; + + /* + * If there was no APIC registered, then fake one so that the + * topology bitmap is populated. That ensures that the code below + * is valid and the various query interfaces can be used + * unconditionally. This does not affect the actual APIC code in + * any way because either the local APIC address has not been + * registered or the local APIC was disabled on the command line. + */ + if (topo_info.boot_cpu_apic_id == BAD_APICID) + topology_register_boot_apic(0); + + if (!restrict_to_up()) { + if (WARN_ON_ONCE(assigned > nr_cpu_ids)) { + disabled += assigned - nr_cpu_ids; + assigned = nr_cpu_ids; + } + allowed = min_t(unsigned int, total, nr_cpu_ids); + } + + if (total > allowed) + pr_warn("%u possible CPUs exceed the limit of %u\n", total, allowed); + + assigned = min_t(unsigned int, allowed, assigned); + disabled = allowed - assigned; + + topo_info.nr_assigned_cpus = assigned; + topo_info.nr_disabled_cpus = disabled; + + total_cpus = allowed; + set_nr_cpu_ids(allowed); + + cnta = domain_weight(TOPO_PKG_DOMAIN); + cntb = domain_weight(TOPO_DIE_DOMAIN); + __max_logical_packages = cnta; + __max_dies_per_package = 1U << (get_count_order(cntb) - get_count_order(cnta)); + + pr_info("Max. logical packages: %3u\n", cnta); + pr_info("Max. logical dies: %3u\n", cntb); + pr_info("Max. 
dies per package: %3u\n", __max_dies_per_package); + + cnta = domain_weight(TOPO_CORE_DOMAIN); + cntb = domain_weight(TOPO_SMT_DOMAIN); + /* + * Can't use order delta here as order(cnta) can be equal + * order(cntb) even if cnta != cntb. + */ + __max_threads_per_core = DIV_ROUND_UP(cntb, cnta); + pr_info("Max. threads per core: %3u\n", __max_threads_per_core); + + firstid = find_first_bit(apic_maps[TOPO_SMT_DOMAIN].map, MAX_LOCAL_APIC); + __num_cores_per_package = topology_unit_count(firstid, TOPO_CORE_DOMAIN, TOPO_PKG_DOMAIN); + pr_info("Num. cores per package: %3u\n", __num_cores_per_package); + __num_threads_per_package = topology_unit_count(firstid, TOPO_SMT_DOMAIN, TOPO_PKG_DOMAIN); + pr_info("Num. threads per package: %3u\n", __num_threads_per_package); + + pr_info("Allowing %u present CPUs plus %u hotplug CPUs\n", assigned, disabled); + if (topo_info.nr_rejected_cpus) + pr_info("Rejected CPUs %u\n", topo_info.nr_rejected_cpus); + + init_cpu_present(cpumask_of(0)); + init_cpu_possible(cpumask_of(0)); + + /* Assign CPU numbers to non-present CPUs */ + for (apicid = 0; disabled; disabled--, apicid++) { + apicid = find_next_andnot_bit(apic_maps[TOPO_SMT_DOMAIN].map, phys_cpu_present_map, + MAX_LOCAL_APIC, apicid); + if (apicid >= MAX_LOCAL_APIC) + break; + cpuid_to_apicid[topo_info.nr_assigned_cpus++] = apicid; + } + + for (cpu = 0; cpu < allowed; cpu++) { + apicid = cpuid_to_apicid[cpu]; + + set_cpu_possible(cpu, true); + + if (apicid == BAD_APICID) + continue; + + cpu_mark_primary_thread(cpu, apicid); + set_cpu_present(cpu, test_bit(apicid, phys_cpu_present_map)); + } +} + +/* + * Late SMP disable after sizing CPU masks when APIC/IOAPIC setup failed. + */ +void __init topology_reset_possible_cpus_up(void) +{ + init_cpu_present(cpumask_of(0)); + init_cpu_possible(cpumask_of(0)); + + bitmap_zero(phys_cpu_present_map, MAX_LOCAL_APIC); + if (topo_info.boot_cpu_apic_id != BAD_APICID) + set_bit(topo_info.boot_cpu_apic_id, phys_cpu_present_map); +} + +static int __init setup_possible_cpus(char *str) +{ + get_option(&str, &max_possible_cpus); return 0; } +early_param("possible_cpus", setup_possible_cpus); +#endif diff --git a/arch/x86/kernel/cpu/topology.h b/arch/x86/kernel/cpu/topology.h new file mode 100644 index 000000000000..37326297f80c --- /dev/null +++ b/arch/x86/kernel/cpu/topology.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef ARCH_X86_TOPOLOGY_H +#define ARCH_X86_TOPOLOGY_H + +struct topo_scan { + struct cpuinfo_x86 *c; + unsigned int dom_shifts[TOPO_MAX_DOMAIN]; + unsigned int dom_ncpus[TOPO_MAX_DOMAIN]; + + /* Legacy CPUID[1]:EBX[23:16] number of logical processors */ + unsigned int ebx1_nproc_shift; + + /* AMD specific node ID which cannot be mapped into APIC space. 
*/ + u16 amd_nodes_per_pkg; + u16 amd_node_id; +}; + +void cpu_init_topology(struct cpuinfo_x86 *c); +void cpu_parse_topology(struct cpuinfo_x86 *c); +void topology_set_dom(struct topo_scan *tscan, enum x86_topology_domains dom, + unsigned int shift, unsigned int ncpus); +bool cpu_parse_topology_ext(struct topo_scan *tscan); +void cpu_parse_topology_amd(struct topo_scan *tscan); +void cpu_topology_fixup_amd(struct topo_scan *tscan); + +static inline u32 topo_shift_apicid(u32 apicid, enum x86_topology_domains dom) +{ + if (dom == TOPO_SMT_DOMAIN) + return apicid; + return apicid >> x86_topo_system.dom_shifts[dom - 1]; +} + +static inline u32 topo_relative_domain_id(u32 apicid, enum x86_topology_domains dom) +{ + if (dom != TOPO_SMT_DOMAIN) + apicid >>= x86_topo_system.dom_shifts[dom - 1]; + return apicid & (x86_topo_system.dom_size[dom] - 1); +} + +static inline u32 topo_domain_mask(enum x86_topology_domains dom) +{ + return (1U << x86_topo_system.dom_shifts[dom]) - 1; +} + +/* + * Update a domain level after the fact without propagating. Used to fixup + * broken CPUID enumerations. + */ +static inline void topology_update_dom(struct topo_scan *tscan, enum x86_topology_domains dom, + unsigned int shift, unsigned int ncpus) +{ + tscan->dom_shifts[dom] = shift; + tscan->dom_ncpus[dom] = ncpus; +} + +#ifdef CONFIG_X86_LOCAL_APIC +unsigned int topology_unit_count(u32 apicid, enum x86_topology_domains which_units, + enum x86_topology_domains at_level); +#else +static inline unsigned int topology_unit_count(u32 apicid, enum x86_topology_domains which_units, + enum x86_topology_domains at_level) +{ + return 1; +} +#endif + +#endif /* ARCH_X86_TOPOLOGY_H */ diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c new file mode 100644 index 000000000000..03b3c9c3a45e --- /dev/null +++ b/arch/x86/kernel/cpu/topology_amd.c @@ -0,0 +1,220 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/cpu.h> + +#include <asm/apic.h> +#include <asm/memtype.h> +#include <asm/processor.h> + +#include "cpu.h" + +static bool parse_8000_0008(struct topo_scan *tscan) +{ + struct { + // ecx + u32 cpu_nthreads : 8, // Number of physical threads - 1 + : 4, // Reserved + apicid_coreid_len : 4, // Number of thread core ID bits (shift) in APIC ID + perf_tsc_len : 2, // Performance time-stamp counter size + : 14; // Reserved + } ecx; + unsigned int sft; + + if (tscan->c->extended_cpuid_level < 0x80000008) + return false; + + cpuid_leaf_reg(0x80000008, CPUID_ECX, &ecx); + + /* If the thread bits are 0, then get the shift value from ecx.cpu_nthreads */ + sft = ecx.apicid_coreid_len; + if (!sft) + sft = get_count_order(ecx.cpu_nthreads + 1); + + /* + * cpu_nthreads describes the number of threads in the package + * sft is the number of APIC ID bits per package + * + * As the number of actual threads per core is not described in + * this leaf, just set the CORE domain shift and let the later + * parsers set SMT shift. Assume one thread per core by default + * which is correct if there are no other CPUID leafs to parse. + */ + topology_update_dom(tscan, TOPO_SMT_DOMAIN, 0, 1); + topology_set_dom(tscan, TOPO_CORE_DOMAIN, sft, ecx.cpu_nthreads + 1); + return true; +} + +static void store_node(struct topo_scan *tscan, u16 nr_nodes, u16 node_id) +{ + /* + * Starting with Fam 17h the DIE domain could probably be used to + * retrieve the node info on AMD/HYGON. 
Analysis of CPUID dumps + * suggests it's the topmost bit(s) of the CPU cores area, but + * that's guess work and neither enumerated nor documented. + * + * Up to Fam 16h this does not work at all and the legacy node ID + * has to be used. + */ + tscan->amd_nodes_per_pkg = nr_nodes; + tscan->amd_node_id = node_id; +} + +static bool parse_8000_001e(struct topo_scan *tscan, bool has_topoext) +{ + struct { + // eax + u32 ext_apic_id : 32; // Extended APIC ID + // ebx + u32 core_id : 8, // Unique per-socket logical core unit ID + core_nthreads : 8, // #Threads per core (zero-based) + : 16; // Reserved + // ecx + u32 node_id : 8, // Node (die) ID of invoking logical CPU + nnodes_per_socket : 3, // #nodes in invoking logical CPU's package/socket + : 21; // Reserved + // edx + u32 : 32; // Reserved + } leaf; + + if (!boot_cpu_has(X86_FEATURE_TOPOEXT)) + return false; + + cpuid_leaf(0x8000001e, &leaf); + + tscan->c->topo.initial_apicid = leaf.ext_apic_id; + + /* + * If leaf 0xb is available, then the domain shifts are set + * already and nothing to do here. Only valid for family >= 0x17. + */ + if (!has_topoext && tscan->c->x86 >= 0x17) { + /* + * Leaf 0x80000008 set the CORE domain shift already. + * Update the SMT domain, but do not propagate it. + */ + unsigned int nthreads = leaf.core_nthreads + 1; + + topology_update_dom(tscan, TOPO_SMT_DOMAIN, get_count_order(nthreads), nthreads); + } + + store_node(tscan, leaf.nnodes_per_socket + 1, leaf.node_id); + + if (tscan->c->x86_vendor == X86_VENDOR_AMD) { + if (tscan->c->x86 == 0x15) + tscan->c->topo.cu_id = leaf.core_id; + + cacheinfo_amd_init_llc_id(tscan->c, leaf.node_id); + } else { + /* + * Package ID is ApicId[6..] on certain Hygon CPUs. See + * commit e0ceeae708ce for explanation. The topology info + * is screwed up: The package shift is always 6 and the + * node ID is bit [4:5]. + */ + if (!boot_cpu_has(X86_FEATURE_HYPERVISOR) && tscan->c->x86_model <= 0x3) { + topology_set_dom(tscan, TOPO_CORE_DOMAIN, 6, + tscan->dom_ncpus[TOPO_CORE_DOMAIN]); + } + cacheinfo_hygon_init_llc_id(tscan->c); + } + return true; +} + +static void parse_fam10h_node_id(struct topo_scan *tscan) +{ + union { + struct { + u64 node_id : 3, + nodes_per_pkg : 3, + unused : 58; + }; + u64 msr; + } nid; + + if (!boot_cpu_has(X86_FEATURE_NODEID_MSR)) + return; + + rdmsrl(MSR_FAM10H_NODE_ID, nid.msr); + store_node(tscan, nid.nodes_per_pkg + 1, nid.node_id); + tscan->c->topo.llc_id = nid.node_id; +} + +static void legacy_set_llc(struct topo_scan *tscan) +{ + unsigned int apicid = tscan->c->topo.initial_apicid; + + /* If none of the parsers set LLC ID then use the die ID for it. 
*/ + if (tscan->c->topo.llc_id == BAD_APICID) + tscan->c->topo.llc_id = apicid >> tscan->dom_shifts[TOPO_CORE_DOMAIN]; +} + +static void topoext_fixup(struct topo_scan *tscan) +{ + struct cpuinfo_x86 *c = tscan->c; + u64 msrval; + + /* Try to re-enable TopologyExtensions if switched off by BIOS */ + if (cpu_has(c, X86_FEATURE_TOPOEXT) || c->x86_vendor != X86_VENDOR_AMD || + c->x86 != 0x15 || c->x86_model < 0x10 || c->x86_model > 0x6f) + return; + + if (msr_set_bit(0xc0011005, 54) <= 0) + return; + + rdmsrl(0xc0011005, msrval); + if (msrval & BIT_64(54)) { + set_cpu_cap(c, X86_FEATURE_TOPOEXT); + pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n"); + } +} + +static void parse_topology_amd(struct topo_scan *tscan) +{ + bool has_topoext = false; + + /* + * If the extended topology leaf 0x8000_001e is available + * try to get SMT, CORE, TILE, and DIE shifts from extended + * CPUID leaf 0x8000_0026 on supported processors first. If + * extended CPUID leaf 0x8000_0026 is not supported, try to + * get SMT and CORE shift from leaf 0xb first, then try to + * get the CORE shift from leaf 0x8000_0008. + */ + if (cpu_feature_enabled(X86_FEATURE_TOPOEXT)) + has_topoext = cpu_parse_topology_ext(tscan); + + if (cpu_feature_enabled(X86_FEATURE_AMD_HETEROGENEOUS_CORES)) + tscan->c->topo.cpu_type = cpuid_ebx(0x80000026); + + if (!has_topoext && !parse_8000_0008(tscan)) + return; + + /* Prefer leaf 0x8000001e if available */ + if (parse_8000_001e(tscan, has_topoext)) + return; + + /* Try the NODEID MSR */ + parse_fam10h_node_id(tscan); +} + +void cpu_parse_topology_amd(struct topo_scan *tscan) +{ + tscan->amd_nodes_per_pkg = 1; + topoext_fixup(tscan); + parse_topology_amd(tscan); + legacy_set_llc(tscan); + + if (tscan->amd_nodes_per_pkg > 1) + set_cpu_cap(tscan->c, X86_FEATURE_AMD_DCM); +} + +void cpu_topology_fixup_amd(struct topo_scan *tscan) +{ + struct cpuinfo_x86 *c = tscan->c; + + /* + * Adjust the core_id relative to the node when there is more than + * one node. 
+ */ + if (tscan->c->x86 < 0x17 && tscan->amd_nodes_per_pkg > 1) + c->topo.core_id %= tscan->dom_ncpus[TOPO_CORE_DOMAIN] / tscan->amd_nodes_per_pkg; +} diff --git a/arch/x86/kernel/cpu/topology_common.c b/arch/x86/kernel/cpu/topology_common.c new file mode 100644 index 000000000000..b5a5e1411469 --- /dev/null +++ b/arch/x86/kernel/cpu/topology_common.c @@ -0,0 +1,255 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/cpu.h> + +#include <xen/xen.h> + +#include <asm/intel-family.h> +#include <asm/apic.h> +#include <asm/processor.h> +#include <asm/smp.h> + +#include "cpu.h" + +struct x86_topology_system x86_topo_system __ro_after_init; +EXPORT_SYMBOL_GPL(x86_topo_system); + +unsigned int __amd_nodes_per_pkg __ro_after_init; +EXPORT_SYMBOL_GPL(__amd_nodes_per_pkg); + +void topology_set_dom(struct topo_scan *tscan, enum x86_topology_domains dom, + unsigned int shift, unsigned int ncpus) +{ + topology_update_dom(tscan, dom, shift, ncpus); + + /* Propagate to the upper levels */ + for (dom++; dom < TOPO_MAX_DOMAIN; dom++) { + tscan->dom_shifts[dom] = tscan->dom_shifts[dom - 1]; + tscan->dom_ncpus[dom] = tscan->dom_ncpus[dom - 1]; + } +} + +enum x86_topology_cpu_type get_topology_cpu_type(struct cpuinfo_x86 *c) +{ + if (c->x86_vendor == X86_VENDOR_INTEL) { + switch (c->topo.intel_type) { + case INTEL_CPU_TYPE_ATOM: return TOPO_CPU_TYPE_EFFICIENCY; + case INTEL_CPU_TYPE_CORE: return TOPO_CPU_TYPE_PERFORMANCE; + } + } + if (c->x86_vendor == X86_VENDOR_AMD) { + switch (c->topo.amd_type) { + case 0: return TOPO_CPU_TYPE_PERFORMANCE; + case 1: return TOPO_CPU_TYPE_EFFICIENCY; + } + } + + return TOPO_CPU_TYPE_UNKNOWN; +} + +const char *get_topology_cpu_type_name(struct cpuinfo_x86 *c) +{ + switch (get_topology_cpu_type(c)) { + case TOPO_CPU_TYPE_PERFORMANCE: + return "performance"; + case TOPO_CPU_TYPE_EFFICIENCY: + return "efficiency"; + default: + return "unknown"; + } +} + +static unsigned int __maybe_unused parse_num_cores_legacy(struct cpuinfo_x86 *c) +{ + struct { + u32 cache_type : 5, + unused : 21, + ncores : 6; + } eax; + + if (c->cpuid_level < 4) + return 1; + + cpuid_subleaf_reg(4, 0, CPUID_EAX, &eax); + if (!eax.cache_type) + return 1; + + return eax.ncores + 1; +} + +static void parse_legacy(struct topo_scan *tscan) +{ + unsigned int cores, core_shift, smt_shift = 0; + struct cpuinfo_x86 *c = tscan->c; + + cores = parse_num_cores_legacy(c); + core_shift = get_count_order(cores); + + if (cpu_has(c, X86_FEATURE_HT)) { + if (!WARN_ON_ONCE(tscan->ebx1_nproc_shift < core_shift)) + smt_shift = tscan->ebx1_nproc_shift - core_shift; + /* + * The parser expects leaf 0xb/0x1f format, which means + * the number of logical processors at core level is + * counting threads. + */ + core_shift += smt_shift; + cores <<= smt_shift; + } + + topology_set_dom(tscan, TOPO_SMT_DOMAIN, smt_shift, 1U << smt_shift); + topology_set_dom(tscan, TOPO_CORE_DOMAIN, core_shift, cores); +} + +static bool fake_topology(struct topo_scan *tscan) +{ + /* + * Preset the CORE level shift for CPUID less systems and XEN_PV, + * which has useless CPUID information. 
+ */ + topology_set_dom(tscan, TOPO_SMT_DOMAIN, 0, 1); + topology_set_dom(tscan, TOPO_CORE_DOMAIN, 0, 1); + + return tscan->c->cpuid_level < 1; +} + +static void parse_topology(struct topo_scan *tscan, bool early) +{ + const struct cpuinfo_topology topo_defaults = { + .cu_id = 0xff, + .llc_id = BAD_APICID, + .l2c_id = BAD_APICID, + .cpu_type = TOPO_CPU_TYPE_UNKNOWN, + }; + struct cpuinfo_x86 *c = tscan->c; + struct { + u32 unused0 : 16, + nproc : 8, + apicid : 8; + } ebx; + + c->topo = topo_defaults; + + if (fake_topology(tscan)) + return; + + /* Preset Initial APIC ID from CPUID leaf 1 */ + cpuid_leaf_reg(1, CPUID_EBX, &ebx); + c->topo.initial_apicid = ebx.apicid; + + /* + * The initial invocation from early_identify_cpu() happens before + * the APIC is mapped or X2APIC enabled. For establishing the + * topology, that's not required. Use the initial APIC ID. + */ + if (early) + c->topo.apicid = c->topo.initial_apicid; + else + c->topo.apicid = read_apic_id(); + + /* The above is sufficient for UP */ + if (!IS_ENABLED(CONFIG_SMP)) + return; + + tscan->ebx1_nproc_shift = get_count_order(ebx.nproc); + + switch (c->x86_vendor) { + case X86_VENDOR_AMD: + if (IS_ENABLED(CONFIG_CPU_SUP_AMD)) + cpu_parse_topology_amd(tscan); + break; + case X86_VENDOR_CENTAUR: + case X86_VENDOR_ZHAOXIN: + parse_legacy(tscan); + break; + case X86_VENDOR_INTEL: + if (!IS_ENABLED(CONFIG_CPU_SUP_INTEL) || !cpu_parse_topology_ext(tscan)) + parse_legacy(tscan); + if (c->cpuid_level >= 0x1a) + c->topo.cpu_type = cpuid_eax(0x1a); + break; + case X86_VENDOR_HYGON: + if (IS_ENABLED(CONFIG_CPU_SUP_HYGON)) + cpu_parse_topology_amd(tscan); + break; + } +} + +static void topo_set_ids(struct topo_scan *tscan, bool early) +{ + struct cpuinfo_x86 *c = tscan->c; + u32 apicid = c->topo.apicid; + + c->topo.pkg_id = topo_shift_apicid(apicid, TOPO_PKG_DOMAIN); + c->topo.die_id = topo_shift_apicid(apicid, TOPO_DIE_DOMAIN); + + if (!early) { + c->topo.logical_pkg_id = topology_get_logical_id(apicid, TOPO_PKG_DOMAIN); + c->topo.logical_die_id = topology_get_logical_id(apicid, TOPO_DIE_DOMAIN); + c->topo.logical_core_id = topology_get_logical_id(apicid, TOPO_CORE_DOMAIN); + } + + /* Package relative core ID */ + c->topo.core_id = (apicid & topo_domain_mask(TOPO_PKG_DOMAIN)) >> + x86_topo_system.dom_shifts[TOPO_SMT_DOMAIN]; + + c->topo.amd_node_id = tscan->amd_node_id; + + if (c->x86_vendor == X86_VENDOR_AMD) + cpu_topology_fixup_amd(tscan); +} + +void cpu_parse_topology(struct cpuinfo_x86 *c) +{ + unsigned int dom, cpu = smp_processor_id(); + struct topo_scan tscan = { .c = c, }; + + parse_topology(&tscan, false); + + if (IS_ENABLED(CONFIG_X86_LOCAL_APIC)) { + if (c->topo.initial_apicid != c->topo.apicid) { + pr_err(FW_BUG "CPU%4u: APIC ID mismatch. CPUID: 0x%04x APIC: 0x%04x\n", + cpu, c->topo.initial_apicid, c->topo.apicid); + } + + if (c->topo.apicid != cpuid_to_apicid[cpu]) { + pr_err(FW_BUG "CPU%4u: APIC ID mismatch. Firmware: 0x%04x APIC: 0x%04x\n", + cpu, cpuid_to_apicid[cpu], c->topo.apicid); + } + } + + for (dom = TOPO_SMT_DOMAIN; dom < TOPO_MAX_DOMAIN; dom++) { + if (tscan.dom_shifts[dom] == x86_topo_system.dom_shifts[dom]) + continue; + pr_err(FW_BUG "CPU%d: Topology domain %u shift %u != %u\n", cpu, dom, + tscan.dom_shifts[dom], x86_topo_system.dom_shifts[dom]); + } + + topo_set_ids(&tscan, false); +} + +void __init cpu_init_topology(struct cpuinfo_x86 *c) +{ + struct topo_scan tscan = { .c = c, }; + unsigned int dom, sft; + + parse_topology(&tscan, true); + + /* Copy the shift values and calculate the unit sizes. 
*/ + memcpy(x86_topo_system.dom_shifts, tscan.dom_shifts, sizeof(x86_topo_system.dom_shifts)); + + dom = TOPO_SMT_DOMAIN; + x86_topo_system.dom_size[dom] = 1U << x86_topo_system.dom_shifts[dom]; + + for (dom++; dom < TOPO_MAX_DOMAIN; dom++) { + sft = x86_topo_system.dom_shifts[dom] - x86_topo_system.dom_shifts[dom - 1]; + x86_topo_system.dom_size[dom] = 1U << sft; + } + + topo_set_ids(&tscan, true); + + /* + * AMD systems have Nodes per package which cannot be mapped to + * APIC ID. + */ + __amd_nodes_per_pkg = tscan.amd_nodes_per_pkg; +} diff --git a/arch/x86/kernel/cpu/topology_ext.c b/arch/x86/kernel/cpu/topology_ext.c new file mode 100644 index 000000000000..467b0326bf1a --- /dev/null +++ b/arch/x86/kernel/cpu/topology_ext.c @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/cpu.h> + +#include <asm/apic.h> +#include <asm/memtype.h> +#include <asm/processor.h> + +#include "cpu.h" + +enum topo_types { + INVALID_TYPE = 0, + SMT_TYPE = 1, + CORE_TYPE = 2, + MAX_TYPE_0B = 3, + MODULE_TYPE = 3, + AMD_CCD_TYPE = 3, + TILE_TYPE = 4, + AMD_SOCKET_TYPE = 4, + MAX_TYPE_80000026 = 5, + DIE_TYPE = 5, + DIEGRP_TYPE = 6, + MAX_TYPE_1F = 7, +}; + +/* + * Use a lookup table for the case that there are future types > 6 which + * describe an intermediate domain level which does not exist today. + */ +static const unsigned int topo_domain_map_0b_1f[MAX_TYPE_1F] = { + [SMT_TYPE] = TOPO_SMT_DOMAIN, + [CORE_TYPE] = TOPO_CORE_DOMAIN, + [MODULE_TYPE] = TOPO_MODULE_DOMAIN, + [TILE_TYPE] = TOPO_TILE_DOMAIN, + [DIE_TYPE] = TOPO_DIE_DOMAIN, + [DIEGRP_TYPE] = TOPO_DIEGRP_DOMAIN, +}; + +static const unsigned int topo_domain_map_80000026[MAX_TYPE_80000026] = { + [SMT_TYPE] = TOPO_SMT_DOMAIN, + [CORE_TYPE] = TOPO_CORE_DOMAIN, + [AMD_CCD_TYPE] = TOPO_TILE_DOMAIN, + [AMD_SOCKET_TYPE] = TOPO_DIE_DOMAIN, +}; + +static inline bool topo_subleaf(struct topo_scan *tscan, u32 leaf, u32 subleaf, + unsigned int *last_dom) +{ + unsigned int dom, maxtype; + const unsigned int *map; + struct { + // eax + u32 x2apic_shift : 5, // Number of bits to shift APIC ID right + // for the topology ID at the next level + : 27; // Reserved + // ebx + u32 num_processors : 16, // Number of processors at current level + : 16; // Reserved + // ecx + u32 level : 8, // Current topology level. Same as sub leaf number + type : 8, // Level type. If 0, invalid + : 16; // Reserved + // edx + u32 x2apic_id : 32; // X2APIC ID of the current logical processor + } sl; + + switch (leaf) { + case 0x0b: maxtype = MAX_TYPE_0B; map = topo_domain_map_0b_1f; break; + case 0x1f: maxtype = MAX_TYPE_1F; map = topo_domain_map_0b_1f; break; + case 0x80000026: maxtype = MAX_TYPE_80000026; map = topo_domain_map_80000026; break; + default: return false; + } + + cpuid_subleaf(leaf, subleaf, &sl); + + if (!sl.num_processors || sl.type == INVALID_TYPE) + return false; + + if (sl.type >= maxtype) { + pr_err_once("Topology: leaf 0x%x:%d Unknown domain type %u\n", + leaf, subleaf, sl.type); + /* + * It really would have been too obvious to make the domain + * type space sparse and leave a few reserved types between + * the points which might change instead of following the + * usual "this can be fixed in software" principle. 
+ */ + dom = *last_dom + 1; + } else { + dom = map[sl.type]; + *last_dom = dom; + } + + if (!dom) { + tscan->c->topo.initial_apicid = sl.x2apic_id; + } else if (tscan->c->topo.initial_apicid != sl.x2apic_id) { + pr_warn_once(FW_BUG "CPUID leaf 0x%x subleaf %d APIC ID mismatch %x != %x\n", + leaf, subleaf, tscan->c->topo.initial_apicid, sl.x2apic_id); + } + + topology_set_dom(tscan, dom, sl.x2apic_shift, sl.num_processors); + return true; +} + +static bool parse_topology_leaf(struct topo_scan *tscan, u32 leaf) +{ + unsigned int last_dom; + u32 subleaf; + + /* Read all available subleafs and populate the levels */ + for (subleaf = 0, last_dom = 0; topo_subleaf(tscan, leaf, subleaf, &last_dom); subleaf++); + + /* If subleaf 0 failed to parse, give up */ + if (!subleaf) + return false; + + /* + * There are machines in the wild which have shift 0 in the subleaf + * 0, but advertise 2 logical processors at that level. They are + * truly SMT. + */ + if (!tscan->dom_shifts[TOPO_SMT_DOMAIN] && tscan->dom_ncpus[TOPO_SMT_DOMAIN] > 1) { + unsigned int sft = get_count_order(tscan->dom_ncpus[TOPO_SMT_DOMAIN]); + + pr_warn_once(FW_BUG "CPUID leaf 0x%x subleaf 0 has shift level 0 but %u CPUs. Fixing it up.\n", + leaf, tscan->dom_ncpus[TOPO_SMT_DOMAIN]); + topology_update_dom(tscan, TOPO_SMT_DOMAIN, sft, tscan->dom_ncpus[TOPO_SMT_DOMAIN]); + } + + set_cpu_cap(tscan->c, X86_FEATURE_XTOPOLOGY); + return true; +} + +bool cpu_parse_topology_ext(struct topo_scan *tscan) +{ + /* Intel: Try leaf 0x1F first. */ + if (tscan->c->cpuid_level >= 0x1f && parse_topology_leaf(tscan, 0x1f)) + return true; + + /* AMD: Try leaf 0x80000026 first. */ + if (tscan->c->extended_cpuid_level >= 0x80000026 && parse_topology_leaf(tscan, 0x80000026)) + return true; + + /* Intel/AMD: Fall back to leaf 0xB if available */ + return tscan->c->cpuid_level >= 0x0b && parse_topology_leaf(tscan, 0x0b); +} diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 11f83d07925e..cb3f900c46fc 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -26,6 +26,7 @@ #include <linux/export.h> #include <linux/clocksource.h> #include <linux/cpu.h> +#include <linux/efi.h> #include <linux/reboot.h> #include <linux/static_call.h> #include <asm/div64.h> @@ -41,80 +42,97 @@ #define CPUID_VMWARE_INFO_LEAF 0x40000000 #define CPUID_VMWARE_FEATURES_LEAF 0x40000010 -#define CPUID_VMWARE_FEATURES_ECX_VMMCALL BIT(0) -#define CPUID_VMWARE_FEATURES_ECX_VMCALL BIT(1) -#define VMWARE_HYPERVISOR_MAGIC 0x564D5868 - -#define VMWARE_CMD_GETVERSION 10 -#define VMWARE_CMD_GETHZ 45 -#define VMWARE_CMD_GETVCPU_INFO 68 -#define VMWARE_CMD_LEGACY_X2APIC 3 -#define VMWARE_CMD_VCPU_RESERVED 31 -#define VMWARE_CMD_STEALCLOCK 91 +#define GETVCPU_INFO_LEGACY_X2APIC BIT(3) +#define GETVCPU_INFO_VCPU_RESERVED BIT(31) #define STEALCLOCK_NOT_AVAILABLE (-1) #define STEALCLOCK_DISABLED 0 #define STEALCLOCK_ENABLED 1 -#define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \ - __asm__("inl (%%dx), %%eax" : \ - "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \ - "a"(VMWARE_HYPERVISOR_MAGIC), \ - "c"(VMWARE_CMD_##cmd), \ - "d"(VMWARE_HYPERVISOR_PORT), "b"(UINT_MAX) : \ - "memory") - -#define VMWARE_VMCALL(cmd, eax, ebx, ecx, edx) \ - __asm__("vmcall" : \ - "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \ - "a"(VMWARE_HYPERVISOR_MAGIC), \ - "c"(VMWARE_CMD_##cmd), \ - "d"(0), "b"(UINT_MAX) : \ - "memory") - -#define VMWARE_VMMCALL(cmd, eax, ebx, ecx, edx) \ - __asm__("vmmcall" : \ - "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \ - "a"(VMWARE_HYPERVISOR_MAGIC), \ - 
"c"(VMWARE_CMD_##cmd), \ - "d"(0), "b"(UINT_MAX) : \ - "memory") - -#define VMWARE_CMD(cmd, eax, ebx, ecx, edx) do { \ - switch (vmware_hypercall_mode) { \ - case CPUID_VMWARE_FEATURES_ECX_VMCALL: \ - VMWARE_VMCALL(cmd, eax, ebx, ecx, edx); \ - break; \ - case CPUID_VMWARE_FEATURES_ECX_VMMCALL: \ - VMWARE_VMMCALL(cmd, eax, ebx, ecx, edx); \ - break; \ - default: \ - VMWARE_PORT(cmd, eax, ebx, ecx, edx); \ - break; \ - } \ - } while (0) - struct vmware_steal_time { union { - uint64_t clock; /* stolen time counter in units of vtsc */ + u64 clock; /* stolen time counter in units of vtsc */ struct { /* only for little-endian */ - uint32_t clock_low; - uint32_t clock_high; + u32 clock_low; + u32 clock_high; }; }; - uint64_t reserved[7]; + u64 reserved[7]; }; static unsigned long vmware_tsc_khz __ro_after_init; static u8 vmware_hypercall_mode __ro_after_init; +unsigned long vmware_hypercall_slow(unsigned long cmd, + unsigned long in1, unsigned long in3, + unsigned long in4, unsigned long in5, + u32 *out1, u32 *out2, u32 *out3, + u32 *out4, u32 *out5) +{ + unsigned long out0, rbx, rcx, rdx, rsi, rdi; + + switch (vmware_hypercall_mode) { + case CPUID_VMWARE_FEATURES_ECX_VMCALL: + asm_inline volatile ("vmcall" + : "=a" (out0), "=b" (rbx), "=c" (rcx), + "=d" (rdx), "=S" (rsi), "=D" (rdi) + : "a" (VMWARE_HYPERVISOR_MAGIC), + "b" (in1), + "c" (cmd), + "d" (in3), + "S" (in4), + "D" (in5) + : "cc", "memory"); + break; + case CPUID_VMWARE_FEATURES_ECX_VMMCALL: + asm_inline volatile ("vmmcall" + : "=a" (out0), "=b" (rbx), "=c" (rcx), + "=d" (rdx), "=S" (rsi), "=D" (rdi) + : "a" (VMWARE_HYPERVISOR_MAGIC), + "b" (in1), + "c" (cmd), + "d" (in3), + "S" (in4), + "D" (in5) + : "cc", "memory"); + break; + default: + asm_inline volatile ("movw %[port], %%dx; inl (%%dx), %%eax" + : "=a" (out0), "=b" (rbx), "=c" (rcx), + "=d" (rdx), "=S" (rsi), "=D" (rdi) + : [port] "i" (VMWARE_HYPERVISOR_PORT), + "a" (VMWARE_HYPERVISOR_MAGIC), + "b" (in1), + "c" (cmd), + "d" (in3), + "S" (in4), + "D" (in5) + : "cc", "memory"); + break; + } + + if (out1) + *out1 = rbx; + if (out2) + *out2 = rcx; + if (out3) + *out3 = rdx; + if (out4) + *out4 = rsi; + if (out5) + *out5 = rdi; + + return out0; +} + static inline int __vmware_platform(void) { - uint32_t eax, ebx, ecx, edx; - VMWARE_CMD(GETVERSION, eax, ebx, ecx, edx); - return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC; + u32 eax, ebx, ecx; + + eax = vmware_hypercall3(VMWARE_CMD_GETVERSION, 0, &ebx, &ecx); + return eax != UINT_MAX && ebx == VMWARE_HYPERVISOR_MAGIC; } static unsigned long vmware_get_tsc_khz(void) @@ -166,21 +184,12 @@ static void __init vmware_cyc2ns_setup(void) pr_info("using clock offset of %llu ns\n", d->cyc2ns_offset); } -static int vmware_cmd_stealclock(uint32_t arg1, uint32_t arg2) +static int vmware_cmd_stealclock(u32 addr_hi, u32 addr_lo) { - uint32_t result, info; - - asm volatile (VMWARE_HYPERCALL : - "=a"(result), - "=c"(info) : - "a"(VMWARE_HYPERVISOR_MAGIC), - "b"(0), - "c"(VMWARE_CMD_STEALCLOCK), - "d"(0), - "S"(arg1), - "D"(arg2) : - "memory"); - return result; + u32 info; + + return vmware_hypercall5(VMWARE_CMD_STEALCLOCK, 0, 0, addr_hi, addr_lo, + &info); } static bool stealclock_enable(phys_addr_t pa) @@ -215,15 +224,15 @@ static bool vmware_is_stealclock_available(void) * Return: * The steal clock reading in ns. 
*/ -static uint64_t vmware_steal_clock(int cpu) +static u64 vmware_steal_clock(int cpu) { struct vmware_steal_time *steal = &per_cpu(vmw_steal_time, cpu); - uint64_t clock; + u64 clock; if (IS_ENABLED(CONFIG_64BIT)) clock = READ_ONCE(steal->clock); else { - uint32_t initial_high, low, high; + u32 initial_high, low, high; do { initial_high = READ_ONCE(steal->clock_high); @@ -235,7 +244,7 @@ static uint64_t vmware_steal_clock(int cpu) high = READ_ONCE(steal->clock_high); } while (initial_high != high); - clock = ((uint64_t)high << 32) | low; + clock = ((u64)high << 32) | low; } return mul_u64_u32_shr(clock, vmware_cyc2ns.cyc2ns_mul, @@ -389,13 +398,13 @@ static void __init vmware_set_capabilities(void) static void __init vmware_platform_setup(void) { - uint32_t eax, ebx, ecx, edx; - uint64_t lpj, tsc_khz; + u32 eax, ebx, ecx; + u64 lpj, tsc_khz; - VMWARE_CMD(GETHZ, eax, ebx, ecx, edx); + eax = vmware_hypercall3(VMWARE_CMD_GETHZ, UINT_MAX, &ebx, &ecx); if (ebx != UINT_MAX) { - lpj = tsc_khz = eax | (((uint64_t)ebx) << 32); + lpj = tsc_khz = eax | (((u64)ebx) << 32); do_div(tsc_khz, 1000); WARN_ON(tsc_khz >> 32); pr_info("TSC freq read from hypervisor : %lu.%03lu MHz\n", @@ -421,6 +430,9 @@ static void __init vmware_platform_setup(void) pr_warn("Failed to get TSC freq from the hypervisor\n"); } + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP) && !efi_enabled(EFI_BOOT)) + x86_init.mpparse.find_mptable = mpparse_find_mptable; + vmware_paravirt_ops_setup(); #ifdef CONFIG_X86_IO_APIC @@ -446,7 +458,7 @@ static u8 __init vmware_select_hypercall(void) * If !boot_cpu_has(X86_FEATURE_HYPERVISOR), vmware_hypercall_mode * intentionally defaults to 0. */ -static uint32_t __init vmware_platform(void) +static u32 __init vmware_platform(void) { if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { unsigned int eax; @@ -474,11 +486,64 @@ static uint32_t __init vmware_platform(void) /* Checks if hypervisor supports x2apic without VT-D interrupt remapping. */ static bool __init vmware_legacy_x2apic_available(void) { - uint32_t eax, ebx, ecx, edx; - VMWARE_CMD(GETVCPU_INFO, eax, ebx, ecx, edx); - return !(eax & BIT(VMWARE_CMD_VCPU_RESERVED)) && - (eax & BIT(VMWARE_CMD_LEGACY_X2APIC)); + u32 eax; + + eax = vmware_hypercall1(VMWARE_CMD_GETVCPU_INFO, 0); + return !(eax & GETVCPU_INFO_VCPU_RESERVED) && + (eax & GETVCPU_INFO_LEGACY_X2APIC); +} + +#ifdef CONFIG_INTEL_TDX_GUEST +/* + * TDCALL[TDG.VP.VMCALL] uses %rax (arg0) and %rcx (arg2). Therefore, + * we remap those registers to %r12 and %r13, respectively. 
+ */ +unsigned long vmware_tdx_hypercall(unsigned long cmd, + unsigned long in1, unsigned long in3, + unsigned long in4, unsigned long in5, + u32 *out1, u32 *out2, u32 *out3, + u32 *out4, u32 *out5) +{ + struct tdx_module_args args = {}; + + if (!hypervisor_is_type(X86_HYPER_VMWARE)) { + pr_warn_once("Incorrect usage\n"); + return ULONG_MAX; + } + + if (cmd & ~VMWARE_CMD_MASK) { + pr_warn_once("Out of range command %lx\n", cmd); + return ULONG_MAX; + } + + args.rbx = in1; + args.rdx = in3; + args.rsi = in4; + args.rdi = in5; + args.r10 = VMWARE_TDX_VENDOR_LEAF; + args.r11 = VMWARE_TDX_HCALL_FUNC; + args.r12 = VMWARE_HYPERVISOR_MAGIC; + args.r13 = cmd; + /* CPL */ + args.r15 = 0; + + __tdx_hypercall(&args); + + if (out1) + *out1 = args.rbx; + if (out2) + *out2 = args.r13; + if (out3) + *out3 = args.rdx; + if (out4) + *out4 = args.rsi; + if (out5) + *out5 = args.rdi; + + return args.r12; } +EXPORT_SYMBOL_GPL(vmware_tdx_hypercall); +#endif #ifdef CONFIG_AMD_MEM_ENCRYPT static void vmware_sev_es_hcall_prepare(struct ghcb *ghcb, diff --git a/arch/x86/kernel/cpu/zhaoxin.c b/arch/x86/kernel/cpu/zhaoxin.c index 415564a6523b..90eba7eb5335 100644 --- a/arch/x86/kernel/cpu/zhaoxin.c +++ b/arch/x86/kernel/cpu/zhaoxin.c @@ -71,10 +71,6 @@ static void init_zhaoxin(struct cpuinfo_x86 *c) { early_init_zhaoxin(c); init_intel_cacheinfo(c); - detect_num_cpu_cores(c); -#ifdef CONFIG_X86_32 - detect_ht(c); -#endif if (c->cpuid_level > 9) { unsigned int eax = cpuid_eax(10); diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index c92d88680dbf..340af8155658 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -26,6 +26,7 @@ #include <linux/vmalloc.h> #include <linux/memblock.h> +#include <asm/bootparam.h> #include <asm/processor.h> #include <asm/hardirq.h> #include <asm/nmi.h> @@ -40,6 +41,7 @@ #include <asm/intel_pt.h> #include <asm/crash.h> #include <asm/cmdline.h> +#include <asm/sev.h> /* Used while preparing memory map entries for second kernel */ struct crash_memmap_data { @@ -59,6 +61,8 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs) */ cpu_emergency_stop_pt(); + kdump_sev_callback(); + disable_local_APIC(); } @@ -124,6 +128,18 @@ void native_machine_crash_shutdown(struct pt_regs *regs) #ifdef CONFIG_HPET_TIMER hpet_disable(); #endif + + /* + * Non-crash kexec calls enc_kexec_begin() while scheduling is still + * active. This allows the callback to wait until all in-flight + * shared<->private conversions are complete. In a crash scenario, + * enc_kexec_begin() gets called after all but one CPU have been shut + * down and interrupts have been disabled. This allows the callback to + * detect a race with the conversion and report it. + */ + x86_platform.guest.enc_kexec_begin(); + x86_platform.guest.enc_kexec_finish(); + crash_save_cpu(regs, safe_smp_processor_id()); } @@ -170,7 +186,7 @@ static int elf_header_exclude_ranges(struct crash_mem *cmem) int ret = 0; /* Exclude the low 1M because it is always reserved */ - ret = crash_exclude_mem_range(cmem, 0, (1<<20)-1); + ret = crash_exclude_mem_range(cmem, 0, SZ_1M - 1); if (ret) return ret; @@ -198,8 +214,8 @@ static int prepare_elf64_ram_headers_callback(struct resource *res, void *arg) } /* Prepare elf headers. 
Return addr and size */ -static int prepare_elf_headers(struct kimage *image, void **addr, - unsigned long *sz, unsigned long *nr_mem_ranges) +static int prepare_elf_headers(void **addr, unsigned long *sz, + unsigned long *nr_mem_ranges) { struct crash_mem *cmem; int ret; @@ -221,7 +237,7 @@ static int prepare_elf_headers(struct kimage *image, void **addr, *nr_mem_ranges = cmem->nr_ranges; /* By default prepare 64bit headers */ - ret = crash_prepare_elf64_headers(cmem, IS_ENABLED(CONFIG_X86_64), addr, sz); + ret = crash_prepare_elf64_headers(cmem, IS_ENABLED(CONFIG_X86_64), addr, sz); out: vfree(cmem); @@ -349,7 +365,7 @@ int crash_load_segments(struct kimage *image) .buf_max = ULONG_MAX, .top_down = false }; /* Prepare elf headers and add a segment */ - ret = prepare_elf_headers(image, &kbuf.buffer, &kbuf.bufsz, &pnum); + ret = prepare_elf_headers(&kbuf.buffer, &kbuf.bufsz, &pnum); if (ret) return ret; @@ -386,8 +402,8 @@ int crash_load_segments(struct kimage *image) if (ret) return ret; image->elf_load_addr = kbuf.mem; - pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - image->elf_load_addr, kbuf.bufsz, kbuf.memsz); + kexec_dprintk("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + image->elf_load_addr, kbuf.bufsz, kbuf.memsz); return ret; } @@ -398,20 +414,26 @@ int crash_load_segments(struct kimage *image) #undef pr_fmt #define pr_fmt(fmt) "crash hp: " fmt -/* These functions provide the value for the sysfs crash_hotplug nodes */ -#ifdef CONFIG_HOTPLUG_CPU -int arch_crash_hotplug_cpu_support(void) +int arch_crash_hotplug_support(struct kimage *image, unsigned long kexec_flags) { - return crash_check_update_elfcorehdr(); -} -#endif -#ifdef CONFIG_MEMORY_HOTPLUG -int arch_crash_hotplug_memory_support(void) -{ - return crash_check_update_elfcorehdr(); -} +#ifdef CONFIG_KEXEC_FILE + if (image->file_mode) + return 1; #endif + /* + * Initially, crash hotplug support for kexec_load was added + * with the KEXEC_UPDATE_ELFCOREHDR flag. Later, this + * functionality was expanded to accommodate multiple kexec + * segment updates, leading to the introduction of the + * KEXEC_CRASH_HOTPLUG_SUPPORT kexec flag bit. Consequently, + * when the kexec tool sends either of these flags, it indicates + * that the required kexec segment (elfcorehdr) is excluded from + * the SHA calculation. + */ + return (kexec_flags & KEXEC_UPDATE_ELFCOREHDR || + kexec_flags & KEXEC_CRASH_HOTPLUG_SUPPORT); +} unsigned int arch_crash_get_elfcorehdr_size(void) { @@ -428,10 +450,12 @@ unsigned int arch_crash_get_elfcorehdr_size(void) /** * arch_crash_handle_hotplug_event() - Handle hotplug elfcorehdr changes * @image: a pointer to kexec_crash_image + * @arg: struct memory_notify handler for memory hotplug case and + * NULL for CPU hotplug case. * * Prepare the new elfcorehdr and replace the existing elfcorehdr. */ -void arch_crash_handle_hotplug_event(struct kimage *image) +void arch_crash_handle_hotplug_event(struct kimage *image, void *arg) { void *elfbuf = NULL, *old_elfcorehdr; unsigned long nr_mem_ranges; @@ -452,7 +476,7 @@ void arch_crash_handle_hotplug_event(struct kimage *image) * Create the new elfcorehdr reflecting the changes to CPU and/or * memory resources. 
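The new @arg parameter follows the convention spelled out in the kerneldoc above: NULL for CPU events, the memory notifier payload for memory events. A hypothetical call site, for illustration only (the real callers live in the generic crash hotplug code):

static void crash_hotplug_call_sketch(struct kimage *image,
                                      struct memory_notify *mn,
                                      bool is_memory_event)
{
        /* CPU hot(un)plug carries no auxiliary data */
        arch_crash_handle_hotplug_event(image, is_memory_event ? mn : NULL);
}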
*/ - if (prepare_elf_headers(image, &elfbuf, &elfsz, &nr_mem_ranges)) { + if (prepare_elf_headers(&elfbuf, &elfsz, &nr_mem_ranges)) { pr_err("unable to create new elfcorehdr"); goto out; } diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index afd09924094e..dd8748c45529 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -2,6 +2,7 @@ /* * Architecture specific OF callbacks. */ +#include <linux/acpi.h> #include <linux/export.h> #include <linux/io.h> #include <linux/interrupt.h> @@ -24,6 +25,7 @@ #include <asm/pci_x86.h> #include <asm/setup.h> #include <asm/i8259.h> +#include <asm/numa.h> #include <asm/prom.h> __initdata u64 initial_dtb; @@ -82,7 +84,7 @@ static int x86_of_pci_irq_enable(struct pci_dev *dev) ret = pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); if (ret) - return ret; + return pcibios_err_to_errno(ret); if (!pin) return 0; @@ -136,7 +138,8 @@ static void __init dtb_cpu_setup(void) pr_warn("%pOF: missing local APIC ID\n", dn); continue; } - generic_processor_info(apic_id); + topology_register_apic(apic_id, CPU_ACPIID_INVALID, true); + set_apicid_to_node(apic_id, of_node_to_nid(dn)); } } @@ -277,36 +280,40 @@ static void __init dtb_apic_setup(void) dtb_ioapic_setup(); } -#ifdef CONFIG_OF_EARLY_FLATTREE +static void __init x86_dtb_parse_smp_config(void) +{ + if (!of_have_populated_dt()) + return; + + dtb_setup_hpet(); + dtb_apic_setup(); +} + void __init x86_flattree_get_config(void) { +#ifdef CONFIG_OF_EARLY_FLATTREE u32 size, map_len; void *dt; - if (!initial_dtb) - return; + if (initial_dtb) { + map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), (u64)128); - map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), (u64)128); + dt = early_memremap(initial_dtb, map_len); + size = fdt_totalsize(dt); + if (map_len < size) { + early_memunmap(dt, map_len); + dt = early_memremap(initial_dtb, size); + map_len = size; + } - dt = early_memremap(initial_dtb, map_len); - size = fdt_totalsize(dt); - if (map_len < size) { - early_memunmap(dt, map_len); - dt = early_memremap(initial_dtb, size); - map_len = size; + early_init_dt_verify(dt, __pa(dt)); } - early_init_dt_verify(dt); unflatten_and_copy_device_tree(); - early_memunmap(dt, map_len); -} -#endif -void __init x86_dtb_init(void) -{ - if (!of_have_populated_dt()) - return; - - dtb_setup_hpet(); - dtb_apic_setup(); + if (initial_dtb) + early_memunmap(dt, map_len); +#endif + if (acpi_disabled && of_have_populated_dt()) + x86_init.mpparse.parse_smp_cfg = x86_dtb_parse_smp_config; } diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index f18ca44c904b..a7d562697e50 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -405,12 +405,12 @@ static void __die_header(const char *str, struct pt_regs *regs, long err) pr = IS_ENABLED(CONFIG_PREEMPT_RT) ? " PREEMPT_RT" : " PREEMPT"; printk(KERN_DEFAULT - "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter, - pr, + "Oops: %s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, + ++die_counter, pr, IS_ENABLED(CONFIG_SMP) ? " SMP" : "", debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "", - IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ? + IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION) ? (boot_cpu_has(X86_FEATURE_PTI) ? 
" PTI" : " NOPTI") : ""); } NOKPROBE_SYMBOL(__die_header); diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index fb8cf953380d..82b96ed9890a 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -532,9 +532,10 @@ u64 __init e820__range_update(u64 start, u64 size, enum e820_type old_type, enum return __e820__range_update(e820_table, start, size, old_type, new_type); } -static u64 __init e820__range_update_kexec(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type) +u64 __init e820__range_update_table(struct e820_table *t, u64 start, u64 size, + enum e820_type old_type, enum e820_type new_type) { - return __e820__range_update(e820_table_kexec, start, size, old_type, new_type); + return __e820__range_update(t, start, size, old_type, new_type); } /* Remove a range of memory from the E820 table: */ @@ -806,7 +807,7 @@ u64 __init e820__memblock_alloc_reserved(u64 size, u64 align) addr = memblock_phys_alloc(size, align); if (addr) { - e820__range_update_kexec(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED); + e820__range_update_table(e820_table_kexec, addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED); pr_info("update e820_table_kexec for e820__memblock_alloc_reserved()\n"); e820__update_table_kexec(); } @@ -827,7 +828,7 @@ u64 __init e820__memblock_alloc_reserved(u64 size, u64 align) /* * Find the highest page frame number we have available */ -static unsigned long __init e820_end_pfn(unsigned long limit_pfn, enum e820_type type) +static unsigned long __init e820__end_ram_pfn(unsigned long limit_pfn) { int i; unsigned long last_pfn = 0; @@ -838,7 +839,8 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, enum e820_type unsigned long start_pfn; unsigned long end_pfn; - if (entry->type != type) + if (entry->type != E820_TYPE_RAM && + entry->type != E820_TYPE_ACPI) continue; start_pfn = entry->addr >> PAGE_SHIFT; @@ -864,12 +866,12 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, enum e820_type unsigned long __init e820__end_of_ram_pfn(void) { - return e820_end_pfn(MAX_ARCH_PFN, E820_TYPE_RAM); + return e820__end_ram_pfn(MAX_ARCH_PFN); } unsigned long __init e820__end_of_low_ram_pfn(void) { - return e820_end_pfn(1UL << (32 - PAGE_SHIFT), E820_TYPE_RAM); + return e820__end_ram_pfn(1UL << (32 - PAGE_SHIFT)); } static void __init early_panic(char *msg) @@ -1016,15 +1018,6 @@ void __init e820__reserve_setup_data(void) e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); - /* - * SETUP_EFI and SETUP_IMA are supplied by kexec and do not need - * to be reserved. 
- */ - if (data->type != SETUP_EFI && data->type != SETUP_IMA) - e820__range_update_kexec(pa_data, - sizeof(*data) + data->len, - E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); - if (data->type == SETUP_INDIRECT) { len += data->len; early_memunmap(data, sizeof(*data)); @@ -1036,12 +1029,9 @@ void __init e820__reserve_setup_data(void) indirect = (struct setup_indirect *)data->data; - if (indirect->type != SETUP_INDIRECT) { + if (indirect->type != SETUP_INDIRECT) e820__range_update(indirect->addr, indirect->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); - e820__range_update_kexec(indirect->addr, indirect->len, - E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); - } } pa_data = pa_next; @@ -1049,7 +1039,6 @@ void __init e820__reserve_setup_data(void) } e820__update_table(e820_table); - e820__update_table(e820_table_kexec); pr_info("extended physical RAM map:\n"); e820__print_table("reserve setup_data"); @@ -1157,11 +1146,8 @@ void __init e820__reserve_resources(void) struct resource *res; u64 end; - res = memblock_alloc(sizeof(*res) * e820_table->nr_entries, + res = memblock_alloc_or_panic(sizeof(*res) * e820_table->nr_entries, SMP_CACHE_BYTES); - if (!res) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(*res) * e820_table->nr_entries); e820_res = res; for (i = 0; i < e820_table->nr_entries; i++) { diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index a6c1867fc7aa..6b6f32f40cbe 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -17,8 +17,8 @@ #include <linux/bcma/bcma.h> #include <linux/bcma/bcma_regs.h> #include <linux/platform_data/x86/apple.h> -#include <drm/i915_drm.h> -#include <drm/i915_pciids.h> +#include <drm/intel/i915_drm.h> +#include <drm/intel/pciids.h> #include <asm/pci-direct.h> #include <asm/dma.h> #include <asm/io_apic.h> @@ -518,47 +518,46 @@ static const struct intel_early_ops gen11_early_ops __initconst = { /* Intel integrated GPUs for which we need to reserve "stolen memory" */ static const struct pci_device_id intel_early_ids[] __initconst = { - INTEL_I830_IDS(&i830_early_ops), - INTEL_I845G_IDS(&i845_early_ops), - INTEL_I85X_IDS(&i85x_early_ops), - INTEL_I865G_IDS(&i865_early_ops), - INTEL_I915G_IDS(&gen3_early_ops), - INTEL_I915GM_IDS(&gen3_early_ops), - INTEL_I945G_IDS(&gen3_early_ops), - INTEL_I945GM_IDS(&gen3_early_ops), - INTEL_VLV_IDS(&gen6_early_ops), - INTEL_PINEVIEW_G_IDS(&gen3_early_ops), - INTEL_PINEVIEW_M_IDS(&gen3_early_ops), - INTEL_I965G_IDS(&gen3_early_ops), - INTEL_G33_IDS(&gen3_early_ops), - INTEL_I965GM_IDS(&gen3_early_ops), - INTEL_GM45_IDS(&gen3_early_ops), - INTEL_G45_IDS(&gen3_early_ops), - INTEL_IRONLAKE_D_IDS(&gen3_early_ops), - INTEL_IRONLAKE_M_IDS(&gen3_early_ops), - INTEL_SNB_D_IDS(&gen6_early_ops), - INTEL_SNB_M_IDS(&gen6_early_ops), - INTEL_IVB_M_IDS(&gen6_early_ops), - INTEL_IVB_D_IDS(&gen6_early_ops), - INTEL_HSW_IDS(&gen6_early_ops), - INTEL_BDW_IDS(&gen8_early_ops), - INTEL_CHV_IDS(&chv_early_ops), - INTEL_SKL_IDS(&gen9_early_ops), - INTEL_BXT_IDS(&gen9_early_ops), - INTEL_KBL_IDS(&gen9_early_ops), - INTEL_CFL_IDS(&gen9_early_ops), - INTEL_GLK_IDS(&gen9_early_ops), - INTEL_CNL_IDS(&gen9_early_ops), - INTEL_ICL_11_IDS(&gen11_early_ops), - INTEL_EHL_IDS(&gen11_early_ops), - INTEL_JSL_IDS(&gen11_early_ops), - INTEL_TGL_12_IDS(&gen11_early_ops), - INTEL_RKL_IDS(&gen11_early_ops), - INTEL_ADLS_IDS(&gen11_early_ops), - INTEL_ADLP_IDS(&gen11_early_ops), - INTEL_ADLN_IDS(&gen11_early_ops), - INTEL_RPLS_IDS(&gen11_early_ops), - INTEL_RPLP_IDS(&gen11_early_ops), + 
INTEL_I830_IDS(INTEL_VGA_DEVICE, &i830_early_ops), + INTEL_I845G_IDS(INTEL_VGA_DEVICE, &i845_early_ops), + INTEL_I85X_IDS(INTEL_VGA_DEVICE, &i85x_early_ops), + INTEL_I865G_IDS(INTEL_VGA_DEVICE, &i865_early_ops), + INTEL_I915G_IDS(INTEL_VGA_DEVICE, &gen3_early_ops), + INTEL_I915GM_IDS(INTEL_VGA_DEVICE, &gen3_early_ops), + INTEL_I945G_IDS(INTEL_VGA_DEVICE, &gen3_early_ops), + INTEL_I945GM_IDS(INTEL_VGA_DEVICE, &gen3_early_ops), + INTEL_VLV_IDS(INTEL_VGA_DEVICE, &gen6_early_ops), + INTEL_PNV_IDS(INTEL_VGA_DEVICE, &gen3_early_ops), + INTEL_I965G_IDS(INTEL_VGA_DEVICE, &gen3_early_ops), + INTEL_G33_IDS(INTEL_VGA_DEVICE, &gen3_early_ops), + INTEL_I965GM_IDS(INTEL_VGA_DEVICE, &gen3_early_ops), + INTEL_GM45_IDS(INTEL_VGA_DEVICE, &gen3_early_ops), + INTEL_G45_IDS(INTEL_VGA_DEVICE, &gen3_early_ops), + INTEL_ILK_IDS(INTEL_VGA_DEVICE, &gen3_early_ops), + INTEL_SNB_IDS(INTEL_VGA_DEVICE, &gen6_early_ops), + INTEL_IVB_IDS(INTEL_VGA_DEVICE, &gen6_early_ops), + INTEL_HSW_IDS(INTEL_VGA_DEVICE, &gen6_early_ops), + INTEL_BDW_IDS(INTEL_VGA_DEVICE, &gen8_early_ops), + INTEL_CHV_IDS(INTEL_VGA_DEVICE, &chv_early_ops), + INTEL_SKL_IDS(INTEL_VGA_DEVICE, &gen9_early_ops), + INTEL_BXT_IDS(INTEL_VGA_DEVICE, &gen9_early_ops), + INTEL_KBL_IDS(INTEL_VGA_DEVICE, &gen9_early_ops), + INTEL_CFL_IDS(INTEL_VGA_DEVICE, &gen9_early_ops), + INTEL_WHL_IDS(INTEL_VGA_DEVICE, &gen9_early_ops), + INTEL_CML_IDS(INTEL_VGA_DEVICE, &gen9_early_ops), + INTEL_GLK_IDS(INTEL_VGA_DEVICE, &gen9_early_ops), + INTEL_CNL_IDS(INTEL_VGA_DEVICE, &gen9_early_ops), + INTEL_ICL_IDS(INTEL_VGA_DEVICE, &gen11_early_ops), + INTEL_EHL_IDS(INTEL_VGA_DEVICE, &gen11_early_ops), + INTEL_JSL_IDS(INTEL_VGA_DEVICE, &gen11_early_ops), + INTEL_TGL_IDS(INTEL_VGA_DEVICE, &gen11_early_ops), + INTEL_RKL_IDS(INTEL_VGA_DEVICE, &gen11_early_ops), + INTEL_ADLS_IDS(INTEL_VGA_DEVICE, &gen11_early_ops), + INTEL_ADLP_IDS(INTEL_VGA_DEVICE, &gen11_early_ops), + INTEL_ADLN_IDS(INTEL_VGA_DEVICE, &gen11_early_ops), + INTEL_RPLS_IDS(INTEL_VGA_DEVICE, &gen11_early_ops), + INTEL_RPLU_IDS(INTEL_VGA_DEVICE, &gen11_early_ops), + INTEL_RPLP_IDS(INTEL_VGA_DEVICE, &gen11_early_ops), }; struct resource intel_graphics_stolen_res __ro_after_init = DEFINE_RES_MEM(0, 0); @@ -779,13 +778,13 @@ static int __init check_dev_quirk(int num, int slot, int func) type = read_pci_config_byte(num, slot, func, PCI_HEADER_TYPE); - if ((type & 0x7f) == PCI_HEADER_TYPE_BRIDGE) { + if ((type & PCI_HEADER_TYPE_MASK) == PCI_HEADER_TYPE_BRIDGE) { sec = read_pci_config_byte(num, slot, func, PCI_SECONDARY_BUS); if (sec > num) early_pci_scan_bus(sec); } - if (!(type & 0x80)) + if (!(type & PCI_HEADER_TYPE_MFD)) return -1; return 0; diff --git a/arch/x86/kernel/eisa.c b/arch/x86/kernel/eisa.c index e963344b0449..9535a6507db7 100644 --- a/arch/x86/kernel/eisa.c +++ b/arch/x86/kernel/eisa.c @@ -2,6 +2,7 @@ /* * EISA specific code */ +#include <linux/cc_platform.h> #include <linux/ioport.h> #include <linux/eisa.h> #include <linux/io.h> @@ -10,15 +11,15 @@ static __init int eisa_bus_probe(void) { - void __iomem *p; + u32 *p; - if (xen_pv_domain() && !xen_initial_domain()) + if ((xen_pv_domain() && !xen_initial_domain()) || cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) return 0; - p = ioremap(0x0FFFD9, 4); - if (p && readl(p) == 'E' + ('I' << 8) + ('S' << 16) + ('A' << 24)) + p = memremap(0x0FFFD9, 4, MEMREMAP_WB); + if (p && *p == 'E' + ('I' << 8) + ('S' << 16) + ('A' << 24)) EISA_bus = 1; - iounmap(p); + memunmap(p); return 0; } subsys_initcall(eisa_bus_probe); diff --git a/arch/x86/kernel/espfix_64.c 
b/arch/x86/kernel/espfix_64.c index 16f9814c9be0..6726e0473d0b 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -106,6 +106,10 @@ void __init init_espfix_bsp(void) pgd_t *pgd; p4d_t *p4d; + /* FRED systems always restore the full value of %rsp */ + if (cpu_feature_enabled(X86_FEATURE_FRED)) + return; + /* Install the espfix pud into the kernel page directory */ pgd = &init_top_pgt[pgd_index(ESPFIX_BASE_ADDR)]; p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR); @@ -129,6 +133,10 @@ void init_espfix_ap(int cpu) void *stack_page; pteval_t ptemask; + /* FRED systems always restore the full value of %rsp */ + if (cpu_feature_enabled(X86_FEATURE_FRED)) + return; + /* We only have to do this once... */ if (likely(per_cpu(espfix_stack, cpu))) return; /* Already initialized */ diff --git a/arch/x86/kernel/fpu/bugs.c b/arch/x86/kernel/fpu/bugs.c index 794e70151203..edbafc5940e3 100644 --- a/arch/x86/kernel/fpu/bugs.c +++ b/arch/x86/kernel/fpu/bugs.c @@ -2,6 +2,9 @@ /* * x86 FPU bug checks: */ +#include <linux/printk.h> + +#include <asm/cpufeature.h> #include <asm/fpu/api.h> /* diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index a21a4d0ecc34..1209c7aebb21 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -145,8 +145,8 @@ void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask) asm volatile( "fnclex\n\t" "emms\n\t" - "fildl %P[addr]" /* set F?P to defined value */ - : : [addr] "m" (fpstate)); + "fildl %[addr]" /* set F?P to defined value */ + : : [addr] "m" (*fpstate)); } if (use_xsave()) { @@ -308,7 +308,7 @@ EXPORT_SYMBOL_GPL(fpu_update_guest_xfd); * Must be invoked from KVM after a VMEXIT before enabling interrupts when * XFD write emulation is disabled. This is required because the guest can * freely modify XFD and the state at VMEXIT is not guaranteed to be the - * same as the state on VMENTER. So software state has to be udpated before + * same as the state on VMENTER. So software state has to be updated before * any operation which depends on it can take place. * * Note: It can be invoked unconditionally even when write emulation is diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 6bc1eb2a21bd..887b0b8e21e3 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -190,7 +190,8 @@ int ssp_get(struct task_struct *target, const struct user_regset *regset, struct fpu *fpu = &target->thread.fpu; struct cet_user_state *cetregs; - if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK)) + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || + !ssp_active(target, regset)) return -ENODEV; sync_fpstate(fpu); diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 558076dbde5b..8f62e0666dea 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -156,10 +156,11 @@ static inline bool save_xstate_epilog(void __user *buf, int ia32_frame, return !err; } -static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) +static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf, u32 pkru) { if (use_xsave()) - return xsave_to_user_sigframe(buf); + return xsave_to_user_sigframe(buf, pkru); + if (use_fxsr()) return fxsave_to_user_sigframe((struct fxregs_state __user *) buf); else @@ -185,7 +186,7 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) * For [f]xsave state, update the SW reserved fields in the [f]xsave frame * indicating the absence/presence of the extended state to the user. 
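The operand change in restore_fpregs_from_fpstate() above is subtle but meaningful: "m" (fpstate) describes the memory holding the pointer variable itself, whereas "m" (*fpstate) describes the buffer that FILDL actually reads, which also lets the plain %[addr] form replace the old %P modifier. A minimal standalone illustration of that constraint distinction (hypothetical helper, not from the patch):

static inline int load_via_mem_operand(const int *p)
{
        int v;

        /* the source operand is the pointed-to object, not the pointer */
        asm ("movl %1, %0" : "=r" (v) : "m" (*p));
        return v;
}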
*/ -bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) +bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size, u32 pkru) { struct task_struct *tsk = current; struct fpstate *fpstate = tsk->thread.fpu.fpstate; @@ -228,7 +229,7 @@ retry: fpregs_restore_userregs(); pagefault_disable(); - ret = copy_fpregs_to_sigframe(buf_fx); + ret = copy_fpregs_to_sigframe(buf_fx, pkru); pagefault_enable(); fpregs_unlock(); @@ -274,12 +275,13 @@ static int __restore_fpregs_from_user(void __user *buf, u64 ufeatures, * Attempt to restore the FPU registers directly from user memory. * Pagefaults are handled and any errors returned are fatal. */ -static bool restore_fpregs_from_user(void __user *buf, u64 xrestore, - bool fx_only, unsigned int size) +static bool restore_fpregs_from_user(void __user *buf, u64 xrestore, bool fx_only) { struct fpu *fpu = ¤t->thread.fpu; int ret; + /* Restore enabled features only. */ + xrestore &= fpu->fpstate->user_xfeatures; retry: fpregs_lock(); /* Ensure that XFD is up to date */ @@ -309,7 +311,7 @@ retry: if (ret != X86_TRAP_PF) return false; - if (!fault_in_readable(buf, size)) + if (!fault_in_readable(buf, fpu->fpstate->user_size)) goto retry; return false; } @@ -339,7 +341,6 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, struct user_i387_ia32_struct env; bool success, fx_only = false; union fpregs_state *fpregs; - unsigned int state_size; u64 user_xfeatures = 0; if (use_xsave()) { @@ -349,17 +350,14 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, return false; fx_only = !fx_sw_user.magic1; - state_size = fx_sw_user.xstate_size; user_xfeatures = fx_sw_user.xfeatures; } else { user_xfeatures = XFEATURE_MASK_FPSSE; - state_size = fpu->fpstate->user_size; } if (likely(!ia32_fxstate)) { /* Restore the FPU registers directly from user memory. */ - return restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only, - state_size); + return restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only); } /* diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 117e74c44e75..27417b685c1d 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -13,16 +13,20 @@ #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/vmalloc.h> +#include <linux/coredump.h> #include <asm/fpu/api.h> #include <asm/fpu/regset.h> #include <asm/fpu/signal.h> #include <asm/fpu/xcr.h> +#include <asm/cpuid.h> #include <asm/tlbflush.h> #include <asm/prctl.h> #include <asm/elf.h> +#include <uapi/asm/elf.h> + #include "context.h" #include "internal.h" #include "legacy.h" @@ -178,10 +182,11 @@ void fpu__init_cpu_xstate(void) * Must happen after CR4 setup and before xsetbv() to allow KVM * lazy passthrough. Write independent of the dynamic state static * key as that does not work on the boot CPU. This also ensures - * that any stale state is wiped out from XFD. + * that any stale state is wiped out from XFD. Reset the per CPU + * xfd cache too. */ if (cpu_feature_enabled(X86_FEATURE_XFD)) - wrmsrl(MSR_IA32_XFD, init_fpstate.xfd); + xfd_set_state(init_fpstate.xfd); /* * XCR_XFEATURE_ENABLED_MASK (aka. 
XCR0) sets user features @@ -228,7 +233,7 @@ static void __init setup_xstate_cache(void) xmm_space); for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { - cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); + cpuid_count(CPUID_LEAF_XSTATE, i, &eax, &ebx, &ecx, &edx); xstate_sizes[i] = eax; xstate_flags[i] = ecx; @@ -394,7 +399,7 @@ int xfeature_size(int xfeature_nr) u32 eax, ebx, ecx, edx; CHECK_XFEATURE(xfeature_nr); - cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); + cpuid_count(CPUID_LEAF_XSTATE, xfeature_nr, &eax, &ebx, &ecx, &edx); return eax; } @@ -437,9 +442,9 @@ static void __init __xstate_dump_leaves(void) * just in case there are some goodies up there */ for (i = 0; i < XFEATURE_MAX + 10; i++) { - cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); + cpuid_count(CPUID_LEAF_XSTATE, i, &eax, &ebx, &ecx, &edx); pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n", - XSTATE_CPUID, i, eax, ebx, ecx, edx); + CPUID_LEAF_XSTATE, i, eax, ebx, ecx, edx); } } @@ -480,7 +485,7 @@ static int __init check_xtile_data_against_struct(int size) * Check the maximum palette id: * eax: the highest numbered palette subleaf. */ - cpuid_count(TILE_CPUID, 0, &max_palid, &ebx, &ecx, &edx); + cpuid_count(CPUID_LEAF_TILE, 0, &max_palid, &ebx, &ecx, &edx); /* * Cross-check each tile size and find the maximum number of @@ -494,7 +499,7 @@ static int __init check_xtile_data_against_struct(int size) * eax[31:16]: bytes per title * ebx[31:16]: the max names (or max number of tiles) */ - cpuid_count(TILE_CPUID, palid, &eax, &ebx, &edx, &edx); + cpuid_count(CPUID_LEAF_TILE, palid, &eax, &ebx, &edx, &edx); tile_size = eax >> 16; max = ebx >> 16; @@ -629,7 +634,7 @@ static unsigned int __init get_compacted_size(void) * are no supervisor states, but XSAVEC still uses compacted * format. */ - cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); + cpuid_count(CPUID_LEAF_XSTATE, 1, &eax, &ebx, &ecx, &edx); return ebx; } @@ -670,7 +675,7 @@ static unsigned int __init get_xsave_size_user(void) * containing all the *user* state components * corresponding to bits currently set in XCR0. */ - cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); + cpuid_count(CPUID_LEAF_XSTATE, 0, &eax, &ebx, &ecx, &edx); return ebx; } @@ -759,21 +764,16 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) return; } - if (boot_cpu_data.cpuid_level < XSTATE_CPUID) { - WARN_ON_FPU(1); - return; - } - /* * Find user xstates supported by the processor. */ - cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); + cpuid_count(CPUID_LEAF_XSTATE, 0, &eax, &ebx, &ecx, &edx); fpu_kernel_cfg.max_features = eax + ((u64)edx << 32); /* * Find supervisor xstates supported by the processor. */ - cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); + cpuid_count(CPUID_LEAF_XSTATE, 1, &eax, &ebx, &ecx, &edx); fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32); if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { @@ -787,6 +787,9 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) goto out_disable; } + fpu_kernel_cfg.independent_features = fpu_kernel_cfg.max_features & + XFEATURE_MASK_INDEPENDENT; + /* * Clear XSAVE features that are disabled in the normal CPUID. */ @@ -990,6 +993,20 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) return __raw_xsave_addr(xsave, xfeature_nr); } +EXPORT_SYMBOL_GPL(get_xsave_addr); + +/* + * Given an xstate feature nr, calculate where in the xsave buffer the state is. 
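For user states in the standard (non-compacted) layout, the per-feature offsets cached in xstate_offsets[] come straight from CPUID leaf 0xD: sub-leaf n reports the feature's size in EAX and its offset in EBX. A small sketch of that lookup (illustrative only, hypothetical helper name):

static u32 xstate_offset_from_cpuid(int xfeature_nr)
{
        u32 eax, ebx, ecx, edx;

        cpuid_count(CPUID_LEAF_XSTATE, xfeature_nr, &eax, &ebx, &ecx, &edx);
        return ebx;     /* EAX: size in bytes, EBX: offset in the standard format */
}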
+ * The xsave buffer should be in standard format, not compacted (e.g. user mode + * signal frames). + */ +void __user *get_xsave_addr_user(struct xregs_state __user *xsave, int xfeature_nr) +{ + if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) + return NULL; + + return (void __user *)xsave + xstate_offsets[xfeature_nr]; +} #ifdef CONFIG_ARCH_HAS_PKEYS @@ -1433,8 +1450,8 @@ static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor) return rstor; /* - * XSAVE(S): clone(), fpu_swap_kvm_fpu() - * XRSTORS(S): fpu_swap_kvm_fpu() + * XSAVE(S): clone(), fpu_swap_kvm_fpstate() + * XRSTORS(S): fpu_swap_kvm_fpstate() */ /* @@ -1836,3 +1853,89 @@ int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns, return 0; } #endif /* CONFIG_PROC_PID_ARCH_STATUS */ + +#ifdef CONFIG_COREDUMP +static const char owner_name[] = "LINUX"; + +/* + * Dump type, size, offset and flag values for every xfeature that is present. + */ +static int dump_xsave_layout_desc(struct coredump_params *cprm) +{ + int num_records = 0; + int i; + + for_each_extended_xfeature(i, fpu_user_cfg.max_features) { + struct x86_xfeat_component xc = { + .type = i, + .size = xstate_sizes[i], + .offset = xstate_offsets[i], + /* reserved for future use */ + .flags = 0, + }; + + if (!dump_emit(cprm, &xc, sizeof(xc))) + return 0; + + num_records++; + } + return num_records; +} + +static u32 get_xsave_desc_size(void) +{ + u32 cnt = 0; + u32 i; + + for_each_extended_xfeature(i, fpu_user_cfg.max_features) + cnt++; + + return cnt * (sizeof(struct x86_xfeat_component)); +} + +int elf_coredump_extra_notes_write(struct coredump_params *cprm) +{ + int num_records = 0; + struct elf_note en; + + if (!fpu_user_cfg.max_features) + return 0; + + en.n_namesz = sizeof(owner_name); + en.n_descsz = get_xsave_desc_size(); + en.n_type = NT_X86_XSAVE_LAYOUT; + + if (!dump_emit(cprm, &en, sizeof(en))) + return 1; + if (!dump_emit(cprm, owner_name, en.n_namesz)) + return 1; + if (!dump_align(cprm, 4)) + return 1; + + num_records = dump_xsave_layout_desc(cprm); + if (!num_records) + return 1; + + /* Total size should be equal to the number of records */ + if ((sizeof(struct x86_xfeat_component) * num_records) != en.n_descsz) + return 1; + + return 0; +} + +int elf_coredump_extra_notes_size(void) +{ + int size; + + if (!fpu_user_cfg.max_features) + return 0; + + /* .note header */ + size = sizeof(struct elf_note); + /* Name plus alignment to 4 bytes */ + size += roundup(sizeof(owner_name), 4); + size += get_xsave_desc_size(); + + return size; +} +#endif /* CONFIG_COREDUMP */ diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index 3518fb26d06b..aa16f1a1bbcf 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -54,7 +54,7 @@ extern int copy_sigframe_from_user_to_xstate(struct task_struct *tsk, const void extern void fpu__init_cpu_xstate(void); extern void fpu__init_system_xstate(unsigned int legacy_size); -extern void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); +extern void __user *get_xsave_addr_user(struct xregs_state __user *xsave, int xfeature_nr); static inline u64 xfeatures_mask_supervisor(void) { @@ -64,9 +64,31 @@ static inline u64 xfeatures_mask_supervisor(void) static inline u64 xfeatures_mask_independent(void) { if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) - return XFEATURE_MASK_INDEPENDENT & ~XFEATURE_MASK_LBR; + return fpu_kernel_cfg.independent_features & ~XFEATURE_MASK_LBR; - return XFEATURE_MASK_INDEPENDENT; + return fpu_kernel_cfg.independent_features; +} + 
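On the consumer side, a debugger or crash tool would walk the NT_X86_XSAVE_LAYOUT descriptor as an array of those records; n_descsz is an exact multiple of the record size, as the check above enforces. A hypothetical userspace sketch (local struct mirrors the initializer above, field widths assumed to be 32 bits):

#include <stdint.h>
#include <stdio.h>

struct xfeat_component_sketch {
        uint32_t type, size, offset, flags;
};

static void parse_xsave_layout_note(const void *desc, size_t descsz)
{
        const struct xfeat_component_sketch *xc = desc;
        size_t i;

        for (i = 0; i < descsz / sizeof(*xc); i++)
                printf("xfeature %u: %u bytes at offset %u\n",
                       xc[i].type, xc[i].size, xc[i].offset);
}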
+/* + * Update the value of PKRU register that was already pushed onto the signal frame. + */ +static inline int update_pkru_in_sigframe(struct xregs_state __user *buf, u64 mask, u32 pkru) +{ + u64 xstate_bv; + int err; + + if (unlikely(!cpu_feature_enabled(X86_FEATURE_OSPKE))) + return 0; + + /* Mark PKRU as in-use so that it is restored correctly. */ + xstate_bv = (mask & xfeatures_in_use()) | XFEATURE_MASK_PKRU; + + err = __put_user(xstate_bv, &buf->header.xfeatures); + if (err) + return err; + + /* Update PKRU value in the userspace xsave buffer. */ + return __put_user(pkru, (unsigned int __user *)get_xsave_addr_user(buf, XFEATURE_PKRU)); } /* XSAVE/XRSTOR wrapper functions */ @@ -108,21 +130,17 @@ static inline u64 xfeatures_mask_independent(void) * Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT * supports modified optimization which is not supported by XSAVE. * - * We use XSAVE as a fallback. - * - * The 661 label is defined in the ALTERNATIVE* macros as the address of the - * original instruction which gets replaced. We need to use it here as the - * address of the instruction where we might get an exception at. + * Use XSAVE as a fallback. */ #define XSTATE_XSAVE(st, lmask, hmask, err) \ - asm volatile(ALTERNATIVE_3(XSAVE, \ + asm volatile("1: " ALTERNATIVE_3(XSAVE, \ XSAVEOPT, X86_FEATURE_XSAVEOPT, \ XSAVEC, X86_FEATURE_XSAVEC, \ XSAVES, X86_FEATURE_XSAVES) \ "\n" \ "xor %[err], %[err]\n" \ "3:\n" \ - _ASM_EXTABLE_TYPE_REG(661b, 3b, EX_TYPE_EFAULT_REG, %[err]) \ + _ASM_EXTABLE_TYPE_REG(1b, 3b, EX_TYPE_EFAULT_REG, %[err]) \ : [err] "=r" (err) \ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ : "memory") @@ -132,11 +150,11 @@ static inline u64 xfeatures_mask_independent(void) * XSAVE area format. */ #define XSTATE_XRESTORE(st, lmask, hmask) \ - asm volatile(ALTERNATIVE(XRSTOR, \ + asm volatile("1: " ALTERNATIVE(XRSTOR, \ XRSTORS, X86_FEATURE_XSAVES) \ "\n" \ "3:\n" \ - _ASM_EXTABLE_TYPE(661b, 3b, EX_TYPE_FPU_RESTORE) \ + _ASM_EXTABLE_TYPE(1b, 3b, EX_TYPE_FPU_RESTORE) \ : \ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ : "memory") @@ -148,20 +166,26 @@ static inline void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rs #endif #ifdef CONFIG_X86_64 +static inline void xfd_set_state(u64 xfd) +{ + wrmsrl(MSR_IA32_XFD, xfd); + __this_cpu_write(xfd_state, xfd); +} + static inline void xfd_update_state(struct fpstate *fpstate) { if (fpu_state_size_dynamic()) { u64 xfd = fpstate->xfd; - if (__this_cpu_read(xfd_state) != xfd) { - wrmsrl(MSR_IA32_XFD, xfd); - __this_cpu_write(xfd_state, xfd); - } + if (__this_cpu_read(xfd_state) != xfd) + xfd_set_state(xfd); } } extern int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu); #else +static inline void xfd_set_state(u64 xfd) { } + static inline void xfd_update_state(struct fpstate *fpstate) { } static inline int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu) { @@ -254,7 +278,7 @@ static inline u64 xfeatures_need_sigframe_write(void) * The caller has to zero buf::header before calling this because XSAVE* * does not touch the reserved fields in the header. 
*/ -static inline int xsave_to_user_sigframe(struct xregs_state __user *buf) +static inline int xsave_to_user_sigframe(struct xregs_state __user *buf, u32 pkru) { /* * Include the features which are not xsaved/rstored by the kernel @@ -279,6 +303,9 @@ static inline int xsave_to_user_sigframe(struct xregs_state __user *buf) XSTATE_OP(XSAVE, buf, lmask, hmask, err); clac(); + if (!err) + err = update_pkru_in_sigframe(buf, mask, pkru); + return err; } diff --git a/arch/x86/kernel/fred.c b/arch/x86/kernel/fred.c new file mode 100644 index 000000000000..5e2cd1004980 --- /dev/null +++ b/arch/x86/kernel/fred.c @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/kernel.h> + +#include <asm/desc.h> +#include <asm/fred.h> +#include <asm/tlbflush.h> +#include <asm/traps.h> + +/* #DB in the kernel would imply the use of a kernel debugger. */ +#define FRED_DB_STACK_LEVEL 1UL +#define FRED_NMI_STACK_LEVEL 2UL +#define FRED_MC_STACK_LEVEL 2UL +/* + * #DF is the highest level because a #DF means "something went wrong + * *while delivering an exception*." The number of cases for which that + * can happen with FRED is drastically reduced and basically amounts to + * "the stack you pointed me to is broken." Thus, always change stacks + * on #DF, which means it should be at the highest level. + */ +#define FRED_DF_STACK_LEVEL 3UL + +#define FRED_STKLVL(vector, lvl) ((lvl) << (2 * (vector))) + +DEFINE_PER_CPU(unsigned long, fred_rsp0); +EXPORT_PER_CPU_SYMBOL(fred_rsp0); + +void cpu_init_fred_exceptions(void) +{ + /* When FRED is enabled by default, remove this log message */ + pr_info("Initialize FRED on CPU%d\n", smp_processor_id()); + + /* + * If a kernel event is delivered before a CPU goes to user level for + * the first time, its SS is NULL thus NULL is pushed into the SS field + * of the FRED stack frame. But before ERETS is executed, the CPU may + * context switch to another task and go to user level. Then when the + * CPU comes back to kernel mode, SS is changed to __KERNEL_DS. Later + * when ERETS is executed to return from the kernel event handler, a #GP + * fault is generated because SS doesn't match the SS saved in the FRED + * stack frame. + * + * Initialize SS to __KERNEL_DS when enabling FRED to avoid such #GPs. + */ + loadsegment(ss, __KERNEL_DS); + + wrmsrl(MSR_IA32_FRED_CONFIG, + /* Reserve for CALL emulation */ + FRED_CONFIG_REDZONE | + FRED_CONFIG_INT_STKLVL(0) | + FRED_CONFIG_ENTRYPOINT(asm_fred_entrypoint_user)); + + wrmsrl(MSR_IA32_FRED_STKLVLS, 0); + + /* + * Ater a CPU offline/online cycle, the FRED RSP0 MSR should be + * resynchronized with its per-CPU cache. + */ + wrmsrl(MSR_IA32_FRED_RSP0, __this_cpu_read(fred_rsp0)); + + wrmsrl(MSR_IA32_FRED_RSP1, 0); + wrmsrl(MSR_IA32_FRED_RSP2, 0); + wrmsrl(MSR_IA32_FRED_RSP3, 0); + + /* Enable FRED */ + cr4_set_bits(X86_CR4_FRED); + /* Any further IDT use is a bug */ + idt_invalidate(); + + /* Use int $0x80 for 32-bit system calls in FRED mode */ + setup_clear_cpu_cap(X86_FEATURE_SYSENTER32); + setup_clear_cpu_cap(X86_FEATURE_SYSCALL32); +} + +/* Must be called after setup_cpu_entry_areas() */ +void cpu_init_fred_rsps(void) +{ + /* + * The purpose of separate stacks for NMI, #DB and #MC *in the kernel* + * (remember that user space faults are always taken on stack level 0) + * is to avoid overflowing the kernel stack. 
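A quick worked example of the FRED_STKLVL() packing used below (not part of the patch): each vector gets a 2-bit field at bit position 2 * vector, so #DB (vector 1) at level 1 and #DF (vector 8) at level 3 come out as:

static_assert(FRED_STKLVL(X86_TRAP_DB, FRED_DB_STACK_LEVEL) == (1UL << 2));
static_assert(FRED_STKLVL(X86_TRAP_DF, FRED_DF_STACK_LEVEL) == (3UL << 16));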
+ */ + wrmsrl(MSR_IA32_FRED_STKLVLS, + FRED_STKLVL(X86_TRAP_DB, FRED_DB_STACK_LEVEL) | + FRED_STKLVL(X86_TRAP_NMI, FRED_NMI_STACK_LEVEL) | + FRED_STKLVL(X86_TRAP_MC, FRED_MC_STACK_LEVEL) | + FRED_STKLVL(X86_TRAP_DF, FRED_DF_STACK_LEVEL)); + + /* The FRED equivalents to IST stacks... */ + wrmsrl(MSR_IA32_FRED_RSP1, __this_cpu_ist_top_va(DB)); + wrmsrl(MSR_IA32_FRED_RSP2, __this_cpu_ist_top_va(NMI)); + wrmsrl(MSR_IA32_FRED_RSP3, __this_cpu_ist_top_va(DF)); +} diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 12df54ff0e81..166bc0ea3bdf 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -25,6 +25,7 @@ #include <linux/memory.h> #include <linux/vmalloc.h> #include <linux/set_memory.h> +#include <linux/execmem.h> #include <trace/syscall.h> @@ -117,10 +118,13 @@ ftrace_modify_code_direct(unsigned long ip, const char *old_code, return ret; /* replace the text with the new text */ - if (ftrace_poke_late) + if (ftrace_poke_late) { text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL); - else - text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE); + } else { + mutex_lock(&text_mutex); + text_poke((void *)ip, new_code, MCOUNT_INSN_SIZE); + mutex_unlock(&text_mutex); + } return 0; } @@ -260,25 +264,14 @@ void arch_ftrace_update_code(int command) /* Currently only x86_64 supports dynamic trampolines */ #ifdef CONFIG_X86_64 -#ifdef CONFIG_MODULES -#include <linux/moduleloader.h> -/* Module allocation simplifies allocating memory for code */ static inline void *alloc_tramp(unsigned long size) { - return module_alloc(size); + return execmem_alloc(EXECMEM_FTRACE, size); } static inline void tramp_free(void *tramp) { - module_memfree(tramp); + execmem_free(tramp); } -#else -/* Trampolines can only be created if modules are supported */ -static inline void *alloc_tramp(unsigned long size) -{ - return NULL; -} -static inline void tramp_free(void *tramp) { } -#endif /* Defined as markers to the end of the ftrace default trampolines */ extern void ftrace_regs_caller_end(void); @@ -307,7 +300,8 @@ union ftrace_op_code_union { } __attribute__((packed)); }; -#define RET_SIZE (IS_ENABLED(CONFIG_RETPOLINE) ? 5 : 1 + IS_ENABLED(CONFIG_SLS)) +#define RET_SIZE \ + (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) ? 
5 : 1 + IS_ENABLED(CONFIG_MITIGATION_SLS)) static unsigned long create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) @@ -327,7 +321,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) unsigned const char op_ref[] = { 0x48, 0x8b, 0x15 }; unsigned const char retq[] = { RET_INSN_OPCODE, INT3_INSN_OPCODE }; union ftrace_op_code_union op_ptr; - int ret; + void *ret; if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { start_offset = (unsigned long)ftrace_regs_caller; @@ -358,15 +352,15 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) npages = DIV_ROUND_UP(*tramp_size, PAGE_SIZE); /* Copy ftrace_caller onto the trampoline memory */ - ret = copy_from_kernel_nofault(trampoline, (void *)start_offset, size); - if (WARN_ON(ret < 0)) + ret = text_poke_copy(trampoline, (void *)start_offset, size); + if (WARN_ON(!ret)) goto fail; ip = trampoline + size; if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) __text_gen_insn(ip, JMP32_INSN_OPCODE, ip, x86_return_thunk, JMP32_INSN_SIZE); else - memcpy(ip, retq, sizeof(retq)); + text_poke_copy(ip, retq, sizeof(retq)); /* No need to test direct calls on created trampolines */ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { @@ -374,8 +368,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) ip = trampoline + (jmp_offset - start_offset); if (WARN_ON(*(char *)ip != 0x75)) goto fail; - ret = copy_from_kernel_nofault(ip, x86_nops[2], 2); - if (ret < 0) + if (!text_poke_copy(ip, x86_nops[2], 2)) goto fail; } @@ -388,7 +381,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) */ ptr = (unsigned long *)(trampoline + size + RET_SIZE); - *ptr = (unsigned long)ops; + text_poke_copy(ptr, &ops, sizeof(unsigned long)); op_offset -= start_offset; memcpy(&op_ptr, trampoline + op_offset, OP_REF_SIZE); @@ -404,7 +397,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) op_ptr.offset = offset; /* put in the new offset to the ftrace_ops */ - memcpy(trampoline + op_offset, &op_ptr, OP_REF_SIZE); + text_poke_copy(trampoline + op_offset, &op_ptr, OP_REF_SIZE); /* put in the call to the function */ mutex_lock(&text_mutex); @@ -414,9 +407,9 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) * the depth accounting before the call already. */ dest = ftrace_ops_get_func(ops); - memcpy(trampoline + call_offset, - text_gen_insn(CALL_INSN_OPCODE, trampoline + call_offset, dest), - CALL_INSN_SIZE); + text_poke_copy_locked(trampoline + call_offset, + text_gen_insn(CALL_INSN_OPCODE, trampoline + call_offset, dest), + CALL_INSN_SIZE, false); mutex_unlock(&text_mutex); /* ALLOC_TRAMP flags lets us know we created it */ @@ -614,16 +607,8 @@ int ftrace_disable_ftrace_graph_caller(void) } #endif /* CONFIG_DYNAMIC_FTRACE && !CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */ -/* - * Hook the return address and push it in the stack of return addrs - * in current thread info. - */ -void prepare_ftrace_return(unsigned long ip, unsigned long *parent, - unsigned long frame_pointer) +static inline bool skip_ftrace_return(void) { - unsigned long return_hooker = (unsigned long)&return_to_handler; - int bit; - /* * When resuming from suspend-to-ram, this function can be indirectly * called from early CPU startup code while the CPU is in real mode, @@ -633,33 +618,48 @@ void prepare_ftrace_return(unsigned long ip, unsigned long *parent, * This check isn't as accurate as virt_addr_valid(), but it should be * good enough for this purpose, and it's fast. 
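The cheap frame-address check described above relies on kernel virtual addresses living in the upper half of the address space, i.e. being negative when viewed as signed longs; reduced to its essence (illustrative only, hypothetical helper name):

static inline bool looks_like_kernel_va(const void *p)
{
        return (long)p < 0;     /* kernel VAs have the sign bit set */
}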
*/ - if (unlikely((long)__builtin_frame_address(0) >= 0)) - return; + if ((long)__builtin_frame_address(0) >= 0) + return true; - if (unlikely(ftrace_graph_is_dead())) - return; + if (ftrace_graph_is_dead()) + return true; - if (unlikely(atomic_read(¤t->tracing_graph_pause))) - return; + if (atomic_read(¤t->tracing_graph_pause)) + return true; + return false; +} - bit = ftrace_test_recursion_trylock(ip, *parent); - if (bit < 0) +/* + * Hook the return address and push it in the stack of return addrs + * in current thread info. + */ +void prepare_ftrace_return(unsigned long ip, unsigned long *parent, + unsigned long frame_pointer) +{ + unsigned long return_hooker = (unsigned long)&return_to_handler; + + if (unlikely(skip_ftrace_return())) return; if (!function_graph_enter(*parent, ip, frame_pointer, parent)) *parent = return_hooker; - - ftrace_test_recursion_unlock(bit); } #ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs) { - struct pt_regs *regs = &fregs->regs; + struct pt_regs *regs = &arch_ftrace_regs(fregs)->regs; unsigned long *stack = (unsigned long *)kernel_stack_pointer(regs); + unsigned long return_hooker = (unsigned long)&return_to_handler; + unsigned long *parent = (unsigned long *)stack; - prepare_ftrace_return(ip, (unsigned long *)stack, 0); + if (unlikely(skip_ftrace_return())) + return; + + + if (!function_graph_enter_regs(*parent, ip, 0, parent, fregs)) + *parent = return_hooker; } #endif diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S index 58d9ed50fe61..f4e0c3361234 100644 --- a/arch/x86/kernel/ftrace_32.S +++ b/arch/x86/kernel/ftrace_32.S @@ -187,14 +187,15 @@ SYM_CODE_END(ftrace_graph_caller) .globl return_to_handler return_to_handler: - pushl $0 - pushl %edx - pushl %eax + subl $(PTREGS_SIZE), %esp + movl $0, PT_EBP(%esp) + movl %edx, PT_EDX(%esp) + movl %eax, PT_EAX(%esp) movl %esp, %eax call ftrace_return_to_handler movl %eax, %ecx - popl %eax - popl %edx - addl $4, %esp # skip ebp + movl PT_EAX(%esp), %eax + movl PT_EDX(%esp), %edx + addl $(PTREGS_SIZE), %esp JMP_NOSPEC ecx #endif diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 214f30e9f0c0..d51647228596 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -348,21 +348,22 @@ STACK_FRAME_NON_STANDARD_FP(__fentry__) SYM_CODE_START(return_to_handler) UNWIND_HINT_UNDEFINED ANNOTATE_NOENDBR - subq $24, %rsp - /* Save the return values */ - movq %rax, (%rsp) - movq %rdx, 8(%rsp) - movq %rbp, 16(%rsp) + /* Save ftrace_regs for function exit context */ + subq $(FRAME_SIZE), %rsp + + movq %rax, RAX(%rsp) + movq %rdx, RDX(%rsp) + movq %rbp, RBP(%rsp) movq %rsp, %rdi call ftrace_return_to_handler movq %rax, %rdi - movq 8(%rsp), %rdx - movq (%rsp), %rax + movq RDX(%rsp), %rdx + movq RAX(%rsp), %rax - addq $24, %rsp + addq $(FRAME_SIZE), %rsp /* * Jump back to the old return address. 
This cannot be JMP_NOSPEC rdi * since IBT would demand that contain ENDBR, which simply isn't so for diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 05a110c97111..22c9ba305ac1 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -22,6 +22,8 @@ #include <linux/cc_platform.h> #include <linux/pgtable.h> +#include <asm/asm.h> +#include <asm/page_64.h> #include <asm/processor.h> #include <asm/proto.h> #include <asm/smp.h> @@ -67,42 +69,11 @@ unsigned long vmemmap_base __ro_after_init = __VMEMMAP_BASE_L4; EXPORT_SYMBOL(vmemmap_base); #endif -/* - * GDT used on the boot CPU before switching to virtual addresses. - */ -static struct desc_struct startup_gdt[GDT_ENTRIES] __initdata = { - [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff), - [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff), - [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff), -}; - -/* - * Address needs to be set at runtime because it references the startup_gdt - * while the kernel still uses a direct mapping. - */ -static struct desc_ptr startup_gdt_descr __initdata = { - .size = sizeof(startup_gdt)-1, - .address = 0, -}; - -static void __head *fixup_pointer(void *ptr, unsigned long physaddr) +static inline bool check_la57_support(void) { - return ptr - (void *)_text + (void *)physaddr; -} - -static unsigned long __head *fixup_long(void *ptr, unsigned long physaddr) -{ - return fixup_pointer(ptr, physaddr); -} - -#ifdef CONFIG_X86_5LEVEL -static unsigned int __head *fixup_int(void *ptr, unsigned long physaddr) -{ - return fixup_pointer(ptr, physaddr); -} + if (!IS_ENABLED(CONFIG_X86_5LEVEL)) + return false; -static bool __head check_la57_support(unsigned long physaddr) -{ /* * 5-level paging is detected and enabled at kernel decompression * stage. Only check if it has been enabled there. @@ -110,25 +81,21 @@ static bool __head check_la57_support(unsigned long physaddr) if (!(native_read_cr4() & X86_CR4_LA57)) return false; - *fixup_int(&__pgtable_l5_enabled, physaddr) = 1; - *fixup_int(&pgdir_shift, physaddr) = 48; - *fixup_int(&ptrs_per_p4d, physaddr) = 512; - *fixup_long(&page_offset_base, physaddr) = __PAGE_OFFSET_BASE_L5; - *fixup_long(&vmalloc_base, physaddr) = __VMALLOC_BASE_L5; - *fixup_long(&vmemmap_base, physaddr) = __VMEMMAP_BASE_L5; + RIP_REL_REF(__pgtable_l5_enabled) = 1; + RIP_REL_REF(pgdir_shift) = 48; + RIP_REL_REF(ptrs_per_p4d) = 512; + RIP_REL_REF(page_offset_base) = __PAGE_OFFSET_BASE_L5; + RIP_REL_REF(vmalloc_base) = __VMALLOC_BASE_L5; + RIP_REL_REF(vmemmap_base) = __VMEMMAP_BASE_L5; return true; } -#else -static bool __head check_la57_support(unsigned long physaddr) -{ - return false; -} -#endif -static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdval_t *pmd) +static unsigned long __head sme_postprocess_startup(struct boot_params *bp, + pmdval_t *pmd, + unsigned long p2v_offset) { - unsigned long vaddr, vaddr_end; + unsigned long paddr, paddr_end; int i; /* Encrypt the kernel and related (if SME is active) */ @@ -141,10 +108,10 @@ static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdv * attribute. 
*/ if (sme_get_me_mask()) { - vaddr = (unsigned long)__start_bss_decrypted; - vaddr_end = (unsigned long)__end_bss_decrypted; + paddr = (unsigned long)&RIP_REL_REF(__start_bss_decrypted); + paddr_end = (unsigned long)&RIP_REL_REF(__end_bss_decrypted); - for (; vaddr < vaddr_end; vaddr += PMD_SIZE) { + for (; paddr < paddr_end; paddr += PMD_SIZE) { /* * On SNP, transition the page to shared in the RMP table so that * it is consistent with the page table attribute change. @@ -153,11 +120,11 @@ static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdv * mapping (kernel .text). PVALIDATE, by way of * early_snp_set_memory_shared(), requires a valid virtual * address but the kernel is currently running off of the identity - * mapping so use __pa() to get a *currently* valid virtual address. + * mapping so use the PA to get a *currently* valid virtual address. */ - early_snp_set_memory_shared(__pa(vaddr), __pa(vaddr), PTRS_PER_PMD); + early_snp_set_memory_shared(paddr, paddr, PTRS_PER_PMD); - i = pmd_index(vaddr); + i = pmd_index(paddr - p2v_offset); pmd[i] -= sme_get_me_mask(); } } @@ -173,23 +140,25 @@ static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdv * doesn't have to generate PC-relative relocations when accessing globals from * that function. Clang actually does not generate them, which leads to * boot-time crashes. To work around this problem, every global pointer must - * be adjusted using fixup_pointer(). + * be accessed using RIP_REL_REF(). Kernel virtual addresses can be determined + * by subtracting p2v_offset from the RIP-relative address. */ -unsigned long __head __startup_64(unsigned long physaddr, +unsigned long __head __startup_64(unsigned long p2v_offset, struct boot_params *bp) { - unsigned long load_delta, *p; + pmd_t (*early_pgts)[PTRS_PER_PMD] = RIP_REL_REF(early_dynamic_pgts); + unsigned long physaddr = (unsigned long)&RIP_REL_REF(_text); + unsigned long va_text, va_end; unsigned long pgtable_flags; + unsigned long load_delta; pgdval_t *pgd; p4dval_t *p4d; pudval_t *pud; pmdval_t *pmd, pmd_entry; - pteval_t *mask_ptr; bool la57; int i; - unsigned int *next_pgt_ptr; - la57 = check_la57_support(physaddr); + la57 = check_la57_support(); /* Is the address too large? */ if (physaddr >> MAX_PHYSMEM_BITS) @@ -199,37 +168,36 @@ unsigned long __head __startup_64(unsigned long physaddr, * Compute the delta between the address I am compiled to run at * and the address I am actually running at. */ - load_delta = physaddr - (unsigned long)(_text - __START_KERNEL_map); + load_delta = __START_KERNEL_map + p2v_offset; + RIP_REL_REF(phys_base) = load_delta; /* Is the address not 2M aligned? 
*/ if (load_delta & ~PMD_MASK) for (;;); + va_text = physaddr - p2v_offset; + va_end = (unsigned long)&RIP_REL_REF(_end) - p2v_offset; + /* Include the SME encryption mask in the fixup value */ load_delta += sme_get_me_mask(); /* Fixup the physical addresses in the page table */ - pgd = fixup_pointer(early_top_pgt, physaddr); - p = pgd + pgd_index(__START_KERNEL_map); - if (la57) - *p = (unsigned long)level4_kernel_pgt; - else - *p = (unsigned long)level3_kernel_pgt; - *p += _PAGE_TABLE_NOENC - __START_KERNEL_map + load_delta; + pgd = &RIP_REL_REF(early_top_pgt)->pgd; + pgd[pgd_index(__START_KERNEL_map)] += load_delta; - if (la57) { - p4d = fixup_pointer(level4_kernel_pgt, physaddr); - p4d[511] += load_delta; + if (IS_ENABLED(CONFIG_X86_5LEVEL) && la57) { + p4d = (p4dval_t *)&RIP_REL_REF(level4_kernel_pgt); + p4d[MAX_PTRS_PER_P4D - 1] += load_delta; + + pgd[pgd_index(__START_KERNEL_map)] = (pgdval_t)p4d | _PAGE_TABLE; } - pud = fixup_pointer(level3_kernel_pgt, physaddr); - pud[510] += load_delta; - pud[511] += load_delta; + RIP_REL_REF(level3_kernel_pgt)[PTRS_PER_PUD - 2].pud += load_delta; + RIP_REL_REF(level3_kernel_pgt)[PTRS_PER_PUD - 1].pud += load_delta; - pmd = fixup_pointer(level2_fixmap_pgt, physaddr); for (i = FIXMAP_PMD_TOP; i > FIXMAP_PMD_TOP - FIXMAP_PMD_NUM; i--) - pmd[i] += load_delta; + RIP_REL_REF(level2_fixmap_pgt)[i].pmd += load_delta; /* * Set up the identity mapping for the switchover. These @@ -238,15 +206,14 @@ unsigned long __head __startup_64(unsigned long physaddr, * it avoids problems around wraparound. */ - next_pgt_ptr = fixup_pointer(&next_early_pgt, physaddr); - pud = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr); - pmd = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr); + pud = &early_pgts[0]->pmd; + pmd = &early_pgts[1]->pmd; + RIP_REL_REF(next_early_pgt) = 2; pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask(); if (la57) { - p4d = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], - physaddr); + p4d = &early_pgts[RIP_REL_REF(next_early_pgt)++]->pmd; i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; pgd[i + 0] = (pgdval_t)p4d + pgtable_flags; @@ -267,12 +234,11 @@ unsigned long __head __startup_64(unsigned long physaddr, pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL; /* Filter out unsupported __PAGE_KERNEL_* bits: */ - mask_ptr = fixup_pointer(&__supported_pte_mask, physaddr); - pmd_entry &= *mask_ptr; + pmd_entry &= RIP_REL_REF(__supported_pte_mask); pmd_entry += sme_get_me_mask(); pmd_entry += physaddr; - for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) { + for (i = 0; i < DIV_ROUND_UP(va_end - va_text, PMD_SIZE); i++) { int idx = i + (physaddr >> PMD_SHIFT); pmd[idx % PTRS_PER_PMD] = pmd_entry + i * PMD_SIZE; @@ -294,14 +260,14 @@ unsigned long __head __startup_64(unsigned long physaddr, * error, causing the BIOS to halt the system. 
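RIP_REL_REF(), which replaces fixup_pointer() throughout __startup_64() above, comes from <asm/asm.h> (which this file now includes); conceptually it reaches a global through an explicit RIP-relative LEA so the access is correct even while the code still runs from the identity mapping. A rough sketch of the idea for one specific global (an approximation, not the real macro):

static unsigned long example_var;

static inline unsigned long *rip_rel_example(void)
{
        unsigned long *p;

        /* "i" passes the symbol address as an immediate; %c1 drops the '$' */
        asm ("leaq %c1(%%rip), %0" : "=r" (p) : "i" (&example_var));
        return p;
}

The real macro wraps the same idea so it can be applied to any global lvalue.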
*/ - pmd = fixup_pointer(level2_kernel_pgt, physaddr); + pmd = &RIP_REL_REF(level2_kernel_pgt)->pmd; /* invalidate pages before the kernel image */ - for (i = 0; i < pmd_index((unsigned long)_text); i++) + for (i = 0; i < pmd_index(va_text); i++) pmd[i] &= ~_PAGE_PRESENT; /* fixup pages that are part of the kernel image */ - for (; i <= pmd_index((unsigned long)_end); i++) + for (; i <= pmd_index(va_end); i++) if (pmd[i] & _PAGE_PRESENT) pmd[i] += load_delta; @@ -309,13 +275,7 @@ unsigned long __head __startup_64(unsigned long physaddr, for (; i < PTRS_PER_PMD; i++) pmd[i] &= ~_PAGE_PRESENT; - /* - * Fixup phys_base - remove the memory encryption mask to obtain - * the true physical address. - */ - *fixup_long(&phys_base, physaddr) += load_delta - sme_get_me_mask(); - - return sme_postprocess_startup(bp, pmd); + return sme_postprocess_startup(bp, pmd, p2v_offset); } /* Wipe all early page tables except for the kernel symbol map */ @@ -569,62 +529,53 @@ void __init __noreturn x86_64_start_reservations(char *real_mode_data) */ static gate_desc bringup_idt_table[NUM_EXCEPTION_VECTORS] __page_aligned_data; -static struct desc_ptr bringup_idt_descr = { - .size = (NUM_EXCEPTION_VECTORS * sizeof(gate_desc)) - 1, - .address = 0, /* Set at runtime */ -}; - -static void set_bringup_idt_handler(gate_desc *idt, int n, void *handler) +/* This may run while still in the direct mapping */ +static void __head startup_64_load_idt(void *vc_handler) { -#ifdef CONFIG_AMD_MEM_ENCRYPT + struct desc_ptr desc = { + .address = (unsigned long)&RIP_REL_REF(bringup_idt_table), + .size = sizeof(bringup_idt_table) - 1, + }; struct idt_data data; - gate_desc desc; - - init_idt_data(&data, n, handler); - idt_init_desc(&desc, &data); - native_write_idt_entry(idt, n, &desc); -#endif -} + gate_desc idt_desc; -/* This runs while still in the direct mapping */ -static void __head startup_64_load_idt(unsigned long physbase) -{ - struct desc_ptr *desc = fixup_pointer(&bringup_idt_descr, physbase); - gate_desc *idt = fixup_pointer(bringup_idt_table, physbase); - - - if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) { - void *handler; - - /* VMM Communication Exception */ - handler = fixup_pointer(vc_no_ghcb, physbase); - set_bringup_idt_handler(idt, X86_TRAP_VC, handler); + /* @vc_handler is set only for a VMM Communication Exception */ + if (vc_handler) { + init_idt_data(&data, X86_TRAP_VC, vc_handler); + idt_init_desc(&idt_desc, &data); + native_write_idt_entry((gate_desc *)desc.address, X86_TRAP_VC, &idt_desc); } - desc->address = (unsigned long)idt; - native_load_idt(desc); + native_load_idt(&desc); } /* This is used when running on kernel addresses */ void early_setup_idt(void) { - /* VMM Communication Exception */ + void *handler = NULL; + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) { setup_ghcb(); - set_bringup_idt_handler(bringup_idt_table, X86_TRAP_VC, vc_boot_ghcb); + handler = vc_boot_ghcb; } - bringup_idt_descr.address = (unsigned long)bringup_idt_table; - native_load_idt(&bringup_idt_descr); + startup_64_load_idt(handler); } /* * Setup boot CPU state needed before kernel switches to virtual addresses. 
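Because only pmd_index() is applied to them, the p2v_offset-derived va_text/va_end select the same 2 MiB slots of level2_kernel_pgt that the old link-time _text/_end did. A small userspace model of the three-phase trim loop above; the array contents, addresses and the _PAGE_PRESENT stand-in are made up for the example:

    #include <stdio.h>
    #include <stdint.h>

    #define PMD_SHIFT     21
    #define PTRS_PER_PMD  512
    #define PAGE_PRESENT  0x1ULL                 /* stand-in for _PAGE_PRESENT */

    static unsigned int pmd_index(uint64_t va)
    {
        return (va >> PMD_SHIFT) & (PTRS_PER_PMD - 1);
    }

    /* Model of the level2_kernel_pgt trimming done in __startup_64() */
    static void trim_kernel_pmd(uint64_t *pmd, uint64_t va_text, uint64_t va_end,
                                uint64_t load_delta)
    {
        unsigned int i;

        /* invalidate 2M mappings below the kernel image */
        for (i = 0; i < pmd_index(va_text); i++)
            pmd[i] &= ~PAGE_PRESENT;

        /* shift the mappings covering the image by load_delta */
        for (; i <= pmd_index(va_end); i++)
            if (pmd[i] & PAGE_PRESENT)
                pmd[i] += load_delta;

        /* invalidate everything above the image */
        for (; i < PTRS_PER_PMD; i++)
            pmd[i] &= ~PAGE_PRESENT;
    }

    int main(void)
    {
        static uint64_t pmd[PTRS_PER_PMD];
        unsigned int i;

        for (i = 0; i < PTRS_PER_PMD; i++)
            pmd[i] = ((uint64_t)i << PMD_SHIFT) | PAGE_PRESENT;

        /* hypothetical 16 MiB image at 0xffffffff81000000, physically shifted by 2 MiB */
        trim_kernel_pmd(pmd, 0xffffffff81000000ULL, 0xffffffff82000000ULL, 0x200000ULL);

        printf("entry %u: %#llx\n", pmd_index(0xffffffff81000000ULL),
               (unsigned long long)pmd[pmd_index(0xffffffff81000000ULL)]);
        return 0;
    }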
*/ -void __head startup_64_setup_env(unsigned long physbase) +void __head startup_64_setup_gdt_idt(void) { + struct desc_struct *gdt = (void *)(__force unsigned long)init_per_cpu_var(gdt_page.gdt); + void *handler = NULL; + + struct desc_ptr startup_gdt_descr = { + .address = (unsigned long)&RIP_REL_REF(*gdt), + .size = GDT_SIZE - 1, + }; + /* Load GDT */ - startup_gdt_descr.address = (unsigned long)fixup_pointer(startup_gdt, physbase); native_load_gdt(&startup_gdt_descr); /* New GDT is live - reload data segment registers */ @@ -632,5 +583,8 @@ void __head startup_64_setup_env(unsigned long physbase) "movl %%eax, %%ss\n" "movl %%eax, %%es\n" : : "a"(__KERNEL_DS) : "memory"); - startup_64_load_idt(physbase); + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) + handler = &RIP_REL_REF(vc_no_ghcb); + + startup_64_load_idt(handler); } diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 487ac57e2c81..2e42056d2306 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -44,9 +44,6 @@ #define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability #define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id - -#define SIZEOF_PTREGS 17*4 - /* * Worst-case size of the kernel mapping we need to make: * a relocatable kernel can live anywhere in lowmem, so we need to be able @@ -414,7 +411,7 @@ __REFDATA .align 4 SYM_DATA(initial_code, .long i386_start_kernel) -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION #define PGD_ALIGN (2 * PAGE_SIZE) #define PTI_USER_PGD_FILL 1024 #else @@ -474,7 +471,7 @@ SYM_DATA_START(initial_page_table) # endif .align PAGE_SIZE /* needs to be page-sized too */ -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION /* * PTI needs another page so sync_initial_pagetable() works correctly * and does not scribble over the data which is placed behind the @@ -488,19 +485,13 @@ SYM_DATA_END(initial_page_table) .data .balign 4 -/* - * The SIZEOF_PTREGS gap is a convention which helps the in-kernel unwinder - * reliably detect the end of the stack. - */ -SYM_DATA(initial_stack, - .long init_thread_union + THREAD_SIZE - - SIZEOF_PTREGS - TOP_OF_KERNEL_STACK_PADDING) +SYM_DATA(initial_stack, .long __top_init_kernel_stack) __INITRODATA int_msg: .asciz "Unknown interrupt or fault at: %p %p %p\n" -#include "../../x86/xen/xen-head.S" +#include "../xen/xen-head.S" /* * The IDT and GDT 'descriptors' are a strange 48-bit object diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 0f8103240fda..31345e0ba006 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -26,20 +26,13 @@ #include <asm/apicdef.h> #include <asm/fixmap.h> #include <asm/smp.h> +#include <asm/thread_info.h> /* * We are not able to switch in one step to the final KERNEL ADDRESS SPACE * because we need identity-mapped pages. 
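Both startup_64_load_idt() and startup_64_setup_gdt_idt() above fill in a pseudo-descriptor whose size field is the table size minus one, because LGDT/LIDT take an inclusive limit. A standalone model of that layout; the struct name and table dimensions are this sketch's own, not the kernel's struct desc_ptr definition:

    #include <stdio.h>
    #include <stdint.h>

    /* Layout consumed by LGDT/LIDT on 64-bit: 16-bit limit + base, packed */
    struct desc_ptr_model {
        uint16_t size;       /* limit = table size in bytes - 1 (inclusive) */
        uint64_t address;    /* linear address of the first entry */
    } __attribute__((packed));

    int main(void)
    {
        uint64_t gate[32][2] = { 0 };     /* pretend 16-byte IDT gates */

        struct desc_ptr_model idt = {
            .size    = sizeof(gate) - 1,  /* the "- 1" seen in the hunks above */
            .address = (uint64_t)(uintptr_t)gate,
        };

        printf("limit=%u base=%#llx\n", idt.size,
               (unsigned long long)idt.address);
        return 0;
    }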
*/ -#define l4_index(x) (((x) >> 39) & 511) -#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) -L4_PAGE_OFFSET = l4_index(__PAGE_OFFSET_BASE_L4) -L4_START_KERNEL = l4_index(__START_KERNEL_map) - -L3_START_KERNEL = pud_index(__START_KERNEL_map) - - .text __HEAD .code64 SYM_CODE_START_NOALIGN(startup_64) @@ -66,9 +59,7 @@ SYM_CODE_START_NOALIGN(startup_64) mov %rsi, %r15 /* Set up the stack for verify_cpu() */ - leaq (__end_init_task - PTREGS_SIZE)(%rip), %rsp - - leaq _text(%rip), %rdi + leaq __top_init_kernel_stack(%rip), %rsp /* Setup GSBASE to allow stack canary access for C code */ movl $MSR_GS_BASE, %ecx @@ -77,7 +68,7 @@ SYM_CODE_START_NOALIGN(startup_64) shrq $32, %rdx wrmsr - call startup_64_setup_env + call startup_64_setup_gdt_idt /* Now switch to __KERNEL_CS so IRET works reliably */ pushq $__KERNEL_CS @@ -86,6 +77,7 @@ SYM_CODE_START_NOALIGN(startup_64) lretq .Lon_kernel_cs: + ANNOTATE_NOENDBR UNWIND_HINT_END_OF_STACK #ifdef CONFIG_AMD_MEM_ENCRYPT @@ -103,20 +95,52 @@ SYM_CODE_START_NOALIGN(startup_64) call verify_cpu /* + * Derive the kernel's physical-to-virtual offset from the physical and + * virtual addresses of common_startup_64(). + */ + leaq common_startup_64(%rip), %rdi + subq .Lcommon_startup_64(%rip), %rdi + + /* * Perform pagetable fixups. Additionally, if SME is active, encrypt * the kernel and retrieve the modifier (SME encryption mask if SME * is active) to be added to the initial pgdir entry that will be * programmed into CR3. */ - leaq _text(%rip), %rdi movq %r15, %rsi call __startup_64 /* Form the CR3 value being sure to include the CR3 modifier */ - addq $(early_top_pgt - __START_KERNEL_map), %rax - jmp 1f + leaq early_top_pgt(%rip), %rcx + addq %rcx, %rax + +#ifdef CONFIG_AMD_MEM_ENCRYPT + mov %rax, %rdi + + /* + * For SEV guests: Verify that the C-bit is correct. A malicious + * hypervisor could lie about the C-bit position to perform a ROP + * attack on the guest by writing to the unencrypted stack and wait for + * the next RET instruction. + */ + call sev_verify_cbit +#endif + + /* + * Switch to early_top_pgt which still has the identity mappings + * present. + */ + movq %rax, %cr3 + + /* Branch to the common startup code at its kernel virtual address */ + ANNOTATE_RETPOLINE_SAFE + jmp *.Lcommon_startup_64(%rip) SYM_CODE_END(startup_64) + __INITRODATA +SYM_DATA_LOCAL(.Lcommon_startup_64, .quad common_startup_64) + + .text SYM_CODE_START(secondary_startup_64) UNWIND_HINT_END_OF_STACK ANNOTATE_NOENDBR @@ -149,22 +173,39 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) ANNOTATE_NOENDBR /* Clear %R15 which holds the boot_params pointer on the boot CPU */ - xorq %r15, %r15 + xorl %r15d, %r15d + + /* Derive the runtime physical address of init_top_pgt[] */ + movq phys_base(%rip), %rax + addq $(init_top_pgt - __START_KERNEL_map), %rax /* * Retrieve the modifier (SME encryption mask if SME is active) to be * added to the initial pgdir entry that will be programmed into CR3. */ #ifdef CONFIG_AMD_MEM_ENCRYPT - movq sme_me_mask, %rax -#else - xorq %rax, %rax + addq sme_me_mask(%rip), %rax #endif + /* + * Switch to the init_top_pgt here, away from the trampoline_pgd and + * unmap the identity mapped ranges. + */ + movq %rax, %cr3 - /* Form the CR3 value being sure to include the CR3 modifier */ - addq $(init_top_pgt - __START_KERNEL_map), %rax -1: +SYM_INNER_LABEL(common_startup_64, SYM_L_LOCAL) + UNWIND_HINT_END_OF_STACK + ANNOTATE_NOENDBR + /* + * Create a mask of CR4 bits to preserve. 
Omit PGE in order to flush + * global 1:1 translations from the TLBs. + * + * From the SDM: + * "If CR4.PGE is changing from 0 to 1, there were no global TLB + * entries before the execution; if CR4.PGE is changing from 1 to 0, + * there will be no global TLB entries after the execution." + */ + movl $(X86_CR4_PAE | X86_CR4_LA57), %edx #ifdef CONFIG_X86_MCE /* * Preserve CR4.MCE if the kernel will enable #MC support. @@ -173,61 +214,20 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) * configured will crash the system regardless of the CR4.MCE value set * here. */ - movq %cr4, %rcx - andl $X86_CR4_MCE, %ecx -#else - movl $0, %ecx + orl $X86_CR4_MCE, %edx #endif + movq %cr4, %rcx + andl %edx, %ecx - /* Enable PAE mode, PSE, PGE and LA57 */ - orl $(X86_CR4_PAE | X86_CR4_PSE | X86_CR4_PGE), %ecx -#ifdef CONFIG_X86_5LEVEL - testl $1, __pgtable_l5_enabled(%rip) - jz 1f - orl $X86_CR4_LA57, %ecx -1: -#endif + /* Even if ignored in long mode, set PSE uniformly on all logical CPUs. */ + btsl $X86_CR4_PSE_BIT, %ecx movq %rcx, %cr4 - /* Setup early boot stage 4-/5-level pagetables. */ - addq phys_base(%rip), %rax - - /* - * For SEV guests: Verify that the C-bit is correct. A malicious - * hypervisor could lie about the C-bit position to perform a ROP - * attack on the guest by writing to the unencrypted stack and wait for - * the next RET instruction. - */ - movq %rax, %rdi - call sev_verify_cbit - - /* - * Switch to new page-table - * - * For the boot CPU this switches to early_top_pgt which still has the - * indentity mappings present. The secondary CPUs will switch to the - * init_top_pgt here, away from the trampoline_pgd and unmap the - * indentity mapped ranges. - */ - movq %rax, %cr3 - /* - * Do a global TLB flush after the CR3 switch to make sure the TLB - * entries from the identity mapping are flushed. + * Set CR4.PGE to re-enable global translations. */ - movq %cr4, %rcx - movq %rcx, %rax - xorq $X86_CR4_PGE, %rcx + btsl $X86_CR4_PGE_BIT, %ecx movq %rcx, %cr4 - movq %rax, %cr4 - - /* Ensure I am executing from virtual addresses */ - movq $1f, %rax - ANNOTATE_RETPOLINE_SAFE - jmp *%rax -1: - UNWIND_HINT_END_OF_STACK - ANNOTATE_NOENDBR // above #ifdef CONFIG_SMP /* @@ -284,7 +284,7 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) .Llookup_AP: /* EAX contains the APIC ID of the current CPU */ - xorq %rcx, %rcx + xorl %ecx, %ecx leaq cpuid_to_apicid(%rip), %rbx .Lfind_cpunr: @@ -415,39 +415,10 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) movq %r15, %rdi .Ljump_to_C_code: - /* - * Jump to run C code and to be on a real kernel address. - * Since we are running on identity-mapped space we have to jump - * to the full 64bit address, this is only possible as indirect - * jump. In addition we need to ensure %cs is set so we make this - * a far return. - * - * Note: do not change to far jump indirect with 64bit offset. - * - * AMD does not support far jump indirect with 64bit offset. - * AMD64 Architecture Programmer's Manual, Volume 3: states only - * JMP FAR mem16:16 FF /5 Far jump indirect, - * with the target specified by a far pointer in memory. - * JMP FAR mem16:32 FF /5 Far jump indirect, - * with the target specified by a far pointer in memory. - * - * Intel64 does support 64bit offset. - * Software Developer Manual Vol 2: states: - * FF /5 JMP m16:16 Jump far, absolute indirect, - * address given in m16:16 - * FF /5 JMP m16:32 Jump far, absolute indirect, - * address given in m16:32. 
- * REX.W + FF /5 JMP m16:64 Jump far, absolute indirect, - * address given in m16:64. - */ - pushq $.Lafter_lret # put return address on stack for unwinder xorl %ebp, %ebp # clear frame pointer - movq initial_code(%rip), %rax - pushq $__KERNEL_CS # set correct cs - pushq %rax # target address in negative space - lretq -.Lafter_lret: - ANNOTATE_NOENDBR + ANNOTATE_RETPOLINE_SAFE + callq *initial_code(%rip) + ud2 SYM_CODE_END(secondary_startup_64) #include "verify_cpu.S" @@ -464,7 +435,7 @@ SYM_CODE_START(soft_restart_cpu) UNWIND_HINT_END_OF_STACK /* Find the idle task stack */ - movq PER_CPU_VAR(pcpu_hot) + X86_current_task, %rcx + movq PER_CPU_VAR(pcpu_hot + X86_current_task), %rcx movq TASK_threadsp(%rcx), %rsp jmp .Ljump_to_C_code @@ -606,10 +577,7 @@ SYM_CODE_START_NOALIGN(vc_no_ghcb) SYM_CODE_END(vc_no_ghcb) #endif -#define SYM_DATA_START_PAGE_ALIGNED(name) \ - SYM_START(name, SYM_L_GLOBAL, .balign PAGE_SIZE) - -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION /* * Each PGD needs to be 8k long and 8k aligned. We do not * ever go out to userspace with these, so we do not @@ -630,19 +598,12 @@ SYM_CODE_END(vc_no_ghcb) #define PTI_USER_PGD_FILL 0 #endif -/* Automate the creation of 1 to 1 mapping pmd entries */ -#define PMDS(START, PERM, COUNT) \ - i = 0 ; \ - .rept (COUNT) ; \ - .quad (START) + (i << PMD_SHIFT) + (PERM) ; \ - i = i + 1 ; \ - .endr - __INITDATA .balign 4 SYM_DATA_START_PTI_ALIGNED(early_top_pgt) - .fill 512,8,0 + .fill 511,8,0 + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC .fill PTI_USER_PGD_FILL,8,0 SYM_DATA_END(early_top_pgt) @@ -736,8 +697,6 @@ SYM_DATA_START_PAGE_ALIGNED(level1_fixmap_pgt) .endr SYM_DATA_END(level1_fixmap_pgt) -#undef PMDS - .data .align 16 @@ -748,7 +707,7 @@ SYM_DATA(smpboot_control, .long 0) SYM_DATA(phys_base, .quad 0x0) EXPORT_SYMBOL(phys_base) -#include "../../x86/xen/xen-head.S" +#include "../xen/xen-head.S" __PAGE_ALIGNED_BSS SYM_DATA_START_PAGE_ALIGNED(empty_zero_page) diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 41eecf180b7f..7f4b2966e15c 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -7,6 +7,7 @@ #include <linux/cpu.h> #include <linux/irq.h> +#include <asm/cpuid.h> #include <asm/irq_remapping.h> #include <asm/hpet.h> #include <asm/time.h> @@ -516,22 +517,14 @@ static int hpet_msi_init(struct irq_domain *domain, struct msi_domain_info *info, unsigned int virq, irq_hw_number_t hwirq, msi_alloc_info_t *arg) { - irq_set_status_flags(virq, IRQ_MOVE_PCNTXT); irq_domain_set_info(domain, virq, arg->hwirq, info->chip, NULL, handle_edge_irq, arg->data, "edge"); return 0; } -static void hpet_msi_free(struct irq_domain *domain, - struct msi_domain_info *info, unsigned int virq) -{ - irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT); -} - static struct msi_domain_ops hpet_msi_domain_ops = { .msi_init = hpet_msi_init, - .msi_free = hpet_msi_free, }; static struct msi_domain_info hpet_msi_domain_info = { @@ -568,7 +561,7 @@ static struct irq_domain *hpet_create_irq_domain(int hpet_id) fwspec.param_count = 1; fwspec.param[0] = hpet_id; - parent = irq_find_matching_fwspec(&fwspec, DOMAIN_BUS_ANY); + parent = irq_find_matching_fwspec(&fwspec, DOMAIN_BUS_GENERIC_MSI); if (!parent) { irq_domain_free_fwnode(fn); kfree(domain_info); @@ -707,7 +700,7 @@ static void __init hpet_select_clockevents(void) hpet_base.nr_clockevents = 0; - /* No point if MSI is disabled or CPU has an Always Runing APIC Timer */ + /* No point if MSI is disabled or CPU has an Always Running APIC 
Timer */ if (hpet_msi_disable || boot_cpu_has(X86_FEATURE_ARAT)) return; @@ -927,10 +920,7 @@ static bool __init mwait_pc10_supported(void) if (!cpu_feature_enabled(X86_FEATURE_MWAIT)) return false; - if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) - return false; - - cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &mwait_substates); + cpuid(CPUID_LEAF_MWAIT, &eax, &ebx, &ecx, &mwait_substates); return (ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) && (ecx & CPUID5_ECX_INTERRUPT_BREAK) && @@ -965,7 +955,7 @@ static bool __init mwait_pc10_supported(void) * and per CPU timer interrupts. * * The probability that this problem is going to be solved in the - * forseeable future is close to zero, so the kernel has to be cluttered + * foreseeable future is close to zero, so the kernel has to be cluttered * with heuristics to keep up with the ever growing amount of hardware and * firmware trainwrecks. Hopefully some day hardware people will understand * that the approach of "This can be fixed in software" is not sustainable. @@ -1392,12 +1382,6 @@ int hpet_set_periodic_freq(unsigned long freq) } EXPORT_SYMBOL_GPL(hpet_set_periodic_freq); -int hpet_rtc_dropped_irq(void) -{ - return is_hpet_enabled(); -} -EXPORT_SYMBOL_GPL(hpet_rtc_dropped_irq); - static void hpet_rtc_timer_reinit(void) { unsigned int delta; @@ -1438,7 +1422,7 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) memset(&curr_time, 0, sizeof(struct rtc_time)); if (hpet_rtc_flags & (RTC_UIE | RTC_AIE)) { - if (unlikely(mc146818_get_time(&curr_time) < 0)) { + if (unlikely(mc146818_get_time(&curr_time, 10) < 0)) { pr_err_ratelimited("unable to read current time from RTC\n"); return IRQ_HANDLED; } diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index 2b7999a1a50a..80e262bb627f 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -8,6 +8,7 @@ #include <linux/timex.h> #include <linux/i8253.h> +#include <asm/hypervisor.h> #include <asm/apic.h> #include <asm/hpet.h> #include <asm/time.h> @@ -39,9 +40,15 @@ static bool __init use_pit(void) bool __init pit_timer_init(void) { - if (!use_pit()) + if (!use_pit()) { + /* + * Don't just ignore the PIT. Ensure it's stopped, because + * VMMs otherwise steal CPU time just to pointlessly waggle + * the (masked) IRQ. 
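The mwait_pc10_supported() hunk above drops the open-coded cpuid_level comparison and reads the MWAIT leaf through CPUID_LEAF_MWAIT. For reference, an illustrative userspace probe of the same leaf using GCC/Clang's <cpuid.h>; the ECX bit positions are assumed to match the kernel's CPUID5_ECX_* macros, and __get_cpuid() performs the maximum-leaf check itself:

    #include <stdio.h>
    #include <cpuid.h>

    /* Assumed bit positions for leaf 0x5 ECX, mirroring the kernel's
     * CPUID5_ECX_EXTENSIONS_SUPPORTED / CPUID5_ECX_INTERRUPT_BREAK */
    #define ECX_EXTENSIONS_SUPPORTED  (1u << 0)
    #define ECX_INTERRUPT_BREAK       (1u << 1)

    int main(void)
    {
        unsigned int eax, ebx, ecx, edx;

        if (!__get_cpuid(0x5, &eax, &ebx, &ecx, &edx)) {
            puts("CPUID leaf 0x5 (MONITOR/MWAIT) not available");
            return 1;
        }

        /* EDX enumerates the MWAIT sub-states ("mwait_substates" above) */
        printf("smallest line %u, largest line %u, substates %#x\n", eax, ebx, edx);
        printf("extensions: %s, interrupt-break: %s\n",
               (ecx & ECX_EXTENSIONS_SUPPORTED) ? "yes" : "no",
               (ecx & ECX_INTERRUPT_BREAK) ? "yes" : "no");
        return 0;
    }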
+ */ + clockevent_i8253_disable(); return false; - + } clockevent_i8253_init(true); global_clock_event = &i8253_clockevent; return true; diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 660b601f1d6c..f445bec516a0 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -153,7 +153,7 @@ static const __initconst struct idt_data apic_idts[] = { #ifdef CONFIG_X86_LOCAL_APIC INTG(LOCAL_TIMER_VECTOR, asm_sysvec_apic_timer_interrupt), INTG(X86_PLATFORM_IPI_VECTOR, asm_sysvec_x86_platform_ipi), -# ifdef CONFIG_HAVE_KVM +# if IS_ENABLED(CONFIG_KVM) INTG(POSTED_INTR_VECTOR, asm_sysvec_kvm_posted_intr_ipi), INTG(POSTED_INTR_WAKEUP_VECTOR, asm_sysvec_kvm_posted_intr_wakeup_ipi), INTG(POSTED_INTR_NESTED_VECTOR, asm_sysvec_kvm_posted_intr_nested_ipi), @@ -163,6 +163,9 @@ static const __initconst struct idt_data apic_idts[] = { # endif INTG(SPURIOUS_APIC_VECTOR, asm_sysvec_spurious_apic_interrupt), INTG(ERROR_APIC_VECTOR, asm_sysvec_error_interrupt), +# ifdef CONFIG_X86_POSTED_MSI + INTG(POSTED_MSI_NOTIFICATION_VECTOR, asm_sysvec_posted_msi_notification), +# endif #endif }; @@ -337,7 +340,7 @@ void idt_invalidate(void) load_idt(&idt); } -void __init alloc_intr_gate(unsigned int n, const void *addr) +void __init idt_install_sysvec(unsigned int n, const void *function) { if (WARN_ON(n < FIRST_SYSTEM_VECTOR)) return; @@ -346,5 +349,5 @@ void __init alloc_intr_gate(unsigned int n, const void *addr) return; if (!WARN_ON(test_and_set_bit(n, system_vectors))) - set_intr_gate(n, addr); + set_intr_gate(n, function); } diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 11761c124545..feca4f20b06a 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -22,9 +22,13 @@ #include <asm/desc.h> #include <asm/traps.h> #include <asm/thermal.h> +#include <asm/posted_intr.h> +#include <asm/irq_remapping.h> +#if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_X86_THERMAL_VECTOR) #define CREATE_TRACE_POINTS #include <asm/trace/irq_vectors.h> +#endif DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); EXPORT_PER_CPU_SYMBOL(irq_stat); @@ -164,7 +168,7 @@ int arch_show_interrupts(struct seq_file *p, int prec) #if defined(CONFIG_X86_IO_APIC) seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count)); #endif -#ifdef CONFIG_HAVE_KVM +#if IS_ENABLED(CONFIG_KVM) seq_printf(p, "%*s: ", prec, "PIN"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->kvm_posted_intr_ipis); @@ -182,6 +186,13 @@ int arch_show_interrupts(struct seq_file *p, int prec) irq_stats(j)->kvm_posted_intr_wakeup_ipis); seq_puts(p, " Posted-interrupt wakeup event\n"); #endif +#ifdef CONFIG_X86_POSTED_MSI + seq_printf(p, "%*s: ", prec, "PMN"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + irq_stats(j)->posted_msi_notification_count); + seq_puts(p, " Posted MSI notification event\n"); +#endif return 0; } @@ -240,24 +251,16 @@ static __always_inline void handle_irq(struct irq_desc *desc, __handle_irq(desc, regs); } -/* - * common_interrupt() handles all normal device IRQ's (the special SMP - * cross-CPU interrupts have their own entry points). - */ -DEFINE_IDTENTRY_IRQ(common_interrupt) +static __always_inline int call_irq_handler(int vector, struct pt_regs *regs) { - struct pt_regs *old_regs = set_irq_regs(regs); struct irq_desc *desc; - - /* entry code tells RCU that we're not quiescent. Check it. 
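The irq.c refactor that begins above moves the vector lookup and the "no handler" reporting into call_irq_handler(), which returns -EINVAL on failure; common_interrupt() then issues apic_eoi() only in that failure case, and the posted-MSI code further below reuses the same helper to dispatch several vectors under a single EOI. A minimal standalone model of that dispatch-or-EOI shape; the handler table, fake_eoi() and the vector numbers are invented for the demo:

    #include <stdio.h>
    #include <errno.h>

    typedef void (*irq_handler_fn)(int vector);

    static irq_handler_fn vector_irq[256];   /* stand-in for the per-CPU vector table */

    /* Mirrors the shape of call_irq_handler(): dispatch if a handler is
     * installed, otherwise report failure and let the caller decide on EOI. */
    static int call_irq_handler(int vector)
    {
        if (!vector_irq[vector]) {
            fprintf(stderr, "no handler for vector %d\n", vector);
            return -EINVAL;
        }
        vector_irq[vector](vector);
        return 0;
    }

    static void fake_eoi(void)        { puts("EOI"); }
    static void demo_handler(int v)   { printf("handled vector %d\n", v); }

    int main(void)
    {
        vector_irq[33] = demo_handler;

        /* common_interrupt() path: EOI here only when dispatch failed */
        if (call_irq_handler(33))
            fake_eoi();
        if (call_irq_handler(34))
            fake_eoi();
        return 0;
    }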
*/ - RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU"); + int ret = 0; desc = __this_cpu_read(vector_irq[vector]); if (likely(!IS_ERR_OR_NULL(desc))) { handle_irq(desc, regs); } else { - apic_eoi(); - + ret = -EINVAL; if (desc == VECTOR_UNUSED) { pr_emerg_ratelimited("%s: %d.%u No irq handler for vector\n", __func__, smp_processor_id(), @@ -267,6 +270,23 @@ DEFINE_IDTENTRY_IRQ(common_interrupt) } } + return ret; +} + +/* + * common_interrupt() handles all normal device IRQ's (the special SMP + * cross-CPU interrupts have their own entry points). + */ +DEFINE_IDTENTRY_IRQ(common_interrupt) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + /* entry code tells RCU that we're not quiescent. Check it. */ + RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU"); + + if (unlikely(call_irq_handler(vector, regs))) + apic_eoi(); + set_irq_regs(old_regs); } @@ -290,7 +310,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_x86_platform_ipi) } #endif -#ifdef CONFIG_HAVE_KVM +#if IS_ENABLED(CONFIG_KVM) static void dummy_handler(void) {} static void (*kvm_posted_intr_wakeup_handler)(void) = dummy_handler; @@ -334,12 +354,139 @@ DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_nested_ipi) } #endif +#ifdef CONFIG_X86_POSTED_MSI + +/* Posted Interrupt Descriptors for coalesced MSIs to be posted */ +DEFINE_PER_CPU_ALIGNED(struct pi_desc, posted_msi_pi_desc); + +void intel_posted_msi_init(void) +{ + u32 destination; + u32 apic_id; + + this_cpu_write(posted_msi_pi_desc.nv, POSTED_MSI_NOTIFICATION_VECTOR); + + /* + * APIC destination ID is stored in bit 8:15 while in XAPIC mode. + * VT-d spec. CH 9.11 + */ + apic_id = this_cpu_read(x86_cpu_to_apicid); + destination = x2apic_enabled() ? apic_id : apic_id << 8; + this_cpu_write(posted_msi_pi_desc.ndst, destination); +} + +/* + * De-multiplexing posted interrupts is on the performance path, the code + * below is written to optimize the cache performance based on the following + * considerations: + * 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently + * accessed by both CPU and IOMMU. + * 2.During posted MSI processing, the CPU needs to do 64-bit read and xchg + * for checking and clearing posted interrupt request (PIR), a 256 bit field + * within the PID. + * 3.On the other side, the IOMMU does atomic swaps of the entire PID cache + * line when posting interrupts and setting control bits. + * 4.The CPU can access the cache line a magnitude faster than the IOMMU. + * 5.Each time the IOMMU does interrupt posting to the PIR will evict the PID + * cache line. The cache line states after each operation are as follows: + * CPU IOMMU PID Cache line state + * --------------------------------------------------------------- + *...read64 exclusive + *...lock xchg64 modified + *... post/atomic swap invalid + *...------------------------------------------------------------- + * + * To reduce L1 data cache miss, it is important to avoid contention with + * IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used + * to dispatch interrupt handlers. + * + * In addition, the code is trying to keep the cache line state consistent + * as much as possible. e.g. 
when making a copy and clearing the PIR + * (assuming non-zero PIR bits are present in the entire PIR), it does: + * read, read, read, read, xchg, xchg, xchg, xchg + * instead of: + * read, xchg, read, xchg, read, xchg, read, xchg + */ +static __always_inline bool handle_pending_pir(u64 *pir, struct pt_regs *regs) +{ + int i, vec = FIRST_EXTERNAL_VECTOR; + unsigned long pir_copy[4]; + bool handled = false; + + for (i = 0; i < 4; i++) + pir_copy[i] = pir[i]; + + for (i = 0; i < 4; i++) { + if (!pir_copy[i]) + continue; + + pir_copy[i] = arch_xchg(&pir[i], 0); + handled = true; + } + + if (handled) { + for_each_set_bit_from(vec, pir_copy, FIRST_SYSTEM_VECTOR) + call_irq_handler(vec, regs); + } + + return handled; +} + +/* + * Performance data shows that 3 is good enough to harvest 90+% of the benefit + * on high IRQ rate workload. + */ +#define MAX_POSTED_MSI_COALESCING_LOOP 3 + +/* + * For MSIs that are delivered as posted interrupts, the CPU notifications + * can be coalesced if the MSIs arrive in high frequency bursts. + */ +DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + struct pi_desc *pid; + int i = 0; + + pid = this_cpu_ptr(&posted_msi_pi_desc); + + inc_irq_stat(posted_msi_notification_count); + irq_enter(); + + /* + * Max coalescing count includes the extra round of handle_pending_pir + * after clearing the outstanding notification bit. Hence, at most + * MAX_POSTED_MSI_COALESCING_LOOP - 1 loops are executed here. + */ + while (++i < MAX_POSTED_MSI_COALESCING_LOOP) { + if (!handle_pending_pir(pid->pir64, regs)) + break; + } + + /* + * Clear outstanding notification bit to allow new IRQ notifications, + * do this last to maximize the window of interrupt coalescing. + */ + pi_clear_on(pid); + + /* + * There could be a race of PI notification and the clearing of ON bit, + * process PIR bits one last time such that handling the new interrupts + * are not delayed until the next IRQ. + */ + handle_pending_pir(pid->pir64, regs); + + apic_eoi(); + irq_exit(); + set_irq_regs(old_regs); +} +#endif /* X86_POSTED_MSI */ #ifdef CONFIG_HOTPLUG_CPU /* A cpu has been removed from cpu_online_mask. Reset irq affinities. 
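A standalone model of the handle_pending_pir() pattern introduced above: snapshot all four PIR words first, atomically clear only the non-zero ones, then dispatch from the snapshot, keeping the reads and the xchg operations grouped as the comment describes. Everything here (dispatch(), the word-count macro, the sample vectors) is illustrative rather than the kernel code, and the real version only scans the external-vector range:

    #include <stdio.h>
    #include <stdint.h>

    #define PIR_WORDS 4              /* 4 x 64 bits = 256 posted-interrupt bits */

    /* Illustrative stand-in for the dispatch done via call_irq_handler() */
    static void dispatch(int vec) { printf("dispatch vector %d\n", vec); }

    static int drain_pir(uint64_t *pir)
    {
        uint64_t copy[PIR_WORDS];
        int i, handled = 0;

        for (i = 0; i < PIR_WORDS; i++)          /* reads grouped together ... */
            copy[i] = pir[i];

        for (i = 0; i < PIR_WORDS; i++) {        /* ... then the exchanges */
            if (!copy[i])
                continue;
            copy[i] = __atomic_exchange_n(&pir[i], 0, __ATOMIC_SEQ_CST);
            handled = 1;
        }

        if (handled) {
            for (i = 0; i < PIR_WORDS * 64; i++)
                if (copy[i / 64] & (1ULL << (i % 64)))
                    dispatch(i);
        }
        return handled;
    }

    int main(void)
    {
        uint64_t pir[PIR_WORDS] = { 0 };

        pir[0] |= 1ULL << 33;                    /* pretend vector 33 was posted */
        pir[1] |= 1ULL << 3;                     /* and vector 67 */
        drain_pir(pir);
        return 0;
    }

Grouping the plain reads before the exchanges is what keeps the PID cache line in a single state transition per pass, which is the point the comment above is making.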
*/ void fixup_irqs(void) { - unsigned int irr, vector; + unsigned int vector; struct irq_desc *desc; struct irq_data *data; struct irq_chip *chip; @@ -366,8 +513,7 @@ void fixup_irqs(void) if (IS_ERR_OR_NULL(__this_cpu_read(vector_irq[vector]))) continue; - irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); - if (irr & (1 << (vector % 32))) { + if (is_vector_pending(vector)) { desc = __this_cpu_read(vector_irq[vector]); raw_spin_lock(&desc->lock); diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index fe0c859873d1..ade0043ce56e 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -18,6 +18,7 @@ #include <linux/uaccess.h> #include <linux/smp.h> #include <linux/sched/task_stack.h> +#include <linux/vmalloc.h> #include <asm/cpu_entry_area.h> #include <asm/softirq_stack.h> diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index c683666876f1..f79c5edc0b89 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -28,6 +28,7 @@ #include <asm/setup.h> #include <asm/i8259.h> #include <asm/traps.h> +#include <asm/fred.h> #include <asm/prom.h> /* @@ -96,7 +97,11 @@ void __init native_init_IRQ(void) /* Execute any quirks before the call gates are initialised: */ x86_init.irqs.pre_vector_init(); - idt_setup_apic_and_irq_gates(); + if (cpu_feature_enabled(X86_FEATURE_FRED)) + fred_complete_exception_setup(); + else + idt_setup_apic_and_irq_gates(); + lapic_assign_system_vectors(); if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs()) { diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c index 9a7c03d47861..9cea1fc36c18 100644 --- a/arch/x86/kernel/itmt.c +++ b/arch/x86/kernel/itmt.c @@ -19,6 +19,7 @@ #include <linux/sched.h> #include <linux/cpumask.h> #include <linux/cpuset.h> +#include <linux/debugfs.h> #include <linux/mutex.h> #include <linux/sysctl.h> #include <linux/nodemask.h> @@ -34,49 +35,38 @@ static bool __read_mostly sched_itmt_capable; * of higher turbo frequency for cpus supporting Intel Turbo Boost Max * Technology 3.0. 
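The fixup_irqs() hunk above replaces an open-coded IRR lookup with is_vector_pending(). The arithmetic being hidden is shown below as a standalone sketch; the 0x200 IRR base offset is an assumption of the example, while the register stride and the bit math come from the removed lines:

    #include <stdio.h>

    #define APIC_IRR 0x200   /* assumed offset of the first IRR register */

    /*
     * The 256 IRR bits live in eight 32-bit registers spaced 0x10 apart,
     * which is what the removed apic_read() expression was indexing.
     */
    static unsigned int irr_reg_offset(unsigned int vector)
    {
        return APIC_IRR + (vector / 32) * 0x10;
    }

    static unsigned int irr_bit(unsigned int vector)
    {
        return vector % 32;
    }

    int main(void)
    {
        unsigned int vector = 0x61;   /* arbitrary example vector */

        printf("vector %#x -> register %#x, bit %u\n",
               vector, irr_reg_offset(vector), irr_bit(vector));
        return 0;
    }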
* - * It can be set via /proc/sys/kernel/sched_itmt_enabled + * It can be set via /sys/kernel/debug/x86/sched_itmt_enabled */ -unsigned int __read_mostly sysctl_sched_itmt_enabled; +bool __read_mostly sysctl_sched_itmt_enabled; -static int sched_itmt_update_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +static ssize_t sched_itmt_enabled_write(struct file *filp, + const char __user *ubuf, + size_t cnt, loff_t *ppos) { - unsigned int old_sysctl; - int ret; + ssize_t result; + bool orig; - mutex_lock(&itmt_update_mutex); + guard(mutex)(&itmt_update_mutex); - if (!sched_itmt_capable) { - mutex_unlock(&itmt_update_mutex); - return -EINVAL; - } - - old_sysctl = sysctl_sched_itmt_enabled; - ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + orig = sysctl_sched_itmt_enabled; + result = debugfs_write_file_bool(filp, ubuf, cnt, ppos); - if (!ret && write && old_sysctl != sysctl_sched_itmt_enabled) { + if (sysctl_sched_itmt_enabled != orig) { x86_topology_update = true; rebuild_sched_domains(); } - mutex_unlock(&itmt_update_mutex); - - return ret; + return result; } -static struct ctl_table itmt_kern_table[] = { - { - .procname = "sched_itmt_enabled", - .data = &sysctl_sched_itmt_enabled, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sched_itmt_update_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, +static const struct file_operations dfs_sched_itmt_fops = { + .read = debugfs_read_file_bool, + .write = sched_itmt_enabled_write, + .open = simple_open, + .llseek = default_llseek, }; -static struct ctl_table_header *itmt_sysctl_header; +static struct dentry *dfs_sched_itmt; /** * sched_set_itmt_support() - Indicate platform supports ITMT @@ -97,16 +87,18 @@ static struct ctl_table_header *itmt_sysctl_header; */ int sched_set_itmt_support(void) { - mutex_lock(&itmt_update_mutex); + guard(mutex)(&itmt_update_mutex); - if (sched_itmt_capable) { - mutex_unlock(&itmt_update_mutex); + if (sched_itmt_capable) return 0; - } - itmt_sysctl_header = register_sysctl("kernel", itmt_kern_table); - if (!itmt_sysctl_header) { - mutex_unlock(&itmt_update_mutex); + dfs_sched_itmt = debugfs_create_file_unsafe("sched_itmt_enabled", + 0644, + arch_debugfs_dir, + &sysctl_sched_itmt_enabled, + &dfs_sched_itmt_fops); + if (IS_ERR_OR_NULL(dfs_sched_itmt)) { + dfs_sched_itmt = NULL; return -ENOMEM; } @@ -117,8 +109,6 @@ int sched_set_itmt_support(void) x86_topology_update = true; rebuild_sched_domains(); - mutex_unlock(&itmt_update_mutex); - return 0; } @@ -134,18 +124,15 @@ int sched_set_itmt_support(void) */ void sched_clear_itmt_support(void) { - mutex_lock(&itmt_update_mutex); + guard(mutex)(&itmt_update_mutex); - if (!sched_itmt_capable) { - mutex_unlock(&itmt_update_mutex); + if (!sched_itmt_capable) return; - } + sched_itmt_capable = false; - if (itmt_sysctl_header) { - unregister_sysctl_table(itmt_sysctl_header); - itmt_sysctl_header = NULL; - } + debugfs_remove(dfs_sched_itmt); + dfs_sched_itmt = NULL; if (sysctl_sched_itmt_enabled) { /* disable sched_itmt if we are no longer ITMT capable */ @@ -153,8 +140,6 @@ void sched_clear_itmt_support(void) x86_topology_update = true; rebuild_sched_domains(); } - - mutex_unlock(&itmt_update_mutex); } int arch_asym_cpu_priority(int cpu) diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c index 578d16fc040f..cd8ed1edbf9e 100644 --- a/arch/x86/kernel/jailhouse.c +++ b/arch/x86/kernel/jailhouse.c @@ -12,6 +12,7 @@ #include <linux/kernel.h> #include <linux/reboot.h> #include 
<linux/serial_8250.h> +#include <linux/acpi.h> #include <asm/apic.h> #include <asm/io_apic.h> #include <asm/acpi.h> @@ -89,7 +90,7 @@ static void __init jailhouse_x2apic_init(void) #endif } -static void __init jailhouse_get_smp_config(unsigned int early) +static void __init jailhouse_parse_smp_config(void) { struct ioapic_domain_cfg ioapic_cfg = { .type = IOAPIC_DOMAIN_STRICT, @@ -102,7 +103,7 @@ static void __init jailhouse_get_smp_config(unsigned int early) register_lapic_address(0xfee00000); for (cpu = 0; cpu < setup_data.v1.num_cpus; cpu++) - generic_processor_info(setup_data.v1.cpu_ids[cpu]); + topology_register_apic(setup_data.v1.cpu_ids[cpu], CPU_ACPIID_INVALID, true); smp_found_config = 1; @@ -201,21 +202,23 @@ static void __init jailhouse_init_platform(void) struct setup_data header; void *mapping; - x86_init.irqs.pre_vector_init = x86_init_noop; - x86_init.timers.timer_init = jailhouse_timer_init; - x86_init.mpparse.get_smp_config = jailhouse_get_smp_config; - x86_init.pci.arch_init = jailhouse_pci_arch_init; + x86_init.irqs.pre_vector_init = x86_init_noop; + x86_init.timers.timer_init = jailhouse_timer_init; + x86_init.mpparse.find_mptable = x86_init_noop; + x86_init.mpparse.early_parse_smp_cfg = x86_init_noop; + x86_init.mpparse.parse_smp_cfg = jailhouse_parse_smp_config; + x86_init.pci.arch_init = jailhouse_pci_arch_init; - x86_platform.calibrate_cpu = jailhouse_get_tsc; - x86_platform.calibrate_tsc = jailhouse_get_tsc; - x86_platform.get_wallclock = jailhouse_get_wallclock; - x86_platform.legacy.rtc = 0; - x86_platform.legacy.warm_reset = 0; - x86_platform.legacy.i8042 = X86_LEGACY_I8042_PLATFORM_ABSENT; + x86_platform.calibrate_cpu = jailhouse_get_tsc; + x86_platform.calibrate_tsc = jailhouse_get_tsc; + x86_platform.get_wallclock = jailhouse_get_wallclock; + x86_platform.legacy.rtc = 0; + x86_platform.legacy.warm_reset = 0; + x86_platform.legacy.i8042 = X86_LEGACY_I8042_PLATFORM_ABSENT; - legacy_pic = &null_legacy_pic; + legacy_pic = &null_legacy_pic; - machine_ops.emergency_restart = jailhouse_no_restart; + machine_ops.emergency_restart = jailhouse_no_restart; while (pa_data) { mapping = early_memremap(pa_data, sizeof(header)); diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index a61c12c01270..68530fad05f7 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -82,7 +82,7 @@ static int setup_cmdline(struct kimage *image, struct boot_params *params, cmdline_ptr[cmdline_len - 1] = '\0'; - pr_debug("Final command line is: %s\n", cmdline_ptr); + kexec_dprintk("Final command line is: %s\n", cmdline_ptr); cmdline_ptr_phys = bootparams_load_addr + cmdline_offset; cmdline_low_32 = cmdline_ptr_phys & 0xffffffffUL; cmdline_ext_32 = cmdline_ptr_phys >> 32; @@ -263,16 +263,23 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params, memset(¶ms->hd0_info, 0, sizeof(params->hd0_info)); memset(¶ms->hd1_info, 0, sizeof(params->hd1_info)); +#ifdef CONFIG_CRASH_DUMP if (image->type == KEXEC_TYPE_CRASH) { ret = crash_setup_memmap_entries(image, params); if (ret) return ret; } else +#endif setup_e820_entries(params); nr_e820_entries = params->e820_entries; + kexec_dprintk("E820 memmap:\n"); for (i = 0; i < nr_e820_entries; i++) { + kexec_dprintk("%016llx-%016llx (%d)\n", + params->e820_table[i].addr, + params->e820_table[i].addr + params->e820_table[i].size - 1, + params->e820_table[i].type); if (params->e820_table[i].type != E820_TYPE_RAM) continue; start = params->e820_table[i].addr; @@ -424,16 +431,18 @@ 
static void *bzImage64_load(struct kimage *image, char *kernel, * command line. Make sure it does not overflow */ if (cmdline_len + MAX_ELFCOREHDR_STR_LEN > header->cmdline_size) { - pr_debug("Appending elfcorehdr=<addr> to command line exceeds maximum allowed length\n"); + pr_err("Appending elfcorehdr=<addr> to command line exceeds maximum allowed length\n"); return ERR_PTR(-EINVAL); } +#ifdef CONFIG_CRASH_DUMP /* Allocate and load backup region */ if (image->type == KEXEC_TYPE_CRASH) { ret = crash_load_segments(image); if (ret) return ERR_PTR(ret); } +#endif /* * Load purgatory. For 64bit entry point, purgatory code can be @@ -445,7 +454,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel, return ERR_PTR(ret); } - pr_debug("Loaded purgatory at 0x%lx\n", pbuf.mem); + kexec_dprintk("Loaded purgatory at 0x%lx\n", pbuf.mem); /* @@ -490,23 +499,26 @@ static void *bzImage64_load(struct kimage *image, char *kernel, if (ret) goto out_free_params; bootparam_load_addr = kbuf.mem; - pr_debug("Loaded boot_param, command line and misc at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - bootparam_load_addr, kbuf.bufsz, kbuf.bufsz); + kexec_dprintk("Loaded boot_param, command line and misc at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + bootparam_load_addr, kbuf.bufsz, kbuf.memsz); /* Load kernel */ kbuf.buffer = kernel + kern16_size; kbuf.bufsz = kernel_len - kern16_size; kbuf.memsz = PAGE_ALIGN(header->init_size); kbuf.buf_align = header->kernel_alignment; - kbuf.buf_min = MIN_KERNEL_LOAD_ADDR; + if (header->pref_address < MIN_KERNEL_LOAD_ADDR) + kbuf.buf_min = MIN_KERNEL_LOAD_ADDR; + else + kbuf.buf_min = header->pref_address; kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; ret = kexec_add_buffer(&kbuf); if (ret) goto out_free_params; kernel_load_addr = kbuf.mem; - pr_debug("Loaded 64bit kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - kernel_load_addr, kbuf.bufsz, kbuf.memsz); + kexec_dprintk("Loaded 64bit kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + kernel_load_addr, kbuf.bufsz, kbuf.memsz); /* Load initrd high */ if (initrd) { @@ -520,8 +532,8 @@ static void *bzImage64_load(struct kimage *image, char *kernel, goto out_free_params; initrd_load_addr = kbuf.mem; - pr_debug("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - initrd_load_addr, initrd_len, initrd_len); + kexec_dprintk("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + initrd_load_addr, initrd_len, initrd_len); setup_initrd(params, initrd_load_addr, initrd_len); } diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h index c993521d4933..e772276f5aa9 100644 --- a/arch/x86/kernel/kprobes/common.h +++ b/arch/x86/kernel/kprobes/common.h @@ -78,7 +78,7 @@ #endif /* Ensure if the instruction can be boostable */ -extern int can_boost(struct insn *insn, void *orig_addr); +extern bool can_boost(struct insn *insn, void *orig_addr); /* Recover instruction if given address is probed */ extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr); diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index a0ce46c0a2d8..72e6a45e7ec2 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -40,12 +40,12 @@ #include <linux/kgdb.h> #include <linux/ftrace.h> #include <linux/kasan.h> -#include <linux/moduleloader.h> #include <linux/objtool.h> #include <linux/vmalloc.h> #include <linux/pgtable.h> #include <linux/set_memory.h> #include <linux/cfi.h> +#include <linux/execmem.h> #include <asm/text-patching.h> #include <asm/cacheflush.h> @@ -137,14 +137,14 @@ 
NOKPROBE_SYMBOL(synthesize_relcall); * Returns non-zero if INSN is boostable. * RIP relative instructions are adjusted at copying time in 64 bits mode */ -int can_boost(struct insn *insn, void *addr) +bool can_boost(struct insn *insn, void *addr) { kprobe_opcode_t opcode; insn_byte_t prefix; int i; if (search_exception_tables((unsigned long)addr)) - return 0; /* Page fault may occur on this address. */ + return false; /* Page fault may occur on this address. */ /* 2nd-byte opcode */ if (insn->opcode.nbytes == 2) @@ -152,7 +152,7 @@ int can_boost(struct insn *insn, void *addr) (unsigned long *)twobyte_is_boostable); if (insn->opcode.nbytes != 1) - return 0; + return false; for_each_insn_prefix(insn, i, prefix) { insn_attr_t attr; @@ -160,7 +160,7 @@ int can_boost(struct insn *insn, void *addr) attr = inat_get_opcode_attribute(prefix); /* Can't boost Address-size override prefix and CS override prefix */ if (prefix == 0x2e || inat_is_address_size_prefix(attr)) - return 0; + return false; } opcode = insn->opcode.bytes[0]; @@ -169,24 +169,35 @@ int can_boost(struct insn *insn, void *addr) case 0x62: /* bound */ case 0x70 ... 0x7f: /* Conditional jumps */ case 0x9a: /* Call far */ - case 0xc0 ... 0xc1: /* Grp2 */ case 0xcc ... 0xce: /* software exceptions */ - case 0xd0 ... 0xd3: /* Grp2 */ case 0xd6: /* (UD) */ case 0xd8 ... 0xdf: /* ESC */ case 0xe0 ... 0xe3: /* LOOP*, JCXZ */ case 0xe8 ... 0xe9: /* near Call, JMP */ case 0xeb: /* Short JMP */ case 0xf0 ... 0xf4: /* LOCK/REP, HLT */ + /* ... are not boostable */ + return false; + case 0xc0 ... 0xc1: /* Grp2 */ + case 0xd0 ... 0xd3: /* Grp2 */ + /* + * AMD uses nnn == 110 as SHL/SAL, but Intel makes it reserved. + */ + return X86_MODRM_REG(insn->modrm.bytes[0]) != 0b110; case 0xf6 ... 0xf7: /* Grp3 */ + /* AMD uses nnn == 001 as TEST, but Intel makes it reserved. */ + return X86_MODRM_REG(insn->modrm.bytes[0]) != 0b001; case 0xfe: /* Grp4 */ - /* ... 
are not boostable */ - return 0; + /* Only INC and DEC are boostable */ + return X86_MODRM_REG(insn->modrm.bytes[0]) == 0b000 || + X86_MODRM_REG(insn->modrm.bytes[0]) == 0b001; case 0xff: /* Grp5 */ - /* Only indirect jmp is boostable */ - return X86_MODRM_REG(insn->modrm.bytes[0]) == 4; + /* Only INC, DEC, and indirect JMP are boostable */ + return X86_MODRM_REG(insn->modrm.bytes[0]) == 0b000 || + X86_MODRM_REG(insn->modrm.bytes[0]) == 0b001 || + X86_MODRM_REG(insn->modrm.bytes[0]) == 0b100; default: - return 1; + return true; } } @@ -252,21 +263,40 @@ unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long add return __recover_probed_insn(buf, addr); } -/* Check if paddr is at an instruction boundary */ -static int can_probe(unsigned long paddr) +/* Check if insn is INT or UD */ +static inline bool is_exception_insn(struct insn *insn) +{ + /* UD uses 0f escape */ + if (insn->opcode.bytes[0] == 0x0f) { + /* UD0 / UD1 / UD2 */ + return insn->opcode.bytes[1] == 0xff || + insn->opcode.bytes[1] == 0xb9 || + insn->opcode.bytes[1] == 0x0b; + } + + /* INT3 / INT n / INTO / INT1 */ + return insn->opcode.bytes[0] == 0xcc || + insn->opcode.bytes[0] == 0xcd || + insn->opcode.bytes[0] == 0xce || + insn->opcode.bytes[0] == 0xf1; +} + +/* + * Check if paddr is at an instruction boundary and that instruction can + * be probed + */ +static bool can_probe(unsigned long paddr) { unsigned long addr, __addr, offset = 0; struct insn insn; kprobe_opcode_t buf[MAX_INSN_SIZE]; if (!kallsyms_lookup_size_offset(paddr, NULL, &offset)) - return 0; + return false; /* Decode instructions */ addr = paddr - offset; while (addr < paddr) { - int ret; - /* * Check if the instruction has been modified by another * kprobe, in which case we replace the breakpoint by the @@ -277,11 +307,10 @@ static int can_probe(unsigned long paddr) */ __addr = recover_probed_instruction(buf, addr); if (!__addr) - return 0; + return false; - ret = insn_decode_kernel(&insn, (void *)__addr); - if (ret < 0) - return 0; + if (insn_decode_kernel(&insn, (void *)__addr) < 0) + return false; #ifdef CONFIG_KGDB /* @@ -290,10 +319,26 @@ static int can_probe(unsigned long paddr) */ if (insn.opcode.bytes[0] == INT3_INSN_OPCODE && kgdb_has_hit_break(addr)) - return 0; + return false; #endif addr += insn.length; } + + /* Check if paddr is at an instruction boundary */ + if (addr != paddr) + return false; + + __addr = recover_probed_instruction(buf, addr); + if (!__addr) + return false; + + if (insn_decode_kernel(&insn, (void *)__addr) < 0) + return false; + + /* INT and UD are special and should not be kprobed */ + if (is_exception_insn(&insn)) + return false; + if (IS_ENABLED(CONFIG_CFI_CLANG)) { /* * The compiler generates the following instruction sequence @@ -308,13 +353,6 @@ static int can_probe(unsigned long paddr) * Also, these movl and addl are used for showing expected * type. So those must not be touched. */ - __addr = recover_probed_instruction(buf, addr); - if (!__addr) - return 0; - - if (insn_decode_kernel(&insn, (void *)__addr) < 0) - return 0; - if (insn.opcode.value == 0xBA) offset = 12; else if (insn.opcode.value == 0x3) @@ -324,18 +362,27 @@ static int can_probe(unsigned long paddr) /* This movl/addl is used for decoding CFI. */ if (is_cfi_trap(addr + offset)) - return 0; + return false; } out: - return (addr == paddr); + return true; } /* If x86 supports IBT (ENDBR) it must be skipped. 
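The can_boost() changes above key off the ModRM reg/nnn field. A tiny worked example of that decoding (standalone; which nnn values are INC/DEC/indirect JMP is taken from the hunk itself):

    #include <stdio.h>

    /* ModRM reg/nnn field: bits 5:3, same as the kernel's X86_MODRM_REG() */
    static unsigned int modrm_reg(unsigned char modrm)
    {
        return (modrm >> 3) & 0x7;
    }

    /* Grp5 (opcode 0xff) boostability per the hunk above:
     * nnn==000 INC, nnn==001 DEC, nnn==100 indirect JMP near. */
    static int grp5_boostable(unsigned char modrm)
    {
        unsigned int reg = modrm_reg(modrm);

        return reg == 0 || reg == 1 || reg == 4;
    }

    int main(void)
    {
        /* ff /4: jmp *%rax has ModRM 0xe0 -> reg field 4 */
        printf("jmp *%%rax boostable: %d\n", grp5_boostable(0xe0));
        /* ff /2: call *%rax has ModRM 0xd0 -> reg field 2 */
        printf("call *%%rax boostable: %d\n", grp5_boostable(0xd0));
        return 0;
    }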
*/ kprobe_opcode_t *arch_adjust_kprobe_addr(unsigned long addr, unsigned long offset, bool *on_func_entry) { - if (is_endbr(*(u32 *)addr)) { + u32 insn; + + /* + * Since 'addr' is not guaranteed to be safe to access, use + * copy_from_kernel_nofault() to read the instruction: + */ + if (copy_from_kernel_nofault(&insn, (void *)addr, sizeof(u32))) + return NULL; + + if (is_endbr(insn)) { *on_func_entry = !offset || offset == 4; if (*on_func_entry) offset = 4; @@ -448,7 +495,7 @@ void *alloc_insn_page(void) { void *page; - page = module_alloc(PAGE_SIZE); + page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE); if (!page) return NULL; diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index dd2ec14adb77..2be55ec3f392 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -9,6 +9,7 @@ #include <linux/hardirq.h> #include <linux/preempt.h> #include <linux/ftrace.h> +#include <asm/text-patching.h> #include "common.h" @@ -21,6 +22,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct kprobe_ctlblk *kcb; int bit; + if (unlikely(kprobe_ftrace_disabled)) + return; + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; @@ -33,23 +37,25 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, if (kprobe_running()) { kprobes_inc_nmissed_count(p); } else { - unsigned long orig_ip = regs->ip; + unsigned long orig_ip = instruction_pointer(regs); + /* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */ - regs->ip = ip + sizeof(kprobe_opcode_t); + instruction_pointer_set(regs, ip + INT3_INSN_SIZE); __this_cpu_write(current_kprobe, p); kcb->kprobe_status = KPROBE_HIT_ACTIVE; if (!p->pre_handler || !p->pre_handler(p, regs)) { - /* - * Emulate singlestep (and also recover regs->ip) - * as if there is a 5byte nop - */ - regs->ip = (unsigned long)p->addr + MCOUNT_INSN_SIZE; if (unlikely(p->post_handler)) { + /* + * Emulate singlestep (and also recover regs->ip) + * as if there is a 5byte nop + */ + instruction_pointer_set(regs, ip + MCOUNT_INSN_SIZE); kcb->kprobe_status = KPROBE_HIT_SSDONE; p->post_handler(p, regs, 0); } - regs->ip = orig_ip; + /* Recover IP address */ + instruction_pointer_set(regs, orig_ip); } /* * If pre_handler returns !0, it changes regs->ip. We have to diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 517821b48391..36d6809c6c9e 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -324,7 +324,7 @@ static int can_optimize(unsigned long paddr) * However, the kernel built with retpolines or IBT has jump * tables disabled so the check can be skipped altogether. 
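For the arch_adjust_kprobe_addr() change above, a rough standalone sketch of the entry-offset adjustment once the instruction bytes have been read safely. The ENDBR64 byte pattern and the non-ENDBR branch are assumptions of this sketch, and the kernel's is_endbr() check is more involved than a memcmp():

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    /* Assumed ENDBR64 encoding used only for this illustration */
    static int looks_like_endbr64(const uint8_t *p)
    {
        static const uint8_t endbr64[4] = { 0xf3, 0x0f, 0x1e, 0xfa };

        return memcmp(p, endbr64, sizeof(endbr64)) == 0;
    }

    /* Model of the entry-offset adjustment: a probe at offset 0 or 4 of an
     * IBT-instrumented function counts as "on function entry" and is placed
     * after the 4-byte ENDBR. */
    static unsigned long adjust_entry_offset(const uint8_t *func, unsigned long offset,
                                             int *on_func_entry)
    {
        if (looks_like_endbr64(func)) {
            *on_func_entry = (offset == 0 || offset == 4);
            if (*on_func_entry)
                offset = 4;
        } else {
            *on_func_entry = (offset == 0);   /* simplification of this sketch */
        }
        return offset;
    }

    int main(void)
    {
        uint8_t fake_func[8] = { 0xf3, 0x0f, 0x1e, 0xfa, 0x55, 0x48, 0x89, 0xe5 };
        int entry;
        unsigned long off = adjust_entry_offset(fake_func, 0, &entry);

        printf("offset %lu, on_func_entry=%d\n", off, entry);
        return 0;
    }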
*/ - if (!IS_ENABLED(CONFIG_RETPOLINE) && + if (!IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && !IS_ENABLED(CONFIG_X86_KERNEL_IBT) && insn_is_indirect_jump(&insn)) return 0; diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c index 257892fcefa7..b68d4be9464e 100644 --- a/arch/x86/kernel/ksysfs.c +++ b/arch/x86/kernel/ksysfs.c @@ -28,19 +28,19 @@ static ssize_t version_show(struct kobject *kobj, static struct kobj_attribute boot_params_version_attr = __ATTR_RO(version); static ssize_t boot_params_data_read(struct file *fp, struct kobject *kobj, - struct bin_attribute *bin_attr, + const struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { memcpy(buf, (void *)&boot_params + off, count); return count; } -static struct bin_attribute boot_params_data_attr = { +static const struct bin_attribute boot_params_data_attr = { .attr = { .name = "data", .mode = S_IRUGO, }, - .read = boot_params_data_read, + .read_new = boot_params_data_read, .size = sizeof(boot_params), }; @@ -49,14 +49,14 @@ static struct attribute *boot_params_version_attrs[] = { NULL, }; -static struct bin_attribute *boot_params_data_attrs[] = { +static const struct bin_attribute *const boot_params_data_attrs[] = { &boot_params_data_attr, NULL, }; static const struct attribute_group boot_params_attr_group = { .attrs = boot_params_version_attrs, - .bin_attrs = boot_params_data_attrs, + .bin_attrs_new = boot_params_data_attrs, }; static int kobj_to_setup_data_nr(struct kobject *kobj, int *nr) @@ -172,7 +172,7 @@ static ssize_t type_show(struct kobject *kobj, static ssize_t setup_data_data_read(struct file *fp, struct kobject *kobj, - struct bin_attribute *bin_attr, + const struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { @@ -250,7 +250,7 @@ static struct bin_attribute data_attr __ro_after_init = { .name = "data", .mode = S_IRUGO, }, - .read = setup_data_data_read, + .read_new = setup_data_data_read, }; static struct attribute *setup_data_type_attrs[] = { @@ -258,14 +258,14 @@ static struct attribute *setup_data_type_attrs[] = { NULL, }; -static struct bin_attribute *setup_data_data_attrs[] = { +static const struct bin_attribute *const setup_data_data_attrs[] = { &data_attr, NULL, }; static const struct attribute_group setup_data_attr_group = { .attrs = setup_data_type_attrs, - .bin_attrs = setup_data_data_attrs, + .bin_attrs_new = setup_data_data_attrs, }; static int __init create_setup_data_node(struct kobject *parent, diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 0ddb3bd0f1aa..7a422a6c5983 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -37,6 +37,7 @@ #include <asm/apic.h> #include <asm/apicdef.h> #include <asm/hypervisor.h> +#include <asm/mtrr.h> #include <asm/tlb.h> #include <asm/cpuidle_haltpoll.h> #include <asm/ptrace.h> @@ -44,7 +45,7 @@ #include <asm/svm.h> #include <asm/e820/api.h> -DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled); +DEFINE_STATIC_KEY_FALSE_RO(kvm_async_pf_enabled); static int kvmapf = 1; @@ -65,6 +66,7 @@ static int __init parse_no_stealacc(char *arg) early_param("no-steal-acc", parse_no_stealacc); +static DEFINE_PER_CPU_READ_MOSTLY(bool, async_pf_enabled); static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible; static int has_steal_clock = 0; @@ -244,7 +246,7 @@ noinstr u32 kvm_read_and_reset_apf_flags(void) { u32 flags = 0; - if (__this_cpu_read(apf_reason.enabled)) { + if (__this_cpu_read(async_pf_enabled)) { 
flags = __this_cpu_read(apf_reason.flags); __this_cpu_write(apf_reason.flags, 0); } @@ -295,7 +297,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt) inc_irq_stat(irq_hv_callback_count); - if (__this_cpu_read(apf_reason.enabled)) { + if (__this_cpu_read(async_pf_enabled)) { token = __this_cpu_read(apf_reason.token); kvm_async_pf_task_wake(token); __this_cpu_write(apf_reason.token, 0); @@ -362,7 +364,7 @@ static void kvm_guest_cpu_init(void) wrmsrl(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR); wrmsrl(MSR_KVM_ASYNC_PF_EN, pa); - __this_cpu_write(apf_reason.enabled, 1); + __this_cpu_write(async_pf_enabled, true); pr_debug("setup async PF for cpu %d\n", smp_processor_id()); } @@ -383,11 +385,11 @@ static void kvm_guest_cpu_init(void) static void kvm_pv_disable_apf(void) { - if (!__this_cpu_read(apf_reason.enabled)) + if (!__this_cpu_read(async_pf_enabled)) return; wrmsrl(MSR_KVM_ASYNC_PF_EN, 0); - __this_cpu_write(apf_reason.enabled, 0); + __this_cpu_write(async_pf_enabled, false); pr_debug("disable async PF for cpu %d\n", smp_processor_id()); } @@ -434,7 +436,8 @@ static void __init sev_map_percpu_data(void) { int cpu; - if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) + if (cc_vendor != CC_VENDOR_AMD || + !cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) return; for_each_possible_cpu(cpu) { @@ -769,7 +772,7 @@ static struct notifier_block kvm_pv_reboot_nb = { * won't be valid. In cases like kexec, in which you install a new kernel, this * means a random memory location will be kept being written. */ -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_DUMP static void kvm_crash_shutdown(struct pt_regs *regs) { kvm_guest_cpu_offline(true); @@ -803,8 +806,8 @@ extern bool __raw_callee_save___kvm_vcpu_is_preempted(long); "cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax)\n\t" \ "setne %al\n\t" -DEFINE_PARAVIRT_ASM(__raw_callee_save___kvm_vcpu_is_preempted, - PV_VCPU_PREEMPTED_ASM, .text); +DEFINE_ASM_FUNC(__raw_callee_save___kvm_vcpu_is_preempted, + PV_VCPU_PREEMPTED_ASM, .text); #endif static void __init kvm_guest_init(void) @@ -829,7 +832,7 @@ static void __init kvm_guest_init(void) if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) { static_branch_enable(&kvm_async_pf_enabled); - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_kvm_asyncpf_interrupt); + sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_kvm_asyncpf_interrupt); } #ifdef CONFIG_SMP @@ -852,7 +855,7 @@ static void __init kvm_guest_init(void) kvm_guest_cpu_init(); #endif -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_DUMP machine_ops.crash_shutdown = kvm_crash_shutdown; #endif @@ -942,7 +945,7 @@ static void __init kvm_init_platform(void) * Reset the host's shared pages list related to kernel * specific page encryption status settings before we load a * new kernel by kexec. Reset the page encryption status - * during early boot intead of just before kexec to avoid SMP + * during early boot instead of just before kexec to avoid SMP * races during kvm_pv_guest_cpu_reboot(). 
* NOTE: We cannot reset the complete shared pages list * here as we need to retain the UEFI/OVMF firmware @@ -978,6 +981,9 @@ static void __init kvm_init_platform(void) } kvmclock_init(); x86_platform.apic_post_init = kvm_apic_init; + + /* Set WB as the default cache mode for SEV-SNP and TDX */ + guest_force_mtrr_state(NULL, 0, MTRR_TYPE_WRBACK); } #if defined(CONFIG_AMD_MEM_ENCRYPT) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index fb8f52149be9..5b2c15214a6b 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -24,8 +24,8 @@ static int kvmclock __initdata = 1; static int kvmclock_vsyscall __initdata = 1; -static int msr_kvm_system_time __ro_after_init = MSR_KVM_SYSTEM_TIME; -static int msr_kvm_wall_clock __ro_after_init = MSR_KVM_WALL_CLOCK; +static int msr_kvm_system_time __ro_after_init; +static int msr_kvm_wall_clock __ro_after_init; static u64 kvm_sched_clock_offset __ro_after_init; static int __init parse_no_kvmclock(char *arg) @@ -42,7 +42,7 @@ static int __init parse_no_kvmclock_vsyscall(char *arg) } early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); -/* Aligned to page sizes to match whats mapped via vsyscalls to userspace */ +/* Aligned to page sizes to match what's mapped via vsyscalls to userspace */ #define HVC_BOOT_ARRAY_SIZE \ (PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info)) @@ -154,15 +154,15 @@ static int kvm_cs_enable(struct clocksource *cs) return 0; } -struct clocksource kvm_clock = { +static struct clocksource kvm_clock = { .name = "kvm-clock", .read = kvm_clock_get_cycles, .rating = 400, .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .id = CSID_X86_KVM_CLK, .enable = kvm_cs_enable, }; -EXPORT_SYMBOL_GPL(kvm_clock); static void kvm_register_clock(char *txt) { @@ -195,7 +195,8 @@ static void kvm_setup_secondary_clock(void) void kvmclock_disable(void) { - native_write_msr(msr_kvm_system_time, 0, 0); + if (msr_kvm_system_time) + native_write_msr(msr_kvm_system_time, 0, 0); } static void __init kvmclock_init_mem(void) @@ -294,7 +295,10 @@ void __init kvmclock_init(void) if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) { msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW; msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW; - } else if (!kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { + } else if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { + msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; + msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; + } else { return; } diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index adc67f98819a..0f19ef355f5f 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -7,7 +7,7 @@ * This handles calls from both 32bit and 64bit mode. * * Lock order: - * contex.ldt_usr_sem + * context.ldt_usr_sem * mmap_lock * context.lock */ @@ -49,7 +49,7 @@ void load_mm_ldt(struct mm_struct *mm) /* * Any change to mm->context.ldt is followed by an IPI to all * CPUs with the mm active. The LDT will not be freed until - * after the IPI is handled by all such CPUs. This means that, + * after the IPI is handled by all such CPUs. This means that * if the ldt_struct changes before we return, the values we see * will be safe, and the new values will be loaded before we run * any user code. 
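With the kvmclock hunks above, the MSR numbers now default to 0 and are only filled in once one of the two CLOCKSOURCE flavours is detected, so the guarded kvmclock_disable() no longer writes an MSR on guests that never had kvmclock. A standalone sketch of that selection and guard; the feature flags and the MSR numbers are assumptions of the demo:

    #include <stdio.h>

    /* Hypothetical flags standing in for kvm_para_has_feature() results */
    static int has_clocksource2 = 0;
    static int has_clocksource  = 1;

    static int msr_system_time;   /* 0 now means "kvmclock never detected" */
    static int msr_wall_clock;

    /* Assumed MSR numbers, purely for the demo output */
    #define MSR_SYSTEM_TIME_NEW 0x4b564d01
    #define MSR_WALL_CLOCK_NEW  0x4b564d00
    #define MSR_SYSTEM_TIME_OLD 0x12
    #define MSR_WALL_CLOCK_OLD  0x11

    static void clock_init(void)
    {
        if (has_clocksource2) {
            msr_system_time = MSR_SYSTEM_TIME_NEW;
            msr_wall_clock  = MSR_WALL_CLOCK_NEW;
        } else if (has_clocksource) {            /* the new explicit legacy branch */
            msr_system_time = MSR_SYSTEM_TIME_OLD;
            msr_wall_clock  = MSR_WALL_CLOCK_OLD;
        } else {
            return;                              /* neither flavour: MSRs stay 0 */
        }
        printf("using MSRs %#x/%#x\n", msr_system_time, msr_wall_clock);
    }

    static void clock_disable(void)
    {
        /* mirrors the new guard in kvmclock_disable() */
        if (msr_system_time)
            printf("wrmsr(%#x, 0)\n", msr_system_time);
        else
            printf("kvmclock was never set up, nothing to do\n");
    }

    int main(void)
    {
        clock_init();
        clock_disable();
        return 0;
    }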
@@ -184,7 +184,7 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries) return new_ldt; } -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION static void do_sanity_check(struct mm_struct *mm, bool had_kernel_mapping, @@ -377,7 +377,7 @@ static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt) flush_tlb_mm_range(mm, va, va + nr_pages * PAGE_SIZE, PAGE_SHIFT, false); } -#else /* !CONFIG_PAGE_TABLE_ISOLATION */ +#else /* !CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */ static int map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) @@ -388,11 +388,11 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt) { } -#endif /* CONFIG_PAGE_TABLE_ISOLATION */ +#endif /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */ static void free_ldt_pgtables(struct mm_struct *mm) { -#ifdef CONFIG_PAGE_TABLE_ISOLATION +#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION struct mmu_gather tlb; unsigned long start = LDT_BASE_ADDR; unsigned long end = LDT_END_ADDR; @@ -685,7 +685,7 @@ SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr , } /* * The SYSCALL_DEFINE() macros give us an 'unsigned long' - * return type, but tht ABI for sys_modify_ldt() expects + * return type, but the ABI for sys_modify_ldt() expects * 'int'. This cast gives us an int-sized value in %rax * for the return code. The 'unsigned' is necessary so * the compiler does not try to sign-extend the negative diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 1b373d79cedc..80265162aeff 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -160,15 +160,10 @@ void machine_kexec_cleanup(struct kimage *image) */ void machine_kexec(struct kimage *image) { + relocate_kernel_fn *relocate_kernel_ptr; unsigned long page_list[PAGES_NR]; void *control_page; int save_ftrace_enabled; - asmlinkage unsigned long - (*relocate_kernel_ptr)(unsigned long indirection_page, - unsigned long control_page, - unsigned long start_address, - unsigned int has_pae, - unsigned int preserve_context); #ifdef CONFIG_KEXEC_JUMP if (image->preserve_context) diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 1a3e2c05a8a5..a68f5a0a9f37 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -28,6 +28,7 @@ #include <asm/setup.h> #include <asm/set_memory.h> #include <asm/cpu.h> +#include <asm/efi.h> #ifdef CONFIG_ACPI /* @@ -42,12 +43,9 @@ struct init_pgtable_data { static int mem_region_callback(struct resource *res, void *arg) { struct init_pgtable_data *data = arg; - unsigned long mstart, mend; - - mstart = res->start; - mend = mstart + resource_size(res) - 1; - return kernel_ident_mapping_init(data->info, data->level4p, mstart, mend); + return kernel_ident_mapping_init(data->info, data->level4p, + res->start, res->end + 1); } static int @@ -90,6 +88,8 @@ map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p) { #ifdef CONFIG_EFI unsigned long mstart, mend; + void *kaddr; + int ret; if (!efi_enabled(EFI_BOOT)) return 0; @@ -105,6 +105,30 @@ map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p) if (!mstart) return 0; + ret = kernel_ident_mapping_init(info, level4p, mstart, mend); + if (ret) + return ret; + + kaddr = memremap(mstart, mend - mstart, MEMREMAP_WB); + if (!kaddr) { + pr_err("Could not map UEFI system table\n"); + return -ENOMEM; + } + + mstart = 
efi_config_table; + + if (efi_enabled(EFI_64BIT)) { + efi_system_table_64_t *stbl = (efi_system_table_64_t *)kaddr; + + mend = mstart + sizeof(efi_config_table_64_t) * stbl->nr_tables; + } else { + efi_system_table_32_t *stbl = (efi_system_table_32_t *)kaddr; + + mend = mstart + sizeof(efi_config_table_32_t) * stbl->nr_tables; + } + + memunmap(kaddr); + return kernel_ident_mapping_init(info, level4p, mstart, mend); #endif return 0; @@ -122,7 +146,8 @@ static void free_transition_pgtable(struct kimage *image) image->arch.pte = NULL; } -static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) +static int init_transition_pgtable(struct kimage *image, pgd_t *pgd, + unsigned long control_page) { pgprot_t prot = PAGE_KERNEL_EXEC_NOENC; unsigned long vaddr, paddr; @@ -132,8 +157,13 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) pmd_t *pmd; pte_t *pte; - vaddr = (unsigned long)relocate_kernel; - paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE); + /* + * For the transition to the identity mapped page tables, the control + * code page also needs to be mapped at the virtual address it starts + * off running from. + */ + vaddr = (unsigned long)__va(control_page); + paddr = control_page; pgd += pgd_index(vaddr); if (!pgd_present(*pgd)) { p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL); @@ -192,7 +222,7 @@ static void *alloc_pgt_page(void *data) return p; } -static int init_pgtable(struct kimage *image, unsigned long start_pgtable) +static int init_pgtable(struct kimage *image, unsigned long control_page) { struct x86_mapping_info info = { .alloc_pgt_page = alloc_pgt_page, @@ -201,12 +231,12 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) .kernpg_flag = _KERNPG_TABLE_NOENC, }; unsigned long mstart, mend; - pgd_t *level4p; int result; int i; - level4p = (pgd_t *)__va(start_pgtable); - clear_page(level4p); + image->arch.pgd = alloc_pgt_page(image); + if (!image->arch.pgd) + return -ENOMEM; if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) { info.page_flag |= _PAGE_ENC; @@ -220,8 +250,8 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) mstart = pfn_mapped[i].start << PAGE_SHIFT; mend = pfn_mapped[i].end << PAGE_SHIFT; - result = kernel_ident_mapping_init(&info, - level4p, mstart, mend); + result = kernel_ident_mapping_init(&info, image->arch.pgd, + mstart, mend); if (result) return result; } @@ -236,8 +266,8 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz; - result = kernel_ident_mapping_init(&info, - level4p, mstart, mend); + result = kernel_ident_mapping_init(&info, image->arch.pgd, + mstart, mend); if (result) return result; @@ -247,15 +277,19 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) * Prepare EFI systab and ACPI tables for kexec kernel since they are * not covered by pfn_mapped. */ - result = map_efi_systab(&info, level4p); + result = map_efi_systab(&info, image->arch.pgd); if (result) return result; - result = map_acpi_tables(&info, level4p); + result = map_acpi_tables(&info, image->arch.pgd); if (result) return result; - return init_transition_pgtable(image, level4p); + /* + * This must be last because the intermediate page table pages it + * allocates will not be control pages and may overlap the image. 
+ */ + return init_transition_pgtable(image, image->arch.pgd, control_page); } static void load_segments(void) @@ -272,22 +306,35 @@ static void load_segments(void) int machine_kexec_prepare(struct kimage *image) { - unsigned long start_pgtable; + void *control_page = page_address(image->control_code_page); + unsigned long reloc_start = (unsigned long)__relocate_kernel_start; + unsigned long reloc_end = (unsigned long)__relocate_kernel_end; int result; - /* Calculate the offsets */ - start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; - /* Setup the identity mapped 64bit page table */ - result = init_pgtable(image, start_pgtable); + result = init_pgtable(image, __pa(control_page)); if (result) return result; + kexec_va_control_page = (unsigned long)control_page; + kexec_pa_table_page = (unsigned long)__pa(image->arch.pgd); + + if (image->type == KEXEC_TYPE_DEFAULT) + kexec_pa_swap_page = page_to_pfn(image->swap_page) << PAGE_SHIFT; + + __memcpy(control_page, __relocate_kernel_start, reloc_end - reloc_start); + + set_memory_rox((unsigned long)control_page, 1); return 0; } void machine_kexec_cleanup(struct kimage *image) { + void *control_page = page_address(image->control_code_page); + + set_memory_nx((unsigned long)control_page, 1); + set_memory_rw((unsigned long)control_page, 1); + free_transition_pgtable(image); } @@ -295,11 +342,19 @@ void machine_kexec_cleanup(struct kimage *image) * Do not allocate memory (or fail in any way) in machine_kexec(). * We are past the point of no return, committed to rebooting now. */ -void machine_kexec(struct kimage *image) +void __nocfi machine_kexec(struct kimage *image) { - unsigned long page_list[PAGES_NR]; - void *control_page; + unsigned long reloc_start = (unsigned long)__relocate_kernel_start; + relocate_kernel_fn *relocate_kernel_ptr; + unsigned int host_mem_enc_active; int save_ftrace_enabled; + void *control_page; + + /* + * This must be done before load_segments() since if call depth tracking + * is used then GS must be valid to make any function calls. + */ + host_mem_enc_active = cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT); #ifdef CONFIG_KEXEC_JUMP if (image->preserve_context) @@ -326,17 +381,13 @@ void machine_kexec(struct kimage *image) #endif } - control_page = page_address(image->control_code_page) + PAGE_SIZE; - __memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE); + control_page = page_address(image->control_code_page); - page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page); - page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; - page_list[PA_TABLE_PAGE] = - (unsigned long)__pa(page_address(image->control_code_page)); - - if (image->type == KEXEC_TYPE_DEFAULT) - page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) - << PAGE_SHIFT); + /* + * Allow for the possibility that relocate_kernel might not be at + * the very start of the page. 
+ */ + relocate_kernel_ptr = control_page + (unsigned long)relocate_kernel - reloc_start; /* * The segment registers are funny things, they have both a @@ -357,11 +408,11 @@ void machine_kexec(struct kimage *image) native_gdt_invalidate(); /* now call it */ - image->start = relocate_kernel((unsigned long)image->head, - (unsigned long)page_list, - image->start, - image->preserve_context, - cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)); + image->start = relocate_kernel_ptr((unsigned long)image->head, + virt_to_phys(control_page), + image->start, + image->preserve_context, + host_mem_enc_active); #ifdef CONFIG_KEXEC_JUMP if (image->preserve_context) @@ -511,6 +562,8 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image) } #endif /* CONFIG_KEXEC_FILE */ +#ifdef CONFIG_CRASH_DUMP + static int kexec_mark_range(unsigned long start, unsigned long end, bool protect) { @@ -540,8 +593,7 @@ static void kexec_mark_crashkres(bool protect) /* Don't touch the control code page used in crash_kexec().*/ control = PFN_PHYS(page_to_pfn(kexec_crash_image->control_code_page)); - /* Control code page is located in the 2nd page. */ - kexec_mark_range(crashk_res.start, control + PAGE_SIZE - 1, protect); + kexec_mark_range(crashk_res.start, control - 1, protect); control += KEXEC_CONTROL_PAGE_SIZE; kexec_mark_range(control, crashk_res.end, protect); } @@ -555,6 +607,7 @@ void arch_kexec_unprotect_crashkres(void) { kexec_mark_crashkres(false); } +#endif /* * During a traditional boot under SME, SME will encrypt the kernel, diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c index c94dec6a1834..1f54eedc3015 100644 --- a/arch/x86/kernel/mmconf-fam10h_64.c +++ b/arch/x86/kernel/mmconf-fam10h_64.c @@ -9,6 +9,7 @@ #include <linux/pci.h> #include <linux/dmi.h> #include <linux/range.h> +#include <linux/acpi.h> #include <asm/pci-direct.h> #include <linux/sort.h> diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 5f71a0cf4399..8984abd91c00 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -36,57 +36,6 @@ do { \ } while (0) #endif -#ifdef CONFIG_RANDOMIZE_BASE -static unsigned long module_load_offset; - -/* Mutex protects the module_load_offset. */ -static DEFINE_MUTEX(module_kaslr_mutex); - -static unsigned long int get_module_load_offset(void) -{ - if (kaslr_enabled()) { - mutex_lock(&module_kaslr_mutex); - /* - * Calculate the module_load_offset the first time this - * code is called. Once calculated it stays the same until - * reboot. 
- */ - if (module_load_offset == 0) - module_load_offset = - get_random_u32_inclusive(1, 1024) * PAGE_SIZE; - mutex_unlock(&module_kaslr_mutex); - } - return module_load_offset; -} -#else -static unsigned long int get_module_load_offset(void) -{ - return 0; -} -#endif - -void *module_alloc(unsigned long size) -{ - gfp_t gfp_mask = GFP_KERNEL; - void *p; - - if (PAGE_ALIGN(size) > MODULES_LEN) - return NULL; - - p = __vmalloc_node_range(size, MODULE_ALIGN, - MODULES_VADDR + get_module_load_offset(), - MODULES_END, gfp_mask, PAGE_KERNEL, - VM_FLUSH_RESET_PERMS | VM_DEFER_KMEMLEAK, - NUMA_NO_NODE, __builtin_return_address(0)); - - if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { - vfree(p); - return NULL; - } - - return p; -} - #ifdef CONFIG_X86_32 int apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, @@ -197,18 +146,21 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs, } if (apply) { - if (memcmp(loc, &zero, size)) { + void *wr_loc = module_writable_address(me, loc); + + if (memcmp(wr_loc, &zero, size)) { pr_err("x86/modules: Invalid relocation target, existing value is nonzero for type %d, loc %p, val %Lx\n", (int)ELF64_R_TYPE(rel[i].r_info), loc, val); return -ENOEXEC; } - write(loc, &val, size); + write(wr_loc, &val, size); } else { if (memcmp(loc, &val, size)) { pr_warn("x86/modules: Invalid relocation target, existing value does not match expected value for type %d, loc %p, val %Lx\n", (int)ELF64_R_TYPE(rel[i].r_info), loc, val); return -ENOEXEC; } + /* FIXME: needs care for ROX module allocations */ write(loc, &zero, size); } } @@ -275,8 +227,8 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *me) { - const Elf_Shdr *s, *alt = NULL, *locks = NULL, - *para = NULL, *orc = NULL, *orc_ip = NULL, + const Elf_Shdr *s, *alt = NULL, + *orc = NULL, *orc_ip = NULL, *retpolines = NULL, *returns = NULL, *ibt_endbr = NULL, *calls = NULL, *cfi = NULL; char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; @@ -284,10 +236,6 @@ int module_finalize(const Elf_Ehdr *hdr, for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { if (!strcmp(".altinstructions", secstrings + s->sh_name)) alt = s; - if (!strcmp(".smp_locks", secstrings + s->sh_name)) - locks = s; - if (!strcmp(".parainstructions", secstrings + s->sh_name)) - para = s; if (!strcmp(".orc_unwind", secstrings + s->sh_name)) orc = s; if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name)) @@ -304,14 +252,6 @@ int module_finalize(const Elf_Ehdr *hdr, ibt_endbr = s; } - /* - * See alternative_instructions() for the ordering rules between the - * various patching types. 
- */ - if (para) { - void *pseg = (void *)para->sh_addr; - apply_paravirt(pseg, pseg + para->sh_size); - } if (retpolines || cfi) { void *rseg = NULL, *cseg = NULL; unsigned int rsize = 0, csize = 0; @@ -326,22 +266,22 @@ int module_finalize(const Elf_Ehdr *hdr, csize = cfi->sh_size; } - apply_fineibt(rseg, rseg + rsize, cseg, cseg + csize); + apply_fineibt(rseg, rseg + rsize, cseg, cseg + csize, me); } if (retpolines) { void *rseg = (void *)retpolines->sh_addr; - apply_retpolines(rseg, rseg + retpolines->sh_size); + apply_retpolines(rseg, rseg + retpolines->sh_size, me); } if (returns) { void *rseg = (void *)returns->sh_addr; - apply_returns(rseg, rseg + returns->sh_size); + apply_returns(rseg, rseg + returns->sh_size, me); } if (alt) { /* patch .altinstructions */ void *aseg = (void *)alt->sh_addr; - apply_alternatives(aseg, aseg + alt->sh_size); + apply_alternatives(aseg, aseg + alt->sh_size, me); } - if (calls || para) { + if (calls || alt) { struct callthunk_sites cs = {}; if (calls) { @@ -349,17 +289,37 @@ int module_finalize(const Elf_Ehdr *hdr, cs.call_end = (void *)calls->sh_addr + calls->sh_size; } - if (para) { - cs.pv_start = (void *)para->sh_addr; - cs.pv_end = (void *)para->sh_addr + para->sh_size; + if (alt) { + cs.alt_start = (void *)alt->sh_addr; + cs.alt_end = (void *)alt->sh_addr + alt->sh_size; } callthunks_patch_module_calls(&cs, me); } if (ibt_endbr) { void *iseg = (void *)ibt_endbr->sh_addr; - apply_seal_endbr(iseg, iseg + ibt_endbr->sh_size); + apply_seal_endbr(iseg, iseg + ibt_endbr->sh_size, me); } + + if (orc && orc_ip) + unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size, + (void *)orc->sh_addr, orc->sh_size); + + return 0; +} + +int module_post_finalize(const Elf_Ehdr *hdr, + const Elf_Shdr *sechdrs, + struct module *me) +{ + const Elf_Shdr *s, *locks = NULL; + char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; + + for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { + if (!strcmp(".smp_locks", secstrings + s->sh_name)) + locks = s; + } + if (locks) { void *lseg = (void *)locks->sh_addr; void *text = me->mem[MOD_TEXT].base; @@ -369,10 +329,6 @@ int module_finalize(const Elf_Ehdr *hdr, text, text_end); } - if (orc && orc_ip) - unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size, - (void *)orc->sh_addr, orc->sh_size); - return 0; } diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index b223922248e9..4a1b1b28abf9 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -36,6 +36,8 @@ * Checksum an MP configuration block. 
*/ +static unsigned int num_procs __initdata; + static int __init mpf_checksum(unsigned char *mp, int len) { int sum = 0; @@ -50,16 +52,15 @@ static void __init MP_processor_info(struct mpc_cpu *m) { char *bootup_cpu = ""; - if (!(m->cpuflag & CPU_ENABLED)) { - disabled_cpus++; + topology_register_apic(m->apicid, CPU_ACPIID_INVALID, m->cpuflag & CPU_ENABLED); + if (!(m->cpuflag & CPU_ENABLED)) return; - } if (m->cpuflag & CPU_BOOTPROCESSOR) bootup_cpu = " (Bootup-CPU)"; pr_info("Processor #%d%s\n", m->apicid, bootup_cpu); - generic_processor_info(m->apicid); + num_procs++; } #ifdef CONFIG_X86_IO_APIC @@ -67,7 +68,7 @@ static void __init mpc_oem_bus_info(struct mpc_bus *m, char *str) { memcpy(str, m->bustype, 6); str[6] = 0; - apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str); + apic_pr_verbose("Bus #%d is %s\n", m->busid, str); } static void __init MP_bus_info(struct mpc_bus *m) @@ -196,12 +197,12 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) if (!smp_check_mpc(mpc, oem, str)) return 0; - /* Initialize the lapic mapping */ - if (!acpi_lapic) - register_lapic_address(mpc->lapic); - - if (early) + if (early) { + /* Initialize the lapic mapping */ + if (!acpi_lapic) + register_lapic_address(mpc->lapic); return 1; + } /* Now process the configuration blocks. */ while (count < mpc->length) { @@ -236,9 +237,9 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) } } - if (!num_processors) + if (!num_procs && !acpi_lapic) pr_err("MPTABLE: no processors registered!\n"); - return num_processors; + return num_procs || acpi_lapic; } #ifdef CONFIG_X86_IO_APIC @@ -416,7 +417,7 @@ static unsigned long __init get_mpc_size(unsigned long physptr) mpc = early_memremap(physptr, PAGE_SIZE); size = mpc->length; early_memunmap(mpc, PAGE_SIZE); - apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size); + apic_pr_verbose(" mpc: %lx-%lx\n", physptr, physptr + size); return size; } @@ -473,7 +474,7 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early) /* * Scan the memory blocks for an SMP configuration block. */ -void __init default_get_smp_config(unsigned int early) +static __init void mpparse_get_smp_config(unsigned int early) { struct mpf_intel *mpf; @@ -529,8 +530,8 @@ void __init default_get_smp_config(unsigned int early) } else BUG(); - if (!early) - pr_info("Processors: %d\n", num_processors); + if (!early && !acpi_lapic) + pr_info("Processors: %d\n", num_procs); /* * Only use the first configuration found. 
*/ @@ -538,6 +539,16 @@ out: early_memunmap(mpf, sizeof(*mpf)); } +void __init mpparse_parse_early_smp_config(void) +{ + mpparse_get_smp_config(true); +} + +void __init mpparse_parse_smp_config(void) +{ + mpparse_get_smp_config(false); +} + static void __init smp_reserve_memory(struct mpf_intel *mpf) { memblock_reserve(mpf->physptr, get_mpc_size(mpf->physptr)); @@ -549,8 +560,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length) struct mpf_intel *mpf; int ret = 0; - apic_printk(APIC_VERBOSE, "Scan for SMP in [mem %#010lx-%#010lx]\n", - base, base + length - 1); + apic_pr_verbose("Scan for SMP in [mem %#010lx-%#010lx]\n", base, base + length - 1); BUILD_BUG_ON(sizeof(*mpf) != 16); while (length > 0) { @@ -587,7 +597,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length) return ret; } -void __init default_find_smp_config(void) +void __init mpparse_find_mptable(void) { unsigned int address; @@ -672,13 +682,13 @@ static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) { int i; - apic_printk(APIC_VERBOSE, "OLD "); + apic_pr_verbose("OLD "); print_mp_irq_info(m); i = get_MP_intsrc_index(m); if (i > 0) { memcpy(m, &mp_irqs[i], sizeof(*m)); - apic_printk(APIC_VERBOSE, "NEW "); + apic_pr_verbose("NEW "); print_mp_irq_info(&mp_irqs[i]); return; } @@ -761,7 +771,7 @@ static int __init replace_intsrc_all(struct mpc_table *mpc, continue; if (nr_m_spare > 0) { - apic_printk(APIC_VERBOSE, "*NEW* found\n"); + apic_pr_verbose("*NEW* found\n"); nr_m_spare--; memcpy(m_spare[nr_m_spare], &mp_irqs[i], sizeof(mp_irqs[i])); m_spare[nr_m_spare] = NULL; diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 17e955ab69fe..ed163c8c8604 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -35,6 +35,7 @@ #include <asm/nospec-branch.h> #include <asm/microcode.h> #include <asm/sev.h> +#include <asm/fred.h> #define CREATE_TRACE_POINTS #include <trace/events/nmi.h> @@ -303,13 +304,13 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) __this_cpu_add(nmi_stats.unknown, 1); - pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", - reason, smp_processor_id()); + pr_emerg_ratelimited("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", + reason, smp_processor_id()); if (unknown_nmi_panic || panic_on_unrecovered_nmi) nmi_panic(regs, "NMI: Not continuing"); - pr_emerg("Dazed and confused, but trying to continue\n"); + pr_emerg_ratelimited("Dazed and confused, but trying to continue\n"); } NOKPROBE_SYMBOL(unknown_nmi_error); @@ -502,7 +503,7 @@ DEFINE_IDTENTRY_RAW(exc_nmi) if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) raw_atomic_long_inc(&nsp->idt_calls); - if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id())) { + if (arch_cpu_is_offline(smp_processor_id())) { if (microcode_nmi_handler_enabled()) microcode_offline_nmi_handler(); return; @@ -563,9 +564,6 @@ nmi_restart: } if (this_cpu_dec_return(nmi_state)) goto nmi_restart; - - if (user_mode(regs)) - mds_user_clear_cpu_buffers(); } #if IS_ENABLED(CONFIG_KVM_INTEL) @@ -582,7 +580,7 @@ EXPORT_SYMBOL_GPL(asm_exc_nmi_kvm_vmx); static char *nmi_check_stall_msg[] = { /* */ -/* +--------- nsp->idt_seq_snap & 0x1: CPU is in NMI handler. */ +/* +--------- nmi_seq & 0x1: CPU is currently in NMI handler. */ /* | +------ cpu_is_offline(cpu) */ /* | | +--- nsp->idt_calls_snap != atomic_long_read(&nsp->idt_calls): */ /* | | | NMI handler has been invoked. 
*/ @@ -630,27 +628,72 @@ void nmi_backtrace_stall_check(const struct cpumask *btp) nmi_seq = READ_ONCE(nsp->idt_nmi_seq); if (nsp->idt_nmi_seq_snap + 1 == nmi_seq && (nmi_seq & 0x1)) { msgp = "CPU entered NMI handler function, but has not exited"; - } else if ((nsp->idt_nmi_seq_snap & 0x1) != (nmi_seq & 0x1)) { - msgp = "CPU is handling NMIs"; - } else { - idx = ((nsp->idt_seq_snap & 0x1) << 2) | + } else if (nsp->idt_nmi_seq_snap == nmi_seq || + nsp->idt_nmi_seq_snap + 1 == nmi_seq) { + idx = ((nmi_seq & 0x1) << 2) | (cpu_is_offline(cpu) << 1) | (nsp->idt_calls_snap != atomic_long_read(&nsp->idt_calls)); msgp = nmi_check_stall_msg[idx]; if (nsp->idt_ignored_snap != READ_ONCE(nsp->idt_ignored) && (idx & 0x1)) modp = ", but OK because ignore_nmis was set"; - if (nmi_seq & ~0x1) - msghp = " (CPU currently in NMI handler function)"; - else if (nsp->idt_nmi_seq_snap + 1 == nmi_seq) + if (nsp->idt_nmi_seq_snap + 1 == nmi_seq) msghp = " (CPU exited one NMI handler function)"; + else if (nmi_seq & 0x1) + msghp = " (CPU currently in NMI handler function)"; + else + msghp = " (CPU was never in an NMI handler function)"; + } else { + msgp = "CPU is handling NMIs"; } - pr_alert("%s: CPU %d: %s%s%s, last activity: %lu jiffies ago.\n", - __func__, cpu, msgp, modp, msghp, j - READ_ONCE(nsp->recv_jiffies)); + pr_alert("%s: CPU %d: %s%s%s\n", __func__, cpu, msgp, modp, msghp); + pr_alert("%s: last activity: %lu jiffies ago.\n", + __func__, j - READ_ONCE(nsp->recv_jiffies)); } } #endif +#ifdef CONFIG_X86_FRED +/* + * With FRED, CR2/DR6 is pushed to #PF/#DB stack frame during FRED + * event delivery, i.e., there is no problem of transient states. + * And NMI unblocking only happens when the stack frame indicates + * that so should happen. + * + * Thus, the NMI entry stub for FRED is really straightforward and + * as simple as most exception handlers. As such, #DB is allowed + * during NMI handling. + */ +DEFINE_FREDENTRY_NMI(exc_nmi) +{ + irqentry_state_t irq_state; + + if (arch_cpu_is_offline(smp_processor_id())) { + if (microcode_nmi_handler_enabled()) + microcode_offline_nmi_handler(); + return; + } + + /* + * Save CR2 for eventual restore to cover the case where the NMI + * hits the VMENTER/VMEXIT region where guest CR2 is life. This + * prevents guest state corruption in case that the NMI handler + * takes a page fault. + */ + this_cpu_write(nmi_cr2, read_cr2()); + + irq_state = irqentry_nmi_enter(regs); + + inc_irq_stat(__nmi_count); + default_do_nmi(regs); + + irqentry_nmi_exit(regs, irq_state); + + if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) + write_cr2(this_cpu_read(nmi_cr2)); +} +#endif + void stop_nmi(void) { ignore_nmis++; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 97f1436c1a20..1ccaa3397a67 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -34,14 +34,8 @@ #include <asm/io_bitmap.h> #include <asm/gsseg.h> -/* - * nop stub, which must not clobber anything *including the stack* to - * avoid confusing the entry prologues. - */ -DEFINE_PARAVIRT_ASM(_paravirt_nop, "", .entry.text); - /* stub always returning 0. */ -DEFINE_PARAVIRT_ASM(paravirt_ret0, "xor %eax,%eax", .entry.text); +DEFINE_ASM_FUNC(paravirt_ret0, "xor %eax,%eax", .entry.text); void __init default_banner(void) { @@ -49,63 +43,36 @@ void __init default_banner(void) pv_info.name); } -/* Undefined instruction for dealing with missing ops pointers. 
*/ -noinstr void paravirt_BUG(void) -{ - BUG(); -} - -static unsigned paravirt_patch_call(void *insn_buff, const void *target, - unsigned long addr, unsigned len) -{ - __text_gen_insn(insn_buff, CALL_INSN_OPCODE, - (void *)addr, target, CALL_INSN_SIZE); - return CALL_INSN_SIZE; -} - #ifdef CONFIG_PARAVIRT_XXL -DEFINE_PARAVIRT_ASM(_paravirt_ident_64, "mov %rdi, %rax", .text); -DEFINE_PARAVIRT_ASM(pv_native_save_fl, "pushf; pop %rax", .noinstr.text); -DEFINE_PARAVIRT_ASM(pv_native_irq_disable, "cli", .noinstr.text); -DEFINE_PARAVIRT_ASM(pv_native_irq_enable, "sti", .noinstr.text); -DEFINE_PARAVIRT_ASM(pv_native_read_cr2, "mov %cr2, %rax", .noinstr.text); +DEFINE_ASM_FUNC(_paravirt_ident_64, "mov %rdi, %rax", .text); +DEFINE_ASM_FUNC(pv_native_save_fl, "pushf; pop %rax", .noinstr.text); +DEFINE_ASM_FUNC(pv_native_irq_disable, "cli", .noinstr.text); +DEFINE_ASM_FUNC(pv_native_irq_enable, "sti", .noinstr.text); +DEFINE_ASM_FUNC(pv_native_read_cr2, "mov %cr2, %rax", .noinstr.text); #endif -DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key); +DEFINE_STATIC_KEY_FALSE(virt_spin_lock_key); void __init native_pv_lock_init(void) { - if (IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS) && - !boot_cpu_has(X86_FEATURE_HYPERVISOR)) - static_branch_disable(&virt_spin_lock_key); + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + static_branch_enable(&virt_spin_lock_key); } +#ifndef CONFIG_PT_RECLAIM static void native_tlb_remove_table(struct mmu_gather *tlb, void *table) { - tlb_remove_page(tlb, table); -} + struct ptdesc *ptdesc = (struct ptdesc *)table; -unsigned int paravirt_patch(u8 type, void *insn_buff, unsigned long addr, - unsigned int len) + pagetable_dtor(ptdesc); + tlb_remove_page(tlb, ptdesc_page(ptdesc)); +} +#else +static void native_tlb_remove_table(struct mmu_gather *tlb, void *table) { - /* - * Neat trick to map patch type back to the call within the - * corresponding structure. - */ - void *opfunc = *((void **)&pv_ops + type); - unsigned ret; - - if (opfunc == NULL) - /* If there's no function, patch it with paravirt_BUG() */ - ret = paravirt_patch_call(insn_buff, paravirt_BUG, addr, len); - else if (opfunc == _paravirt_nop) - ret = 0; - else - /* Otherwise call the function. */ - ret = paravirt_patch_call(insn_buff, opfunc, addr, len); - - return ret; + tlb_remove_table(tlb, table); } +#endif struct static_key paravirt_steal_enabled; struct static_key paravirt_steal_rq_enabled; @@ -159,11 +126,6 @@ static noinstr void pv_native_set_debugreg(int regno, unsigned long val) native_set_debugreg(regno, val); } -noinstr void pv_native_wbinvd(void) -{ - native_wbinvd(); -} - static noinstr void pv_native_safe_halt(void) { native_safe_halt(); @@ -191,7 +153,6 @@ struct paravirt_patch_template pv_ops = { .cpu.read_cr0 = native_read_cr0, .cpu.write_cr0 = native_write_cr0, .cpu.write_cr4 = native_write_cr4, - .cpu.wbinvd = pv_native_wbinvd, .cpu.read_msr = native_read_msr, .cpu.write_msr = native_write_msr, .cpu.read_msr_safe = native_read_msr_safe, diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index f323d83e40a7..6267363e0189 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -108,10 +108,6 @@ void __init pci_iommu_alloc(void) swiotlb_init(x86_swiotlb_enable, x86_swiotlb_flags); } -/* - * See <Documentation/arch/x86/x86_64/boot-options.rst> for the iommu kernel - * parameter documentation. 
- */ static __init int iommu_setup(char *p) { iommu_merge = 1; diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c index 319fef37d9dc..cc2c34ba7228 100644 --- a/arch/x86/kernel/probe_roms.c +++ b/arch/x86/kernel/probe_roms.c @@ -203,16 +203,6 @@ void __init probe_roms(void) unsigned char c; int i; - /* - * The ROM memory range is not part of the e820 table and is therefore not - * pre-validated by BIOS. The kernel page table maps the ROM region as encrypted - * memory, and SNP requires encrypted memory to be validated before access. - * Do that here. - */ - snp_prep_memory(video_rom_resource.start, - ((system_rom_resource.end + 1) - video_rom_resource.start), - SNP_PAGE_STATE_PRIVATE); - /* video rom */ upper = adapter_rom_resources[0].start; for (start = video_rom_resource.start; start < upper; start += 2048) { diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index b6f4e8399fca..6da6769d7254 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -30,6 +30,7 @@ #include <linux/hw_breakpoint.h> #include <linux/entry-common.h> #include <asm/cpu.h> +#include <asm/cpuid.h> #include <asm/apic.h> #include <linux/uaccess.h> #include <asm/mwait.h> @@ -477,7 +478,7 @@ void native_tss_update_io_bitmap(void) /* * Make sure that the TSS limit is covering the IO bitmap. It might have * been cut down by a VMEXIT to 0x67 which would cause a subsequent I/O - * access from user space to trigger a #GP because tbe bitmap is outside + * access from user space to trigger a #GP because the bitmap is outside * the TSS limit. */ refresh_tss_limit(); @@ -825,7 +826,7 @@ void __noreturn stop_this_cpu(void *dummy) * X86_FEATURE_SME due to cmdline options. */ if (c->extended_cpuid_level >= 0x8000001f && (cpuid_eax(0x8000001f) & BIT(0))) - native_wbinvd(); + wbinvd(); /* * This brings a cache line back and dirties it, but @@ -835,42 +836,24 @@ void __noreturn stop_this_cpu(void *dummy) */ cpumask_clear_cpu(cpu, &cpus_stop_mask); +#ifdef CONFIG_SMP + if (smp_ops.stop_this_cpu) { + smp_ops.stop_this_cpu(); + BUG(); + } +#endif + for (;;) { /* * Use native_halt() so that memory contents don't change * (stack usage and variables) after possibly issuing the - * native_wbinvd() above. + * wbinvd() above. */ native_halt(); } } /* - * AMD Erratum 400 aware idle routine. We handle it the same way as C3 power - * states (local apic timer and TSC stop). - * - * XXX this function is completely buggered vs RCU and tracing. - */ -static void amd_e400_idle(void) -{ - /* - * We cannot use static_cpu_has_bug() here because X86_BUG_AMD_APIC_C1E - * gets set after static_cpu_has() places have been converted via - * alternatives. - */ - if (!boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) { - default_idle(); - return; - } - - tick_broadcast_enter(); - - default_idle(); - - tick_broadcast_exit(); -} - -/* * Prefer MWAIT over HALT if MWAIT is supported, MWAIT_CPUID leaf * exists and whenever MONITOR/MWAIT extensions are present there is at * least one C1 substate. @@ -878,36 +861,37 @@ static void amd_e400_idle(void) * Do not prefer MWAIT if MONITOR instruction has a bug or idle=nomwait * is passed to kernel commandline parameter. */ -static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c) +static __init bool prefer_mwait_c1_over_halt(void) { + const struct cpuinfo_x86 *c = &boot_cpu_data; u32 eax, ebx, ecx, edx; - /* User has disallowed the use of MWAIT. 
Fallback to HALT */ - if (boot_option_idle_override == IDLE_NOMWAIT) - return 0; + /* If override is enforced on the command line, fall back to HALT. */ + if (boot_option_idle_override != IDLE_NO_OVERRIDE) + return false; /* MWAIT is not supported on this platform. Fallback to HALT */ if (!cpu_has(c, X86_FEATURE_MWAIT)) - return 0; + return false; - /* Monitor has a bug. Fallback to HALT */ - if (boot_cpu_has_bug(X86_BUG_MONITOR)) - return 0; + /* Monitor has a bug or APIC stops in C1E. Fallback to HALT */ + if (boot_cpu_has_bug(X86_BUG_MONITOR) || boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) + return false; - cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx); + cpuid(CPUID_LEAF_MWAIT, &eax, &ebx, &ecx, &edx); /* * If MWAIT extensions are not available, it is safe to use MWAIT * with EAX=0, ECX=0. */ if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) - return 1; + return true; /* * If MWAIT extensions are available, there should be at least one * MWAIT C1 substate present. */ - return (edx & MWAIT_C1_SUBSTATE_MASK); + return !!(edx & MWAIT_C1_SUBSTATE_MASK); } /* @@ -933,26 +917,27 @@ static __cpuidle void mwait_idle(void) __current_clr_polling(); } -void select_idle_routine(const struct cpuinfo_x86 *c) +void __init select_idle_routine(void) { -#ifdef CONFIG_SMP - if (boot_option_idle_override == IDLE_POLL && smp_num_siblings > 1) - pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n"); -#endif - if (x86_idle_set() || boot_option_idle_override == IDLE_POLL) + if (boot_option_idle_override == IDLE_POLL) { + if (IS_ENABLED(CONFIG_SMP) && __max_threads_per_core > 1) + pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n"); return; + } - if (boot_cpu_has_bug(X86_BUG_AMD_E400)) { - pr_info("using AMD E400 aware idle routine\n"); - static_call_update(x86_idle, amd_e400_idle); - } else if (prefer_mwait_c1_over_halt(c)) { + /* Required to guard against xen_set_default_idle() */ + if (x86_idle_set()) + return; + + if (prefer_mwait_c1_over_halt()) { pr_info("using mwait in idle threads\n"); static_call_update(x86_idle, mwait_idle); } else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) { pr_info("using TDX aware idle routine\n"); static_call_update(x86_idle, tdx_safe_halt); - } else + } else { static_call_update(x86_idle, default_idle); + } } void amd_e400_c1e_apic_setup(void) @@ -985,7 +970,10 @@ void __init arch_post_acpi_subsys_init(void) if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) mark_tsc_unstable("TSC halt in AMD C1E"); - pr_info("System has AMD C1E enabled\n"); + + if (IS_ENABLED(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE)) + static_branch_enable(&arch_needs_tick_broadcast); + pr_info("System has AMD C1E erratum E400. Workaround enabled.\n"); } static int __init idle_setup(char *str) @@ -998,24 +986,14 @@ static int __init idle_setup(char *str) boot_option_idle_override = IDLE_POLL; cpu_idle_poll_ctrl(true); } else if (!strcmp(str, "halt")) { - /* - * When the boot option of idle=halt is added, halt is - * forced to be used for CPU idle. In such case CPU C2/C3 - * won't be used again. - * To continue to load the CPU idle driver, don't touch - * the boot_option_idle_override. - */ - static_call_update(x86_idle, default_idle); + /* 'idle=halt' HALT for idle. C-states are disabled. */ boot_option_idle_override = IDLE_HALT; } else if (!strcmp(str, "nomwait")) { - /* - * If the boot option of "idle=nomwait" is added, - * it means that mwait will be disabled for CPU C1/C2/C3 - * states. 
- */ + /* 'idle=nomwait' disables MWAIT for idle */ boot_option_idle_override = IDLE_NOMWAIT; - } else - return -1; + } else { + return -EINVAL; + } return 0; } @@ -1030,7 +1008,10 @@ unsigned long arch_align_stack(unsigned long sp) unsigned long arch_randomize_brk(struct mm_struct *mm) { - return randomize_page(mm->brk, 0x02000000); + if (mmap_is_ia32()) + return randomize_page(mm->brk, SZ_32M); + + return randomize_page(mm->brk, SZ_1G); } /* diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 708c87b88cc1..0917c7f25720 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -156,13 +156,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread, *next = &next_p->thread; - struct fpu *prev_fpu = &prev->fpu; int cpu = smp_processor_id(); /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ - if (!test_thread_flag(TIF_NEED_FPU_LOAD)) - switch_fpu_prepare(prev_fpu, cpu); + if (!test_tsk_thread_flag(prev_p, TIF_NEED_FPU_LOAD)) + switch_fpu_prepare(prev_p, cpu); /* * Save away %gs. No need to save %fs, as it was saved on the @@ -209,7 +208,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) raw_cpu_write(pcpu_hot.current_task, next_p); - switch_fpu_finish(); + switch_fpu_finish(next_p); /* Load the Intel cache allocation PQR MSR. */ resctrl_sched_in(next_p); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 33b268747bb7..226472332a70 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -56,6 +56,7 @@ #include <asm/resctrl.h> #include <asm/unistd.h> #include <asm/fsgsbase.h> +#include <asm/fred.h> #ifdef CONFIG_IA32_EMULATION /* Not included via unistd.h */ #include <asm/unistd_32_ia32.h> @@ -117,7 +118,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode, printk("%sFS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", log_lvl, fs, fsindex, gs, gsindex, shadowgs); - printk("%sCS: %04lx DS: %04x ES: %04x CR0: %016lx\n", + printk("%sCS: %04x DS: %04x ES: %04x CR0: %016lx\n", log_lvl, regs->cs, ds, es, cr0); printk("%sCR2: %016lx CR3: %016lx CR4: %016lx\n", log_lvl, cr2, cr3, cr4); @@ -138,7 +139,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode, log_lvl, d3, d6, d7); } - if (cpu_feature_enabled(X86_FEATURE_OSPKE)) + if (cr4 & X86_CR4_PKE) printk("%sPKRU: %08x\n", log_lvl, read_pkru()); } @@ -166,7 +167,29 @@ static noinstr unsigned long __rdgsbase_inactive(void) lockdep_assert_irqs_disabled(); - if (!cpu_feature_enabled(X86_FEATURE_XENPV)) { + /* + * SWAPGS is no longer needed thus NOT allowed with FRED because + * FRED transitions ensure that an operating system can _always_ + * operate with its own GS base address: + * - For events that occur in ring 3, FRED event delivery swaps + * the GS base address with the IA32_KERNEL_GS_BASE MSR. + * - ERETU (the FRED transition that returns to ring 3) also swaps + * the GS base address with the IA32_KERNEL_GS_BASE MSR. + * + * And the operating system can still setup the GS segment for a + * user thread without the need of loading a user thread GS with: + * - Using LKGS, available with FRED, to modify other attributes + * of the GS segment without compromising its ability always to + * operate with its own GS base address. + * - Accessing the GS segment base address for a user thread as + * before using RDMSR or WRMSR on the IA32_KERNEL_GS_BASE MSR. 
+ * + * Note, LKGS loads the GS base address into the IA32_KERNEL_GS_BASE + * MSR instead of the GS segment’s descriptor cache. As such, the + * operating system never changes its runtime GS base address. + */ + if (!cpu_feature_enabled(X86_FEATURE_FRED) && + !cpu_feature_enabled(X86_FEATURE_XENPV)) { native_swapgs(); gsbase = rdgsbase(); native_swapgs(); @@ -191,7 +214,8 @@ static noinstr void __wrgsbase_inactive(unsigned long gsbase) { lockdep_assert_irqs_disabled(); - if (!cpu_feature_enabled(X86_FEATURE_XENPV)) { + if (!cpu_feature_enabled(X86_FEATURE_FRED) && + !cpu_feature_enabled(X86_FEATURE_XENPV)) { native_swapgs(); wrgsbase(gsbase); native_swapgs(); @@ -505,7 +529,7 @@ void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase) static void start_thread_common(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp, - unsigned int _cs, unsigned int _ss, unsigned int _ds) + u16 _cs, u16 _ss, u16 _ds) { WARN_ON_ONCE(regs != current_pt_regs()); @@ -522,11 +546,36 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, loadsegment(ds, _ds); load_gs_index(0); - regs->ip = new_ip; - regs->sp = new_sp; - regs->cs = _cs; - regs->ss = _ss; - regs->flags = X86_EFLAGS_IF; + regs->ip = new_ip; + regs->sp = new_sp; + regs->csx = _cs; + regs->ssx = _ss; + /* + * Allow single-step trap and NMI when starting a new task, thus + * once the new task enters user space, single-step trap and NMI + * are both enabled immediately. + * + * Entering a new task is logically speaking a return from a + * system call (exec, fork, clone, etc.). As such, if ptrace + * enables single stepping a single step exception should be + * allowed to trigger immediately upon entering user space. + * This is not optional. + * + * NMI should *never* be disabled in user space. As such, this + * is an optional, opportunistic way to catch errors. + * + * Paranoia: High-order 48 bits above the lowest 16 bit SS are + * discarded by the legacy IRET instruction on all Intel, AMD, + * and Cyrix/Centaur/VIA CPUs, thus can be set unconditionally, + * even when FRED is not enabled. But we choose the safer side + * to use these bits only when FRED is enabled. + */ + if (cpu_feature_enabled(X86_FEATURE_FRED)) { + regs->fred_ss.swevent = true; + regs->fred_ss.nmi = true; + } + + regs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED; } void @@ -562,14 +611,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread; struct thread_struct *next = &next_p->thread; - struct fpu *prev_fpu = &prev->fpu; int cpu = smp_processor_id(); WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && this_cpu_read(pcpu_hot.hardirq_stack_inuse)); - if (!test_thread_flag(TIF_NEED_FPU_LOAD)) - switch_fpu_prepare(prev_fpu, cpu); + if (!test_tsk_thread_flag(prev_p, TIF_NEED_FPU_LOAD)) + switch_fpu_prepare(prev_p, cpu); /* We must save %fs and %gs before load_TLS() because * %fs and %gs may be cleared by load_TLS(). @@ -623,7 +671,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) raw_cpu_write(pcpu_hot.current_task, next_p); raw_cpu_write(pcpu_hot.top_of_stack, task_top_of_stack(next_p)); - switch_fpu_finish(); + switch_fpu_finish(next_p); /* Reload sp0. 
*/ update_task_stack(next_p); @@ -750,6 +798,32 @@ static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr) #define LAM_U57_BITS 6 +static void enable_lam_func(void *__mm) +{ + struct mm_struct *mm = __mm; + unsigned long lam; + + if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm) { + lam = mm_lam_cr3_mask(mm); + write_cr3(__read_cr3() | lam); + cpu_tlbstate_update_lam(lam, mm_untag_mask(mm)); + } +} + +static void mm_enable_lam(struct mm_struct *mm) +{ + mm->context.lam_cr3_mask = X86_CR3_LAM_U57; + mm->context.untag_mask = ~GENMASK(62, 57); + + /* + * Even though the process must still be single-threaded at this + * point, kernel threads may be using the mm. IPI those kernel + * threads if they exist. + */ + on_each_cpu_mask(mm_cpumask(mm), enable_lam_func, mm, true); + set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags); +} + static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits) { if (!cpu_feature_enabled(X86_FEATURE_LAM)) @@ -766,25 +840,21 @@ static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits) if (mmap_write_lock_killable(mm)) return -EINTR; + /* + * MM_CONTEXT_LOCK_LAM is set on clone. Prevent LAM from + * being enabled unless the process is single threaded: + */ if (test_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags)) { mmap_write_unlock(mm); return -EBUSY; } - if (!nr_bits) { - mmap_write_unlock(mm); - return -EINVAL; - } else if (nr_bits <= LAM_U57_BITS) { - mm->context.lam_cr3_mask = X86_CR3_LAM_U57; - mm->context.untag_mask = ~GENMASK(62, 57); - } else { + if (!nr_bits || nr_bits > LAM_U57_BITS) { mmap_write_unlock(mm); return -EINVAL; } - write_cr3(__read_cr3() | mm->context.lam_cr3_mask); - set_tlbstate_lam_mode(mm); - set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags); + mm_enable_lam(mm); mmap_write_unlock(mm); diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 830425e6d38e..dc1dd3f3e67f 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -12,6 +12,7 @@ #include <linux/delay.h> #include <linux/objtool.h> #include <linux/pgtable.h> +#include <linux/kexec.h> #include <acpi/reboot.h> #include <asm/io.h> #include <asm/apic.h> @@ -529,7 +530,7 @@ static inline void kb_wait(void) static inline void nmi_shootdown_cpus_on_restart(void); -#if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(CONFIG_KVM_AMD) +#if IS_ENABLED(CONFIG_KVM_X86) /* RCU-protected callback to disable virtualization prior to reboot. */ static cpu_emergency_virt_cb __rcu *cpu_emergency_virt_callback; @@ -599,7 +600,7 @@ static void emergency_reboot_disable_virtualization(void) } #else static void emergency_reboot_disable_virtualization(void) { } -#endif /* CONFIG_KVM_INTEL || CONFIG_KVM_AMD */ +#endif /* CONFIG_KVM_X86 */ void __attribute__((weak)) mach_reboot_fixups(void) { @@ -716,6 +717,14 @@ static void native_machine_emergency_restart(void) void native_machine_shutdown(void) { + /* + * Call enc_kexec_begin() while all CPUs are still active and + * interrupts are enabled. This will allow all in-flight memory + * conversions to finish cleanly. 
+ */ + if (kexec_in_progress) + x86_platform.guest.enc_kexec_begin(); + /* Stop the cpus and apics */ #ifdef CONFIG_X86_IO_APIC /* @@ -752,6 +761,9 @@ void native_machine_shutdown(void) #ifdef CONFIG_X86_64 x86_platform.iommu_shutdown(); #endif + + if (kexec_in_progress) + x86_platform.guest.enc_kexec_finish(); } static void __machine_emergency_restart(int emergency) @@ -796,7 +808,7 @@ struct machine_ops machine_ops __ro_after_init = { .emergency_restart = native_machine_emergency_restart, .restart = native_machine_restart, .halt = native_machine_halt, -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_DUMP .crash_shutdown = native_machine_crash_shutdown, #endif }; @@ -826,7 +838,7 @@ void machine_halt(void) machine_ops.halt(); } -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_DUMP void machine_crash_shutdown(struct pt_regs *regs) { machine_ops.crash_shutdown(regs); @@ -868,6 +880,12 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs) cpu_emergency_disable_virtualization(); atomic_dec(&waiting_for_crash_ipi); + + if (smp_ops.stop_this_cpu) { + smp_ops.stop_this_cpu(); + BUG(); + } + /* Assume hlt works */ halt(); for (;;) diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index 56cab1bb25f5..b44d8863e57f 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -5,12 +5,15 @@ */ #include <linux/linkage.h> +#include <linux/stringify.h> +#include <asm/alternative.h> #include <asm/page_types.h> #include <asm/kexec.h> #include <asm/processor-flags.h> #include <asm/pgtable_types.h> #include <asm/nospec-branch.h> #include <asm/unwind_hints.h> +#include <asm/asm-offsets.h> /* * Must be relocatable PIC code callable as a C function, in particular @@ -21,33 +24,30 @@ #define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) /* - * control_page + KEXEC_CONTROL_CODE_MAX_SIZE - * ~ control_page + PAGE_SIZE are used as data storage and stack for - * jumping back + * The .text..relocate_kernel and .data..relocate_kernel sections are copied + * into the control page, and the remainder of the page is used as the stack. */ -#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset)) + .section .data..relocate_kernel,"a"; /* Minimal CPU state */ -#define RSP DATA(0x0) -#define CR0 DATA(0x8) -#define CR3 DATA(0x10) -#define CR4 DATA(0x18) - -/* other data */ -#define CP_PA_TABLE_PAGE DATA(0x20) -#define CP_PA_SWAP_PAGE DATA(0x28) -#define CP_PA_BACKUP_PAGES_MAP DATA(0x30) - - .text - .align PAGE_SIZE +SYM_DATA_LOCAL(saved_rsp, .quad 0) +SYM_DATA_LOCAL(saved_cr0, .quad 0) +SYM_DATA_LOCAL(saved_cr3, .quad 0) +SYM_DATA_LOCAL(saved_cr4, .quad 0) + /* other data */ +SYM_DATA(kexec_va_control_page, .quad 0) +SYM_DATA(kexec_pa_table_page, .quad 0) +SYM_DATA(kexec_pa_swap_page, .quad 0) +SYM_DATA_LOCAL(pa_backup_pages_map, .quad 0) + + .section .text..relocate_kernel,"ax"; .code64 -SYM_CODE_START_NOALIGN(relocate_range) SYM_CODE_START_NOALIGN(relocate_kernel) UNWIND_HINT_END_OF_STACK ANNOTATE_NOENDBR /* * %rdi indirection_page - * %rsi page_list + * %rsi pa_control_page * %rdx start address * %rcx preserve_context * %r8 host_mem_enc_active @@ -62,60 +62,57 @@ SYM_CODE_START_NOALIGN(relocate_kernel) pushq %r15 pushf - movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 - movq %rsp, RSP(%r11) - movq %cr0, %rax - movq %rax, CR0(%r11) - movq %cr3, %rax - movq %rax, CR3(%r11) - movq %cr4, %rax - movq %rax, CR4(%r11) - - /* Save CR4. Required to enable the right paging mode later. 
*/ - movq %rax, %r13 - /* zero out flags, and disable interrupts */ pushq $0 popfq - /* Save SME active flag */ - movq %r8, %r12 + /* Switch to the identity mapped page tables */ + movq %cr3, %rax + movq kexec_pa_table_page(%rip), %r9 + movq %r9, %cr3 - /* - * get physical address of control page now - * this is impossible after page table switch - */ - movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 + /* Leave CR4 in %r13 to enable the right paging mode later. */ + movq %cr4, %r13 - /* get physical address of page table now too */ - movq PTR(PA_TABLE_PAGE)(%rsi), %r9 + /* Disable global pages immediately to ensure this mapping is RWX */ + movq %r13, %r12 + andq $~(X86_CR4_PGE), %r12 + movq %r12, %cr4 - /* get physical address of swap page now */ - movq PTR(PA_SWAP_PAGE)(%rsi), %r10 + /* Save %rsp and CRs. */ + movq %r13, saved_cr4(%rip) + movq %rsp, saved_rsp(%rip) + movq %rax, saved_cr3(%rip) + movq %cr0, %rax + movq %rax, saved_cr0(%rip) - /* save some information for jumping back */ - movq %r9, CP_PA_TABLE_PAGE(%r11) - movq %r10, CP_PA_SWAP_PAGE(%r11) - movq %rdi, CP_PA_BACKUP_PAGES_MAP(%r11) + /* save indirection list for jumping back */ + movq %rdi, pa_backup_pages_map(%rip) - /* Switch to the identity mapped page tables */ - movq %r9, %cr3 + /* Save the preserve_context to %r11 as swap_pages clobbers %rcx. */ + movq %rcx, %r11 /* setup a new stack at the end of the physical control page */ - lea PAGE_SIZE(%r8), %rsp + lea PAGE_SIZE(%rsi), %rsp /* jump to identity mapped page */ - addq $(identity_mapped - relocate_kernel), %r8 - pushq %r8 - ANNOTATE_UNRET_SAFE - ret - int3 +0: addq $identity_mapped - 0b, %rsi + subq $__relocate_kernel_start - 0b, %rsi + ANNOTATE_RETPOLINE_SAFE + jmp *%rsi SYM_CODE_END(relocate_kernel) SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) UNWIND_HINT_END_OF_STACK - /* set return address to 0 if not preserving context */ - pushq $0 + /* + * %rdi indirection page + * %rdx start address + * %r8 host_mem_enc_active + * %r9 page table page + * %r11 preserve_context + * %r13 original CR4 when relocate_kernel() was invoked + */ + /* store the start address on the stack */ pushq %rdx @@ -145,16 +142,15 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) * Set cr4 to a known state: * - physical address extension enabled * - 5-level paging, if it was enabled before + * - Machine check exception on TDX guest, if it was enabled before. + * Clearing MCE might not be allowed in TDX guests, depending on setup. + * + * Use R13 that contains the original CR4 value, read in relocate_kernel(). + * PAE is always set in the original CR4. */ - movl $X86_CR4_PAE, %eax - testq $X86_CR4_LA57, %r13 - jz 1f - orl $X86_CR4_LA57, %eax -1: - movq %rax, %cr4 - - jmp 1f -1: + andl $(X86_CR4_PAE | X86_CR4_LA57), %r13d + ALTERNATIVE "", __stringify(orl $X86_CR4_MCE, %r13d), X86_FEATURE_TDX_GUEST + movq %r13, %cr4 /* Flush the TLB (needed?) */ movq %r9, %cr3 @@ -164,12 +160,11 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) * entries that will conflict with the now unencrypted memory * used by kexec. Flush the caches before copying the kernel. 
*/ - testq %r12, %r12 - jz 1f + testq %r8, %r8 + jz .Lsme_off wbinvd -1: +.Lsme_off: - movq %rcx, %r11 call swap_pages /* @@ -181,13 +176,14 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) movq %cr3, %rax movq %rax, %cr3 + testq %r11, %r11 /* preserve_context */ + jnz .Lrelocate + /* * set all of the registers to known values * leave %rsp alone */ - testq %r11, %r11 - jnz 1f xorl %eax, %eax xorl %ebx, %ebx xorl %ecx, %ecx @@ -208,22 +204,36 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) ret int3 -1: +.Lrelocate: popq %rdx + + /* Use the swap page for the callee's stack */ + movq kexec_pa_swap_page(%rip), %r10 leaq PAGE_SIZE(%r10), %rsp + + /* push the existing entry point onto the callee's stack */ + pushq %rdx + ANNOTATE_RETPOLINE_SAFE call *%rdx /* get the re-entry point of the peer system */ - movq 0(%rsp), %rbp - leaq relocate_kernel(%rip), %r8 - movq CP_PA_SWAP_PAGE(%r8), %r10 - movq CP_PA_BACKUP_PAGES_MAP(%r8), %rdi - movq CP_PA_TABLE_PAGE(%r8), %rax + popq %rbp + movq kexec_pa_swap_page(%rip), %r10 + movq pa_backup_pages_map(%rip), %rdi + movq kexec_pa_table_page(%rip), %rax movq %rax, %cr3 + + /* Find start (and end) of this physical mapping of control page */ + leaq (%rip), %r8 + ANNOTATE_NOENDBR + andq $PAGE_MASK, %r8 lea PAGE_SIZE(%r8), %rsp + movl $1, %r11d /* Ensure preserve_context flag is set */ call swap_pages - movq $virtual_mapped, %rax + movq kexec_va_control_page(%rip), %rax +0: addq $virtual_mapped - 0b, %rax + subq $__relocate_kernel_start - 0b, %rax pushq %rax ANNOTATE_UNRET_SAFE ret @@ -233,13 +243,21 @@ SYM_CODE_END(identity_mapped) SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped) UNWIND_HINT_END_OF_STACK ANNOTATE_NOENDBR // RET target, above - movq RSP(%r8), %rsp - movq CR4(%r8), %rax + movq saved_rsp(%rip), %rsp + movq saved_cr4(%rip), %rax movq %rax, %cr4 - movq CR3(%r8), %rax - movq CR0(%r8), %r8 + movq saved_cr3(%rip), %rax + movq saved_cr0(%rip), %r8 movq %rax, %cr3 movq %r8, %cr0 + +#ifdef CONFIG_KEXEC_JUMP + /* Saved in save_processor_state. */ + movq $saved_context, %rax + lgdt saved_context_gdt_desc(%rax) +#endif + + /* relocate_kernel() returns the re-entry point for next time */ movq %rbp, %rax popf @@ -257,61 +275,69 @@ SYM_CODE_END(virtual_mapped) /* Do the copies */ SYM_CODE_START_LOCAL_NOALIGN(swap_pages) UNWIND_HINT_END_OF_STACK - movq %rdi, %rcx /* Put the page_list in %rcx */ + /* + * %rdi indirection page + * %r11 preserve_context + */ + movq %rdi, %rcx /* Put the indirection_page in %rcx */ xorl %edi, %edi xorl %esi, %esi - jmp 1f + jmp .Lstart /* Should start with an indirection record */ -0: /* top, read another word for the indirection page */ +.Lloop: /* top, read another word for the indirection page */ movq (%rbx), %rcx addq $8, %rbx -1: +.Lstart: testb $0x1, %cl /* is it a destination page? */ - jz 2f + jz .Lnotdest movq %rcx, %rdi andq $0xfffffffffffff000, %rdi - jmp 0b -2: + jmp .Lloop +.Lnotdest: testb $0x2, %cl /* is it an indirection page? */ - jz 2f + jz .Lnotind movq %rcx, %rbx andq $0xfffffffffffff000, %rbx - jmp 0b -2: + jmp .Lloop +.Lnotind: testb $0x4, %cl /* is it the done indicator? */ - jz 2f - jmp 3f -2: + jz .Lnotdone + jmp .Ldone +.Lnotdone: testb $0x8, %cl /* is it the source indicator? 
*/ - jz 0b /* Ignore it otherwise */ + jz .Lloop /* Ignore it otherwise */ movq %rcx, %rsi /* For ever source page do a copy */ andq $0xfffffffffffff000, %rsi - movq %rdi, %rdx - movq %rsi, %rax + movq %rdi, %rdx /* Save destination page to %rdx */ + movq %rsi, %rax /* Save source page to %rax */ - movq %r10, %rdi + testq %r11, %r11 /* Only actually swap for ::preserve_context */ + jz .Lnoswap + + /* copy source page to swap page */ + movq kexec_pa_swap_page(%rip), %rdi movl $512, %ecx rep ; movsq + /* copy destination page to source page */ movq %rax, %rdi movq %rdx, %rsi movl $512, %ecx rep ; movsq + /* copy swap page to destination page */ movq %rdx, %rdi - movq %r10, %rsi + movq kexec_pa_swap_page(%rip), %rsi +.Lnoswap: movl $512, %ecx rep ; movsq lea PAGE_SIZE(%rax), %rsi - jmp 0b -3: + jmp .Lloop +.Ldone: ANNOTATE_UNRET_SAFE ret int3 SYM_CODE_END(swap_pages) - - .skip KEXEC_CONTROL_CODE_MAX_SIZE - (. - relocate_kernel), 0xcc -SYM_CODE_END(relocate_range); diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 1309b9b05338..51a849a79c98 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -10,7 +10,6 @@ #include <asm/vsyscall.h> #include <asm/x86_init.h> #include <asm/time.h> -#include <asm/intel-mid.h> #include <asm/setup.h> #ifdef CONFIG_X86_32 @@ -67,7 +66,7 @@ void mach_get_cmos_time(struct timespec64 *now) return; } - if (mc146818_get_time(&tm)) { + if (mc146818_get_time(&tm, 1000)) { pr_err("Unable to read current time from RTC\n"); now->tv_sec = now->tv_nsec = 0; return; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 1526747bedf2..cebee310e200 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -7,9 +7,9 @@ */ #include <linux/acpi.h> #include <linux/console.h> +#include <linux/cpu.h> #include <linux/crash_dump.h> #include <linux/dma-map-ops.h> -#include <linux/dmi.h> #include <linux/efi.h> #include <linux/ima.h> #include <linux/init_ohci1394_dma.h> @@ -36,6 +36,7 @@ #include <asm/bios_ebda.h> #include <asm/bugs.h> #include <asm/cacheinfo.h> +#include <asm/coco.h> #include <asm/cpu.h> #include <asm/efi.h> #include <asm/gart.h> @@ -163,7 +164,8 @@ unsigned long saved_video_mode; static char __initdata command_line[COMMAND_LINE_SIZE]; #ifdef CONFIG_CMDLINE_BOOL -static char __initdata builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE; +char builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE; +bool builtin_cmdline_added __ro_after_init; #endif #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) @@ -226,8 +228,6 @@ static void __init reserve_brk(void) _brk_start = 0; } -u64 relocated_ramdisk; - #ifdef CONFIG_BLK_DEV_INITRD static u64 __init get_ramdisk_image(void) @@ -259,9 +259,10 @@ static void __init relocate_initrd(void) u64 ramdisk_image = get_ramdisk_image(); u64 ramdisk_size = get_ramdisk_size(); u64 area_size = PAGE_ALIGN(ramdisk_size); + int ret = 0; /* We need to move the initrd down into directly mapped mem */ - relocated_ramdisk = memblock_phys_alloc_range(area_size, PAGE_SIZE, 0, + u64 relocated_ramdisk = memblock_phys_alloc_range(area_size, PAGE_SIZE, 0, PFN_PHYS(max_pfn_mapped)); if (!relocated_ramdisk) panic("Cannot find place for new RAMDISK of size %lld\n", @@ -272,7 +273,9 @@ static void __init relocate_initrd(void) printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n", relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1); - copy_from_early_mem((void *)initrd_start, ramdisk_image, ramdisk_size); + ret = copy_from_early_mem((void *)initrd_start, ramdisk_image, ramdisk_size); + if 
(ret) + panic("Copy RAMDISK failed\n"); printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to" " [mem %#010llx-%#010llx]\n", @@ -473,7 +476,7 @@ static void __init arch_reserve_crashkernel(void) bool high = false; int ret; - if (!IS_ENABLED(CONFIG_KEXEC_CORE)) + if (!IS_ENABLED(CONFIG_CRASH_RESERVE)) return; ret = parse_crashkernel(cmdline, memblock_phys_mem_size(), @@ -755,6 +758,23 @@ void __init setup_arch(char **cmdline_p) boot_cpu_data.x86_phys_bits = MAX_PHYSMEM_BITS; #endif +#ifdef CONFIG_CMDLINE_BOOL +#ifdef CONFIG_CMDLINE_OVERRIDE + strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); +#else + if (builtin_cmdline[0]) { + /* append boot loader cmdline to builtin */ + strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE); + strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE); + strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); + } +#endif + builtin_cmdline_added = true; +#endif + + strscpy(command_line, boot_command_line, COMMAND_LINE_SIZE); + *cmdline_p = command_line; + /* * If we have OLPC OFW, we might end up relocating the fixmap due to * reserve_top(), so do this before touching the ioremap area. @@ -834,22 +854,6 @@ void __init setup_arch(char **cmdline_p) bss_resource.start = __pa_symbol(__bss_start); bss_resource.end = __pa_symbol(__bss_stop)-1; -#ifdef CONFIG_CMDLINE_BOOL -#ifdef CONFIG_CMDLINE_OVERRIDE - strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); -#else - if (builtin_cmdline[0]) { - /* append boot loader cmdline to builtin */ - strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE); - strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE); - strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); - } -#endif -#endif - - strscpy(command_line, boot_command_line, COMMAND_LINE_SIZE); - *cmdline_p = command_line; - /* * x86_configure_nx() is called before parse_early_param() to detect * whether hardware doesn't support NX (so that the early EHCI debug @@ -904,7 +908,7 @@ void __init setup_arch(char **cmdline_p) efi_init(); reserve_ibft_region(); - dmi_setup(); + x86_init.resources.dmi_setup(); /* * VMware detection requires dmi to be available, so this @@ -972,10 +976,8 @@ void __init setup_arch(char **cmdline_p) high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; #endif - /* - * Find and reserve possible boot-time SMP configuration: - */ - find_smp_config(); + /* Find and reserve MPTABLE area */ + x86_init.mpparse.find_mptable(); early_alloc_pgt_buf(); @@ -996,8 +998,8 @@ void __init setup_arch(char **cmdline_p) * memory size. */ mem_encrypt_setup_arch(); + cc_random_init(); - efi_fake_memmap(); efi_find_mirror(); efi_esrt_init(); efi_mokvar_table_init(); @@ -1033,12 +1035,19 @@ void __init setup_arch(char **cmdline_p) * * Moreover, on machines with SandyBridge graphics or in setups that use * crashkernel the entire 1M is reserved anyway. + * + * Note the host kernel TDX also requires the first 1MB being reserved. */ x86_platform.realmode_reserve(); init_mem_mapping(); - idt_setup_early_pf(); + /* + * init_mem_mapping() relies on the early IDT page fault handling. + * Now either enable FRED or install the real page fault handler + * for 64-bit in the IDT. 
+ */ + cpu_init_replace_early_idt(); /* * Update mmu_cr4_features (and, indirectly, trampoline_cr4_features) @@ -1090,7 +1099,9 @@ void __init setup_arch(char **cmdline_p) early_platform_quirks(); + /* Some platforms need the APIC registered for NUMA configuration */ early_acpi_boot_init(); + x86_init.mpparse.early_parse_smp_cfg(); x86_flattree_get_config(); @@ -1106,8 +1117,6 @@ void __init setup_arch(char **cmdline_p) */ arch_reserve_crashkernel(); - memblock_find_dma_reserve(); - if (!early_xdbc_setup_hardware()) early_xdbc_register_console(); @@ -1131,24 +1140,19 @@ void __init setup_arch(char **cmdline_p) early_quirks(); - /* - * Read APIC and some other early information from ACPI tables. - */ - acpi_boot_init(); - x86_dtb_init(); + topology_apply_cmdline_limits_early(); /* - * get boot-time SMP configuration: + * Parse SMP configuration. Try ACPI first and then the platform + * specific parser. */ - get_smp_config(); + acpi_boot_init(); + x86_init.mpparse.parse_smp_cfg(); - /* - * Systems w/o ACPI and mptables might not have it mapped the local - * APIC yet, but prefill_possible_map() might need to access it. - */ + /* Last opportunity to detect and map the local APIC */ init_apic_mappings(); - prefill_possible_map(); + topology_init_possible_cpus(); init_cpu_to_node(); init_gi_nodes(); @@ -1222,3 +1226,10 @@ static int __init register_kernel_offset_dumper(void) return 0; } __initcall(register_kernel_offset_dumper); + +#ifdef CONFIG_HOTPLUG_CPU +bool arch_cpu_is_hotpluggable(int cpu) +{ + return cpu > 0; +} +#endif /* CONFIG_HOTPLUG_CPU */ diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 2c97bf7b56ae..b30d6e180df7 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -106,8 +106,8 @@ void __init pcpu_populate_pte(unsigned long addr) static inline void setup_percpu_segment(int cpu) { #ifdef CONFIG_X86_32 - struct desc_struct d = GDT_ENTRY_INIT(0x8092, per_cpu_offset(cpu), - 0xFFFFF); + struct desc_struct d = GDT_ENTRY_INIT(DESC_DATA32, + per_cpu_offset(cpu), 0xFFFFF); write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PERCPU, &d, DESCTYPE_S); #endif diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c deleted file mode 100644 index ccb0915e84e1..000000000000 --- a/arch/x86/kernel/sev-shared.c +++ /dev/null @@ -1,1172 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * AMD Encrypted Register State Support - * - * Author: Joerg Roedel <jroedel@suse.de> - * - * This file is not compiled stand-alone. It contains code shared - * between the pre-decompression boot code and the running Linux kernel - * and is included directly into both code-bases. - */ - -#ifndef __BOOT_COMPRESSED -#define error(v) pr_err(v) -#define has_cpuflag(f) boot_cpu_has(f) -#else -#undef WARN -#define WARN(condition, format...) (!!(condition)) -#endif - -/* I/O parameters for CPUID-related helpers */ -struct cpuid_leaf { - u32 fn; - u32 subfn; - u32 eax; - u32 ebx; - u32 ecx; - u32 edx; -}; - -/* - * Individual entries of the SNP CPUID table, as defined by the SNP - * Firmware ABI, Revision 0.9, Section 7.1, Table 14. - */ -struct snp_cpuid_fn { - u32 eax_in; - u32 ecx_in; - u64 xcr0_in; - u64 xss_in; - u32 eax; - u32 ebx; - u32 ecx; - u32 edx; - u64 __reserved; -} __packed; - -/* - * SNP CPUID table, as defined by the SNP Firmware ABI, Revision 0.9, - * Section 8.14.2.6. Also noted there is the SNP firmware-enforced limit - * of 64 entries per CPUID table. 
- */ -#define SNP_CPUID_COUNT_MAX 64 - -struct snp_cpuid_table { - u32 count; - u32 __reserved1; - u64 __reserved2; - struct snp_cpuid_fn fn[SNP_CPUID_COUNT_MAX]; -} __packed; - -/* - * Since feature negotiation related variables are set early in the boot - * process they must reside in the .data section so as not to be zeroed - * out when the .bss section is later cleared. - * - * GHCB protocol version negotiated with the hypervisor. - */ -static u16 ghcb_version __ro_after_init; - -/* Copy of the SNP firmware's CPUID page. */ -static struct snp_cpuid_table cpuid_table_copy __ro_after_init; - -/* - * These will be initialized based on CPUID table so that non-present - * all-zero leaves (for sparse tables) can be differentiated from - * invalid/out-of-range leaves. This is needed since all-zero leaves - * still need to be post-processed. - */ -static u32 cpuid_std_range_max __ro_after_init; -static u32 cpuid_hyp_range_max __ro_after_init; -static u32 cpuid_ext_range_max __ro_after_init; - -static bool __init sev_es_check_cpu_features(void) -{ - if (!has_cpuflag(X86_FEATURE_RDRAND)) { - error("RDRAND instruction not supported - no trusted source of randomness available\n"); - return false; - } - - return true; -} - -static void __noreturn sev_es_terminate(unsigned int set, unsigned int reason) -{ - u64 val = GHCB_MSR_TERM_REQ; - - /* Tell the hypervisor what went wrong. */ - val |= GHCB_SEV_TERM_REASON(set, reason); - - /* Request Guest Termination from Hypvervisor */ - sev_es_wr_ghcb_msr(val); - VMGEXIT(); - - while (true) - asm volatile("hlt\n" : : : "memory"); -} - -/* - * The hypervisor features are available from GHCB version 2 onward. - */ -static u64 get_hv_features(void) -{ - u64 val; - - if (ghcb_version < 2) - return 0; - - sev_es_wr_ghcb_msr(GHCB_MSR_HV_FT_REQ); - VMGEXIT(); - - val = sev_es_rd_ghcb_msr(); - if (GHCB_RESP_CODE(val) != GHCB_MSR_HV_FT_RESP) - return 0; - - return GHCB_MSR_HV_FT_RESP_VAL(val); -} - -static void snp_register_ghcb_early(unsigned long paddr) -{ - unsigned long pfn = paddr >> PAGE_SHIFT; - u64 val; - - sev_es_wr_ghcb_msr(GHCB_MSR_REG_GPA_REQ_VAL(pfn)); - VMGEXIT(); - - val = sev_es_rd_ghcb_msr(); - - /* If the response GPA is not ours then abort the guest */ - if ((GHCB_RESP_CODE(val) != GHCB_MSR_REG_GPA_RESP) || - (GHCB_MSR_REG_GPA_RESP_VAL(val) != pfn)) - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_REGISTER); -} - -static bool sev_es_negotiate_protocol(void) -{ - u64 val; - - /* Do the GHCB protocol version negotiation */ - sev_es_wr_ghcb_msr(GHCB_MSR_SEV_INFO_REQ); - VMGEXIT(); - val = sev_es_rd_ghcb_msr(); - - if (GHCB_MSR_INFO(val) != GHCB_MSR_SEV_INFO_RESP) - return false; - - if (GHCB_MSR_PROTO_MAX(val) < GHCB_PROTOCOL_MIN || - GHCB_MSR_PROTO_MIN(val) > GHCB_PROTOCOL_MAX) - return false; - - ghcb_version = min_t(size_t, GHCB_MSR_PROTO_MAX(val), GHCB_PROTOCOL_MAX); - - return true; -} - -static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb) -{ - ghcb->save.sw_exit_code = 0; - __builtin_memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); -} - -static bool vc_decoding_needed(unsigned long exit_code) -{ - /* Exceptions don't require to decode the instruction */ - return !(exit_code >= SVM_EXIT_EXCP_BASE && - exit_code <= SVM_EXIT_LAST_EXCP); -} - -static enum es_result vc_init_em_ctxt(struct es_em_ctxt *ctxt, - struct pt_regs *regs, - unsigned long exit_code) -{ - enum es_result ret = ES_OK; - - memset(ctxt, 0, sizeof(*ctxt)); - ctxt->regs = regs; - - if (vc_decoding_needed(exit_code)) - ret = vc_decode_insn(ctxt); - - 
return ret; -} - -static void vc_finish_insn(struct es_em_ctxt *ctxt) -{ - ctxt->regs->ip += ctxt->insn.length; -} - -static enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt *ctxt) -{ - u32 ret; - - ret = ghcb->save.sw_exit_info_1 & GENMASK_ULL(31, 0); - if (!ret) - return ES_OK; - - if (ret == 1) { - u64 info = ghcb->save.sw_exit_info_2; - unsigned long v = info & SVM_EVTINJ_VEC_MASK; - - /* Check if exception information from hypervisor is sane. */ - if ((info & SVM_EVTINJ_VALID) && - ((v == X86_TRAP_GP) || (v == X86_TRAP_UD)) && - ((info & SVM_EVTINJ_TYPE_MASK) == SVM_EVTINJ_TYPE_EXEPT)) { - ctxt->fi.vector = v; - - if (info & SVM_EVTINJ_VALID_ERR) - ctxt->fi.error_code = info >> 32; - - return ES_EXCEPTION; - } - } - - return ES_VMM_ERROR; -} - -static enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, - struct es_em_ctxt *ctxt, - u64 exit_code, u64 exit_info_1, - u64 exit_info_2) -{ - /* Fill in protocol and format specifiers */ - ghcb->protocol_version = ghcb_version; - ghcb->ghcb_usage = GHCB_DEFAULT_USAGE; - - ghcb_set_sw_exit_code(ghcb, exit_code); - ghcb_set_sw_exit_info_1(ghcb, exit_info_1); - ghcb_set_sw_exit_info_2(ghcb, exit_info_2); - - sev_es_wr_ghcb_msr(__pa(ghcb)); - VMGEXIT(); - - return verify_exception_info(ghcb, ctxt); -} - -static int __sev_cpuid_hv(u32 fn, int reg_idx, u32 *reg) -{ - u64 val; - - sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, reg_idx)); - VMGEXIT(); - val = sev_es_rd_ghcb_msr(); - if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP) - return -EIO; - - *reg = (val >> 32); - - return 0; -} - -static int __sev_cpuid_hv_msr(struct cpuid_leaf *leaf) -{ - int ret; - - /* - * MSR protocol does not support fetching non-zero subfunctions, but is - * sufficient to handle current early-boot cases. Should that change, - * make sure to report an error rather than ignoring the index and - * grabbing random values. If this issue arises in the future, handling - * can be added here to use GHCB-page protocol for cases that occur late - * enough in boot that GHCB page is available. - */ - if (cpuid_function_is_indexed(leaf->fn) && leaf->subfn) - return -EINVAL; - - ret = __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_EAX, &leaf->eax); - ret = ret ? : __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_EBX, &leaf->ebx); - ret = ret ? : __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_ECX, &leaf->ecx); - ret = ret ? : __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_EDX, &leaf->edx); - - return ret; -} - -static int __sev_cpuid_hv_ghcb(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) -{ - u32 cr4 = native_read_cr4(); - int ret; - - ghcb_set_rax(ghcb, leaf->fn); - ghcb_set_rcx(ghcb, leaf->subfn); - - if (cr4 & X86_CR4_OSXSAVE) - /* Safe to read xcr0 */ - ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK)); - else - /* xgetbv will cause #UD - use reset value for xcr0 */ - ghcb_set_xcr0(ghcb, 1); - - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0); - if (ret != ES_OK) - return ret; - - if (!(ghcb_rax_is_valid(ghcb) && - ghcb_rbx_is_valid(ghcb) && - ghcb_rcx_is_valid(ghcb) && - ghcb_rdx_is_valid(ghcb))) - return ES_VMM_ERROR; - - leaf->eax = ghcb->save.rax; - leaf->ebx = ghcb->save.rbx; - leaf->ecx = ghcb->save.rcx; - leaf->edx = ghcb->save.rdx; - - return ES_OK; -} - -static int sev_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) -{ - return ghcb ? __sev_cpuid_hv_ghcb(ghcb, ctxt, leaf) - : __sev_cpuid_hv_msr(leaf); -} - -/* - * This may be called early while still running on the initial identity - * mapping. 
Use RIP-relative addressing to obtain the correct address - * while running with the initial identity mapping as well as the - * switch-over to kernel virtual addresses later. - */ -static const struct snp_cpuid_table *snp_cpuid_get_table(void) -{ - void *ptr; - - asm ("lea cpuid_table_copy(%%rip), %0" - : "=r" (ptr) - : "p" (&cpuid_table_copy)); - - return ptr; -} - -/* - * The SNP Firmware ABI, Revision 0.9, Section 7.1, details the use of - * XCR0_IN and XSS_IN to encode multiple versions of 0xD subfunctions 0 - * and 1 based on the corresponding features enabled by a particular - * combination of XCR0 and XSS registers so that a guest can look up the - * version corresponding to the features currently enabled in its XCR0/XSS - * registers. The only values that differ between these versions/table - * entries is the enabled XSAVE area size advertised via EBX. - * - * While hypervisors may choose to make use of this support, it is more - * robust/secure for a guest to simply find the entry corresponding to the - * base/legacy XSAVE area size (XCR0=1 or XCR0=3), and then calculate the - * XSAVE area size using subfunctions 2 through 64, as documented in APM - * Volume 3, Rev 3.31, Appendix E.3.8, which is what is done here. - * - * Since base/legacy XSAVE area size is documented as 0x240, use that value - * directly rather than relying on the base size in the CPUID table. - * - * Return: XSAVE area size on success, 0 otherwise. - */ -static u32 snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted) -{ - const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); - u64 xfeatures_found = 0; - u32 xsave_size = 0x240; - int i; - - for (i = 0; i < cpuid_table->count; i++) { - const struct snp_cpuid_fn *e = &cpuid_table->fn[i]; - - if (!(e->eax_in == 0xD && e->ecx_in > 1 && e->ecx_in < 64)) - continue; - if (!(xfeatures_en & (BIT_ULL(e->ecx_in)))) - continue; - if (xfeatures_found & (BIT_ULL(e->ecx_in))) - continue; - - xfeatures_found |= (BIT_ULL(e->ecx_in)); - - if (compacted) - xsave_size += e->eax; - else - xsave_size = max(xsave_size, e->eax + e->ebx); - } - - /* - * Either the guest set unsupported XCR0/XSS bits, or the corresponding - * entries in the CPUID table were not present. This is not a valid - * state to be in. - */ - if (xfeatures_found != (xfeatures_en & GENMASK_ULL(63, 2))) - return 0; - - return xsave_size; -} - -static bool -snp_cpuid_get_validated_func(struct cpuid_leaf *leaf) -{ - const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); - int i; - - for (i = 0; i < cpuid_table->count; i++) { - const struct snp_cpuid_fn *e = &cpuid_table->fn[i]; - - if (e->eax_in != leaf->fn) - continue; - - if (cpuid_function_is_indexed(leaf->fn) && e->ecx_in != leaf->subfn) - continue; - - /* - * For 0xD subfunctions 0 and 1, only use the entry corresponding - * to the base/legacy XSAVE area size (XCR0=1 or XCR0=3, XSS=0). - * See the comments above snp_cpuid_calc_xsave_size() for more - * details. 
- */ - if (e->eax_in == 0xD && (e->ecx_in == 0 || e->ecx_in == 1)) - if (!(e->xcr0_in == 1 || e->xcr0_in == 3) || e->xss_in) - continue; - - leaf->eax = e->eax; - leaf->ebx = e->ebx; - leaf->ecx = e->ecx; - leaf->edx = e->edx; - - return true; - } - - return false; -} - -static void snp_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) -{ - if (sev_cpuid_hv(ghcb, ctxt, leaf)) - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID_HV); -} - -static int snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt, - struct cpuid_leaf *leaf) -{ - struct cpuid_leaf leaf_hv = *leaf; - - switch (leaf->fn) { - case 0x1: - snp_cpuid_hv(ghcb, ctxt, &leaf_hv); - - /* initial APIC ID */ - leaf->ebx = (leaf_hv.ebx & GENMASK(31, 24)) | (leaf->ebx & GENMASK(23, 0)); - /* APIC enabled bit */ - leaf->edx = (leaf_hv.edx & BIT(9)) | (leaf->edx & ~BIT(9)); - - /* OSXSAVE enabled bit */ - if (native_read_cr4() & X86_CR4_OSXSAVE) - leaf->ecx |= BIT(27); - break; - case 0x7: - /* OSPKE enabled bit */ - leaf->ecx &= ~BIT(4); - if (native_read_cr4() & X86_CR4_PKE) - leaf->ecx |= BIT(4); - break; - case 0xB: - leaf_hv.subfn = 0; - snp_cpuid_hv(ghcb, ctxt, &leaf_hv); - - /* extended APIC ID */ - leaf->edx = leaf_hv.edx; - break; - case 0xD: { - bool compacted = false; - u64 xcr0 = 1, xss = 0; - u32 xsave_size; - - if (leaf->subfn != 0 && leaf->subfn != 1) - return 0; - - if (native_read_cr4() & X86_CR4_OSXSAVE) - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if (leaf->subfn == 1) { - /* Get XSS value if XSAVES is enabled. */ - if (leaf->eax & BIT(3)) { - unsigned long lo, hi; - - asm volatile("rdmsr" : "=a" (lo), "=d" (hi) - : "c" (MSR_IA32_XSS)); - xss = (hi << 32) | lo; - } - - /* - * The PPR and APM aren't clear on what size should be - * encoded in 0xD:0x1:EBX when compaction is not enabled - * by either XSAVEC (feature bit 1) or XSAVES (feature - * bit 3) since SNP-capable hardware has these feature - * bits fixed as 1. KVM sets it to 0 in this case, but - * to avoid this becoming an issue it's safer to simply - * treat this as unsupported for SNP guests. - */ - if (!(leaf->eax & (BIT(1) | BIT(3)))) - return -EINVAL; - - compacted = true; - } - - xsave_size = snp_cpuid_calc_xsave_size(xcr0 | xss, compacted); - if (!xsave_size) - return -EINVAL; - - leaf->ebx = xsave_size; - } - break; - case 0x8000001E: - snp_cpuid_hv(ghcb, ctxt, &leaf_hv); - - /* extended APIC ID */ - leaf->eax = leaf_hv.eax; - /* compute ID */ - leaf->ebx = (leaf->ebx & GENMASK(31, 8)) | (leaf_hv.ebx & GENMASK(7, 0)); - /* node ID */ - leaf->ecx = (leaf->ecx & GENMASK(31, 8)) | (leaf_hv.ecx & GENMASK(7, 0)); - break; - default: - /* No fix-ups needed, use values as-is. */ - break; - } - - return 0; -} - -/* - * Returns -EOPNOTSUPP if feature not enabled. Any other non-zero return value - * should be treated as fatal by caller. - */ -static int snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) -{ - const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); - - if (!cpuid_table->count) - return -EOPNOTSUPP; - - if (!snp_cpuid_get_validated_func(leaf)) { - /* - * Some hypervisors will avoid keeping track of CPUID entries - * where all values are zero, since they can be handled the - * same as out-of-range values (all-zero). This is useful here - * as well as it allows virtually all guest configurations to - * work using a single SNP CPUID table. 
- * - * To allow for this, there is a need to distinguish between - * out-of-range entries and in-range zero entries, since the - * CPUID table entries are only a template that may need to be - * augmented with additional values for things like - * CPU-specific information during post-processing. So if it's - * not in the table, set the values to zero. Then, if they are - * within a valid CPUID range, proceed with post-processing - * using zeros as the initial values. Otherwise, skip - * post-processing and just return zeros immediately. - */ - leaf->eax = leaf->ebx = leaf->ecx = leaf->edx = 0; - - /* Skip post-processing for out-of-range zero leafs. */ - if (!(leaf->fn <= cpuid_std_range_max || - (leaf->fn >= 0x40000000 && leaf->fn <= cpuid_hyp_range_max) || - (leaf->fn >= 0x80000000 && leaf->fn <= cpuid_ext_range_max))) - return 0; - } - - return snp_cpuid_postprocess(ghcb, ctxt, leaf); -} - -/* - * Boot VC Handler - This is the first VC handler during boot, there is no GHCB - * page yet, so it only supports the MSR based communication with the - * hypervisor and only the CPUID exit-code. - */ -void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) -{ - unsigned int subfn = lower_bits(regs->cx, 32); - unsigned int fn = lower_bits(regs->ax, 32); - struct cpuid_leaf leaf; - int ret; - - /* Only CPUID is supported via MSR protocol */ - if (exit_code != SVM_EXIT_CPUID) - goto fail; - - leaf.fn = fn; - leaf.subfn = subfn; - - ret = snp_cpuid(NULL, NULL, &leaf); - if (!ret) - goto cpuid_done; - - if (ret != -EOPNOTSUPP) - goto fail; - - if (__sev_cpuid_hv_msr(&leaf)) - goto fail; - -cpuid_done: - regs->ax = leaf.eax; - regs->bx = leaf.ebx; - regs->cx = leaf.ecx; - regs->dx = leaf.edx; - - /* - * This is a VC handler and the #VC is only raised when SEV-ES is - * active, which means SEV must be active too. Do sanity checks on the - * CPUID results to make sure the hypervisor does not trick the kernel - * into the no-sev path. This could map sensitive data unencrypted and - * make it accessible to the hypervisor. - * - * In particular, check for: - * - Availability of CPUID leaf 0x8000001f - * - SEV CPUID bit. - * - * The hypervisor might still report the wrong C-bit position, but this - * can't be checked here. - */ - - if (fn == 0x80000000 && (regs->ax < 0x8000001f)) - /* SEV leaf check */ - goto fail; - else if ((fn == 0x8000001f && !(regs->ax & BIT(1)))) - /* SEV bit */ - goto fail; - - /* Skip over the CPUID two-byte opcode */ - regs->ip += 2; - - return; - -fail: - /* Terminate the guest */ - sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); -} - -static enum es_result vc_insn_string_check(struct es_em_ctxt *ctxt, - unsigned long address, - bool write) -{ - if (user_mode(ctxt->regs) && fault_in_kernel_space(address)) { - ctxt->fi.vector = X86_TRAP_PF; - ctxt->fi.error_code = X86_PF_USER; - ctxt->fi.cr2 = address; - if (write) - ctxt->fi.error_code |= X86_PF_WRITE; - - return ES_EXCEPTION; - } - - return ES_OK; -} - -static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt, - void *src, char *buf, - unsigned int data_size, - unsigned int count, - bool backwards) -{ - int i, b = backwards ? 
-1 : 1; - unsigned long address = (unsigned long)src; - enum es_result ret; - - ret = vc_insn_string_check(ctxt, address, false); - if (ret != ES_OK) - return ret; - - for (i = 0; i < count; i++) { - void *s = src + (i * data_size * b); - char *d = buf + (i * data_size); - - ret = vc_read_mem(ctxt, s, d, data_size); - if (ret != ES_OK) - break; - } - - return ret; -} - -static enum es_result vc_insn_string_write(struct es_em_ctxt *ctxt, - void *dst, char *buf, - unsigned int data_size, - unsigned int count, - bool backwards) -{ - int i, s = backwards ? -1 : 1; - unsigned long address = (unsigned long)dst; - enum es_result ret; - - ret = vc_insn_string_check(ctxt, address, true); - if (ret != ES_OK) - return ret; - - for (i = 0; i < count; i++) { - void *d = dst + (i * data_size * s); - char *b = buf + (i * data_size); - - ret = vc_write_mem(ctxt, d, b, data_size); - if (ret != ES_OK) - break; - } - - return ret; -} - -#define IOIO_TYPE_STR BIT(2) -#define IOIO_TYPE_IN 1 -#define IOIO_TYPE_INS (IOIO_TYPE_IN | IOIO_TYPE_STR) -#define IOIO_TYPE_OUT 0 -#define IOIO_TYPE_OUTS (IOIO_TYPE_OUT | IOIO_TYPE_STR) - -#define IOIO_REP BIT(3) - -#define IOIO_ADDR_64 BIT(9) -#define IOIO_ADDR_32 BIT(8) -#define IOIO_ADDR_16 BIT(7) - -#define IOIO_DATA_32 BIT(6) -#define IOIO_DATA_16 BIT(5) -#define IOIO_DATA_8 BIT(4) - -#define IOIO_SEG_ES (0 << 10) -#define IOIO_SEG_DS (3 << 10) - -static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo) -{ - struct insn *insn = &ctxt->insn; - size_t size; - u64 port; - - *exitinfo = 0; - - switch (insn->opcode.bytes[0]) { - /* INS opcodes */ - case 0x6c: - case 0x6d: - *exitinfo |= IOIO_TYPE_INS; - *exitinfo |= IOIO_SEG_ES; - port = ctxt->regs->dx & 0xffff; - break; - - /* OUTS opcodes */ - case 0x6e: - case 0x6f: - *exitinfo |= IOIO_TYPE_OUTS; - *exitinfo |= IOIO_SEG_DS; - port = ctxt->regs->dx & 0xffff; - break; - - /* IN immediate opcodes */ - case 0xe4: - case 0xe5: - *exitinfo |= IOIO_TYPE_IN; - port = (u8)insn->immediate.value & 0xffff; - break; - - /* OUT immediate opcodes */ - case 0xe6: - case 0xe7: - *exitinfo |= IOIO_TYPE_OUT; - port = (u8)insn->immediate.value & 0xffff; - break; - - /* IN register opcodes */ - case 0xec: - case 0xed: - *exitinfo |= IOIO_TYPE_IN; - port = ctxt->regs->dx & 0xffff; - break; - - /* OUT register opcodes */ - case 0xee: - case 0xef: - *exitinfo |= IOIO_TYPE_OUT; - port = ctxt->regs->dx & 0xffff; - break; - - default: - return ES_DECODE_FAILED; - } - - *exitinfo |= port << 16; - - switch (insn->opcode.bytes[0]) { - case 0x6c: - case 0x6e: - case 0xe4: - case 0xe6: - case 0xec: - case 0xee: - /* Single byte opcodes */ - *exitinfo |= IOIO_DATA_8; - size = 1; - break; - default: - /* Length determined by instruction parsing */ - *exitinfo |= (insn->opnd_bytes == 2) ? IOIO_DATA_16 - : IOIO_DATA_32; - size = (insn->opnd_bytes == 2) ? 
2 : 4; - } - - switch (insn->addr_bytes) { - case 2: - *exitinfo |= IOIO_ADDR_16; - break; - case 4: - *exitinfo |= IOIO_ADDR_32; - break; - case 8: - *exitinfo |= IOIO_ADDR_64; - break; - } - - if (insn_has_rep_prefix(insn)) - *exitinfo |= IOIO_REP; - - return vc_ioio_check(ctxt, (u16)port, size); -} - -static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) -{ - struct pt_regs *regs = ctxt->regs; - u64 exit_info_1, exit_info_2; - enum es_result ret; - - ret = vc_ioio_exitinfo(ctxt, &exit_info_1); - if (ret != ES_OK) - return ret; - - if (exit_info_1 & IOIO_TYPE_STR) { - - /* (REP) INS/OUTS */ - - bool df = ((regs->flags & X86_EFLAGS_DF) == X86_EFLAGS_DF); - unsigned int io_bytes, exit_bytes; - unsigned int ghcb_count, op_count; - unsigned long es_base; - u64 sw_scratch; - - /* - * For the string variants with rep prefix the amount of in/out - * operations per #VC exception is limited so that the kernel - * has a chance to take interrupts and re-schedule while the - * instruction is emulated. - */ - io_bytes = (exit_info_1 >> 4) & 0x7; - ghcb_count = sizeof(ghcb->shared_buffer) / io_bytes; - - op_count = (exit_info_1 & IOIO_REP) ? regs->cx : 1; - exit_info_2 = min(op_count, ghcb_count); - exit_bytes = exit_info_2 * io_bytes; - - es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES); - - /* Read bytes of OUTS into the shared buffer */ - if (!(exit_info_1 & IOIO_TYPE_IN)) { - ret = vc_insn_string_read(ctxt, - (void *)(es_base + regs->si), - ghcb->shared_buffer, io_bytes, - exit_info_2, df); - if (ret) - return ret; - } - - /* - * Issue an VMGEXIT to the HV to consume the bytes from the - * shared buffer or to have it write them into the shared buffer - * depending on the instruction: OUTS or INS. - */ - sw_scratch = __pa(ghcb) + offsetof(struct ghcb, shared_buffer); - ghcb_set_sw_scratch(ghcb, sw_scratch); - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, - exit_info_1, exit_info_2); - if (ret != ES_OK) - return ret; - - /* Read bytes from shared buffer into the guest's destination. */ - if (exit_info_1 & IOIO_TYPE_IN) { - ret = vc_insn_string_write(ctxt, - (void *)(es_base + regs->di), - ghcb->shared_buffer, io_bytes, - exit_info_2, df); - if (ret) - return ret; - - if (df) - regs->di -= exit_bytes; - else - regs->di += exit_bytes; - } else { - if (df) - regs->si -= exit_bytes; - else - regs->si += exit_bytes; - } - - if (exit_info_1 & IOIO_REP) - regs->cx -= exit_info_2; - - ret = regs->cx ? 
ES_RETRY : ES_OK; - - } else { - - /* IN/OUT into/from rAX */ - - int bits = (exit_info_1 & 0x70) >> 1; - u64 rax = 0; - - if (!(exit_info_1 & IOIO_TYPE_IN)) - rax = lower_bits(regs->ax, bits); - - ghcb_set_rax(ghcb, rax); - - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, exit_info_1, 0); - if (ret != ES_OK) - return ret; - - if (exit_info_1 & IOIO_TYPE_IN) { - if (!ghcb_rax_is_valid(ghcb)) - return ES_VMM_ERROR; - regs->ax = lower_bits(ghcb->save.rax, bits); - } - } - - return ret; -} - -static int vc_handle_cpuid_snp(struct ghcb *ghcb, struct es_em_ctxt *ctxt) -{ - struct pt_regs *regs = ctxt->regs; - struct cpuid_leaf leaf; - int ret; - - leaf.fn = regs->ax; - leaf.subfn = regs->cx; - ret = snp_cpuid(ghcb, ctxt, &leaf); - if (!ret) { - regs->ax = leaf.eax; - regs->bx = leaf.ebx; - regs->cx = leaf.ecx; - regs->dx = leaf.edx; - } - - return ret; -} - -static enum es_result vc_handle_cpuid(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - struct pt_regs *regs = ctxt->regs; - u32 cr4 = native_read_cr4(); - enum es_result ret; - int snp_cpuid_ret; - - snp_cpuid_ret = vc_handle_cpuid_snp(ghcb, ctxt); - if (!snp_cpuid_ret) - return ES_OK; - if (snp_cpuid_ret != -EOPNOTSUPP) - return ES_VMM_ERROR; - - ghcb_set_rax(ghcb, regs->ax); - ghcb_set_rcx(ghcb, regs->cx); - - if (cr4 & X86_CR4_OSXSAVE) - /* Safe to read xcr0 */ - ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK)); - else - /* xgetbv will cause #GP - use reset value for xcr0 */ - ghcb_set_xcr0(ghcb, 1); - - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0); - if (ret != ES_OK) - return ret; - - if (!(ghcb_rax_is_valid(ghcb) && - ghcb_rbx_is_valid(ghcb) && - ghcb_rcx_is_valid(ghcb) && - ghcb_rdx_is_valid(ghcb))) - return ES_VMM_ERROR; - - regs->ax = ghcb->save.rax; - regs->bx = ghcb->save.rbx; - regs->cx = ghcb->save.rcx; - regs->dx = ghcb->save.rdx; - - return ES_OK; -} - -static enum es_result vc_handle_rdtsc(struct ghcb *ghcb, - struct es_em_ctxt *ctxt, - unsigned long exit_code) -{ - bool rdtscp = (exit_code == SVM_EXIT_RDTSCP); - enum es_result ret; - - ret = sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, 0, 0); - if (ret != ES_OK) - return ret; - - if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb) && - (!rdtscp || ghcb_rcx_is_valid(ghcb)))) - return ES_VMM_ERROR; - - ctxt->regs->ax = ghcb->save.rax; - ctxt->regs->dx = ghcb->save.rdx; - if (rdtscp) - ctxt->regs->cx = ghcb->save.rcx; - - return ES_OK; -} - -struct cc_setup_data { - struct setup_data header; - u32 cc_blob_address; -}; - -/* - * Search for a Confidential Computing blob passed in as a setup_data entry - * via the Linux Boot Protocol. - */ -static struct cc_blob_sev_info *find_cc_blob_setup_data(struct boot_params *bp) -{ - struct cc_setup_data *sd = NULL; - struct setup_data *hdr; - - hdr = (struct setup_data *)bp->hdr.setup_data; - - while (hdr) { - if (hdr->type == SETUP_CC_BLOB) { - sd = (struct cc_setup_data *)hdr; - return (struct cc_blob_sev_info *)(unsigned long)sd->cc_blob_address; - } - hdr = (struct setup_data *)hdr->next; - } - - return NULL; -} - -/* - * Initialize the kernel's copy of the SNP CPUID table, and set up the - * pointer that will be used to access it. - * - * Maintaining a direct mapping of the SNP CPUID table used by firmware would - * be possible as an alternative, but the approach is brittle since the - * mapping needs to be updated in sync with all the changes to virtual memory - * layout and related mapping facilities throughout the boot process. 
- */ -static void __init setup_cpuid_table(const struct cc_blob_sev_info *cc_info) -{ - const struct snp_cpuid_table *cpuid_table_fw, *cpuid_table; - int i; - - if (!cc_info || !cc_info->cpuid_phys || cc_info->cpuid_len < PAGE_SIZE) - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID); - - cpuid_table_fw = (const struct snp_cpuid_table *)cc_info->cpuid_phys; - if (!cpuid_table_fw->count || cpuid_table_fw->count > SNP_CPUID_COUNT_MAX) - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID); - - cpuid_table = snp_cpuid_get_table(); - memcpy((void *)cpuid_table, cpuid_table_fw, sizeof(*cpuid_table)); - - /* Initialize CPUID ranges for range-checking. */ - for (i = 0; i < cpuid_table->count; i++) { - const struct snp_cpuid_fn *fn = &cpuid_table->fn[i]; - - if (fn->eax_in == 0x0) - cpuid_std_range_max = fn->eax; - else if (fn->eax_in == 0x40000000) - cpuid_hyp_range_max = fn->eax; - else if (fn->eax_in == 0x80000000) - cpuid_ext_range_max = fn->eax; - } -} - -static void pvalidate_pages(struct snp_psc_desc *desc) -{ - struct psc_entry *e; - unsigned long vaddr; - unsigned int size; - unsigned int i; - bool validate; - int rc; - - for (i = 0; i <= desc->hdr.end_entry; i++) { - e = &desc->entries[i]; - - vaddr = (unsigned long)pfn_to_kaddr(e->gfn); - size = e->pagesize ? RMP_PG_SIZE_2M : RMP_PG_SIZE_4K; - validate = e->operation == SNP_PAGE_STATE_PRIVATE; - - rc = pvalidate(vaddr, size, validate); - if (rc == PVALIDATE_FAIL_SIZEMISMATCH && size == RMP_PG_SIZE_2M) { - unsigned long vaddr_end = vaddr + PMD_SIZE; - - for (; vaddr < vaddr_end; vaddr += PAGE_SIZE) { - rc = pvalidate(vaddr, RMP_PG_SIZE_4K, validate); - if (rc) - break; - } - } - - if (rc) { - WARN(1, "Failed to validate address 0x%lx ret %d", vaddr, rc); - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE); - } - } -} - -static int vmgexit_psc(struct ghcb *ghcb, struct snp_psc_desc *desc) -{ - int cur_entry, end_entry, ret = 0; - struct snp_psc_desc *data; - struct es_em_ctxt ctxt; - - vc_ghcb_invalidate(ghcb); - - /* Copy the input desc into GHCB shared buffer */ - data = (struct snp_psc_desc *)ghcb->shared_buffer; - memcpy(ghcb->shared_buffer, desc, min_t(int, GHCB_SHARED_BUF_SIZE, sizeof(*desc))); - - /* - * As per the GHCB specification, the hypervisor can resume the guest - * before processing all the entries. Check whether all the entries - * are processed. If not, then keep retrying. Note, the hypervisor - * will update the data memory directly to indicate the status, so - * reference the data->hdr everywhere. - * - * The strategy here is to wait for the hypervisor to change the page - * state in the RMP table before guest accesses the memory pages. If the - * page state change was not successful, then later memory access will - * result in a crash. - */ - cur_entry = data->hdr.cur_entry; - end_entry = data->hdr.end_entry; - - while (data->hdr.cur_entry <= data->hdr.end_entry) { - ghcb_set_sw_scratch(ghcb, (u64)__pa(data)); - - /* This will advance the shared buffer data points to. */ - ret = sev_es_ghcb_hv_call(ghcb, &ctxt, SVM_VMGEXIT_PSC, 0, 0); - - /* - * Page State Change VMGEXIT can pass error code through - * exit_info_2. - */ - if (WARN(ret || ghcb->save.sw_exit_info_2, - "SNP: PSC failed ret=%d exit_info_2=%llx\n", - ret, ghcb->save.sw_exit_info_2)) { - ret = 1; - goto out; - } - - /* Verify that reserved bit is not set */ - if (WARN(data->hdr.reserved, "Reserved bit is set in the PSC header\n")) { - ret = 1; - goto out; - } - - /* - * Sanity check that entry processing is not going backwards. 
- * This will happen only if hypervisor is tricking us. - */ - if (WARN(data->hdr.end_entry > end_entry || cur_entry > data->hdr.cur_entry, -"SNP: PSC processing going backward, end_entry %d (got %d) cur_entry %d (got %d)\n", - end_entry, data->hdr.end_entry, cur_entry, data->hdr.cur_entry)) { - ret = 1; - goto out; - } - } - -out: - return ret; -} diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c deleted file mode 100644 index c67285824e82..000000000000 --- a/arch/x86/kernel/sev.c +++ /dev/null @@ -1,2264 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * AMD Memory Encryption Support - * - * Copyright (C) 2019 SUSE - * - * Author: Joerg Roedel <jroedel@suse.de> - */ - -#define pr_fmt(fmt) "SEV: " fmt - -#include <linux/sched/debug.h> /* For show_regs() */ -#include <linux/percpu-defs.h> -#include <linux/cc_platform.h> -#include <linux/printk.h> -#include <linux/mm_types.h> -#include <linux/set_memory.h> -#include <linux/memblock.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/cpumask.h> -#include <linux/efi.h> -#include <linux/platform_device.h> -#include <linux/io.h> -#include <linux/psp-sev.h> -#include <uapi/linux/sev-guest.h> - -#include <asm/cpu_entry_area.h> -#include <asm/stacktrace.h> -#include <asm/sev.h> -#include <asm/insn-eval.h> -#include <asm/fpu/xcr.h> -#include <asm/processor.h> -#include <asm/realmode.h> -#include <asm/setup.h> -#include <asm/traps.h> -#include <asm/svm.h> -#include <asm/smp.h> -#include <asm/cpu.h> -#include <asm/apic.h> -#include <asm/cpuid.h> -#include <asm/cmdline.h> - -#define DR7_RESET_VALUE 0x400 - -/* AP INIT values as documented in the APM2 section "Processor Initialization State" */ -#define AP_INIT_CS_LIMIT 0xffff -#define AP_INIT_DS_LIMIT 0xffff -#define AP_INIT_LDTR_LIMIT 0xffff -#define AP_INIT_GDTR_LIMIT 0xffff -#define AP_INIT_IDTR_LIMIT 0xffff -#define AP_INIT_TR_LIMIT 0xffff -#define AP_INIT_RFLAGS_DEFAULT 0x2 -#define AP_INIT_DR6_DEFAULT 0xffff0ff0 -#define AP_INIT_GPAT_DEFAULT 0x0007040600070406ULL -#define AP_INIT_XCR0_DEFAULT 0x1 -#define AP_INIT_X87_FTW_DEFAULT 0x5555 -#define AP_INIT_X87_FCW_DEFAULT 0x0040 -#define AP_INIT_CR0_DEFAULT 0x60000010 -#define AP_INIT_MXCSR_DEFAULT 0x1f80 - -/* For early boot hypervisor communication in SEV-ES enabled guests */ -static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); - -/* - * Needs to be in the .data section because we need it NULL before bss is - * cleared - */ -static struct ghcb *boot_ghcb __section(".data"); - -/* Bitmap of SEV features supported by the hypervisor */ -static u64 sev_hv_features __ro_after_init; - -/* #VC handler runtime per-CPU data */ -struct sev_es_runtime_data { - struct ghcb ghcb_page; - - /* - * Reserve one page per CPU as backup storage for the unencrypted GHCB. - * It is needed when an NMI happens while the #VC handler uses the real - * GHCB, and the NMI handler itself is causing another #VC exception. In - * that case the GHCB content of the first handler needs to be backed up - * and restored. - */ - struct ghcb backup_ghcb; - - /* - * Mark the per-cpu GHCBs as in-use to detect nested #VC exceptions. - * There is no need for it to be atomic, because nothing is written to - * the GHCB between the read and the write of ghcb_active. So it is safe - * to use it when a nested #VC exception happens before the write. - * - * This is necessary for example in the #VC->NMI->#VC case when the NMI - * happens while the first #VC handler uses the GHCB. 
When the NMI code - * raises a second #VC handler it might overwrite the contents of the - * GHCB written by the first handler. To avoid this the content of the - * GHCB is saved and restored when the GHCB is detected to be in use - * already. - */ - bool ghcb_active; - bool backup_ghcb_active; - - /* - * Cached DR7 value - write it on DR7 writes and return it on reads. - * That value will never make it to the real hardware DR7 as debugging - * is currently unsupported in SEV-ES guests. - */ - unsigned long dr7; -}; - -struct ghcb_state { - struct ghcb *ghcb; -}; - -static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data); -static DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa); - -struct sev_config { - __u64 debug : 1, - - /* - * A flag used by __set_pages_state() that indicates when the - * per-CPU GHCB has been created and registered and thus can be - * used by the BSP instead of the early boot GHCB. - * - * For APs, the per-CPU GHCB is created before they are started - * and registered upon startup, so this flag can be used globally - * for the BSP and APs. - */ - ghcbs_initialized : 1, - - __reserved : 62; -}; - -static struct sev_config sev_cfg __read_mostly; - -static __always_inline bool on_vc_stack(struct pt_regs *regs) -{ - unsigned long sp = regs->sp; - - /* User-mode RSP is not trusted */ - if (user_mode(regs)) - return false; - - /* SYSCALL gap still has user-mode RSP */ - if (ip_within_syscall_gap(regs)) - return false; - - return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC))); -} - -/* - * This function handles the case when an NMI is raised in the #VC - * exception handler entry code, before the #VC handler has switched off - * its IST stack. In this case, the IST entry for #VC must be adjusted, - * so that any nested #VC exception will not overwrite the stack - * contents of the interrupted #VC handler. - * - * The IST entry is adjusted unconditionally so that it can be also be - * unconditionally adjusted back in __sev_es_ist_exit(). Otherwise a - * nested sev_es_ist_exit() call may adjust back the IST entry too - * early. - * - * The __sev_es_ist_enter() and __sev_es_ist_exit() functions always run - * on the NMI IST stack, as they are only called from NMI handling code - * right now. - */ -void noinstr __sev_es_ist_enter(struct pt_regs *regs) -{ - unsigned long old_ist, new_ist; - - /* Read old IST entry */ - new_ist = old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); - - /* - * If NMI happened while on the #VC IST stack, set the new IST - * value below regs->sp, so that the interrupted stack frame is - * not overwritten by subsequent #VC exceptions. - */ - if (on_vc_stack(regs)) - new_ist = regs->sp; - - /* - * Reserve additional 8 bytes and store old IST value so this - * adjustment can be unrolled in __sev_es_ist_exit(). - */ - new_ist -= sizeof(old_ist); - *(unsigned long *)new_ist = old_ist; - - /* Set new IST entry */ - this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], new_ist); -} - -void noinstr __sev_es_ist_exit(void) -{ - unsigned long ist; - - /* Read IST entry */ - ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); - - if (WARN_ON(ist == __this_cpu_ist_top_va(VC))) - return; - - /* Read back old IST entry and write it to the TSS */ - this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *(unsigned long *)ist); -} - -/* - * Nothing shall interrupt this code path while holding the per-CPU - * GHCB. The backup GHCB is only for NMIs interrupting this path. 
- * - * Callers must disable local interrupts around it. - */ -static noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state) -{ - struct sev_es_runtime_data *data; - struct ghcb *ghcb; - - WARN_ON(!irqs_disabled()); - - data = this_cpu_read(runtime_data); - ghcb = &data->ghcb_page; - - if (unlikely(data->ghcb_active)) { - /* GHCB is already in use - save its contents */ - - if (unlikely(data->backup_ghcb_active)) { - /* - * Backup-GHCB is also already in use. There is no way - * to continue here so just kill the machine. To make - * panic() work, mark GHCBs inactive so that messages - * can be printed out. - */ - data->ghcb_active = false; - data->backup_ghcb_active = false; - - instrumentation_begin(); - panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use"); - instrumentation_end(); - } - - /* Mark backup_ghcb active before writing to it */ - data->backup_ghcb_active = true; - - state->ghcb = &data->backup_ghcb; - - /* Backup GHCB content */ - *state->ghcb = *ghcb; - } else { - state->ghcb = NULL; - data->ghcb_active = true; - } - - return ghcb; -} - -static inline u64 sev_es_rd_ghcb_msr(void) -{ - return __rdmsr(MSR_AMD64_SEV_ES_GHCB); -} - -static __always_inline void sev_es_wr_ghcb_msr(u64 val) -{ - u32 low, high; - - low = (u32)(val); - high = (u32)(val >> 32); - - native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high); -} - -static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt, - unsigned char *buffer) -{ - return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE); -} - -static enum es_result __vc_decode_user_insn(struct es_em_ctxt *ctxt) -{ - char buffer[MAX_INSN_SIZE]; - int insn_bytes; - - insn_bytes = insn_fetch_from_user_inatomic(ctxt->regs, buffer); - if (insn_bytes == 0) { - /* Nothing could be copied */ - ctxt->fi.vector = X86_TRAP_PF; - ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER; - ctxt->fi.cr2 = ctxt->regs->ip; - return ES_EXCEPTION; - } else if (insn_bytes == -EINVAL) { - /* Effective RIP could not be calculated */ - ctxt->fi.vector = X86_TRAP_GP; - ctxt->fi.error_code = 0; - ctxt->fi.cr2 = 0; - return ES_EXCEPTION; - } - - if (!insn_decode_from_regs(&ctxt->insn, ctxt->regs, buffer, insn_bytes)) - return ES_DECODE_FAILED; - - if (ctxt->insn.immediate.got) - return ES_OK; - else - return ES_DECODE_FAILED; -} - -static enum es_result __vc_decode_kern_insn(struct es_em_ctxt *ctxt) -{ - char buffer[MAX_INSN_SIZE]; - int res, ret; - - res = vc_fetch_insn_kernel(ctxt, buffer); - if (res) { - ctxt->fi.vector = X86_TRAP_PF; - ctxt->fi.error_code = X86_PF_INSTR; - ctxt->fi.cr2 = ctxt->regs->ip; - return ES_EXCEPTION; - } - - ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64); - if (ret < 0) - return ES_DECODE_FAILED; - else - return ES_OK; -} - -static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) -{ - if (user_mode(ctxt->regs)) - return __vc_decode_user_insn(ctxt); - else - return __vc_decode_kern_insn(ctxt); -} - -static enum es_result vc_write_mem(struct es_em_ctxt *ctxt, - char *dst, char *buf, size_t size) -{ - unsigned long error_code = X86_PF_PROT | X86_PF_WRITE; - - /* - * This function uses __put_user() independent of whether kernel or user - * memory is accessed. This works fine because __put_user() does no - * sanity checks of the pointer being accessed. All that it does is - * to report when the access failed. - * - * Also, this function runs in atomic context, so __put_user() is not - * allowed to sleep. 
The page-fault handler detects that it is running - * in atomic context and will not try to take mmap_sem and handle the - * fault, so additional pagefault_enable()/disable() calls are not - * needed. - * - * The access can't be done via copy_to_user() here because - * vc_write_mem() must not use string instructions to access unsafe - * memory. The reason is that MOVS is emulated by the #VC handler by - * splitting the move up into a read and a write and taking a nested #VC - * exception on whatever of them is the MMIO access. Using string - * instructions here would cause infinite nesting. - */ - switch (size) { - case 1: { - u8 d1; - u8 __user *target = (u8 __user *)dst; - - memcpy(&d1, buf, 1); - if (__put_user(d1, target)) - goto fault; - break; - } - case 2: { - u16 d2; - u16 __user *target = (u16 __user *)dst; - - memcpy(&d2, buf, 2); - if (__put_user(d2, target)) - goto fault; - break; - } - case 4: { - u32 d4; - u32 __user *target = (u32 __user *)dst; - - memcpy(&d4, buf, 4); - if (__put_user(d4, target)) - goto fault; - break; - } - case 8: { - u64 d8; - u64 __user *target = (u64 __user *)dst; - - memcpy(&d8, buf, 8); - if (__put_user(d8, target)) - goto fault; - break; - } - default: - WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size); - return ES_UNSUPPORTED; - } - - return ES_OK; - -fault: - if (user_mode(ctxt->regs)) - error_code |= X86_PF_USER; - - ctxt->fi.vector = X86_TRAP_PF; - ctxt->fi.error_code = error_code; - ctxt->fi.cr2 = (unsigned long)dst; - - return ES_EXCEPTION; -} - -static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, - char *src, char *buf, size_t size) -{ - unsigned long error_code = X86_PF_PROT; - - /* - * This function uses __get_user() independent of whether kernel or user - * memory is accessed. This works fine because __get_user() does no - * sanity checks of the pointer being accessed. All that it does is - * to report when the access failed. - * - * Also, this function runs in atomic context, so __get_user() is not - * allowed to sleep. The page-fault handler detects that it is running - * in atomic context and will not try to take mmap_sem and handle the - * fault, so additional pagefault_enable()/disable() calls are not - * needed. - * - * The access can't be done via copy_from_user() here because - * vc_read_mem() must not use string instructions to access unsafe - * memory. The reason is that MOVS is emulated by the #VC handler by - * splitting the move up into a read and a write and taking a nested #VC - * exception on whatever of them is the MMIO access. Using string - * instructions here would cause infinite nesting. 
- */ - switch (size) { - case 1: { - u8 d1; - u8 __user *s = (u8 __user *)src; - - if (__get_user(d1, s)) - goto fault; - memcpy(buf, &d1, 1); - break; - } - case 2: { - u16 d2; - u16 __user *s = (u16 __user *)src; - - if (__get_user(d2, s)) - goto fault; - memcpy(buf, &d2, 2); - break; - } - case 4: { - u32 d4; - u32 __user *s = (u32 __user *)src; - - if (__get_user(d4, s)) - goto fault; - memcpy(buf, &d4, 4); - break; - } - case 8: { - u64 d8; - u64 __user *s = (u64 __user *)src; - if (__get_user(d8, s)) - goto fault; - memcpy(buf, &d8, 8); - break; - } - default: - WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size); - return ES_UNSUPPORTED; - } - - return ES_OK; - -fault: - if (user_mode(ctxt->regs)) - error_code |= X86_PF_USER; - - ctxt->fi.vector = X86_TRAP_PF; - ctxt->fi.error_code = error_code; - ctxt->fi.cr2 = (unsigned long)src; - - return ES_EXCEPTION; -} - -static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt, - unsigned long vaddr, phys_addr_t *paddr) -{ - unsigned long va = (unsigned long)vaddr; - unsigned int level; - phys_addr_t pa; - pgd_t *pgd; - pte_t *pte; - - pgd = __va(read_cr3_pa()); - pgd = &pgd[pgd_index(va)]; - pte = lookup_address_in_pgd(pgd, va, &level); - if (!pte) { - ctxt->fi.vector = X86_TRAP_PF; - ctxt->fi.cr2 = vaddr; - ctxt->fi.error_code = 0; - - if (user_mode(ctxt->regs)) - ctxt->fi.error_code |= X86_PF_USER; - - return ES_EXCEPTION; - } - - if (WARN_ON_ONCE(pte_val(*pte) & _PAGE_ENC)) - /* Emulated MMIO to/from encrypted memory not supported */ - return ES_UNSUPPORTED; - - pa = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; - pa |= va & ~page_level_mask(level); - - *paddr = pa; - - return ES_OK; -} - -static enum es_result vc_ioio_check(struct es_em_ctxt *ctxt, u16 port, size_t size) -{ - BUG_ON(size > 4); - - if (user_mode(ctxt->regs)) { - struct thread_struct *t = ¤t->thread; - struct io_bitmap *iobm = t->io_bitmap; - size_t idx; - - if (!iobm) - goto fault; - - for (idx = port; idx < port + size; ++idx) { - if (test_bit(idx, iobm->bitmap)) - goto fault; - } - } - - return ES_OK; - -fault: - ctxt->fi.vector = X86_TRAP_GP; - ctxt->fi.error_code = 0; - - return ES_EXCEPTION; -} - -/* Include code shared with pre-decompression boot stage */ -#include "sev-shared.c" - -static noinstr void __sev_put_ghcb(struct ghcb_state *state) -{ - struct sev_es_runtime_data *data; - struct ghcb *ghcb; - - WARN_ON(!irqs_disabled()); - - data = this_cpu_read(runtime_data); - ghcb = &data->ghcb_page; - - if (state->ghcb) { - /* Restore GHCB from Backup */ - *ghcb = *state->ghcb; - data->backup_ghcb_active = false; - state->ghcb = NULL; - } else { - /* - * Invalidate the GHCB so a VMGEXIT instruction issued - * from userspace won't appear to be valid. - */ - vc_ghcb_invalidate(ghcb); - data->ghcb_active = false; - } -} - -void noinstr __sev_es_nmi_complete(void) -{ - struct ghcb_state state; - struct ghcb *ghcb; - - ghcb = __sev_get_ghcb(&state); - - vc_ghcb_invalidate(ghcb); - ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_NMI_COMPLETE); - ghcb_set_sw_exit_info_1(ghcb, 0); - ghcb_set_sw_exit_info_2(ghcb, 0); - - sev_es_wr_ghcb_msr(__pa_nodebug(ghcb)); - VMGEXIT(); - - __sev_put_ghcb(&state); -} - -static u64 __init get_secrets_page(void) -{ - u64 pa_data = boot_params.cc_blob_address; - struct cc_blob_sev_info info; - void *map; - - /* - * The CC blob contains the address of the secrets page, check if the - * blob is present. 
- */ - if (!pa_data) - return 0; - - map = early_memremap(pa_data, sizeof(info)); - if (!map) { - pr_err("Unable to locate SNP secrets page: failed to map the Confidential Computing blob.\n"); - return 0; - } - memcpy(&info, map, sizeof(info)); - early_memunmap(map, sizeof(info)); - - /* smoke-test the secrets page passed */ - if (!info.secrets_phys || info.secrets_len != PAGE_SIZE) - return 0; - - return info.secrets_phys; -} - -static u64 __init get_snp_jump_table_addr(void) -{ - struct snp_secrets_page_layout *layout; - void __iomem *mem; - u64 pa, addr; - - pa = get_secrets_page(); - if (!pa) - return 0; - - mem = ioremap_encrypted(pa, PAGE_SIZE); - if (!mem) { - pr_err("Unable to locate AP jump table address: failed to map the SNP secrets page.\n"); - return 0; - } - - layout = (__force struct snp_secrets_page_layout *)mem; - - addr = layout->os_area.ap_jump_table_pa; - iounmap(mem); - - return addr; -} - -static u64 __init get_jump_table_addr(void) -{ - struct ghcb_state state; - unsigned long flags; - struct ghcb *ghcb; - u64 ret = 0; - - if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) - return get_snp_jump_table_addr(); - - local_irq_save(flags); - - ghcb = __sev_get_ghcb(&state); - - vc_ghcb_invalidate(ghcb); - ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_JUMP_TABLE); - ghcb_set_sw_exit_info_1(ghcb, SVM_VMGEXIT_GET_AP_JUMP_TABLE); - ghcb_set_sw_exit_info_2(ghcb, 0); - - sev_es_wr_ghcb_msr(__pa(ghcb)); - VMGEXIT(); - - if (ghcb_sw_exit_info_1_is_valid(ghcb) && - ghcb_sw_exit_info_2_is_valid(ghcb)) - ret = ghcb->save.sw_exit_info_2; - - __sev_put_ghcb(&state); - - local_irq_restore(flags); - - return ret; -} - -static void early_set_pages_state(unsigned long vaddr, unsigned long paddr, - unsigned long npages, enum psc_op op) -{ - unsigned long paddr_end; - u64 val; - int ret; - - vaddr = vaddr & PAGE_MASK; - - paddr = paddr & PAGE_MASK; - paddr_end = paddr + (npages << PAGE_SHIFT); - - while (paddr < paddr_end) { - if (op == SNP_PAGE_STATE_SHARED) { - /* Page validation must be rescinded before changing to shared */ - ret = pvalidate(vaddr, RMP_PG_SIZE_4K, false); - if (WARN(ret, "Failed to validate address 0x%lx ret %d", paddr, ret)) - goto e_term; - } - - /* - * Use the MSR protocol because this function can be called before - * the GHCB is established. - */ - sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op)); - VMGEXIT(); - - val = sev_es_rd_ghcb_msr(); - - if (WARN(GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP, - "Wrong PSC response code: 0x%x\n", - (unsigned int)GHCB_RESP_CODE(val))) - goto e_term; - - if (WARN(GHCB_MSR_PSC_RESP_VAL(val), - "Failed to change page state to '%s' paddr 0x%lx error 0x%llx\n", - op == SNP_PAGE_STATE_PRIVATE ? "private" : "shared", - paddr, GHCB_MSR_PSC_RESP_VAL(val))) - goto e_term; - - if (op == SNP_PAGE_STATE_PRIVATE) { - /* Page validation must be performed after changing to private */ - ret = pvalidate(vaddr, RMP_PG_SIZE_4K, true); - if (WARN(ret, "Failed to validate address 0x%lx ret %d", paddr, ret)) - goto e_term; - } - - vaddr += PAGE_SIZE; - paddr += PAGE_SIZE; - } - - return; - -e_term: - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); -} - -void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, - unsigned long npages) -{ - /* - * This can be invoked in early boot while running identity mapped, so - * use an open coded check for SNP instead of using cc_platform_has(). - * This eliminates worries about jump tables or checking boot_cpu_data - * in the cc_platform_has() function. 
- */ - if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED)) - return; - - /* - * Ask the hypervisor to mark the memory pages as private in the RMP - * table. - */ - early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_PRIVATE); -} - -void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, - unsigned long npages) -{ - /* - * This can be invoked in early boot while running identity mapped, so - * use an open coded check for SNP instead of using cc_platform_has(). - * This eliminates worries about jump tables or checking boot_cpu_data - * in the cc_platform_has() function. - */ - if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED)) - return; - - /* Ask hypervisor to mark the memory pages shared in the RMP table. */ - early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_SHARED); -} - -void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op) -{ - unsigned long vaddr, npages; - - vaddr = (unsigned long)__va(paddr); - npages = PAGE_ALIGN(sz) >> PAGE_SHIFT; - - if (op == SNP_PAGE_STATE_PRIVATE) - early_snp_set_memory_private(vaddr, paddr, npages); - else if (op == SNP_PAGE_STATE_SHARED) - early_snp_set_memory_shared(vaddr, paddr, npages); - else - WARN(1, "invalid memory op %d\n", op); -} - -static unsigned long __set_pages_state(struct snp_psc_desc *data, unsigned long vaddr, - unsigned long vaddr_end, int op) -{ - struct ghcb_state state; - bool use_large_entry; - struct psc_hdr *hdr; - struct psc_entry *e; - unsigned long flags; - unsigned long pfn; - struct ghcb *ghcb; - int i; - - hdr = &data->hdr; - e = data->entries; - - memset(data, 0, sizeof(*data)); - i = 0; - - while (vaddr < vaddr_end && i < ARRAY_SIZE(data->entries)) { - hdr->end_entry = i; - - if (is_vmalloc_addr((void *)vaddr)) { - pfn = vmalloc_to_pfn((void *)vaddr); - use_large_entry = false; - } else { - pfn = __pa(vaddr) >> PAGE_SHIFT; - use_large_entry = true; - } - - e->gfn = pfn; - e->operation = op; - - if (use_large_entry && IS_ALIGNED(vaddr, PMD_SIZE) && - (vaddr_end - vaddr) >= PMD_SIZE) { - e->pagesize = RMP_PG_SIZE_2M; - vaddr += PMD_SIZE; - } else { - e->pagesize = RMP_PG_SIZE_4K; - vaddr += PAGE_SIZE; - } - - e++; - i++; - } - - /* Page validation must be rescinded before changing to shared */ - if (op == SNP_PAGE_STATE_SHARED) - pvalidate_pages(data); - - local_irq_save(flags); - - if (sev_cfg.ghcbs_initialized) - ghcb = __sev_get_ghcb(&state); - else - ghcb = boot_ghcb; - - /* Invoke the hypervisor to perform the page state changes */ - if (!ghcb || vmgexit_psc(ghcb, data)) - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); - - if (sev_cfg.ghcbs_initialized) - __sev_put_ghcb(&state); - - local_irq_restore(flags); - - /* Page validation must be performed after changing to private */ - if (op == SNP_PAGE_STATE_PRIVATE) - pvalidate_pages(data); - - return vaddr; -} - -static void set_pages_state(unsigned long vaddr, unsigned long npages, int op) -{ - struct snp_psc_desc desc; - unsigned long vaddr_end; - - /* Use the MSR protocol when a GHCB is not available. 
*/ - if (!boot_ghcb) - return early_set_pages_state(vaddr, __pa(vaddr), npages, op); - - vaddr = vaddr & PAGE_MASK; - vaddr_end = vaddr + (npages << PAGE_SHIFT); - - while (vaddr < vaddr_end) - vaddr = __set_pages_state(&desc, vaddr, vaddr_end, op); -} - -void snp_set_memory_shared(unsigned long vaddr, unsigned long npages) -{ - if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) - return; - - set_pages_state(vaddr, npages, SNP_PAGE_STATE_SHARED); -} - -void snp_set_memory_private(unsigned long vaddr, unsigned long npages) -{ - if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) - return; - - set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE); -} - -void snp_accept_memory(phys_addr_t start, phys_addr_t end) -{ - unsigned long vaddr, npages; - - if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) - return; - - vaddr = (unsigned long)__va(start); - npages = (end - start) >> PAGE_SHIFT; - - set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE); -} - -static int snp_set_vmsa(void *va, bool vmsa) -{ - u64 attrs; - - /* - * Running at VMPL0 allows the kernel to change the VMSA bit for a page - * using the RMPADJUST instruction. However, for the instruction to - * succeed it must target the permissions of a lesser privileged - * (higher numbered) VMPL level, so use VMPL1 (refer to the RMPADJUST - * instruction in the AMD64 APM Volume 3). - */ - attrs = 1; - if (vmsa) - attrs |= RMPADJUST_VMSA_PAGE_BIT; - - return rmpadjust((unsigned long)va, RMP_PG_SIZE_4K, attrs); -} - -#define __ATTR_BASE (SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK) -#define INIT_CS_ATTRIBS (__ATTR_BASE | SVM_SELECTOR_READ_MASK | SVM_SELECTOR_CODE_MASK) -#define INIT_DS_ATTRIBS (__ATTR_BASE | SVM_SELECTOR_WRITE_MASK) - -#define INIT_LDTR_ATTRIBS (SVM_SELECTOR_P_MASK | 2) -#define INIT_TR_ATTRIBS (SVM_SELECTOR_P_MASK | 3) - -static void *snp_alloc_vmsa_page(void) -{ - struct page *p; - - /* - * Allocate VMSA page to work around the SNP erratum where the CPU will - * incorrectly signal an RMP violation #PF if a large page (2MB or 1GB) - * collides with the RMP entry of VMSA page. The recommended workaround - * is to not use a large page. - * - * Allocate an 8k page which is also 8k-aligned. - */ - p = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 1); - if (!p) - return NULL; - - split_page(p, 1); - - /* Free the first 4k. This page may be 2M/1G aligned and cannot be used. */ - __free_page(p); - - return page_address(p + 1); -} - -static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa) -{ - int err; - - err = snp_set_vmsa(vmsa, false); - if (err) - pr_err("clear VMSA page failed (%u), leaking page\n", err); - else - free_page((unsigned long)vmsa); -} - -static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip) -{ - struct sev_es_save_area *cur_vmsa, *vmsa; - struct ghcb_state state; - unsigned long flags; - struct ghcb *ghcb; - u8 sipi_vector; - int cpu, ret; - u64 cr4; - - /* - * The hypervisor SNP feature support check has happened earlier, just check - * the AP_CREATION one here. - */ - if (!(sev_hv_features & GHCB_HV_FT_SNP_AP_CREATION)) - return -EOPNOTSUPP; - - /* - * Verify the desired start IP against the known trampoline start IP - * to catch any future new trampolines that may be introduced that - * would require a new protected guest entry point. 
- */ - if (WARN_ONCE(start_ip != real_mode_header->trampoline_start, - "Unsupported SNP start_ip: %lx\n", start_ip)) - return -EINVAL; - - /* Override start_ip with known protected guest start IP */ - start_ip = real_mode_header->sev_es_trampoline_start; - - /* Find the logical CPU for the APIC ID */ - for_each_present_cpu(cpu) { - if (arch_match_cpu_phys_id(cpu, apic_id)) - break; - } - if (cpu >= nr_cpu_ids) - return -EINVAL; - - cur_vmsa = per_cpu(sev_vmsa, cpu); - - /* - * A new VMSA is created each time because there is no guarantee that - * the current VMSA is the kernels or that the vCPU is not running. If - * an attempt was done to use the current VMSA with a running vCPU, a - * #VMEXIT of that vCPU would wipe out all of the settings being done - * here. - */ - vmsa = (struct sev_es_save_area *)snp_alloc_vmsa_page(); - if (!vmsa) - return -ENOMEM; - - /* CR4 should maintain the MCE value */ - cr4 = native_read_cr4() & X86_CR4_MCE; - - /* Set the CS value based on the start_ip converted to a SIPI vector */ - sipi_vector = (start_ip >> 12); - vmsa->cs.base = sipi_vector << 12; - vmsa->cs.limit = AP_INIT_CS_LIMIT; - vmsa->cs.attrib = INIT_CS_ATTRIBS; - vmsa->cs.selector = sipi_vector << 8; - - /* Set the RIP value based on start_ip */ - vmsa->rip = start_ip & 0xfff; - - /* Set AP INIT defaults as documented in the APM */ - vmsa->ds.limit = AP_INIT_DS_LIMIT; - vmsa->ds.attrib = INIT_DS_ATTRIBS; - vmsa->es = vmsa->ds; - vmsa->fs = vmsa->ds; - vmsa->gs = vmsa->ds; - vmsa->ss = vmsa->ds; - - vmsa->gdtr.limit = AP_INIT_GDTR_LIMIT; - vmsa->ldtr.limit = AP_INIT_LDTR_LIMIT; - vmsa->ldtr.attrib = INIT_LDTR_ATTRIBS; - vmsa->idtr.limit = AP_INIT_IDTR_LIMIT; - vmsa->tr.limit = AP_INIT_TR_LIMIT; - vmsa->tr.attrib = INIT_TR_ATTRIBS; - - vmsa->cr4 = cr4; - vmsa->cr0 = AP_INIT_CR0_DEFAULT; - vmsa->dr7 = DR7_RESET_VALUE; - vmsa->dr6 = AP_INIT_DR6_DEFAULT; - vmsa->rflags = AP_INIT_RFLAGS_DEFAULT; - vmsa->g_pat = AP_INIT_GPAT_DEFAULT; - vmsa->xcr0 = AP_INIT_XCR0_DEFAULT; - vmsa->mxcsr = AP_INIT_MXCSR_DEFAULT; - vmsa->x87_ftw = AP_INIT_X87_FTW_DEFAULT; - vmsa->x87_fcw = AP_INIT_X87_FCW_DEFAULT; - - /* SVME must be set. 
*/ - vmsa->efer = EFER_SVME; - - /* - * Set the SNP-specific fields for this VMSA: - * VMPL level - * SEV_FEATURES (matches the SEV STATUS MSR right shifted 2 bits) - */ - vmsa->vmpl = 0; - vmsa->sev_features = sev_status >> 2; - - /* Switch the page over to a VMSA page now that it is initialized */ - ret = snp_set_vmsa(vmsa, true); - if (ret) { - pr_err("set VMSA page failed (%u)\n", ret); - free_page((unsigned long)vmsa); - - return -EINVAL; - } - - /* Issue VMGEXIT AP Creation NAE event */ - local_irq_save(flags); - - ghcb = __sev_get_ghcb(&state); - - vc_ghcb_invalidate(ghcb); - ghcb_set_rax(ghcb, vmsa->sev_features); - ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_CREATION); - ghcb_set_sw_exit_info_1(ghcb, ((u64)apic_id << 32) | SVM_VMGEXIT_AP_CREATE); - ghcb_set_sw_exit_info_2(ghcb, __pa(vmsa)); - - sev_es_wr_ghcb_msr(__pa(ghcb)); - VMGEXIT(); - - if (!ghcb_sw_exit_info_1_is_valid(ghcb) || - lower_32_bits(ghcb->save.sw_exit_info_1)) { - pr_err("SNP AP Creation error\n"); - ret = -EINVAL; - } - - __sev_put_ghcb(&state); - - local_irq_restore(flags); - - /* Perform cleanup if there was an error */ - if (ret) { - snp_cleanup_vmsa(vmsa); - vmsa = NULL; - } - - /* Free up any previous VMSA page */ - if (cur_vmsa) - snp_cleanup_vmsa(cur_vmsa); - - /* Record the current VMSA page */ - per_cpu(sev_vmsa, cpu) = vmsa; - - return ret; -} - -void __init snp_set_wakeup_secondary_cpu(void) -{ - if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) - return; - - /* - * Always set this override if SNP is enabled. This makes it the - * required method to start APs under SNP. If the hypervisor does - * not support AP creation, then no APs will be started. - */ - apic_update_callback(wakeup_secondary_cpu, wakeup_cpu_via_vmgexit); -} - -int __init sev_es_setup_ap_jump_table(struct real_mode_header *rmh) -{ - u16 startup_cs, startup_ip; - phys_addr_t jump_table_pa; - u64 jump_table_addr; - u16 __iomem *jump_table; - - jump_table_addr = get_jump_table_addr(); - - /* On UP guests there is no jump table so this is not a failure */ - if (!jump_table_addr) - return 0; - - /* Check if AP Jump Table is page-aligned */ - if (jump_table_addr & ~PAGE_MASK) - return -EINVAL; - - jump_table_pa = jump_table_addr & PAGE_MASK; - - startup_cs = (u16)(rmh->trampoline_start >> 4); - startup_ip = (u16)(rmh->sev_es_trampoline_start - - rmh->trampoline_start); - - jump_table = ioremap_encrypted(jump_table_pa, PAGE_SIZE); - if (!jump_table) - return -EIO; - - writew(startup_ip, &jump_table[0]); - writew(startup_cs, &jump_table[1]); - - iounmap(jump_table); - - return 0; -} - -/* - * This is needed by the OVMF UEFI firmware which will use whatever it finds in - * the GHCB MSR as its GHCB to talk to the hypervisor. So make sure the per-cpu - * runtime GHCBs used by the kernel are also mapped in the EFI page-table. - */ -int __init sev_es_efi_map_ghcbs(pgd_t *pgd) -{ - struct sev_es_runtime_data *data; - unsigned long address, pflags; - int cpu; - u64 pfn; - - if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) - return 0; - - pflags = _PAGE_NX | _PAGE_RW; - - for_each_possible_cpu(cpu) { - data = per_cpu(runtime_data, cpu); - - address = __pa(&data->ghcb_page); - pfn = address >> PAGE_SHIFT; - - if (kernel_map_pages_in_pgd(pgd, pfn, address, 1, pflags)) - return 1; - } - - return 0; -} - -static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) -{ - struct pt_regs *regs = ctxt->regs; - enum es_result ret; - u64 exit_info_1; - - /* Is it a WRMSR? */ - exit_info_1 = (ctxt->insn.opcode.bytes[1] == 0x30) ? 
1 : 0; - - ghcb_set_rcx(ghcb, regs->cx); - if (exit_info_1) { - ghcb_set_rax(ghcb, regs->ax); - ghcb_set_rdx(ghcb, regs->dx); - } - - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_MSR, exit_info_1, 0); - - if ((ret == ES_OK) && (!exit_info_1)) { - regs->ax = ghcb->save.rax; - regs->dx = ghcb->save.rdx; - } - - return ret; -} - -static void snp_register_per_cpu_ghcb(void) -{ - struct sev_es_runtime_data *data; - struct ghcb *ghcb; - - data = this_cpu_read(runtime_data); - ghcb = &data->ghcb_page; - - snp_register_ghcb_early(__pa(ghcb)); -} - -void setup_ghcb(void) -{ - if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) - return; - - /* - * Check whether the runtime #VC exception handler is active. It uses - * the per-CPU GHCB page which is set up by sev_es_init_vc_handling(). - * - * If SNP is active, register the per-CPU GHCB page so that the runtime - * exception handler can use it. - */ - if (initial_vc_handler == (unsigned long)kernel_exc_vmm_communication) { - if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) - snp_register_per_cpu_ghcb(); - - sev_cfg.ghcbs_initialized = true; - - return; - } - - /* - * Make sure the hypervisor talks a supported protocol. - * This gets called only in the BSP boot phase. - */ - if (!sev_es_negotiate_protocol()) - sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); - - /* - * Clear the boot_ghcb. The first exception comes in before the bss - * section is cleared. - */ - memset(&boot_ghcb_page, 0, PAGE_SIZE); - - /* Alright - Make the boot-ghcb public */ - boot_ghcb = &boot_ghcb_page; - - /* SNP guest requires that GHCB GPA must be registered. */ - if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) - snp_register_ghcb_early(__pa(&boot_ghcb_page)); -} - -#ifdef CONFIG_HOTPLUG_CPU -static void sev_es_ap_hlt_loop(void) -{ - struct ghcb_state state; - struct ghcb *ghcb; - - ghcb = __sev_get_ghcb(&state); - - while (true) { - vc_ghcb_invalidate(ghcb); - ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_HLT_LOOP); - ghcb_set_sw_exit_info_1(ghcb, 0); - ghcb_set_sw_exit_info_2(ghcb, 0); - - sev_es_wr_ghcb_msr(__pa(ghcb)); - VMGEXIT(); - - /* Wakeup signal? */ - if (ghcb_sw_exit_info_2_is_valid(ghcb) && - ghcb->save.sw_exit_info_2) - break; - } - - __sev_put_ghcb(&state); -} - -/* - * Play_dead handler when running under SEV-ES. This is needed because - * the hypervisor can't deliver an SIPI request to restart the AP. - * Instead the kernel has to issue a VMGEXIT to halt the VCPU until the - * hypervisor wakes it up again. - */ -static void sev_es_play_dead(void) -{ - play_dead_common(); - - /* IRQs now disabled */ - - sev_es_ap_hlt_loop(); - - /* - * If we get here, the VCPU was woken up again. Jump to CPU - * startup code to get it back online. 
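[Illustrative aside, not part of the diff] Stepping back to vc_handle_msr() above: the handler derives exit_info_1 from the second opcode byte, because WRMSR encodes as 0F 30 and RDMSR as 0F 32; RCX is always forwarded, while RAX/RDX are only loaded for a write. A minimal standalone sketch of that decode:

#include <stdint.h>
#include <stdio.h>

/* 0F 30 = WRMSR, 0F 32 = RDMSR; exit_info_1 is 1 for a write, 0 for a read. */
static uint64_t msr_exit_info_1(const uint8_t opcode[2])
{
	return opcode[1] == 0x30 ? 1 : 0;
}

int main(void)
{
	const uint8_t wrmsr[2] = { 0x0f, 0x30 };
	const uint8_t rdmsr[2] = { 0x0f, 0x32 };

	printf("WRMSR -> exit_info_1 = %llu\n",
	       (unsigned long long)msr_exit_info_1(wrmsr));
	printf("RDMSR -> exit_info_1 = %llu\n",
	       (unsigned long long)msr_exit_info_1(rdmsr));
	return 0;
}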
- */ - soft_restart_cpu(); -} -#else /* CONFIG_HOTPLUG_CPU */ -#define sev_es_play_dead native_play_dead -#endif /* CONFIG_HOTPLUG_CPU */ - -#ifdef CONFIG_SMP -static void __init sev_es_setup_play_dead(void) -{ - smp_ops.play_dead = sev_es_play_dead; -} -#else -static inline void sev_es_setup_play_dead(void) { } -#endif - -static void __init alloc_runtime_data(int cpu) -{ - struct sev_es_runtime_data *data; - - data = memblock_alloc(sizeof(*data), PAGE_SIZE); - if (!data) - panic("Can't allocate SEV-ES runtime data"); - - per_cpu(runtime_data, cpu) = data; -} - -static void __init init_ghcb(int cpu) -{ - struct sev_es_runtime_data *data; - int err; - - data = per_cpu(runtime_data, cpu); - - err = early_set_memory_decrypted((unsigned long)&data->ghcb_page, - sizeof(data->ghcb_page)); - if (err) - panic("Can't map GHCBs unencrypted"); - - memset(&data->ghcb_page, 0, sizeof(data->ghcb_page)); - - data->ghcb_active = false; - data->backup_ghcb_active = false; -} - -void __init sev_es_init_vc_handling(void) -{ - int cpu; - - BUILD_BUG_ON(offsetof(struct sev_es_runtime_data, ghcb_page) % PAGE_SIZE); - - if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) - return; - - if (!sev_es_check_cpu_features()) - panic("SEV-ES CPU Features missing"); - - /* - * SNP is supported in v2 of the GHCB spec which mandates support for HV - * features. - */ - if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) { - sev_hv_features = get_hv_features(); - - if (!(sev_hv_features & GHCB_HV_FT_SNP)) - sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); - } - - /* Initialize per-cpu GHCB pages */ - for_each_possible_cpu(cpu) { - alloc_runtime_data(cpu); - init_ghcb(cpu); - } - - sev_es_setup_play_dead(); - - /* Secondary CPUs use the runtime #VC handler */ - initial_vc_handler = (unsigned long)kernel_exc_vmm_communication; -} - -static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt) -{ - int trapnr = ctxt->fi.vector; - - if (trapnr == X86_TRAP_PF) - native_write_cr2(ctxt->fi.cr2); - - ctxt->regs->orig_ax = ctxt->fi.error_code; - do_early_exception(ctxt->regs, trapnr); -} - -static long *vc_insn_get_rm(struct es_em_ctxt *ctxt) -{ - long *reg_array; - int offset; - - reg_array = (long *)ctxt->regs; - offset = insn_get_modrm_rm_off(&ctxt->insn, ctxt->regs); - - if (offset < 0) - return NULL; - - offset /= sizeof(long); - - return reg_array + offset; -} -static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt, - unsigned int bytes, bool read) -{ - u64 exit_code, exit_info_1, exit_info_2; - unsigned long ghcb_pa = __pa(ghcb); - enum es_result res; - phys_addr_t paddr; - void __user *ref; - - ref = insn_get_addr_ref(&ctxt->insn, ctxt->regs); - if (ref == (void __user *)-1L) - return ES_UNSUPPORTED; - - exit_code = read ? SVM_VMGEXIT_MMIO_READ : SVM_VMGEXIT_MMIO_WRITE; - - res = vc_slow_virt_to_phys(ghcb, ctxt, (unsigned long)ref, &paddr); - if (res != ES_OK) { - if (res == ES_EXCEPTION && !read) - ctxt->fi.error_code |= X86_PF_WRITE; - - return res; - } - - exit_info_1 = paddr; - /* Can never be greater than 8 */ - exit_info_2 = bytes; - - ghcb_set_sw_scratch(ghcb, ghcb_pa + offsetof(struct ghcb, shared_buffer)); - - return sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, exit_info_1, exit_info_2); -} - -/* - * The MOVS instruction has two memory operands, which raises the - * problem that it is not known whether the access to the source or the - * destination caused the #VC exception (and hence whether an MMIO read - * or write operation needs to be emulated). 
- * - * Instead of playing games with walking page-tables and trying to guess - * whether the source or destination is an MMIO range, split the move - * into two operations, a read and a write with only one memory operand. - * This will cause a nested #VC exception on the MMIO address which can - * then be handled. - * - * This implementation has the benefit that it also supports MOVS where - * source _and_ destination are MMIO regions. - * - * It will slow MOVS on MMIO down a lot, but in SEV-ES guests it is a - * rare operation. If it turns out to be a performance problem the split - * operations can be moved to memcpy_fromio() and memcpy_toio(). - */ -static enum es_result vc_handle_mmio_movs(struct es_em_ctxt *ctxt, - unsigned int bytes) -{ - unsigned long ds_base, es_base; - unsigned char *src, *dst; - unsigned char buffer[8]; - enum es_result ret; - bool rep; - int off; - - ds_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_DS); - es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES); - - if (ds_base == -1L || es_base == -1L) { - ctxt->fi.vector = X86_TRAP_GP; - ctxt->fi.error_code = 0; - return ES_EXCEPTION; - } - - src = ds_base + (unsigned char *)ctxt->regs->si; - dst = es_base + (unsigned char *)ctxt->regs->di; - - ret = vc_read_mem(ctxt, src, buffer, bytes); - if (ret != ES_OK) - return ret; - - ret = vc_write_mem(ctxt, dst, buffer, bytes); - if (ret != ES_OK) - return ret; - - if (ctxt->regs->flags & X86_EFLAGS_DF) - off = -bytes; - else - off = bytes; - - ctxt->regs->si += off; - ctxt->regs->di += off; - - rep = insn_has_rep_prefix(&ctxt->insn); - if (rep) - ctxt->regs->cx -= 1; - - if (!rep || ctxt->regs->cx == 0) - return ES_OK; - else - return ES_RETRY; -} - -static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) -{ - struct insn *insn = &ctxt->insn; - enum insn_mmio_type mmio; - unsigned int bytes = 0; - enum es_result ret; - u8 sign_byte; - long *reg_data; - - mmio = insn_decode_mmio(insn, &bytes); - if (mmio == INSN_MMIO_DECODE_FAILED) - return ES_DECODE_FAILED; - - if (mmio != INSN_MMIO_WRITE_IMM && mmio != INSN_MMIO_MOVS) { - reg_data = insn_get_modrm_reg_ptr(insn, ctxt->regs); - if (!reg_data) - return ES_DECODE_FAILED; - } - - if (user_mode(ctxt->regs)) - return ES_UNSUPPORTED; - - switch (mmio) { - case INSN_MMIO_WRITE: - memcpy(ghcb->shared_buffer, reg_data, bytes); - ret = vc_do_mmio(ghcb, ctxt, bytes, false); - break; - case INSN_MMIO_WRITE_IMM: - memcpy(ghcb->shared_buffer, insn->immediate1.bytes, bytes); - ret = vc_do_mmio(ghcb, ctxt, bytes, false); - break; - case INSN_MMIO_READ: - ret = vc_do_mmio(ghcb, ctxt, bytes, true); - if (ret) - break; - - /* Zero-extend for 32-bit operation */ - if (bytes == 4) - *reg_data = 0; - - memcpy(reg_data, ghcb->shared_buffer, bytes); - break; - case INSN_MMIO_READ_ZERO_EXTEND: - ret = vc_do_mmio(ghcb, ctxt, bytes, true); - if (ret) - break; - - /* Zero extend based on operand size */ - memset(reg_data, 0, insn->opnd_bytes); - memcpy(reg_data, ghcb->shared_buffer, bytes); - break; - case INSN_MMIO_READ_SIGN_EXTEND: - ret = vc_do_mmio(ghcb, ctxt, bytes, true); - if (ret) - break; - - if (bytes == 1) { - u8 *val = (u8 *)ghcb->shared_buffer; - - sign_byte = (*val & 0x80) ? 0xff : 0x00; - } else { - u16 *val = (u16 *)ghcb->shared_buffer; - - sign_byte = (*val & 0x8000) ? 
0xff : 0x00; - } - - /* Sign extend based on operand size */ - memset(reg_data, sign_byte, insn->opnd_bytes); - memcpy(reg_data, ghcb->shared_buffer, bytes); - break; - case INSN_MMIO_MOVS: - ret = vc_handle_mmio_movs(ctxt, bytes); - break; - default: - ret = ES_UNSUPPORTED; - break; - } - - return ret; -} - -static enum es_result vc_handle_dr7_write(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - struct sev_es_runtime_data *data = this_cpu_read(runtime_data); - long val, *reg = vc_insn_get_rm(ctxt); - enum es_result ret; - - if (sev_status & MSR_AMD64_SNP_DEBUG_SWAP) - return ES_VMM_ERROR; - - if (!reg) - return ES_DECODE_FAILED; - - val = *reg; - - /* Upper 32 bits must be written as zeroes */ - if (val >> 32) { - ctxt->fi.vector = X86_TRAP_GP; - ctxt->fi.error_code = 0; - return ES_EXCEPTION; - } - - /* Clear out other reserved bits and set bit 10 */ - val = (val & 0xffff23ffL) | BIT(10); - - /* Early non-zero writes to DR7 are not supported */ - if (!data && (val & ~DR7_RESET_VALUE)) - return ES_UNSUPPORTED; - - /* Using a value of 0 for ExitInfo1 means RAX holds the value */ - ghcb_set_rax(ghcb, val); - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WRITE_DR7, 0, 0); - if (ret != ES_OK) - return ret; - - if (data) - data->dr7 = val; - - return ES_OK; -} - -static enum es_result vc_handle_dr7_read(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - struct sev_es_runtime_data *data = this_cpu_read(runtime_data); - long *reg = vc_insn_get_rm(ctxt); - - if (sev_status & MSR_AMD64_SNP_DEBUG_SWAP) - return ES_VMM_ERROR; - - if (!reg) - return ES_DECODE_FAILED; - - if (data) - *reg = data->dr7; - else - *reg = DR7_RESET_VALUE; - - return ES_OK; -} - -static enum es_result vc_handle_wbinvd(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - return sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WBINVD, 0, 0); -} - -static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt) -{ - enum es_result ret; - - ghcb_set_rcx(ghcb, ctxt->regs->cx); - - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_RDPMC, 0, 0); - if (ret != ES_OK) - return ret; - - if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb))) - return ES_VMM_ERROR; - - ctxt->regs->ax = ghcb->save.rax; - ctxt->regs->dx = ghcb->save.rdx; - - return ES_OK; -} - -static enum es_result vc_handle_monitor(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - /* - * Treat it as a NOP and do not leak a physical address to the - * hypervisor. - */ - return ES_OK; -} - -static enum es_result vc_handle_mwait(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - /* Treat the same as MONITOR/MONITORX */ - return ES_OK; -} - -static enum es_result vc_handle_vmmcall(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - enum es_result ret; - - ghcb_set_rax(ghcb, ctxt->regs->ax); - ghcb_set_cpl(ghcb, user_mode(ctxt->regs) ? 3 : 0); - - if (x86_platform.hyper.sev_es_hcall_prepare) - x86_platform.hyper.sev_es_hcall_prepare(ghcb, ctxt->regs); - - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_VMMCALL, 0, 0); - if (ret != ES_OK) - return ret; - - if (!ghcb_rax_is_valid(ghcb)) - return ES_VMM_ERROR; - - ctxt->regs->ax = ghcb->save.rax; - - /* - * Call sev_es_hcall_finish() after regs->ax is already set. - * This allows the hypervisor handler to overwrite it again if - * necessary. 
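[Illustrative aside, not part of the diff] The INSN_MMIO_READ_SIGN_EXTEND case above widens a 1- or 2-byte MMIO value by pre-filling the destination with a sign byte and then copying in the low bytes. A standalone sketch of the same idea, simplified to always widen to a full long where the kernel uses the instruction's operand size:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Widen 'bytes' of little-endian MMIO data to a register-sized value by
 * memset()ing the sign byte first, then copying the low bytes on top. */
static long sign_extend_mmio(const void *mmio_data, unsigned int bytes)
{
	uint8_t sign_byte;
	long reg;

	if (bytes == 1)
		sign_byte = (*(const uint8_t *)mmio_data & 0x80) ? 0xff : 0x00;
	else
		sign_byte = (*(const uint16_t *)mmio_data & 0x8000) ? 0xff : 0x00;

	memset(&reg, sign_byte, sizeof(reg));
	memcpy(&reg, mmio_data, bytes);
	return reg;
}

int main(void)
{
	uint8_t b = 0xf0;	/* -16 as a signed byte */
	uint16_t w = 0x8001;	/* -32767 as a signed word */

	printf("byte 0xf0   -> %ld (expect %d)\n", sign_extend_mmio(&b, 1), (int8_t)b);
	printf("word 0x8001 -> %ld (expect %d)\n", sign_extend_mmio(&w, 2), (int16_t)w);
	return 0;
}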
- */ - if (x86_platform.hyper.sev_es_hcall_finish && - !x86_platform.hyper.sev_es_hcall_finish(ghcb, ctxt->regs)) - return ES_VMM_ERROR; - - return ES_OK; -} - -static enum es_result vc_handle_trap_ac(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - /* - * Calling ecx_alignment_check() directly does not work, because it - * enables IRQs and the GHCB is active. Forward the exception and call - * it later from vc_forward_exception(). - */ - ctxt->fi.vector = X86_TRAP_AC; - ctxt->fi.error_code = 0; - return ES_EXCEPTION; -} - -static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt, - struct ghcb *ghcb, - unsigned long exit_code) -{ - enum es_result result; - - switch (exit_code) { - case SVM_EXIT_READ_DR7: - result = vc_handle_dr7_read(ghcb, ctxt); - break; - case SVM_EXIT_WRITE_DR7: - result = vc_handle_dr7_write(ghcb, ctxt); - break; - case SVM_EXIT_EXCP_BASE + X86_TRAP_AC: - result = vc_handle_trap_ac(ghcb, ctxt); - break; - case SVM_EXIT_RDTSC: - case SVM_EXIT_RDTSCP: - result = vc_handle_rdtsc(ghcb, ctxt, exit_code); - break; - case SVM_EXIT_RDPMC: - result = vc_handle_rdpmc(ghcb, ctxt); - break; - case SVM_EXIT_INVD: - pr_err_ratelimited("#VC exception for INVD??? Seriously???\n"); - result = ES_UNSUPPORTED; - break; - case SVM_EXIT_CPUID: - result = vc_handle_cpuid(ghcb, ctxt); - break; - case SVM_EXIT_IOIO: - result = vc_handle_ioio(ghcb, ctxt); - break; - case SVM_EXIT_MSR: - result = vc_handle_msr(ghcb, ctxt); - break; - case SVM_EXIT_VMMCALL: - result = vc_handle_vmmcall(ghcb, ctxt); - break; - case SVM_EXIT_WBINVD: - result = vc_handle_wbinvd(ghcb, ctxt); - break; - case SVM_EXIT_MONITOR: - result = vc_handle_monitor(ghcb, ctxt); - break; - case SVM_EXIT_MWAIT: - result = vc_handle_mwait(ghcb, ctxt); - break; - case SVM_EXIT_NPF: - result = vc_handle_mmio(ghcb, ctxt); - break; - default: - /* - * Unexpected #VC exception - */ - result = ES_UNSUPPORTED; - } - - return result; -} - -static __always_inline void vc_forward_exception(struct es_em_ctxt *ctxt) -{ - long error_code = ctxt->fi.error_code; - int trapnr = ctxt->fi.vector; - - ctxt->regs->orig_ax = ctxt->fi.error_code; - - switch (trapnr) { - case X86_TRAP_GP: - exc_general_protection(ctxt->regs, error_code); - break; - case X86_TRAP_UD: - exc_invalid_op(ctxt->regs); - break; - case X86_TRAP_PF: - write_cr2(ctxt->fi.cr2); - exc_page_fault(ctxt->regs, error_code); - break; - case X86_TRAP_AC: - exc_alignment_check(ctxt->regs, error_code); - break; - default: - pr_emerg("Unsupported exception in #VC instruction emulation - can't continue\n"); - BUG(); - } -} - -static __always_inline bool is_vc2_stack(unsigned long sp) -{ - return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2)); -} - -static __always_inline bool vc_from_invalid_context(struct pt_regs *regs) -{ - unsigned long sp, prev_sp; - - sp = (unsigned long)regs; - prev_sp = regs->sp; - - /* - * If the code was already executing on the VC2 stack when the #VC - * happened, let it proceed to the normal handling routine. This way the - * code executing on the VC2 stack can cause #VC exceptions to get handled. 
- */ - return is_vc2_stack(sp) && !is_vc2_stack(prev_sp); -} - -static bool vc_raw_handle_exception(struct pt_regs *regs, unsigned long error_code) -{ - struct ghcb_state state; - struct es_em_ctxt ctxt; - enum es_result result; - struct ghcb *ghcb; - bool ret = true; - - ghcb = __sev_get_ghcb(&state); - - vc_ghcb_invalidate(ghcb); - result = vc_init_em_ctxt(&ctxt, regs, error_code); - - if (result == ES_OK) - result = vc_handle_exitcode(&ctxt, ghcb, error_code); - - __sev_put_ghcb(&state); - - /* Done - now check the result */ - switch (result) { - case ES_OK: - vc_finish_insn(&ctxt); - break; - case ES_UNSUPPORTED: - pr_err_ratelimited("Unsupported exit-code 0x%02lx in #VC exception (IP: 0x%lx)\n", - error_code, regs->ip); - ret = false; - break; - case ES_VMM_ERROR: - pr_err_ratelimited("Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n", - error_code, regs->ip); - ret = false; - break; - case ES_DECODE_FAILED: - pr_err_ratelimited("Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n", - error_code, regs->ip); - ret = false; - break; - case ES_EXCEPTION: - vc_forward_exception(&ctxt); - break; - case ES_RETRY: - /* Nothing to do */ - break; - default: - pr_emerg("Unknown result in %s():%d\n", __func__, result); - /* - * Emulating the instruction which caused the #VC exception - * failed - can't continue so print debug information - */ - BUG(); - } - - return ret; -} - -static __always_inline bool vc_is_db(unsigned long error_code) -{ - return error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB; -} - -/* - * Runtime #VC exception handler when raised from kernel mode. Runs in NMI mode - * and will panic when an error happens. - */ -DEFINE_IDTENTRY_VC_KERNEL(exc_vmm_communication) -{ - irqentry_state_t irq_state; - - /* - * With the current implementation it is always possible to switch to a - * safe stack because #VC exceptions only happen at known places, like - * intercepted instructions or accesses to MMIO areas/IO ports. They can - * also happen with code instrumentation when the hypervisor intercepts - * #DB, but the critical paths are forbidden to be instrumented, so #DB - * exceptions currently also only happen in safe places. - * - * But keep this here in case the noinstr annotations are violated due - * to bug elsewhere. - */ - if (unlikely(vc_from_invalid_context(regs))) { - instrumentation_begin(); - panic("Can't handle #VC exception from unsupported context\n"); - instrumentation_end(); - } - - /* - * Handle #DB before calling into !noinstr code to avoid recursive #DB. - */ - if (vc_is_db(error_code)) { - exc_debug(regs); - return; - } - - irq_state = irqentry_nmi_enter(regs); - - instrumentation_begin(); - - if (!vc_raw_handle_exception(regs, error_code)) { - /* Show some debug info */ - show_regs(regs); - - /* Ask hypervisor to sev_es_terminate */ - sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); - - /* If that fails and we get here - just panic */ - panic("Returned from Terminate-Request to Hypervisor\n"); - } - - instrumentation_end(); - irqentry_nmi_exit(regs, irq_state); -} - -/* - * Runtime #VC exception handler when raised from user mode. Runs in IRQ mode - * and will kill the current task with SIGBUS when an error happens. - */ -DEFINE_IDTENTRY_VC_USER(exc_vmm_communication) -{ - /* - * Handle #DB before calling into !noinstr code to avoid recursive #DB. 
- */ - if (vc_is_db(error_code)) { - noist_exc_debug(regs); - return; - } - - irqentry_enter_from_user_mode(regs); - instrumentation_begin(); - - if (!vc_raw_handle_exception(regs, error_code)) { - /* - * Do not kill the machine if user-space triggered the - * exception. Send SIGBUS instead and let user-space deal with - * it. - */ - force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0); - } - - instrumentation_end(); - irqentry_exit_to_user_mode(regs); -} - -bool __init handle_vc_boot_ghcb(struct pt_regs *regs) -{ - unsigned long exit_code = regs->orig_ax; - struct es_em_ctxt ctxt; - enum es_result result; - - vc_ghcb_invalidate(boot_ghcb); - - result = vc_init_em_ctxt(&ctxt, regs, exit_code); - if (result == ES_OK) - result = vc_handle_exitcode(&ctxt, boot_ghcb, exit_code); - - /* Done - now check the result */ - switch (result) { - case ES_OK: - vc_finish_insn(&ctxt); - break; - case ES_UNSUPPORTED: - early_printk("PANIC: Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n", - exit_code, regs->ip); - goto fail; - case ES_VMM_ERROR: - early_printk("PANIC: Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n", - exit_code, regs->ip); - goto fail; - case ES_DECODE_FAILED: - early_printk("PANIC: Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n", - exit_code, regs->ip); - goto fail; - case ES_EXCEPTION: - vc_early_forward_exception(&ctxt); - break; - case ES_RETRY: - /* Nothing to do */ - break; - default: - BUG(); - } - - return true; - -fail: - show_regs(regs); - - sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); -} - -/* - * Initial set up of SNP relies on information provided by the - * Confidential Computing blob, which can be passed to the kernel - * in the following ways, depending on how it is booted: - * - * - when booted via the boot/decompress kernel: - * - via boot_params - * - * - when booted directly by firmware/bootloader (e.g. CONFIG_PVH): - * - via a setup_data entry, as defined by the Linux Boot Protocol - * - * Scan for the blob in that order. - */ -static __init struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp) -{ - struct cc_blob_sev_info *cc_info; - - /* Boot kernel would have passed the CC blob via boot_params. */ - if (bp->cc_blob_address) { - cc_info = (struct cc_blob_sev_info *)(unsigned long)bp->cc_blob_address; - goto found_cc_info; - } - - /* - * If kernel was booted directly, without the use of the - * boot/decompression kernel, the CC blob may have been passed via - * setup_data instead. - */ - cc_info = find_cc_blob_setup_data(bp); - if (!cc_info) - return NULL; - -found_cc_info: - if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC) - snp_abort(); - - return cc_info; -} - -bool __init snp_init(struct boot_params *bp) -{ - struct cc_blob_sev_info *cc_info; - - if (!bp) - return false; - - cc_info = find_cc_blob(bp); - if (!cc_info) - return false; - - setup_cpuid_table(cc_info); - - /* - * The CC blob will be used later to access the secrets page. Cache - * it here like the boot kernel does. 
- */ - bp->cc_blob_address = (u32)(unsigned long)cc_info; - - return true; -} - -void __init __noreturn snp_abort(void) -{ - sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); -} - -static void dump_cpuid_table(void) -{ - const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); - int i = 0; - - pr_info("count=%d reserved=0x%x reserved2=0x%llx\n", - cpuid_table->count, cpuid_table->__reserved1, cpuid_table->__reserved2); - - for (i = 0; i < SNP_CPUID_COUNT_MAX; i++) { - const struct snp_cpuid_fn *fn = &cpuid_table->fn[i]; - - pr_info("index=%3d fn=0x%08x subfn=0x%08x: eax=0x%08x ebx=0x%08x ecx=0x%08x edx=0x%08x xcr0_in=0x%016llx xss_in=0x%016llx reserved=0x%016llx\n", - i, fn->eax_in, fn->ecx_in, fn->eax, fn->ebx, fn->ecx, - fn->edx, fn->xcr0_in, fn->xss_in, fn->__reserved); - } -} - -/* - * It is useful from an auditing/testing perspective to provide an easy way - * for the guest owner to know that the CPUID table has been initialized as - * expected, but that initialization happens too early in boot to print any - * sort of indicator, and there's not really any other good place to do it, - * so do it here. - */ -static int __init report_cpuid_table(void) -{ - const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); - - if (!cpuid_table->count) - return 0; - - pr_info("Using SNP CPUID table, %d entries present.\n", - cpuid_table->count); - - if (sev_cfg.debug) - dump_cpuid_table(); - - return 0; -} -arch_initcall(report_cpuid_table); - -static int __init init_sev_config(char *str) -{ - char *s; - - while ((s = strsep(&str, ","))) { - if (!strcmp(s, "debug")) { - sev_cfg.debug = true; - continue; - } - - pr_info("SEV command-line option '%s' was not recognized\n", s); - } - - return 1; -} -__setup("sev=", init_sev_config); - -int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio) -{ - struct ghcb_state state; - struct es_em_ctxt ctxt; - unsigned long flags; - struct ghcb *ghcb; - int ret; - - rio->exitinfo2 = SEV_RET_NO_FW_CALL; - - /* - * __sev_get_ghcb() needs to run with IRQs disabled because it is using - * a per-CPU GHCB. 
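[Illustrative aside, not part of the diff] The sw_exit_info_2 value checked a few lines further down in snp_issue_guest_request() carries the hypervisor's error in the upper 32 bits and the firmware status in the lower 32 bits. A minimal standalone sketch of that split, assuming the SNP_GUEST_VMM_ERR_* values mirror asm/sev.h (BUSY = 2, INVALID_LEN = 1, shift of 32); treat the constants as assumptions to check against the header:

#include <stdint.h>
#include <stdio.h>

#define VMM_ERR_SHIFT		32
#define VMM_ERR_INVALID_LEN	1ULL
#define VMM_ERR_BUSY		2ULL

/* Split exitinfo2 the way the switch below does: VMM error on top,
 * SEV firmware status in the low half. */
static void decode_exitinfo2(uint64_t exitinfo2)
{
	uint64_t vmm_err = exitinfo2 >> VMM_ERR_SHIFT;
	uint32_t fw_err  = (uint32_t)exitinfo2;

	if (!exitinfo2)
		printf("success\n");
	else if (vmm_err == VMM_ERR_BUSY)
		printf("VMM busy, retry (-EAGAIN)\n");
	else if (vmm_err == VMM_ERR_INVALID_LEN)
		printf("buffer too small, VMM reports needed pages in RBX (-ENOSPC)\n");
	else
		printf("error: vmm=0x%llx fw=0x%x (-EIO)\n",
		       (unsigned long long)vmm_err, fw_err);
}

int main(void)
{
	decode_exitinfo2(0);
	decode_exitinfo2(VMM_ERR_BUSY << VMM_ERR_SHIFT);
	decode_exitinfo2(VMM_ERR_INVALID_LEN << VMM_ERR_SHIFT);
	return 0;
}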
- */ - local_irq_save(flags); - - ghcb = __sev_get_ghcb(&state); - if (!ghcb) { - ret = -EIO; - goto e_restore_irq; - } - - vc_ghcb_invalidate(ghcb); - - if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST) { - ghcb_set_rax(ghcb, input->data_gpa); - ghcb_set_rbx(ghcb, input->data_npages); - } - - ret = sev_es_ghcb_hv_call(ghcb, &ctxt, exit_code, input->req_gpa, input->resp_gpa); - if (ret) - goto e_put; - - rio->exitinfo2 = ghcb->save.sw_exit_info_2; - switch (rio->exitinfo2) { - case 0: - break; - - case SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_BUSY): - ret = -EAGAIN; - break; - - case SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN): - /* Number of expected pages are returned in RBX */ - if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST) { - input->data_npages = ghcb_get_rbx(ghcb); - ret = -ENOSPC; - break; - } - fallthrough; - default: - ret = -EIO; - break; - } - -e_put: - __sev_put_ghcb(&state); -e_restore_irq: - local_irq_restore(flags); - - return ret; -} -EXPORT_SYMBOL_GPL(snp_issue_guest_request); - -static struct platform_device sev_guest_device = { - .name = "sev-guest", - .id = -1, -}; - -static int __init snp_init_platform_device(void) -{ - struct sev_guest_platform_data data; - u64 gpa; - - if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) - return -ENODEV; - - gpa = get_secrets_page(); - if (!gpa) - return -ENODEV; - - data.secrets_gpa = gpa; - if (platform_device_add_data(&sev_guest_device, &data, sizeof(data))) - return -ENODEV; - - if (platform_device_register(&sev_guest_device)) - return -ENODEV; - - pr_info("SNP guest platform device initialized.\n"); - return 0; -} -device_initcall(snp_init_platform_device); diff --git a/arch/x86/kernel/sev_verify_cbit.S b/arch/x86/kernel/sev_verify_cbit.S index 3355e27c69eb..1ab65f6c6ae7 100644 --- a/arch/x86/kernel/sev_verify_cbit.S +++ b/arch/x86/kernel/sev_verify_cbit.S @@ -77,7 +77,7 @@ SYM_FUNC_START(sev_verify_cbit) * The check failed, prevent any forward progress to prevent ROP * attacks, invalidate the stack and go into a hlt loop. 
*/ - xorq %rsp, %rsp + xorl %esp, %esp subq $0x1000, %rsp 2: hlt jmp 2b diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c index 59e15dd8d0f8..059685612362 100644 --- a/arch/x86/kernel/shstk.c +++ b/arch/x86/kernel/shstk.c @@ -163,8 +163,8 @@ static int shstk_setup(void) if (features_enabled(ARCH_SHSTK_SHSTK)) return 0; - /* Also not supported for 32 bit and x32 */ - if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || in_32bit_syscall()) + /* Also not supported for 32 bit */ + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || in_ia32_syscall()) return -EOPNOTSUPP; size = adjust_shstk_size(0); @@ -577,3 +577,19 @@ long shstk_prctl(struct task_struct *task, int option, unsigned long arg2) return wrss_control(true); return -EINVAL; } + +int shstk_update_last_frame(unsigned long val) +{ + unsigned long ssp; + + if (!features_enabled(ARCH_SHSTK_SHSTK)) + return 0; + + ssp = get_user_shstk_addr(); + return write_user_shstk_64((u64 __user *)ssp, (u64)val); +} + +bool shstk_is_enabled(void) +{ + return features_enabled(ARCH_SHSTK_SHSTK); +} diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 65fe2094da59..5f441039b572 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -27,6 +27,7 @@ #include <linux/context_tracking.h> #include <linux/entry-common.h> #include <linux/syscalls.h> +#include <linux/rseq.h> #include <asm/processor.h> #include <asm/ucontext.h> @@ -60,6 +61,24 @@ static inline int is_x32_frame(struct ksignal *ksig) } /* + * Enable all pkeys temporarily, so as to ensure that both the current + * execution stack as well as the alternate signal stack are writeable. + * The application can use any of the available pkeys to protect the + * alternate signal stack, and we don't know which one it is, so enable + * all. The PKRU register will be reset to init_pkru later in the flow, + * in fpu__clear_user_states(), and it is the application's responsibility + * to enable the appropriate pkey as the first step in the signal handler + * so that the handler does not segfault. + */ +static inline u32 sig_prepare_pkru(void) +{ + u32 orig_pkru = read_pkru(); + + write_pkru(0); + return orig_pkru; +} + +/* * Set up a signal frame. */ @@ -83,6 +102,7 @@ get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size, unsigned long math_size = 0; unsigned long sp = regs->sp; unsigned long buf_fx = 0; + u32 pkru; /* redzone */ if (!ia32_frame) @@ -137,9 +157,17 @@ get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size, return (void __user *)-1L; } + /* Update PKRU to enable access to the alternate signal stack. */ + pkru = sig_prepare_pkru(); /* save i387 and extended state */ - if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size)) + if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size, pkru)) { + /* + * Restore PKRU to the original, user-defined value; disable + * extra pkeys enabled for the alternate signal stack, if any. 
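[Illustrative aside, not part of the diff] sig_prepare_pkru() above works because PKRU holds two bits per protection key (Access-Disable in bit 2k, Write-Disable in bit 2k+1), so writing 0 grants every key full access, and the original value is restored afterwards. A small standalone sketch of that bit layout; the 0x55555554 value is an assumption about the kernel's usual init_pkru default and is used here only as an example:

#include <stdint.h>
#include <stdio.h>

/* PKRU: bit 2k = Access Disable (AD), bit 2k+1 = Write Disable (WD). */
#define PKRU_AD_BIT(k)	(1u << ((k) * 2))
#define PKRU_WD_BIT(k)	(1u << ((k) * 2 + 1))

static int pkey_is_writable(uint32_t pkru, int pkey)
{
	return !(pkru & (PKRU_AD_BIT(pkey) | PKRU_WD_BIT(pkey)));
}

int main(void)
{
	/* Assumed default: access disabled for keys 1..15, key 0 open. */
	uint32_t init_pkru = 0x55555554;

	printf("pkey 0 writable under init PKRU: %d\n", pkey_is_writable(init_pkru, 0));
	printf("pkey 5 writable under init PKRU: %d\n", pkey_is_writable(init_pkru, 5));
	printf("pkey 5 writable under PKRU=0:    %d\n", pkey_is_writable(0, 5));
	return 0;
}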
+ */ + write_pkru(pkru); return (void __user *)-1L; + } return (void __user *)sp; } diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index c12624bc82a3..ef654530bf5a 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -34,7 +34,7 @@ #include <asm/gsseg.h> #ifdef CONFIG_IA32_EMULATION -#include <asm/ia32_unistd.h> +#include <asm/unistd_32_ia32.h> static inline void reload_segments(struct sigcontext_32 *sc) { diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 23d8aaf8d9fd..ee9453891901 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -260,13 +260,13 @@ SYSCALL_DEFINE0(rt_sigreturn) set_current_blocked(&set); - if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) + if (restore_altstack(&frame->uc.uc_stack)) goto badframe; - if (restore_signal_shadow_stack()) + if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) goto badframe; - if (restore_altstack(&frame->uc.uc_stack)) + if (restore_signal_shadow_stack()) goto badframe; return regs->ax; @@ -315,6 +315,9 @@ int x32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) uc_flags = frame_uc_flags(regs); + if (setup_signal_shadow_stack(ksig)) + return -EFAULT; + if (!user_access_begin(frame, sizeof(*frame))) return -EFAULT; @@ -377,6 +380,9 @@ COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn) if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) goto badframe; + if (restore_signal_shadow_stack()) + goto badframe; + if (compat_restore_altstack(&frame->uc.uc_stack)) goto badframe; diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 96a771f9f930..18266cc3d98c 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -148,14 +148,16 @@ static int register_stop_handler(void) static void native_stop_other_cpus(int wait) { - unsigned int cpu = smp_processor_id(); + unsigned int old_cpu, this_cpu; unsigned long flags, timeout; if (reboot_force) return; /* Only proceed if this is the first CPU to reach this code */ - if (atomic_cmpxchg(&stopping_cpu, -1, cpu) != -1) + old_cpu = -1; + this_cpu = smp_processor_id(); + if (!atomic_try_cmpxchg(&stopping_cpu, &old_cpu, this_cpu)) return; /* For kexec, ensure that offline CPUs are out of MWAIT and in HLT */ @@ -186,7 +188,7 @@ static void native_stop_other_cpus(int wait) * NMIs. */ cpumask_copy(&cpus_stop_mask, cpu_online_mask); - cpumask_clear_cpu(cpu, &cpus_stop_mask); + cpumask_clear_cpu(this_cpu, &cpus_stop_mask); if (!cpumask_empty(&cpus_stop_mask)) { apic_send_IPI_allbutself(REBOOT_VECTOR); @@ -210,6 +212,8 @@ static void native_stop_other_cpus(int wait) * CPUs to stop. 
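[Illustrative aside, not part of the diff] The native_stop_other_cpus() change above moves from atomic_cmpxchg() to atomic_try_cmpxchg(), which returns a boolean and updates the expected value in place, so only the first CPU to claim stopping_cpu proceeds. A standalone sketch of the same "first caller wins" pattern using C11 atomics in place of the kernel primitives:

#include <stdatomic.h>
#include <stdio.h>

/* atomic_compare_exchange_strong() plays the role of atomic_try_cmpxchg():
 * it returns false and rewrites 'old_cpu' if someone else got there first. */
static atomic_int stopping_cpu = -1;

static int try_claim_stop(int this_cpu)
{
	int old_cpu = -1;

	return atomic_compare_exchange_strong(&stopping_cpu, &old_cpu, this_cpu);
}

int main(void)
{
	printf("CPU 3 claims stop: %d\n", try_claim_stop(3));	/* wins */
	printf("CPU 7 claims stop: %d\n", try_claim_stop(7));	/* loses */
	printf("stopping_cpu = %d\n", atomic_load(&stopping_cpu));
	return 0;
}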
*/ if (!smp_no_nmi_ipi && !register_stop_handler()) { + unsigned int cpu; + pr_emerg("Shutting down cpus with NMI\n"); for_each_cpu(cpu, &cpus_stop_mask) @@ -282,7 +286,7 @@ struct smp_ops smp_ops = { .smp_cpus_done = native_smp_cpus_done, .stop_other_cpus = native_stop_other_cpus, -#if defined(CONFIG_KEXEC_CORE) +#if defined(CONFIG_CRASH_DUMP) .crash_stop_other_cpus = kdump_nmi_shootdown_cpus, #endif .smp_send_reschedule = native_smp_send_reschedule, diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 2cc2aa120b4b..c10850ae6f09 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -60,9 +60,11 @@ #include <linux/stackprotector.h> #include <linux/cpuhotplug.h> #include <linux/mc146818rtc.h> +#include <linux/acpi.h> #include <asm/acpi.h> #include <asm/cacheinfo.h> +#include <asm/cpuid.h> #include <asm/desc.h> #include <asm/nmi.h> #include <asm/irq.h> @@ -101,10 +103,6 @@ EXPORT_PER_CPU_SYMBOL(cpu_core_map); DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map); EXPORT_PER_CPU_SYMBOL(cpu_die_map); -/* Per CPU bogomips and other parameters */ -DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); -EXPORT_PER_CPU_SYMBOL(cpu_info); - /* CPUs which are the primary SMT threads */ struct cpumask __cpu_primary_thread_mask __read_mostly; @@ -125,25 +123,6 @@ struct mwait_cpu_dead { */ static DEFINE_PER_CPU_ALIGNED(struct mwait_cpu_dead, mwait_cpu_dead); -/* Logical package management. */ -struct logical_maps { - u32 phys_pkg_id; - u32 phys_die_id; - u32 logical_pkg_id; - u32 logical_die_id; -}; - -/* Temporary workaround until the full topology mechanics is in place */ -static DEFINE_PER_CPU_READ_MOSTLY(struct logical_maps, logical_maps) = { - .phys_pkg_id = U32_MAX, - .phys_die_id = U32_MAX, -}; - -unsigned int __max_logical_packages __read_mostly; -EXPORT_SYMBOL(__max_logical_packages); -static unsigned int logical_packages __read_mostly; -static unsigned int logical_die __read_mostly; - /* Maximum number of SMT threads on any online core */ int __read_mostly __max_smt_threads = 1; @@ -269,7 +248,7 @@ static void notrace start_secondary(void *unused) __flush_tlb_all(); } - cpu_init_exception_handling(); + cpu_init_exception_handling(false); /* * Load the microcode before reaching the AP alive synchronization @@ -336,106 +315,6 @@ static void notrace start_secondary(void *unused) cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } -/** - * topology_phys_to_logical_pkg - Map a physical package id to a logical - * @phys_pkg: The physical package id to map - * - * Returns logical package id or -1 if not found - */ -int topology_phys_to_logical_pkg(unsigned int phys_pkg) -{ - int cpu; - - for_each_possible_cpu(cpu) { - if (per_cpu(logical_maps.phys_pkg_id, cpu) == phys_pkg) - return per_cpu(logical_maps.logical_pkg_id, cpu); - } - return -1; -} -EXPORT_SYMBOL(topology_phys_to_logical_pkg); - -/** - * topology_phys_to_logical_die - Map a physical die id to logical - * @die_id: The physical die id to map - * @cur_cpu: The CPU for which the mapping is done - * - * Returns logical die id or -1 if not found - */ -static int topology_phys_to_logical_die(unsigned int die_id, unsigned int cur_cpu) -{ - int cpu, proc_id = cpu_data(cur_cpu).topo.pkg_id; - - for_each_possible_cpu(cpu) { - if (per_cpu(logical_maps.phys_pkg_id, cpu) == proc_id && - per_cpu(logical_maps.phys_die_id, cpu) == die_id) - return per_cpu(logical_maps.logical_die_id, cpu); - } - return -1; -} - -/** - * topology_update_package_map - Update the physical to logical package map - * @pkg: The physical 
package id as retrieved via CPUID - * @cpu: The cpu for which this is updated - */ -int topology_update_package_map(unsigned int pkg, unsigned int cpu) -{ - int new; - - /* Already available somewhere? */ - new = topology_phys_to_logical_pkg(pkg); - if (new >= 0) - goto found; - - new = logical_packages++; - if (new != pkg) { - pr_info("CPU %u Converting physical %u to logical package %u\n", - cpu, pkg, new); - } -found: - per_cpu(logical_maps.phys_pkg_id, cpu) = pkg; - per_cpu(logical_maps.logical_pkg_id, cpu) = new; - cpu_data(cpu).topo.logical_pkg_id = new; - return 0; -} -/** - * topology_update_die_map - Update the physical to logical die map - * @die: The die id as retrieved via CPUID - * @cpu: The cpu for which this is updated - */ -int topology_update_die_map(unsigned int die, unsigned int cpu) -{ - int new; - - /* Already available somewhere? */ - new = topology_phys_to_logical_die(die, cpu); - if (new >= 0) - goto found; - - new = logical_die++; - if (new != die) { - pr_info("CPU %u Converting physical %u to logical die %u\n", - cpu, die, new); - } -found: - per_cpu(logical_maps.phys_die_id, cpu) = die; - per_cpu(logical_maps.logical_die_id, cpu) = new; - cpu_data(cpu).topo.logical_die_id = new; - return 0; -} - -static void __init smp_store_boot_cpu_info(void) -{ - int id = 0; /* CPU 0 */ - struct cpuinfo_x86 *c = &cpu_data(id); - - *c = boot_cpu_data; - c->cpu_index = id; - topology_update_package_map(c->topo.pkg_id, id); - topology_update_die_map(c->topo.die_id, id); - c->initialized = true; -} - /* * The bootstrap kernel entry code has set these up. Save them for * a given CPU @@ -488,6 +367,7 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) if (c->topo.pkg_id == o->topo.pkg_id && c->topo.die_id == o->topo.die_id && + c->topo.amd_node_id == o->topo.amd_node_id && per_cpu_llc_id(cpu1) == per_cpu_llc_id(cpu2)) { if (c->topo.core_id == o->topo.core_id) return topology_sane(c, o, "smt"); @@ -509,10 +389,13 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) { - if (c->topo.pkg_id == o->topo.pkg_id && - c->topo.die_id == o->topo.die_id) - return true; - return false; + if (c->topo.pkg_id != o->topo.pkg_id || c->topo.die_id != o->topo.die_id) + return false; + + if (cpu_feature_enabled(X86_FEATURE_TOPOEXT) && topology_amd_nodes_per_pkg() > 1) + return c->topo.amd_node_id == o->topo.amd_node_id; + + return true; } static bool match_l2c(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) @@ -557,9 +440,9 @@ static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) */ static const struct x86_cpu_id intel_cod_cpu[] = { - X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, 0), /* COD */ - X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, 0), /* COD */ - X86_MATCH_INTEL_FAM6_MODEL(ANY, 1), /* SNC */ + X86_MATCH_VFM(INTEL_HASWELL_X, 0), /* COD */ + X86_MATCH_VFM(INTEL_BROADWELL_X, 0), /* COD */ + X86_MATCH_VFM(INTEL_ANY, 1), /* SNC */ {} }; @@ -600,12 +483,6 @@ static int x86_core_flags(void) return cpu_core_flags() | x86_sched_itmt_flags(); } #endif -#ifdef CONFIG_SCHED_SMT -static int x86_smt_flags(void) -{ - return cpu_smt_flags(); -} -#endif #ifdef CONFIG_SCHED_CLUSTER static int x86_cluster_flags(void) { @@ -613,14 +490,6 @@ static int x86_cluster_flags(void) } #endif -static int x86_die_flags(void) -{ - if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) - return x86_sched_itmt_flags(); - - return 0; -} - /* * Set if a package/die has multiple NUMA nodes inside. 
* AMD Magny-Cours, Intel Cluster-on-Die, and Intel @@ -636,7 +505,7 @@ static void __init build_sched_topology(void) #ifdef CONFIG_SCHED_SMT x86_topology[i++] = (struct sched_domain_topology_level){ - cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) + cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }; #endif #ifdef CONFIG_SCHED_CLUSTER @@ -656,7 +525,7 @@ static void __init build_sched_topology(void) */ if (!x86_has_numa_in_package) { x86_topology[i++] = (struct sched_domain_topology_level){ - cpu_cpu_mask, x86_die_flags, SD_INIT_NAME(PKG) + cpu_cpu_mask, x86_sched_itmt_flags, SD_INIT_NAME(PKG) }; } @@ -670,8 +539,8 @@ static void __init build_sched_topology(void) void set_cpu_sibling_map(int cpu) { - bool has_smt = smp_num_siblings > 1; - bool has_mp = has_smt || boot_cpu_data.x86_max_cores > 1; + bool has_smt = __max_threads_per_core > 1; + bool has_mp = has_smt || topology_num_cores_per_package() > 1; struct cpuinfo_x86 *c = &cpu_data(cpu); struct cpuinfo_x86 *o; int i, threads; @@ -757,6 +626,7 @@ const struct cpumask *cpu_clustergroup_mask(int cpu) { return cpu_l2c_shared_mask(cpu); } +EXPORT_SYMBOL_GPL(cpu_clustergroup_mask); static void impress_friends(void) { @@ -1067,9 +937,13 @@ int native_kick_ap(unsigned int cpu, struct task_struct *tidle) pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu); - if (apicid == BAD_APICID || !physid_isset(apicid, phys_cpu_present_map) || - !apic_id_valid(apicid)) { - pr_err("%s: bad cpu %d\n", __func__, cpu); + if (apicid == BAD_APICID || !apic_id_valid(apicid)) { + pr_err("CPU %u has invalid APIC ID %x. Aborting bringup\n", cpu, apicid); + return -EINVAL; + } + + if (!test_bit(apicid, phys_cpu_present_map)) { + pr_err("CPU %u APIC ID %x is not present. Aborting bringup\n", cpu, apicid); return -EINVAL; } @@ -1138,54 +1012,41 @@ static __init void disable_smp(void) pr_info("SMP disabled\n"); disable_ioapic_support(); + topology_reset_possible_cpus_up(); - init_cpu_present(cpumask_of(0)); - init_cpu_possible(cpumask_of(0)); - - if (smp_found_config) - physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); - else - physid_set_mask_of_physid(0, &phys_cpu_present_map); cpumask_set_cpu(0, topology_sibling_cpumask(0)); cpumask_set_cpu(0, topology_core_cpumask(0)); cpumask_set_cpu(0, topology_die_cpumask(0)); } -static void __init smp_cpu_index_default(void) -{ - int i; - struct cpuinfo_x86 *c; - - for_each_possible_cpu(i) { - c = &cpu_data(i); - /* mark all to hotplug */ - c->cpu_index = nr_cpu_ids; - } -} - void __init smp_prepare_cpus_common(void) { - unsigned int i; + unsigned int cpu, node; - smp_cpu_index_default(); + /* Mark all except the boot CPU as hotpluggable */ + for_each_possible_cpu(cpu) { + if (cpu) + per_cpu(cpu_info.cpu_index, cpu) = nr_cpu_ids; + } - /* - * Setup boot CPU information - */ - smp_store_boot_cpu_info(); /* Final full version of the data */ - mb(); + for_each_possible_cpu(cpu) { + node = cpu_to_node(cpu); - for_each_possible_cpu(i) { - zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); - zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); - zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL); - zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL); - zalloc_cpumask_var(&per_cpu(cpu_l2c_shared_map, i), GFP_KERNEL); + zalloc_cpumask_var_node(&per_cpu(cpu_sibling_map, cpu), GFP_KERNEL, node); + zalloc_cpumask_var_node(&per_cpu(cpu_core_map, cpu), GFP_KERNEL, node); + zalloc_cpumask_var_node(&per_cpu(cpu_die_map, cpu), GFP_KERNEL, node); + 
zalloc_cpumask_var_node(&per_cpu(cpu_llc_shared_map, cpu), GFP_KERNEL, node); + zalloc_cpumask_var_node(&per_cpu(cpu_l2c_shared_map, cpu), GFP_KERNEL, node); } set_cpu_sibling_map(0); } +void __init smp_prepare_boot_cpu(void) +{ + smp_ops.smp_prepare_boot_cpu(); +} + #ifdef CONFIG_X86_64 /* Establish whether parallel bringup can be supported. */ bool __init arch_cpuhp_init_parallel_bringup(void) @@ -1264,102 +1125,16 @@ void __init native_smp_prepare_boot_cpu(void) native_pv_lock_init(); } -void __init calculate_max_logical_packages(void) -{ - int ncpus; - - /* - * Today neither Intel nor AMD support heterogeneous systems so - * extrapolate the boot cpu's data to all packages. - */ - ncpus = cpu_data(0).booted_cores * topology_max_smt_threads(); - __max_logical_packages = DIV_ROUND_UP(total_cpus, ncpus); - pr_info("Max logical packages: %u\n", __max_logical_packages); -} - void __init native_smp_cpus_done(unsigned int max_cpus) { pr_debug("Boot done\n"); - calculate_max_logical_packages(); build_sched_topology(); nmi_selftest(); impress_friends(); cache_aps_init(); } -static int __initdata setup_possible_cpus = -1; -static int __init _setup_possible_cpus(char *str) -{ - get_option(&str, &setup_possible_cpus); - return 0; -} -early_param("possible_cpus", _setup_possible_cpus); - - -/* - * cpu_possible_mask should be static, it cannot change as cpu's - * are onlined, or offlined. The reason is per-cpu data-structures - * are allocated by some modules at init time, and don't expect to - * do this dynamically on cpu arrival/departure. - * cpu_present_mask on the other hand can change dynamically. - * In case when cpu_hotplug is not compiled, then we resort to current - * behaviour, which is cpu_possible == cpu_present. - * - Ashok Raj - * - * Three ways to find out the number of additional hotplug CPUs: - * - If the BIOS specified disabled CPUs in ACPI/mptables use that. - * - The user can overwrite it with possible_cpus=NUM - * - Otherwise don't reserve additional CPUs. - * We do this because additional CPUs waste a lot of memory. 
- * -AK - */ -__init void prefill_possible_map(void) -{ - int i, possible; - - i = setup_max_cpus ?: 1; - if (setup_possible_cpus == -1) { - possible = num_processors; -#ifdef CONFIG_HOTPLUG_CPU - if (setup_max_cpus) - possible += disabled_cpus; -#else - if (possible > i) - possible = i; -#endif - } else - possible = setup_possible_cpus; - - total_cpus = max_t(int, possible, num_processors + disabled_cpus); - - /* nr_cpu_ids could be reduced via nr_cpus= */ - if (possible > nr_cpu_ids) { - pr_warn("%d Processors exceeds NR_CPUS limit of %u\n", - possible, nr_cpu_ids); - possible = nr_cpu_ids; - } - -#ifdef CONFIG_HOTPLUG_CPU - if (!setup_max_cpus) -#endif - if (possible > i) { - pr_warn("%d Processors exceeds max_cpus limit of %u\n", - possible, setup_max_cpus); - possible = i; - } - - set_nr_cpu_ids(possible); - - pr_info("Allowing %d CPUs, %d hotplug CPUs\n", - possible, max_t(int, possible - num_processors, 0)); - - reset_cpu_possible_mask(); - - for (i = 0; i < possible; i++) - set_cpu_possible(i, true); -} - /* correctly size the local cpu masks */ void __init setup_cpu_local_masks(void) { @@ -1502,10 +1277,8 @@ static inline void mwait_play_dead(void) return; if (!this_cpu_has(X86_FEATURE_CLFLUSH)) return; - if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF) - return; - eax = CPUID_MWAIT_LEAF; + eax = CPUID_LEAF_MWAIT; ecx = 0; native_cpuid(&eax, &ebx, &ecx, &edx); diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c index 77a9316da435..9e51242ed125 100644 --- a/arch/x86/kernel/static_call.c +++ b/arch/x86/kernel/static_call.c @@ -172,7 +172,15 @@ void arch_static_call_transform(void *site, void *tramp, void *func, bool tail) } EXPORT_SYMBOL_GPL(arch_static_call_transform); -#ifdef CONFIG_RETHUNK +noinstr void __static_call_update_early(void *tramp, void *func) +{ + BUG_ON(system_state != SYSTEM_BOOTING); + BUG_ON(static_call_initialized); + __text_gen_insn(tramp, JMP32_INSN_OPCODE, tramp, func, JMP32_INSN_SIZE); + sync_core(); +} + +#ifdef CONFIG_MITIGATION_RETHUNK /* * This is called by apply_returns() to fix up static call trampolines, * specifically ARCH_DEFINE_STATIC_CALL_NULL_TRAMP which is recorded as diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index 8e2b2552b5ee..3e2952679b88 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -6,7 +6,9 @@ #include <linux/sched/task_stack.h> #include <linux/mm.h> #include <linux/ptrace.h> + #include <asm/desc.h> +#include <asm/debugreg.h> #include <asm/mmu_context.h> unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs) diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index c783aeb37dce..776ae6fa7f2d 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -18,6 +18,7 @@ #include <linux/random.h> #include <linux/uaccess.h> #include <linux/elf.h> +#include <linux/hugetlb.h> #include <asm/elf.h> #include <asm/ia32.h> @@ -25,8 +26,10 @@ /* * Align a virtual address to avoid aliasing in the I$ on AMD F15h. 
*/ -static unsigned long get_align_mask(void) +static unsigned long get_align_mask(struct file *filp) { + if (filp && is_file_hugepages(filp)) + return huge_page_mask_align(filp); /* handle 32- and 64-bit case with a single conditional */ if (va_align.flags < 0 || !(va_align.flags & (2 - mmap_is_ia32()))) return 0; @@ -49,14 +52,7 @@ static unsigned long get_align_mask(void) */ static unsigned long get_align_bits(void) { - return va_align.bits & get_align_mask(); -} - -unsigned long align_vdso_addr(unsigned long addr) -{ - unsigned long align_mask = get_align_mask(); - addr = (addr + align_mask) & ~align_mask; - return addr | get_align_bits(); + return va_align.bits & get_align_mask(NULL); } static int __init control_va_addr_alignment(char *str) @@ -119,13 +115,21 @@ static void find_start_end(unsigned long addr, unsigned long flags, *end = task_size_64bit(addr > DEFAULT_MAP_WINDOW); } +static inline unsigned long stack_guard_placement(vm_flags_t vm_flags) +{ + if (vm_flags & VM_SHADOW_STACK) + return PAGE_SIZE; + + return 0; +} + unsigned long -arch_get_unmapped_area(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) +arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; - struct vm_unmapped_area_info info; + struct vm_unmapped_area_info info = {}; unsigned long begin, end; if (flags & MAP_FIXED) @@ -144,28 +148,30 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, return addr; } - info.flags = 0; info.length = len; info.low_limit = begin; info.high_limit = end; - info.align_mask = 0; - info.align_offset = pgoff << PAGE_SHIFT; + if (!(filp && is_file_hugepages(filp))) { + info.align_offset = pgoff << PAGE_SHIFT; + info.start_gap = stack_guard_placement(vm_flags); + } if (filp) { - info.align_mask = get_align_mask(); + info.align_mask = get_align_mask(filp); info.align_offset += get_align_bits(); } + return vm_unmapped_area(&info); } unsigned long -arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, - const unsigned long len, const unsigned long pgoff, - const unsigned long flags) +arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr0, + unsigned long len, unsigned long pgoff, + unsigned long flags, vm_flags_t vm_flags) { struct vm_area_struct *vma; struct mm_struct *mm = current->mm; unsigned long addr = addr0; - struct vm_unmapped_area_info info; + struct vm_unmapped_area_info info = {}; /* requested length too big for entire address space */ if (len > TASK_SIZE) @@ -199,6 +205,10 @@ get_unmapped_area: info.low_limit = PAGE_SIZE; info.high_limit = get_mmap_base(0); + if (!(filp && is_file_hugepages(filp))) { + info.start_gap = stack_guard_placement(vm_flags); + info.align_offset = pgoff << PAGE_SHIFT; + } /* * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area @@ -210,10 +220,8 @@ get_unmapped_area: if (addr > DEFAULT_MAP_WINDOW && !in_32bit_syscall()) info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW; - info.align_mask = 0; - info.align_offset = pgoff << PAGE_SHIFT; if (filp) { - info.align_mask = get_align_mask(); + info.align_mask = get_align_mask(filp); info.align_offset += get_align_bits(); } addr = vm_unmapped_area(&info); @@ -228,5 +236,5 @@ bottomup: * can happen with large stack limits and large mmap() * allocations. 
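[Illustrative aside, not part of the diff] The get_align_mask()/get_align_bits() pair above biases mmap placement to avoid I-cache aliasing on AMD F15h: round the candidate address up to the alignment boundary, then OR in per-process bits so different processes land in different cache slices. A standalone sketch of that arithmetic; the 2 MB mask and the example bits are made-up values for illustration only:

#include <stdio.h>

/* Mirror the removed align_vdso_addr() logic: align up, then apply bits. */
static unsigned long align_addr(unsigned long addr, unsigned long mask,
				unsigned long bits)
{
	addr = (addr + mask) & ~mask;
	return addr | (bits & mask);
}

int main(void)
{
	unsigned long mask = (1UL << 21) - 1;	/* e.g. align to 2 MB */
	unsigned long bits = 0x3000;		/* pretend per-process va_align.bits */

	printf("0x12345678 -> %#lx\n", align_addr(0x12345678UL, mask, bits));
	return 0;
}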
*/ - return arch_get_unmapped_area(filp, addr0, len, pgoff, flags); + return arch_get_unmapped_area(filp, addr0, len, pgoff, flags, 0); } diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index e42faa792c07..52e1f3f0b361 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -27,25 +27,7 @@ unsigned long profile_pc(struct pt_regs *regs) { - unsigned long pc = instruction_pointer(regs); - - if (!user_mode(regs) && in_lock_functions(pc)) { -#ifdef CONFIG_FRAME_POINTER - return *(unsigned long *)(regs->bp + sizeof(long)); -#else - unsigned long *sp = (unsigned long *)regs->sp; - /* - * Return address is either directly at stack pointer - * or above a saved flags. Eflags has bits 22-31 zero, - * kernel addresses don't. - */ - if (sp[0] >> 22) - return sp[0]; - if (sp[1] >> 22) - return sp[1]; -#endif - } - return pc; + return instruction_pointer(regs); } EXPORT_SYMBOL(profile_pc); diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c deleted file mode 100644 index 0bab03130033..000000000000 --- a/arch/x86/kernel/topology.c +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Populate sysfs with topology information - * - * Written by: Matthew Dobson, IBM Corporation - * Original Code: Paul Dorwin, IBM Corporation, Patrick Mochel, OSDL - * - * Copyright (C) 2002, IBM Corp. - * - * All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- * - * Send feedback to <colpatch@us.ibm.com> - */ -#include <linux/interrupt.h> -#include <linux/nodemask.h> -#include <linux/export.h> -#include <linux/mmzone.h> -#include <linux/init.h> -#include <linux/smp.h> -#include <linux/irq.h> -#include <asm/io_apic.h> -#include <asm/cpu.h> - -static DEFINE_PER_CPU(struct x86_cpu, cpu_devices); - -#ifdef CONFIG_HOTPLUG_CPU -int arch_register_cpu(int cpu) -{ - struct x86_cpu *xc = per_cpu_ptr(&cpu_devices, cpu); - - xc->cpu.hotpluggable = cpu > 0; - return register_cpu(&xc->cpu, cpu); -} -EXPORT_SYMBOL(arch_register_cpu); - -void arch_unregister_cpu(int num) -{ - unregister_cpu(&per_cpu(cpu_devices, num).cpu); -} -EXPORT_SYMBOL(arch_unregister_cpu); -#else /* CONFIG_HOTPLUG_CPU */ - -int __init arch_register_cpu(int num) -{ - return register_cpu(&per_cpu(cpu_devices, num).cpu, num); -} -#endif /* CONFIG_HOTPLUG_CPU */ - -static int __init topology_init(void) -{ - int i; - - for_each_present_cpu(i) - arch_register_cpu(i); - - return 0; -} -subsys_initcall(topology_init); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index c876f1d36a81..2dbadf347b5f 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -37,10 +37,12 @@ #include <linux/nmi.h> #include <linux/mm.h> #include <linux/smp.h> +#include <linux/cpu.h> #include <linux/io.h> #include <linux/hardirq.h> #include <linux/atomic.h> #include <linux/iommu.h> +#include <linux/ubsan.h> #include <asm/stacktrace.h> #include <asm/processor.h> @@ -50,6 +52,7 @@ #include <asm/ftrace.h> #include <asm/traps.h> #include <asm/desc.h> +#include <asm/fred.h> #include <asm/fpu/api.h> #include <asm/cpu.h> #include <asm/cpu_entry_area.h> @@ -89,6 +92,47 @@ __always_inline int is_valid_bugaddr(unsigned long addr) return *(unsigned short *)addr == INSN_UD2; } +/* + * Check for UD1 or UD2, accounting for Address Size Override Prefixes. + * If it's a UD1, get the ModRM byte to pass along to UBSan. + */ +__always_inline int decode_bug(unsigned long addr, u32 *imm) +{ + u8 v; + + if (addr < TASK_SIZE_MAX) + return BUG_NONE; + + v = *(u8 *)(addr++); + if (v == INSN_ASOP) + v = *(u8 *)(addr++); + if (v != OPCODE_ESCAPE) + return BUG_NONE; + + v = *(u8 *)(addr++); + if (v == SECOND_BYTE_OPCODE_UD2) + return BUG_UD2; + + if (!IS_ENABLED(CONFIG_UBSAN_TRAP) || v != SECOND_BYTE_OPCODE_UD1) + return BUG_NONE; + + /* Retrieve the immediate (type value) for the UBSAN UD1 */ + v = *(u8 *)(addr++); + if (X86_MODRM_RM(v) == 4) + addr++; + + *imm = 0; + if (X86_MODRM_MOD(v) == 1) + *imm = *(u8 *)addr; + else if (X86_MODRM_MOD(v) == 2) + *imm = *(u32 *)addr; + else + WARN_ONCE(1, "Unexpected MODRM_MOD: %u\n", X86_MODRM_MOD(v)); + + return BUG_UD1; +} + + static nokprobe_inline int do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str, struct pt_regs *regs, long error_code) @@ -214,14 +258,11 @@ static inline void handle_invalid_op(struct pt_regs *regs) static noinstr bool handle_bug(struct pt_regs *regs) { bool handled = false; + int ud_type; + u32 imm; - /* - * Normally @regs are unpoisoned by irqentry_enter(), but handle_bug() - * is a rare case that uses @regs without passing them to - * irqentry_enter(). 
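As an illustrative aside (not part of the patch): decode_bug() above distinguishes UD2 (0f 0b) from the UBSAN UD1 (0f b9 /r), where the check type is carried in the memory operand's displacement; ModRM.mod selects disp8 vs disp32 and ModRM.rm == 4 inserts a SIB byte. A self-contained sketch of the same walk, starting at the 0f escape byte; ud1_imm() is a made-up name:

#include <stdint.h>
#include <string.h>

#define MODRM_MOD(b)	(((b) >> 6) & 0x3)
#define MODRM_RM(b)	((b) & 0x7)

/* Return the UD1 "type" immediate for a buffer starting at 0f b9,
 * or -1 if the bytes are not one of the two encodings handled here. */
static long ud1_imm(const uint8_t *p)
{
	uint8_t modrm;
	int32_t imm32;

	if (p[0] != 0x0f || p[1] != 0xb9)
		return -1;
	modrm = p[2];
	p += 3;
	if (MODRM_RM(modrm) == 4)		/* SIB byte present */
		p++;
	if (MODRM_MOD(modrm) == 1)		/* 8-bit displacement */
		return p[0];
	if (MODRM_MOD(modrm) == 2) {		/* 32-bit displacement */
		memcpy(&imm32, p, sizeof(imm32));
		return imm32;
	}
	return -1;
}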
- */ - kmsan_unpoison_entry_regs(regs); - if (!is_valid_bugaddr(regs->ip)) + ud_type = decode_bug(regs->ip, &imm); + if (ud_type == BUG_NONE) return handled; /* @@ -229,15 +270,25 @@ static noinstr bool handle_bug(struct pt_regs *regs) */ instrumentation_begin(); /* + * Normally @regs are unpoisoned by irqentry_enter(), but handle_bug() + * is a rare case that uses @regs without passing them to + * irqentry_enter(). + */ + kmsan_unpoison_entry_regs(regs); + /* * Since we're emulating a CALL with exceptions, restore the interrupt * state to what it was at the exception site. */ if (regs->flags & X86_EFLAGS_IF) raw_local_irq_enable(); - if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN || - handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) { - regs->ip += LEN_UD2; - handled = true; + if (ud_type == BUG_UD2) { + if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN || + handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) { + regs->ip += LEN_UD2; + handled = true; + } + } else if (IS_ENABLED(CONFIG_UBSAN_TRAP)) { + pr_crit("%s at %pS\n", report_ubsan_failure(regs, imm), (void *)regs->ip); } if (regs->flags & X86_EFLAGS_IF) raw_local_irq_disable(); @@ -565,7 +616,7 @@ static bool fixup_iopl_exception(struct pt_regs *regs) */ static bool try_fixup_enqcmd_gp(void) { -#ifdef CONFIG_IOMMU_SVA +#ifdef CONFIG_ARCH_HAS_CPU_PASID u32 pasid; /* @@ -591,7 +642,7 @@ static bool try_fixup_enqcmd_gp(void) if (!mm_valid_pasid(current->mm)) return false; - pasid = current->mm->pasid; + pasid = mm_get_enqcmd_pasid(current->mm); /* * Did this thread already have its PASID activated? @@ -772,7 +823,7 @@ DEFINE_IDTENTRY_RAW(exc_int3) */ asmlinkage __visible noinstr struct pt_regs *sync_regs(struct pt_regs *eregs) { - struct pt_regs *regs = (struct pt_regs *)this_cpu_read(pcpu_hot.top_of_stack) - 1; + struct pt_regs *regs = (struct pt_regs *)current_top_of_stack() - 1; if (regs != eregs) *regs = *eregs; return regs; @@ -790,7 +841,7 @@ asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *r * trust it and switch to the current kernel stack */ if (ip_within_syscall_gap(regs)) { - sp = this_cpu_read(pcpu_hot.top_of_stack); + sp = current_top_of_stack(); goto sync; } @@ -934,8 +985,7 @@ static bool notify_debug(struct pt_regs *regs, unsigned long *dr6) return false; } -static __always_inline void exc_debug_kernel(struct pt_regs *regs, - unsigned long dr6) +static noinstr void exc_debug_kernel(struct pt_regs *regs, unsigned long dr6) { /* * Disable breakpoints during exception handling; recursive exceptions @@ -947,6 +997,11 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, * * Entry text is excluded for HW_BP_X and cpu_entry_area, which * includes the entry stack is excluded for everything. + * + * For FRED, nested #DB should just work fine. But when a watchpoint or + * breakpoint is set in the code path which is executed by #DB handler, + * it results in an endless recursion and stack overflow. Thus we stay + * with the IDT approach, i.e., save DR7 and disable #DB. */ unsigned long dr7 = local_db_save(); irqentry_state_t irq_state = irqentry_nmi_enter(regs); @@ -976,7 +1031,8 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, * Catch SYSENTER with TF set and clear DR_STEP. If this hit a * watchpoint at the same time then that will still be handled. 
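As an illustrative aside (not part of the patch): the try_fixup_enqcmd_gp() hunk above relies on ENQCMD raising #GP while the per-task PASID MSR is still invalid. A heavily simplified sketch of that lazy-binding flow; treat the MSR/bit names and task/mm field accesses as assumptions for this sketch rather than the exact kernel code:

/* Sketch only: lazily program the PASID MSR on the first ENQCMD #GP, then let
 * the faulting instruction retry.  Returning false means deliver the #GP. */
static bool lazy_pasid_fixup(void)
{
	u32 pasid;

	if (!current->mm || !mm_valid_pasid(current->mm))
		return false;			/* no PASID bound to this mm */
	if (current->pasid_activated)
		return false;			/* MSR already valid: a real #GP */

	pasid = mm_get_enqcmd_pasid(current->mm);
	wrmsrl(MSR_IA32_PASID, pasid | MSR_IA32_PASID_VALID);
	current->pasid_activated = 1;
	return true;				/* retry the faulting ENQCMD */
}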
*/ - if ((dr6 & DR_STEP) && is_sysenter_singlestep(regs)) + if (!cpu_feature_enabled(X86_FEATURE_FRED) && + (dr6 & DR_STEP) && is_sysenter_singlestep(regs)) dr6 &= ~DR_STEP; /* @@ -1008,8 +1064,7 @@ out: local_db_restore(dr7); } -static __always_inline void exc_debug_user(struct pt_regs *regs, - unsigned long dr6) +static noinstr void exc_debug_user(struct pt_regs *regs, unsigned long dr6) { bool icebp; @@ -1093,6 +1148,34 @@ DEFINE_IDTENTRY_DEBUG_USER(exc_debug) { exc_debug_user(regs, debug_read_clear_dr6()); } + +#ifdef CONFIG_X86_FRED +/* + * When occurred on different ring level, i.e., from user or kernel + * context, #DB needs to be handled on different stack: User #DB on + * current task stack, while kernel #DB on a dedicated stack. + * + * This is exactly how FRED event delivery invokes an exception + * handler: ring 3 event on level 0 stack, i.e., current task stack; + * ring 0 event on the #DB dedicated stack specified in the + * IA32_FRED_STKLVLS MSR. So unlike IDT, the FRED debug exception + * entry stub doesn't do stack switch. + */ +DEFINE_FREDENTRY_DEBUG(exc_debug) +{ + /* + * FRED #DB stores DR6 on the stack in the format which + * debug_read_clear_dr6() returns for the IDT entry points. + */ + unsigned long dr6 = fred_event_data(regs); + + if (user_mode(regs)) + exc_debug_user(regs, dr6); + else + exc_debug_kernel(regs, dr6); +} +#endif /* CONFIG_X86_FRED */ + #else /* 32 bit does not have separate entry points. */ DEFINE_IDTENTRY_RAW(exc_debug) @@ -1377,8 +1460,11 @@ void __init trap_init(void) sev_es_init_vc_handling(); /* Initialize TSS before setting up traps so ISTs work */ - cpu_init_exception_handling(); + cpu_init_exception_handling(true); + /* Setup traps as cpu_init() might #GP */ - idt_setup_traps(); + if (!cpu_feature_enabled(X86_FEATURE_FRED)) + idt_setup_traps(); + cpu_init(); } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 15f97c0abc9d..34dec0b72ea8 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -16,6 +16,7 @@ #include <linux/static_key.h> #include <linux/static_call.h> +#include <asm/cpuid.h> #include <asm/hpet.h> #include <asm/timer.h> #include <asm/vgtod.h> @@ -26,9 +27,11 @@ #include <asm/x86_init.h> #include <asm/geode.h> #include <asm/apic.h> -#include <asm/intel-family.h> +#include <asm/cpu_device_id.h> #include <asm/i8259.h> +#include <asm/topology.h> #include <asm/uv/uv.h> +#include <asm/sev.h> unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); @@ -44,16 +47,16 @@ EXPORT_SYMBOL(tsc_khz); static int __read_mostly tsc_unstable; static unsigned int __initdata tsc_early_khz; -static DEFINE_STATIC_KEY_FALSE(__use_tsc); +static DEFINE_STATIC_KEY_FALSE_RO(__use_tsc); int tsc_clocksource_reliable; static int __read_mostly tsc_force_recalibrate; -static u32 art_to_tsc_numerator; -static u32 art_to_tsc_denominator; -static u64 art_to_tsc_offset; -static struct clocksource *art_related_clocksource; +static struct clocksource_base art_base_clk = { + .id = CSID_X86_ART, +}; +static bool have_art; struct cyc2ns { struct cyc2ns_data data[2]; /* 0 + 2*16 = 32 */ @@ -173,10 +176,11 @@ static void __set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long ts c2n = per_cpu_ptr(&cyc2ns, cpu); - raw_write_seqcount_latch(&c2n->seq); + write_seqcount_latch_begin(&c2n->seq); c2n->data[0] = data; - raw_write_seqcount_latch(&c2n->seq); + write_seqcount_latch(&c2n->seq); c2n->data[1] = data; + write_seqcount_latch_end(&c2n->seq); } static void set_cyc2ns_scale(unsigned long khz, int 
cpu, unsigned long long tsc_now) @@ -652,7 +656,7 @@ success: } /** - * native_calibrate_tsc + * native_calibrate_tsc - determine TSC frequency * Determine TSC frequency via CPUID, else return 0. */ unsigned long native_calibrate_tsc(void) @@ -663,13 +667,13 @@ unsigned long native_calibrate_tsc(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return 0; - if (boot_cpu_data.cpuid_level < 0x15) + if (boot_cpu_data.cpuid_level < CPUID_LEAF_TSC) return 0; eax_denominator = ebx_numerator = ecx_hz = edx = 0; /* CPUID 15H TSC/Crystal ratio, plus optionally Crystal Hz */ - cpuid(0x15, &eax_denominator, &ebx_numerator, &ecx_hz, &edx); + cpuid(CPUID_LEAF_TSC, &eax_denominator, &ebx_numerator, &ecx_hz, &edx); if (ebx_numerator == 0 || eax_denominator == 0) return 0; @@ -678,11 +682,11 @@ unsigned long native_calibrate_tsc(void) /* * Denverton SoCs don't report crystal clock, and also don't support - * CPUID.0x16 for the calculation below, so hardcode the 25MHz crystal - * clock. + * CPUID_LEAF_FREQ for the calculation below, so hardcode the 25MHz + * crystal clock. */ if (crystal_khz == 0 && - boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT_D) + boot_cpu_data.x86_vfm == INTEL_ATOM_GOLDMONT_D) crystal_khz = 25000; /* @@ -698,10 +702,10 @@ unsigned long native_calibrate_tsc(void) * clock, but we can easily calculate it to a high degree of accuracy * by considering the crystal ratio and the CPU speed. */ - if (crystal_khz == 0 && boot_cpu_data.cpuid_level >= 0x16) { + if (crystal_khz == 0 && boot_cpu_data.cpuid_level >= CPUID_LEAF_FREQ) { unsigned int eax_base_mhz, ebx, ecx, edx; - cpuid(0x16, &eax_base_mhz, &ebx, &ecx, &edx); + cpuid(CPUID_LEAF_FREQ, &eax_base_mhz, &ebx, &ecx, &edx); crystal_khz = eax_base_mhz * 1000 * eax_denominator / ebx_numerator; } @@ -713,7 +717,7 @@ unsigned long native_calibrate_tsc(void) * For Atom SoCs TSC is the only reliable clocksource. * Mark TSC reliable so no watchdog on it. 
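As an illustrative aside (not part of the patch): native_calibrate_tsc() above derives the TSC frequency from CPUID leaf 0x15 (now CPUID_LEAF_TSC), where EAX:EBX give the crystal-to-TSC ratio as denominator:numerator and ECX optionally reports the crystal frequency in Hz; when ECX is zero the code falls back to a hardcoded or leaf-0x16-derived crystal value, as the hunk shows. A minimal sketch of the arithmetic with made-up example values:

#include <stdint.h>

/* tsc_khz = crystal_khz * numerator / denominator, as native_calibrate_tsc() computes */
static uint64_t tsc_khz_from_cpuid15(uint32_t eax_denominator, uint32_t ebx_numerator,
				     uint32_t ecx_crystal_hz)
{
	uint64_t crystal_khz = ecx_crystal_hz / 1000;

	if (!eax_denominator || !ebx_numerator || !crystal_khz)
		return 0;
	return crystal_khz * ebx_numerator / eax_denominator;
}

/* Example (illustrative values): a 38.4 MHz crystal with a 2:130 ratio gives
 * tsc_khz_from_cpuid15(2, 130, 38400000) == 2496000, i.e. ~2.5 GHz. */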
*/ - if (boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT) + if (boot_cpu_data.x86_vfm == INTEL_ATOM_GOLDMONT) setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); #ifdef CONFIG_X86_LOCAL_APIC @@ -736,12 +740,12 @@ static unsigned long cpu_khz_from_cpuid(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return 0; - if (boot_cpu_data.cpuid_level < 0x16) + if (boot_cpu_data.cpuid_level < CPUID_LEAF_FREQ) return 0; eax_base_mhz = ebx_max_mhz = ecx_bus_mhz = edx = 0; - cpuid(0x16, &eax_base_mhz, &ebx_max_mhz, &ecx_bus_mhz, &edx); + cpuid(CPUID_LEAF_FREQ, &eax_base_mhz, &ebx_max_mhz, &ecx_bus_mhz, &edx); return eax_base_mhz * 1000; } @@ -1065,18 +1069,16 @@ core_initcall(cpufreq_register_tsc_scaling); #endif /* CONFIG_CPU_FREQ */ -#define ART_CPUID_LEAF (0x15) #define ART_MIN_DENOMINATOR (1) - /* * If ART is present detect the numerator:denominator to convert to TSC */ static void __init detect_art(void) { - unsigned int unused[2]; + unsigned int unused; - if (boot_cpu_data.cpuid_level < ART_CPUID_LEAF) + if (boot_cpu_data.cpuid_level < CPUID_LEAF_TSC) return; /* @@ -1089,13 +1091,14 @@ static void __init detect_art(void) tsc_async_resets) return; - cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator, - &art_to_tsc_numerator, unused, unused+1); + cpuid(CPUID_LEAF_TSC, &art_base_clk.denominator, + &art_base_clk.numerator, &art_base_clk.freq_khz, &unused); - if (art_to_tsc_denominator < ART_MIN_DENOMINATOR) + art_base_clk.freq_khz /= KHZ; + if (art_base_clk.denominator < ART_MIN_DENOMINATOR) return; - rdmsrl(MSR_IA32_TSC_ADJUST, art_to_tsc_offset); + rdmsrl(MSR_IA32_TSC_ADJUST, art_base_clk.offset); /* Make this sticky over multiple CPU init calls */ setup_force_cpu_cap(X86_FEATURE_ART); @@ -1168,6 +1171,7 @@ static struct clocksource clocksource_tsc_early = { .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_MUST_VERIFY, + .id = CSID_X86_TSC_EARLY, .vdso_clock_mode = VDSO_CLOCKMODE_TSC, .enable = tsc_cs_enable, .resume = tsc_resume, @@ -1190,6 +1194,7 @@ static struct clocksource clocksource_tsc = { CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_MUST_VERIFY | CLOCK_SOURCE_VERIFY_PERCPU, + .id = CSID_X86_TSC, .vdso_clock_mode = VDSO_CLOCKMODE_TSC, .enable = tsc_cs_enable, .resume = tsc_resume, @@ -1250,15 +1255,12 @@ static void __init check_system_tsc_reliable(void) * - TSC which does not stop in C-States * - the TSC_ADJUST register which allows to detect even minimal * modifications - * - not more than two sockets. As the number of sockets cannot be - * evaluated at the early boot stage where this has to be - * invoked, check the number of online memory nodes as a - * fallback solution which is an reasonable estimate. 
+ * - not more than four packages */ if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && boot_cpu_has(X86_FEATURE_NONSTOP_TSC) && boot_cpu_has(X86_FEATURE_TSC_ADJUST) && - nr_online_nodes <= 4) + topology_max_packages() <= 4) tsc_disable_clocksource_watchdog(); } @@ -1287,77 +1289,18 @@ int unsynchronized_tsc(void) */ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { /* assume multi socket systems are not synchronized: */ - if (num_possible_cpus() > 1) + if (topology_max_packages() > 1) return 1; } return 0; } -/* - * Convert ART to TSC given numerator/denominator found in detect_art() - */ -struct system_counterval_t convert_art_to_tsc(u64 art) -{ - u64 tmp, res, rem; - - rem = do_div(art, art_to_tsc_denominator); - - res = art * art_to_tsc_numerator; - tmp = rem * art_to_tsc_numerator; - - do_div(tmp, art_to_tsc_denominator); - res += tmp + art_to_tsc_offset; - - return (struct system_counterval_t) {.cs = art_related_clocksource, - .cycles = res}; -} -EXPORT_SYMBOL(convert_art_to_tsc); - -/** - * convert_art_ns_to_tsc() - Convert ART in nanoseconds to TSC. - * @art_ns: ART (Always Running Timer) in unit of nanoseconds - * - * PTM requires all timestamps to be in units of nanoseconds. When user - * software requests a cross-timestamp, this function converts system timestamp - * to TSC. - * - * This is valid when CPU feature flag X86_FEATURE_TSC_KNOWN_FREQ is set - * indicating the tsc_khz is derived from CPUID[15H]. Drivers should check - * that this flag is set before conversion to TSC is attempted. - * - * Return: - * struct system_counterval_t - system counter value with the pointer to the - * corresponding clocksource - * @cycles: System counter value - * @cs: Clocksource corresponding to system counter value. Used - * by timekeeping code to verify comparability of two cycle - * values. - */ - -struct system_counterval_t convert_art_ns_to_tsc(u64 art_ns) -{ - u64 tmp, res, rem; - - rem = do_div(art_ns, USEC_PER_SEC); - - res = art_ns * tsc_khz; - tmp = rem * tsc_khz; - - do_div(tmp, USEC_PER_SEC); - res += tmp; - - return (struct system_counterval_t) { .cs = art_related_clocksource, - .cycles = res}; -} -EXPORT_SYMBOL(convert_art_ns_to_tsc); - - static void tsc_refine_calibration_work(struct work_struct *work); static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work); /** * tsc_refine_calibration_work - Further refine tsc freq calibration - * @work - ignored. + * @work: ignored. * * This functions uses delayed work over a period of a * second to further refine the TSC freq value. Since this is @@ -1454,8 +1397,10 @@ out: if (tsc_unstable) goto unreg; - if (boot_cpu_has(X86_FEATURE_ART)) - art_related_clocksource = &clocksource_tsc; + if (boot_cpu_has(X86_FEATURE_ART)) { + have_art = true; + clocksource_tsc.base = &art_base_clk; + } clocksource_register_khz(&clocksource_tsc, tsc_khz); unreg: clocksource_unregister(&clocksource_tsc_early); @@ -1480,8 +1425,10 @@ static int __init init_tsc_clocksource(void) * the refined calibration and directly register it as a clocksource. 
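As an illustrative aside (not part of the patch): the removed convert_art_to_tsc() above (its job now expressed through the art_base_clk clocksource_base and the core timekeeping conversion) reduces to TSC = ART * numerator / denominator + offset, with the multiply split around the divide so a large ART count cannot overflow 64 bits. A standalone sketch of the same arithmetic; art_to_tsc() is a made-up name:

#include <stdint.h>

static uint64_t art_to_tsc(uint64_t art, uint32_t numerator, uint32_t denominator,
			   uint64_t offset)
{
	uint64_t quot = art / denominator;
	uint64_t rem  = art % denominator;

	/* quot * num + (rem * num) / den == art * num / den, but without the
	 * intermediate art * num product that could overflow 64 bits */
	return quot * numerator + (rem * numerator) / denominator + offset;
}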
*/ if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) { - if (boot_cpu_has(X86_FEATURE_ART)) - art_related_clocksource = &clocksource_tsc; + if (boot_cpu_has(X86_FEATURE_ART)) { + have_art = true; + clocksource_tsc.base = &art_base_clk; + } clocksource_register_khz(&clocksource_tsc, tsc_khz); clocksource_unregister(&clocksource_tsc_early); @@ -1505,10 +1452,12 @@ static bool __init determine_cpu_tsc_frequencies(bool early) if (early) { cpu_khz = x86_platform.calibrate_cpu(); - if (tsc_early_khz) + if (tsc_early_khz) { tsc_khz = tsc_early_khz; - else + } else { tsc_khz = x86_platform.calibrate_tsc(); + clocksource_tsc.freq_khz = tsc_khz; + } } else { /* We should not be here with non-native cpu calibration */ WARN_ON(x86_platform.calibrate_cpu != native_calibrate_cpu); @@ -1566,6 +1515,9 @@ void __init tsc_early_init(void) /* Don't change UV TSC multi-chassis synchronization */ if (is_early_uv_system()) return; + + snp_secure_tsc_init(); + if (!determine_cpu_tsc_frequencies(true)) return; tsc_enable_sched_clock(); diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index 6555a857a1e6..deeb02825670 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -147,13 +147,13 @@ static const struct freq_desc freq_desc_lgm = { }; static const struct x86_cpu_id tsc_msr_cpu_ids[] = { - X86_MATCH_INTEL_FAM6_MODEL(ATOM_SALTWELL_MID, &freq_desc_pnw), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_SALTWELL_TABLET,&freq_desc_clv), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT, &freq_desc_byt), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT_MID, &freq_desc_tng), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT, &freq_desc_cht), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT_MID, &freq_desc_ann), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT_NP, &freq_desc_lgm), + X86_MATCH_VFM(INTEL_ATOM_SALTWELL_MID, &freq_desc_pnw), + X86_MATCH_VFM(INTEL_ATOM_SALTWELL_TABLET, &freq_desc_clv), + X86_MATCH_VFM(INTEL_ATOM_SILVERMONT, &freq_desc_byt), + X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID, &freq_desc_tng), + X86_MATCH_VFM(INTEL_ATOM_AIRMONT, &freq_desc_cht), + X86_MATCH_VFM(INTEL_ATOM_AIRMONT_MID, &freq_desc_ann), + X86_MATCH_VFM(INTEL_ATOM_AIRMONT_NP, &freq_desc_lgm), {} }; diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 1123ef3ccf90..4334033658ed 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -193,11 +193,9 @@ bool tsc_store_and_check_tsc_adjust(bool bootcpu) cur->warned = false; /* - * If a non-zero TSC value for socket 0 may be valid then the default - * adjusted value cannot assumed to be zero either. + * The default adjust value cannot be assumed to be zero on any socket. */ - if (tsc_async_resets) - cur->adjusted = bootval; + cur->adjusted = bootval; /* * Check whether this CPU is the first in a package to come up. 
In diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index d00c28aaa5be..d4705a348a80 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -723,7 +723,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, state->sp = task->thread.sp + sizeof(*frame); state->bp = READ_ONCE_NOCHECK(frame->bp); state->ip = READ_ONCE_NOCHECK(frame->ret_addr); - state->signal = (void *)state->ip == ret_from_fork; + state->signal = (void *)state->ip == ret_from_fork_asm; } if (get_stack_info((unsigned long *)state->sp, state->task, diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 6c07f6daaa22..5a952c5ea66b 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -12,6 +12,7 @@ #include <linux/ptrace.h> #include <linux/uprobes.h> #include <linux/uaccess.h> +#include <linux/syscalls.h> #include <linux/kdebug.h> #include <asm/processor.h> @@ -308,6 +309,122 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool } #ifdef CONFIG_X86_64 + +asm ( + ".pushsection .rodata\n" + ".global uretprobe_trampoline_entry\n" + "uretprobe_trampoline_entry:\n" + "pushq %rax\n" + "pushq %rcx\n" + "pushq %r11\n" + "movq $" __stringify(__NR_uretprobe) ", %rax\n" + "syscall\n" + ".global uretprobe_syscall_check\n" + "uretprobe_syscall_check:\n" + "popq %r11\n" + "popq %rcx\n" + + /* The uretprobe syscall replaces stored %rax value with final + * return address, so we don't restore %rax in here and just + * call ret. + */ + "retq\n" + ".global uretprobe_trampoline_end\n" + "uretprobe_trampoline_end:\n" + ".popsection\n" +); + +extern u8 uretprobe_trampoline_entry[]; +extern u8 uretprobe_trampoline_end[]; +extern u8 uretprobe_syscall_check[]; + +void *arch_uprobe_trampoline(unsigned long *psize) +{ + static uprobe_opcode_t insn = UPROBE_SWBP_INSN; + struct pt_regs *regs = task_pt_regs(current); + + /* + * At the moment the uretprobe syscall trampoline is supported + * only for native 64-bit process, the compat process still uses + * standard breakpoint. + */ + if (user_64bit_mode(regs)) { + *psize = uretprobe_trampoline_end - uretprobe_trampoline_entry; + return uretprobe_trampoline_entry; + } + + *psize = UPROBE_SWBP_INSN_SIZE; + return &insn; +} + +static unsigned long trampoline_check_ip(void) +{ + unsigned long tramp = uprobe_get_trampoline_vaddr(); + + return tramp + (uretprobe_syscall_check - uretprobe_trampoline_entry); +} + +SYSCALL_DEFINE0(uretprobe) +{ + struct pt_regs *regs = task_pt_regs(current); + unsigned long err, ip, sp, r11_cx_ax[3]; + + if (regs->ip != trampoline_check_ip()) + goto sigill; + + err = copy_from_user(r11_cx_ax, (void __user *)regs->sp, sizeof(r11_cx_ax)); + if (err) + goto sigill; + + /* expose the "right" values of r11/cx/ax/sp to uprobe_consumer/s */ + regs->r11 = r11_cx_ax[0]; + regs->cx = r11_cx_ax[1]; + regs->ax = r11_cx_ax[2]; + regs->sp += sizeof(r11_cx_ax); + regs->orig_ax = -1; + + ip = regs->ip; + sp = regs->sp; + + uprobe_handle_trampoline(regs); + + /* + * Some of the uprobe consumers has changed sp, we can do nothing, + * just return via iret. + * .. or shadow stack is enabled, in which case we need to skip + * return through the user space stack address. 
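As an illustrative aside (not part of the patch): the uretprobe trampoline above saves exactly what the SYSCALL instruction clobbers: the CPU overwrites %rcx with the return RIP and %r11 with RFLAGS on entry, and %rax carries the syscall number and return value. A sketch of the user-stack frame the trampoline leaves behind, which sys_uretprobe() reads back as r11_cx_ax[3]; the struct name is made up:

/* Layout at %rsp after "pushq %rax; pushq %rcx; pushq %r11", lowest address
 * first, i.e. what copy_from_user() sees at regs->sp inside the syscall: */
struct uretprobe_frame {
	unsigned long r11;	/* r11_cx_ax[0]: caller's %r11 (SYSCALL clobbers it with RFLAGS) */
	unsigned long rcx;	/* r11_cx_ax[1]: caller's %rcx (SYSCALL clobbers it with the return RIP) */
	unsigned long rax;	/* r11_cx_ax[2]: caller's %rax; it is handed back as the syscall's
				 * return value, and this stack slot is rewritten with the real
				 * return address that the trampoline's final ret consumes */
};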
+ */ + if (regs->sp != sp || shstk_is_enabled()) + return regs->ax; + regs->sp -= sizeof(r11_cx_ax); + + /* for the case uprobe_consumer has changed r11/cx */ + r11_cx_ax[0] = regs->r11; + r11_cx_ax[1] = regs->cx; + + /* + * ax register is passed through as return value, so we can use + * its space on stack for ip value and jump to it through the + * trampoline's ret instruction + */ + r11_cx_ax[2] = regs->ip; + regs->ip = ip; + + err = copy_to_user((void __user *)regs->sp, r11_cx_ax, sizeof(r11_cx_ax)); + if (err) + goto sigill; + + /* ensure sysret, see do_syscall_64() */ + regs->r11 = regs->flags; + regs->cx = regs->ip; + + return regs->ax; + +sigill: + force_sig(SIGILL); + return -1; +} + /* * If arch_uprobe->insn doesn't use rip-relative addressing, return * immediately. Otherwise, rewrite the instruction so that it accesses @@ -1076,8 +1193,13 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs return orig_ret_vaddr; nleft = copy_to_user((void __user *)regs->sp, &trampoline_vaddr, rasize); - if (likely(!nleft)) + if (likely(!nleft)) { + if (shstk_update_last_frame(trampoline_vaddr)) { + force_sig(SIGSEGV); + return -1; + } return orig_ret_vaddr; + } if (nleft != rasize) { pr_err("return address clobbered: pid=%d, %%sp=%#lx, %%ip=%#lx\n", diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index e9e803a4d44c..e6cc84143f3e 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -246,9 +246,8 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) /* VM86_SCREEN_BITMAP had numerous bugs and appears to have no users. */ if (v.flags & VM86_SCREEN_BITMAP) { - char comm[TASK_COMM_LEN]; - - pr_info_once("vm86: '%s' uses VM86_SCREEN_BITMAP, which is no longer supported\n", get_task_comm(comm, current)); + pr_info_once("vm86: '%s' uses VM86_SCREEN_BITMAP, which is no longer supported\n", + current->comm); return -EINVAL; } diff --git a/arch/x86/kernel/crash_core_32.c b/arch/x86/kernel/vmcore_info_32.c index 8a89c109e20a..5995a749288a 100644 --- a/arch/x86/kernel/crash_core_32.c +++ b/arch/x86/kernel/vmcore_info_32.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only -#include <linux/crash_core.h> +#include <linux/vmcore_info.h> #include <linux/pgtable.h> #include <asm/setup.h> diff --git a/arch/x86/kernel/crash_core_64.c b/arch/x86/kernel/vmcore_info_64.c index 7d255f882afe..0dec7d868754 100644 --- a/arch/x86/kernel/crash_core_64.c +++ b/arch/x86/kernel/vmcore_info_64.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only -#include <linux/crash_core.h> +#include <linux/vmcore_info.h> #include <linux/pgtable.h> #include <asm/setup.h> diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 54a5596adaa6..0deb4887d6e9 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -15,11 +15,7 @@ * put it inside the section definition. */ -#ifdef CONFIG_X86_32 -#define LOAD_OFFSET __PAGE_OFFSET -#else #define LOAD_OFFSET __START_KERNEL_map -#endif #define RUNTIME_DISCARD_EXIT #define EMITS_PT_NOTE @@ -32,6 +28,7 @@ #include <asm/orc_lookup.h> #include <asm/cache.h> #include <asm/boot.h> +#include <asm/kexec.h> #undef i386 /* in case the preprocessor is a 32bit one */ @@ -46,6 +43,7 @@ ENTRY(phys_startup_64) #endif jiffies = jiffies_64; +const_pcpu_hot = pcpu_hot; #if defined(CONFIG_X86_64) /* @@ -98,7 +96,19 @@ jiffies = jiffies_64; #define BSS_DECRYPTED #endif - +#if defined(CONFIG_X86_64) && defined(CONFIG_KEXEC_CORE) +#define KEXEC_RELOCATE_KERNEL \ + . 
= ALIGN(0x100); \ + __relocate_kernel_start = .; \ + *(.text..relocate_kernel); \ + *(.data..relocate_kernel); \ + __relocate_kernel_end = .; + +ASSERT(__relocate_kernel_end - __relocate_kernel_start <= KEXEC_CONTROL_CODE_MAX_SIZE, + "relocate_kernel code too large!") +#else +#define KEXEC_RELOCATE_KERNEL +#endif PHDRS { text PT_LOAD FLAGS(5); /* R_E */ data PT_LOAD FLAGS(6); /* RW_ */ @@ -113,11 +123,10 @@ PHDRS { SECTIONS { + . = __START_KERNEL; #ifdef CONFIG_X86_32 - . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; phys_startup_32 = ABSOLUTE(startup_32 - LOAD_OFFSET); #else - . = __START_KERNEL; phys_startup_64 = ABSOLUTE(startup_64 - LOAD_OFFSET); #endif @@ -125,24 +134,11 @@ SECTIONS .text : AT(ADDR(.text) - LOAD_OFFSET) { _text = .; _stext = .; - /* bootstrapping code */ - HEAD_TEXT - TEXT_TEXT - SCHED_TEXT - LOCK_TEXT - KPROBES_TEXT - SOFTIRQENTRY_TEXT -#ifdef CONFIG_RETPOLINE - *(.text..__x86.indirect_thunk) - *(.text..__x86.return_thunk) -#endif - STATIC_CALL_TEXT - ALIGN_ENTRY_TEXT_BEGIN *(.text..__x86.rethunk_untrain) ENTRY_TEXT -#ifdef CONFIG_CPU_SRSO +#ifdef CONFIG_MITIGATION_SRSO /* * See the comment above srso_alias_untrain_ret()'s * definition. @@ -151,10 +147,26 @@ SECTIONS *(.text..__x86.rethunk_safe) #endif ALIGN_ENTRY_TEXT_END + + TEXT_TEXT + SCHED_TEXT + LOCK_TEXT + KPROBES_TEXT + SOFTIRQENTRY_TEXT +#ifdef CONFIG_MITIGATION_RETPOLINE + *(.text..__x86.indirect_thunk) + *(.text..__x86.return_thunk) +#endif + STATIC_CALL_TEXT *(.gnu.warning) } :text = 0xcccccccc + /* bootstrapping code */ + .head.text : AT(ADDR(.head.text) - LOAD_OFFSET) { + HEAD_TEXT + } :text = 0xcccccccc + /* End of text section, which should occupy whole number of pages */ _etext = .; . = ALIGN(PAGE_SIZE); @@ -171,6 +183,9 @@ SECTIONS /* init_task */ INIT_TASK_DATA(THREAD_SIZE) + /* equivalent to task_pt_regs(&init_task) */ + __top_init_kernel_stack = __end_init_stack - TOP_OF_KERNEL_STACK_PADDING - PTREGS_SIZE; + #ifdef CONFIG_X86_32 /* 32 bit has nosave before _edata */ NOSAVE_DATA @@ -182,6 +197,7 @@ SECTIONS DATA_DATA CONSTRUCTORS + KEXEC_RELOCATE_KERNEL /* rarely changed data like cpu maps */ READ_MOSTLY_DATA(INTERNODE_CACHE_BYTES) @@ -194,29 +210,6 @@ SECTIONS ORC_UNWIND_TABLE - . = ALIGN(PAGE_SIZE); - __vvar_page = .; - - .vvar : AT(ADDR(.vvar) - LOAD_OFFSET) { - /* work around gold bug 13023 */ - __vvar_beginning_hack = .; - - /* Place all vvars at the offsets in asm/vvar.h. */ -#define EMIT_VVAR(name, offset) \ - . = __vvar_beginning_hack + offset; \ - *(.vvar_ ## name) -#include <asm/vvar.h> -#undef EMIT_VVAR - - /* - * Pad the rest of the page with zeros. Otherwise the loader - * can leave garbage here. - */ - . = __vvar_beginning_hack + PAGE_SIZE; - } :data - - . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE); - /* Init code and data - will be freed after init */ . = ALIGN(PAGE_SIZE); .init.begin : AT(ADDR(.init.begin) - LOAD_OFFSET) { @@ -267,20 +260,7 @@ SECTIONS } #endif - /* - * start address and size of operations which during runtime - * can be patched with virtualization friendly instructions or - * baremetal native ones. Think page table operations. - * Details in paravirt_types.h - */ - . = ALIGN(8); - .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { - __parainstructions = .; - *(.parainstructions) - __parainstructions_end = .; - } - -#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_MITIGATION_RETPOLINE /* * List of instructions that call/jmp/jcc to retpoline thunks * __x86_indirect_thunk_*(). 
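As an illustrative aside (not part of the patch): __top_init_kernel_stack above is commented as "equivalent to task_pt_regs(&init_task)", i.e. the top of the init stack minus the small top-of-stack padding minus one struct pt_regs. A standalone sketch of that layout computation; names are made up and the sizes are whatever the kernel configuration defines:

#include <stddef.h>
#include <stdint.h>

/* pt_regs sits just below the padding at the top of a task's kernel stack;
 * the stack itself grows down from there. */
static uintptr_t top_of_pt_regs(uintptr_t stack_base, size_t thread_size,
				size_t top_padding, size_t ptregs_size)
{
	return stack_base + thread_size - top_padding - ptregs_size;
}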
These instructions can be patched along @@ -371,6 +351,9 @@ SECTIONS PERCPU_SECTION(INTERNODE_CACHE_BYTES) #endif + RUNTIME_CONST_VARIABLES + RUNTIME_CONST(ptr, USER_PTR_MAX) + . = ALIGN(PAGE_SIZE); /* freed after init ends here */ @@ -454,6 +437,10 @@ SECTIONS STABS_DEBUG DWARF_DEBUG +#ifdef CONFIG_PROPELLER_CLANG + .llvm_bb_addr_map : { *(.llvm_bb_addr_map) } +#endif + ELF_DETAILS DISCARDS @@ -502,6 +489,9 @@ SECTIONS . = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), "kernel image bigger than KERNEL_IMAGE_SIZE"); +/* needed for Clang - see arch/x86/entry/entry.S */ +PROVIDE(__ref_stack_chk_guard = __stack_chk_guard); + #ifdef CONFIG_X86_64 /* * Per-cpu symbols which need to be offset from __per_cpu_load @@ -517,11 +507,11 @@ INIT_PER_CPU(irq_stack_backing_store); "fixed_percpu_data is not at start of per-cpu area"); #endif -#ifdef CONFIG_CPU_UNRET_ENTRY +#ifdef CONFIG_MITIGATION_UNRET_ENTRY . = ASSERT((retbleed_return_thunk & 0x3f) == 0, "retbleed_return_thunk not cacheline-aligned"); #endif -#ifdef CONFIG_CPU_SRSO +#ifdef CONFIG_MITIGATION_SRSO . = ASSERT((srso_safe_ret & 0x3f) == 0, "srso_safe_ret not cacheline-aligned"); /* * GNU ld cannot do XOR until 2.41. @@ -539,3 +529,18 @@ INIT_PER_CPU(irq_stack_backing_store); #endif #endif /* CONFIG_X86_64 */ + +/* + * The symbols below are referenced using relative relocations in the + * respective ELF notes. This produces build time constants that the + * linker will never mark as relocatable. (Using just ABSOLUTE() is not + * sufficient for that). + */ +#ifdef CONFIG_XEN_PV +xen_elfnote_entry_value = + ABSOLUTE(xen_elfnote_entry) + ABSOLUTE(startup_xen); +#endif +#ifdef CONFIG_PVH +xen_elfnote_phys32_entry_value = + ABSOLUTE(xen_elfnote_phys32_entry) + ABSOLUTE(pvh_start_xen - LOAD_OFFSET); +#endif diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index d3fc01770558..73511332bb67 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -127,25 +127,12 @@ static void __init vsmp_cap_cpus(void) #endif } -static u32 apicid_phys_pkg_id(u32 initial_apic_id, int index_msb) -{ - return read_apic_id() >> index_msb; -} - -static void vsmp_apic_post_init(void) -{ - /* need to update phys_pkg_id */ - apic->phys_pkg_id = apicid_phys_pkg_id; -} - void __init vsmp_init(void) { detect_vsmp_box(); if (!is_vsmp_box()) return; - x86_platform.apic_post_init = vsmp_apic_post_init; - vsmp_cap_cpus(); set_vsmp_ctl(); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index a37ebd3b4773..0a2bbd674a6d 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -3,10 +3,12 @@ * * For licencing details see kernel-base/COPYING */ +#include <linux/dmi.h> #include <linux/init.h> #include <linux/ioport.h> #include <linux/export.h> #include <linux/pci.h> +#include <linux/acpi.h> #include <asm/acpi.h> #include <asm/bios_ebda.h> @@ -66,12 +68,14 @@ struct x86_init_ops x86_init __initdata = { .probe_roms = probe_roms, .reserve_resources = reserve_standard_io_resources, .memory_setup = e820__memory_setup_default, + .dmi_setup = dmi_setup, }, .mpparse = { .setup_ioapic_ids = x86_init_noop, - .find_smp_config = default_find_smp_config, - .get_smp_config = default_get_smp_config, + .find_mptable = mpparse_find_mptable, + .early_parse_smp_cfg = mpparse_parse_early_smp_config, + .parse_smp_cfg = mpparse_parse_smp_config, }, .irqs = { @@ -131,10 +135,12 @@ struct x86_cpuinit_ops x86_cpuinit = { static void default_nmi_init(void) { }; -static bool enc_status_change_prepare_noop(unsigned long vaddr, int npages, 
bool enc) { return true; } -static bool enc_status_change_finish_noop(unsigned long vaddr, int npages, bool enc) { return true; } +static int enc_status_change_prepare_noop(unsigned long vaddr, int npages, bool enc) { return 0; } +static int enc_status_change_finish_noop(unsigned long vaddr, int npages, bool enc) { return 0; } static bool enc_tlb_flush_required_noop(bool enc) { return false; } static bool enc_cache_flush_required_noop(void) { return false; } +static void enc_kexec_begin_noop(void) {} +static void enc_kexec_finish_noop(void) {} static bool is_private_mmio_noop(u64 addr) {return false; } struct x86_platform_ops x86_platform __ro_after_init = { @@ -158,6 +164,8 @@ struct x86_platform_ops x86_platform __ro_after_init = { .enc_status_change_finish = enc_status_change_finish_noop, .enc_tlb_flush_required = enc_tlb_flush_required_noop, .enc_cache_flush_required = enc_cache_flush_required_noop, + .enc_kexec_begin = enc_kexec_begin_noop, + .enc_kexec_finish = enc_kexec_finish_noop, }, }; |
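As an illustrative aside (not part of the patch): the x86_platform.guest defaults above change the enc_status_change_* hooks from returning a bool "success" to an int error code (0 on success) and add kexec begin/finish hooks, with no-op defaults that CoCo guests (SEV-SNP, TDX) override. A small standalone sketch of this noop-defaults ops pattern, with made-up names, showing why callers can now propagate a real errno:

#include <stdbool.h>

struct enc_guest_ops {
	int  (*status_change_prepare)(unsigned long vaddr, int npages, bool enc);
	int  (*status_change_finish)(unsigned long vaddr, int npages, bool enc);
	bool (*tlb_flush_required)(bool enc);
	void (*kexec_begin)(void);
	void (*kexec_finish)(void);
};

static int  enc_noop_status(unsigned long vaddr, int npages, bool enc) { return 0; }
static bool enc_noop_flush(bool enc) { return false; }
static void enc_noop_void(void) { }

static struct enc_guest_ops enc_defaults = {
	.status_change_prepare = enc_noop_status,
	.status_change_finish  = enc_noop_status,
	.tlb_flush_required    = enc_noop_flush,
	.kexec_begin           = enc_noop_void,
	.kexec_finish          = enc_noop_void,
};

/* A caller can then do:
 *	ret = ops->status_change_prepare(vaddr, npages, enc);
 *	if (ret)
 *		return ret;	// propagate the error code from a CoCo guest hook
 */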